# Estudo de caso com o framework MathFeature
- https://bonidia.github.io/MathFeature/

Utilização por linha de comando:
- git clone https://github.com/Bonidia/MathFeature.git MathFeature
- cd MathFeature
- pip3 install -r requirements.txt
- apt-get -y install python3-igraph

## Baixando sequências de repositórios públicos
- GENCODE: https://www.gencodegenes.org/
- PHYTOZOME: https://phytozome-next.jgi.doe.gov/
- GREENC: http://greenc.sequentiabiotech.com/wiki/Main_Page

## Etapas de pré-processamento

- Contagem da quantidade de sequências;
- Amostragem;
- Remoção de redundância;
- Eliminação de ruídos;
- Extração de características;
- Composição da base de dados.

#### Divisão da base em treino e teste

In [23]:
#Base lncRNA e mRNA
import os 
import pandas
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (recall_score,
                             accuracy_score,
                             precision_score,
                             f1_score)

def split(finput, test_rate, random_state=None):
    """Split a CSV dataset into train and test CSV files next to the input.

    The last column of the CSV is taken as the target and every preceding
    column as a feature. The rows are partitioned with ``train_test_split``
    and each partition is written, features followed by target, to
    ``<name>_train<ext>`` and ``<name>_test<ext>`` beside ``finput``.

    Args:
        finput: Path of the CSV file to split.
        test_rate: Value forwarded to ``train_test_split`` as ``test_size``.
        random_state: Optional seed forwarded to ``train_test_split``. The
            default (``None``) keeps the original unseeded behaviour; pass an
            integer to get the same partition on every run.

    Returns:
        None. The result is the two CSV files written to disk.
    """
    dataset = pandas.read_csv(finput)
    X = dataset.iloc[:, :-1]
    y = dataset.iloc[:, -1]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_rate, random_state=random_state
    )

    # Split the path once instead of recomputing it for every output name.
    root, ext = os.path.splitext(finput)
    for suffix, features, target in (
        ("_train", X_train, y_train),
        ("_test", X_test, y_test),
    ):
        partition = pandas.concat([features, target], axis=1)
        partition.to_csv(root + suffix + ext, index=False)


# Apply the train/test split to the mRNA and lncRNA datasets (test_rate 0.3).
# Each call writes <name>_train.csv and <name>_test.csv next to its input.

split('bases/mRNA.csv',0.3)
split('bases/lncRNA.csv',0.3)

# Load the lncRNA and mRNA training files and stack them (lncRNA rows first)
# into a single training DataFrame.
# NOTE(review): concat keeps each file's own row index, so index labels repeat
# across the two halves of dadosTreino.
lncRNA_data = pandas.read_csv('bases/lncRNA_train.csv')
mRNA_data = pandas.read_csv('bases/mRNA_train.csv')
dadosTreino = pandas.concat([lncRNA_data,mRNA_data])

In [24]:
# Display the combined training DataFrame (still includes the 'nameseq' column).
dadosTreino

Unnamed: 0,nameseq,A,C,G,T,AA,AC,AG,AT,CA,...,minimum_ORF_length,std_ORF_length,average_ORF_length,cv_ORF_length,maximum_GC_content_ORF,minimum_GC_content_ORF,std_GC_content_ORF,average_GC_content_ORF,cv_GC_content_ORF,label
0,lcl|Athaliana_AT1G53980.1,0.344203,0.195652,0.257246,0.202899,0.101818,0.054545,0.109091,0.076364,0.090909,...,21,112.151683,119.000000,0.942451,57.142857,45.289855,5.478770,53.033126,0.103308,lncRNA
1,lcl|Athaliana_AT1G30757.1,0.288189,0.157480,0.218898,0.335433,0.113565,0.031546,0.059937,0.083596,0.039432,...,6,33.337816,39.300000,0.848290,52.777778,16.666667,14.259950,34.185301,0.417137,lncRNA
2,lcl|Athaliana_AT5G47455.6,0.239683,0.239683,0.200000,0.320635,0.068362,0.046105,0.054054,0.071542,0.065183,...,6,88.123404,86.625000,1.017298,52.145215,16.666667,10.946662,38.301527,0.285802,lncRNA
3,lcl|Athaliana_AT2G43780.2,0.248598,0.205607,0.224299,0.321495,0.069288,0.063670,0.061798,0.054307,0.065543,...,21,73.710159,102.750000,0.717374,48.039216,33.333333,5.346745,39.441415,0.135562,lncRNA
4,lcl|Athaliana_AT1G42888.1,0.285944,0.215825,0.205211,0.293020,0.082368,0.056628,0.074324,0.072716,0.067568,...,6,68.317351,72.947368,0.936529,54.166667,29.411765,6.097615,40.741124,0.149667,lncRNA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2100,AT3G32960.1|PACid:19661139,0.330233,0.198450,0.215504,0.255814,0.094720,0.062112,0.079193,0.094720,0.085404,...,15,220.785529,160.500000,1.375611,45.138889,33.333333,4.780372,39.731015,0.120318,mRNA
2101,AT1G18485.1|PACid:19655281,0.273601,0.168211,0.254720,0.303467,0.087912,0.040179,0.067308,0.077953,0.050824,...,9,440.395630,105.785714,4.163092,66.666667,20.000000,8.668627,39.163799,0.221343,mRNA
2102,AT4G09260.1|PACid:19647980,0.247312,0.212903,0.232258,0.307527,0.058190,0.045259,0.064655,0.079741,0.079741,...,12,191.264215,204.000000,0.937570,44.516129,16.666667,13.111499,35.209080,0.372390,mRNA
2103,AT2G19540.1|PACid:19643370,0.274468,0.215603,0.248936,0.260993,0.073101,0.061746,0.071682,0.067424,0.077360,...,9,323.161449,134.823529,2.396922,55.555556,22.222222,8.583157,44.252327,0.193959,mRNA


In [25]:
# Drop the 'nameseq' sequence-identifier column from the training DataFrame,
# modifying it in place.
# NOTE: re-running this cell raises KeyError because the column is already gone.
dadosTreino.drop(columns='nameseq', inplace=True)

In [26]:
# Display the training DataFrame again, now without the 'nameseq' column.
dadosTreino

Unnamed: 0,A,C,G,T,AA,AC,AG,AT,CA,CC,...,minimum_ORF_length,std_ORF_length,average_ORF_length,cv_ORF_length,maximum_GC_content_ORF,minimum_GC_content_ORF,std_GC_content_ORF,average_GC_content_ORF,cv_GC_content_ORF,label
0,0.344203,0.195652,0.257246,0.202899,0.101818,0.054545,0.109091,0.076364,0.090909,0.050909,...,21,112.151683,119.000000,0.942451,57.142857,45.289855,5.478770,53.033126,0.103308,lncRNA
1,0.288189,0.157480,0.218898,0.335433,0.113565,0.031546,0.059937,0.083596,0.039432,0.031546,...,6,33.337816,39.300000,0.848290,52.777778,16.666667,14.259950,34.185301,0.417137,lncRNA
2,0.239683,0.239683,0.200000,0.320635,0.068362,0.046105,0.054054,0.071542,0.065183,0.047695,...,6,88.123404,86.625000,1.017298,52.145215,16.666667,10.946662,38.301527,0.285802,lncRNA
3,0.248598,0.205607,0.224299,0.321495,0.069288,0.063670,0.061798,0.054307,0.065543,0.033708,...,21,73.710159,102.750000,0.717374,48.039216,33.333333,5.346745,39.441415,0.135562,lncRNA
4,0.285944,0.215825,0.205211,0.293020,0.082368,0.056628,0.074324,0.072716,0.067568,0.050837,...,6,68.317351,72.947368,0.936529,54.166667,29.411765,6.097615,40.741124,0.149667,lncRNA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2100,0.330233,0.198450,0.215504,0.255814,0.094720,0.062112,0.079193,0.094720,0.085404,0.037267,...,15,220.785529,160.500000,1.375611,45.138889,33.333333,4.780372,39.731015,0.120318,mRNA
2101,0.273601,0.168211,0.254720,0.303467,0.087912,0.040179,0.067308,0.077953,0.050824,0.028503,...,9,440.395630,105.785714,4.163092,66.666667,20.000000,8.668627,39.163799,0.221343,mRNA
2102,0.247312,0.212903,0.232258,0.307527,0.058190,0.045259,0.064655,0.079741,0.079741,0.034483,...,12,191.264215,204.000000,0.937570,44.516129,16.666667,13.111499,35.209080,0.372390,mRNA
2103,0.274468,0.215603,0.248936,0.260993,0.073101,0.061746,0.071682,0.067424,0.077360,0.041874,...,9,323.161449,134.823529,2.396922,55.555556,22.222222,8.583157,44.252327,0.193959,mRNA


In [27]:
# Split the training table into predictors and class: every column except
# 'label' is a predictor.
colunas = dadosTreino.columns.drop('label')

# Build the predictor matrix and the class vector (X and y).
X = dadosTreino[colunas].values
y = dadosTreino['label']

# Encode the categorical class labels as integers.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

# Build the test data: load the lncRNA and mRNA test files and stack them.
lncRNA_test = pandas.read_csv('bases/lncRNA_test.csv')
mRNA_test = pandas.read_csv('bases/mRNA_test.csv')
dados = pandas.concat([lncRNA_test,mRNA_test])

# Every test column except 'label' (still includes 'nameseq'); kept so any
# other cell that reads this name keeps working.
dadosTeste = dados.columns.drop('label')

# Select the test predictors by the training column names. This leaves out
# the categorical 'nameseq' column by name rather than by position, guarantees
# the same column order as X, and yields a numeric array instead of an
# object-dtype one.
# numpy is no longer needed by this cell; the import is kept for other cells.
import numpy as np
X_teste = dados[colunas].values

# Encode the test labels with the encoder fitted on the training labels.
# transform() (not fit_transform()) keeps the class-to-integer mapping
# identical to training and raises on a label never seen during fitting.
y_teste = dados['label']
y_teste = le.transform(y_teste)

#### Atributos utilizados: 

In [28]:
# Print the number of predictor attributes ('label' is excluded from colunas).
print("Columns size >>> %d"%len(colunas))

# Print the column names of the training DataFrame; unlike the count above,
# this listing also includes the 'label' column.
print(dadosTreino.columns)

Columns size >>> 94
Index(['A', 'C', 'G', 'T', 'AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT',
       'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT', 'AAA', 'AAC', 'AAG',
       'AAT', 'ACA', 'ACC', 'ACG', 'ACT', 'AGA', 'AGC', 'AGG', 'AGT', 'ATA',
       'ATC', 'ATG', 'ATT', 'CAA', 'CAC', 'CAG', 'CAT', 'CCA', 'CCC', 'CCG',
       'CCT', 'CGA', 'CGC', 'CGG', 'CGT', 'CTA', 'CTC', 'CTG', 'CTT', 'GAA',
       'GAC', 'GAG', 'GAT', 'GCA', 'GCC', 'GCG', 'GCT', 'GGA', 'GGC', 'GGG',
       'GGT', 'GTA', 'GTC', 'GTG', 'GTT', 'TAA', 'TAC', 'TAG', 'TAT', 'TCA',
       'TCC', 'TCG', 'TCT', 'TGA', 'TGC', 'TGG', 'TGT', 'TTA', 'TTC', 'TTG',
       'TTT', 'maximum_ORF_length', 'minimum_ORF_length', 'std_ORF_length',
       'average_ORF_length', 'cv_ORF_length', 'maximum_GC_content_ORF',
       'minimum_GC_content_ORF', 'std_GC_content_ORF',
       'average_GC_content_ORF', 'cv_GC_content_ORF', 'label'],
      dtype='object')


In [29]:
# Sanity check: shapes of the train/test predictor matrices and label vectors.
print(X.shape, y.shape, X_teste.shape, y_teste.shape)

(4209, 94) (4209,) (1805, 94) (1805,)


#### Aplica o modelo de predição com Decision Tree

In [30]:
# Instantiate a DecisionTreeClassifier with default settings.
# NOTE(review): no random_state is set, so the score may vary between runs.
clf = DecisionTreeClassifier()
# Train the decision tree on the training predictors and labels.
clf.fit(X, y)

# Predict the class of every test sample.
y_pred = clf.predict(X_teste)
#print(y_pred)

# Compute the test accuracy as a percentage rounded to two decimals.
acuracidade = round(accuracy_score(y_teste,y_pred)*100,2)
print(acuracidade)

92.8


## Seleção de características com Wrapper e Recursive feature elimination
#### Detalhes: https://scikit-learn.org/stable/modules/feature_selection.html#feature-selection

In [31]:
# Shape of the training predictor matrix before feature selection.
print(X.shape)

(4209, 94)


In [32]:
from sklearn.feature_selection import RFE

# Wrap a fresh DecisionTreeClassifier in recursive feature elimination,
# removing one feature per iteration (step=1). n_features_to_select is left
# at its default.
clf = DecisionTreeClassifier()
selector = RFE(clf, step=1)
selector = selector.fit(X, y)

# Store the training matrix reduced to the selected features. transform()
# reuses the fit above; calling fit_transform() here would repeat the whole
# elimination and, because the tree is unseeded, could end with a different
# feature subset than the one just fitted.
features = selector.transform(X)

# Predict the test set through the selector, which reduces X_teste to the
# selected features and delegates to its fitted estimator.
y_pred = selector.predict(X_teste)

# Rank of every feature (1 = selected) and the indices of the selected ones.
print(selector.ranking_)
print(selector.get_support(indices=True))

# Compute the test accuracy as a percentage rounded to two decimals.
acuracidade = round(accuracy_score(y_teste,y_pred)*100,2)
print(acuracidade)



[47 45  1 26  1  1 20 19  9 16  1 25 30 29 11 15  7 17  1  1  1  1  1  1
  1  1  1  1  1  1 31 35  1  1  1  1 27 21  1  2 18 32 33 37 39 42 46  1
  5  3  1 14  1 10  1  6  1  1  1 40  1  1 28  1  4  1  1  1 44 24  1  1
 12 22  1 38 43  1  1  1 13  8 34  1  1 23  1  1  1  1  1 36 41 48]
[ 2  4  5 10 18 19 20 21 22 23 24 25 26 27 28 29 32 33 34 35 38 47 50 52
 54 56 57 58 60 61 63 65 66 67 70 71 74 77 78 79 83 84 86 87 88 89 90]
92.91


In [33]:
# Shape of the training matrix after RFE kept only the selected features.
print(features.shape)

(4209, 47)


### Obtendo o nome dos 47 atributos selecionados

In [34]:
# Pair each predictor name with the selector's boolean keep/drop flag, then
# keep only the names whose flag is True.
temp = pandas.Series(selector.support_, index=colunas)
wrapperApproach = temp[temp].index
print(wrapperApproach)

Index(['G', 'AA', 'AC', 'CG', 'TG', 'TT', 'AAA', 'AAC', 'AAG', 'AAT', 'ACA',
       'ACC', 'ACG', 'ACT', 'AGA', 'AGC', 'ATA', 'ATC', 'ATG', 'ATT', 'CAG',
       'CGT', 'CTG', 'GAA', 'GAG', 'GCA', 'GCC', 'GCG', 'GGA', 'GGC', 'GGT',
       'GTC', 'GTG', 'GTT', 'TAG', 'TAT', 'TCG', 'TGC', 'TGG', 'TGT', 'TTT',
       'maximum_ORF_length', 'std_ORF_length', 'average_ORF_length',
       'cv_ORF_length', 'maximum_GC_content_ORF', 'minimum_GC_content_ORF'],
      dtype='object')


In [35]:
# Convert the selected feature names (wrapper approach) to a plain Python list.
featuresList = wrapperApproach.tolist()

In [36]:
# Print the list of feature names selected by the wrapper approach.
print(featuresList)

['G', 'AA', 'AC', 'CG', 'TG', 'TT', 'AAA', 'AAC', 'AAG', 'AAT', 'ACA', 'ACC', 'ACG', 'ACT', 'AGA', 'AGC', 'ATA', 'ATC', 'ATG', 'ATT', 'CAG', 'CGT', 'CTG', 'GAA', 'GAG', 'GCA', 'GCC', 'GCG', 'GGA', 'GGC', 'GGT', 'GTC', 'GTG', 'GTT', 'TAG', 'TAT', 'TCG', 'TGC', 'TGG', 'TGT', 'TTT', 'maximum_ORF_length', 'std_ORF_length', 'average_ORF_length', 'cv_ORF_length', 'maximum_GC_content_ORF', 'minimum_GC_content_ORF']


## Seleção de atributos com abordagem de filtro

In [37]:
# Filter-style feature selection: score every feature against the class and
# keep the k=42 best. The active scorer is mutual information; the
# commented-out lines are the chi2 and f_classif alternatives.
import numpy
from pandas.plotting import scatter_matrix
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif

# NOTE: this rebinds `selector`; from here on it is the SelectKBest object.
#selector = SelectKBest(score_func=chi2, k=42)
selector = SelectKBest(score_func=mutual_info_classif, k=42)
#selector = SelectKBest(score_func=f_classif, k=42)

selector.fit(X, y)
# Positional indices (columns of X) of the features the selector keeps.
cols = selector.get_support(indices=True)
print(cols)

[ 2  3  7 12 13 14 16 19 23 29 30 32 35 38 42 44 49 50 52 53 54 57 59 60
 61 62 64 65 68 70 71 78 79 80 83 84 86 87 88 90 92 93]


In [38]:
# Keep only the selected feature columns in both the train and test matrices.
# NOTE: X and X_teste are overwritten, so this cell must not be re-run without
# rebuilding them first (the selector was fitted on the original column count).
X = selector.transform(X)
X_teste = selector.transform(X_teste)

In [39]:
# Instantiate a DecisionTreeClassifier with default settings.
# NOTE(review): no random_state is set, so the score may vary between runs.
clf = DecisionTreeClassifier()
# Train the decision tree on the filter-selected training features.
clf.fit(X, y)

# Predict the class of every test sample.
y_pred = clf.predict(X_teste)
#print(y_pred)

# Compute the test accuracy as a percentage rounded to two decimals.
acuracidade = round(accuracy_score(y_teste,y_pred)*100,2)
print(acuracidade)

92.85


### Nome dos atributos

In [40]:
# Pair each predictor name with the selector's boolean keep/drop flag, then
# keep only the names whose flag is True.
temp = pandas.Series(selector.get_support(), index=colunas)
filterApproach = temp[temp].index
print(filterApproach)

Index(['G', 'T', 'AT', 'GA', 'GC', 'GG', 'TA', 'TT', 'AAT', 'AGC', 'AGG',
       'ATA', 'ATT', 'CAG', 'CCG', 'CGA', 'CTC', 'CTG', 'GAA', 'GAC', 'GAG',
       'GCC', 'GCT', 'GGA', 'GGC', 'GGG', 'GTA', 'GTC', 'TAA', 'TAG', 'TAT',
       'TGG', 'TGT', 'TTA', 'TTT', 'maximum_ORF_length', 'std_ORF_length',
       'average_ORF_length', 'cv_ORF_length', 'minimum_GC_content_ORF',
       'average_GC_content_ORF', 'cv_GC_content_ORF'],
      dtype='object')


In [41]:
# Convert the selected feature names (filter approach) to a plain Python list.
# NOTE: this overwrites the featuresList built earlier from the wrapper approach.
featuresList = filterApproach.tolist()

In [42]:
# Print the list of feature names selected by the filter approach.
print(featuresList)

['G', 'T', 'AT', 'GA', 'GC', 'GG', 'TA', 'TT', 'AAT', 'AGC', 'AGG', 'ATA', 'ATT', 'CAG', 'CCG', 'CGA', 'CTC', 'CTG', 'GAA', 'GAC', 'GAG', 'GCC', 'GCT', 'GGA', 'GGC', 'GGG', 'GTA', 'GTC', 'TAA', 'TAG', 'TAT', 'TGG', 'TGT', 'TTA', 'TTT', 'maximum_ORF_length', 'std_ORF_length', 'average_ORF_length', 'cv_ORF_length', 'minimum_GC_content_ORF', 'average_GC_content_ORF', 'cv_GC_content_ORF']


### Avaliação entre as features (Filter x Wrapper)

In [43]:
# Compare the two selections by feature name.
# diff: names chosen by the wrapper approach but NOT by the filter approach
# (one-sided difference; filter-only names are not listed).
# equal: names chosen by both approaches.
# NOTE: both lists come from sets, so their printed order is arbitrary.
diff = list(set(wrapperApproach) - set(filterApproach))
equal = list(set(wrapperApproach).intersection(filterApproach))
print("k-mers diferentes:")
print(diff)
print("k-mers iguais:")
print(equal)

k-mers diferentes:
['ACA', 'AAG', 'ATC', 'ACC', 'AA', 'GTT', 'ATG', 'ACT', 'CGT', 'TGC', 'AC', 'GCG', 'GCA', 'AAA', 'ACG', 'GTG', 'TG', 'maximum_GC_content_ORF', 'GGT', 'TCG', 'AGA', 'AAC', 'CG']
k-mers iguais:
['cv_ORF_length', 'maximum_ORF_length', 'CAG', 'GCC', 'ATT', 'AGC', 'GAG', 'GAA', 'AAT', 'G', 'TAT', 'ATA', 'GGA', 'TGT', 'TT', 'TGG', 'CTG', 'GTC', 'GGC', 'std_ORF_length', 'minimum_GC_content_ORF', 'TTT', 'average_ORF_length', 'TAG']
