In [128]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import multilabel_confusion_matrix, roc_auc_score, classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

In [121]:
X = pd.read_csv('../data/PromoterTrain.csv', index_col='id')
y = pd.read_csv('../data/SigmaTrain.csv',index_col='id')

In [122]:
X.head()

Unnamed: 0_level_0,SEQ
id,Unnamed: 1_level_1
0,CAAACGCATCAGGATCAAAGTGAACATCACGAAACTTCTTACAATG...
1,CCGGTAAACTCTGTGGAAAGAGCAATGTGAAATCAGCGAGATAATG...
2,GGAATTTTCTCGAGCATAGCCAGAGCCGCAGAATTTGCTACGGTTA...
3,TCACCAATACCGCCTACGTCTACGCCCAGCAGTTTCAGCTTGGCGC...
4,GCACGGTATCGTGCTTGGTAACCTGGTAGGATTGATCGATTCTGAC...


In [123]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3399 entries, 0 to 3398
Data columns (total 1 columns):
SEQ    3399 non-null object
dtypes: object(1)
memory usage: 53.1+ KB


In [124]:
y.head()

Unnamed: 0_level_0,RPOS,RPOD,RPOH,RPON,RPOF
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,0,0,0,0
1,0,0,0,0,0
2,1,1,1,0,0
3,0,0,0,0,0
4,1,0,0,0,0


In [125]:
y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3399 entries, 0 to 3398
Data columns (total 5 columns):
RPOS    3399 non-null int64
RPOD    3399 non-null int64
RPOH    3399 non-null int64
RPON    3399 non-null int64
RPOF    3399 non-null int64
dtypes: int64(5)
memory usage: 159.3 KB


In [126]:
X['SEQ'].iloc[0]

'CAAACGCATCAGGATCAAAGTGAACATCACGAAACTTCTTACAATGGCGCA'

In [10]:
#generates overlapping k-mers of len=length from each sequence
# def create_kmers_word(seq, length):
#     k_mers = [seq[i:i+length].lower() for i in range(len(seq) - length + 1)]
#     kmers_word = ' '.join(k_mers).lower()
#     return kmers_word

In [12]:
# db_X['promoter_train'] = promoter_train.apply(lambda i : create_kmers_word(i['SEQ'], 6), axis=1)
# db_y['promoter_test'] = promoter_test.apply(lambda i : create_kmers_word(i['SEQ'], 6), axis=1)

In [12]:
#define metrics for the models
def get_metrics(y_test, y_predicted):
    accuracy = accuracy_score(y_test, y_predicted)
    precision = precision_score(y_test, y_predicted, average='weighted')
    recall = recall_score(y_test, y_predicted, average='weighted')
    f1 = f1_score(y_test, y_predicted, average='weighted')
    return accuracy, precision, recall, f1

In [127]:
X_train, X_test, y_train, y_test = train_test_split(X['SEQ'], y, test_size = 0.20, random_state=42)

In [118]:
#build the model with MultinomialNB
steps = [('cv', CountVectorizer(analyzer='char',ngram_range=(3,7))),
         ('ovr', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None)))]

In [120]:
pipeline_ovr = Pipeline(steps)
ovr_clf = pipeline_ovr.fit(X_train, y_train)

In [49]:
y_pred_test = ovr_clf.predict(X_test)
y_pred_test

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [1, 1, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]])

In [50]:
multilabel_confusion_matrix(y_test, y_pred_test)

array([[[239, 103],
        [106, 232]],

       [[344, 104],
        [ 94, 138]],

       [[547,  45],
        [ 65,  23]],

       [[628,   7],
        [ 41,   4]],

       [[666,   0],
        [ 14,   0]]])

In [51]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.69      0.69      0.69       338
           1       0.57      0.59      0.58       232
           2       0.34      0.26      0.29        88
           3       0.36      0.09      0.14        45
           4       0.00      0.00      0.00        14

   micro avg       0.61      0.55      0.58       717
   macro avg       0.39      0.33      0.34       717
weighted avg       0.58      0.55      0.56       717
 samples avg       0.30      0.29      0.28       717



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [52]:
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_test)
print(f'accuracy = {accuracy:.2f} \nprecision = {precision:.2f} \nrecall = {recall:.2f} \nf1 = {f1:.2f}')

accuracy = 0.46 
precision = 0.58 
recall = 0.55 
f1 = 0.56


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [53]:
auc_score_test = roc_auc_score(y_test, y_pred_test)
auc_score_test

0.6011121060656911

In [67]:
#build model with RandomForestClassifier
steps_rfc = [('cv', CountVectorizer(ngram_range=(1,2))),
         ('rfc', RandomForestClassifier())]
params_rfc = {'rfc__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90],
              'rfc__max_features': ['auto', 'sqrt'],
              'rfc__min_samples_leaf': [1, 2, 4],
              'rfc__min_samples_split': [2, 5, 10],
              'rfc__n_estimators': [50, 100, 200, 400, 600, 800, 1000]}
pipeline_rfc = Pipeline(steps_rfc)

In [68]:
grid_search_rfc = RandomizedSearchCV(pipeline_rfc, params_rfc, cv=5)

In [69]:
rfc_clf = grid_search_rfc.fit(X_train, y_train)

In [75]:
y_pred_rfc = rfc_clf.predict(X_test)

In [76]:
multilabel_confusion_matrix(y_test, y_pred_rfc)

array([[[342,   0],
        [338,   0]],

       [[448,   0],
        [232,   0]],

       [[592,   0],
        [ 88,   0]],

       [[635,   0],
        [ 45,   0]],

       [[666,   0],
        [ 14,   0]]])

In [77]:
print(classification_report(y_test, y_pred_rfc))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       338
           1       0.00      0.00      0.00       232
           2       0.00      0.00      0.00        88
           3       0.00      0.00      0.00        45
           4       0.00      0.00      0.00        14

   micro avg       0.00      0.00      0.00       717
   macro avg       0.00      0.00      0.00       717
weighted avg       0.00      0.00      0.00       717
 samples avg       0.00      0.00      0.00       717



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [78]:
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_rfc)
print(f'accuracy = {accuracy:.2f} \nprecision = {precision:.2f} \nrecall = {recall:.2f} \nf1 = {f1:.2f}')

accuracy = 0.47 
precision = 0.00 
recall = 0.00 
f1 = 0.00


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [79]:
auc_score_rfc = roc_auc_score(y_test, y_pred_rfc)
auc_score_rfc

0.5

In [83]:
#build model with kNN
steps_knn = [('cv', CountVectorizer(analyzer='char', ngram_range=(3,7))),
         ('knn', KNeighborsClassifier())]
params_knn = {'knn__n_neighbors': np.arange(1,35),
              'knn__leaf_size': [10,20,30,40],
              'knn__p': [1,2]}
pipeline_knn = Pipeline(steps_knn)

In [84]:
knn_cv = RandomizedSearchCV(pipeline_knn, params_knn, cv=5)

In [85]:
knn_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('cv',
                                              CountVectorizer(analyzer='char',
                                                              binary=False,
                                                              decode_error='strict',
                                                              dtype=<class 'numpy.int64'>,
                                                              encoding='utf-8',
                                                              input='content',
                                                              lowercase=True,
                                                              max_df=1.0,
                                                              max_features=None,
                                                              min_df=1,
                                            

In [100]:
y_pred_knn = knn_cv.predict(X_test)

In [102]:
multilabel_confusion_matrix(y_test, y_pred_knn)

array([[[268,  74],
        [203, 135]],

       [[413,  35],
        [188,  44]],

       [[582,  10],
        [ 81,   7]],

       [[631,   4],
        [ 44,   1]],

       [[666,   0],
        [ 14,   0]]])

In [103]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.65      0.40      0.49       338
           1       0.56      0.19      0.28       232
           2       0.41      0.08      0.13        88
           3       0.20      0.02      0.04        45
           4       0.00      0.00      0.00        14

   micro avg       0.60      0.26      0.36       717
   macro avg       0.36      0.14      0.19       717
weighted avg       0.55      0.26      0.34       717
 samples avg       0.19      0.15      0.16       717



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [104]:
accuracy, precision, recall, f1 = get_metrics(y_test, y_pred_knn)
print(f'accuracy = {accuracy:.2f} \nprecision = {precision:.2f} \nrecall = {recall:.2f} \nf1 = {f1:.2f}')

accuracy = 0.45 
precision = 0.55 
recall = 0.26 
f1 = 0.34


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [105]:
auc_score_knn = roc_auc_score(y_test, y_pred_knn)
auc_score_knn

0.5373140759708973