In [104]:
import os
import numpy as np
import pandas as pd
import deepdish as dd

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, roc_curve, auc, matthews_corrcoef, make_scorer
from sklearn.ensemble import RandomForestClassifier

In [105]:
train = dd.io.load('../../splits/train.h5')
val = dd.io.load('../../splits/val.h5')
test = dd.io.load('../../splits/test.h5')
y_train = np.load('../../splits/y_train.npy')
y_val = np.load('../../splits/y_val.npy')
y_test = np.load('../../splits/y_test.npy')

In [106]:
y_train = y_train * 100
y_val = y_val * 100
y_test = y_test * 100

In [4]:
df_feat = pd.read_csv('../../datasets/dataset_extracted_features.csv')

In [5]:
df_feat

Unnamed: 0,Seq1,Seq2,Aln,GC1,GC2,SingleCon1,SingleCon2,PairCon1,PairCon2,SingleMFE1,SingleMFE2,Yield,Label
0,AGTACAAGTAGGACAGGAAGATA,TATCTTCCTGTCCTACTCGTACT,106,39.130435,43.478261,0.999957,0.999974,0.000022,0.000013,0.000000,0.000000,0.974025,1
1,GAGTTCCGGTTGCCTTTCA,GAGTTCCGGGTGCCTTTCA,12,52.631579,57.894737,0.999844,0.999739,0.000078,0.000130,0.000000,0.000000,0.000193,0
2,TCTGGAACTAGTGCAATTTAGC,TCTAAATTGCACTAGTTCCAGA,101,40.909091,36.363636,0.999556,0.999598,0.000222,0.000201,0.000000,0.000000,0.991153,1
3,GGCGCGCGACGCGACATCCGATAAGA,TCTTATCGGATCTCGCGTCGCGCGCC,121,65.384615,65.384615,0.999455,0.999183,0.000273,0.000408,-2.595847,-1.870977,0.999227,1
4,GTGTACCATACCATGAACGCCGGA,TCCGGCGTCATGGTATGGTACAC,110,54.166667,56.521739,0.998893,0.999652,0.000554,0.000174,0.000000,0.000000,0.996585,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2556971,AGCAAGGCACAGGACTTAGTACCAAA,TTTGGTACTAAGTCCTGTGCCGCT,113,46.153846,50.000000,0.999938,0.999916,0.000031,0.000042,-0.317577,0.000000,0.995240,1
2556972,CAGGCTGAGACCTAACTCTTTAGCC,GGCTAAAAGGTTAGGTCTCAGCCTG,110,52.000000,52.000000,0.999786,0.999896,0.000107,0.000052,0.000000,0.000000,0.973733,1
2556973,CATATCAACTCCAGGAATCCTT,AAGGATTCCTGCAGATGATATG,92,40.909091,40.909091,0.999334,0.998677,0.000333,0.000662,0.000000,0.000000,0.529762,1
2556974,GAATGCAGAGTTCGATGTCGGT,ACCAGTACACATCGAACTCTGCATTC,96,50.000000,46.153846,0.999855,0.999893,0.000073,0.000053,0.000000,0.000000,0.984665,1


In [6]:
train_features = df_feat.loc[(df_feat['Seq1'].isin([item[0] for item in train])) & (df_feat['Seq2'].isin([item[1] for item in train]))]
val_features = df_feat.loc[(df_feat['Seq1'].isin([item[0] for item in val])) & (df_feat['Seq2'].isin([item[1] for item in val]))]
test_features = df_feat.loc[(df_feat['Seq1'].isin([item[0] for item in test])) & (df_feat['Seq2'].isin([item[1] for item in test]))]

In [9]:
train_feat = train_features[['Aln', 'GC1', 'GC2', 'SingleCon1', 'SingleCon2', 'PairCon1', 'PairCon2', 'SingleMFE1', 'SingleMFE2']].values
val_feat = val_features[['Aln', 'GC1', 'GC2', 'SingleCon1', 'SingleCon2', 'PairCon1', 'PairCon2', 'SingleMFE1', 'SingleMFE2']].values
test_feat = test_features[['Aln', 'GC1', 'GC2', 'SingleCon1', 'SingleCon2', 'PairCon1', 'PairCon2', 'SingleMFE1', 'SingleMFE2']].values

In [10]:
train_labels = train_features[['Label']].values.squeeze()
val_labels = val_features[['Label']].values.squeeze()
test_labels = test_features[['Label']].values.squeeze()

In [36]:
scaler = MinMaxScaler()
scaler = scaler.fit(np.concatenate((train_feat, val_feat, test_feat), axis=0))
scaled_train = scaler.transform(train_feat)
scaled_val = scaler.transform(val_feat)
scaled_test = scaler.transform(test_feat)

## RandomForest

In [16]:
n_estimators = [30, 100, 200]
max_features = [None, 'auto', 'sqrt']
max_depth = [None, 10, 30, 100]
min_samples_split = [2, 5]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [17]:
# Create a list where train data indices are -1 and validation data indices are 0
scaled_train_val = np.concatenate((scaled_train, scaled_val))
labels_train_val = np.concatenate((train_labels, val_labels))
split_index = [-1 if x in range(len(scaled_train)) else 0 for x in range(len(scaled_train_val))]

# Use the list to create PredefinedSplit
pds = PredefinedSplit(test_fold=split_index)

In [18]:
grid_search = GridSearchCV(estimator=RandomForestClassifier(n_jobs=4), param_grid=grid, cv=pds, verbose=10, n_jobs=8)

In [19]:
grid_search.fit(scaled_train_val, labels_train_val)

Fitting 1 folds for each of 432 candidates, totalling 432 fits


GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             estimator=RandomForestClassifier(n_jobs=4), n_jobs=8,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [None, 10, 30, 100],
                         'max_features': [None, 'auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5],
                         'n_estimators': [30, 100, 200]},
             verbose=10)

In [22]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}