In [1]:
import pandas as pd
import os
from scipy.stats import kruskal
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
maple_train_df = pd.read_csv('maple_train.csv')
maple_test_df = pd.read_csv('maple_test.csv')
maple_eval_df = pd.read_csv('maple_eval.csv')


In [3]:

path = './PathwayFeatures'
pathway_names = []
for i, attrlist in enumerate(os.listdir(path)):
    attributes = pd.read_csv(path+'/'+ attrlist)
    attributes = attributes["Attrib"].tolist()
    pathway_name = attrlist.split('-')[0]
    pathway_names.append(pathway_name)

    maple_train_df.loc[maple_train_df['Path'] == pathway_name.replace('_', ' '), 'Class'] = 'positive'
    maple_test_df.loc[maple_test_df['Path'] == pathway_name.replace('_', ' '), 'Class'] = 'positive'
    maple_eval_df.loc[maple_eval_df['Path'] == pathway_name.replace('_', ' '), 'Class'] = 'positive'

    maple_train_df.loc[maple_train_df['Path'] != pathway_name.replace('_', ' '), 'Class'] = 'negative'
    maple_test_df.loc[maple_test_df['Path'] != pathway_name.replace('_', ' '), 'Class'] = 'negative'
    maple_eval_df.loc[maple_eval_df['Path'] != pathway_name.replace('_', ' '), 'Class'] = 'negative'

#     train_df_prep = maple_train_df[attributes]
#     test_df_prep = maple_test_df[attributes]
#     eval_df_prep = maple_eval_df[attributes]

#     train_df_prep.to_csv('Datasets/'+pathway_name+'_train.csv', index=False)
#     test_df_prep.to_csv('Datasets/'+pathway_name+'_test.csv', index=False)
#     eval_df_prep.to_csv('Datasets/'+pathway_name+'_eval.csv', index=False)

In [4]:
# https://stackoverflow.com/questions/31324218/scikit-learn-how-to-obtain-true-positive-true-negative-false-positive-and-fal
# by: @invoktheshell
def perf_measure(y_actual, y_hat):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)):
        if y_actual[i]==y_hat[i]==1:
           TP += 1
        if y_hat[i]==1 and y_actual[i]!=y_hat[i]:
           FP += 1
        if y_actual[i]==y_hat[i]==0:
           TN += 1
        if y_hat[i]==0 and y_actual[i]!=y_hat[i]:
           FN += 1

    return(TP, FP, TN, FN)

from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, recall_score, precision_score

def calc_metrics(y_true, y_pred, model_name, pw_name, dataset_name, fold_num):
    acc = accuracy_score(y_true, y_pred)
    tp, fp, tn, fn = perf_measure(y_true, y_pred)
    recall = precision_score(y_true, y_pred)
    specifity = tn / (tn + fp)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    # print(precision, specifity, tp,fp,tn,fn)

    try:
        fpr = fp / (fp + tn)
    except:
        fpr = 0

    try:
        tpr = tp / (tp + fn)
    except:
        tpr = 0

    try:
        fnr = fn / (fn + tp)
    except:
        fnr = 0

    try:
        fdr = fp / (fp + tp)
    except:
        fdr = 0

    return model_name, pw_name, dataset_name, fold_num, acc, f1, mcc, tp, fp, tn, fn, recall, precision, specifity, fpr, tpr, fnr, fdr

In [5]:
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [6]:
#.3 Metabolic pathway modeling and analysis
# The 10-fold cross-validation and bagging were used on all training process,
# and the metrics accuracy, percentage of correctly classified instances (CCI),
# true positive (TP) rate, false positive (FP) rate, false negative (FN) rate, recall,
# specificity, F-measure, false discovery rate (FDR), and Matthews coefficient correlation (MCC)
# were used to evaluate the predictive performances.
from tqdm import tqdm
res = []

for pathway_name in tqdm(pathway_names):
    train_df = pd.read_csv('Datasets/'+ pathway_name+'_train.csv')
    # test_df = pd.read_csv('Datasets/'+ pathway_name+'_test.csv')
    # eval_df = pd.read_csv('Datasets/'+ pathway_name+'_eval.csv')

    print('pathway: ', pathway_name)
    # print(len(train_df[train_df["Class"]=='positive']))

    train_df["Class"] = train_df["Class"].apply(lambda x: 1 if x=='positive' else 0)
    # test_df["Class"] = test_df["Class"].apply(lambda x: 1 if x=='positive' else 0)
    # eval_df["Class"] = eval_df["Class"].apply(lambda x: 1 if x=='positive' else 0)

    names = ['svc', 'nb', 'dt', 'mlp', 'knn', 'rf', 'sgd']
    models = [BaggingClassifier(SVC()),
              BaggingClassifier(GaussianNB(), n_jobs=-1),
              BaggingClassifier(DecisionTreeClassifier()),
#               BaggingClassifier(MLPClassifier()),
              BaggingClassifier(KNeighborsClassifier(n_jobs=-1),n_jobs=-1),
              BaggingClassifier(RandomForestClassifier(n_jobs=-1),n_jobs=-1),
              BaggingClassifier(SGDClassifier(n_jobs=-1),n_jobs=-1),]

    # 'shuffle' is added, bc 'random_state' is set
    skf = StratifiedKFold(n_splits=10, random_state=1337, shuffle=True)
    X = train_df.drop('Class', axis=1)
    y = train_df["Class"]

    # res = []
    for i in tqdm(range(len(models))):
        fold_num=0
        for train_index, test_index in skf.split(X, y):
            fold_num+=1
            X_train, X_test = X.loc[train_index], X.loc[test_index]
            y_train, y_test = y.loc[train_index], y.loc[test_index]
            models[i].fit(X_train, y_train)
            preds = models[i].predict(X_test)
            # print(y_test[y_test[]])
            # res.append(calc_metrics(y_test.tolist(), preds.tolist(), names[i], pathway_name, 'test_df', fold_num))
            res.append(calc_metrics(y_test.tolist(), preds.tolist(), names[i], pathway_name, 'train_df', fold_num))

res_df = pd.DataFrame(res, columns=["model_name", "pw_name", "dataset_name", "fold_num", "acc", "f1", "mcc", "tp", "fp", "tn", "fn", "recall", "precision", "specifity", "fpr", "tpr", "fnr", "fdr"])
# res_dict[pathway_name] = res_df

file_name = '10fold_results.csv'
res_df.to_csv(file_name, index=False)
print(file_name, "is done.")

del y_test, y_train, X_test, X_train, X, y

  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  energy



 17%|█▋        | 1/6 [00:06<00:33,  6.80s/it][A
 33%|███▎      | 2/6 [00:09<00:18,  4.64s/it][A
 50%|█████     | 3/6 [00:21<00:23,  7.79s/it][A
 67%|██████▋   | 4/6 [00:28<00:15,  7.65s/it][A
 83%|████████▎ | 5/6 [00:39<00:08,  8.53s/it][A
100%|██████████| 6/6 [00:41<00:00,  6.92s/it][A
  8%|▊         | 1/12 [00:41<07:38, 41.64s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  protein_modification



 17%|█▋        | 1/6 [00:43<03:36, 43.38s/it][A
 33%|███▎      | 2/6 [00:45<01:17, 19.28s/it][A
 50%|█████     | 3/6 [01:17<01:15, 25.00s/it][A
 67%|██████▋   | 4/6 [01:24<00:35, 17.94s/it][A
 83%|████████▎ | 5/6 [01:40<00:17, 17.04s/it][A
100%|██████████| 6/6 [01:44<00:00, 17.34s/it][A
 17%|█▋        | 2/12 [02:25<13:04, 78.44s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  amino_acid



 17%|█▋        | 1/6 [00:33<02:48, 33.61s/it][A
 33%|███▎      | 2/6 [00:36<01:01, 15.32s/it][A
 50%|█████     | 3/6 [01:00<00:57, 19.23s/it][A
 67%|██████▋   | 4/6 [01:07<00:28, 14.49s/it][A
 83%|████████▎ | 5/6 [01:20<00:14, 14.04s/it][A
100%|██████████| 6/6 [01:24<00:00, 14.04s/it][A
 25%|██▌       | 3/12 [03:50<12:10, 81.15s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  nitrogen



 17%|█▋        | 1/6 [00:05<00:26,  5.31s/it][A
 33%|███▎      | 2/6 [00:07<00:14,  3.65s/it][A
 50%|█████     | 3/6 [00:14<00:14,  4.92s/it][A
 67%|██████▋   | 4/6 [00:21<00:11,  5.85s/it][A
 83%|████████▎ | 5/6 [00:31<00:07,  7.25s/it][A
100%|██████████| 6/6 [00:33<00:00,  5.66s/it][A
 33%|███▎      | 4/12 [04:24<08:20, 62.58s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  aromatic_compound


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 17%|█▋        | 1/6 [00:08<00:41,  8.27s/it][A
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  

  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 6/6 [00:43<00:00,  7.24s/it][A
 42%|████▏     | 5/12 [05:07<06:30, 55.74s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  secondary_metabolite


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 17%|█▋        | 1/6 [00:18<01:33, 18.68s/it][A
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 50%|█████     | 3/6 [00:37<00:37, 12.44s/it][A
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn

  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 83%|████████▎ | 5/6 [00:55<00:10, 10.48s/it][A
100%|██████████| 6/6 [00:57<00:00,  9.64s/it][A
 50%|█████     | 6/12 [06:05<05:38, 56.50s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  carbohydrate



 17%|█▋        | 1/6 [00:37<03:07, 37.46s/it][A
 33%|███▎      | 2/6 [00:40<01:07, 16.93s/it][A
 50%|█████     | 3/6 [01:01<00:56, 18.99s/it][A
 67%|██████▋   | 4/6 [01:08<00:28, 14.35s/it][A
 83%|████████▎ | 5/6 [01:22<00:14, 14.08s/it][A
100%|██████████| 6/6 [01:26<00:00, 14.34s/it][A
 58%|█████▊    | 7/12 [07:32<05:30, 66.19s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  lipid



 17%|█▋        | 1/6 [00:27<02:17, 27.45s/it][A
 33%|███▎      | 2/6 [00:29<00:50, 12.66s/it][A
 50%|█████     | 3/6 [00:45<00:41, 13.98s/it][A
 67%|██████▋   | 4/6 [00:51<00:22, 11.09s/it][A
 83%|████████▎ | 5/6 [01:04<00:11, 11.57s/it][A
100%|██████████| 6/6 [01:07<00:00, 11.23s/it][A
 67%|██████▋   | 8/12 [08:39<04:26, 66.62s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  carotenoid



 17%|█▋        | 1/6 [00:07<00:39,  7.92s/it][A
 33%|███▎      | 2/6 [00:10<00:18,  4.72s/it][A
 50%|█████     | 3/6 [00:19<00:20,  6.90s/it][A
 67%|██████▋   | 4/6 [00:27<00:14,  7.04s/it][A
 83%|████████▎ | 5/6 [00:37<00:08,  8.05s/it][A
100%|██████████| 6/6 [00:39<00:00,  6.64s/it][A
 75%|███████▌  | 9/12 [09:19<02:54, 58.32s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  amine_and_polyamine



 17%|█▋        | 1/6 [00:10<00:51, 10.29s/it][A
 33%|███▎      | 2/6 [00:12<00:22,  5.70s/it][A
 50%|█████     | 3/6 [00:26<00:27,  9.26s/it][A
 67%|██████▋   | 4/6 [00:33<00:16,  8.44s/it][A
 83%|████████▎ | 5/6 [00:44<00:09,  9.33s/it][A
100%|██████████| 6/6 [00:47<00:00,  7.88s/it][A
 83%|████████▎ | 10/12 [10:07<01:49, 54.97s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  phospholipid



 17%|█▋        | 1/6 [00:13<01:07, 13.46s/it][A
 33%|███▎      | 2/6 [00:15<00:27,  6.89s/it][A
 50%|█████     | 3/6 [00:26<00:25,  8.47s/it][A
 67%|██████▋   | 4/6 [00:32<00:15,  7.60s/it][A
 83%|████████▎ | 5/6 [00:42<00:08,  8.67s/it][A
100%|██████████| 6/6 [00:45<00:00,  7.63s/it][A
 92%|█████████▏| 11/12 [10:53<00:52, 52.19s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  nucleotide_sugar



 17%|█▋        | 1/6 [00:10<00:54, 10.97s/it][A
 33%|███▎      | 2/6 [00:13<00:23,  5.90s/it][A
 50%|█████     | 3/6 [00:32<00:36, 12.18s/it][A
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 83%|████████▎ | 5/6 [00:50<00:10, 10.31s/it][A
100%|██████████| 6/6 [00:53<00:00,  8.94s/it][A
100%|██████████| 12/12 [11:46<00:00, 58.90s/it]

10fold_results.csv is done.





In [29]:
res_df[res_df["model_name"] == "mlp"]

Unnamed: 0,model_name,pw_name,dataset_name,fold_num,acc,f1,mcc,tp,fp,tn,fn,recall,precision,specifity,fpr,tpr,fnr,fdr,score


In [30]:
# res_df.loc[res_df["model_name"] == "rf", "model_name"] = "sgd"
# res_df.loc[res_df["model_name"] == "knn", "model_name"] = "rf"
# res_df.loc[res_df["model_name"] == "mlp", "model_name"] = "knn"
res_df.to_csv('10fold_results.csv', index=False)

In [None]:
# The best classifiers for each metabolic pathway were ranked using the Kruskal-Wallis test.
# Bu test algoritmaları nasıl sıralıyor anlamadım, benim anladığım fark var mı diye bakıyor dağılımlarda


res_df = pd.read_csv('10fold_results.csv')
res_rows = res_df.drop('fold_num', axis=1).groupby('model_name').mean()
for i in range(len(res_rows.index)):
    for j in range(i+1, len(res_rows.index)):
        # print(pw_name, "-", str(res_rows.index[i]), "-", str(res_rows.index[j]))
        stat, p = kruskal(res_rows.loc[str(res_rows.index[i])], res_rows.loc[str(res_rows.index[j])])
        print('Statistics=%.3f, p=%.3f' % (stat, p))
        if p < 0.05:
            print("statistically significant p value is found: ", str(res_rows.index[i], str(res_rows.index[j])))
        print("")

In [45]:
res_df.groupby(['pw_name','model_name'], as_index=False).mean().sort_values(["pw_name", "score"], ascending=False)

Unnamed: 0,pw_name,model_name,fold_num,acc,f1,mcc,tp,fp,tn,fn,recall,precision,specifity,fpr,tpr,fnr,fdr,score
67,secondary_metabolite,knn,5.5,0.986701,0.395663,0.489199,2.6,0.0,539.0,7.3,1.000000,1.000000,1.000000,0.000000,0.263333,0.736667,0.000000,1.382364
71,secondary_metabolite,svc,5.5,0.983969,0.192424,0.289982,1.1,0.0,539.0,8.8,0.800000,0.800000,1.000000,0.000000,0.113333,0.886667,0.000000,1.176393
69,secondary_metabolite,rf,5.5,0.982329,0.038182,0.064451,0.2,0.0,539.0,9.7,0.200000,0.200000,1.000000,0.000000,0.021111,0.978889,0.000000,1.020511
70,secondary_metabolite,sgd,5.5,0.983422,0.260430,0.318543,1.7,0.9,538.1,8.2,0.656667,0.656667,0.998330,0.001670,0.174444,0.825556,0.343333,0.343852
66,secondary_metabolite,dt,5.5,0.983058,0.241282,0.274227,1.6,1.0,538.0,8.3,0.485000,0.485000,0.998145,0.001855,0.163333,0.836667,0.315000,0.224340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,amine_and_polyamine,svc,5.5,0.994899,0.916997,0.918930,16.0,0.0,530.1,2.8,1.000000,1.000000,1.000000,0.000000,0.851170,0.148830,0.000000,1.911896
3,amine_and_polyamine,rf,5.5,0.992530,0.875477,0.879670,14.7,0.0,530.1,4.1,1.000000,1.000000,1.000000,0.000000,0.781287,0.218713,0.000000,1.868007
4,amine_and_polyamine,sgd,5.5,0.994352,0.910586,0.911025,16.1,0.4,529.7,2.7,0.977341,0.977341,0.999245,0.000755,0.856433,0.143567,0.022659,1.504939
0,amine_and_polyamine,dt,5.5,0.992713,0.881525,0.883023,15.3,0.5,529.6,3.5,0.969722,0.969722,0.999057,0.000943,0.813158,0.186842,0.030278,1.374238


In [46]:
# The best classifiers were pre-selected, seeking a low FP & high F-measure and CCI

best_models = {}    # {"pathway": [best 3 models]}
models = []
scores = {}
res_df["score"] =  res_df["acc"] + res_df["f1"] - res_df["fp"]
best_df = res_df.groupby(['pw_name','model_name'], as_index=False).mean().sort_values(["pw_name", "score"], ascending=False)
for pw in pathway_names:
    best_models[pw] = best_df[best_df["pw_name"] == pw].iloc[:3]["model_name"].tolist()
print(best_models)

{'energy': ['sgd', 'svc', 'rf'], 'protein_modification': ['rf', 'svc', 'dt'], 'amino_acid': ['rf', 'svc', 'knn'], 'nitrogen': ['svc', 'sgd', 'rf'], 'aromatic_compound': ['sgd', 'knn', 'svc'], 'secondary_metabolite': ['knn', 'svc', 'rf'], 'carbohydrate': ['rf', 'svc', 'knn'], 'lipid': ['svc', 'rf', 'knn'], 'carotenoid': ['knn', 'svc', 'sgd'], 'amine_and_polyamine': ['knn', 'svc', 'rf'], 'phospholipid': ['knn', 'svc', 'rf'], 'nucleotide_sugar': ['knn', 'svc', 'sgd']}


In [47]:
# estimators for bagging classifier
bagging_cls = {'svc': ('svc', BaggingClassifier(SVC())),
               'dt': ('dt', BaggingClassifier(DecisionTreeClassifier())),
#                'mlp': ('mlp', BaggingClassifier(MLPClassifier())),
               'nb' : ('nb', BaggingClassifier(GaussianNB())),
               'rf': ('rf', BaggingClassifier(RandomForestClassifier())),
               'knn': ('knn', BaggingClassifier(KNeighborsClassifier())),
               'sgd': ('sgd', BaggingClassifier(SGDClassifier()))}

In [58]:
params_rf = {'rf__base_estimator__bootstrap': [True],
             'rf__base_estimator__max_depth': [80, 100],
             'rf__base_estimator__max_features': [2, 3],
             'rf__base_estimator__min_samples_leaf': [3, 4],
             'rf__base_estimator__min_samples_split': [8, 10],
             'rf__base_estimator__n_estimators': [100, 200],
             'rf__base_estimator__n_jobs': [-1]
            }

params_SVC={'svc__base_estimator__C': [1, 100, 1000],
            'svc__base_estimator__gamma': [1, 0.01, 0.0001],
            'svc__base_estimator__kernel': ['rbf']
           }


params_MLP = {'mlp__base_estimator__hidden_layer_sizes': [(10,30,10),(20,)],
              'mlp__base_estimator__activation': ['tanh', 'relu'],
              'mlp__base_estimator__solver': ['sgd', 'adam'],
              'mlp__base_estimator__alpha': [0.0001, 0.05],
              'mlp__base_estimator__learning_rate': ['constant','adaptive']
             }

params_DT = { 
             'dt__base_estimator__max_depth': list(range(6,9)),
             'dt__base_estimator__min_samples_leaf': list(range(4,10,2))
            }

params_KNN = { 
             'knn__base_estimator__n_neighbors': list(range(3, 7)),
             'knn__base_estimator__n_jobs': [-1]
             }

params_SGD = {
    'sgd__base_estimator__penalty': ['l2'],
    'sgd__base_estimator__n_jobs': [-1],
#     'sgd__base_estimator__C': [1e-7, 1e-5, 1e-3],
}

all_params = {"rf": params_rf, 
              "svc": params_SVC, 
#               "mlp": params_MLP, 
              "dt": params_DT, 
              "knn": params_KNN, 
              "sgd": params_SGD}


In [59]:
voting_models_w_params = {}    # ["pathway": (voting model, parameters)]

for pw_name, models in best_models.items():

    estimators = []
    parameters = {}

    for model in models:
        # model's estimator
        estimators.append(bagging_cls[model])

        # there is no param for NB
        if model in all_params.keys():
            for param, values in all_params[model].items():
                parameters[param] = values

    # pramameters =

    voting_model = VotingClassifier(estimators=estimators, voting='soft')
    voting_models_w_params[pw_name] = (voting_model, parameters)

In [56]:
voting_models_w_params

{'energy': (VotingClassifier(estimators=[('sgd',
                                BaggingClassifier(base_estimator=SGDClassifier())),
                               ('svc', BaggingClassifier(base_estimator=SVC())),
                               ('rf',
                                BaggingClassifier(base_estimator=RandomForestClassifier()))],
                   voting='soft'),
  {'sgd__base_estimator__penalty': ['l2'],
   'sgd__base_estimator__n_jobs': [-1],
   'sgd__base_estimator__C': [1e-07, 1e-05, 0.001],
   'svc__base_estimator__C': [1, 100, 1000],
   'svc__base_estimator__gamma': [1, 0.01, 0.0001],
   'svc__base_estimator__kernel': ['rbf'],
   'rf__base_estimator__bootstrap': [True],
   'rf__base_estimator__max_depth': [80, 100],
   'rf__base_estimator__max_features': [2, 3],
   'rf__base_estimator__min_samples_leaf': [3, 4],
   'rf__base_estimator__min_samples_split': [8, 10],
   'rf__base_estimator__n_estimators': [100, 200],
   'rf__base_estimator__n_jobs': [-1]}),
 'protein_

In [None]:
import json

# parameters tuning of combined algorithms

best_scores = {}    # "model": best scores
best_params = {}    # "model": best parameters
params_rows = []
for pathway_name in pathway_names:
    df = pd.read_csv('Datasets/' + pathway_name+'_train.csv')

    voting_model = voting_models_w_params[pathway_name][0]
    parameters = voting_models_w_params[pathway_name][1]

    print("Starting for", pathway_name)
    clf = GridSearchCV(voting_model, parameters, n_jobs=-1, verbose=3, scoring='f1_macro', cv=3)
    clf.fit(df.drop('Class', axis=1), df["Class"])

    best_scores[pathway_name] = clf.best_score_
    best_params[pathway_name] = clf.best_params_
    print(clf.best_score_)
    print(clf.best_params_)

with open('best_params.json', 'w') as fp:
    json.dump(best_params, fp)

In [21]:
for (model, params) in voting_models_w_params.values():
    print(model)

In [22]:
voting_models_w_tuned_params = voting_models_w_params.copy()

for pw, (model, params) in voting_models_w_params.items():

    voting_models_w_params[pw][0] = model.set_params(**best_params[pw])
    voting_models_w_params[pw][1] = best_params[pw]

best_models

In [22]:
for pw, (model, params) in voting_models_w_params.items():
    