In [219]:
import pandas as pd
import os
import json
from scipy.stats import kruskal
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, recall_score, precision_score

In [202]:
maple_train_df = pd.read_csv('maple_train.csv')
maple_test_df = pd.read_csv('maple_test.csv')
maple_eval_df = pd.read_csv('maple_eval.csv')

In [203]:
path = './PathwayFeatures'
pathway_names = []
for i, attrlist in enumerate(os.listdir(path)):
    attributes = pd.read_csv(path+'/'+ attrlist)
    attributes = attributes["Attrib"].tolist()
    pathway_name = attrlist.split('-')[0]
    pathway_names.append(pathway_name)

    maple_train_df.loc[maple_train_df['Path'] == pathway_name.replace('_', ' '), 'Class'] = 'positive'
    maple_test_df.loc[maple_test_df['Path'] == pathway_name.replace('_', ' '), 'Class'] = 'positive'
    maple_eval_df.loc[maple_eval_df['Path'] == pathway_name.replace('_', ' '), 'Class'] = 'positive'

    maple_train_df.loc[maple_train_df['Path'] != pathway_name.replace('_', ' '), 'Class'] = 'negative'
    maple_test_df.loc[maple_test_df['Path'] != pathway_name.replace('_', ' '), 'Class'] = 'negative'
    maple_eval_df.loc[maple_eval_df['Path'] != pathway_name.replace('_', ' '), 'Class'] = 'negative'

#     train_df_prep = maple_train_df[attributes]
#     test_df_prep = maple_test_df[attributes]
#     eval_df_prep = maple_eval_df[attributes]

#     train_df_prep.to_csv('Datasets/'+pathway_name+'_train.csv', index=False)
#     test_df_prep.to_csv('Datasets/'+pathway_name+'_test.csv', index=False)
#     eval_df_prep.to_csv('Datasets/'+pathway_name+'_eval.csv', index=False)

In [220]:
# https://stackoverflow.com/questions/31324218/scikit-learn-how-to-obtain-true-positive-true-negative-false-positive-and-fal
# by: @invoktheshell
def perf_measure(y_actual, y_hat):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(len(y_hat)):
        if y_actual[i]==y_hat[i]==1:
           TP += 1
        if y_hat[i]==1 and y_actual[i]!=y_hat[i]:
           FP += 1
        if y_actual[i]==y_hat[i]==0:
           TN += 1
        if y_hat[i]==0 and y_actual[i]!=y_hat[i]:
           FN += 1

    return(TP, FP, TN, FN)

def calc_metrics(y_true, y_pred, model_name, pw_name, fold_num):
    acc = accuracy_score(y_true, y_pred)
    tp, fp, tn, fn = perf_measure(y_true, y_pred)
    recall = precision_score(y_true, y_pred)
    specifity = tn / (tn + fp)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    # print(precision, specifity, tp,fp,tn,fn)

    try:
        fpr = fp / (fp + tn)
    except:
        fpr = 0

    try:
        tpr = tp / (tp + fn)
    except:
        tpr = 0

    try:
        fnr = fn / (fn + tp)
    except:
        fnr = 0

    try:
        fdr = fp / (fp + tp)
    except:
        fdr = 0

    return model_name, pw_name, fold_num, acc, f1, mcc, tp, fp, tn, fn, recall, precision, specifity, fpr, tpr, fnr, fdr

In [205]:
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !jupyter labextension install @jupyter-widgets/jupyterlab-manager

In [6]:
#.3 Metabolic pathway modeling and analysis
# The 10-fold cross-validation and bagging were used on all training process,
# and the metrics accuracy, percentage of correctly classified instances (CCI),
# true positive (TP) rate, false positive (FP) rate, false negative (FN) rate, recall,
# specificity, F-measure, false discovery rate (FDR), and Matthews coefficient correlation (MCC)
# were used to evaluate the predictive performances.
from tqdm import tqdm
res = []

for pathway_name in tqdm(pathway_names):
    train_df = pd.read_csv('Datasets/'+ pathway_name+'_train.csv')
    # test_df = pd.read_csv('Datasets/'+ pathway_name+'_test.csv')
    # eval_df = pd.read_csv('Datasets/'+ pathway_name+'_eval.csv')

    print('pathway: ', pathway_name)
    # print(len(train_df[train_df["Class"]=='positive']))

    train_df["Class"] = train_df["Class"].apply(lambda x: 1 if x=='positive' else 0)
    # test_df["Class"] = test_df["Class"].apply(lambda x: 1 if x=='positive' else 0)
    # eval_df["Class"] = eval_df["Class"].apply(lambda x: 1 if x=='positive' else 0)

    names = ['svc', 'nb', 'dt', 'mlp', 'knn', 'rf', 'sgd']
    models = [BaggingClassifier(SVC()),
              BaggingClassifier(GaussianNB(), n_jobs=-1),
              BaggingClassifier(DecisionTreeClassifier()),
#               BaggingClassifier(MLPClassifier()),
              BaggingClassifier(KNeighborsClassifier(n_jobs=-1),n_jobs=-1),
              BaggingClassifier(RandomForestClassifier(n_jobs=-1),n_jobs=-1),
              BaggingClassifier(SGDClassifier(n_jobs=-1),n_jobs=-1),]

    # 'shuffle' is added, bc 'random_state' is set
    skf = StratifiedKFold(n_splits=10, random_state=1337, shuffle=True)
    X = train_df.drop('Class', axis=1)
    y = train_df["Class"]

    # res = []
    for i in tqdm(range(len(models))):
        fold_num=0
        for train_index, test_index in skf.split(X, y):
            fold_num+=1
            X_train, X_test = X.loc[train_index], X.loc[test_index]
            y_train, y_test = y.loc[train_index], y.loc[test_index]
            models[i].fit(X_train, y_train)
            preds = models[i].predict(X_test)
            # print(y_test[y_test[]])
            # res.append(calc_metrics(y_test.tolist(), preds.tolist(), names[i], pathway_name, 'test_df', fold_num))
            res.append(calc_metrics(y_test.tolist(), preds.tolist(), names[i], pathway_name, 'train_df', fold_num))

res_df = pd.DataFrame(res, columns=["model_name", "pw_name", "fold_num", "acc", "f1", "mcc", "tp", "fp", "tn", "fn", "recall", "precision", "specifity", "fpr", "tpr", "fnr", "fdr"])
# res_dict[pathway_name] = res_df

file_name = '10fold_results.csv'
res_df.to_csv(file_name, index=False)
print(file_name, "is done.")

del y_test, y_train, X_test, X_train, X, y

  0%|          | 0/12 [00:00<?, ?it/s]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  energy



 17%|█▋        | 1/6 [00:06<00:33,  6.80s/it][A
 33%|███▎      | 2/6 [00:09<00:18,  4.64s/it][A
 50%|█████     | 3/6 [00:21<00:23,  7.79s/it][A
 67%|██████▋   | 4/6 [00:28<00:15,  7.65s/it][A
 83%|████████▎ | 5/6 [00:39<00:08,  8.53s/it][A
100%|██████████| 6/6 [00:41<00:00,  6.92s/it][A
  8%|▊         | 1/12 [00:41<07:38, 41.64s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  protein_modification



 17%|█▋        | 1/6 [00:43<03:36, 43.38s/it][A
 33%|███▎      | 2/6 [00:45<01:17, 19.28s/it][A
 50%|█████     | 3/6 [01:17<01:15, 25.00s/it][A
 67%|██████▋   | 4/6 [01:24<00:35, 17.94s/it][A
 83%|████████▎ | 5/6 [01:40<00:17, 17.04s/it][A
100%|██████████| 6/6 [01:44<00:00, 17.34s/it][A
 17%|█▋        | 2/12 [02:25<13:04, 78.44s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  amino_acid



 17%|█▋        | 1/6 [00:33<02:48, 33.61s/it][A
 33%|███▎      | 2/6 [00:36<01:01, 15.32s/it][A
 50%|█████     | 3/6 [01:00<00:57, 19.23s/it][A
 67%|██████▋   | 4/6 [01:07<00:28, 14.49s/it][A
 83%|████████▎ | 5/6 [01:20<00:14, 14.04s/it][A
100%|██████████| 6/6 [01:24<00:00, 14.04s/it][A
 25%|██▌       | 3/12 [03:50<12:10, 81.15s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  nitrogen



 17%|█▋        | 1/6 [00:05<00:26,  5.31s/it][A
 33%|███▎      | 2/6 [00:07<00:14,  3.65s/it][A
 50%|█████     | 3/6 [00:14<00:14,  4.92s/it][A
 67%|██████▋   | 4/6 [00:21<00:11,  5.85s/it][A
 83%|████████▎ | 5/6 [00:31<00:07,  7.25s/it][A
100%|██████████| 6/6 [00:33<00:00,  5.66s/it][A
 33%|███▎      | 4/12 [04:24<08:20, 62.58s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  aromatic_compound


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 17%|█▋        | 1/6 [00:08<00:41,  8.27s/it][A
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  

  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

100%|██████████| 6/6 [00:43<00:00,  7.24s/it][A
 42%|████▏     | 5/12 [05:07<06:30, 55.74s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  secondary_metabolite


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 17%|█▋        | 1/6 [00:18<01:33, 18.68s/it][A
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 50%|█████     | 3/6 [00:37<00:37, 12.44s/it][A
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn

  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 83%|████████▎ | 5/6 [00:55<00:10, 10.48s/it][A
100%|██████████| 6/6 [00:57<00:00,  9.64s/it][A
 50%|█████     | 6/12 [06:05<05:38, 56.50s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  carbohydrate



 17%|█▋        | 1/6 [00:37<03:07, 37.46s/it][A
 33%|███▎      | 2/6 [00:40<01:07, 16.93s/it][A
 50%|█████     | 3/6 [01:01<00:56, 18.99s/it][A
 67%|██████▋   | 4/6 [01:08<00:28, 14.35s/it][A
 83%|████████▎ | 5/6 [01:22<00:14, 14.08s/it][A
100%|██████████| 6/6 [01:26<00:00, 14.34s/it][A
 58%|█████▊    | 7/12 [07:32<05:30, 66.19s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  lipid



 17%|█▋        | 1/6 [00:27<02:17, 27.45s/it][A
 33%|███▎      | 2/6 [00:29<00:50, 12.66s/it][A
 50%|█████     | 3/6 [00:45<00:41, 13.98s/it][A
 67%|██████▋   | 4/6 [00:51<00:22, 11.09s/it][A
 83%|████████▎ | 5/6 [01:04<00:11, 11.57s/it][A
100%|██████████| 6/6 [01:07<00:00, 11.23s/it][A
 67%|██████▋   | 8/12 [08:39<04:26, 66.62s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  carotenoid



 17%|█▋        | 1/6 [00:07<00:39,  7.92s/it][A
 33%|███▎      | 2/6 [00:10<00:18,  4.72s/it][A
 50%|█████     | 3/6 [00:19<00:20,  6.90s/it][A
 67%|██████▋   | 4/6 [00:27<00:14,  7.04s/it][A
 83%|████████▎ | 5/6 [00:37<00:08,  8.05s/it][A
100%|██████████| 6/6 [00:39<00:00,  6.64s/it][A
 75%|███████▌  | 9/12 [09:19<02:54, 58.32s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  amine_and_polyamine



 17%|█▋        | 1/6 [00:10<00:51, 10.29s/it][A
 33%|███▎      | 2/6 [00:12<00:22,  5.70s/it][A
 50%|█████     | 3/6 [00:26<00:27,  9.26s/it][A
 67%|██████▋   | 4/6 [00:33<00:16,  8.44s/it][A
 83%|████████▎ | 5/6 [00:44<00:09,  9.33s/it][A
100%|██████████| 6/6 [00:47<00:00,  7.88s/it][A
 83%|████████▎ | 10/12 [10:07<01:49, 54.97s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  phospholipid



 17%|█▋        | 1/6 [00:13<01:07, 13.46s/it][A
 33%|███▎      | 2/6 [00:15<00:27,  6.89s/it][A
 50%|█████     | 3/6 [00:26<00:25,  8.47s/it][A
 67%|██████▋   | 4/6 [00:32<00:15,  7.60s/it][A
 83%|████████▎ | 5/6 [00:42<00:08,  8.67s/it][A
100%|██████████| 6/6 [00:45<00:00,  7.63s/it][A
 92%|█████████▏| 11/12 [10:53<00:52, 52.19s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

pathway:  nucleotide_sugar



 17%|█▋        | 1/6 [00:10<00:54, 10.97s/it][A
 33%|███▎      | 2/6 [00:13<00:23,  5.90s/it][A
 50%|█████     | 3/6 [00:32<00:36, 12.18s/it][A
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))

 83%|████████▎ | 5/6 [00:50<00:10, 10.31s/it][A
100%|██████████| 6/6 [00:53<00:00,  8.94s/it][A
100%|██████████| 12/12 [11:46<00:00, 58.90s/it]

10fold_results.csv is done.





In [206]:
# The best classifiers for each metabolic pathway were ranked using the Kruskal-Wallis test.
# Bu test algoritmaları nasıl sıralıyor anlamadım, benim anladığım fark var mı diye bakıyor dağılımlarda

res_df = pd.read_csv('10fold_results.csv')
res_rows = res_df.drop('fold_num', axis=1).groupby('model_name').mean()
for i in range(len(res_rows.index)):
    for j in range(i+1, len(res_rows.index)):
        # print(pw_name, "-", str(res_rows.index[i]), "-", str(res_rows.index[j]))
        stat, p = kruskal(res_rows.loc[str(res_rows.index[i])], res_rows.loc[str(res_rows.index[j])])
        print('Statistics=%.3f, p=%.3f' % (stat, p))
        if p < 0.05:
            print("statistically significant p value is found: ", str(res_rows.index[i], str(res_rows.index[j])))
        print("")

Statistics=0.314, p=0.575

Statistics=0.589, p=0.443

Statistics=0.021, p=0.885

Statistics=0.021, p=0.885

Statistics=0.073, p=0.787

Statistics=1.498, p=0.221

Statistics=0.190, p=0.663

Statistics=0.073, p=0.787

Statistics=0.035, p=0.852

Statistics=0.269, p=0.604

Statistics=0.469, p=0.494

Statistics=0.796, p=0.372

Statistics=0.073, p=0.787

Statistics=0.073, p=0.787

Statistics=0.052, p=0.820



In [208]:
# The best classifiers were pre-selected, seeking a low FP & high F-measure and CCI

best_models = {}    # {"pathway": [best 3 models]}
models = []
scores = {}
res_df["score"] =  res_df["acc"] + res_df["f1"] - res_df["fp"]
best_df = res_df.groupby(['pw_name','model_name'], as_index=False).mean().sort_values(["pw_name", "score"], ascending=False)
for pw in pathway_names:
    best_models[pw] = best_df[best_df["pw_name"] == pw].iloc[:3]["model_name"].tolist()
print(best_models)

{'amine_and_polyamine': ['knn', 'svc', 'rf'], 'amino_acid': ['rf', 'svc', 'knn'], 'aromatic_compound': ['sgd', 'knn', 'svc'], 'carbohydrate': ['rf', 'svc', 'knn'], 'carotenoid': ['knn', 'svc', 'sgd'], 'energy': ['sgd', 'svc', 'rf'], 'lipid': ['svc', 'rf', 'knn'], 'nitrogen': ['svc', 'sgd', 'rf'], 'nucleotide_sugar': ['knn', 'svc', 'sgd'], 'phospholipid': ['knn', 'svc', 'rf'], 'protein_modification': ['rf', 'svc', 'dt'], 'secondary_metabolite': ['knn', 'svc', 'rf']}


In [209]:
# estimators for bagging classifier
bagging_cls = {'svc': ('svc', BaggingClassifier(SVC())),
               'dt': ('dt', BaggingClassifier(DecisionTreeClassifier())),
#                'mlp': ('mlp', BaggingClassifier(MLPClassifier())),
               'nb' : ('nb', BaggingClassifier(GaussianNB())),
               'rf': ('rf', BaggingClassifier(RandomForestClassifier())),
               'knn': ('knn', BaggingClassifier(KNeighborsClassifier())),
               'sgd': ('sgd', BaggingClassifier(SGDClassifier()))}

In [210]:
params_rf = {'rf__base_estimator__bootstrap': [True],
             'rf__base_estimator__max_depth': [80, 100],
             'rf__base_estimator__max_features': [2, 3],
             'rf__base_estimator__min_samples_leaf': [3, 4],
             'rf__base_estimator__min_samples_split': [8, 10],
             'rf__base_estimator__n_estimators': [100, 200],
             'rf__base_estimator__n_jobs': [-1]
            }

params_SVC={'svc__base_estimator__C': [1, 100, 1000],
            'svc__base_estimator__gamma': [1, 0.01, 0.0001],
            'svc__base_estimator__kernel': ['rbf']
           }


params_MLP = {'mlp__base_estimator__hidden_layer_sizes': [(10,30,10),(20,)],
              'mlp__base_estimator__activation': ['tanh', 'relu'],
              'mlp__base_estimator__solver': ['sgd', 'adam'],
              'mlp__base_estimator__alpha': [0.0001, 0.05],
              'mlp__base_estimator__learning_rate': ['constant','adaptive']
             }

params_DT = { 
             'dt__base_estimator__max_depth': list(range(6,9)),
             'dt__base_estimator__min_samples_leaf': list(range(4,10,2))
            }

params_KNN = { 
             'knn__base_estimator__n_neighbors': list(range(3, 7)),
             'knn__base_estimator__n_jobs': [-1]
             }

params_SGD = {
    'sgd__base_estimator__penalty': ['l2'],
    'sgd__base_estimator__n_jobs': [-1],
#     'sgd__base_estimator__C': [1e-7, 1e-5, 1e-3],
}

all_params = {"rf": params_rf, 
              "svc": params_SVC, 
#               "mlp": params_MLP, 
              "dt": params_DT, 
              "knn": params_KNN, 
              "sgd": params_SGD}


In [226]:
voting_models_w_params = {}    # ["pathway": (voting model, parameters)]

for pw_name, models in best_models.items():

    estimators = []
    parameters = {}

    for model in models:
        # model's estimator
        estimators.append(bagging_cls[model])

        # there is no param for NB
        if model in all_params.keys():
            for param, values in all_params[model].items():
                parameters[param] = values

    # pramameters =

    voting_model = VotingClassifier(estimators=estimators, voting='soft')
    voting_models_w_params[pw_name] = [voting_model, parameters]

In [212]:
voting_models_w_params

{'amine_and_polyamine': (VotingClassifier(estimators=[('knn',
                                BaggingClassifier(base_estimator=KNeighborsClassifier())),
                               ('svc', BaggingClassifier(base_estimator=SVC())),
                               ('rf',
                                BaggingClassifier(base_estimator=RandomForestClassifier()))],
                   voting='soft'),
  {'knn__base_estimator__n_neighbors': [3, 4, 5, 6],
   'knn__base_estimator__n_jobs': [-1],
   'svc__base_estimator__C': [1, 100, 1000],
   'svc__base_estimator__gamma': [1, 0.01, 0.0001],
   'svc__base_estimator__kernel': ['rbf'],
   'rf__base_estimator__bootstrap': [True],
   'rf__base_estimator__max_depth': [80, 100],
   'rf__base_estimator__max_features': [2, 3],
   'rf__base_estimator__min_samples_leaf': [3, 4],
   'rf__base_estimator__min_samples_split': [8, 10],
   'rf__base_estimator__n_estimators': [100, 200],
   'rf__base_estimator__n_jobs': [-1]}),
 'amino_acid': (VotingClassifier

In [227]:
import json

# parameters tuning of combined algorithms

def params_tuning():
    best_scores = {}    # "model": best scores
    best_params = {}    # "model": best parameters
    params_rows = []
    for pathway_name in pathway_names:
        df = pd.read_csv('Datasets/' + pathway_name+'_train.csv')

        voting_model = voting_models_w_params[pathway_name][0]
        parameters = voting_models_w_params[pathway_name][1]

        print("Starting for", pathway_name)
        clf = RandomizedSearchCV(voting_model, parameters, n_jobs=-1, verbose=3, scoring='f1_macro', cv=3, n_iters=60)
        clf.fit(df.drop('Class', axis=1), df["Class"])

        best_scores[pathway_name] = clf.best_score_
        best_params[pathway_name] = clf.best_params_
        print(clf.best_score_)
        print(clf.best_params_)

    with open('best_params.json', 'w') as fp:
        json.dump(best_params, fp)

In [228]:
for (model, params) in voting_models_w_params.values():
    print(model)

VotingClassifier(estimators=[('knn',
                              BaggingClassifier(base_estimator=KNeighborsClassifier(n_jobs=-1,
                                                                                    n_neighbors=4))),
                             ('svc',
                              BaggingClassifier(base_estimator=SVC(C=1000,
                                                                   gamma=0.0001))),
                             ('rf',
                              BaggingClassifier(base_estimator=RandomForestClassifier(max_depth=100,
                                                                                      max_features=2,
                                                                                      min_samples_leaf=4,
                                                                                      min_samples_split=8,
                                                                                      n_jobs=-1)))],
                 

In [223]:
f = open('best_params.json',)
best_params = json.load(f)
f.close()

In [229]:
for pw in pathway_names:
    voting_models_w_params[pw][0] = voting_models_w_params[pw][0].set_params(**best_params[pw])
    voting_models_w_params[pw][1] = best_params[pw]

In [254]:
def tunedClf():
    results = []
    for pathway_name in pathway_names:
        df_train = pd.read_csv('Datasets/' + pathway_name+'_train.csv')
        df_test = pd.read_csv('Datasets/' + pathway_name+'_test.csv')
        df_eval = pd.read_csv('Datasets/' + pathway_name+'_eval.csv')
        
        df_train["Class"] = df_train["Class"].apply(lambda x: 1 if x=='positive' else 0)
        df_test["Class"] = df_test["Class"].apply(lambda x: 1 if x=='positive' else 0)
        df_eval["Class"] = df_eval["Class"].apply(lambda x: 1 if x=='positive' else 0)
        
        X = df_train.drop('Class', axis=1)
        y = df_train["Class"]

        print("Starting for", pathway_name)
        clf = voting_models_w_params[pathway_name][0]
        clf.fit(X, y)
        
        y_test = df_test["Class"]
        y_pred_test = clf.predict(df_test.drop('Class', axis=1))
        
        y_eval = df_eval["Class"]
        y_pred_eval = clf.predict(df_eval.drop('Class', axis=1))
        
        results.append(calc_metrics(y_test.tolist(), y_pred_test.tolist(), "voting clf", pathway_name, "test"))
        results.append(calc_metrics(y_eval.tolist(), y_pred_eval.tolist(), "voting clf", pathway_name, "eval"))
        
        
    results = pd.DataFrame(results, columns=["model_name", "pw_name", "type", "acc", "f1", "mcc", "tp", "fp", "tn", "fn", "recall", "precision", "specifity", "fpr", "tpr", "fnr", "fdr"])
    results.to_csv("tuned_results.csv", index=False)
    
    return results

In [255]:
tuned_results = tunedClf()

Starting for amine_and_polyamine
Starting for amino_acid
Starting for aromatic_compound


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))


Starting for carbohydrate
Starting for carotenoid
Starting for energy
Starting for lipid
Starting for nitrogen
Starting for nucleotide_sugar
Starting for phospholipid
Starting for protein_modification
Starting for secondary_metabolite


In [258]:
tuned_results = pd.read_csv('tuned_results.csv')

In [259]:
tuned_results = tuned_results.drop(('model_name'), axis=1)

In [260]:
tuned_results

Unnamed: 0,pw_name,type,acc,f1,mcc,tp,fp,tn,fn,recall,precision,specifity,fpr,tpr,fnr,fdr
0,amine_and_polyamine,test,0.996721,0.95,0.949579,19,0,589,2,1.0,1.0,1.0,0.0,0.904762,0.095238,0.0
1,amine_and_polyamine,eval,0.995595,0.933333,0.933286,7,0,219,1,1.0,1.0,1.0,0.0,0.875,0.125,0.0
2,amino_acid,test,0.97377,0.804878,0.809194,33,0,561,16,1.0,1.0,1.0,0.0,0.673469,0.326531,0.0
3,amino_acid,eval,0.991189,0.928571,0.926589,13,0,212,2,1.0,1.0,1.0,0.0,0.866667,0.133333,0.0
4,aromatic_compound,test,0.998361,0.8,0.815825,2,0,607,1,1.0,1.0,1.0,0.0,0.666667,0.333333,0.0
5,aromatic_compound,eval,0.995595,0.0,0.0,0,0,226,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6,carbohydrate,test,0.988525,0.960452,0.954774,85,0,518,7,1.0,1.0,1.0,0.0,0.923913,0.076087,0.0
7,carbohydrate,eval,1.0,1.0,1.0,60,0,167,0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
8,carotenoid,test,0.998361,0.933333,0.934638,7,0,602,1,1.0,1.0,1.0,0.0,0.875,0.125,0.0
9,carotenoid,eval,0.995595,0.909091,0.910813,5,0,221,1,1.0,1.0,1.0,0.0,0.833333,0.166667,0.0
