In [2]:
import sys
import warnings

sys.path.append("../..")  # presumably makes the project-local `chemocommons` package importable -- confirm
warnings.filterwarnings("ignore")

# The star import stays ahead of the explicit imports so that, exactly as
# before, explicitly imported names win over anything it re-exports.
from chemocommons import *

import numpy as np
import pandas as pd
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from joblib import dump, load
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier # Okay?
from sklearn.metrics import jaccard_similarity_score, f1_score # for some calculation
from sklearn.model_selection import LeaveOneOut, RepeatedKFold #, KFold # jackknife, "socalled"
from sklearn.utils.multiclass import unique_labels
from skmultilearn.adapt import MLkNN, MLTSVM
from skmultilearn.cluster import NetworkXLabelGraphClusterer # clusterer
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder # as it writes
from skmultilearn.ensemble import LabelSpacePartitioningClassifier # so?
from skmultilearn.problem_transform import LabelPowerset # sorry, we only used LP



In [15]:
#loocv = LeaveOneOut() # jackknife
rmskf = MultilabelStratifiedKFold(n_splits=10,  random_state=19941115)
label_names = ["ABCG2", "MDR1", "MRP1", "MRP2", "MRP3", "MRP4", "NTCP2", "S15A1", 
               "S22A1", "SO1A2", "SO1B1", "SO1B3", "SO2B1"]

Y = pd.read_csv("label_matrix.txt", sep="\t", names=label_names)
Y[Y==-1]=0

ft_FP = pd.read_csv("query_smiles_feature_similarity_four_average.csv", names=label_names)
ft_FP.rename(mapper= lambda x: x + "_FP", axis=1, inplace=True)
ft_OT = pd.read_csv("feature_similarity_chebi_ontology_DiShIn_2.csv", names=label_names)
ft_OT.rename(mapper= lambda x: x + "_OT", axis=1, inplace=True)

X = np.concatenate((ft_FP, ft_OT), axis=1)
Y = Y.values

In [70]:
def measure_per_label(measure, y_true, y_predicted):
    """
        This code is inspired by skmultilearn, but our y_true and y_predicted are all dense numpy.ndarray
    """
    return [
        measure(
            y_true[:, i],
            y_predicted[:, i]
        )
        for i in range(y_true.shape[1])
    ]


NLSP_RF = load("rf.joblib")[0]

final_model = NLSP_RF.best_estimator_
label_acc = []
label_sp = []
label_rc = []
label_f1 = []
label_auc = []

y_pred = np.zeros_like(Y)
y_proba = np.zeros_like(Y)



for i in range(10): #10*10-cv
    print(i, "th repeat:")
    kfold = MultilabelStratifiedKFold(n_splits=10,  random_state=19941115)
    for k, (train, test) in enumerate(kfold.split(X, Y)):
        print(k, "th fold.")
        final_model.fit(X[train], Y[train])
        y_pred = np.array(final_model.predict(X[test]).todense())
        y_proba = np.array(final_model.predict_proba(X[test]).todense())
        label_acc.append(measure_per_label(metrics.accuracy_score, Y[test], y_pred))
        label_sp.append(measure_per_label(metrics.precision_score, Y[test], y_pred))
        label_rc.append(measure_per_label(metrics.recall_score, Y[test], y_pred))
        label_f1.append(measure_per_label(metrics.f1_score, Y[test], y_pred))
        label_auc.append(measure_per_label(metrics.roc_auc_score, Y[test], y_proba))

label_acc = np.array(label_acc)
label_sp = np.array(label_sp)
label_rc = np.array(label_rc)
label_f1 = np.array(label_f1)
label_auc = np.array(label_auc)

to_sav = dump((label_acc, label_sp, label_rc, label_f1, label_auc), filename="report_array.joblib")

print(label_acc.mean(axis=0), label_sp.mean(axis=0), label_rc.mean(axis=0),label_f1.mean(axis=0), label_auc.mean(axis=0))

    

0 th repeat:
0 th fold.
1 th fold.
2 th fold.
3 th fold.
4 th fold.
5 th fold.
6 th fold.
7 th fold.
8 th fold.
9 th fold.
1 th repeat:
0 th fold.
1 th fold.
2 th fold.
3 th fold.
4 th fold.
5 th fold.
6 th fold.
7 th fold.
8 th fold.
9 th fold.
2 th repeat:
0 th fold.
1 th fold.
2 th fold.
3 th fold.
4 th fold.
5 th fold.
6 th fold.
7 th fold.
8 th fold.
9 th fold.
3 th repeat:
0 th fold.
1 th fold.
2 th fold.
3 th fold.
4 th fold.
5 th fold.
6 th fold.
7 th fold.
8 th fold.
9 th fold.
4 th repeat:
0 th fold.
1 th fold.
2 th fold.
3 th fold.
4 th fold.
5 th fold.
6 th fold.
7 th fold.
8 th fold.
9 th fold.
5 th repeat:
0 th fold.
1 th fold.
2 th fold.
3 th fold.
4 th fold.
5 th fold.
6 th fold.
7 th fold.
8 th fold.
9 th fold.
6 th repeat:
0 th fold.
1 th fold.
2 th fold.
3 th fold.
4 th fold.
5 th fold.
6 th fold.
7 th fold.
8 th fold.
9 th fold.
7 th repeat:
0 th fold.
1 th fold.
2 th fold.
3 th fold.
4 th fold.
5 th fold.
6 th fold.
7 th fold.
8 th fold.
9 th fold.
8 th repeat:
0 t

In [66]:
print(label_acc.mean(axis=0), label_sp.mean(axis=0), label_rc.mean(axis=0),label_f1.mean(axis=0), label_auc.mean(axis=0))

[0.86728061 0.82665222 0.95449621 0.92741062 0.96749729 0.97074756
 0.99458288 0.9723727  0.96533044 0.97453954 0.95612134 0.97508126
 0.97995666] [0.09046587 0.45016251 0.03575298 0.01787649 0.00975081 0.00270856
 0.02491874 0.1099675  0.04767064 0.00433369 0.00595883 0.00270856
 0.00216685] [0.09046587 0.45016251 0.03575298 0.01787649 0.00975081 0.00270856
 0.02491874 0.1099675  0.04767064 0.00433369 0.00595883 0.00270856
 0.00216685] [0.09046587 0.45016251 0.03575298 0.01787649 0.00975081 0.00270856
 0.02491874 0.1099675  0.04767064 0.00433369 0.00595883 0.00270856
 0.00216685] 0.0


In [71]:
label_acc.mean(axis=0)

array([0.86886624, 0.82629165, 0.95213385, 0.93531963, 0.97052574,
       0.97484388, 0.99404831, 0.97431425, 0.96509124, 0.97319891,
       0.95619702, 0.97633315, 0.98074716])

In [72]:
label_sp.mean(axis=0)

array([0.72206496, 0.77961642, 0.83943579, 0.72212698, 0.59753571,
       0.36666667, 0.925     , 0.91736341, 0.9194019 , 0.49666667,
       0.519     , 0.5125    , 0.3       ])

In [73]:
label_rc.mean(axis=0)

array([0.48469748, 0.90494505, 0.44450549, 0.25406593, 0.31071429,
       0.167     , 0.86666667, 0.87695652, 0.60961905, 0.13333333,
       0.14097222, 0.151     , 0.11      ])

In [21]:
label_f1.mean(axis=0)

array([0.09046587, 0.45016251, 0.03575298, 0.01787649, 0.00975081,
       0.00270856, 0.02491874, 0.1099675 , 0.04767064, 0.00433369,
       0.00595883, 0.00270856, 0.00216685])

In [3]:
# Reload the tuple of per-fold score arrays written to "report_array.joblib"
# by the evaluation cell, stored in the order (acc, precision, recall, f1, auc).
# NOTE(review): joblib.load unpickles the file -- only load trusted files.
reports = load("report_array.joblib")

In [24]:
# Build the per-label summary table in a single, re-runnable step.
# `reports` holds one score array per metric, in the order it was dumped:
# accuracy, precision (column "SP"), recall, F1, ROC AUC.
metric_columns = ["ACC", "SP", "RC", "F1", "AUC"]
if len(reports) != len(metric_columns):
    # zip() below would silently drop the surplus, so fail loudly instead.
    raise ValueError(
        "expected %d metric arrays in reports, got %d"
        % (len(metric_columns), len(reports))
    )

# Rows are labels, columns are metrics; each cell is the mean over all folds.
final_reports = pd.DataFrame(
    {
        column: report.mean(axis=0)
        for column, report in zip(metric_columns, reports)
    },
    index=label_names,
)

In [33]:
# Write the summary table to "final_reports.csv" (row index included).
final_reports.to_csv("final_reports.csv")

In [8]:
# Column-wise mean of the first stored score array (accuracy).
my_list = np.mean(reports[0], axis=0)

(13,)

In [7]:
# Column-wise mean of the second stored score array (precision).
np.mean(reports[1], axis=0)

array([0.72206496, 0.77961642, 0.83943579, 0.72212698, 0.59753571,
       0.36666667, 0.925     , 0.91736341, 0.9194019 , 0.49666667,
       0.519     , 0.5125    , 0.3       ])