In [4]:
import warnings
import pprint
import skrebate
import imblearn
from imblearn import under_sampling, over_sampling, combine
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn import (preprocessing, svm, linear_model, ensemble, naive_bayes,
                    tree, neighbors, decomposition, kernel_approximation, cluster)
from sklearn.pipeline import Pipeline
from sklearn.base import clone

from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import (KFold, GroupKFold, StratifiedKFold,
                                    LeaveOneGroupOut, cross_validate,
                                    cross_val_predict, learning_curve,
                                    GridSearchCV)
from sklearn.feature_selection import SelectKBest, f_regression, SelectFromModel, VarianceThreshold, f_classif
from sklearn.metrics import (r2_score, auc, roc_auc_score, balanced_accuracy_score, 
                             average_precision_score, confusion_matrix, roc_curve,
                             precision_recall_curve)
from sklearn.metrics.scorer import roc_auc_scorer
from sklearn.preprocessing import QuantileTransformer, quantile_transform, StandardScaler, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.utils.validation import check_memory
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Install a blanket "ignore" filter so no warnings are printed by later cells.
# NOTE(review): this also hides deprecation notices from the libraries
# imported above; drop this line when debugging version problems.
warnings.simplefilter('ignore')

In [5]:
import os
import sys
import numpy as np
import pandas as pd
import re


import plotly.plotly as py
import plotly.graph_objs as go
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Initialise plotly's offline notebook mode; the iplot() calls in the plotting
# cells below are used after this.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)

## Results

In [12]:
# Root folder; each sub-folder below is appended to it by plain string
# concatenation, so the trailing slash is required.
work_dir = './drug_respond/results/non_smmart_protein_ran_tissue10/'

# Result sub-folders, one per run. The comments give the classifier label each
# folder is read with in the aggregation cells.
sub1, sub2, sub3, sub4, sub5 = (
    'iraps_1-249',            # 'IRAPSClassifier'
    'exacloud_xgbc',          # 'XGBClassifier'
    'galaxydev_xgbc_05',      # 'XGBClassifier', filtered at -0.5
    'galaxydev_xgbr',         # 'XGBRegressor'
    'exastack_randomforest',  # 'RandomForestClassifier'
)

In [13]:
def concate_best_result(folder, file_name, scorer, classifier, results, discretize_z_score=-1, exact_z_score=False):
    """Add the best-scoring row of one result table to `results`.

    The table `folder/file_name` is read as tab-separated text. Its rows are
    ranked by ``mean_test_<scorer>`` (highest first), ties broken by
    ``std_test_<scorer>`` (lowest first), and the top row is kept together
    with its ``params`` column.

    Parameters
    ----------
    folder : str
        Directory containing `file_name`.
    file_name : str
        Name of the table. Names not ending in ``'tabular'`` are skipped.
        The last 11 characters are stripped to form the ``dataset`` label.
    scorer : str
        Scorer name used to build the ``mean_test_`` / ``std_test_`` column names.
    classifier : str
        Label written to the ``classifier`` column. With `exact_z_score` set and
        `discretize_z_score` different from -1, ``'_05'`` is appended to it.
    results : pandas.DataFrame or None
        Rows collected so far; ``None`` for the first call.
    discretize_z_score : number, default -1
        Value the filter column must equal when `exact_z_score` is set.
    exact_z_score : bool, default False
        When true, keep only rows whose ``param_estimator__discretize``
        (for ``'IRAPSClassifier'``) or ``param_estimator__z_score`` (any other
        classifier) equals `discretize_z_score` before ranking.

    Returns
    -------
    pandas.DataFrame or None
        `results` extended by one row, or `results` unchanged when the file is
        skipped, the best mean score is null, or no row survives the filter.
    """
    if not file_name.endswith('tabular'):
        return results

    res = pd.read_csv(os.path.join(folder, file_name), sep='\t')

    mean_col = 'mean_test_' + scorer
    std_col = 'std_test_' + scorer

    res_filtered = res
    if exact_z_score:
        # IRAPS stores the cut-off under a different hyper-parameter name.
        if classifier == 'IRAPSClassifier':
            param_col = 'param_estimator__discretize'
        else:
            param_col = 'param_estimator__z_score'
        res_filtered = res[res[param_col] == discretize_z_score]
        if discretize_z_score != -1:
            classifier = classifier + '_05'

    res_sort = res_filtered.sort_values([mean_col, std_col], ascending=[False, True])
    res_best = res_sort[[mean_col, std_col, 'params']].head(1).reset_index(drop=True)

    if res_best[mean_col].isnull().any():
        print("%s got null %s" %(file_name, scorer))
        return results

    res_best.insert(loc=0, column='dataset', value=file_name[:-11])
    res_best.insert(loc=0, column='classifier', value=classifier)

    if results is None:
        return res_best
    if res_best.empty:
        # No row matched the filter: there is nothing to add.
        return results
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([results, res_best], ignore_index=True)

In [19]:
# best AP scores
# List every result folder up front, in the same order as sub1..sub5.
files1, files2, files3, files4, files5 = (
    os.listdir(work_dir + sub) for sub in (sub1, sub2, sub3, sub4, sub5)
)
scorer = 'binarize_average_precision_scorer'

# One entry per pass over a folder:
# (sub-folder, its file listing, classifier label, extra keyword arguments).
runs = [
    (sub1, files1, 'IRAPSClassifier', dict(discretize_z_score=-1, exact_z_score=True)),
    (sub1, files1, 'IRAPSClassifier', dict(discretize_z_score=-0.5, exact_z_score=True)),
    # xgb_classifier
    (sub2, files2, 'XGBClassifier', dict(discretize_z_score=-1, exact_z_score=True)),
    # xgb_classifier_05
    (sub3, files3, 'XGBClassifier', dict(discretize_z_score=-0.5, exact_z_score=True)),
    # xgb_regressor
    (sub4, files4, 'XGBRegressor', dict(discretize_z_score=-1, exact_z_score=True)),
    (sub4, files4, 'XGBRegressor', dict(discretize_z_score=-0.5, exact_z_score=True)),
    # randomforestclassifier (no filtering: function defaults apply)
    (sub5, files5, 'RandomForestClassifier', {}),
]

results = None
for sub, listing, clf_label, options in runs:
    for fl in listing:
        results = concate_best_result(work_dir + sub, fl, scorer, clf_label, results, **options)

results = results.sort_values(['classifier', 'dataset'])
results

Unnamed: 0,classifier,dataset,mean_test_binarize_average_precision_scorer,std_test_binarize_average_precision_scorer,params
57,IRAPSClassifier,(5Z)-7-Oxozeaenol_1242.tsv,0.521701,0.132076,"{'estimator__discretize': -1, 'estimator__fc_t..."
175,IRAPSClassifier,5-Fluorouracil_179.tsv,0.503553,0.135022,"{'estimator__discretize': -1, 'estimator__fc_t..."
99,IRAPSClassifier,681640_1046.tsv,0.263553,0.135433,"{'estimator__discretize': -1, 'estimator__fc_t..."
235,IRAPSClassifier,A-443654_86.tsv,0.456833,0.184957,"{'estimator__discretize': -1, 'estimator__fc_t..."
218,IRAPSClassifier,A-770041_55.tsv,0.629845,0.292975,"{'estimator__discretize': -1, 'estimator__fc_t..."
133,IRAPSClassifier,AICA Ribonucleotide_1001.tsv,0.511618,0.152801,"{'estimator__discretize': -1, 'estimator__fc_t..."
24,IRAPSClassifier,AKT inhibitor VIII_171.tsv,0.339105,0.132650,"{'estimator__discretize': -1, 'estimator__fc_t..."
6,IRAPSClassifier,AKT inhibitor VIII_228.tsv,0.541215,0.110306,"{'estimator__discretize': -1, 'estimator__fc_t..."
139,IRAPSClassifier,AR-42_272.tsv,0.742867,0.096798,"{'estimator__discretize': -1, 'estimator__fc_t..."
185,IRAPSClassifier,AS601245_207.tsv,0.315521,0.109106,"{'estimator__discretize': -1, 'estimator__fc_t..."


In [None]:
#results.to_csv('/Users/guq/Documents/ohsu/drug_respond/results/smmart_proten_rna_tissue10/ap.csv', sep=',', index=False)

In [15]:
# best roc-auc scores
# List every result folder up front, in the same order as sub1..sub5.
files1, files2, files3, files4, files5 = (
    os.listdir(work_dir + sub) for sub in (sub1, sub2, sub3, sub4, sub5)
)
scorer = 'binarize_auc_scorer'

# One entry per pass over a folder:
# (sub-folder, its file listing, classifier label, extra keyword arguments).
runs = [
    (sub1, files1, 'IRAPSClassifier', dict(discretize_z_score=-1, exact_z_score=True)),
    (sub1, files1, 'IRAPSClassifier', dict(discretize_z_score=-0.5, exact_z_score=True)),
    # xgb_classifier
    (sub2, files2, 'XGBClassifier', dict(discretize_z_score=-1, exact_z_score=True)),
    # xgb_classifier_05
    (sub3, files3, 'XGBClassifier', dict(discretize_z_score=-0.5, exact_z_score=True)),
    # xgb_regressor
    (sub4, files4, 'XGBRegressor', dict(discretize_z_score=-1, exact_z_score=True)),
    (sub4, files4, 'XGBRegressor', dict(discretize_z_score=-0.5, exact_z_score=True)),
    # randomforestclassifier (no filtering: function defaults apply)
    (sub5, files5, 'RandomForestClassifier', {}),
]

results_auc = None
for sub, listing, clf_label, options in runs:
    for fl in listing:
        results_auc = concate_best_result(work_dir + sub, fl, scorer, clf_label, results_auc, **options)

results_auc = results_auc.sort_values(['classifier', 'dataset'])
results_auc

Unnamed: 0,classifier,dataset,mean_test_binarize_auc_scorer,std_test_binarize_auc_scorer,params
57,IRAPSClassifier,(5Z)-7-Oxozeaenol_1242.tsv,0.763874,0.103438,"{'estimator__discretize': -1, 'estimator__fc_t..."
175,IRAPSClassifier,5-Fluorouracil_179.tsv,0.788364,0.068541,"{'estimator__discretize': -1, 'estimator__fc_t..."
99,IRAPSClassifier,681640_1046.tsv,0.611398,0.138529,"{'estimator__discretize': -1, 'estimator__fc_t..."
235,IRAPSClassifier,A-443654_86.tsv,0.669264,0.130437,"{'estimator__discretize': -1, 'estimator__fc_t..."
218,IRAPSClassifier,A-770041_55.tsv,0.800373,0.190455,"{'estimator__discretize': -1, 'estimator__fc_t..."
133,IRAPSClassifier,AICA Ribonucleotide_1001.tsv,0.791697,0.060945,"{'estimator__discretize': -1, 'estimator__fc_t..."
24,IRAPSClassifier,AKT inhibitor VIII_171.tsv,0.665955,0.104560,"{'estimator__discretize': -1, 'estimator__fc_t..."
6,IRAPSClassifier,AKT inhibitor VIII_228.tsv,0.815941,0.079621,"{'estimator__discretize': -1, 'estimator__fc_t..."
139,IRAPSClassifier,AR-42_272.tsv,0.879060,0.062971,"{'estimator__discretize': -1, 'estimator__fc_t..."
185,IRAPSClassifier,AS601245_207.tsv,0.624720,0.061157,"{'estimator__discretize': -1, 'estimator__fc_t..."


In [40]:
#results_auc.to_csv('/Users/guq/Documents/ohsu/drug_respond/results/smmart_proten_rna_tissue10/roc-auc.csv', sep=',', index=False)

In [20]:
def _bar_trace(frame, clf_label, score_column, legend_name):
    """Build one bar trace: `score_column` per dataset for rows labelled `clf_label`."""
    subset = frame[frame['classifier'] == clf_label]
    return go.Bar(
        x=subset['dataset'],
        y=subset[score_column],
        name=legend_name,
    )


# (classifier label in the results table, legend entry), in plotting order.
ap_series = [
    ('IRAPSClassifier', 'IRAPS_AP'),
    ('IRAPSClassifier_05', 'IRAPS_AP05'),
    ('RandomForestClassifier', 'RF_AP'),
    ('XGBClassifier', 'XGBC_AP'),
    ('XGBClassifier_05', 'XGBC_AP05'),
    ('XGBRegressor', 'XGBRegr_AP'),
    ('XGBRegressor_05', 'XGBRegr_AP05'),
]
roc_series = [
    ('IRAPSClassifier', 'IRAPS_ROC'),
    ('IRAPSClassifier_05', 'IRAPS_ROC05'),
    ('RandomForestClassifier', 'RF_ROC'),
    ('XGBClassifier', 'XGBC_ROC'),
    ('XGBClassifier_05', 'XGBC_ROC05'),
    ('XGBRegressor', 'XGBRegr_ROC'),
    ('XGBRegressor_05', 'XGBRegr_ROC05'),
]

# Average-precision traces come from `results`.
data1, data2, data3, data4, data5, data6, data7 = [
    _bar_trace(results, clf_label, 'mean_test_binarize_average_precision_scorer', legend_name)
    for clf_label, legend_name in ap_series
]

# ROC-AUC traces come from `results_auc`.
data8, data9, data10, data11, data12, data13, data14 = [
    _bar_trace(results_auc, clf_label, 'mean_test_binarize_auc_scorer', legend_name)
    for clf_label, legend_name in roc_series
]

# Both figures share the same grouped-bar layout.
layout = go.Layout(
    xaxis={'title': 'Dataset'},
    yaxis={'title': 'Performance score'},
    barmode='group',
)

fig = go.Figure(data=[data1, data2, data3, data4, data5, data6, data7], layout=layout)
iplot(fig)

fig = go.Figure(data=[data8, data9, data10, data11, data12, data13, data14], layout=layout)
iplot(fig)
# To show plot, paste the link to this GitHub notebook into http://nbviewer.jupyter.org/

In [21]:
def _violin_trace(frame, score_column, label, colour):
    """Describe one violin trace: `score_column` of `frame` grouped by classifier.

    `label` is used as the trace name and as both its legend group and scale
    group; `colour` sets the line colour. The inner box and the mean line are
    switched on.
    """
    return {
        "type": 'violin',
        "x": frame['classifier'],
        "y": frame[score_column],
        "legendgroup": label,
        "scalegroup": label,
        "name": label,
        "box": {"visible": True},
        "meanline": {"visible": True},
        "line": {"color": colour},
    }


trace1 = _violin_trace(results, 'mean_test_binarize_average_precision_scorer', 'AP', 'blue')
trace2 = _violin_trace(results_auc, 'mean_test_binarize_auc_scorer', 'ROC-AUC', 'pink')

# Place the AP and ROC-AUC violins side by side for each classifier.
layout = dict(
    yaxis=dict(zeroline=False),
    violinmode='group',
)
fig = go.Figure(data=[trace1, trace2], layout=layout)
iplot(fig)
# To show plot, paste the link to this GitHub notebook into http://nbviewer.jupyter.org/