In [13]:
# PART OF THIS CODE IS FROM MICHAEL MURPHY - THANKS!
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from collections import OrderedDict
import glob
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, sys
import glob, re
pd.set_option('display.max_rows', 500) 
pd.set_option('display.max_columns', 100)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.utils.multiclass import type_of_target # used to check the Y labels are appropriate for classification
from sklearn.metrics import roc_curve
from sklearn import metrics
from sklearn.utils import shuffle
from scipy import interp

In [14]:
def get_num_labels(ds):
    ds['labels'] = ds['labels']*1
    vals = ds['labels'].values
    try:
        vals = [item for sublist in vals for item in sublist]
    except:
        pass
    labels = set(vals)
    ds['num_labels'] = len(labels)
    ds['label_set'] = labels
    return ds

def check_pre_norm(ds):
    if ds['data_set'] in pre_norm_ds:
        ds['pre_norm'] = 'Yes'
    else:
        ds['pre_norm'] = 'No'
    return ds

def convert_nan_to_val(data, value=0):
    data[pd.isnull(data)] = value
    return data

def train_model(X,y,ds,model):
    X,y = shuffle(X,y)
    if model == 'log_reg':
        if ds['num_labels'] != 2:
            clf = LogisticRegressionCV(scoring='accuracy', penalty='l1', solver='liblinear', tol=1e-4, intercept_scaling=1, max_iter=500, multi_class='ovr')
        else:
            clf = LogisticRegressionCV(scoring='roc_auc', penalty='l1', solver='liblinear', tol=1e-4, intercept_scaling=1, max_iter=500)
    elif model == 'rf':
        param_grid = {'n_estimators':[100,500,1000]}
        clf = GridSearchCV(RandomForestClassifier(n_estimators=1000, n_jobs=-1), param_grid, cv=3, n_jobs=-1)
    elif model == 'svm':
        param_grid = {'gamma': [1e-3, 0.01, 0.1, 1], 'C': [0.01, 0.1, 1, 10, 100]}
        clf = GridSearchCV(SVC(kernel='linear', probability=True), param_grid, cv=3, n_jobs=-1)
    else:
        print('no valid classifier input, please try again with one of: log_reg, rf or svm')
        exit(0)
    cv = StratifiedKFold(n_splits=5, shuffle=True) # so this will probably give rather high - at the end you just get the last model...
    aucs = []
    for train, test in cv.split(X,y):
        x_train, y_train = X[train], y[train]
        x_test, y_test = X[test], y[test]
        if stand_scaler:
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train)
            x_test = scaler.transform(x_test)
        clf.fit(x_train, y_train)
        if ds['num_labels'] != 2:
            if ovr_auc:
                # to do one v the rest AUCs:
                y_pred = clf.predict_proba(x_test)
                num_labels = y_pred.shape[1]
                set_to = num_labels+10
                indiv_aucs = []
                for ind in range(y_pred.shape[1]):
                    y_mut = y_test.copy()
                    y_mut[y_mut==ind] = set_to
                    y_mut[y_mut!=set_to] = 0
                    y_mut[y_mut==set_to] = 1
                    fpr, tpr, _ = roc_curve(y_mut, y_pred[:,ind])
                    auc_value = metrics.auc(fpr, tpr)
                    indiv_aucs.append(auc_value)
                aucs.append(indiv_aucs)
            else: aucs.append(clf.score(x_test, y_test))
        else:
            y_pred = clf.predict_proba(x_test)
            fpr, tpr, _ = roc_curve(y_test, y_pred[:,1])
            auc_value = metrics.auc(fpr, tpr)
            aucs.append(auc_value) 
    auc = np.asarray(aucs)
    if ds['num_labels'] != 2:
        multi_aucs = auc
    else: multi_aucs = 0
    return auc.mean(), auc.std(), clf, y_train.shape, y_test.shape, multi_aucs

def fit_model(X,y,ds,model):
    mean, std, clf, train_size, test_size, multi_aucs =  train_model(X,y,ds,model)
    if mean == 1.0 or mean == 0.5:
        mean, std, clf, train_size, test_size, multi_aucs = train_model(X,y,ds,model)
    return mean, std, train_size[0], test_size[0], clf, multi_aucs

In [15]:
#### Use this if DOING a fresh modeling fitting analysis
reanalysis = True # train new models? 
bn = True # use the percentile normalized data or no? 
# if not using bn data, use the following:
log = False 
stand_scaler = False
model = 'rf' #svm, rf, log_reg


if reanalysis:
    pre_norm_ds = [ 'plasmaall_author',
                    'urineall_author',
                    'm_oxylipin_chronic_hep_b',
                    'm_chronic_hep_b_POS',
                    'm_chronic_hep_b_NEG',
                    'm_CER_mass_spectrometry_v4',
                    'm_CER_mass_spectrometry_v4_3_CS',
                    'm_CER_mass_spectrometry_v4_0_NS',
                    'm_CER_mass_spectrometry_v4_2_FS',
                    'm_CER_mass_spectrometry_v4_1_COPD',
                    'm_EICO_mass_spectrometry_v4',
                    'm_EICO_mass_spectrometry_v4_3_CS',
                    'm_EICO_mass_spectrometry_v4_0_NS',
                    'm_EICO_mass_spectrometry_v4_2_FS',
                    'm_EICO_mass_spectrometry_v4_1_COPD',
                    'AN000580',
                    'AN000581',
                    'AN001503']

    if bn:
        path = './bn_pickles/*.pkl'
#         path = './bn_pickles/MTBLS72*.pkl'
    else:
        path = './pickles/*.pkl'

    datasets = OrderedDict()
    for fn in sorted(glob.glob(path)):
        data = pd.read_pickle(open(fn,'rb'))
        datasets[data[0]['study']] = data
    
else:
    #### Use this if NOT doing a fresh modeling fitting analysis
    pickle_file = './YES_bn_ds_models_and_sigfeat_NO_log_NO_standscal_NO_multi_mapped_labels.pkl'
    ### The non-batch corrected pickle for the dataset
    # pickle_file = './NO_bn_dataset_models_and_sigfeat_YES_log.pkl'
    datasets = pickle.load(open(pickle_file, 'rb'))

In [16]:
# this is just to combine datasets and make a new dictionary. 
original_multi_class = ['ST000284', 'ST000046', 'ST000045', 'ST000763', 'ST000329', 'MTBLS358', 'MTBLS352']

combinable_ds = {'MTBLS17':[['IPO_aligned_MTBLS17_neg_exp1', 'IPO_aligned_MTBLS17_pos_exp1'],
                            ['IPO_aligned_MTBLS17_neg_exp2', 'IPO_aligned_MTBLS17_pos_exp2'],
                            ['IPO_aligned_MTBLS17_neg_exp3', 'IPO_aligned_MTBLS17_pos_exp3'],
                            ['IPO_aligned_MTBLS17_neg_onebatch','IPO_aligned_MTBLS17_pos_onebatch'],
                            ['Peaklist_EXP1_POS','Peaklist_EXP1_NEG'],
                            ['Peaklist_EXP2_POS','Peaklist_EXP2_NEG'],
                            ['Peaklist_EXP3_POS','Peaklist_EXP3_NEG']],
                 'MTBLS19': [['Exp1F_POS', 'Exp1F_NEG'], ['Exp2F_POS', 'Exp2F_NEG'], 
                             ['IPO_aligned_MTBLS19_neg_exp1_F', 'IPO_aligned_MTBLS19_pos_exp1_F'],
                             ['IPO_aligned_MTBLS19_neg_exp2_F','IPO_aligned_MTBLS19_pos_exp2_F'],
                             ['IPO_aligned_MTBLS19_neg_all_F_R', 'IPO_aligned_MTBLS19_pos_all_F_R']],
                 'MTBLS28': [['m_mtbls28_NEG_v2_maf', 'm_mtbls28_POS_v2_maf'],
                             ['IPO_aligned_MTBLS28_neg', 'IPO_aligned_MTBLS28_pos']],
                 'MTBLS72': [['IPO_aligned_MTBLS72_neg', 'IPO_aligned_MTBLS72_pos']],
                 'MTBLS105': [['IPO_aligned_MTBLS105_qMS', 'IPO_aligned_MTBLS105_SIM-MS']],
                 'MTBLS266': [['m_mtbls266_NEG_mass_spectrometry_v2_maf', 'm_mtbls266_POS_mass_spectrometry_v2_maf'],
                              ['IPO_aligned_MTBLS266_neg', 'IPO_aligned_MTBLS266_pos']],
                 'MTBLS315': [['m_GC_nmfi_and_bsi_diagnosis_v2_maf', 'm_LC_nmfi_and_bsi_diagnosis_v2_maf', 
                               'm_UPLC_NEG_nmfi_and_bsi_diagnosis_v2_maf', 'm_UPLC_POS_nmfi_and_bsi_diagnosis_v2_maf'],
                              ['IPO_aligned_MTBLS315_mzData', 'IPO_aligned_MTBLS315_mzXML', 
                               'IPO_aligned_MTBLS315_n_mzML', 'IPO_aligned_MTBLS315_p_mzML']],
                 'MTBLS354': [['m_cap_metabolite_profiling_mass_spectrometry_v2_maf', 'm_cap_metabolite_profiling_mass_spectrometry-1_v2_maf'],
                              ['IPO_aligned_MTBLS354_neg', 'IPO_aligned_MTBLS354_pos']],
                 'MTBLS364': [['IPO_aligned_MTBLS364_hil_neg', 'IPO_aligned_MTBLS364_hil_pos',
                               'IPO_aligned_MTBLS364_lip_neg', 'IPO_aligned_MTBLS364_lip_pos']],
                 'ST000045': [#['02Feb10-21-r0_1_II', '11Feb10-21-r0_1_II', '11March10-21-r0_1_II', '17March10-21-r0_1_II'],
                              #['02Feb10-21-r0_0_ND', '11Feb10-21-r0_0_ND', '11March10-21-r0_0_ND', '17March10-21-r0_0_ND'],
                              #['02Feb10-21-r0_2_IW', '11Feb10-21-r0_2_IW', '11March10-21-r0_2_IW', '17March10-21-r0_2_IW'],
                              #['IPO_aligned_ST000045_2feb_pos_1_II', 'IPO_aligned_ST000045_11feb_neg_1_II', 
                              # 'IPO_aligned_ST000045_11mar_pos_1_II', 'IPO_aligned_ST000045_17mar_neg_1_II'],
                              #['IPO_aligned_ST000045_2feb_pos_0_ND', 'IPO_aligned_ST000045_11feb_neg_0_ND',
                              # 'IPO_aligned_ST000045_11mar_pos_0_ND', 'IPO_aligned_ST000045_17mar_neg_0_ND'],
                              #['IPO_aligned_ST000045_2feb_pos_2_IW', 'IPO_aligned_ST000045_11feb_neg_2_IW',
                              # 'IPO_aligned_ST000045_11mar_pos_2_IW', 'IPO_aligned_ST000045_17mar_neg_2_IW']
                              ['02Feb10-21-r0_ND_II','11Feb10-21-r0_ND_II', '11March10-21-_ND_II','17March10-21-_ND_II'],
                              ['02Feb10-21-r0_ND_IW', '11Feb10-21-r0_ND_IW', '11March10-21-_ND_IW', '17March10-21-_ND_IW'],
                              ['02Feb10-21-r0_II_IW', '11Feb10-21-r0_II_IW', '11March10-21-_II_IW', '17March10-21-_II_IW'],
                              ['IPO_aligned_ST000045_2feb_pos_ND_II', 'IPO_aligned_ST000045_11feb_neg_ND_II',
                               'IPO_aligned_ST000045_11mar_pos_ND_II', 'IPO_aligned_ST000045_17mar_neg_ND_II'],
                              ['IPO_aligned_ST000045_2feb_pos_ND_IW', 'IPO_aligned_ST000045_11feb_neg_ND_IW',
                               'IPO_aligned_ST000045_11mar_pos_ND_IW', 'IPO_aligned_ST000045_17mar_neg_ND_IW'],
                              ['IPO_aligned_ST000045_2feb_pos_II_IW', 'IPO_aligned_ST000045_11feb_neg_II_IW',
                               'IPO_aligned_ST000045_11mar_pos_II_IW', 'IPO_aligned_ST000045_17mar_neg_II_IW']],
                 'ST000329': [#['AN000525_0_Control', 'AN000526_0_Control'],
                              #['AN000525_2_FSGS', 'AN000526_2_FSGS'],
                              #['AN000525_1_MCD', 'AN000526_1_MCD'],
                              #['IPO_aligned_ST000329_pos_0_Control', 'IPO_aligned_ST000329_neg_0_Control'],
                              #['IPO_aligned_ST000329_pos_2_FSGS', 'IPO_aligned_ST000329_neg_2_FSGS'],
                              #['IPO_aligned_ST000329_neg_1_MCD', 'IPO_aligned_ST000329_pos_1_MCD']
                              ['AN000525_MCD_FSGS','AN000526_MCD_FSGS'],
                              ['AN000525_MCD_Control','AN000526_MCD_Control'],
                              ['AN000525_FSGS_Control', 'AN000526_FSGS_Control'],
                              ['IPO_aligned_ST000329_pos_MCD_FSGS', 'IPO_aligned_ST000329_neg_MCD_FSGS'],
                              ['IPO_aligned_ST000329_pos_MCD_Control', 'IPO_aligned_ST000329_neg_MCD_Control'],
                              ['IPO_aligned_ST000329_pos_FSGS_Control', 'IPO_aligned_ST000329_neg_FSGS_Control']],
                 'ST000385': [['AN000603_plasma', 'AN000603_serum'],
                              ['AN000620_plasma', 'AN000620_serum'],
                              ['IPO_aligned_ST000385_adc2_plasma', 'IPO_aligned_ST000385_adc2_serum'],
                              ['IPO_aligned_ST000385_adc1_plasma', 'IPO_aligned_ST000385_adc1_serum'],
                              ['IPO_aligned_ST000385_onebatch_plasma','IPO_aligned_ST000385_onebatch_serum']],
                 'ST000392': [['AN000628_plasma', 'AN000628_serum'], 
                              ['IPO_aligned_ST000392_plasma', 'IPO_aligned_ST000392_serum']],
                 'ST000578': [['AN000888', 'AN000889'],
                              ['IPO_aligned_ST000578_AE', 'IPO_aligned_ST000578_C18']],
                 'ST000763': [#['AN001201_0_Healthy', 'AN001202_0_Healthy'],
                              #['AN001201_1_LowRisk', 'AN001202_1_LowRisk'],
                              #['AN001201_4_PAH', 'AN001202_4_PAH'],
                              #['AN001201_2_Normal Pressures', 'AN001202_2_Normal Pressures'],
                              #['AN001201_3_Borderline Pressures', 'AN001202_3_Borderline Pressures'],
                              #['IPO_aligned_ST000763_untar_neg_0_Healthy', 'IPO_aligned_ST000763_untar_pos_0_Healthy'],
                              #['IPO_aligned_ST000763_untar_neg_4_PAH', 'IPO_aligned_ST000763_untar_pos_4_PAH'],
                              #['IPO_aligned_ST000763_untar_neg_1_LowRisk', 'IPO_aligned_ST000763_untar_pos_1_LowRisk'],
                              #['IPO_aligned_ST000763_untar_neg_2_Normal Pressures', 'IPO_aligned_ST000763_untar_pos_2_Normal Pressures'],
                              #['IPO_aligned_ST000763_untar_neg_3_Borderline Pressures', 'IPO_aligned_ST000763_untar_pos_3_Borderline Pressures']
                              ['AN001201_Healthy_PAH','AN001202_Healthy_PAH'],
                              ['AN001201_Healthy_Normal Pressures', 'AN001202_Healthy_Normal Pressures'],
                              ['AN001201_Healthy_Borderline Pressures','AN001202_Healthy_Borderline Pressures'],
                              ['AN001201_Healthy_LowRisk','AN001202_Healthy_LowRisk'],
                              ['AN001201_PAH_Normal Pressures','AN001202_PAH_Normal Pressures'],
                              ['AN001201_PAH_Borderline Pressures','AN001202_PAH_Borderline Pressures'],
                              ['AN001201_PAH_LowRisk','AN001202_PAH_LowRisk'],
                              ['AN001201_Normal Pressures_Borderline Pressures','AN001202_Normal Pressures_Borderline Pressures'],
                              ['AN001201_Normal Pressures_LowRisk','AN001202_Normal Pressures_LowRisk'],
                              ['AN001201_Borderline Pressures_LowRisk','AN001202_Borderline Pressures_LowRisk'],
                              ['IPO_aligned_ST000763_untar_pos_Healthy_PAH','IPO_aligned_ST000763_untar_neg_Healthy_PAH'],
                              ['IPO_aligned_ST000763_untar_pos_Healthy_Normal Pressures','IPO_aligned_ST000763_untar_neg_Healthy_Normal Pressures'],
                              ['IPO_aligned_ST000763_untar_pos_Healthy_Borderline Pressures','IPO_aligned_ST000763_untar_neg_Healthy_Borderline Pressures'],
                              ['IPO_aligned_ST000763_untar_pos_Healthy_LowRisk','IPO_aligned_ST000763_untar_neg_Healthy_LowRisk'],
                              ['IPO_aligned_ST000763_untar_pos_PAH_Normal Pressures','IPO_aligned_ST000763_untar_neg_PAH_Normal Pressures'],
                              ['IPO_aligned_ST000763_untar_pos_PAH_Borderline Pressures','IPO_aligned_ST000763_untar_neg_PAH_Borderline Pressures'],
                              ['IPO_aligned_ST000763_untar_pos_PAH_LowRisk','IPO_aligned_ST000763_untar_neg_PAH_LowRisk'],
                              ['IPO_aligned_ST000763_untar_pos_Normal Pressures_Borderline Pressures','IPO_aligned_ST000763_untar_neg_Normal Pressures_Borderline Pressures'],
                              ['IPO_aligned_ST000763_untar_pos_Normal Pressures_LowRisk','IPO_aligned_ST000763_untar_neg_Normal Pressures_LowRisk'],
                              ['IPO_aligned_ST000763_untar_pos_Borderline Pressures_LowRisk','IPO_aligned_ST000763_untar_neg_Borderline Pressures_LowRisk']],
                 'ST000046': [#['AN000076_0_CN', 'AN000077_0_CN', 'AN000078_0_CN', 'AN000079_0_CN'],
                              #['AN000076_2_MCI', 'AN000077_2_MCI', 'AN000078_2_MCI', 'AN000079_2_MCI'],
                              #['AN000076_1_AD', 'AN000077_1_AD', 'AN000078_1_AD', 'AN000079_1_AD'],
                              #['IPO_aligned_ST000046_20120618_pos_c18_0_CN', 'XCMS-Report-annotated-SingleClass.04jun12_0_CN', 
                              # 'IPO_aligned_ST000046_20120606_neg_hilic_0_CN', 'IPO_aligned_ST000046_20120620_neg_c18_0_CN'],
                              #['IPO_aligned_ST000046_20120618_pos_c18_2_MCI', 'XCMS-Report-annotated-SingleClass.04jun12_2_MCI',
                              # 'IPO_aligned_ST000046_20120606_neg_hilic_2_MCI', 'IPO_aligned_ST000046_20120620_neg_c18_2_MCI'],
                              #['IPO_aligned_ST000046_20120618_pos_c18_1_AD', 'XCMS-Report-annotated-SingleClass.04jun12_1_AD', 
                              # 'IPO_aligned_ST000046_20120606_neg_hilic_1_AD', 'IPO_aligned_ST000046_20120620_neg_c18_1_AD'],
                              #['IPO_aligned_ST000046_20120613_neg_hilic_0_CN', 'IPO_aligned_ST000046_20120625_pos_c18_0_CN',
                              # 'XCMS-Report-annotated-SingleClass.11jun12_0_CN', 'XCMS-Report-annotated-SingleClass.27jun12_0_CN'],
                              #['IPO_aligned_ST000046_20120613_neg_hilic_2_MCI', 'IPO_aligned_ST000046_20120625_pos_c18_2_MCI', 
                              # 'XCMS-Report-annotated-SingleClass.11jun12_2_MCI', 'XCMS-Report-annotated-SingleClass.27jun12_2_MCI'],
                              #['IPO_aligned_ST000046_20120613_neg_hilic_1_AD', 'IPO_aligned_ST000046_20120625_pos_c18_1_AD',
                              # 'XCMS-Report-annotated-SingleClass.11jun12_1_AD', 'XCMS-Report-annotated-SingleClass.27jun12_1_AD']
                              ['AN000076_CN_MCI', 'AN000077_CN_MCI', 'AN000078_CN_MCI','AN000079_CN_MCI'],
                              ['AN000076_CN_AD', 'AN000077_CN_AD', 'AN000078_CN_AD', 'AN000079_CN_AD'],
                              ['AN000076_MCI_AD','AN000077_MCI_AD','AN000078_MCI_AD','AN000079_MCI_AD'],
                              ['IPO_aligned_ST000046_20120606_neg_hilic_CN_MCI','IPO_aligned_ST000046_20120618_pos_c18_CN_MCI',
                               'IPO_aligned_ST000046_20120620_neg_c18_CN_MCI','XCMS-Report-annotated-SingleClass.04jun12_CN_MCI'],
                              ['IPO_aligned_ST000046_20120606_neg_hilic_CN_AD','IPO_aligned_ST000046_20120618_pos_c18_CN_AD',
                               'IPO_aligned_ST000046_20120620_neg_c18_CN_AD','XCMS-Report-annotated-SingleClass.04jun12_CN_AD'],
                              ['IPO_aligned_ST000046_20120606_neg_hilic_MCI_AD','IPO_aligned_ST000046_20120618_pos_c18_MCI_AD',
                               'IPO_aligned_ST000046_20120620_neg_c18_MCI_AD','XCMS-Report-annotated-SingleClass.04jun12_MCI_AD'],
                              ['IPO_aligned_ST000046_20120613_neg_hilic_CN_MCI','IPO_aligned_ST000046_20120625_pos_c18_CN_MCI',
                               'XCMS-Report-annotated-SingleClass.11jun12_CN_MCI','XCMS-Report-annotated-SingleClass.27jun12_CN_MCI'],
                              ['IPO_aligned_ST000046_20120613_neg_hilic_CN_AD','IPO_aligned_ST000046_20120625_pos_c18_CN_AD',
                               'XCMS-Report-annotated-SingleClass.11jun12_CN_AD','XCMS-Report-annotated-SingleClass.27jun12_CN_AD'],
                              ['IPO_aligned_ST000046_20120613_neg_hilic_MCI_AD','IPO_aligned_ST000046_20120625_pos_c18_MCI_AD',
                               'XCMS-Report-annotated-SingleClass.11jun12_MCI_AD','XCMS-Report-annotated-SingleClass.27jun12_MCI_AD']],
                 'MTBLS408': [['IPO_aligned_MTBLS408_neg', 'IPO_aligned_MTBLS408_pos']],
                 'MTBLS352': [#['DEMO_neg-norm-metaboAnalystInput_0_NGT', 'DEMO_pos-norm-metaboAnalystInput_0_NGT'],
                              #['DEMO_neg-norm-metaboAnalystInput_2_Pre-DM', 'DEMO_pos-norm-metaboAnalystInput_2_Pre-DM'],
                              #['DEMO_neg-norm-metaboAnalystInput_1_T2D', 'DEMO_pos-norm-metaboAnalystInput_1_T2D']],
                              ['DEMO_neg-norm-metaboAnalystInput_T2D_NGT', 'DEMO_pos-norm-metaboAnalystInput_T2D_NGT'],
                              ['DEMO_neg-norm-metaboAnalystInput_T2D_Pre-DM', 'DEMO_pos-norm-metaboAnalystInput_T2D_Pre-DM'],
                              ['DEMO_neg-norm-metaboAnalystInput_NGT_Pre-DM', 'DEMO_pos-norm-metaboAnalystInput_NGT_Pre-DM']],
                 'MTBLS358': [#['m_CER_mass_spectrometry_v4_3_CS', 'm_EICO_mass_spectrometry_v4_3_CS',
#                                'm_SHOT_mass_spectrometry_v4_3_CS', 'm_TAG_mass_spectrometry_v4_3_CS'],
#                               ['m_CER_mass_spectrometry_v4_0_NS', 'm_EICO_mass_spectrometry_v4_0_NS', 
#                                'm_SHOT_mass_spectrometry_v4_0_NS', 'm_TAG_mass_spectrometry_v4_0_NS'],
#                               ['m_CER_mass_spectrometry_v4_2_FS', 'm_EICO_mass_spectrometry_v4_2_FS', 
#                                'm_SHOT_mass_spectrometry_v4_2_FS', 'm_TAG_mass_spectrometry_v4_2_FS'],
#                               ['m_CER_mass_spectrometry_v4_1_COPD', 'm_EICO_mass_spectrometry_v4_1_COPD',
#                                'm_SHOT_mass_spectrometry_v4_1_COPD', 'm_TAG_mass_spectrometry_v4_1_COPD']
                              ['m_CER_mass_spectrometry_v4_COPD_FS', 'm_EICO_mass_spectrometry_v4_COPD_FS',
                               'm_SHOT_mass_spectrometry_v4_COPD_FS', 'm_TAG_mass_spectrometry_v4_COPD_FS'],
                              ['m_CER_mass_spectrometry_v4_COPD_CS', 'm_EICO_mass_spectrometry_v4_COPD_CS',
                               'm_SHOT_mass_spectrometry_v4_COPD_CS', 'm_TAG_mass_spectrometry_v4_COPD_CS'],
                              ['m_CER_mass_spectrometry_v4_COPD_NS', 'm_EICO_mass_spectrometry_v4_COPD_NS',
                               'm_SHOT_mass_spectrometry_v4_COPD_NS', 'm_TAG_mass_spectrometry_v4_COPD_NS'],
                              ['m_CER_mass_spectrometry_v4_FS_CS', 'm_EICO_mass_spectrometry_v4_FS_CS',
                               'm_SHOT_mass_spectrometry_v4_FS_CS', 'm_TAG_mass_spectrometry_v4_FS_CS'],
                              ['m_CER_mass_spectrometry_v4_FS_NS', 'm_EICO_mass_spectrometry_v4_FS_NS',
                               'm_SHOT_mass_spectrometry_v4_FS_NS', 'm_TAG_mass_spectrometry_v4_FS_NS'],
                              ['m_CER_mass_spectrometry_v4_CS_NS', 'm_EICO_mass_spectrometry_v4_CS_NS',
                               'm_SHOT_mass_spectrometry_v4_CS_NS', 'm_TAG_mass_spectrometry_v4_CS_NS']],
                 'MTBLS279': [['m_chronic_hep_b_POS', 'm_chronic_hep_b_NEG']],
                 'ST000608': [['AN000929', 'AN000930', 'AN000931']],
                 'ST000450': [['AN000705', 'AN000706']],
                 'ST000356': [['AN000582', 'AN000583']],
                 'ST000355': [['AN000580', 'AN000581']]}
# cannot do on Feng or ST000381; MTBLS148, 264, 665 not included in datasets(?), ST000421, 726
#['peaks', 'data_set', 'study', 'labels', 'disease', 'samples', 'features', 'pre_norm']
mapping_aucs = pd.read_csv('./auc_sigfeat_summary_YES_bn_NO_log_NO_standscal_NO_multi_mapped_labels.csv')

combined_ds = {}
for k, v in datasets.items(): 
    try:
        to_combine = combinable_ds[k]
    except:
        continue
    combined_ds[k] = []
    for combo in to_combine:
        aucs = []
        combined_feat = []
        combined_peaks = []
        combined_samples = []
        combined_labels = []
        combined_feat_names = []
        study = k  
        disease = v[0]['disease']
        a_or_r = 'reprocessed' if 'XCMS' in combo[0] or 'IPO' in combo[0] else 'author'
        combined_name = a_or_r + '_' + combo[0]
        for ds_name in combo:
#             aucs.append(mapping_aucs[mapping_aucs['analysis']==ds_name]['auc'].iloc[0])
            for ds in v:
                if ds['data_set'] == ds_name:
                    combined_feat.append(ds['features'].values)
                    combined_feat_names.append(list(ds['features'].index))
                    combined_peaks.append(ds['peaks'])
                    combined_samples.append(ds['samples'])
                    combined_labels.append(ds['labels'])
#         for ele in combined_feat_names:
#             print(ele[:5])
        combined_feat = np.hstack(tuple(combined_feat))
        combined_feat = pd.DataFrame(combined_feat, index=combined_feat_names[0])
        combined_samples = pd.concat(combined_samples, axis=1)
        ds = {'peaks': combined_peaks,
              'data_set': combined_name,
              'study': k,
              'labels':combined_labels[0],
              'disease': disease,
              'samples': combined_samples,
              'features': combined_feat,
              'single_ds_aucs':aucs}
        combined_ds[k].append(ds)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [17]:
# read in csv file which will be used to sort the indicies of the datasets and labels
ds388_9_combine = pd.read_csv('./ST000388_9_combined.csv')

for k, v in datasets.items(): 
    if k not in ['ST000388', 'ST000389']:
        continue
    for ds in v:
        data_name = ds['features'].index
        for data in ['lc', 'gc', 'min_lc']:
            not_shared = []
            for ele in ds388_9_combine[data]:
                if ele not in data_name:
                    not_shared.append(ele)
            if len(not_shared) < 50:
                for ele in not_shared:
                    ds388_9_combine = ds388_9_combine[ds388_9_combine[data] != ele]
                    
for k, v in datasets.items(): 
    if k not in ['ST000388', 'ST000389']:
        continue
    for ds in v:
        for ele in ['lc', 'gc', 'min_lc']:
            try:
                ds['features'] = ds['features'].loc[ds388_9_combine[ele]]
            except:
                pass
        
for k, v in datasets.items(): 
    if k not in ['ST000388', 'ST000389']:
        continue
    for ds in v:
        ind_names = list(ds['features'].index)
        if ds['data_set'] == 'IPO_aligned_ST000388_LC':   
            ind_names = list(ds388_9_combine['min_lc'])
#             ind_names = [ele.split('_')[0]+'_'+ele.split('_')[-1][:-5] for ele in ind_names]
        ds['labels'] = ds['labels'].loc[ind_names]
# ok now i need to go through and combine the author data and the ipo data
rep_ds = {'data_set': 'reprocessed_ST000388',
          'study': 'ST000388'}
auth_ds = {'data_set': 'author_ST000388',
          'study': 'ST000388'}
rep_feat = []
rep_feat_names = []
auth_feat = []
auth_feat_names = []

for k, v in datasets.items(): 
    if k not in ['ST000388', 'ST000389']:
        continue
    for ds in v:
        if 'IPO' in ds['data_set']:
            rep_ds['disease'] = ds['disease']
            rep_ds['samples'] = ds['samples']
            rep_ds['labels'] = ds['labels']
            rep_ds['peaks'] = ds['peaks']
            rep_feat.append(ds['features'].values)
            rep_feat_names.append(list(ds['features'].index))
        else:
            auth_ds['disease'] = ds['disease']
            auth_ds['samples'] = ds['samples']
            auth_ds['labels'] = ds['labels']
            auth_ds['peaks'] = ds['peaks']
            auth_feat.append(ds['features'].values)
            auth_feat_names.append(list(ds['features'].index))
for combined_feat, comb_ds, inds in zip([rep_feat, auth_feat], [rep_ds,auth_ds], [rep_feat_names, auth_feat_names]):
    combined_feat = np.hstack(tuple(combined_feat))
    combined_feat = pd.DataFrame(combined_feat, index=inds[0])
    comb_ds['features'] = combined_feat
            
combined_ds['ST000388'] = []
combined_ds['ST000388'].append(rep_ds)
combined_ds['ST000388'].append(auth_ds)


In [18]:
p1 = pd.read_csv('combining_st62and63.csv')
p2 = pd.read_csv('combining_st62and63_2.csv')
p1 = p1.sort_values(by=['code']).set_index('code')
p2 = p2.sort_values(by=['code']).set_index('code')
p1_ind_s = set([ele[-2:] for ele in p1.index])
p2_ind_s = set([ele[-2:] for ele in p2.index])
in_both = p1_ind_s.intersection(p2_ind_s)
p1_ind = list(p1.index)
p2_ind = list(p2.index)
p1_ind_new = [ele for ele in p1_ind if ele[-2:] in in_both]
p2_ind_new = [ele for ele in p2_ind if ele[-2:] in in_both]

p1 = p1.loc[p1_ind_new]
p2 = p2.loc[p2_ind_new]

ds_list = datasets['ST000062']
for ds in ds_list:
    disease = ds['disease']
    if ds['data_set'] == 'XCMS-Report-annotated-SingleClass-GCTOF.':
        peaks = ds['peaks']
        data_62, labels_62 = ds['features'].loc[list(p1.name)], ds['labels'].loc[list(p1.name)]
    
ds_list = datasets['ST000063']
for ds in ds_list:
    if ds['data_set'] == 'XCMS-Report-annotated-SingleClass-GCTOF.':
        data_63, labels_63 = ds['features'].loc[list(p2.name)], ds['labels'].loc[list(p2.name)]
name = 'reprocessed_XCMS-Report-annotated-SingleClass-GCTOF'

combined_feat = []
combined_feat_names = []
combined_feat.append(data_62.values)
combined_feat.append(data_63.values)
combined_feat_names.append(list(data_62.index))
combined_feat = np.hstack(tuple(combined_feat))
combined_feat = pd.DataFrame(combined_feat, index=combined_feat_names[0])
combined_labels = labels_62

combined_ds['ST000062'] = []
combined_ds['ST000062'].append({'peaks': peaks,
                                'data_set': name,
                                'study': k,
                                'labels':combined_labels,
                                'disease': disease,
                                'samples': combined_samples,
                                'features': combined_feat,
                                'single_ds_aucs':[]})

In [None]:
for k, v in combined_ds.items():  
    for ds in v:
        print(k)
        print(ds['data_set'])
# if multiclass has not been pre-reduced to different one-v-rest datasets do you want to use one-v-rest or true multi class accuracy
        ovr_auc = True
    
        ds = get_num_labels(ds)
        ds = check_pre_norm(ds)                
        y = ds['labels'].values.copy().ravel().astype(int)
        X = ds['features'].values.copy()
        X = convert_nan_to_val(X, value=0)
        X[np.isinf(X)] = 0
        X[X<0] = 0
        if log and ds['pre_norm'] == 'No':
            X[X<1] = 1
            X = np.log2(X)
#         ds['auc'], ds['auc_std'], ds['train_size'], ds['test_size'], ds['clf'], ds['multi_aucs'] = fit_model(X, y, ds, model)   
        aucs = []
        for i in range(30):
            auc, std,train_size,test_size,clf,multi_auc= fit_model(X,y,ds,model)
            aucs.append(auc)
        aucs = np.asarray(aucs)
        ds['auc'] = aucs.mean()
        ds['auc_std'] = aucs.std()
        ds['train_size'], ds['test_size'], ds['clf'], ds['multi_aucs'] = train_size, test_size, clf, multi_auc
        print(ds['auc'],ds['auc_std'])

MTBLS105
reprocessed_IPO_aligned_MTBLS105_qMS
0.7483796296296297 0.035115611037627587
MTBLS17
reprocessed_IPO_aligned_MTBLS17_neg_exp1
0.6764235819735821 0.022233424366011178
MTBLS17
reprocessed_IPO_aligned_MTBLS17_neg_exp2
0.5820000000000001 0.07792137537762497
MTBLS17
reprocessed_IPO_aligned_MTBLS17_neg_exp3
0.9499999999999998 0.08465616732800194
MTBLS17
reprocessed_IPO_aligned_MTBLS17_neg_onebatch
0.730162912912913 0.01725549033507775
MTBLS17
author_Peaklist_EXP1_POS
0.6524525252525252 0.020823057990428016
MTBLS17
author_Peaklist_EXP2_POS
0.6044999999999999 0.05067900069171758
MTBLS17
author_Peaklist_EXP3_POS
1.0 0.0
MTBLS19
author_Exp1F_POS
0.8275 0.04198710119389842
MTBLS19
author_Exp2F_POS
0.828 0.03912799509302771
MTBLS19
reprocessed_IPO_aligned_MTBLS19_neg_exp1_F
0.8016666666666669 0.03994440581052066
MTBLS19
reprocessed_IPO_aligned_MTBLS19_neg_exp2_F
0.8509999999999999 0.029251780572585092
MTBLS19
reprocessed_IPO_aligned_MTBLS19_neg_all_F_R
0.8657916666666666 0.015812980339652

In [30]:
def make_summary(u,l,i,k,j=0, replace=False):
    if replace:
        auc = u['multi_aucs'].mean(0)[j]
        auc_std = u['multi_aucs'].std(0)[j]
        analysis = u['data_set']+'_'+str(j)
        label = str(l)+str(i)+str(j)
        try:
            model_coef = np.count_nonzero(u['clf'].coef_[j])
        except:
            model_coef = np.count_nonzero(u['clf'].best_estimator_.feature_importances_)
    else:
        auc = u['auc']
        auc_std = u['auc_std']
        analysis = u['data_set']
        label = str(l)+str(i)
        try:
            model_coef = np.count_nonzero(u['clf'].coef_[j])
        except:
            model_coef = np.count_nonzero(u['clf'].best_estimator_.feature_importances_)
    s = {'disease': u['disease'], 
        'number_labels': 2,
        'auc':auc,
        'auc_std': auc_std,
        'samples': u['features'].shape[0],
        'model_nonzero_coef': model_coef,
        'features': u['features'].shape[1],
        'train_size': u['train_size'],
        'test_size': u['test_size'],
        'label': label,
        'case': summed_case,
        'control': summed_control,
        'analysis': analysis,
        'study': k}
#         'single_ds_aucs': u['single_ds_aucs']}
    return s

from string import ascii_letters
summary = []
for l,k in zip(ascii_letters, combined_ds):
    for i, u in enumerate(combined_ds[k]):
        control = u['labels']==0
        case = u['labels']==1
        try:
            summed_control = int(control.sum())
            summed_case = int(case.sum())
        except:
            pass
        summary.append(make_summary(u,l,i,k))                  
summary = pd.DataFrame(summary)
# summary = summary.set_index('study')
# summary['disease_type'] = summary['disease_type'].astype('category')
summary

Unnamed: 0,analysis,auc,auc_std,case,control,disease,features,label,model_nonzero_coef,number_labels,samples,study,test_size,train_size
0,reprocessed_IPO_aligned_MTBLS105_qMS,0.74838,0.035116,40,49,Hepatocellular carcinoma,2031,a0,1515,2,89,MTBLS105,17,72
1,reprocessed_IPO_aligned_MTBLS17_neg_exp1,0.676424,0.022233,59,129,hepatocellular carcinoma,30443,b0,1095,2,188,MTBLS17,36,152
2,reprocessed_IPO_aligned_MTBLS17_neg_exp2,0.582,0.077921,13,50,hepatocellular carcinoma,48752,b1,310,2,63,MTBLS17,12,51
3,reprocessed_IPO_aligned_MTBLS17_neg_exp3,0.95,0.084656,5,5,hepatocellular carcinoma,4721,b2,450,2,10,MTBLS17,2,8
4,reprocessed_IPO_aligned_MTBLS17_neg_onebatch,0.730163,0.017255,77,184,hepatocellular carcinoma,34420,b3,6301,2,261,MTBLS17,51,210
5,author_Peaklist_EXP1_POS,0.652453,0.020823,59,129,hepatocellular carcinoma,2527,b4,1051,2,188,MTBLS17,36,152
6,author_Peaklist_EXP2_POS,0.6045,0.050679,13,50,hepatocellular carcinoma,4439,b5,368,2,63,MTBLS17,12,51
7,author_Peaklist_EXP3_POS,1.0,0.0,5,5,hepatocellular carcinoma,1003,b6,88,2,10,MTBLS17,2,8
8,author_Exp1F_POS,0.8275,0.041987,20,25,hepatocellular carcinoma,2409,c0,307,2,45,MTBLS19,9,36
9,author_Exp2F_POS,0.828,0.039128,20,25,hepatocellular carcinoma,2367,c1,1519,2,45,MTBLS19,9,36


In [31]:
summary.to_csv('30avg_{}_combined_ds_summary.csv'.format(model))

In [12]:
single_ds_aucs = summary['single_ds_aucs'].values
combined_ds_aucs = list(summary['auc'].values)
aucs_mean = []
for ele in single_ds_aucs:
    ele = np.asarray(ele)
    aucs_mean.append(ele.mean())
aucs_mean = np.asarray(aucs_mean)
combined_ds_aucs = np.asarray(combined_ds_aucs)
diff = combined_ds_aucs - aucs_mean
plt.scatter([i for i in range(diff.shape[0])], sorted(diff))
plt.show()
diff = pd.DataFrame(sorted(diff))
# diff

KeyError: 'single_ds_aucs'