In [1]:
# Elements in the code are inspired by Michael Murphy - Thanks!
import glob, re
import os, sys
import pickle
import random

from collections import OrderedDict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import interp
from scipy.stats import kruskal, mannwhitneyu
from sklearn import metrics
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.preprocessing import minmax_scale
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.utils.multiclass import type_of_target # used to check the Y labels are appropriate for classification
from statsmodels.stats.multitest import multipletests
pd.set_option('display.max_rows', 500) 
pd.set_option('display.max_columns', 100)

In [2]:
# ---- Run configuration: these names are read as globals by the cells below ----
bn = True # use the percentile normalized data? All of the paper does
log = False  # log2-transform the features of datasets that are not pre-normalised
stand_scaler = False  # standardise features inside each cross-validation fold
model = 'log_reg' #log_reg, rf, svm or plsda
remove_sig = True  # drop features whose FDR-corrected p-value is < 0.05 before modelling
stat_sig = False  # select significant features on the training part of every fold
top_sig = False  # with stat_sig: keep only the top_sig_num most significant features
top_sig_num = 0

# Data-set names that check_pre_norm() flags as already normalised (pre_norm == 'Yes').
pre_norm_ds = [ 'plasmaall_author',
                'urineall_author',
                'm_oxylipin_chronic_hep_b',
                'm_chronic_hep_b_POS',
                'm_chronic_hep_b_NEG',
                'm_CER_mass_spectrometry_v4',
                'm_CER_mass_spectrometry_v4_3_CS',
                'm_CER_mass_spectrometry_v4_0_NS',
                'm_CER_mass_spectrometry_v4_2_FS',
                'm_CER_mass_spectrometry_v4_1_COPD',
                'm_EICO_mass_spectrometry_v4',
                'm_EICO_mass_spectrometry_v4_3_CS',
                'm_EICO_mass_spectrometry_v4_0_NS',
                'm_EICO_mass_spectrometry_v4_2_FS',
                'm_EICO_mass_spectrometry_v4_1_COPD',
                'AN000580',
                'AN000581',
                'AN001503',
                'ulsam_author']
if bn:
    path = './bn_pickles_paper/*.pkl'
else:
    path = './pickles/*.pkl'

# Load every pickle (sorted by file name) and key it by the study id of its first entry.
datasets = OrderedDict()
for fn in sorted(glob.glob(path)):
    # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    # The context manager closes the handle, which the previous bare open() never did.
    with open(fn, 'rb') as fh:
        data = pd.read_pickle(fh, compression=None)
    datasets[data[0]['study']] = data

In [3]:
def get_num_labels(ds):
    """Record the distinct class labels of a dataset.

    ``ds['labels']`` is multiplied by 1 (turning boolean labels into integers)
    and stored back. The distinct values are written to ``ds['label_set']`` and
    their count to ``ds['num_labels']``.

    Parameters
    ----------
    ds : dict
        Dataset dict with a pandas object under ``'labels'``.

    Returns
    -------
    dict
        The same ``ds`` object, modified in place.
    """
    ds['labels'] = ds['labels']*1
    vals = ds['labels'].values
    try:
        # 2-D label tables (one row per sample) are flattened to a plain list
        vals = [item for sublist in vals for item in sublist]
    except TypeError:
        # 1-D labels: the elements are scalars and cannot be iterated; use as-is
        pass
    labels = set(vals)
    ds['num_labels'] = len(labels)
    ds['label_set'] = labels
    return ds

def check_pre_norm(ds):
    """Set ``ds['pre_norm']`` to 'Yes' if the data set name is listed in ``pre_norm_ds``, else 'No'.

    Returns the same ``ds`` dict, modified in place.
    """
    ds['pre_norm'] = 'Yes' if ds['data_set'] in pre_norm_ds else 'No'
    return ds

def convert_nan_to_val(data, value=0):
    """Overwrite every null entry of ``data`` with ``value`` (in place) and return ``data``."""
    missing = pd.isnull(data)
    data[missing] = value
    return data

In [4]:
def fdr_corrected_p(dataset, fill_nan=False, log=False):
    """Compute a Benjamini-Hochberg corrected p-value for every feature of ``dataset``.

    Each feature column is compared between the classes in ``dataset['label_set']``:
    Mann-Whitney U (two-sided) when ``dataset['num_labels'] == 2``, Kruskal-Wallis
    otherwise. The results are written to ``dataset['pvalues']`` and the number of
    p-values below 0.05 to ``dataset['significant']`` (NaN when no p-value is available).

    Parameters
    ----------
    dataset : dict
        Needs ``'features'``, ``'labels'``, ``'label_set'`` and ``'num_labels'``
        (see get_num_labels), plus ``'pre_norm'`` when ``log`` is true.
    fill_nan : bool
        Replace NaN feature values with 0 before testing.
    log : bool
        log2-transform the features (values below 1 are clipped to 1 first), but
        only for datasets whose ``'pre_norm'`` is 'No'.

    Returns
    -------
    None
        ``dataset`` is updated in place.
    """
    X = dataset['features'].values.copy()
    # tuning the data with nan filling to 0, changing 0s to a diff number and or applying log transform
    X[np.isinf(X)] = 0
    X[X<0] = 0
    if fill_nan:
        X = convert_nan_to_val(X, value=0)
    if log and dataset['pre_norm'] == 'No':
        X[X<1] = 1
        X = np.log2(X)

    y = dataset['labels'].values.ravel()
    y = np.asarray([int(i) for i in y])
    p = np.zeros(X.shape[1]) + np.nan

    # Read the label information from the dataset passed in. The previous version
    # used the global ``ds`` here, which only worked while the caller's loop
    # variable happened to be the same object.
    label_set = dataset['label_set']
    num_labels = dataset['num_labels']

    for i in range(X.shape[1]):
        feat_data = []
        for j in label_set:
            X_0 = X[y==j,i]
            feat_data.append(X_0[~np.isnan(X_0)])
        ### do MW-U or kruskal but first find places were vectors all the same.
        if num_labels == 2:
            if set(feat_data[0]) == set(feat_data[1]):
                p[i] = 1
                continue
            _, p[i] = mannwhitneyu(feat_data[0],feat_data[1], alternative='two-sided')
        else:
            if set(feat_data[0]) == set(feat_data[1]) == set(feat_data[2]):
                p[i] = 1
                continue
            _, p[i] = kruskal(*feat_data)

    tested = ~np.isnan(p)
    try:
        _, p[tested], _, _ = multipletests(p[tested], alpha=0.05, method='fdr_bh')
    except Exception:
        # best-effort: keep the uncorrected p-values if the correction cannot be computed
        pass
    dataset['pvalues'] = p
    dataset['significant'] = (dataset['pvalues'] < 0.05).sum() if (~np.isnan(p)).any() else np.nan

In [5]:
def _fold_fdr_pvalues(x_train, y_train):
    """Benjamini-Hochberg corrected Mann-Whitney U p-value for each feature of one training fold."""
    p = np.zeros(x_train.shape[1]) + np.nan
    fold_classes = set(y_train)
    for i in range(x_train.shape[1]):
        feat_data = []
        for j in fold_classes:
            X_0 = x_train[y_train==j,i]
            feat_data.append(X_0[~np.isnan(X_0)])
        # features whose two groups hold the same set of values get p = 1 without a test
        if set(feat_data[0]) == set(feat_data[1]):
            p[i] = 1
            continue
        _, p[i] = mannwhitneyu(feat_data[0],feat_data[1], alternative='two-sided')
    tested = ~np.isnan(p)
    try:
        _, p[tested], _, _ = multipletests(p[tested], alpha=0.05, method='fdr_bh')
    except Exception:
        # best-effort: keep the uncorrected p-values if the correction cannot be computed
        pass
    return p


def _keep_top_significant(p, top_n):
    """Return a copy of ``p`` in which every entry outside the ``top_n`` smallest is set to 1.

    Ties that would exceed ``top_n`` are broken by random sampling.
    """
    top_ps = sorted(list(p.copy()))[:top_n]
    to_keep = []
    for p_val in top_ps:
        new_ps = [ele for ele in np.where(p==p_val)[0].tolist() if ele not in to_keep]
        if len(to_keep) + len(new_ps) > top_n:
            oversum = len(to_keep) + len(new_ps) - top_n
            new_ps = random.sample(new_ps, len(new_ps)-oversum)
        to_keep += new_ps
    ### non-selected features are set to 1 so that they fail the p < 0.05 filter
    n_p = np.ones(p.shape[0])
    n_p[to_keep] = p[to_keep]
    return n_p


def _split_coefficients(clf, model):
    """Per-feature weights of the model fitted on one split.

    A GridSearchCV wrapper is unwrapped to its ``best_estimator_`` first. PLS
    models return the first column of ``coef_``, linear models the first row of
    ``coef_``. Estimators without ``coef_`` (random forest) return
    ``feature_importances_`` as the closest available per-feature weight.
    """
    estimator = getattr(clf, 'best_estimator_', clf)
    if model == 'plsda':
        return estimator.coef_[:,0]
    if hasattr(estimator, 'coef_'):
        return estimator.coef_[0]
    return estimator.feature_importances_


def train_model(X,y,ds,model):
    """Run one shuffled, stratified 5-fold cross-validation of ``model`` on ``X``/``y``.

    Reads the notebook-level flags ``stat_sig``, ``top_sig``, ``top_sig_num``,
    ``remove_sig``, ``stand_scaler`` and (for problems with more than two
    labels) ``ovr_auc``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Integer class labels.
    ds : dict
        Dataset dict. ``'num_labels'`` is read, as are ``'pvalues'`` /
        ``'significant'`` when ``stat_sig`` is off. The per-split weights and
        p-values are appended to ``ds['indiv_split_model_coefs']`` and
        ``ds['indiv_split_p_vals']``.
    model : str
        One of 'log_reg', 'rf', 'svm' or 'plsda'.

    Returns
    -------
    tuple
        ``(mean score, std of scores, clf, y_train.shape, y_test.shape,
        mean number of selected features)``. ``clf``, ``y_train`` and ``y_test``
        are those of the last processed fold.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    X,y = shuffle(X,y)
    if model == 'log_reg':
        if ds['num_labels'] != 2:
            clf = LogisticRegressionCV(scoring='accuracy', penalty='l1', solver='liblinear', tol=1e-4, intercept_scaling=1, max_iter=500, multi_class='ovr')
        else:
            clf = LogisticRegressionCV(scoring='roc_auc', penalty='l1', solver='liblinear', tol=1e-4, intercept_scaling=1, max_iter=500)
    elif model == 'rf':
        clf = RandomForestClassifier(n_estimators=1000)
    elif model == 'svm':
        param_grid = {'gamma': [1e-3, 0.01, 0.1, 1], 'C': [0.01, 0.1, 1, 10, 100]}
        clf = GridSearchCV(SVC(kernel='linear', probability=True), param_grid, cv=3)
    elif model == 'plsda':
        param_grid = {'n_components': [2,5,20,50,100]}
        clf = GridSearchCV(PLSRegression(), param_grid, cv=3)
    else:
        # raise instead of exit(0): exiting would take the whole kernel/interpreter down
        raise ValueError('no valid classifier input, please try again with one of: log_reg, rf, svm or plsda')

    cv = StratifiedKFold(n_splits=5, shuffle=True) # the returned clf is only the model of the last fold
    aucs = []
    num_stat = []
    for train, test in cv.split(X,y):
        x_train, y_train = X[train], y[train]
        x_test, y_test = X[test], y[test]

        if stat_sig:
            # feature selection on the training part only, applied to both parts
            p = _fold_fdr_pvalues(x_train, y_train)
            if top_sig:
                p = _keep_top_significant(p, top_sig_num)
            x_train = x_train[:,p<0.05]
            x_test = x_test[:,p<0.05]
            num_stat.append(x_train.shape[1])
        elif remove_sig:
            p = ds['pvalues'][ds['pvalues']>=0.05]
            num_stat.append(0)
        else:
            num_stat.append(ds['significant'])
            p = ds['pvalues']

        if model == 'plsda':
            # massage the y values into the one-hot format used for pls-da
            y_train_new = np.zeros((y_train.shape[0],2))
            y_train_new[np.arange(y_train.shape[0]), y_train] = 1
            y_train = y_train_new
        if stand_scaler:
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train)
            x_test = scaler.transform(x_test)
        if x_train.shape[1] == 0:
            # nothing to train on: score the fold as chance level
            aucs.append(0.5)
            continue
        try:
            ### the try-except is just for the plsda models, which error when n_comp > # features
            ### this then just trains a model with the max number of features
            clf.fit(x_train, y_train)
        except Exception:
            # ``Exception`` (not a bare except) so that a kernel interrupt is not swallowed
            if model == 'plsda':
                comp = x_train.shape[1]
                clf = PLSRegression(n_components = comp)
                clf.fit(x_train, y_train)
            else:
                print('erroring, cannot train model')
                break
        ds['indiv_split_model_coefs'].append(_split_coefficients(clf, model))
        ds['indiv_split_p_vals'].append(p)

        if ds['num_labels'] != 2:
            ### if problem is set up as a multiclass problem and you are doing one v the rest training or true multiclass predictions
            if ovr_auc:
                # to do one v the rest AUCs:
                y_pred = clf.predict_proba(x_test)
                num_labels = y_pred.shape[1]
                set_to = num_labels+10
                indiv_aucs = []
                for ind in range(y_pred.shape[1]):
                    # binarise the labels: class ``ind`` -> 1, everything else -> 0
                    y_mut = y_test.copy()
                    y_mut[y_mut==ind] = set_to
                    y_mut[y_mut!=set_to] = 0
                    y_mut[y_mut==set_to] = 1
                    fpr, tpr, _ = roc_curve(y_mut, y_pred[:,ind])
                    auc_value = metrics.auc(fpr, tpr)
                    indiv_aucs.append(auc_value)
                aucs.append(indiv_aucs)
            else: aucs.append(clf.score(x_test, y_test))
        else:
            if model == 'plsda':
                # turn the two regression outputs into pseudo-probabilities that sum to 1
                y_pred = clf.predict(x_test)
                y_pred = np.absolute(y_pred)
                row_sum = np.repeat(y_pred.sum(axis=1),2).reshape((-1,2))
                y_pred = np.divide(y_pred, row_sum)
            else:
                y_pred = clf.predict_proba(x_test)
            fpr, tpr, _ = roc_curve(y_test, y_pred[:,1])
            auc_value = metrics.auc(fpr, tpr)
            aucs.append(auc_value)
    auc = np.asarray(aucs)
    return auc.mean(), auc.std(), clf, y_train.shape, y_test.shape, np.asarray(num_stat).mean()

def fit_model(X,y,ds,model):
    """Call train_model and reorder its result.

    Returns ``(mean, std, n_train, n_test, clf, avg_num_stat_feat)`` where
    ``n_train`` / ``n_test`` are the first entries of the train / test shapes
    reported by train_model.
    """
    outcome = train_model(X,y,ds,model)
    score_mean, score_std, fitted_clf, train_shape, test_shape, mean_n_selected = outcome
    return score_mean, score_std, train_shape[0], test_shape[0], fitted_clf, mean_n_selected

In [6]:
# this is just to combine datasets and make a new dictionary. 
original_multi_class = ['ST000284', 'ST000046', 'ST000045', 'ST000763', 'ST000329', 'MTBLS358', 'MTBLS352']

combinable_ds = {'MTBLS17':[['IPO_aligned_MTBLS17_neg_exp1', 'IPO_aligned_MTBLS17_pos_exp1'],
                            ['IPO_aligned_MTBLS17_neg_exp2', 'IPO_aligned_MTBLS17_pos_exp2'],
                            ['IPO_aligned_MTBLS17_neg_exp3', 'IPO_aligned_MTBLS17_pos_exp3'],
                            ['IPO_aligned_MTBLS17_neg_onebatch','IPO_aligned_MTBLS17_pos_onebatch'],
                            ['Peaklist_EXP1_POS','Peaklist_EXP1_NEG'],
                            ['Peaklist_EXP2_POS','Peaklist_EXP2_NEG'],
                            ['Peaklist_EXP3_POS','Peaklist_EXP3_NEG']],
                 'MTBLS19': [['Exp1F_POS', 'Exp1F_NEG'], ['Exp2F_POS', 'Exp2F_NEG'], 
                             ['IPO_aligned_MTBLS19_neg_exp1_F', 'IPO_aligned_MTBLS19_pos_exp1_F'],
                             ['IPO_aligned_MTBLS19_neg_exp2_F','IPO_aligned_MTBLS19_pos_exp2_F'],
                             ['IPO_aligned_MTBLS19_neg_all_F_R', 'IPO_aligned_MTBLS19_pos_all_F_R']],
                 'MTBLS19_data': [['Exp1F_POS', 'Exp1F_NEG'], ['Exp2F_POS', 'Exp2F_NEG'], 
                             ['IPO_aligned_MTBLS19_neg_exp1_F', 'IPO_aligned_MTBLS19_pos_exp1_F'],
                             ['IPO_aligned_MTBLS19_neg_exp2_F','IPO_aligned_MTBLS19_pos_exp2_F'],
                             ['IPO_aligned_MTBLS19_neg_all_F_R', 'IPO_aligned_MTBLS19_pos_all_F_R']],
                 'MTBLS28': [['m_mtbls28_NEG_v2_maf', 'm_mtbls28_POS_v2_maf'],
                             ['IPO_aligned_MTBLS28_neg', 'IPO_aligned_MTBLS28_pos']],
                 'MTBLS72': [['IPO_aligned_MTBLS72_neg', 'IPO_aligned_MTBLS72_pos']],
                 'MTBLS105': [['IPO_aligned_MTBLS105_qMS', 'IPO_aligned_MTBLS105_SIM-MS']],
                 'MTBLS266': [['m_mtbls266_NEG_mass_spectrometry_v2_maf', 'm_mtbls266_POS_mass_spectrometry_v2_maf'],
                              ['IPO_aligned_MTBLS266_neg', 'IPO_aligned_MTBLS266_pos']],
                 'MTBLS315': [['m_GC_nmfi_and_bsi_diagnosis_v2_maf', 'm_LC_nmfi_and_bsi_diagnosis_v2_maf', 
                               'm_UPLC_NEG_nmfi_and_bsi_diagnosis_v2_maf', 'm_UPLC_POS_nmfi_and_bsi_diagnosis_v2_maf'],
                              ['IPO_aligned_MTBLS315_mzData', 'IPO_aligned_MTBLS315_mzXML', 
                               'IPO_aligned_MTBLS315_n_mzML', 'IPO_aligned_MTBLS315_p_mzML']],
                 'MTBLS354': [['m_cap_metabolite_profiling_mass_spectrometry_v2_maf', 'm_cap_metabolite_profiling_mass_spectrometry-1_v2_maf'],
                              ['IPO_aligned_MTBLS354_neg', 'IPO_aligned_MTBLS354_pos']],
                 'MTBLS364': [['IPO_aligned_MTBLS364_hil_neg', 'IPO_aligned_MTBLS364_hil_pos',
                               'IPO_aligned_MTBLS364_lip_neg', 'IPO_aligned_MTBLS364_lip_pos']],
                 'ST000045': [['02Feb10-21-r0_ND_II','11Feb10-21-r0_ND_II', '11March10-21-_ND_II','17March10-21-_ND_II'],
                              ['02Feb10-21-r0_ND_IW', '11Feb10-21-r0_ND_IW', '11March10-21-_ND_IW', '17March10-21-_ND_IW'],
                              ['02Feb10-21-r0_II_IW', '11Feb10-21-r0_II_IW', '11March10-21-_II_IW', '17March10-21-_II_IW'],
                              ['IPO_aligned_ST000045_2feb_pos_ND_II', 'IPO_aligned_ST000045_11feb_neg_ND_II',
                               'IPO_aligned_ST000045_11mar_pos_ND_II', 'IPO_aligned_ST000045_17mar_neg_ND_II'],
                              ['IPO_aligned_ST000045_2feb_pos_ND_IW', 'IPO_aligned_ST000045_11feb_neg_ND_IW',
                               'IPO_aligned_ST000045_11mar_pos_ND_IW', 'IPO_aligned_ST000045_17mar_neg_ND_IW'],
                              ['IPO_aligned_ST000045_2feb_pos_II_IW', 'IPO_aligned_ST000045_11feb_neg_II_IW',
                               'IPO_aligned_ST000045_11mar_pos_II_IW', 'IPO_aligned_ST000045_17mar_neg_II_IW']],
                 'ST000329': [['AN000525_MCD_FSGS','AN000526_MCD_FSGS'],
                              ['AN000525_MCD_Control','AN000526_MCD_Control'],
                              ['AN000525_FSGS_Control', 'AN000526_FSGS_Control'],
                              ['IPO_aligned_ST000329_pos_MCD_FSGS', 'IPO_aligned_ST000329_neg_MCD_FSGS'],
                              ['IPO_aligned_ST000329_pos_MCD_Control', 'IPO_aligned_ST000329_neg_MCD_Control'],
                              ['IPO_aligned_ST000329_pos_FSGS_Control', 'IPO_aligned_ST000329_neg_FSGS_Control']],
                 'ST000385': [['AN000603_plasma', 'AN000603_serum'],
                              ['AN000620_plasma', 'AN000620_serum'],
                              ['IPO_aligned_ST000385_adc2_plasma', 'IPO_aligned_ST000385_adc2_serum'],
                              ['IPO_aligned_ST000385_adc1_plasma', 'IPO_aligned_ST000385_adc1_serum'],
                              ['IPO_aligned_ST000385_onebatch_plasma','IPO_aligned_ST000385_onebatch_serum']],
                 'ST000392': [['AN000628_plasma', 'AN000628_serum'], 
                              ['IPO_aligned_ST000392_plasma', 'IPO_aligned_ST000392_serum']],
                 'ST000578': [['AN000888', 'AN000889'],
                              ['IPO_aligned_ST000578_AE', 'IPO_aligned_ST000578_C18']],
                 'ST000763': [['AN001201_Healthy_PAH','AN001202_Healthy_PAH'],
                              ['AN001201_Healthy_Normal Pressures', 'AN001202_Healthy_Normal Pressures'],
                              ['AN001201_Healthy_Borderline Pressures','AN001202_Healthy_Borderline Pressures'],
                              ['AN001201_Healthy_LowRisk','AN001202_Healthy_LowRisk'],
                              ['AN001201_PAH_Normal Pressures','AN001202_PAH_Normal Pressures'],
                              ['AN001201_PAH_Borderline Pressures','AN001202_PAH_Borderline Pressures'],
                              ['AN001201_PAH_LowRisk','AN001202_PAH_LowRisk'],
                              ['AN001201_Normal Pressures_Borderline Pressures','AN001202_Normal Pressures_Borderline Pressures'],
                              ['AN001201_Normal Pressures_LowRisk','AN001202_Normal Pressures_LowRisk'],
                              ['AN001201_Borderline Pressures_LowRisk','AN001202_Borderline Pressures_LowRisk'],
                              ['IPO_aligned_ST000763_untar_pos_Healthy_PAH','IPO_aligned_ST000763_untar_neg_Healthy_PAH'],
                              ['IPO_aligned_ST000763_untar_pos_Healthy_Normal Pressures','IPO_aligned_ST000763_untar_neg_Healthy_Normal Pressures'],
                              ['IPO_aligned_ST000763_untar_pos_Healthy_Borderline Pressures','IPO_aligned_ST000763_untar_neg_Healthy_Borderline Pressures'],
                              ['IPO_aligned_ST000763_untar_pos_Healthy_LowRisk','IPO_aligned_ST000763_untar_neg_Healthy_LowRisk'],
                              ['IPO_aligned_ST000763_untar_pos_PAH_Normal Pressures','IPO_aligned_ST000763_untar_neg_PAH_Normal Pressures'],
                              ['IPO_aligned_ST000763_untar_pos_PAH_Borderline Pressures','IPO_aligned_ST000763_untar_neg_PAH_Borderline Pressures'],
                              ['IPO_aligned_ST000763_untar_pos_PAH_LowRisk','IPO_aligned_ST000763_untar_neg_PAH_LowRisk'],
                              ['IPO_aligned_ST000763_untar_pos_Normal Pressures_Borderline Pressures','IPO_aligned_ST000763_untar_neg_Normal Pressures_Borderline Pressures'],
                              ['IPO_aligned_ST000763_untar_pos_Normal Pressures_LowRisk','IPO_aligned_ST000763_untar_neg_Normal Pressures_LowRisk'],
                              ['IPO_aligned_ST000763_untar_pos_Borderline Pressures_LowRisk','IPO_aligned_ST000763_untar_neg_Borderline Pressures_LowRisk']],
                 'ST000046': [['AN000076_CN_MCI', 'AN000077_CN_MCI', 'AN000078_CN_MCI','AN000079_CN_MCI'],
                              ['AN000076_CN_AD', 'AN000077_CN_AD', 'AN000078_CN_AD', 'AN000079_CN_AD'],
                              ['AN000076_MCI_AD','AN000077_MCI_AD','AN000078_MCI_AD','AN000079_MCI_AD'],
                              ['IPO_aligned_ST000046_20120606_neg_hilic_CN_MCI','IPO_aligned_ST000046_20120618_pos_c18_CN_MCI',
                               'IPO_aligned_ST000046_20120620_neg_c18_CN_MCI','XCMS-Report-annotated-SingleClass.04jun12_CN_MCI'],
                              ['IPO_aligned_ST000046_20120606_neg_hilic_CN_AD','IPO_aligned_ST000046_20120618_pos_c18_CN_AD',
                               'IPO_aligned_ST000046_20120620_neg_c18_CN_AD','XCMS-Report-annotated-SingleClass.04jun12_CN_AD'],
                              ['IPO_aligned_ST000046_20120606_neg_hilic_MCI_AD','IPO_aligned_ST000046_20120618_pos_c18_MCI_AD',
                               'IPO_aligned_ST000046_20120620_neg_c18_MCI_AD','XCMS-Report-annotated-SingleClass.04jun12_MCI_AD'],
                              ['IPO_aligned_ST000046_20120613_neg_hilic_CN_MCI','IPO_aligned_ST000046_20120625_pos_c18_CN_MCI',
                               'XCMS-Report-annotated-SingleClass.11jun12_CN_MCI','XCMS-Report-annotated-SingleClass.27jun12_CN_MCI'],
                              ['IPO_aligned_ST000046_20120613_neg_hilic_CN_AD','IPO_aligned_ST000046_20120625_pos_c18_CN_AD',
                               'XCMS-Report-annotated-SingleClass.11jun12_CN_AD','XCMS-Report-annotated-SingleClass.27jun12_CN_AD'],
                              ['IPO_aligned_ST000046_20120613_neg_hilic_MCI_AD','IPO_aligned_ST000046_20120625_pos_c18_MCI_AD',
                               'XCMS-Report-annotated-SingleClass.11jun12_MCI_AD','XCMS-Report-annotated-SingleClass.27jun12_MCI_AD']],
                 'MTBLS408': [['IPO_aligned_MTBLS408_neg', 'IPO_aligned_MTBLS408_pos']],
                 'MTBLS352': [['DEMO_neg-norm-metaboAnalystInput_T2D_NGT', 'DEMO_pos-norm-metaboAnalystInput_T2D_NGT'],
                              ['DEMO_neg-norm-metaboAnalystInput_T2D_Pre-DM', 'DEMO_pos-norm-metaboAnalystInput_T2D_Pre-DM'],
                              ['DEMO_neg-norm-metaboAnalystInput_NGT_Pre-DM', 'DEMO_pos-norm-metaboAnalystInput_NGT_Pre-DM']],
                 'MTBLS358': [['m_CER_mass_spectrometry_v4_COPD_FS', 'm_EICO_mass_spectrometry_v4_COPD_FS',
                               'm_SHOT_mass_spectrometry_v4_COPD_FS', 'm_TAG_mass_spectrometry_v4_COPD_FS'],
                              ['m_CER_mass_spectrometry_v4_COPD_CS', 'm_EICO_mass_spectrometry_v4_COPD_CS',
                               'm_SHOT_mass_spectrometry_v4_COPD_CS', 'm_TAG_mass_spectrometry_v4_COPD_CS'],
                              ['m_CER_mass_spectrometry_v4_COPD_NS', 'm_EICO_mass_spectrometry_v4_COPD_NS',
                               'm_SHOT_mass_spectrometry_v4_COPD_NS', 'm_TAG_mass_spectrometry_v4_COPD_NS'],
                              ['m_CER_mass_spectrometry_v4_FS_CS', 'm_EICO_mass_spectrometry_v4_FS_CS',
                               'm_SHOT_mass_spectrometry_v4_FS_CS', 'm_TAG_mass_spectrometry_v4_FS_CS'],
                              ['m_CER_mass_spectrometry_v4_FS_NS', 'm_EICO_mass_spectrometry_v4_FS_NS',
                               'm_SHOT_mass_spectrometry_v4_FS_NS', 'm_TAG_mass_spectrometry_v4_FS_NS'],
                              ['m_CER_mass_spectrometry_v4_CS_NS', 'm_EICO_mass_spectrometry_v4_CS_NS',
                               'm_SHOT_mass_spectrometry_v4_CS_NS', 'm_TAG_mass_spectrometry_v4_CS_NS']],
                 'MTBLS279': [['m_chronic_hep_b_POS', 'm_chronic_hep_b_NEG']],
                 'ST000608': [['AN000929', 'AN000930', 'AN000931']],
                 'ST000450': [['AN000705', 'AN000706']],
                 'ST000356': [['AN000582', 'AN000583']],
                 'ST000355': [['AN000580', 'AN000581']]}

# cannot do on Feng or ST000381; MTBLS148, 264, 665 not included in datasets(?), ST000421, 726
# expected keys of each dataset dict: ['peaks', 'data_set', 'study', 'labels', 'disease', 'samples', 'features', 'pre_norm']

# Build combined_ds: for every study listed in combinable_ds, stack the feature
# tables of the data sets named in each combo side by side into one dataset dict.
combined_ds = {}
ds_names = [member['data_set'] for study_sets in datasets.values() for member in study_sets]
known_ds_names = set(ds_names)

for k, v in datasets.items():
    to_combine = combinable_ds.get(k)
    if to_combine is None:
        # study has no combinable data sets
        continue
    combined_ds[k] = []
    for combo in to_combine:
        if combo[0] not in known_ds_names:
            continue
        aucs = []
        combined_feat = []
        combined_peaks = []
        combined_samples = []
        combined_labels = []
        combined_feat_names = []
        disease = v[0]['disease']
        a_or_r = 'reprocessed' if 'XCMS' in combo[0] or 'IPO' in combo[0] else 'author'
        combined_name = a_or_r + '_' + combo[0]
        for ds_name in combo:
            for member in v:
                if member['data_set'] == ds_name:
                    combined_feat.append(member['features'].values)
                    combined_feat_names.append(list(member['features'].index))
                    combined_peaks.append(member['peaks'])
                    combined_samples.append(member['samples'])
                    combined_labels.append(member['labels'])
        if not combined_feat:
            # combo[0] is known, but only under another study: nothing to stack here
            # (np.hstack would raise on an empty tuple)
            continue
        # NOTE(review): rows are stacked positionally and the index / labels of the first
        # member are reused - assumes all members list the samples in the same order.
        combined_feat = np.hstack(tuple(combined_feat))
        combined_feat = pd.DataFrame(combined_feat, index=combined_feat_names[0])
        combined_samples = pd.concat(combined_samples, axis=1)
        ds = {'peaks': combined_peaks,
              'data_set': combined_name,
              'study': k,
              'labels':combined_labels[0],
              'disease': disease,
              'samples': combined_samples,
              'features': combined_feat,
              'single_ds_aucs':aucs}
        combined_ds[k].append(ds)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [7]:
# Align the ST000388 / ST000389 data sets on a shared sample ordering and merge the
# reprocessed (IPO) feature tables into one combined dataset under combined_ds['ST000388'].
#
# read in csv file which will be used to sort the indices of the datasets and labels
ds388_9_combine = pd.read_csv('./ST000388_9_combined.csv')

# Pass 1: prune the lookup table. For every data set and every id column, collect the
# ids that are missing from the data set's feature index; when fewer than 50 are
# missing, drop the rows holding those ids from the lookup table.
# NOTE(review): presumably a small miss count identifies the column that belongs to
# this data set - confirm the meaning of the threshold of 50.
for k, v in datasets.items(): 
    if k not in ['ST000388', 'ST000389']:
        continue
    for ds in v:
        data_name = ds['features'].index
        for data in ['lc', 'gc', 'min_lc']:
            not_shared = []
            for ele in ds388_9_combine[data]:
                if ele not in data_name:
                    not_shared.append(ele)
            if len(not_shared) < 50:
                for ele in not_shared:
                    ds388_9_combine = ds388_9_combine[ds388_9_combine[data] != ele]
                    
# Pass 2: reorder each feature table by the id columns of the pruned lookup table.
# Any column whose lookup fails is silently skipped by the bare except.
# NOTE(review): which columns fail depends on the pandas version's handling of missing
# labels in .loc - verify before narrowing the except clause.
for k, v in datasets.items(): 
    if k not in ['ST000388', 'ST000389']:
        continue
    for ds in v:
        for ele in ['lc', 'gc', 'min_lc']:
            try:
                ds['features'] = ds['features'].loc[ds388_9_combine[ele]]
            except:
                pass
        
# Pass 3: reorder the labels to follow the (re-ordered) feature index; the
# 'IPO_aligned_ST000388_LC' data set takes its label order from the 'min_lc' column.
for k, v in datasets.items(): 
    if k not in ['ST000388', 'ST000389']:
        continue
    for ds in v:
        ind_names = list(ds['features'].index)
        if ds['data_set'] == 'IPO_aligned_ST000388_LC':   
            ind_names = list(ds388_9_combine['min_lc'])
#             ind_names = [ele.split('_')[0]+'_'+ele.split('_')[-1][:-5] for ele in ind_names]
        ds['labels'] = ds['labels'].loc[ind_names]
# ok now i need to go through and combine the author data and the ipo data
rep_ds = {'data_set': 'reprocessed_ST000388',
          'study': 'ST000388'}
auth_ds = {'data_set': 'author_ST000388',
          'study': 'ST000388'}
rep_feat = []
rep_feat_names = []
auth_feat = []
auth_feat_names = []

# Data sets with 'IPO' in their name feed the reprocessed accumulators, all others the
# author accumulators. The metadata fields are overwritten on every match, so the last
# matching data set provides disease / samples / labels / peaks.
for k, v in datasets.items(): 
    if k not in ['ST000388', 'ST000389']:
        continue
    for ds in v:
        if 'IPO' in ds['data_set']:
            rep_ds['disease'] = ds['disease']
            rep_ds['samples'] = ds['samples']
            rep_ds['labels'] = ds['labels']
            rep_ds['peaks'] = ds['peaks']
            rep_feat.append(ds['features'].values)
            rep_feat_names.append(list(ds['features'].index))
        else:
            auth_ds['disease'] = ds['disease']
            auth_ds['samples'] = ds['samples']
            auth_ds['labels'] = ds['labels']
            auth_ds['peaks'] = ds['peaks']
            auth_feat.append(ds['features'].values)
            auth_feat_names.append(list(ds['features'].index))
# Only the reprocessed data is combined; the author variant is disabled (commented out).
# for combined_feat, comb_ds, inds in zip([rep_feat, auth_feat], [rep_ds,auth_ds], [rep_feat_names, auth_feat_names]):
for combined_feat, comb_ds, inds in zip([rep_feat], [rep_ds], [rep_feat_names]):
    # stack the feature tables side by side, indexed by the first table's sample ids
    combined_feat = np.hstack(tuple(combined_feat))
    combined_feat = pd.DataFrame(combined_feat, index=inds[0])
    comb_ds['features'] = combined_feat
            
combined_ds['ST000388'] = []
combined_ds['ST000388'].append(rep_ds)
# combined_ds['ST000388'].append(auth_ds)


In [None]:
# Annotate every combined dataset (label counts, pre-normalisation flag) and compute
# its FDR-corrected per-feature p-values; NaN features are always filled with 0.
for k, v in combined_ds.items():
    for ds in v:
        ds = check_pre_norm(get_num_labels(ds))
        print(k, ds['data_set'])
        fdr_corrected_p(ds, fill_nan=True, log=True if log else False)

MTBLS105 reprocessed_IPO_aligned_MTBLS105_qMS
MTBLS17 reprocessed_IPO_aligned_MTBLS17_neg_onebatch
MTBLS19_data reprocessed_IPO_aligned_MTBLS19_neg_all_F_R
MTBLS266 reprocessed_IPO_aligned_MTBLS266_neg
MTBLS279 author_m_chronic_hep_b_POS
MTBLS28 reprocessed_IPO_aligned_MTBLS28_neg
MTBLS315 reprocessed_IPO_aligned_MTBLS315_mzData
MTBLS354 reprocessed_IPO_aligned_MTBLS354_neg
MTBLS358 author_m_CER_mass_spectrometry_v4_COPD_FS
MTBLS358 author_m_CER_mass_spectrometry_v4_COPD_CS
MTBLS358 author_m_CER_mass_spectrometry_v4_COPD_NS
MTBLS358 author_m_CER_mass_spectrometry_v4_FS_CS
MTBLS358 author_m_CER_mass_spectrometry_v4_FS_NS
MTBLS358 author_m_CER_mass_spectrometry_v4_CS_NS
MTBLS364 reprocessed_IPO_aligned_MTBLS364_hil_neg
MTBLS408 reprocessed_IPO_aligned_MTBLS408_neg
MTBLS72 reprocessed_IPO_aligned_MTBLS72_neg
ST000045 reprocessed_IPO_aligned_ST000045_2feb_pos_ND_II


In [None]:
# With remove_sig on, keep only the feature columns whose corrected p-value is >= 0.05.
if remove_sig:
    for k, v in combined_ds.items():
        for ds in v:
            not_significant = ds['pvalues']>=0.05
            ds['features'] = ds['features'].iloc[:,not_significant]

In [None]:
def avg_indiv_ps_coefs(ds):
    """Average the per-split model weights (and p-values) collected in ``ds``.

    Reads the notebook-level flags ``stat_sig`` and ``remove_sig``.

    * ``stat_sig``: each split's weights are scattered back to full length at the
      positions where that split's p-value is < 0.05 and then averaged; the
      p-values are averaged over the splits.
    * ``remove_sig``: each split's weights are scattered back to full length at the
      positions where ``ds['pvalues'] >= 0.05`` (the features that were kept) and
      then averaged, so the result lines up with ``ds['pvalues']``.
    * otherwise the weights are averaged as they are.

    Parameters
    ----------
    ds : dict
        Needs ``'indiv_split_model_coefs'``, plus ``'indiv_split_p_vals'``
        (``stat_sig``) or ``'pvalues'`` (other modes).

    Returns
    -------
    tuple
        ``(combined_coefs, combined_ps)``.
    """
    if stat_sig:
        if ds['indiv_split_model_coefs'] != []:
            combined_coefs = []
            for coefs, p in zip(ds['indiv_split_model_coefs'],ds['indiv_split_p_vals']):
                to_combine_coefs = np.zeros(p.shape[0])
                to_combine_coefs[p<0.05] = coefs
                combined_coefs.append(to_combine_coefs)
            combined_coefs = np.asarray(combined_coefs).mean(axis=0)
        else:
            combined_coefs = np.zeros(ds['indiv_split_p_vals'][0].shape[0])
        combined_ps = np.asarray(ds['indiv_split_p_vals']).mean(axis=0)
        return combined_coefs, combined_ps

    combined_coefs = []
    for coefs in ds['indiv_split_model_coefs']:
        if remove_sig:
            to_combine_coefs = np.zeros(ds['pvalues'].shape[0])
            to_combine_coefs[ds['pvalues']>=0.05] = coefs
            # append the full-length vector: the previous version built it and then
            # appended the raw ``coefs``, which did not line up with ds['pvalues']
            combined_coefs.append(to_combine_coefs)
        else:
            combined_coefs.append(coefs)
    combined_coefs = np.asarray(combined_coefs).mean(axis=0)
    combined_ps = ds['pvalues']
    return combined_coefs, combined_ps

In [None]:
# Train and score a model on every dataset of every study: fit_model is run
# 30 times per dataset, and the mean/std AUC, the averaged coefficients and the
# last run's train/test sizes and classifier are stored back on the dataset dict.
import warnings
# Silences every warning category for the rest of the kernel session.
warnings.filterwarnings("ignore")
for k, v in combined_ds.items(): 
    for ds in v: 
        # Reset the per-split accumulators; presumably fit_model appends to
        # them on each call -- confirm against fit_model.
        ds['indiv_split_model_coefs'] = []
        ds['indiv_split_p_vals'] = []
        print(k, ds['data_set'], ds['features'].shape)
        # NOTE(review): not referenced again in this cell; presumably read as a
        # notebook global by fit_model -- confirm.
        ovr_auc = True
        # NOTE(review): `ds` is rebound here, so the writes below only reach
        # combined_ds if these helpers return the same dict object they were given.
        ds = get_num_labels(ds)
        ds = check_pre_norm(ds)                
        y = ds['labels'].values.copy().ravel().astype(int)
        X = ds['features'].values.copy()
        if X.shape[1] == 0:
            # Placeholder results for a dataset with no feature columns.
            # NOTE(review): there is no `continue`, so the code below still runs
            # and overwrites these keys at the end of the iteration.
            ds['train_size'], ds['test_size'], ds['clf'] = 'na','na','na'
            ds['auc'] = 0.5
            ds['auc_std'] = 0
        # Clean the matrix: NaN (via the helper, presumably), +/-inf and
        # negative values are all set to 0.
        X = convert_nan_to_val(X, value=0)
        X[np.isinf(X)] = 0
        X[X<0] = 0
        if log and ds['pre_norm'] == 'No':
            # Floor at 1 before the log so every transformed value is >= 0.
            X[X<1] = 1
            X = np.log2(X)
        aucs = []
        model_feat = []  # not used again in this cell
        avg_stat_sig = []
        # 30 repeated fits; only auc and avg_num_stat_feat are accumulated,
        # `std` is unused and the remaining values keep the last run's result.
        for i in range(30):
            auc, std,train_size,test_size,clf,avg_num_stat_feat = fit_model(X,y,ds,model)
            aucs.append(auc)
            avg_stat_sig.append(avg_num_stat_feat)
        aucs = np.asarray(aucs)
        # Collapse the per-split lists into single averaged arrays, unless
        # nothing was recorded for this dataset.
        if ds['indiv_split_model_coefs'] == [] and ds['indiv_split_p_vals'] == []:
            pass
        else:
            ds['indiv_split_model_coefs'], ds['indiv_split_p_vals'] = avg_indiv_ps_coefs(ds)
        ds['auc'] = aucs.mean()
        ds['auc_std'] = aucs.std()
        ds['avg_stat_sig'] = np.asarray(avg_stat_sig).mean()
        ds['train_size'], ds['test_size'], ds['clf'] = train_size, test_size, clf
        print(ds['auc'],ds['auc_std'])

In [None]:
# Maps a dataset's 'disease' string to a coarse disease category; make_summary
# looks it up as disease_type[u['disease']]. The lookup is an exact dict match,
# so every spelling/capitalisation variant needs its own key.
disease_type = {
    'acute myocardial infarction': 'cardiovascular', 
    'cardiovascular': 'cardiovascular',
    'coronary heart disease': 'cardiovascular',
    'hepatocellular carcinoma': 'cancer',
    'Hepatocellular carcinoma': 'cancer',
    'Hepatocellular Carcinoma': 'cancer',
    'hepatitis b': 'infectious',
    'Malaria': 'infectious',
    'Malaria (P. vivax)':'infectious',
    'non-malaria febrile illness':'infectious',
    # NOTE(review): 'scleroderma PAH' is 'autoimmune' but plain 'scleroderma'
    # further down is 'other' -- confirm this is intended.
    'scleroderma PAH': 'autoimmune',
    'psoriasis':'autoimmune',
    'pneumonia': 'infectious',
    'Pneumonia - Community acquired': 'infectious',
    'copd': 'respiratory',
    'COPD': 'respiratory',
    'chronic hepatitis B' : 'infectious',
    'typhoid': 'infectious',
    'typhoid carriage':'infectious',
    'lyme': 'infectious',
    'common cold - longitudinal':'infectious',
    'Lyme disease': 'infectious',
    'Alzheimers': 'neurological',
    "Alzheimer's": 'neurological',
    'colorectal cancer': 'cancer',
    'Colorectal Cancer': 'cancer',
    'depression': 'neurological',
    'Depression':'neurological',
    'Breast Cancer': 'cancer',
    'Breast cancer':'cancer',
    'Lung cancer': 'cancer',
    'lung cancer': 'cancer',
    'Lung Cancer': 'cancer',
    'lung cancer - adenocarcinoma': 'cancer',
    'lung cancer - non-small-cell lung cancer (adenocarcinoma, etc)': 'cancer',
    'Stability of dried blood samples - diabetic men' : 'metabolic',
    'Obesity - Non-diabetic and T2 diabetic': 'metabolic',
    't2 diabetes': 'metabolic',
    't1 diabetes': 'metabolic',
    'Diabetes - Type I': 'metabolic',
    'Diabetes - healthy v. T2 v. prediabetic': 'metabolic',
    # NOTE(review): this capitalised key is 'metabolic' while the lower-case
    # 'polycystic ovarian syndrome' below is 'other' -- confirm which is intended.
    'Polycystic Ovarian Syndrome': 'metabolic',
    'minimal change disease, focal segmental sclerosis': 'glomerular',
    'interstitial cystitis/painful bladder syndrome': 'other',
    'prepubertal children with obesity': 'other', #MAYBE CHANGE THIS ONE?
    'chronic fatigue syndrome': 'other',
    'Chronic fatigue': 'other',
    'polycystic ovarian syndrome': 'other',
    'scleroderma': 'other',
    'Pregnancy': 'other',
    'smoker v. nonsmoker':'other',
    'Interperson variation':'other',
    'short-term and long-term metabolic changes after bariatric surgery':'other',
    'high intensity exercise metabolomics':'other',
    'Age related metabolomics': 'other',
    'Urine sample storage': 'other',
    'urine metabolome': 'other',
    'Single human time study': 'other'
    }

def make_summary(u,i,k,j=0):
    """Build one summary row (a plain dict) for dataset ``u`` of study ``k``.

    Besides its arguments, this reads the notebook globals ``model``,
    ``disease_type``, ``summed_case`` and ``summed_control``; the last two must
    be set by the caller before each call.

    Parameters
    ----------
    u : dict
        Dataset record. Reads the keys 'auc', 'auc_std', 'data_set', 'clf',
        'disease', 'features', 'significant', 'avg_stat_sig', 'train_size'
        and 'test_size'.
    i : int
        Index of the dataset within its study; appended to ``k`` to form the
        row label.
    k : str
        Study key; stored as 'study' and used as the label prefix.
    j : int, default 0
        Row of ``clf.coef_`` whose non-zero entries are counted (used for
        every model type except 'plsda').

    Returns
    -------
    dict
        One row for the summary table. 'model_nonzero_coef' is 0 when no model
        was fitted and the string 'cannot tell' when the classifier exposes
        none of the attributes tried below. 'number_labels' is always written
        as 2.
    """
    clf = u['clf']
    # Fix: the label used to be built from `l`, which is neither a parameter
    # nor a local of this function; the study key `k` is used instead.
    label = str(k)+str(i)
    if clf == 'na' or clf == '0 features no model':
        # Placeholder strings mean that no model was fitted for this dataset.
        model_coef = 0
    elif model == 'plsda':
        # For PLS-DA, count first-column coefficients whose magnitude exceeds
        # the cutoff; try a wrapped `best_estimator_` first, then the
        # estimator itself.
        cutoff = 5e-4
        try:
            model_coef = clf.best_estimator_.coef_[:,0]
            model_coef = model_coef[np.absolute(model_coef)>cutoff].shape[0]
        except Exception:
            try:
                model_coef = clf.coef_[:,0]
                model_coef = model_coef[np.absolute(model_coef)>cutoff].shape[0]
            except Exception:
                model_coef = 'cannot tell'
    else:
        # Models exposing `coef_`: count non-zero entries of row j. Otherwise
        # fall back to the feature importances of a wrapped `best_estimator_`.
        try:
            model_coef = np.count_nonzero(clf.coef_[j])
        except Exception:
            try:
                model_coef = np.count_nonzero(clf.best_estimator_.feature_importances_)
            except Exception:
                model_coef = 'cannot tell'
    s = {'disease': u['disease'],
        'number_labels': 2,
        'auc': u['auc'],
        'auc_std': u['auc_std'],
        'samples': u['features'].shape[0],
        'model_nonzero_coef': model_coef,
        'significant': u['significant'],
        'avg_stat_sig_per_model': u['avg_stat_sig'],
        'features': u['features'].shape[1],
        'train_size': u['train_size'],
        'test_size': u['test_size'],
        'label': label,
        'case': summed_case,        # notebook global set by the calling loop
        'control': summed_control,  # notebook global set by the calling loop
        'analysis': u['data_set'],
        'disease_type': disease_type[u['disease']],
        'study': k}
    return s
    

# Build the summary table: one row per dataset for two-label datasets, one row
# per class otherwise. make_summary reads the globals `summed_case` and
# `summed_control`, so they are refreshed before every call.
summary = []
for k in combined_ds:
    for i, u in enumerate(combined_ds[k]):
        # Complete the truncated analysis name of this one ST000062 dataset.
        if (k == 'ST000062' and u['data_set'] == 'XCMS-Report-annotated-SingleClass-GCTOF.'):
            u['data_set'] = 'XCMS-Report-annotated-SingleClass-GCTOF.plasma'
        if u['num_labels'] == 2:
            control = u['labels']==0
            case = u['labels']==1
            # Sum over a flat array so the count is a plain int whatever the
            # container type of the labels; no exception is swallowed, so a
            # failure can no longer leave the previous dataset's counts in place.
            summed_control = int(np.asarray(control).sum())
            summed_case = int(np.asarray(case).sum())
            summary.append(make_summary(u,i,k))
        else:
            for j in range(u['num_labels']):
                # NOTE(review): counts are taken as class j vs. all other
                # classes, assuming labels are coded 0..num_labels-1 and that
                # row j of the model coefficients belongs to class j -- confirm.
                # Previously these counts were not updated in this branch.
                case = u['labels']==j
                control = u['labels']!=j
                summed_case = int(np.asarray(case).sum())
                summed_control = int(np.asarray(control).sum())
                # Fix: `replace=True` was passed here, but make_summary has no
                # such parameter, so this call raised TypeError.
                summary.append(make_summary(u,i,k,j=j))
summary = pd.DataFrame(summary)
# summary = summary.set_index('study')
# summary['disease_type'] = summary['disease_type'].astype('category')
summary

In [None]:
# save the df as a csv (the file name encodes the run configuration):
summary.to_csv('./combined_summary-sig_rem_{}_stat_sig_{}_top_sig_{}_{}_30avg_auc_{}_YES_bn_NO_log_NO_standscal_YES_ovo.csv'.format(remove_sig,stat_sig, top_sig, top_sig_num, model))
# save dataset object; the `with` block closes the file handle even if
# pickling fails part-way (the handle used to be opened inline and never closed):
with open('./combined_data_models-sig_rem_{}_stat_sig_{}_top_sig_{}_{}_30avg_{}_YES_bn_NO_log_NO_standscal_YES_ovo.pkl'.format(remove_sig,stat_sig, top_sig, top_sig_num, model), 'wb') as pkl_file:
    pickle.dump(combined_ds, pkl_file)

In [None]:
# Attach the extra per-analysis metadata (column, mode, sample type) to the
# summary table and write the combined table next to the plain summary.
# metadata = pd.read_csv('./ms_instrument_column_polarity_dataset_names.csv', sep='\t')
metadata = (
    pd.read_csv('./ms_instrument_column_polarity_dataset_names.csv', sep=',')
    .set_index('Accession')
)
# Rows are joined on the shared 'analysis' column; any NaN cell left in the
# merged table is then written as the string 'unknown'.
summary_w_metadata = (
    summary
    .merge(metadata, on='analysis')
    .replace(np.nan,'unknown')
)
summary_w_metadata.to_csv(
    './combined_summary-sig_rem_{}_stat_sig_{}_top_sig_{}_{}_30avg_auc_{}_YES_bn_NO_log_NO_standscal_YES_ovo_YES_meta.csv'.format(remove_sig,stat_sig, top_sig, top_sig_num, model)
)
summary_w_metadata
