In [1]:
# PART OF THIS CODE IS FROM MICHAEL MURPHY - THANKS!
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from collections import OrderedDict
import glob
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, sys
import glob, re
import seaborn as sns
pd.set_option('display.max_rows', 500) 
pd.set_option('display.max_columns', 100)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.utils.multiclass import type_of_target # used to check the Y labels are appropriate for classification
from sklearn.metrics import roc_curve
from sklearn import metrics
from sklearn.utils import shuffle
from scipy import interp

def get_num_labels(ds):
    """Record the distinct class labels of a dataset entry on the entry itself.

    Parameters
    ----------
    ds : mapping
        Must provide ``ds['labels']``, an object exposing ``.values`` (the
        tests use a pandas Series / DataFrame) that supports ``* 1``.

    Returns
    -------
    The same ``ds`` object, mutated in place:
      * ``ds['labels']``     -- replaced by ``ds['labels'] * 1``
      * ``ds['label_set']``  -- set of distinct label values
      * ``ds['num_labels']`` -- size of that set
    """
    # Multiplying by 1 turns boolean labels into 0/1 integers.
    ds['labels'] = ds['labels']*1

    flat = []
    for entry in ds['labels'].values:
        # Flatten exactly one level of nesting (rows of a 2-D label array, or
        # cells that hold a list of labels). Strings are kept whole: iterating
        # them would split a label such as 'case' into single characters.
        if hasattr(entry, '__iter__') and not isinstance(entry, (str, bytes)):
            flat.extend(entry)
        else:
            flat.append(entry)

    labels = set(flat)
    ds['num_labels'] = len(labels)
    ds['label_set'] = labels
    return ds

def check_pre_norm(ds):
    """Tag a dataset entry with whether its data set is listed in ``pre_norm_ds``.

    Sets ``ds['pre_norm']`` to 'Yes' when ``ds['data_set']`` is a member of the
    module-level ``pre_norm_ds`` collection and to 'No' otherwise, then returns
    the same (mutated) ``ds``.
    """
    is_listed = ds['data_set'] in pre_norm_ds
    ds['pre_norm'] = 'Yes' if is_listed else 'No'
    return ds

def convert_nan_to_val(data, value=0):
    """Overwrite every missing entry of ``data`` with ``value``, in place.

    Missing entries are whatever ``pd.isnull`` flags. The input object itself
    is modified and returned; no copy is made.
    """
    missing_mask = pd.isnull(data)
    data[missing_mask] = value
    return data

In [4]:
#### Use this if DOING a fresh modeling fitting analysis
bn = True # use the percentile-normalized data (./bn_pickles) or the plain pickles (./pickles)?
log = False # log2-transform features of data sets that are not flagged as pre-normalized
stand_scaler = False # standardize features inside each CV fold (read by train_model)
reanalysis = True # True: load raw per-study pickles; False: load a previously saved results pickle
model = 'log_reg' #log_reg, rf or svm

# Data sets flagged as pre-normalized: check_pre_norm() marks them 'Yes' and the
# fitting loop then skips the log transform for them. Defined unconditionally
# because check_pre_norm() reads this name in both modes; defining it only when
# reanalysis is True left the other mode failing with a NameError.
pre_norm_ds = [ 'plasmaall_author',
                'urineall_author',
                'm_oxylipin_chronic_hep_b',
                'm_chronic_hep_b_POS',
                'm_chronic_hep_b_NEG',
                'm_CER_mass_spectrometry_v4',
                'm_CER_mass_spectrometry_v4_3_CS',
                'm_CER_mass_spectrometry_v4_0_NS',
                'm_CER_mass_spectrometry_v4_2_FS',
                'm_CER_mass_spectrometry_v4_1_COPD',
                'm_EICO_mass_spectrometry_v4',
                'm_EICO_mass_spectrometry_v4_3_CS',
                'm_EICO_mass_spectrometry_v4_0_NS',
                'm_EICO_mass_spectrometry_v4_2_FS',
                'm_EICO_mass_spectrometry_v4_1_COPD',
                'AN000580',
                'AN000581',
                'AN001503']

if reanalysis:
    if bn:
#         path = './bn_pickles/*.pkl'
#         path = './bn_pickles/ST00006*_bn_data.pkl'
        path = './bn_pickles/MTBLS72*.pkl'
    else:
        path = './pickles/*.pkl'

    # One entry per study, in sorted file-name order; each pickle holds a
    # sequence of dataset records and the first record's 'study' is the key.
    datasets = OrderedDict()
    for fn in sorted(glob.glob(path)):
        # NOTE: unpickling can execute arbitrary code -- only load trusted files.
        # The context manager closes the handle, which the bare open() never did.
        with open(fn, 'rb') as fh:
            data = pd.read_pickle(fh)
        datasets[data[0]['study']] = data

else:
    #### Use this if NOT doing a fresh modeling fitting analysis
    pickle_file = './YES_bn_ds_models_and_sigfeat_NO_log_NO_standscal_NO_multi_mapped_labels.pkl'
    ### The non-batch corrected pickle for the dataset
    # pickle_file = './NO_bn_dataset_models_and_sigfeat_YES_log.pkl'
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(pickle_file, 'rb') as fh:
        datasets = pickle.load(fh)

In [47]:
def train_model(X,y,ds,model):
    """Fit the selected classifier under 5-fold stratified CV and score each fold.

    Besides its arguments, the function reads two module-level flags:
    ``stand_scaler`` (standardize each fold with a scaler fitted on the
    training split only) and ``ovr_auc`` (for non-binary problems, score each
    class one-vs-rest by AUC instead of calling ``clf.score``).

    Parameters
    ----------
    X : numpy array of samples, one row per sample (indexed with fold indices).
    y : numpy array of labels aligned with the rows of ``X``.
    ds : mapping providing ``ds['num_labels']``; a value of 2 selects the
        binary ROC-AUC path, anything else the multi-class path.
    model : one of 'log_reg', 'rf', 'svm'.

    Returns
    -------
    (mean, std, clf, train_shape, test_shape, multi_aucs)
        ``mean`` / ``std`` are taken over all collected fold scores, ``clf`` is
        the estimator as left by the final fold's fit, the two shapes are those
        of the final fold's label splits, and ``multi_aucs`` is the raw score
        array for non-binary problems or 0 for binary ones.

    Raises
    ------
    ValueError
        If ``model`` is not a recognised classifier name.
    """
    X,y = shuffle(X,y)
    if model == 'log_reg':
        print('using logistic regression')
        if ds['num_labels'] != 2:
            clf = LogisticRegressionCV(scoring='accuracy', penalty='l1', solver='liblinear', tol=1e-4, intercept_scaling=1, max_iter=500, multi_class='ovr')
        else:
            clf = LogisticRegressionCV(scoring='roc_auc', penalty='l1', solver='liblinear', tol=1e-4, intercept_scaling=1, max_iter=500)
    elif model == 'rf':
        print('using random forests')
        param_grid = {'n_estimators':[100,500,1000]}
#         clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
        clf = GridSearchCV(RandomForestClassifier(n_estimators=1000, n_jobs=-1), param_grid, cv=3, n_jobs=-1)
    elif model == 'svm':
        print('using SVMs')
#         clf = SVC(kernel='rbf', probability=True, C=0.000001)
        # NOTE(review): gamma is searched although kernel='linear'; scikit-learn
        # documents gamma for the rbf/poly/sigmoid kernels, so these grid points
        # presumably repeat the same fit -- confirm before trimming the grid.
        param_grid = {'gamma': [1e-3, 0.01, 0.1, 1], 'C': [0.01, 0.1, 1, 10, 100]}
        clf = GridSearchCV(SVC(kernel='linear', probability=True), param_grid, cv=3, n_jobs=-1)
    else:
        # Raise instead of exit(0): exiting from inside a notebook helper tears
        # down the session and reports success for what is a caller error.
        raise ValueError('no valid classifier input, please try again with one of: log_reg, rf or svm')

    cv = StratifiedKFold(n_splits=5, shuffle=False) # so this will probably give rather high - at the end you just get the last model...
    aucs = []
    for train, test in cv.split(X,y):
        x_train, y_train = X[train], y[train]
        x_test, y_test = X[test], y[test]
        if stand_scaler:
            # Fit the scaler on the training split only so that no test
            # statistics leak into the model.
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train)
            x_test = scaler.transform(x_test)
        clf.fit(x_train, y_train)
        if ds['num_labels'] != 2:
            if ovr_auc:
                # to do one v the rest AUCs:
                y_pred = clf.predict_proba(x_test)
                indiv_aucs = []
                # Column i of predict_proba belongs to clf.classes_[i], which is
                # not necessarily the label value i; binarise against the actual
                # class value instead of the column index.
                for col, cls in enumerate(clf.classes_):
                    y_bin = (y_test == cls).astype(int)
                    fpr, tpr, _ = roc_curve(y_bin, y_pred[:,col])
                    auc_value = metrics.auc(fpr, tpr)
                    indiv_aucs.append(auc_value)
                aucs.append(indiv_aucs)
            else: aucs.append(clf.score(x_test, y_test))
        else:
            y_pred = clf.predict_proba(x_test)
#             print(y_test, y_pred)
#             print(clf.predict(x_test))
            # Column 1 holds the probability of clf.classes_[1]; name it as the
            # positive label so label encodings other than {0, 1} score correctly.
            fpr, tpr, _ = roc_curve(y_test, y_pred[:,1], pos_label=clf.classes_[1])
            auc_value = metrics.auc(fpr, tpr)
            aucs.append(auc_value)
#     x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#     clf.fit(x_train, y_train)
#     print(aucs)
    auc = np.asarray(aucs)
    if ds['num_labels'] != 2:
        multi_aucs = auc
    else: multi_aucs = 0
    return auc.mean(), auc.std(), clf, y_train.shape, y_test.shape, multi_aucs

def fit_model(X,y,ds,model):
    """Run ``train_model`` and retry once when the mean score looks degenerate.

    A mean of exactly 1.0 or exactly 0.5 triggers a single second call to
    ``train_model``; whatever that second call returns is used as-is.

    Returns ``(mean, std, n_train, n_test, clf, multi_aucs)`` where ``n_train``
    and ``n_test`` are the first entries of the shapes reported by
    ``train_model``.
    """
    outcome = train_model(X, y, ds, model)
    first_mean = outcome[0]
    if first_mean == 1.0 or first_mean == 0.5:
        # One retry only; the second result is not re-checked.
        outcome = train_model(X, y, ds, model)
    mean, std, clf, train_shape, test_shape, multi_aucs = outcome
    return mean, std, train_shape[0], test_shape[0], clf, multi_aucs

In [48]:
# Used to fit models for all the datasets!
# For every dataset record of every study: derive label metadata, clean the
# feature matrix, then fit the chosen model 10 times and print each run's mean
# and std score followed by the mean and std across the 10 runs. Nothing is
# stored back on ds here (the line that did so is commented out at the bottom).
for k, v in datasets.items():  
    for ds in v: 
        print(k, ds['data_set'], ds['features'].shape)
# if multiclass has not been pre-reduced to different one-v-rest datasets do you want to use one-v-rest or true multi class accuracy
        # ovr_auc is a module-level flag that train_model reads as a global.
        ovr_auc = True
# if working with batch corrected data, must make the labels and data into pd dfs...
#         if bn:
#             ds['labels'] = pd.DataFrame(ds['labels'])
#             ds['features'] = pd.DataFrame(ds['features'])
        ds = get_num_labels(ds)
        ds = check_pre_norm(ds)                
        # Work on copies so the cleaning below does not modify the stored data.
        y = ds['labels'].values.copy().ravel().astype(int)
        X = ds['features'].values.copy()
        # Missing, infinite and negative feature values are all set to 0.
        X = convert_nan_to_val(X, value=0)
        X[np.isinf(X)] = 0
        X[X<0] = 0
        if log and ds['pre_norm'] == 'No':
            print('using log')
            # Clamp to >= 1 first so log2 yields values >= 0 and never -inf.
            X[X<1] = 1
            X = np.log2(X)
        aucs = []
#         X,y = shuffle(X,y)
        # 10 repeated fits; train_model reshuffles X and y on every call, so the
        # folds differ between repeats. No random seed is set anywhere visible.
        for i in range(10):
            auc, auc_std, train_size, test_size, clf, multi_aucs = fit_model(X,y,ds,model)
            print(auc, auc_std)
            aucs.append(auc)
        aucs = np.asarray(aucs)
        print(aucs.mean(), aucs.std())
#         ds['auc'], ds['auc_std'], ds['train_size'], ds['test_size'], ds['clf'], ds['multi_aucs'] = fit_model(X, y, ds, model=model)     
#         print(ds['auc'],ds['auc_std'])

MTBLS72 IPO_aligned_MTBLS72_neg (127, 6417)
using logistic regression
0.8519632414369257 0.055249500879693415
using logistic regression
0.8720969089390141 0.03058762366786248
using logistic regression
0.8154553049289893 0.08027156765043858
using logistic regression
0.8604427736006682 0.0400955511030673
using logistic regression
0.7816833751044276 0.11767525234528883
using logistic regression
0.836173767752715 0.03139608377386206
using logistic regression
0.8058688387635756 0.10206358984958748
using logistic regression
0.751106934001671 0.18573426354938558
using logistic regression
0.8388053467000836 0.06146860307687308
using logistic regression
0.8270258980785297 0.10186499524133383
0.82406223893066 0.03507947282703626
MTBLS72 IPO_aligned_MTBLS72_pos (127, 4529)
using logistic regression
0.7501670843776107 0.09068468999852028
using logistic regression
0.8889724310776943 0.020824304528210755
using logistic regression
0.8104636591478697 0.09767940668947135
using logistic regression
0.828