In [1]:
# PART OF THIS CODE IS FROM MICHAEL MURPHY - THANKS!
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from collections import OrderedDict
import glob
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500) 
pd.set_option('display.max_columns', 100)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.utils.multiclass import type_of_target # used to check the Y labels are appropriate for classification
from sklearn.metrics import roc_curve
from sklearn import metrics
from sklearn.utils import shuffle
from scipy import interp

In [3]:
#### Use this if NOT doing a fresh modeling fitting analysis
pickle_file = './YES_batch_correct_dataset_models_and_sigfeat_NO_log.pkl'
### The non-batch corrected pickle for the dataset
# pickle_file = './non_bn_dataset_models_and_sigfeat.pkl'
datasets = pickle.load(open(pickle_file, 'rb'))

In [11]:
datasets = OrderedDict()
for fn in sorted(glob.glob('./pickles/*.pkl')):
    data = pd.read_pickle(open(fn,'rb'))
    datasets[data[0]['study']] = data

In [12]:
def convert_nan_to_val(data, value=0):
    data[pd.isnull(data)] = value
    return data

def extract_random_subset(X, y, percent):
    num_to_keep = int(X.shape[0]*percent)
    idx = np.random.randint(X.shape[0], size=num_to_keep)
    return X[idx,:], y[idx,:].ravel()

def l1_log_reg(X,y,ds):
    X,y = shuffle(X,y)
    # intercept scaling of 1 seems to help with the 0.5 AUC, think it was a convergence issues? 
    clf = LogisticRegressionCV(scoring='roc_auc', penalty='l1', solver='liblinear', tol=1e-4, intercept_scaling=1, max_iter=500)
    cv = StratifiedKFold(n_splits=3, shuffle=True)
    aucs = []
    if ds['study'] == 'MTBLS423':
        y[y==6] = 1
    for train, test in cv.split(X,y):
        x_train, y_train = X[train], y[train]
        x_test,y_test = X[test], y[test]
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)
        clf.fit(x_train, y_train)
        if ds['num_labels'] != 2:
            aucs.append(clf.score(x_test, y_test))
        else:
            y_pred = clf.predict_proba(x_test)
            fpr, tpr, _ = roc_curve(y_test, y_pred[:,1])
            auc_value = metrics.auc(fpr, tpr)
            aucs.append(auc_value)            
    auc = np.asarray(aucs)
    # found sometimes that something will just go wrong in fitting and it will shut down all features and give a 0.5 model
    # also sometimes the 1.0 models are fit wrong...
    if auc.mean() == 1.0 or auc.mean() == 0.5:
        X,y = shuffle(X,y)
        clf = LogisticRegressionCV(penalty='l1', scoring='roc_auc', solver='liblinear', tol=1e-4, intercept_scaling=1, max_iter=500)
        cv = StratifiedKFold(n_splits=3, shuffle=True)
        aucs = []
        for train, test in cv.split(X,y):
            x_train, y_train = X[train], y[train]
            x_test,y_test = X[test], y[test]
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train)
            x_test = scaler.transform(x_test)
            clf.fit(x_train, y_train)
            if ds['num_labels'] != 2:
                aucs.append(clf.score(x_test, y_test))
            else:
                y_pred = clf.predict_proba(x_test)
                fpr, tpr, _ = roc_curve(y_test, y_pred[:,1])
                auc_value = metrics.auc(fpr, tpr)
                aucs.append(auc_value)  
    return auc.mean(), auc.std()

# Filter based on feature presence (ie not >0) in > X%

In [5]:
# loose outcome: really no positive or negative bonus to doing this type of filtering
for k, v in datasets.items():  
    plt.figure(figsize=(8,5))
    for ds in v:
        if k in ['MTBLS148','MTBLS200', 'MTBLS20', 'ST000397', 'MTBLS264', 'snyder']:
            ds['p_auc'] = []
            ds['p_std'] = [] # use len(ds['p_auc']) to pass these during plotting
            continue
#         if k != 'ST000063':
#             continue
#         if 'IPO' not in ds['data_set']:
#             continue
            
        ds['labels'] = ds['labels']*1
        vals = ds['labels'].values
        try:
            vals = [item for sublist in vals for item in sublist]
        except:
            pass
        labels = set(vals)
        ds['num_labels'] = len(labels)

        if ds['data_set'] == 'IPO_aligned_MTBLS92':
            f = [fi for fi in list(ds['features'].index) if '163' in fi]
            ds['features'] = ds['features'].loc[f]
            ds['labels'] = ds['labels'].loc[f]
        if ds['study'] == 'MTBLS92' and ds['data_set'] == 'Author_data':
            f = [fi for fi in list(ds['features'].index) if 'A' in fi]
            ds['features'] = ds['features'].loc[f]
            ds['labels'] = ds['labels'].loc[f]
        print(k, ds['data_set'])    
        

        #define the number of repeats for each level of subsetting
        percents = [0, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] # levels of subsetting
        ds['p_auc'] = []
        ds['p_std'] = []
        ds['feature_count'] = []
        unique_p = []
        unique_shapes = []
        for p in percents:
            X = ds['features'].values.copy()
            ds['max_features'] = X.shape[1]
            y = ds['labels'].values.copy().ravel().astype(int)
            X = convert_nan_to_val(X, value=0)
            X[np.isinf(X)] = 0
            col_keep = []
            for i in range(X.shape[1]):
                if np.count_nonzero(X[:,i]) >= int(p*X.shape[0]):
                    col_keep.append(i)
            if len(col_keep) < 2:
                continue
            X = X[:,col_keep]
            ds['feature_count'].append(X.shape[1])
            # for log transform:
            if 'IPO' in ds['data_set'] or 'XCMS' in ds['data_set']:
                X[X==0] = 1
                X = np.log2(X)
            auc_mean, auc_std = l1_log_reg(X,y,ds)
            ds['p_auc'].append(auc_mean)
            ds['p_std'].append(auc_std)
# #         plt.plot(percents, ds['p_auc'], label=ds['data_set'])
        plt.errorbar(np.asarray(ds['feature_count'])/ds['max_features'], ds['p_auc'], ds['p_std'], marker='.', label=ds['data_set']+' '+str(X.shape[1]))
# #         plt.legend(bbox_to_anchor=(1, 0.5))
    plt.legend()
    plt.xlabel('Fraction of features used for model')
    plt.ylabel('AUC')  
    plt.title(k)
#     plt.show()
# #     plt.tight_layout()
    plt.savefig('./fraction_not_0/'+k)
    plt.gcf().clear()

Feng plasma
Feng urine
Feng serum_IPO_aligned_Feng_serum_batch1
Feng serum_IPO_aligned_Feng_serum_batch2
Feng urine_IPO_aligned_Feng_urine_batch1
Feng urine_IPO_aligned_Feng_urine_batch2
MTBLS105 IPO_aligned_MTBLS105_qMS
MTBLS105 IPO_aligned_MTBLS105_SIM-MS
MTBLS146 IPO_aligned_MTBLS146_global_neg
MTBLS146 IPO_aligned_MTBLS146_global_pos


No handles with labels found to put in legend.


MTBLS17 IPO_aligned_MTBLS17_neg_exp1
MTBLS17 IPO_aligned_MTBLS17_neg_exp2
MTBLS17 IPO_aligned_MTBLS17_neg_exp3
MTBLS17 IPO_aligned_MTBLS17_pos_exp1
MTBLS17 IPO_aligned_MTBLS17_pos_exp2
MTBLS17 IPO_aligned_MTBLS17_pos_exp3
MTBLS17 Peaklist_EXP1_POS
MTBLS17 Peaklist_EXP2_POS
MTBLS17 Peaklist_EXP3_POS
MTBLS17 Peaklist_EXP1_NEG
MTBLS17 Peaklist_EXP2_NEG
MTBLS17 Peaklist_EXP3_NEG
MTBLS191 IPO_aligned_MTBLS191
MTBLS19 Exp1F_POS
MTBLS19 Exp1R_POS
MTBLS19 Exp2F_POS
MTBLS19 Exp2R_POS
MTBLS19 Exp1F_NEG
MTBLS19 Exp1R_NEG
MTBLS19 Exp2F_NEG
MTBLS19 Exp2R_NEG
MTBLS19 IPO_aligned_MTBLS19_neg_exp1_F
MTBLS19 IPO_aligned_MTBLS19_neg_exp1_R
MTBLS19 IPO_aligned_MTBLS19_neg_exp2_F
MTBLS19 IPO_aligned_MTBLS19_neg_exp2_R
MTBLS19 IPO_aligned_MTBLS19_pos_exp1_F
MTBLS19 IPO_aligned_MTBLS19_pos_exp1_R
MTBLS19 IPO_aligned_MTBLS19_pos_exp2_F
MTBLS19 IPO_aligned_MTBLS19_pos_exp2_R


No handles with labels found to put in legend.
No handles with labels found to put in legend.


MTBLS218 IPO_aligned_MTBLS218
MTBLS253 m_oxylipin_chronic_hep_b


No handles with labels found to put in legend.


MTBLS266 m_mtbls266_NEG_mass_spectrometry_v2_maf
MTBLS266 m_mtbls266_POS_mass_spectrometry_v2_maf
MTBLS266 IPO_aligned_MTBLS266_neg
MTBLS266 IPO_aligned_MTBLS266_pos
MTBLS279 m_chronic_hep_b_POS
MTBLS279 m_chronic_hep_b_NEG
MTBLS28 m_mtbls28_NEG_v2_maf
MTBLS28 m_mtbls28_POS_v2_maf
MTBLS28 IPO_aligned_MTBLS28_neg
MTBLS28 IPO_aligned_MTBLS28_pos
MTBLS315 m_GC_nmfi_and_bsi_diagnosis_v2_maf
MTBLS315 m_LC_nmfi_and_bsi_diagnosis_v2_maf
MTBLS315 m_UPLC_NEG_nmfi_and_bsi_diagnosis_v2_maf
MTBLS315 m_UPLC_POS_nmfi_and_bsi_diagnosis_v2_maf
MTBLS315 IPO_aligned_MTBLS315_mzData
MTBLS315 IPO_aligned_MTBLS315_mzXML
MTBLS315 IPO_aligned_MTBLS315_n_mzML
MTBLS315 IPO_aligned_MTBLS315_p_mzML
MTBLS315 XCMS-Report-annotated-SingleClass-CDF.
MTBLS315 XCMS-Report-annotated-SingleClass-n_mzML.
MTBLS352 DEMO_neg-norm-metaboAnalystInput
MTBLS352 DEMO_pos-norm-metaboAnalystInput
MTBLS352 IPO_aligned_MTBLS352_neg
MTBLS354 m_cap_metabolite_profiling_mass_spectrometry_v2_maf
MTBLS354 m_cap_metabolite_profiling_mass_



MTBLS404 QC1
MTBLS404 IPO_aligned_MTBLS404_neg
MTBLS408 IPO_aligned_MTBLS408_neg
MTBLS408 IPO_aligned_MTBLS408_pos
MTBLS423 Author data
MTBLS423 IPO_aligned_MTBLS423
MTBLS579 m_typhoid_carriage_metabolite_profiling_mass_spectrometry_v2_maf
MTBLS665 m_huc_c18neg
MTBLS665 m_huc_hilicpos
MTBLS665 IPO_aligned_MTBLS665_c18
MTBLS665 IPO_aligned_MTBLS665_hilic
MTBLS72 IPO_aligned_MTBLS72_neg
MTBLS72 IPO_aligned_MTBLS72_pos
MTBLS92 IPO_aligned_MTBLS92
MTBLS92 Author_data
ST000045 02Feb10-21-r0
ST000045 11Feb10-21-r0
ST000045 11March10-21-r0
ST000045 17March10-21-r0
ST000045 IPO_aligned_ST000045_2feb_pos
ST000045 IPO_aligned_ST000045_11feb_neg
ST000045 IPO_aligned_ST000045_11mar_pos
ST000045 IPO_aligned_ST000045_17mar_neg
ST000046 AN000076
ST000046 AN000077
ST000046 AN000078
ST000046 AN000079
ST000046 IPO_aligned_ST000046_20120606_neg_hilic
ST000046 IPO_aligned_ST000046_20120613_neg_hilic
ST000046 IPO_aligned_ST000046_20120618_pos_c18
ST000046 IPO_aligned_ST000046_20120620_neg_c18
ST000046 IPO_

No handles with labels found to put in legend.


ST000421 13jan12_57-r0-Poor glycemic control
ST000421 09feb12_57-r0-Poor glycemic control
ST000421 16feb12_57-r0-Poor glycemic control
ST000421 14feb12_57-r0-Poor glycemic control
ST000421 10jan12_62-r0-Good glycemic control
ST000421 08feb12_62-r0-Good glycemic control
ST000421 15feb12_62-r0-Good glycemic control
ST000421 13feb12_62-r0-Good glycemic control
ST000421 IPO_aligned_ST000421_n_hil
ST000421 IPO_aligned_ST000421_p_hil
ST000421 IPO_aligned_ST000421_nc18
ST000421 IPO_aligned_ST000421_pc18
ST000450 AN000705
ST000450 AN000706
ST000578 AN000888
ST000578 AN000889
ST000578 IPO_aligned_ST000578_AE
ST000578 IPO_aligned_ST000578_C18
ST000608 AN000929
ST000608 AN000930
ST000608 AN000931
ST000726 AN001138-Negative
ST000726 AN001139-Positive
ST000726 IPO_aligned_ST000726_neg
ST000726 IPO_aligned_ST000726_pos
ST000763 AN001201
ST000763 AN001202
ST000763 IPO_aligned_ST000763_untar_neg
ST000763 IPO_aligned_ST000763_untar_pos
ST000865 AN001390
ST000865 IPO_aligned_ST000865_batch2
ST000865 IPO

No handles with labels found to put in legend.


<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

# Filter based on feature avg intensity > X
what about filtering a row if the max is <x? might be a way to remove features that are small all around...
or maybe where the max - min diff is > some value?

# NOTE: the cell below has 3 main ways of running, see inside


In [15]:
# loose outcome: really no positive or negative bonus to doing this type of filtering
for k, v in datasets.items():  
    plt.figure(figsize=(8,5))
    for ds in v:
        if k in ['MTBLS148','MTBLS200', 'MTBLS20', 'ST000397', 'MTBLS264', 'snyder']:
            ds['p_auc'] = []
            ds['p_std'] = [] # use len(ds['p_auc']) to pass these during plotting
            continue
        if 'IPO' not in ds['data_set'] and 'XCMS' not in ds['data_set']:
            continue
#         if k != 'ST000063':
#             continue
#         if 'IPO' not in ds['data_set']:
#             continue
            
        ds['labels'] = ds['labels']*1
        vals = ds['labels'].values
        try:
            vals = [item for sublist in vals for item in sublist]
        except:
            pass
        labels = set(vals)
        ds['num_labels'] = len(labels)

        if ds['data_set'] == 'IPO_aligned_MTBLS92':
            f = [fi for fi in list(ds['features'].index) if '163' in fi]
            ds['features'] = ds['features'].loc[f]
            ds['labels'] = ds['labels'].loc[f]
        if ds['study'] == 'MTBLS92' and ds['data_set'] == 'Author_data':
            f = [fi for fi in list(ds['features'].index) if 'A' in fi]
            ds['features'] = ds['features'].loc[f]
            ds['labels'] = ds['labels'].loc[f]
        print(k, ds['data_set'])    
        

        #define the number of repeats for each level of subsetting
        thresholds = [0, 100, 1000, 2000, 10000, 40000, 100000] # levels of subsetting
        ds['p_auc'] = []
        ds['p_std'] = []
        ds['feature_count'] = []
        ds['good_thresholds'] = []
        unique_p = []
        unique_shapes = []
        for t in thresholds:
            X = ds['features'].values.copy()
            ds['max_features'] = X.shape[1]
            y = ds['labels'].values.copy().ravel().astype(int)
            X = convert_nan_to_val(X, value=0)
            X[np.isinf(X)] = 0
            col_keep = []
            for i in range(X.shape[1]):
# THESE ARE THE THREE MAIN WAYS OF RUNNING THIS CELL - DONT FORGET TO CHANGE WHERE IT SAVES
                if X[:,i].mean() >= t:
#                 if X[:,i].max() >= t:
#                 if X[:,i].max()-X[:,i].min() >= t:
                    col_keep.append(i)
            if len(col_keep) < 2:
                continue
            X = X[:,col_keep]

            ds['feature_count'].append(X.shape[1])
            # for log transform:
            if 'IPO' in ds['data_set'] or 'XCMS' in ds['data_set']:
                X[X==0] = 1
                X = np.log2(X)
            auc_mean, auc_std = l1_log_reg(X,y,ds)

            ds['p_auc'].append(auc_mean)
            ds['p_std'].append(auc_std)
            ds['good_thresholds'].append(t)
# #         plt.plot(percents, ds['p_auc'], label=ds['data_set'])
        plt.errorbar(np.asarray(ds['feature_count'])/ds['max_features'], ds['p_auc'], ds['p_std'], marker='.', label=ds['data_set']+' '+str(ds['max_features']))
# #         plt.legend(bbox_to_anchor=(1, 0.5))
    plt.legend()
    plt.xlabel('Fraction of features with mean val greater than threshold')
    plt.ylabel('AUC')  
    plt.title(k)

#     plt.show()
# #     plt.tight_layout()
    plt.savefig('./mean_greaterthan_X/'+k) #CHANGE THIS FOR WHICH WAY YOU WANT TO RUN THIS
    plt.gcf().clear()

Feng serum_IPO_aligned_Feng_serum_batch1
Feng serum_IPO_aligned_Feng_serum_batch2
Feng urine_IPO_aligned_Feng_urine_batch1
Feng urine_IPO_aligned_Feng_urine_batch2
MTBLS105 IPO_aligned_MTBLS105_qMS
MTBLS105 IPO_aligned_MTBLS105_SIM-MS
MTBLS146 IPO_aligned_MTBLS146_global_neg
MTBLS146 IPO_aligned_MTBLS146_global_pos


No handles with labels found to put in legend.


MTBLS17 IPO_aligned_MTBLS17_neg_exp1
MTBLS17 IPO_aligned_MTBLS17_neg_exp2
MTBLS17 IPO_aligned_MTBLS17_neg_exp3
MTBLS17 IPO_aligned_MTBLS17_pos_exp1
MTBLS17 IPO_aligned_MTBLS17_pos_exp2
MTBLS17 IPO_aligned_MTBLS17_pos_exp3
MTBLS191 IPO_aligned_MTBLS191
MTBLS19 IPO_aligned_MTBLS19_neg_exp1_F
MTBLS19 IPO_aligned_MTBLS19_neg_exp1_R
MTBLS19 IPO_aligned_MTBLS19_neg_exp2_F
MTBLS19 IPO_aligned_MTBLS19_neg_exp2_R
MTBLS19 IPO_aligned_MTBLS19_pos_exp1_F
MTBLS19 IPO_aligned_MTBLS19_pos_exp1_R
MTBLS19 IPO_aligned_MTBLS19_pos_exp2_F
MTBLS19 IPO_aligned_MTBLS19_pos_exp2_R


No handles with labels found to put in legend.
No handles with labels found to put in legend.


MTBLS218 IPO_aligned_MTBLS218


No handles with labels found to put in legend.
No handles with labels found to put in legend.


MTBLS266 IPO_aligned_MTBLS266_neg
MTBLS266 IPO_aligned_MTBLS266_pos


No handles with labels found to put in legend.


MTBLS28 IPO_aligned_MTBLS28_neg
MTBLS28 IPO_aligned_MTBLS28_pos
MTBLS315 IPO_aligned_MTBLS315_mzData
MTBLS315 IPO_aligned_MTBLS315_mzXML
MTBLS315 IPO_aligned_MTBLS315_n_mzML
MTBLS315 IPO_aligned_MTBLS315_p_mzML
MTBLS315 XCMS-Report-annotated-SingleClass-CDF.
MTBLS315 XCMS-Report-annotated-SingleClass-n_mzML.
MTBLS352 IPO_aligned_MTBLS352_neg
MTBLS354 IPO_aligned_MTBLS354_neg
MTBLS354 IPO_aligned_MTBLS354_pos


No handles with labels found to put in legend.


MTBLS364 IPO_aligned_MTBLS364_hil_neg
MTBLS364 IPO_aligned_MTBLS364_hil_pos
MTBLS364 IPO_aligned_MTBLS364_lip_neg
MTBLS364 IPO_aligned_MTBLS364_lip_pos




MTBLS404 IPO_aligned_MTBLS404_neg
MTBLS408 IPO_aligned_MTBLS408_neg
MTBLS408 IPO_aligned_MTBLS408_pos
MTBLS423 IPO_aligned_MTBLS423


No handles with labels found to put in legend.


MTBLS665 IPO_aligned_MTBLS665_c18
MTBLS665 IPO_aligned_MTBLS665_hilic
MTBLS72 IPO_aligned_MTBLS72_neg
MTBLS72 IPO_aligned_MTBLS72_pos
MTBLS92 IPO_aligned_MTBLS92
ST000045 IPO_aligned_ST000045_2feb_pos
ST000045 IPO_aligned_ST000045_11feb_neg
ST000045 IPO_aligned_ST000045_11mar_pos
ST000045 IPO_aligned_ST000045_17mar_neg
ST000046 IPO_aligned_ST000046_20120606_neg_hilic
ST000046 IPO_aligned_ST000046_20120613_neg_hilic
ST000046 IPO_aligned_ST000046_20120618_pos_c18
ST000046 IPO_aligned_ST000046_20120620_neg_c18
ST000046 IPO_aligned_ST000046_20120625_pos_c18
ST000046 XCMS-Report-annotated-SingleClass.04jun12
ST000046 XCMS-Report-annotated-SingleClass.11jun12
ST000046 XCMS-Report-annotated-SingleClass.27jun12
ST000063 XCMS-Report-annotated-SingleClass.
ST000063 XCMS-Report-annotated-SingleClass-GCTOF.


No handles with labels found to put in legend.


ST000329 IPO_aligned_ST000329_pos
ST000329 IPO_aligned_ST000329_neg


No handles with labels found to put in legend.
No handles with labels found to put in legend.


ST000381 IPO_aligned_ST000381_pos


No handles with labels found to put in legend.


ST000385 IPO_aligned_ST000385_adc1_plasma
ST000385 IPO_aligned_ST000385_adc1_serum
ST000385 IPO_aligned_ST000385_adc2_plasma
ST000385 IPO_aligned_ST000385_adc2_serum
ST000388 IPO_aligned_ST000388_LC
ST000389 IPO_aligned_ST000388_GC
ST000392 IPO_aligned_ST000392_plasma
ST000392 IPO_aligned_ST000392_serum
ST000396 IPO_aligned_ST000396


No handles with labels found to put in legend.


ST000421 IPO_aligned_ST000421_n_hil
ST000421 IPO_aligned_ST000421_p_hil
ST000421 IPO_aligned_ST000421_nc18
ST000421 IPO_aligned_ST000421_pc18


No handles with labels found to put in legend.


ST000578 IPO_aligned_ST000578_AE
ST000578 IPO_aligned_ST000578_C18


No handles with labels found to put in legend.


ST000726 IPO_aligned_ST000726_neg
ST000726 IPO_aligned_ST000726_pos
ST000763 IPO_aligned_ST000763_untar_neg
ST000763 IPO_aligned_ST000763_untar_pos
ST000865 IPO_aligned_ST000865_batch2
ST000865 IPO_aligned_ST000865_batch3


No handles with labels found to put in legend.
No handles with labels found to put in legend.
No handles with labels found to put in legend.


<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

<Figure size 576x360 with 0 Axes>

# Filter based on camera labeling
different ways to filter: 1) get not null isotopes / adduct / pcgroup (note pcgroups might be assigned for all...and with multiple compounds in a pcgroup). 2) get a single compound for each of the groups (? doesn't make sense to do this for pcgroups and not real group for adducts so really only makes some sense for isotopes - ie to deisotope the data...) - ie extract single compounds? 

In [None]:
for k, v in datasets.items():  
#     plt.figure(figsize=(8,5))
    for ds in v:
        if k in ['MTBLS148','MTBLS200', 'MTBLS20', 'ST000397', 'MTBLS264', 'snyder']:
            ds['p_auc'] = []
            ds['p_std'] = [] # use len(ds['p_auc']) to pass these during plotting
            continue
        if 'IPO' not in ds['data_set'] and 'XCMS' not in ds['data_set']:
            continue
        if 'pcgroup' not in list(ds['peaks']):
            continue
        isotopes = ds['peaks']['isotopes']
        isotopes_notnull = isotopes.notnull()
        adducts = ds['peaks']['adduct']
        adducts_notnull = adducts.notnull()
        pcgroups = ds['peaks']['pcgroup']
        pcgroups_notnull = pcgroups.notnull()
        filters = [isotopes_notnull, adducts_notnull, pcgroups_notnull]
        filter_names = ['isotope', 'adduct', 'pcgroup']
#         if k != 'ST000063':
#             continue
#         if 'IPO' not in ds['data_set']:
#             continue
        
        ds['labels'] = ds['labels']*1
        vals = ds['labels'].values
        try:
            vals = [item for sublist in vals for item in sublist]
        except:
            pass
        labels = set(vals)
        ds['num_labels'] = len(labels)
        if ds['data_set'] == 'IPO_aligned_MTBLS92':
            f = [fi for fi in list(ds['features'].index) if '163' in fi]
            ds['features'] = ds['features'].loc[f]
            ds['labels'] = ds['labels'].loc[f]
        if ds['study'] == 'MTBLS92' and ds['data_set'] == 'Author_data':
            f = [fi for fi in list(ds['features'].index) if 'A' in fi]
            ds['features'] = ds['features'].loc[f]
            ds['labels'] = ds['labels'].loc[f]
        print(k, ds['data_set'])  
        
        X = ds['features'].values.copy()
        y = ds['labels'].values.copy().ravel().astype(int)
        X = convert_nan_to_val(X, value=0)
        X[np.isinf(X)] = 0
        if 'IPO' in ds['data_set'] or 'XCMS' in ds['data_set']:
            X[X==0] = 1
            X = np.log2(X)
        auc_mean, auc_std = l1_log_reg(X,y,ds)
        print('No filter: ', auc_mean)
        for filt, name in zip(filters,filter_names):
            X_filt = X[:,filt]
            if X_filt.shape[1] < 2:
                continue
            auc_mean, auc_std = l1_log_reg(X_filt,y,ds)
            print(name, auc_mean)

Feng serum_IPO_aligned_Feng_serum_batch1
No filer:  1.0
isotope 0.9931972789115647
adduct 1.0
pcgroup 1.0
Feng serum_IPO_aligned_Feng_serum_batch2
No filer:  0.996031746031746
isotope 0.996031746031746
adduct 0.9967948717948718
pcgroup 1.0
Feng urine_IPO_aligned_Feng_urine_batch1
No filer:  0.980952380952381
isotope 0.9904761904761905
adduct 0.9740740740740742
pcgroup 0.9761904761904763
Feng urine_IPO_aligned_Feng_urine_batch2
No filer:  0.9714285714285714
isotope 0.9848484848484849
adduct 0.938095238095238
pcgroup 0.9582251082251082
MTBLS105 IPO_aligned_MTBLS105_qMS
No filer:  0.5550393234216764
isotope 0.5202003878474467
pcgroup 0.5689102564102564
MTBLS105 IPO_aligned_MTBLS105_SIM-MS
No filer:  0.7993428140486963
isotope 0.6047053436759319
adduct 0.48598093083387206
pcgroup 0.857223658694247
MTBLS146 IPO_aligned_MTBLS146_global_neg
No filer:  0.39035428986409376
isotope 0.4793158003942318
adduct 0.3555866791160909
pcgroup 0.44235657225853303
MTBLS146 IPO_aligned_MTBLS146_global_pos
N

pcgroup 0.6
ST000063 XCMS-Report-annotated-SingleClass.
No filer:  0.7364047181372548
isotope 0.5533854166666666
pcgroup 0.7441023284313726
ST000063 XCMS-Report-annotated-SingleClass-GCTOF.
No filer:  0.8403416053921569
isotope 0.7562040441176471
pcgroup 0.7682291666666666
ST000329 IPO_aligned_ST000329_pos
No filer:  0.7313131313131312
isotope 0.6003367003367003
adduct 0.7380471380471381
pcgroup 0.7319865319865321
ST000381 IPO_aligned_ST000381_pos
No filer:  0.34767025089605735
isotope 0.3587813620071685
adduct 0.3379928315412186
pcgroup 0.3258064516129033
ST000385 IPO_aligned_ST000385_adc1_plasma
No filer:  0.6782766439909297
isotope 0.7215646258503402
adduct 0.5051020408163266
pcgroup 0.8013378684807256
ST000385 IPO_aligned_ST000385_adc1_serum
