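The purpose of this notebook is to examine how classification performance (cross-validated AUC of an L1-regularised logistic regression) changes when the model is fit on progressively smaller random subsets of the samples in each dataset.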

In [2]:
# PART OF THIS CODE IS FROM MICHAEL MURPHY - THANKS!
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from collections import OrderedDict
import glob
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 500) 
pd.set_option('display.max_columns', 100)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.utils.multiclass import type_of_target # used to check the Y labels are appropriate for classification
from sklearn.metrics import roc_curve
from sklearn import metrics
from sklearn.utils import shuffle
from scipy import interp

In [3]:
def scatterplot(xs, ys, ss, cs, ls, sscale=1, sbins=None):
    """Draw a labelled scatter plot on the current axes, one colour per category.

    Parameters
    ----------
    xs, ys : Series-like
        Point coordinates. Their ``.name`` attributes are used as axis labels
        and they are boolean-masked with ``cs == category``.
    ss : scalar or array-like
        Marker size. A scalar is applied to every point; anything else is
        masked per category in the same way as ``xs`` / ``ys``.
    cs : Series-like
        Category of each point; must provide ``.unique()``.
    ls : iterable
        Text drawn centred on each point, paired with ``xs`` / ``ys`` by position.
    sscale : number, optional
        Multiplier applied to every marker size.
    sbins : iterable of numbers, optional
        Reference sizes shown in the legend when ``ss`` is per-point. When
        omitted, no size legend is drawn.
    """
    ax = plt.gca()
    cm = plt.get_cmap('rainbow')
    categories = cs.unique()
    # Accept any scalar size (int, float, numpy scalar), not only builtin int.
    uniform_size = np.isscalar(ss)

    for i, c in enumerate(categories):
        mask = cs == c
        colour = cm(1. * i / len(categories))
        # `color=` rather than `c=`: a bare RGBA tuple passed as `c` is treated
        # as four values to colormap whenever the category has exactly 4 points.
        ax.scatter(xs[mask],
                   ys[mask],
                   s=ss * sscale if uniform_size else ss[mask] * sscale,
                   color=colour,
                   edgecolor='k',
                   alpha=0.9,
                   label='_nolegend_')
        # Empty proxy artist so the legend shows one entry per category.
        ax.scatter([], [], color=colour, edgecolor='k', label=c)

    # Invisible entry used as a spacer in the legend.
    ax.scatter([], [], marker='None', label=' ')

    if not uniform_size and sbins is not None:
        for s in sbins:
            ax.scatter([], [], c='k', edgecolor='k', s=s * sscale, label=str(s))
            ax.scatter([], [], marker='None', label=' ')

    for x, y, l in zip(xs, ys, ls):
        ax.text(x, y, l, color='k', ha='center', va='center')

    ax.set_xlabel(xs.name)
    ax.set_ylabel(ys.name)
    ax.legend()
    ax.grid()

In [4]:
# `bn` selects which directory of pickled datasets is analysed; it is also
# consulted by the analysis loop further down when extracting X and y.
bn = True
path = './bn_pickles/*.pkl' if bn else './pickles/*.pkl'

# Study id -> the object stored in that study's pickle (a sequence of
# per-dataset dicts), inserted in sorted filename order.
datasets = OrderedDict()
for fn in sorted(glob.glob(path)):
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the file handle is closed after reading.
    with open(fn, 'rb') as fh:
        data = pd.read_pickle(fh)
    datasets[data[0]['study']] = data

In [5]:
def convert_nan_to_val(data, value=0):
    """Overwrite the missing entries of ``data`` with ``value`` and return it.

    The replacement happens in place: the object returned is the same object
    that was passed in.
    """
    missing = pd.isnull(data)
    data[missing] = value
    return data

def extract_random_subset(X, y, percent):
    """Return a random subset of the rows of ``X`` and ``y``, without repeats.

    Parameters
    ----------
    X : 2-D array
        Feature matrix, indexed as ``X[idx, :]``.
    y : 2-D array
        Labels with one row per row of ``X``, indexed as ``y[idx, :]``.
    percent : float
        Fraction of rows to keep; ``int(n_rows * percent)`` rows are drawn
        (capped at ``n_rows``).

    Returns
    -------
    (X_sub, y_sub)
        The selected rows of ``X`` and the matching labels flattened to 1-D.

    Rows are drawn without replacement so that the result is a true subset:
    ``percent=1`` returns every row exactly once, and no duplicated sample can
    end up in both the train and the test fold of a later cross-validation.
    """
    n_rows = X.shape[0]
    num_to_keep = min(int(n_rows * percent), n_rows)
    idx = np.random.choice(n_rows, size=num_to_keep, replace=False)
    return X[idx, :], y[idx, :].ravel()

def _cv_auc_scores(X, y, ds):
    """Shuffle the data and return the held-out score of each of 3 stratified folds."""
    X, y = shuffle(X, y)
    # intercept scaling of 1 seems to help with the 0.5 AUC, think it was a convergence issues?
    clf = LogisticRegressionCV(scoring='roc_auc', penalty='l1', solver='liblinear', tol=1e-4, intercept_scaling=1, max_iter=500)
    cv = StratifiedKFold(n_splits=3, shuffle=True)
    aucs = []
    for train, test in cv.split(X, y):
        y_train, y_test = y[train], y[test]
        # Fit the scaler on the training fold only, then apply it to the test fold.
        scaler = StandardScaler()
        x_train = scaler.fit_transform(X[train])
        x_test = scaler.transform(X[test])
        clf.fit(x_train, y_train)
        if ds['num_labels'] != 2:
            # Not a two-label problem: use the estimator's own score method.
            aucs.append(clf.score(x_test, y_test))
        else:
            y_pred = clf.predict_proba(x_test)
            fpr, tpr, _ = roc_curve(y_test, y_pred[:, 1])
            aucs.append(metrics.auc(fpr, tpr))
    return np.asarray(aucs)


def l1_log_reg(X, y, ds):
    """Cross-validate an L1-penalised logistic regression and summarise the scores.

    Parameters
    ----------
    X : array
        Feature matrix; rows are selected with integer index arrays.
    y : array
        Labels aligned with the rows of ``X``.
    ds : mapping
        Must contain ``'num_labels'``; a value of 2 selects ROC-AUC computed
        from the predicted probability of the second class.

    Returns
    -------
    (mean, std)
        Mean and standard deviation of the per-fold scores.
    """
    auc = _cv_auc_scores(X, y, ds)
    # found sometimes that something will just go wrong in fitting and it will shut down all features and give a 0.5 model
    # also sometimes the 1.0 models are fit wrong...
    # In either case run the whole procedure once more and report that second
    # attempt (previously the retry was computed but its result was discarded).
    if auc.mean() == 1.0 or auc.mean() == 0.5:
        auc = _cv_auc_scores(X, y, ds)
    return auc.mean(), auc.std()

In [None]:
# Subsampling settings are identical for every dataset, so define them once.
num_repeats = 3  # number of random subsets drawn at each level of subsetting
percents = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]  # levels of subsetting

# One figure per study: score vs. fraction of samples kept, one curve per dataset.
for k, v in datasets.items():
#     if k not in  ['ST000763', 'ST000865', 'ST000888', 'ST000918']:
#         continue
    fig = plt.figure(figsize=(8,5))
    for ds in v:
#         if k in ['MTBLS148','MTBLS200', 'MTBLS20', 'ST000397', 'MTBLS264', 'snyder']:
#             ds['p_auc'] = []
#             ds['p_std'] = [] # use len(ds['p_auc']) to pass these during plotting
#             continue
#         if 'XCMS' not in ds['data_set']:
#             continue

        # Count the distinct labels; l1_log_reg reads ds['num_labels'].
        ds['labels'] = ds['labels']*1
        try:
            vals = ds['labels'].values
        except AttributeError:
            # Labels object has no `.values`; use it directly.
            vals = ds['labels']
        try:
            # Flatten one level of nesting when the labels are nested.
            vals = [item for sublist in vals for item in sublist]
        except TypeError:
            # Elements are not iterable, so the labels are already flat.
            pass
        labels = set(vals)
        ds['num_labels'] = len(labels)

#         if ds['data_set'] == 'IPO_aligned_MTBLS92':
#             f = [fi for fi in list(ds['features'].index) if '163' in fi]
#             ds['features'] = ds['features'].loc[f]
#             ds['labels'] = ds['labels'].loc[f]
#         if ds['study'] == 'MTBLS92' and ds['data_set'] == 'Author_data':
#             f = [fi for fi in list(ds['features'].index) if 'A' in fi]
#             ds['features'] = ds['features'].loc[f]
#             ds['labels'] = ds['labels'].loc[f]
        print(k, ds['data_set'])

        if bn:
            try:
                y = ds['labels'].values.copy().astype(int)
            except (AttributeError, TypeError, ValueError):
                # Fall back to the labels as stored when they cannot be
                # converted through `.values` / `astype(int)`.
                y = ds['labels']
            # NOTE: not copied, so the cleaning below modifies ds['features'] in place.
            X = ds['features']
        else:
            X = ds['features'].values.copy()
            y = ds['labels'].values.copy().astype(int)
        y = np.reshape(y, (-1,1))
        X = convert_nan_to_val(X, value=0)
        X[np.isinf(X)] = 0
        # for log transform:
#         if 'IPO' in ds['data_set'] or 'XCMS' in ds['data_set']:
#             X[X==0] = 1
#             X = np.log2(X)

        ds['p_auc'] = []
        ds['p_std'] = []
        for p in percents:
            results = []
            for i in range(num_repeats):
                X_sub,y_sub = extract_random_subset(X,y, percent=p)
                if X_sub.shape[0] < 10:
                    # Too few samples to fit: record a score of 0 for this repeat.
                    results.append(0)
                    continue
                try:
                    auc_mean, auc_std = l1_log_reg(X_sub,y_sub,ds)
                    results.append(auc_mean)
                except Exception as err:
                    # Keep going, but make the failure visible; the NaN is
                    # ignored by nanmean / nanstd below.
                    print('  fit failed for p={}: {!r}'.format(p, err))
                    results.append(np.nan)
            results = np.asarray(results)
            mean = np.nanmean(results)
            std = np.nanstd(results)
            ds['p_auc'].append(mean)
            ds['p_std'].append(std)
#         plt.plot(percents, ds['p_auc'], label=ds['data_set'])
        # Legend entry is the dataset name followed by its number of features.
        plt.errorbar(percents, ds['p_auc'], ds['p_std'], marker='.', label=ds['data_set']+' '+str(X.shape[1]))
#         plt.legend(bbox_to_anchor=(1, 0.5))
        plt.legend()
        plt.xlabel('Fraction of Samples')
        plt.ylabel('AUC')
        plt.title(k)
#     plt.tight_layout()
    fig.savefig(k)
    # Close (not just clear) the figure so figures do not accumulate in memory
    # across studies.
    plt.close(fig)
#         print(ds['p_auc'], ds['p_std'])

Feng plasmaall_author
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
Feng urineall_author
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
Feng serum_IPO_aligned_Feng_serum_batch1
<class 'numpy.ndarray'> <class 'numpy.ndarray'>




Feng serum_IPO_aligned_Feng_serum_batch2
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
Feng urine_IPO_aligned_Feng_urine_batch1
<class 'numpy.ndarray'> <class 'numpy.ndarray'>
