In [1]:
# %matplotlib notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from collections import OrderedDict
import glob, os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as sp
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.utils import shuffle
pd.options.display.max_columns = 100
from sklearn.metrics import roc_curve
from sklearn.utils import shuffle
from sklearn import metrics

In [6]:
##### this commented out part is for playing around with group / individual data sets
# multi_to_single = ['ST000284', 'ST000046', 'ST000045', 'ST000763', 'ST000329', 'MTBLS358', 'MTBLS352']
# multi_to_single = ['Feng']
# datasets = OrderedDict()
# for fn in sorted(glob.glob('./pickles/*.pkl')):
# #     print(fn)
#     if fn[10:-8] in multi_to_single:
#         data = pd.read_pickle(open(fn,'rb'))
#         print(data[0]['study'])
#         datasets[data[0]['study']] = data

#### for general processing:
# Load every pickled study into an ordered mapping keyed by the study name
# stored on the first data set of each pickle.
# NOTE: unpickling can execute arbitrary code -- only load pickles you trust.
datasets = OrderedDict()
for fn in sorted(glob.glob('./pickles/*.pkl')):
    # Use a context manager so the file handle is closed after each read
    # instead of being left open for the lifetime of the kernel.
    with open(fn, 'rb') as fh:
        data = pd.read_pickle(fh)
    datasets[data[0]['study']] = data
    
# Names (matched against ds['data_set']) that check_pre_norm tags with
# pre_norm == 'Yes'. log_trans_ds_aware skips the log2 transform for these and
# only zero-fills their inf / negative / NaN entries.
# Presumably these tables were already normalised by their authors -- confirm.
pre_norm_ds = [ 'plasmaall_author',
                'urineall_author',
                'm_oxylipin_chronic_hep_b',
                'm_chronic_hep_b_POS',
                'm_chronic_hep_b_NEG',
                'm_CER_mass_spectrometry_v4',
                'm_CER_mass_spectrometry_v4_3_CS',
                'm_CER_mass_spectrometry_v4_0_NS',
                'm_CER_mass_spectrometry_v4_2_FS',
                'm_CER_mass_spectrometry_v4_1_COPD',
                'm_EICO_mass_spectrometry_v4',
                'm_EICO_mass_spectrometry_v4_3_CS',
                'm_EICO_mass_spectrometry_v4_0_NS',
                'm_EICO_mass_spectrometry_v4_2_FS',
                'm_EICO_mass_spectrometry_v4_1_COPD',
                'AN000580',
                'AN000581',
                'AN001503',
                'ulsam_author']

def check_pre_norm(ds):
    """Tag a data set dict with whether its name is listed in ``pre_norm_ds``.

    Sets ``ds['pre_norm']`` to ``'Yes'`` when ``ds['data_set']`` is a member of
    ``pre_norm_ds`` and to ``'No'`` otherwise. The dict is modified in place
    and the same object is returned.
    """
    ds['pre_norm'] = 'Yes' if ds['data_set'] in pre_norm_ds else 'No'
    return ds

# Tag every loaded data set. check_pre_norm mutates the dict it is given and
# returns that same dict, so its return value does not need to be kept.
for study_sets in datasets.values():
    for ds in study_sets:
        check_pre_norm(ds)

Feng


In [3]:
def convert_nan_to_val(data, value=0):
    """Overwrite every null entry of ``data`` with ``value``.

    ``data`` is modified in place through a boolean mask and the same object
    is returned.
    """
    missing = pd.isnull(data)
    data[missing] = value
    return data

# percentile normalization - Sean / Claire's method - code copied, thanks you two!
def percentile_normalization(X, control_indices, all_indices):
    """Replace each value by its percentile within the control rows of its column.

    For every column of ``X`` the rows in ``control_indices`` form the
    reference distribution; each row in ``all_indices`` is scored against it
    with ``scipy.stats.percentileofscore(..., kind='mean')``.

    Returns an array with one row per entry of ``all_indices`` and one column
    per column of ``X``.
    """
    per_feature = []
    for col in range(X.shape[1]):
        reference = X[control_indices, col]
        per_feature.append(
            [sp.percentileofscore(reference, X[row, col], kind='mean') for row in all_indices]
        )
    # Rows were built per feature, so transpose back to samples x features.
    return np.array(per_feature).T

Define the functions that perform the batch normalization process.

In [4]:
def log_trans_ds_aware(ds, X):
    """Clean ``X`` and log2-transform it unless the data set is flagged pre-normalised.

    Infinite and negative entries are set to 0 in place first. Then:

    * ``ds['pre_norm'] == 'Yes'``: nulls become 0 and the values are returned
      without a log transform.
    * otherwise: nulls become 1, everything below 1 is raised to 1 and the
      log2 of the result is returned (so those entries come out as 0).

    Note that ``X`` itself is modified by the cleaning steps.
    """
    non_finite = np.isinf(X)
    X[non_finite] = 0
    X[X < 0] = 0
    if ds['pre_norm'] != 'Yes':
        X = convert_nan_to_val(X, value=1)
        X[X < 1] = 1
        return np.log2(X)
    return convert_nan_to_val(X, value=0)
        
def no_batch_ds_normalization(ds, rng=None):
    '''
    Percentile-normalise a data set that is treated as a single batch.

    The feature matrix is cleaned / log-transformed with log_trans_ds_aware,
    a tiny uniform jitter is added, and every value is replaced by its
    percentile among the control samples (label equal to 0 / False) of the
    same column.

    Parameters
    ----------
    ds : dict
        Data set with 'features' and 'labels' entries (objects exposing
        `.values` and `.index`) plus the 'pre_norm' flag read by
        log_trans_ds_aware.
    rng : optional
        Object with a `uniform(low, high, size=...)` method, e.g.
        `np.random.RandomState(seed)` or `np.random.default_rng(seed)`.
        Pass one to make the jitter reproducible. When None the global
        `np.random` state is used, as before.

    Returns
    -------
    dict
        The same `ds` object; `ds['features']` is replaced in place by a
        DataFrame of percentiles indexed by the original sample labels.
        NOTE(review): the original column names are not carried over.
    '''
    y = ds['labels'].values.copy()
    X = ds['features'].values.copy()
    x_labels = list(ds['features'].index)
    n_samples = y.shape[0]
    # False == 0 in Python, so a single comparison selects both encodings of
    # "control". (The previously computed list of case indices was never used.)
    control = [i for i in range(n_samples) if y[i] == 0]
    all_ind = list(range(n_samples))
    sampler = np.random if rng is None else rng
    # Jitter is drawn before the transform to keep the original draw order.
    # NOTE(review): batch_ds_normalization uses an upper bound of 10**-7 --
    # confirm which of the two is intended.
    X_random = sampler.uniform(0.0, 10**-9, size=(X.shape[0], X.shape[1]))
    X = log_trans_ds_aware(ds, X)
    X = X + X_random
    X = percentile_normalization(X, control, all_ind)
    ds['features'] = pd.DataFrame(X, index=x_labels)
    return ds

def _mtbls72_batch_ids(labels):
    '''
    Return the MTBLS72 batch id (1, 2 or 3) for every file name in `labels`.

    Batch membership is defined by the run number embedded in the file name.
    The ionisation mode is taken from the first label, so the same table
    serves both the NEG and the POS files.
    '''
    runs_by_batch = {
        1: (129, 130, 131, 132, 134, 135, 137, 142, 158, 160, 163, 164, 165,
            166, 167, 170, 173, 174, 175, 177, 179, 180, 181, 133, 136, 138,
            140, 141, 143, 144, 146, 147, 148, 149, 151, 152, 153, 159, 168,
            169, 176, 178, 182),
        2: (7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40, 41, 42,
            43, 44, 45, 46, 47, 48, 49, 51, 52, 53, 54, 55, 56, 57, 58, 59,
            60),
        3: (253, 254, 258, 259, 264, 267, 270, 271, 273, 274, 275, 277, 278,
            284, 285, 286, 289, 295, 297, 299, 302, 251, 255, 256, 257, 265,
            276, 279, 281, 282, 292, 296, 298, 300),
    }
    # Characters 17:20 of 'X20120829_Howard_NEG_129.mzML' hold the mode.
    mode = 'POS' if labels[0][17:20] == 'POS' else 'NEG'
    batch_of = {}
    for batch_id, runs in runs_by_batch.items():
        for run in runs:
            # Run numbers below 100 are zero-padded to two digits in the file names.
            batch_of['X20120829_Howard_{}_{:02d}.mzML'.format(mode, run)] = batch_id
    unknown = [name for name in labels if name not in batch_of]
    if unknown:
        # Skipping these silently would misalign batch ids and samples.
        raise ValueError('MTBLS72: no batch known for samples {}'.format(unknown))
    return [batch_of[name] for name in labels]


def _batch_ids_for_study(ds, k):
    '''
    Derive one batch id per sample of `ds` using the rule for study `k`.

    Raises ValueError when no rule exists for `k`.
    '''
    names = list(ds['features'].index)

    if k == 'Feng':
        # Strip the extension per name so no sample is dropped; the batch is
        # the last character of what remains.
        stems = [n[:-6] if n.endswith('.mzXML') else n for n in names]
        return [int(stem[-1]) for stem in stems]

    if k in ('MTBLS17', 'MTBLS19'):
        return [int(n[3]) for n in names]

    if k == 'MTBLS72':
        return _mtbls72_batch_ids(names)

    if k == 'MTBLS92':
        if names[0][0] == 'X':
            return [int(n[1]) - 1 for n in names]
        letter_to_batch = {'A': 0, 'B': 1}
        return [letter_to_batch[n[0]] for n in names]

    if k == 'MTBLS105':
        return ds['samples']['Factor Value[Batch]'].values

    if k == 'MTBLS146':
        by_file = ds['samples'].set_index('Raw Spectral Data File')['Factor Value[Batch number]']
        # Re-order the sample sheet to the row order of the feature table.
        return by_file.loc[names].values

    if k == 'MTBLS404':
        return [1 if 'b2' in n else 0 for n in names]

    if k in ('ST000063', 'ST000062'):
        return [int(n[-1]) for n in ds['labels'].index]

    if k == 'ST000763':
        table = pd.read_csv('batches.csv').set_index('SAMPLE_ID')
        return table.loc[names].values.flatten()

    if k == 'ST000865':
        return [1 if int(n.split('_')[2]) > 87 else 0 for n in names]

    if k == 'ST000385':
        return [0 if n[2] == '3' else 1 for n in names]

    raise ValueError('no batch definition available for study {!r}'.format(k))


def batch_data(ds,k):
    '''
    Split a 'batch affected' data set into its individual batches.

    Parameters
    ----------
    ds : dict
        Data set with 'features' and 'labels' entries (objects exposing
        `.values` and `.index`); some studies also read ds['samples'].
    k : str
        Study name; selects the rule used to derive the batch of each sample.

    Returns
    -------
    x_batches : list of arrays
        Feature rows of each batch.
    y_batches : list of arrays
        Label rows of each batch, aligned with `x_batches`.
    single_x_batch_labels : list
        Feature index labels, concatenated in the same batch order.
    single_y_batch_labels : list
        Label index labels, concatenated in the same batch order.

    Batches are emitted in sorted order of their ids so the row order of the
    result is deterministic.

    Raises
    ------
    ValueError
        If `k` has no batch rule, a sample cannot be assigned to a batch, or
        the number of batch ids differs from the number of samples.
    '''
    X = ds['features'].values.copy()
    y = ds['labels'].values.copy()
    x_labels = list(ds['features'].index)
    y_labels = list(ds['labels'].index)

    batches = np.asarray(_batch_ids_for_study(ds, k))
    if batches.ndim != 1 or batches.shape[0] != X.shape[0]:
        raise ValueError(
            '{}: got {} batch ids for {} samples'.format(k, batches.size, X.shape[0])
        )

    x_batches = []
    y_batches = []
    single_x_batch_labels = []
    single_y_batch_labels = []
    # np.unique returns the distinct ids sorted, unlike iterating a set.
    for b in np.unique(batches):
        mask = batches == b
        x_batches.append(X[mask])
        y_batches.append(y[mask])
        single_x_batch_labels.extend(lab for lab, keep in zip(x_labels, mask) if keep)
        # Take these from the *label* index (previously the feature index was reused).
        single_y_batch_labels.extend(lab for lab, keep in zip(y_labels, mask) if keep)
    return x_batches, y_batches, single_x_batch_labels, single_y_batch_labels

def batch_ds_normalization(ds, k, rng=None):
    '''
    Percentile-normalise a data set separately within each of its batches.

    batch_data splits the samples into batches. Inside each batch the values
    are cleaned / log-transformed with log_trans_ds_aware, a tiny uniform
    jitter is added, and every value is replaced by its percentile among
    that batch's control samples (label equal to 0 / False). The batches are
    then stacked back together in the order batch_data returned them.

    Parameters
    ----------
    ds : dict
        Data set with 'features', 'labels' and 'pre_norm' entries.
    k : str
        Study name, forwarded to batch_data to select the batching rule.
    rng : optional
        Object with a `uniform(low, high, size=...)` method, e.g.
        `np.random.RandomState(seed)`. Pass one to make the jitter
        reproducible. When None the global `np.random` state is used.

    Returns
    -------
    dict
        The same `ds` object with 'features' and 'labels' replaced in place
        by DataFrames whose rows follow the batch order.
        NOTE(review): the original feature column names are not carried over.
    '''
    import warnings

    # Per-batch X and y arrays plus the flattened row labels in batch order.
    batch_x, batch_y, x_batch_labels, y_batch_labels = batch_data(ds, k)
    sampler = np.random if rng is None else rng
    norm_x = []
    reform_y = []
    for batch_no, (X, y) in enumerate(zip(batch_x, batch_y)):
        n_samples = y.shape[0]
        # False == 0 in Python, so one comparison covers both label encodings.
        control_ind = [i for i in range(n_samples) if y[i] == 0]
        if not control_ind:
            # Without controls there is no reference distribution to score against.
            warnings.warn(
                '{}: batch {} has no control samples; its percentiles are '
                'not meaningful'.format(k, batch_no)
            )
        all_ind = list(range(n_samples))
        # Jitter is drawn before the transform to keep the original draw order.
        X_random = sampler.uniform(0.0, 10**-7, size=(X.shape[0], X.shape[1]))
        X = log_trans_ds_aware(ds, X)
        X = X + X_random
        # Score every sample of this batch against this batch's *controls* only.
        X = percentile_normalization(X, control_ind, all_ind)
        norm_x.append(X)
        reform_y.append(y)
    full_X = np.concatenate(norm_x, axis=0)
    full_y = np.concatenate(reform_y, axis=0)
    ds['features'] = pd.DataFrame(full_X, index=x_batch_labels)
    ds['labels'] = pd.DataFrame(full_y, index=y_batch_labels)
    return ds

The next cell performs the percentile normalization on all of the datasets and saves the results to `./bn_pickles/`.

In [7]:
# Studies in which every data set is normalised batch by batch.
true_batch = ['MTBLS72', 'MTBLS92', 'MTBLS105', 'MTBLS146', 'MTBLS404', 'ST000063', 'ST000062','ST000763']
# Studies in which only the combined ('onebatch' / 'all') data sets span several batches.
part_batch = ['MTBLS17', 'MTBLS19', 'Feng', 'ST000865', 'ST000385']
skip_ds = ['MTBLS148','MTBLS200', 'MTBLS20', 'ST000397', 'MTBLS264'] # no real labels here so cant do this...unless its all controls...

# Make sure the output folder exists before anything is written to it.
os.makedirs('./bn_pickles', exist_ok=True)

# NOTE: the normalisation functions overwrite ds['features'] on the dicts held
# in `datasets`, so re-running this cell without reloading `datasets` would
# normalise already-normalised data.
for k, v in datasets.items():
    new_combined_ds = []
    print(k)
    for ds in v:
        if k in skip_ds:
            # Stored unchanged: there are no usable labels to pick controls from.
            new_combined_ds.append(ds)
            continue
        #### Now try the normalization
        if k in true_batch:
            print('batch - all batch', k, ds['data_set'])
            ds = batch_ds_normalization(ds, k)
        elif k in part_batch and ('onebatch' in ds['data_set'] or 'all' in ds['data_set']):
            print('batch - 2 part', k, ds['data_set'])
            ds = batch_ds_normalization(ds, k)
        else:
            print('no batch, single batch', ds['data_set'], k)
            ds = no_batch_ds_normalization(ds)
        new_combined_ds.append(ds)
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open('./bn_pickles/{}.pkl'.format(k), 'wb') as fh:
        pickle.dump(new_combined_ds, fh)

Feng
batch - 2 part Feng plasmaall_author
batch - 2 part Feng urineall_author
no batch, single batch serum_IPO_aligned_Feng_serum_batch1 Feng


  This is separate from the ipykernel package so we can avoid doing imports until


no batch, single batch serum_IPO_aligned_Feng_serum_batch2 Feng
no batch, single batch urine_IPO_aligned_Feng_urine_batch1 Feng
no batch, single batch urine_IPO_aligned_Feng_urine_batch2 Feng
batch - 2 part Feng serum_onebatch_IPO_aligned_Feng_serum_all_MSMS
batch - 2 part Feng urine_onebatch_IPO_aligned_Feng_urine_all_MSMS
