In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
%qtconsole

In [2]:
import numpy as np
import pandas as pd
import os
import collections
import math 
from sklearn import preprocessing
from IPython.display import display, HTML
#from experiments.models import Analysis
#from compound.models import Pathway, CompoundPathway, Compound
#from data.models import Peak, Dataset, PeakDTSample, PeakQCSample
#from fileupload.models import Sample
from collections import defaultdict
#from experiments.pipelines.pipeline_rpy2 import Rpy2PipelineMetadata
#from experiments.pipelines.helpers import convert_to_dataframe

### Construct several pathways with 2-fold changes in the data (noise with std of 5 added)

In [3]:
# Sample file names: four control runs followed by four 2-fold-change runs.
sample_fnames = [
    u'Control_4.mzXML',
    u'Control_3.mzXML',
    u'Control_2.mzXML',
    u'Control_1.mzXML',
    u'2_fold_1.mzXML',
    u'2_fold_2.mzXML',
    u'2_fold_3.mzXML',
    u'2_fold_4.mzXML',
]

# Pathway name -> peak count; the keys are the pathway labels iterated by
# the helper functions in this notebook.
data_names = {
    "two": 2,
    "four": 4,
    "six": 6,
    "ten": 10,
    "twenty": 20,
    "forty": 40,
    "eighty": 80,
}

### Set up a DF calculating the t-test scores and the number of metabolites

In [4]:
from scipy.stats import ttest_ind

# Sample columns of the first group compared by the t-test (controls).
condition_1 = [
    u'Control_4.mzXML',
    u'Control_3.mzXML',
    u'Control_2.mzXML',
    u'Control_1.mzXML',
]

# Sample columns of the second group compared by the t-test (2-fold runs).
condition_2 = [
    u'2_fold_1.mzXML',
    u'2_fold_2.mzXML',
    u'2_fold_3.mzXML',
    u'2_fold_4.mzXML',
]


### A method to add a percentage of random peaks to a peak intensity df

In [5]:
def add_random_peaks(int_df, percent, std=5):
    """Append pure-noise peaks to every pathway and re-standardise each peak.

    For every pathway name in the module-level ``data_names``,
    ``ceil(n_peaks * percent / 100)`` extra peaks are generated, each with one
    value per sample drawn from a normal distribution with mean 0 and standard
    deviation ``std``. The new peaks are labelled with the pathway name,
    appended below the original peaks, and every row of the combined frame is
    then standardised with ``preprocessing.scale(..., axis=1)``.

    Parameters
    ----------
    int_df : pandas.DataFrame
        Peak intensities indexed by pathway name, one column per sample. The
        columns are addressed through the module-level ``sample_fnames``.
    percent : float
        Number of random peaks to add to each pathway, expressed as a
        percentage of the peaks that pathway already has.
    std : float, optional
        Standard deviation of the random peaks. Defaults to 5, the value that
        used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original and the random peaks, scaled row by
        row. ``int_df`` itself is left untouched.
    """
    # Collect one [pathway, value, value, ...] record per random peak.
    rand_peak_list = []
    for name in data_names:
        # A list indexer always yields a DataFrame, even when the pathway
        # has a single peak (a scalar label would return a Series there).
        df_path = int_df.loc[[name]]
        num_peaks = int(math.ceil(df_path.shape[0] * (percent / 100.0)))
        num_samples = df_path.shape[1]

        for _ in range(num_peaks):
            noise = np.random.normal(0, std, num_samples)
            rand_peak_list.append([name] + list(noise))

    if rand_peak_list:
        # Build a frame for the new peaks and stack it under the originals.
        ran_df = pd.DataFrame(rand_peak_list).set_index([0])
        ran_df.columns = sample_fnames
        new_df = pd.concat([int_df, ran_df])
    else:
        # Nothing to add (e.g. percent == 0). A frame built from an empty
        # list has no column 0, so set_index([0]) would raise KeyError.
        new_df = int_df.copy()

    # Standardise every peak (row) across the samples.
    new_df[sample_fnames] = preprocessing.scale(np.array(new_df), axis=1)

    return new_df
    
   

### Method to return p-value and t-test for an activity DF

In [6]:
def get_t_test(noisy_df, group_1=None, group_2=None):
    """Run an independent two-sample t-test on every row of ``noisy_df``.

    Parameters
    ----------
    noisy_df : pandas.DataFrame
        One row per pathway, one column per sample.
    group_1, group_2 : list of column labels, optional
        Columns forming the two groups that are compared. When omitted, the
        module-level ``condition_1`` and ``condition_2`` are used, which is
        the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed by the row labels of ``noisy_df`` with the columns
        ``'t-stat'`` and ``'p-value'``, sorted by ascending p-value. An empty
        input gives an empty frame with those two columns.
    """
    if group_1 is None:
        group_1 = condition_1
    if group_2 is None:
        group_2 = condition_2

    t_test_list = []
    for pathway, row in noisy_df.iterrows():
        # Read the values from the row itself: looking the label up again
        # with .loc would return a 2-D block when index labels repeat.
        c1 = np.asarray(row[group_1], dtype=float)
        c2 = np.asarray(row[group_2], dtype=float)
        t_stat, p_value = ttest_ind(c1, c2)
        t_test_list.append([pathway, t_stat, p_value])

    if not t_test_list:
        # No rows: set_index([0]) below would fail on a column-less frame.
        return pd.DataFrame(columns=['t-stat', 'p-value'])

    t_test_df = pd.DataFrame(t_test_list).set_index([0])
    t_test_df.columns = ['t-stat', 'p-value']

    return t_test_df.sort_values(by='p-value', ascending=True)


### Method to return the pathway activities given a peak DF composed of pathways

In [7]:
def get_pathway_act(int_df, pathways=None):
    """Summarise each pathway by the first right-singular vector of its peaks.

    For every pathway the peak-by-sample matrix is decomposed with
    ``np.linalg.svd`` and the first row of the third returned matrix is kept
    as that pathway's activity across the samples.

    Parameters
    ----------
    int_df : pandas.DataFrame
        Peak intensities indexed by pathway name, one column per sample.
    pathways : iterable of str, optional
        Pathway labels to process. Defaults to the keys of the module-level
        ``data_names``, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per pathway (index named ``"Pathways"``) with the same
        columns as ``int_df``.

    Notes
    -----
    NOTE(review): singular vectors are only defined up to sign, so the sign
    of an activity row presumably depends on the SVD routine -- confirm
    before interpreting the direction of downstream statistics.
    """
    if pathways is None:
        pathways = data_names

    pathway_activities = []
    for name in pathways:
        # A list indexer keeps the selection 2-D even for a pathway with a
        # single peak; a scalar label would give a 1-D Series that svd rejects.
        pathway_peaks = int_df.loc[[name]]
        _, _, vh = np.linalg.svd(np.array(pathway_peaks))
        pathway_activities.append([name] + list(vh[0]))

    if not pathway_activities:
        # No pathways requested: return an empty, correctly labelled frame
        # instead of failing on a column-less DataFrame.
        empty_df = pd.DataFrame(columns=int_df.columns)
        empty_df.index.name = "Pathways"
        return empty_df

    # Build the result once, after all pathways have been processed.
    activity_df = pd.DataFrame(pathway_activities).set_index([0])
    activity_df.columns = int_df.columns
    activity_df.index.name = "Pathways"

    return activity_df


In [8]:
def construct_fc_df(random=False):
    """Simulate a standardised peak-intensity frame for the seven pathways.

    Every peak is a shared base profile over the eight samples plus
    independent normal noise (mean 0, std 5). With ``random=False`` the base
    profile is a fixed pattern that is positive for the four control samples
    and negative for the four 2-fold samples; with ``random=True`` the base
    profile is itself drawn once from a normal distribution (mean 0, std 1).

    Returns a DataFrame indexed by pathway name (index named
    ``"ms1_peak_id"``), with one column per sample (columns named
    ``"sample_name"``) and each row scaled by ``preprocessing.scale``.
    """
    # Local copies of the sample names and the pathway -> peak-count map.
    columns = [u'Control_4.mzXML', u'Control_3.mzXML', u'Control_2.mzXML',
               u'Control_1.mzXML', u'2_fold_1.mzXML', u'2_fold_2.mzXML',
               u'2_fold_3.mzXML', u'2_fold_4.mzXML']
    peaks_per_pathway = {"two": 2, "four": 4, "six": 6, "ten": 10,
                         "twenty": 20, "forty": 40, "eighty": 80}

    # Base profile shared by every simulated peak.
    if random:
        base = np.random.normal(0, 1, 8)
    else:
        base = [12.0, 6.0, 12.0, 6.0, -17.0, -7.0, -7.0, -17.0]

    # One record per peak: the pathway label followed by the noisy intensities.
    records = []
    for pathway, n_peaks in peaks_per_pathway.items():
        for _ in range(n_peaks):
            noisy = base + np.random.normal(0, 5, len(base))
            records.append([pathway] + list(noisy))

    fc_df = pd.DataFrame(records).set_index([0])
    fc_df.columns = columns
    fc_df.index.name = "ms1_peak_id"
    fc_df.columns.name = "sample_name"

    # Standardise every peak (row) across the samples.
    fc_df[columns] = preprocessing.scale(np.array(fc_df), axis=1)

    return fc_df



### Construct the DF num_iterations times and calculate the mean t-test scores


In [9]:
def calc_av_p_scores(num_iterations, percent=None, random=False):
    """Average the per-pathway t-test results over repeated simulations.

    Each iteration builds a fresh peak frame with ``construct_fc_df``,
    optionally appends random peaks with ``add_random_peaks``, reduces the
    peaks to pathway activities with ``get_pathway_act`` and tests them with
    ``get_t_test``. The t-statistics and p-values are summed over the
    iterations and divided by ``num_iterations``.

    Parameters
    ----------
    num_iterations : int
        Number of simulated data sets to average over; must be at least 1.
    percent : float, optional
        If given, the percentage of random peaks added to every pathway in
        each iteration. ``None`` adds no random peaks.
    random : bool, optional
        Passed to ``construct_fc_df``; ``True`` simulates data with a random
        base profile instead of the fixed fold-change pattern.

    Returns
    -------
    pandas.DataFrame
        Mean ``'t-stat'`` and ``'p-value'`` per pathway, sorted by ascending
        mean p-value.

    Raises
    ------
    ValueError
        If ``num_iterations`` is smaller than 1 (there would be nothing to
        average).
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    t_results = None
    for _ in range(num_iterations):
        peak_df = construct_fc_df(random)

        # Optionally dilute every pathway with random peaks. The activities
        # are computed once, on whichever frame is actually analysed.
        if percent is not None:
            peak_df = add_random_peaks(peak_df, percent)
        pathway_acts = get_pathway_act(peak_df)

        # Sort by pathway so the running sum lines up row by row.
        iteration_result = get_t_test(pathway_acts).sort_index(axis=0)
        if t_results is None:
            t_results = iteration_result
        else:
            t_results = t_results + iteration_result

    final = t_results / num_iterations

    return final.sort_values(by='p-value', ascending=True)


### Sampling the p-values over pathways with different numbers of peaks (none random)

In [11]:
# Baseline: average over 100 simulated data sets with no random peaks added.
no_change_df = calc_av_p_scores(num_iterations=100)
display(no_change_df)

Unnamed: 0_level_0,t-stat,p-value
0,Unnamed: 1_level_1,Unnamed: 2_level_1
eighty,-6.14581,0.000794
forty,-6.242969,0.000857
twenty,-6.207159,0.000932
ten,-6.044508,0.001308
six,-5.961224,0.00166
four,-5.543742,0.002259
two,-5.529025,0.003589


### Adding a percentage of random peaks to the pathways (STD = 5)

In [17]:
# 100 iterations, adding 50% extra random peaks to every pathway.
random_50_df = calc_av_p_scores(num_iterations=100, percent=50)
display(random_50_df)

Unnamed: 0_level_0,t-stat,p-value
0,Unnamed: 1_level_1,Unnamed: 2_level_1
eighty,-6.198775,0.000863
forty,-6.267762,0.00087
twenty,-6.145097,0.001068
ten,-5.952522,0.001562
six,-5.769895,0.001902
four,-5.498751,0.002963
two,-5.458035,0.0052


In [18]:
# 100 iterations, adding 100% extra random peaks to every pathway.
random_100_df = calc_av_p_scores(num_iterations=100, percent=100)
display(random_100_df)

Unnamed: 0_level_0,t-stat,p-value
0,Unnamed: 1_level_1,Unnamed: 2_level_1
eighty,-6.193808,0.000878
forty,-6.119482,0.000908
twenty,-6.07549,0.001231
ten,-5.702524,0.002065
six,-5.86872,0.002268
four,-5.229409,0.004353
two,-4.853832,0.009302


In [19]:
# 100 iterations, adding 200% extra random peaks to every pathway.
random_200_df = calc_av_p_scores(num_iterations=100, percent=200)
display(random_200_df)

Unnamed: 0_level_0,t-stat,p-value
0,Unnamed: 1_level_1,Unnamed: 2_level_1
eighty,-6.15571,0.000964
forty,-6.040265,0.001127
twenty,-5.854052,0.001585
ten,-5.665839,0.002756
six,-5.342636,0.003955
four,-4.750764,0.011473
two,-2.282879,0.067568


In [25]:
# 100 iterations, adding 1000% extra random peaks to every pathway.
random_1000_df = calc_av_p_scores(num_iterations=100, percent=1000)
display(random_1000_df)

Unnamed: 0_level_0,t-stat,p-value
0,Unnamed: 1_level_1,Unnamed: 2_level_1
eighty,-6.114441,0.0013
forty,-5.634712,0.002133
twenty,-5.291475,0.006303
ten,-4.433016,0.013785
six,-3.256643,0.063632
four,-2.325079,0.113969
two,-1.450233,0.242306


### Adding a percentage of random peaks to the pathways (STD = 1)

In [20]:
# 100 iterations, adding 50% extra random peaks to every pathway.
# NOTE(review): calc_av_p_scores takes no std argument, so this call is
# identical to the one in the STD = 5 section.
random_50_df_1 = calc_av_p_scores(num_iterations=100, percent=50)
display(random_50_df_1)

Unnamed: 0_level_0,t-stat,p-value
0,Unnamed: 1_level_1,Unnamed: 2_level_1
forty,-6.263193,0.000844
eighty,-6.177022,0.000867
twenty,-6.131866,0.001072
ten,-5.768825,0.001491
six,-5.887526,0.001838
four,-5.495185,0.002971
two,-5.0435,0.006522


In [30]:
# 100 iterations, adding 100% extra random peaks to every pathway.
# NOTE(review): calc_av_p_scores takes no std argument, so this call is
# identical to the one in the STD = 5 section.
random_100_df_1 = calc_av_p_scores(num_iterations=100, percent=100)
display(random_100_df_1)

Unnamed: 0_level_0,t-stat,p-value
0,Unnamed: 1_level_1,Unnamed: 2_level_1
eighty,-6.160683,0.000917
forty,-6.197592,0.000957
twenty,-6.126346,0.001091
ten,-5.933496,0.001607
six,-5.626259,0.002597
four,-5.486142,0.004737
two,-4.300323,0.018063


In [37]:
# 100 iterations, adding 200% extra random peaks to every pathway.
# NOTE(review): calc_av_p_scores takes no std argument, so this call is
# identical to the one in the STD = 5 section.
random_200_df_1 = calc_av_p_scores(num_iterations=100, percent=200)
display(random_200_df_1)

Unnamed: 0_level_0,t-stat,p-value
0,Unnamed: 1_level_1,Unnamed: 2_level_1
eighty,-6.206872,0.000902
forty,-6.051215,0.001055
twenty,-6.066777,0.001279
ten,-5.813536,0.001905
six,-5.19997,0.00416
four,-4.816737,0.009429
two,-3.045534,0.040065


### Nothing is changing, it's all random

In [21]:
# Null case: 100 iterations on data built from a random base profile
# (no fixed fold change), with no extra random peaks.
all_random = calc_av_p_scores(num_iterations=100, random=True)
display(all_random)

Unnamed: 0_level_0,t-stat,p-value
0,Unnamed: 1_level_1,Unnamed: 2_level_1
ten,-0.001272,0.492169
eighty,-0.021162,0.495297
two,0.071836,0.52028
forty,-0.004622,0.531693
six,0.025469,0.532259
twenty,-0.123371,0.536179
four,0.118787,0.540429
