In [17]:
import pandas as pd
import numpy as np

from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.feature_extraction import DataSource
from pals.PLAGE import PLAGE
from pals.ORA import ORA
from pals.GSEA import GSEA
from pals.common import *

from pals.preprocessing import *

from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests



In [18]:
mz_df = pd.DataFrame(columns = ['m/z', 'retention_time'])

mz_df['m/z'] = [np.random.randint(1, 150) for i in range(7375)]
mz_df['retention_time'] = [np.random.randint(1, 300) for i in range(7375)]
print(mz_df)


      m/z  retention_time
0      49              73
1      60             276
2      25             228
3      11              63
4      34             167
...   ...             ...
7370    8             213
7371    5             226
7372   49              52
7373  134             154
7374  131             144

[7375 rows x 2 columns]


In [19]:
intdf, annodf, groups = load_data(".\\int_df.csv", ".\\annotation_df.csv") #path to beer data 
groups

2021-09-01 10:44:30.151 | DEBUG    | pals.common:load_data:175 - Loaded 7375 x 12 peak intensities from .\int_df.csv
2021-09-01 10:44:30.153 | DEBUG    | pals.common:load_data:176 - Loaded groups: {'beer1': ['Beer_1_full1.mzXML', 'Beer_1_full2.mzXML', 'Beer_1_full3.mzXML'], 'beer2': ['Beer_2_full1.mzXML', 'Beer_2_full2.mzXML', 'Beer_2_full3.mzXML'], 'beer3': ['Beer_3_full1.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full3.mzXML'], 'beer4': ['Beer_4_full1.mzXML', 'Beer_4_full2.mzXML', 'Beer_4_full3.mzXML']}
2021-09-01 10:44:30.167 | DEBUG    | pals.common:load_data:179 - Loaded 14549 peak annotations from .\annotation_df.csv


{'beer1': ['Beer_1_full1.mzXML', 'Beer_1_full2.mzXML', 'Beer_1_full3.mzXML'],
 'beer2': ['Beer_2_full1.mzXML', 'Beer_2_full2.mzXML', 'Beer_2_full3.mzXML'],
 'beer3': ['Beer_3_full1.mzXML', 'Beer_3_full2.mzXML', 'Beer_3_full3.mzXML'],
 'beer4': ['Beer_4_full1.mzXML', 'Beer_4_full2.mzXML', 'Beer_4_full3.mzXML']}

In [20]:
comparisons = [
    ('beer1', 'beer2'), 
    # ('beer3', 'beer4')
]

In [21]:
experimental_design = {
    'groups': groups,
    'comparisons': []
}
for case, control in comparisons:
    experimental_design['comparisons'].append({
        'case': case,
        'control': control,
        'name': '%s/%s' % (case, control)
    })
    

In [22]:
ds = DataSource(intdf, annodf, experimental_design, DATABASE_PIMP_KEGG)

2021-09-01 10:44:32.730 | DEBUG    | pals.feature_extraction:__init__:46 - Using PiMP_KEGG as database
2021-09-01 10:44:32.731 | DEBUG    | pals.loader:load_data:42 - Loading C:\Users\Simon\anaconda3\lib\site-packages\pals\data\PiMP_KEGG.json.zip
2021-09-01 10:44:32.882 | DEBUG    | pals.feature_extraction:__init__:59 - Mapping pathway to unique ids
2021-09-01 10:44:32.891 | DEBUG    | pals.feature_extraction:__init__:73 - Creating dataset to pathway mapping
2021-09-01 10:44:34.018 | DEBUG    | pals.feature_extraction:__init__:101 - Computing unique id counts


In [56]:
#Method for calculating p values from intensity matrix

def calculate_p(intensity_df):
    #students t for generating p values per compound

    comp = experimental_design['comparisons'][0]

    control_groups = ds.get_comparison_samples(comp)[0]
    case_groups = ds.get_comparison_samples(comp)[1]
    
    df = pd.DataFrame(columns = ['case', 'control']) #create case control df

    for sample_group in control_groups:
        df['control'] = intdf[sample_group] #populate with control data

    for sample_group in case_groups:
        df['case'] = intdf[sample_group] #populate with case data

    pvals = []

    df['pvalue'] = 0 #initialise p value column to 0
    df['control'] = np.random.normal(size=len(df.index)) + 4
    df['case'] = np.random.normal(size=len(df.index)) + 4
    
    
    for i in range(len(df.index)):
        if str(df['case'].iloc[i]) == 'nan':
            df['case'].iloc[i] == 5000
        if str(df['control'].iloc[i]) == 'nan':
            df['control'].iloc[i] == 5000
    
    print(df)
    
    #df.to_csv("tst.csv")


    for i in range(len(df.index)): #iterate through rows in dataframe calculating p value for each comparison
        
        case_log = np.log2(df['case'].iloc[i])
        control_log = np.log2(df['control'].iloc[i])

        if str(case_log) == 'nan': #filter nan values
            case_log = 1
        if str(control_log) == 'nan':
            control_log = 1

        statistics, pvalue = stats.ttest_ind(df['case'].iloc[i], df['control'].iloc[i]) #do t test on the logged 2 values
        pvals.append(pvalue) # append to pval list 


    print(pvals) #debug list

In [57]:
calculate_p(intdf)

             case   control  pvalue
row_id                             
3033929  3.530812  3.321983       0
3033930  3.475855  3.565030       0
3033931  3.845858  3.614788       0
3033932  3.267070  5.833165       0
3033933  4.375913  4.559787       0
...           ...       ...     ...
3041299  3.676798  5.154371       0
3041300  3.803884  2.669309       0
3041301  5.543563  3.575018       0
3041302  3.372756  3.721603       0
3041303  3.724413  2.844497       0

[7375 rows x 3 columns]


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  ret = ret.dtype.type(ret / rcount)


[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,