# Metabolites Fold Changes

Author: Olatomiwa Bifarin<br>
Department of Biochemistry and Molecular Biology<br>
University of Georgia<br>
Edison Lab<br>

Last edited: 24FEB2020

## Import Libraries

In [25]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import style
import scipy
from scipy import stats
import statsmodels as sms
from statsmodels.stats import multitest
import seaborn as sns; sns.set(style='white')

# Suppress warnings for the rest of the notebook.
# NOTE(review): with no category given this filter ignores every warning,
# which would include numerical RuntimeWarnings raised by later cells
# (e.g. from log2 of a zero or negative ratio) - consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# More sharp and legible graphics
%config InlineBackend.figure_format = 'retina'

## Import Data

In [26]:
# Read the RCC dataset and discard every sample whose 'Grouped Stage'
# entry is missing (staging information inconclusive).
RCCdataframe = (
    pd.read_excel('data/RCCdataframe.xlsx')
    .dropna(subset=['Grouped Stage'])
)

In [27]:
# Columns of the RCC dataframe that are not metabolite features; these are
# dropped whenever a feature-only table is needed.
non_metabolites = [
    'Groups',
    'Collection',
    'Sample ID',
    'Metastatic',
    'SubTypes',
    'Nuclear Grade',
    'Grade',
    'Tstage',
    'Tstage Group',
    'Nstage',
    'Mstage',
    'Stage',
    'Grouped Stage',
    'Tumor Width',
    'Gender',
    'Race',
    'BMI',
    'Smoker',
    'Age',
    'Patient ID',
]

In [28]:
# Split the cohort by 'Grouped Stage' (Early vs Late RCC) and strip the
# non-metabolite columns, leaving one feature-only dataframe per group.
Early = (
    RCCdataframe
    .loc[RCCdataframe['Grouped Stage'] == 'Early']
    .drop(columns=non_metabolites)
)
Late = (
    RCCdataframe
    .loc[RCCdataframe['Grouped Stage'] == 'Late']
    .drop(columns=non_metabolites)
)

In [29]:
# Feature-only view of the whole cohort: every non-metabolite column removed.
df_features = RCCdataframe.drop(columns=non_metabolites)
df_features.shape

(70, 7146)

In [30]:
# Column labels of the MS and NMR features, selected by position in
# df_features: positions 1..7097 are taken as MS features and positions
# 7098 onward as NMR features.
# NOTE(review): position 0 is assigned to neither group - confirm intended.
MS_features = df_features.columns[1:7098]
NMR_features = df_features.columns[7098:]

In [31]:
# One dataframe per platform: the 'Grouped Stage' label column followed by
# that platform's feature columns.
NMRdataframe = pd.concat(
    [RCCdataframe[['Grouped Stage']], RCCdataframe[NMR_features]],
    axis=1,
)

MSdataframe = pd.concat(
    [RCCdataframe[['Grouped Stage']], RCCdataframe[MS_features]],
    axis=1,
)

## NMR Metabolomics

T-test function

In [32]:
def Ttest(metabolites, dfControl, dfTreat, alpha=0.05, var=True):
    '''
    Run an independent two-sample t-test for every metabolite between two
    groups and apply a Benjamini-Hochberg FDR correction across all of the
    resulting p-values.

    Inputs:
    metabolites = A list-like containing the names (column labels) of the
    metabolites to test
    dfControl = A pandas dataframe containing the control group metabolites data
    dfTreat =  A pandas dataframe containing the treatment group metabolites data
    alpha = alpha for statistical significant judgment, default 0.05
    var = If True (default), perform a standard independent 2 sample test that
    assumes equal population variances. If False, perform Welch's t-test, which
    does not assume equal population variance

    Outputs: A pandas dataframe with one row per metabolite and the columns
    'Metabolite', 'T-test p-value', 'FDR p-value' and 'Reject H0', sorted by
    ascending 'FDR p-value'. An empty dataframe with those columns is returned
    when `metabolites` is empty.

    '''
    result_columns = ['Metabolite', 'T-test p-value', 'FDR p-value', 'Reject H0']

    # Nothing to test: return an empty table instead of handing an empty
    # p-value list to the multiple-testing correction.
    if len(metabolites) == 0:
        return pd.DataFrame(columns=result_columns)

    # Imported locally so the function does not rely on the
    # `statsmodels.stats.multitest` submodule having been loaded elsewhere.
    from statsmodels.stats.multitest import multipletests

    # Raw (uncorrected) p-value of each metabolite, in the order given.
    ttest_list = [
        scipy.stats.ttest_ind(dfControl[metabolite],
                              dfTreat[metabolite],
                              equal_var=var).pvalue
        for metabolite in metabolites
    ]

    # Benjamini-Hochberg correction over the whole family of tests.
    reject, pval_corrected, _, _ = multipletests(ttest_list,
                                                 alpha=alpha,
                                                 method='fdr_bh')

    ttest_results = pd.DataFrame({'Metabolite': metabolites,
                                  'T-test p-value': ttest_list,
                                  'FDR p-value': pval_corrected,
                                  'Reject H0': reject})
    return ttest_results.sort_values(by=['FDR p-value'])

In [33]:
# Stage-specific NMR tables: keep the rows of one stage group, then drop the
# 'Grouped Stage' label column so only the NMR feature columns remain.
EarlyNMR = (
    NMRdataframe
    .loc[NMRdataframe['Grouped Stage'] == 'Early']
    .drop(columns='Grouped Stage')
)
LateNMR = (
    NMRdataframe
    .loc[NMRdataframe['Grouped Stage'] == 'Late']
    .drop(columns='Grouped Stage')
)

In [34]:
# Early-vs-Late t-test (equal variances assumed) with FDR correction over
# every NMR feature.
NMRttest_result = Ttest(
    metabolites=NMR_features,
    dfControl=EarlyNMR,
    dfTreat=LateNMR,
    alpha=0.05,
    var=True,
)

In [35]:
# Mean value of every NMR feature within each stage group, one row per
# feature.
dfmean_NMR = pd.DataFrame({
    'Features': NMRdataframe.drop(columns='Grouped Stage').mean(axis=0).index,
    'Early RCC (NMR)': EarlyNMR.mean(axis=0).to_numpy(),
    'Advanced RCC (NMR)': LateNMR.mean(axis=0).to_numpy(),
})
dfmean_NMR.shape

(48, 3)

Compute log2 fold changes (mean of Late RCC / mean of Early RCC) for each NMR feature

In [36]:
# log2 fold change (Late mean / Early mean) of every NMR feature, rounded to
# 2 decimals and kept in the row order of dfmean_NMR. Computed on the whole
# columns at once instead of re-filtering the dataframe for each feature.
FC_listnmr = np.round(
    np.log2(
        dfmean_NMR['Advanced RCC (NMR)'].to_numpy()
        / dfmean_NMR['Early RCC (NMR)'].to_numpy()
    ),
    2,
).tolist()

In [37]:
# Pair every NMR feature name with its log2 fold change.
data = dict(Metabolite=dfmean_NMR['Features'], FC=FC_listnmr)

# Tabulate the pairs as a two-column dataframe.
df = pd.DataFrame(data)

In [38]:
# Preview the first rows of the NMR fold-change table.
df.head()

Unnamed: 0,Metabolite,FC
0,unk1,0.13
1,unk2,-0.1
2,bile_acid1,-0.11
3,bile_acid2,0.04
4,HIVA,-0.17


In [39]:
# Preview the NMR t-test table (Ttest returns it sorted by 'FDR p-value').
NMRttest_result.head()

Unnamed: 0,Metabolite,T-test p-value,FDR p-value,Reject H0
13,citrate,0.002546,0.12221,False
8,acetone,0.028642,0.305652,False
22,glycine,0.031839,0.305652,False
12,pyruvate,0.028291,0.305652,False
18,choline,0.026118,0.305652,False


In [40]:
# The 'Reject H0' flag is not needed in the exported table.
NMRttest_result2 = NMRttest_result.drop(columns=['Reject H0'])
# Attach the t-test p-values to the fold changes (table for the paper figure).
NMR_excel_metabolites = df.merge(NMRttest_result2, on='Metabolite')

# Rounded copy (3 decimal places); it is not assigned back and the trailing
# ';' hides its display, so NMR_excel_metabolites itself stays unrounded.
NMR_excel_metabolites.round(3);

In [41]:
# Saving the NMR table to Excel is opt-in: set SAVE_NMR_EXCEL to True to write
# the file. The message printed now reflects what actually happened, instead
# of reporting success while the export is disabled.
SAVE_NMR_EXCEL = False
if SAVE_NMR_EXCEL:
    NMR_excel_metabolites.round(3).to_excel("NMR_foldchanges_tTest.xlsx")
    print('DataFrame is written to Excel File successfully.')
else:
    print('Excel export skipped (SAVE_NMR_EXCEL is False).')

DataFrame is written to Excel File successfully.


## MS Metabolomics

In [43]:
# Load the MS q-marker table from Excel.
# NOTE(review): this file is read from the working directory, whereas the main
# dataset is read from 'data/' - confirm the path is correct.
MSqmarker_dataframe = pd.read_excel('RCCStage_171qmarkers.xlsx')

In [44]:
# Preview the MS dataframe ('Grouped Stage' plus the MS feature columns).
MSdataframe.head()

Unnamed: 0,Grouped Stage,1,2,3,4,5,6,7,8,9,...,7088,7089,7090,7091,7092,7093,7094,7095,7096,7097
0,Early,724885000.0,1103877000.0,5068185.0,4682.599,18261.939755,1322.950578,61662.950456,191620.2,56194280.0,...,168446.786091,81645.226415,501574.2,43882.940713,235673.809999,141900.04406,144189.303819,1659259.0,105635.980011,76801.696555
7,Early,4693305.0,2097896000.0,2069727000.0,54991920.0,25432.663566,45520.86635,185797.531423,544290.9,91045940.0,...,26709.418499,136386.803103,931113.6,37112.894842,461212.184491,133276.969753,135275.801403,997630.5,45068.765392,209961.651238
8,Late,287615000.0,1280228000.0,9550407.0,36701.56,44239.226722,4718.247908,116683.725311,291059.7,386803900.0,...,647662.128723,352575.79048,1052685.0,15180.614023,948296.49393,260564.349646,17692.29375,639787.3,305002.623966,83340.915554
9,Early,4515404.0,903882600.0,896797000.0,2977243.0,13076.392417,3803.132036,83057.599725,282400.7,40203430.0,...,48406.87426,25702.543194,529308.0,1051.213851,151444.889717,44642.901959,17143.802793,138830.1,88515.357062,34774.981551
10,Early,2094258.0,2272247000.0,66226400.0,19750130.0,15870.650938,1640.590789,366063.610388,1194779.0,141804800.0,...,238353.245646,426965.773883,395838.1,33280.705223,432006.770308,197276.34129,96510.547193,2020430.0,365566.749529,620224.679053


In [45]:
# Preview the q-marker table exactly as loaded from Excel.
MSqmarker_dataframe.head()

Unnamed: 0.1,Unnamed: 0,ID,Mode,RT [min],Name,Formula
0,49,50,positive,3.784,Betaine,C5 H11 N O2
1,226,227,positive,3.393,O-Desmethyltramadol,C15 H23 N O2
2,247,248,positive,5.127,248,
3,367,368,positive,1.483,Oxybenzone,C14 H12 O3
4,627,628,positive,1.66,capuride,C9 H18 N2 O2


In [46]:
# Stage-specific MS tables: keep the rows of one stage group, then drop the
# 'Grouped Stage' label column so only the MS feature columns remain.
EarlyMS = (
    MSdataframe
    .loc[MSdataframe['Grouped Stage'] == 'Early']
    .drop(columns='Grouped Stage')
)
LateMS = (
    MSdataframe
    .loc[MSdataframe['Grouped Stage'] == 'Late']
    .drop(columns='Grouped Stage')
)

In [47]:
# Early-vs-Late t-test (equal variances assumed) with FDR correction over
# every MS feature.
MSttest_result = Ttest(
    metabolites=MS_features,
    dfControl=EarlyMS,
    dfTreat=LateMS,
    alpha=0.05,
    var=True,
)

In [48]:
# Mean value of every MS feature within each stage group, one row per feature.
# NOTE(review): the column labels still say "(NMR)" although the values are MS
# means; they are left unchanged because the fold-change cell reads these
# exact labels.
dfmean_MS = pd.DataFrame({
    'Features': MSdataframe.drop(columns='Grouped Stage').mean(axis=0).index,
    'Early RCC (NMR)': EarlyMS.mean(axis=0).to_numpy(),
    'Advanced RCC (NMR)': LateMS.mean(axis=0).to_numpy(),
})
dfmean_MS.shape

(7097, 3)

Compute log2 fold changes (mean of Late RCC / mean of Early RCC) for each MS feature

In [49]:
# log2 fold change (Late mean / Early mean) of every MS feature, rounded to
# 2 decimals and kept in the row order of dfmean_MS. Computed on the whole
# columns at once instead of re-filtering the dataframe for each feature.
# The "(NMR)" suffix in the labels is simply how dfmean_MS names its columns.
FC_listMS = np.round(
    np.log2(
        dfmean_MS['Advanced RCC (NMR)'].to_numpy()
        / dfmean_MS['Early RCC (NMR)'].to_numpy()
    ),
    2,
).tolist()

In [53]:
# Pair every MS feature name with its log2 fold change.
data = dict(Metabolite=dfmean_MS['Features'], FC=FC_listMS)

# Tabulate the pairs as a two-column dataframe.
df = pd.DataFrame(data)

In [54]:
# Preview the first rows of the MS fold-change table.
df.head()

Unnamed: 0,Metabolite,FC
0,1,0.45
1,2,-0.04
2,3,-0.05
3,4,0.32
4,5,-8.85


In [55]:
# Display the MS t-test table (Ttest returns it sorted by 'FDR p-value').
MSttest_result

Unnamed: 0,Metabolite,T-test p-value,FDR p-value,Reject H0
7000,7001,0.000160,0.189336,False
6395,6396,0.000155,0.189336,False
6336,6337,0.000126,0.189336,False
5407,5408,0.000114,0.189336,False
5064,5065,0.000094,0.189336,False
...,...,...,...,...
1521,1522,0.998874,0.999297,False
4043,4044,0.998829,0.999297,False
1590,1591,0.999103,0.999385,False
4765,4766,0.999608,0.999749,False


In [56]:
# The 'Reject H0' flag is not needed in the exported table.
MSttest_result2 = MSttest_result.drop(columns=['Reject H0'])
# Attach the t-test p-values to the fold changes (table for the paper figure).
MS_excel_metabolites = df.merge(MSttest_result2, on='Metabolite')

In [57]:
# Preview the merged MS table (fold change plus raw and FDR p-values).
MS_excel_metabolites.head()

Unnamed: 0,Metabolite,FC,T-test p-value,FDR p-value
0,1,0.45,0.181627,0.756414
1,2,-0.04,0.717363,0.893808
2,3,-0.05,0.874073,0.958537
3,4,0.32,0.641434,0.86102
4,5,-8.85,0.231517,0.756414


In [59]:
# Report the (rows, columns) size of the merged MS table.
MS_excel_metabolites.shape

(7097, 4)

In [65]:
# Remove the 'Unnamed: 0' column and rename 'ID' to 'Metabolite' so the
# q-marker table shares its key column name with the MS results table.
MSqmarker = (
    MSqmarker_dataframe
    .drop(columns=['Unnamed: 0'])
    .rename(columns={"ID": "Metabolite"})
)
MSqmarker.head()

Unnamed: 0,Metabolite,Mode,RT [min],Name,Formula
0,50,positive,3.784,Betaine,C5 H11 N O2
1,227,positive,3.393,O-Desmethyltramadol,C15 H23 N O2
2,248,positive,5.127,248,
3,368,positive,1.483,Oxybenzone,C14 H12 O3
4,628,positive,1.66,capuride,C9 H18 N2 O2


In [66]:
# Keep only the MS features that appear in the q-marker table and attach the
# q-marker columns to their fold changes and p-values.
MS_excel = MS_excel_metabolites.merge(MSqmarker, on='Metabolite')

In [67]:
# Preview the MS results merged with the q-marker table.
MS_excel.head()

Unnamed: 0,Metabolite,FC,T-test p-value,FDR p-value,Mode,RT [min],Name,Formula
0,50,-1.04,0.019741,0.608919,positive,3.784,Betaine,C5 H11 N O2
1,227,-4.94,0.080254,0.698281,positive,3.393,O-Desmethyltramadol,C15 H23 N O2
2,248,2.31,0.000253,0.243436,positive,5.127,248,
3,368,-2.85,0.077946,0.697608,positive,1.483,Oxybenzone,C14 H12 O3
4,628,-1.74,0.039229,0.662882,positive,1.66,capuride,C9 H18 N2 O2


In [69]:
# Report the (rows, columns) size of the merged q-marker results table.
MS_excel.shape

(171, 8)

In [None]:
# round all numbers to 3 decimal places
#MS_excel_metabolites.round(3);

In [70]:
# Saving the MS table to Excel is opt-in: set SAVE_MS_EXCEL to True to write
# the file. The message printed now reflects what actually happened, instead
# of reporting success while the export is disabled.
SAVE_MS_EXCEL = False
if SAVE_MS_EXCEL:
    MS_excel.round(3).to_excel("MS_foldchanges_tTest.xlsx")
    print('DataFrame is written to Excel File successfully.')
else:
    print('Excel export skipped (SAVE_MS_EXCEL is False).')

DataFrame is written to Excel File successfully.
