In [None]:
import math
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats
from scipy.stats import chi2

import composition_stats as cs

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

In [None]:
# Data prep
# Directory that holds the raw CSV files of every study.
# Edit this single line to point the notebook at another copy of the data.
DATA_DIR = Path(r"C:\Users\edwar\Desktop\Melbourne\research_project\curated_metagenomics\raw_data")


def load_study(study_id, data_dir = DATA_DIR):
    """
    Read the two CSV files that belong to one study.

    study_id: str, file-name prefix of the study, e.g. "QinJ_2012".
    data_dir: str or pathlib.Path, directory containing the files, default is DATA_DIR.

    Returns a tuple (metadata, rawdata) of pandas.DataFrame read from
    "<study_id>.relative_abundance_metadata.csv" and
    "<study_id>.relative_abundance_rawdata.csv" respectively.
    """
    data_dir = Path(data_dir)
    metadata = pd.read_csv(data_dir / f"{study_id}.relative_abundance_metadata.csv")
    rawdata = pd.read_csv(data_dir / f"{study_id}.relative_abundance_rawdata.csv")
    return metadata, rawdata


hmp_metadata, hmp_rawdata = load_study("HMP_2019_t2d")
karl_metadata, karl_rawdata = load_study("KarlssonFH_2013")
lij_metadata, lij_rawdata = load_study("LiJ_2014")
yuj_metadata, yuj_rawdata = load_study("YuJ_2015")
feng_metadata, feng_rawdata = load_study("FengQ_2015")
qin_metadata, qin_rawdata = load_study("QinJ_2012")
sank_metadata, sank_rawdata = load_study("SankaranarayananK_2015")

In [None]:
# required basic functions for data cleaning and consistency

# Low counts removal
def low_counts_rm(data_offset, thres = 0.01):
    """
    Drop the columns whose share of the grand total is not above a threshold.

    data_offset: pandas.dataframe, one column per feature; values are summed column-wise.
    thres: float, cutoff expressed in percent of the grand total, default value is 0.01.

    Returns a tuple (filtered, keep_otu): the input restricted to the retained
    columns and cast to float, and the pandas.Index of the retained column labels.
    """
    per_feature_total = data_offset.sum(axis = 0)
    # Share of every column in the sum over the whole table, in percent.
    percent_of_total = per_feature_total * 100 / per_feature_total.sum()
    keep_otu = per_feature_total[percent_of_total > thres].index
    filtered = data_offset[keep_otu].astype(float)
    return filtered, keep_otu

# Combine the samples with their health condition
def label_data(rawdata, metadata):
    """
    Turn a taxa-by-sample table into a sample-by-taxa table and attach the health status.

    rawdata: pandas.dataframe, metagenomics counts or abundance matrix with a 'FeatureID'
             column (taxon names) and an 'Unnamed: 0' column; every other column is
             treated as one sample, named by its sample ID.
    metadata: pandas.dataframe, with a column 'Unnamed: 0' holding the sample IDs and a
              column 'disease' holding the health status.

    Returns a pandas.dataframe with one column per taxon followed by 'disease'.
    The join is a right join on the metadata, so every metadata row is kept, in
    metadata order; samples without abundance data get missing values.
    """
    feature_names = list(rawdata['FeatureID'])
    # After transposing, the two bookkeeping columns become rows; remove both.
    samples_by_taxa = rawdata.T.drop(['FeatureID', 'Unnamed: 0'])
    samples_by_taxa.columns = feature_names
    # Move the sample IDs out of the index into a regular column named 'index'.
    samples_by_taxa = samples_by_taxa.reset_index()
    health_status = metadata[['Unnamed: 0', 'disease']]
    labelled = samples_by_taxa.merge(
        health_status,
        how = 'right',
        left_on = ['index'],
        right_on = ['Unnamed: 0'],
    )
    # Both join keys are only needed for matching, not in the result.
    return labelled.drop(['index', 'Unnamed: 0'], axis = 1)

# Derive the samples belonging to the targeted disease and control groups
def take_sample(data):
    """
    Keep only the T2D and healthy samples and normalise their labels.

    data: pandas.dataframe, the output from label data including biological matrix and health status

    Any 'disease' value containing the substring 'T2D' is relabelled 'T2D'; of the
    remaining values, any containing 'healthy' is relabelled 'healthy'. Rows with
    any other label, or with a missing label, are dropped.

    Returns a new pandas.dataframe with a fresh 0..n-1 index. The input frame is
    left untouched.
    """
    # Work on a copy so the caller's frame is not relabelled as a side effect.
    labelled = data.copy()

    # na=False: a missing label matches neither pattern; without it the mask would
    # contain NaN and could not be used for boolean indexing.
    is_t2d = labelled['disease'].str.contains('T2D', na = False)
    labelled.loc[is_t2d, 'disease'] = 'T2D'

    # Evaluated after the T2D relabelling, so a label mentioning both stays 'T2D'.
    is_healthy = labelled['disease'].str.contains('healthy', na = False)
    labelled.loc[is_healthy, 'disease'] = 'healthy'

    selected = labelled[labelled['disease'].isin(['T2D', 'healthy'])]
    return selected.reset_index(drop = True)


In [None]:
# Process the metagenomics data: add the offset and remove low-count taxa

def split_processing_single(data, study, thres = 0.01):
    """
    Filter low-abundance taxa separately in the healthy and T2D groups and keep
    every taxon that survives in at least one of them.

    data: pandas.dataframe, the counts matrix with well-cleaned data. Must contain a
          'disease' column; every column other than 'disease' and 'study' is treated
          as a taxon.
    study: str, indicate the different study/batch/platform of data generation.
    thres: float, indicate the threshold for low counts removal, mainly for metagenomics seq data.

    Returns a pandas.dataframe holding the retained taxa as float64, in the same
    column order as in `data`, followed by the columns 'study' and 'disease'.
    """
    # Pick the taxa by name. A fixed positional slice would also cut off a real
    # taxon whenever the frame carries 'disease' but not yet 'study'.
    taxa_cols = [col for col in data.columns if col not in ('disease', 'study')]

    retained = set()
    for status in ('healthy', 'T2D'):
        # Offset of 1 is applied to the group before the low-count filter.
        group_offset = data.loc[data['disease'] == status, taxa_cols] + 1
        retained.update(low_counts_rm(group_offset, thres)[1])

    # Walk the original columns so the output order is the same on every run
    # (iterating a set of strings directly is not).
    union_taxa = [col for col in taxa_cols if col in retained]

    # astype returns a new frame, so the assignments below never touch `data`.
    data_v1 = data[union_taxa].astype('float64')
    data_v1['study'] = study
    data_v1['disease'] = data['disease']
    return data_v1

# CLR transformation
def clr_trans(data, last_ver_data, study_name):
    """
    Apply cs.closure followed by cs.clr to the taxa columns and re-attach the labels.

    data: pandas.dataframe, the output from split_processing_single().
    last_ver_data: pandas.dataframe, the output from label_data() followed by take_sample().
    study_name: str, same as the study parameter in split_processing_single().

    Returns a pandas.dataframe with the transformed taxa columns followed by
    'disease' (taken from last_ver_data) and 'study' (set to study_name).
    """
    # Only the taxa columns may enter the transformation.
    for annotation in ('disease', 'study'):
        if annotation in data.columns:
            data = data.drop(columns = [annotation])
    taxa_names = list(data.columns)

    # An offset of 1 is added to every value before the closure / CLR step.
    transformed = pd.DataFrame(cs.clr(cs.closure(data + 1)))
    transformed.columns = taxa_names

    # Column assignment aligns on index labels; this assumes last_ver_data carries
    # the same 0..n-1 index as the new frame - TODO confirm for other callers.
    transformed['disease'] = last_ver_data['disease']
    transformed['study'] = study_name
    return transformed


In [None]:
# processing the data
# Low-count threshold handed to split_processing_single() for every study.
LOW_COUNT_THRES = 0.0001

# Step 1: attach the health status to each sample and keep T2D / healthy only.
(hmp_data_raw1, karl_data_raw1, lij_data_raw1, yuj_data_raw1,
 feng_data_raw1, sank_data_raw1, qin_data_raw1) = [
    take_sample(label_data(study_rawdata, study_metadata))
    for study_rawdata, study_metadata in (
        (hmp_rawdata, hmp_metadata),
        (karl_rawdata, karl_metadata),
        (lij_rawdata, lij_metadata),
        (yuj_rawdata, yuj_metadata),
        (feng_rawdata, feng_metadata),
        (sank_rawdata, sank_metadata),
        (qin_rawdata, qin_metadata),
    )
]

# Step 2: remove low-count taxa within each study.
(hmp_data, karl_data, lij_data, yuj_data,
 feng_data, sank_data, qin_data) = [
    split_processing_single(data = labelled, study = study_label, thres = LOW_COUNT_THRES)
    for labelled, study_label in (
        (hmp_data_raw1, 'hmp2019'),
        (karl_data_raw1, 'karl2013'),
        (lij_data_raw1, 'lij2014'),
        (yuj_data_raw1, 'yuj2015'),
        (feng_data_raw1, 'feng2015'),
        (sank_data_raw1, 'sank2015'),
        (qin_data_raw1, 'qin2012'),
    )
]

# Step 3: CLR-transform each filtered study.
(hmp_data_fv, karl_data_fv, lij_data_fv, yuj_data_fv,
 feng_data_fv, sank_data_fv, qin_data_fv) = [
    clr_trans(filtered, labelled, study_label)
    for filtered, labelled, study_label in (
        (hmp_data, hmp_data_raw1, 'hmp2019'),
        (karl_data, karl_data_raw1, 'karl2013'),
        (lij_data, lij_data_raw1, 'lij2014'),
        (yuj_data, yuj_data_raw1, 'yuj2015'),
        (feng_data, feng_data_raw1, 'feng2015'),
        (sank_data, sank_data_raw1, 'sank2015'),
        (qin_data, qin_data_raw1, 'qin2012'),
    )
]

# Stack the studies into one table; taxa missing from a study become NaN there.
# NOTE(review): sank2015 and qin2012 are processed above but are not part of the
# integrated table - presumably held out on purpose; confirm.
inte_data = pd.concat(
    [hmp_data_fv, karl_data_fv, lij_data_fv, yuj_data_fv, feng_data_fv],
    ignore_index = True,
)

In [None]:
# remove study-specific taxa (Also put in the R scripts)
# Build a presence table `df`: one row per taxon, one column per study holding
# 'Exist' or 'x', plus 'Exist_count' = number of studies flagged 'Exist'.
taxa_lst = list(inte_data.drop(['study', 'disease'], axis = 1).columns)
studies = list(inte_data['study'].unique())

# Slice the integrated table once per study rather than once per (taxon, study) pair.
samples_by_study = {study: inte_data[inte_data['study'] == study] for study in studies}

# A taxon is flagged 'x' in a study when its column holds exactly one distinct
# value there (NaN counts as a value, so an all-NaN column is flagged as well);
# otherwise it is flagged 'Exist'.
presence_rows = [
    {
        'species': species,
        **{
            study: 'x' if len(samples_by_study[study][species].unique()) == 1 else 'Exist'
            for study in studies
        },
    }
    for species in taxa_lst
]

# Create the frame in a single call; appending with pd.concat inside the loop
# copies the whole table on every iteration.
df = pd.DataFrame(presence_rows, columns = ['species'] + studies)
df['Exist_count'] = (df[studies] == 'Exist').sum(axis = 1)