In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)
# import tabula
import composition_stats as cs
from pandasql import sqldf
import math
from scipy.stats import chi2
from scipy import stats
from matplotlib import pyplot as plt

In [2]:
def low_counts_rm(data_offset, thres = 0.01):
    """Drop columns whose share of the table's grand total does not exceed a cutoff.

    Parameters
    ----------
    data_offset : pandas.DataFrame
        Table whose columns are summed individually; presumably one row per
        sample and one column per taxon/OTU (TODO confirm against callers).
    thres : float, default 0.01
        Cutoff in percent: each column's share of the grand total is
        multiplied by 100 before being compared, and only columns strictly
        above the cutoff are kept.

    Returns
    -------
    tuple
        ``(filtered, keep_otu)`` - the retained columns cast to float, and the
        index holding their labels.
    """
    per_column_total = data_offset.sum(axis = 0)
    grand_total = per_column_total.sum()
    percent_of_total = per_column_total * 100 / grand_total
    keep_otu = percent_of_total.loc[percent_of_total > thres].index
    return data_offset[keep_otu].astype(float), keep_otu

def split_processing_single(data, healthy_labels = ('Healthy',), case_label = 'T2D'):
    """Keep the taxa that pass the low-count filter in the healthy or the case group.

    The rows of ``data`` are split by their ``Disease`` value, ``low_counts_rm``
    is applied to each group separately, and every taxon retained in at least
    one group is kept for ALL rows of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'Disease'`` and ``'sample_id'`` columns; every other
        column is treated as a taxon abundance.
    healthy_labels : iterable of str, default ('Healthy',)
        ``Disease`` values that form the healthy group for the filter.
        NOTE(review): elsewhere in this notebook 'NGT' samples are retained and
        later relabelled as healthy, yet with the default they do not
        contribute to the filter - confirm whether 'NGT' should be listed here.
    case_label : str, default 'T2D'
        ``Disease`` value that forms the case group for the filter.

    Returns
    -------
    pandas.DataFrame
        The retained taxa as float64, in the same left-to-right order as in
        ``data``, followed by the ``'Disease'`` and ``'sample_id'`` columns.
    """
    meta_cols = ['Disease', 'sample_id']
    healthy_rows = data[data['Disease'].isin(list(healthy_labels))]
    case_rows = data[data['Disease'] == case_label]

    # Only the retained labels (second item of the returned pair) are needed.
    _, healthy_keep = low_counts_rm(healthy_rows.drop(meta_cols, axis = 1))
    _, case_keep = low_counts_rm(case_rows.drop(meta_cols, axis = 1))
    kept = set(healthy_keep) | set(case_keep)

    # Iterating a set of strings has no stable order between interpreter runs,
    # so the union is re-read in the input's own column order; this keeps the
    # output layout reproducible. dict.fromkeys de-duplicates repeated labels.
    union_taxa = [col for col in dict.fromkeys(data.columns) if col in kept]

    data_v1 = data[union_taxa].astype('float64')
    data_v1['Disease'] = data['Disease']
    data_v1['sample_id'] = data['sample_id']
    return data_v1

def clr_trans_modify(data, last_ver_data, offset = 1):
    """Apply ``cs.clr(cs.closure(data + offset))`` and attach the disease labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Abundance columns only; it must not contain a ``'disease'`` column.
    last_ver_data : pandas.DataFrame
        Frame providing the ``'Disease'`` column. Labels are matched to the
        rows of ``data`` by index label, so both frames must share an index.
    offset : numeric, default 1
        Value added to every entry of ``data`` before the transform
        (presumably a pseudocount so zeros survive the log - confirm).

    Returns
    -------
    pandas.DataFrame or str
        The transformed values with ``data``'s index and column labels plus a
        ``'disease'`` column. If ``data`` already has a ``'disease'`` column
        the message string ``'drop disease column at first'`` is returned
        instead of raising (kept as-is for existing callers).
    """
    if 'disease' in list(data.columns):
        return 'drop disease column at first'

    transformed = np.asarray(cs.clr(cs.closure(data + offset)))
    # NOTE(review): the library may hand back a 1-D array for a single-row
    # input; reshaping to data.shape keeps one output row per input row.
    transformed = transformed.reshape(data.shape)

    # Carry data's own index: the assignment below aligns on index labels, so
    # a fresh 0..n-1 index would pick up the wrong (or missing) labels whenever
    # data is not indexed 0..n-1.
    data_v1 = pd.DataFrame(transformed, index = data.index, columns = list(data.columns))
    data_v1['disease'] = last_ver_data['Disease']
    return data_v1

----------------------------

In [4]:
# Inputs, read from the working directory: per-sample metadata, the abundance
# table, and the table that maps taxon ids to taxon names.
sample_md = pd.read_excel(io = "sample_metadata.xlsx")
abund_data = pd.read_csv(filepath_or_buffer = "vect_atlas.csv")
corr_taxa = pd.read_csv(filepath_or_buffer = "corresponding_taxa.csv")

In [7]:
# One metadata subset per T2D study, selected by BioProject accession and
# re-indexed from zero.
study1_metadata = sample_md.loc[sample_md['BioProject'].eq('PRJNA422434')].reset_index(drop = True)
study2_metadata = sample_md.loc[sample_md['BioProject'].eq('PRJEB1786')].reset_index(drop = True)
study3_metadata = sample_md.loc[sample_md['BioProject'].eq('PRJNA361402')].reset_index(drop = True)

In [None]:
# Choose which study's metadata the following cells process. Exactly one of
# these assignments should be active; the other two are the alternatives.
metadata = study1_metadata
# metadata = study2_metadata
# metadata = study3_metadata

In [None]:
# Keep only the samples labelled T2D, Healthy or NGT and collect their ids.
metadata = metadata.loc[metadata['Disease'].isin(['T2D', 'Healthy', 'NGT'])].reset_index(drop = True)
sp_id = list(metadata['sample.ID'])

# Abundance columns for those samples, with the taxon id column ('Unnamed: 0')
# swapped for the matching taxon name via a left join on corr_taxa.
rawdata = (
    abund_data[sp_id + ['Unnamed: 0']]
    .merge(corr_taxa, how = 'left', left_on = ['Unnamed: 0'], right_on = ['id'])
    .drop(['Unnamed: 0', 'id'], axis = 1)
)
taxa_lst = list(rawdata['name'])

# Transpose to one row per sample and one column per taxon, expose the sample
# id as a regular column, then append each sample's Disease label.
rawdata_v1 = rawdata.drop(['name'], axis = 1).T
rawdata_v1.columns = taxa_lst
rawdata_v1 = (
    rawdata_v1
    .reset_index()
    .rename(columns = {'index':'sample_id'})
    .merge(metadata[['sample.ID', 'Disease']], how = 'left', left_on = ['sample_id'], right_on = ['sample.ID'])
    .drop(['sample.ID'], axis = 1)
)

In [10]:
# Remove taxa whose name marks them as unclassified. Candidates are read from
# the frame's current columns rather than the earlier taxa_lst, so re-running
# this cell is a no-op instead of a KeyError. Non-string labels (e.g. a NaN
# name left by an unmatched left merge) are skipped rather than breaking the
# substring test.
drop_cols = [taxa for taxa in rawdata_v1.columns if isinstance(taxa, str) and 'unclassified' in taxa]
rawdata_v1 = rawdata_v1.drop(columns = drop_cols)

In [11]:
# Discard taxa that take a single value across every sample. The slice skips
# the first and last columns, which at this point hold sample_id and Disease.
drop_cols = [taxa for taxa in rawdata_v1.columns[1:-1] if rawdata_v1[taxa].nunique(dropna = False) == 1]
rawdata_v1 = rawdata_v1.drop(columns = drop_cols)

In [12]:
# Keep the taxa that pass the low-count filter in either disease group.
rawdata_v2 = split_processing_single(data = rawdata_v1)

In [13]:
# CLR-transform the filtered taxa. The offset is the smallest strictly positive
# abundance found among the taxa columns of rawdata_v1 (everything between the
# leading sample_id column and the trailing Disease column).
rawdata_sty1 = clr_trans_modify(
    rawdata_v2.drop(columns = ['Disease', 'sample_id']),
    rawdata_v1,
    offset = rawdata_v1.iloc[:, 1:-1].where(lambda counts: counts > 0).min().min(),
)

In [14]:
# Collapse both control labels ('Healthy' and 'NGT') into a single 'healthy' class.
rawdata_sty1.loc[rawdata_sty1['disease'].isin(['Healthy', 'NGT']), 'disease'] = 'healthy'

In [16]:
# Write the transformed table without the row index. The file name is fixed, so
# running the notebook for a different study selection overwrites this file.
rawdata_sty1.to_csv(path_or_buf = "processed_data.csv", index = False)