# Import libraries

In [1]:
import pandas as pd
from config import RAW_DIR, CLEAN_DIR
import numpy as np
import os

# Extract features and labels from the raw training meta-analysis file

In [2]:
# Load the raw training meta-analysis table. CSV columns 0 and 3 become the
# two-level row index (displayed as Study_ID / Sample Accession in the preview).
# low_memory=False makes pandas infer each column's dtype from the whole column
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning this
# read otherwise emits.
raw_training = pd.read_csv(
    os.path.join(RAW_DIR, "raw_training.csv"),
    index_col=[0, 3],
    low_memory=False,
)

# Preview the first 5 samples and the first 20 columns.
raw_training.iloc[:5, :20]

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,Unnamed: 1_level_0,Author (year),BoiProject ID,RunID,Sequencing Platform,Library layout (SE/PE),"Sample origin? (e.g., stool, mucosa)",Disease status,Sample_ID,Subject Id (If available),Age (Years),Gender,Geographical Region or Population,BMI (kg/m²),PHENOTYPE_Disease,PHENOTYPE:Healthy_Nonhealthy,UNKNOWN,k__Archaea,k__Archaea|p__Euryarchaeota,k__Archaea|p__Euryarchaeota|c__Methanobacteria,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales
Study_ID,Sample Accession,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GMHI-23,SAMEA3879547,Vogtmann (2016),PRJEB12449,ERR1293689,Illumina HiSeq 2000,PAIRED,stool,Control,MMRS84159866ST,MMRS84159866ST,67.0,Female,USA,18.63574,Healthy,Healthy,56.451568,0.861124,0.861124,0.861124,0.861124
GMHI-23,SAMEA3879551,Vogtmann (2016),PRJEB12449,ERR1293705,Illumina HiSeq 2000,PAIRED,stool,Control,MMRS25211151ST,MMRS25211151ST,60.0,Female,USA,20.466036,Healthy,Healthy,39.905873,0.0,0.0,0.0,0.0
GMHI-23,SAMEA3879543,Vogtmann (2016),PRJEB12449,ERR1293673,Illumina HiSeq 2000,PAIRED,stool,Control,MMRS32573774ST,MMRS32573774ST,82.0,Male,USA,20.524438,Healthy,Healthy,53.274915,0.151996,0.151996,0.151996,0.151996
GMHI-23,SAMEA3879565,Vogtmann (2016),PRJEB12449,ERR1293761,Illumina HiSeq 2000,PAIRED,stool,Control,MMRS20257302ST,MMRS20257302ST,62.0,Female,USA,20.665713,Healthy,Healthy,59.044005,0.362985,0.362985,0.362985,0.362985
GMHI-23,SAMEA3879546,Vogtmann (2016),PRJEB12449,ERR1293685,Illumina HiSeq 2000,PAIRED,stool,Control,MMRS86168210ST,MMRS86168210ST,79.0,Male,USA,21.284602,Healthy,Healthy,75.424715,0.001181,0.001181,0.001181,0.001181


In [3]:
# Report the starting size: rows are samples, index level 0 is the study id.
n_raw_samples = raw_training.shape[0]
n_raw_studies = np.unique(raw_training.index.get_level_values(0)).shape[0]
print("Start with", n_raw_samples, "samples and", n_raw_studies, "studies")

Start with 12532 samples and 70 studies


In [4]:
# Position 15 is where the abundance table starts in the preview above
# (UNKNOWN, then the taxa); everything before it is per-sample metadata.
first_abundance_col = 15
taxonomic_features = raw_training.iloc[:, first_abundance_col:]

# Boolean health label: True where the healthy/non-healthy column reads "Healthy".
is_healthy = raw_training[["PHENOTYPE:Healthy_Nonhealthy"]].eq("Healthy")

# Full disease phenotype, kept as a one-column frame.
phenotype = raw_training.loc[:, ["PHENOTYPE_Disease"]]

# Apply sample exclusion criteria

In [5]:
# Sequencing platform labels treated as non-Illumina; samples sequenced on
# these are excluded later to reduce batch effects.
non_illumina = ["454 GS FLX Titanium", "Ion Torrent PGM", "Ion Torrent Proton", "BGISEQ-500"]

In [6]:
# Build `remove`: a boolean NumPy array with one entry per row of raw_training,
# True for every sample that must be excluded from the training set.

study_ids = phenotype.index.get_level_values(0)
disease = phenotype.iloc[:, 0]
disease_values = disease.values

# we don't need a biomarker for obesity/underweight
weight_phenotypes = ["Underweight", "Overweight", "Obesity", "Obese"]

# Studies that are dropped in their entirety.
excluded_studies = [
    # studies whose samples come from subjects w/o a diagnosed disease but with an abnormal condition
    "P4",       # P4 treats the poop for extracting viral DNA
    "P86",      # Healthy at baseline but half develop T2D
    "GMHI-19",  # Outlier study
    "P59",      # Are all technically healthy, but half are in heavily urbanized areas
                # and "Microbes with higher relative abundance in Chinese urban samples
                # have been associated with disease in other studies"
    "P63",      # Deals with semisupercentenarians, i.e., 105 to 109 years old

    # updated: disease phenotype is not related to microbiome
    "P95",      # Pancreatic Cancer
    "GMHI-21",  # HTN
    "P15",      # behcet
    "P81",      # Schizophrenia
    "P32",      # breast cancer
    "P132",     # ESRD
    "P136",     # NSCLC
]

remove = (
    disease.isin(weight_phenotypes).values

    # to reduce batch effects, remove samples sequenced with non-illumina machines
    | raw_training["Sequencing Platform"].isin(non_illumina).values

    | study_ids.isin(excluded_studies)

    # P48: healthy subjects use alcohol or smoke, so only its healthy samples are dropped
    | ((study_ids == "P48") & (disease == "Healthy").values)

    # GMHI-5: keep only T2D; the other phenotype is not related to microbiome (IGT)
    | ((study_ids == "GMHI-5") & (disease_values != "T2D"))

    # GMHI-11: keep only carcinoma; the other phenotype is not related to microbiome (adenoma)
    | ((study_ids == "GMHI-11") & (disease_values != "carcinoma"))

    # any phenotype mentioning "adenoma" (case-insensitive) in any study;
    # na=False treats a missing phenotype as "no match" instead of raising
    | disease.str.lower().str.contains("adenoma", regex=False, na=False).values
)

In [7]:
# Keep only the rows that were not flagged for removal; the same positional
# mask is applied to all three frames so they stay row-aligned.
keep_rows = ~remove
taxonomic_features = taxonomic_features.iloc[keep_rows]
is_healthy = is_healthy.iloc[keep_rows]
phenotype = phenotype.iloc[keep_rows]

In [8]:
# Distinct phenotype labels that survive the sample exclusion step.
phenotype["PHENOTYPE_Disease"].unique()

array(['Healthy', 'ACVD', 'Advanced Dementia', 'ankylosing spondylitis',
       'Cancer', 'carcinoma', 'CD', 'CRC', "Crohn''s disease",
       'Crohns disease', 'Graves’ disease', 'Liver Cirrhosis', 'NAFLD',
       'Rheumatoid arthritis', 'Rheumatoid Arthritis', 'T2D',
       'Ulcerative colitis', 'ulcerative colitis'], dtype=object)

## Remove samples w/ low read counts

In [9]:
# Load per-sample read counts (indexed by CSV column 2) and line them up with
# the samples that are still in the training set.
# NOTE(review): the crop to the first 9045 rows / 5 columns is unexplained in
# this notebook — confirm what lies beyond it in the CSV.
read_counts_path = os.path.join(RAW_DIR, "training_read_counts.csv")
remaining_accessions = taxonomic_features.index.get_level_values(1)
training_read_counts = (
    pd.read_csv(read_counts_path, index_col=[2])
    .iloc[:9045, :5]
    .loc[remaining_accessions]
)

# Every read-count row must sit at the same position as its sample in `phenotype`.
index_matches = training_read_counts.index == phenotype.index.get_level_values(1)
assert index_matches.mean() == 1

In [10]:
# Remove low quality samples: anything sequenced with fewer than
# MIN_READ_COUNT reads. training_read_counts is row-aligned with the three
# frames (asserted when it was loaded), so one positional mask serves them all.
MIN_READ_COUNT = 1000000

low_quality_index = (training_read_counts["Read_count"] < MIN_READ_COUNT).values
taxonomic_features = taxonomic_features.iloc[~low_quality_index, :]
is_healthy = is_healthy.iloc[~low_quality_index, :]
phenotype = phenotype.iloc[~low_quality_index, :]

In [11]:
# Summarise what is left: rows are samples, columns are taxonomic features,
# index level 0 is the study id.
n_samples, n_features = taxonomic_features.shape
n_studies = len(np.unique(taxonomic_features.index.get_level_values(0)))
print(f"Our dataset has {n_samples} samples with {n_features} unique taxnomic features")
print(f"Our dataset has {n_studies} independent studies")

Our dataset has 7828 samples with 3201 unique taxnomic features
Our dataset has 55 independent studies


# Apply study exclusion criteria

In [12]:
# Studies contributing fewer than this many (non-missing) phenotype labels are dropped.
MIN_SAMPLES_PER_STUDY = 20

# Count labelled samples per study (index level 0) and keep the ids of the
# studies that reach the minimum.
samples_per_study = phenotype.groupby(level=0).count()["PHENOTYPE_Disease"]
high_sample_studies = samples_per_study[samples_per_study >= MIN_SAMPLES_PER_STUDY].index

# Keep only these studies
taxonomic_features = taxonomic_features.loc[high_sample_studies]
is_healthy = is_healthy.loc[high_sample_studies]
phenotype = phenotype.loc[high_sample_studies]

# Load up second csv file

In [13]:
# Load the second training table; CSV columns 2 and 0 form the row index
# (displayed as Study / Sample_ID), then drop every sample from Zhou_2022.
training2_path = os.path.join(RAW_DIR, "raw_training2.csv")
raw_training2 = pd.read_csv(training2_path, index_col=[2, 0])
from_zhou_2022 = raw_training2.index.get_level_values(0) == "Zhou_2022"
raw_training2 = raw_training2.loc[~from_zhou_2022]
raw_training2

Unnamed: 0_level_0,Unnamed: 1_level_0,Phenotype,UNKNOWN,k__Archaea,k__Archaea|p__Euryarchaeota,k__Archaea|p__Euryarchaeota|c__Methanobacteria,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera,...,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Parvoviridae|g__Erythroparvovirus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Parvoviridae|g__Erythroparvovirus|s__Human_erythrovirus_V9,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Parvoviridae|g__Erythroparvovirus|s__Primate_erythroparvovirus_1,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Polyomaviridae|g__Betapolyomavirus|s__Macaca_mulatta_polyomavirus_1,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Tombusviridae,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Tombusviridae|g__Alphacarmovirus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Tombusviridae|g__Alphacarmovirus|s__Carnation_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Enterobacteria_phage_K1A,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Phage_Gifsy_1,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f
__Viruses_unclassified|g__Viruses_unclassified|s__Streptococcus_phage_SW11
Study,Sample_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Ventura_2019,SAMEA4888521,MS,78.07851,0.053285,0.053285,0.053285,0.053285,0.053285,0.053285,0.053285,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ventura_2019,SAMEA4888522,MS,66.55423,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ventura_2019,SAMEA4888523,MS,69.16581,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ventura_2019,SAMEA4888524,MS,63.79696,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ventura_2019,SAMEA4888525,MS,64.84315,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yang_2020,SRR6456373,Healthy,42.34475,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Yang_2020,SRR6456374,Healthy,40.76344,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Yang_2020,SRR6456375,Healthy,36.81596,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Yang_2020,SRR6456376,Healthy,47.26255,0.127371,0.127371,0.127371,0.127371,0.127371,0.127371,0.127371,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Remove samples with low read counts (currently disabled: the filter in the next cell is commented out)

In [14]:
# NOTE(review): every statement in this cell is commented out, so the cell is a
# no-op and no samples are removed from raw_training2 here. Re-enable it or
# delete it once the intended behaviour is confirmed.
# Get samples with low read counts
# low_read_count_ids = pd.read_csv(os.path.join(RAW_DIR, "iMSMS_read_count.csv")).iloc[:, 0].values

# raw_training2 = raw_training2.iloc[~raw_training2.index.get_level_values(1).isin(low_read_count_ids), :]
# raw_training2

In [15]:
# Full phenotype labels, renamed to the column name used by the first training set.
phenotype2 = raw_training2[["Phenotype"]].rename(columns={"Phenotype": "PHENOTYPE_Disease"})

# Boolean health label (True where the phenotype is "Healthy"), again under the
# first training set's column name.
is_healthy2 = phenotype2.eq("Healthy").rename(
    columns={"PHENOTYPE_Disease": "PHENOTYPE:Healthy_Nonhealthy"}
)

# Everything after the first (Phenotype) column is the abundance table.
taxonomic_features2 = raw_training2.iloc[:, 1:]

In [16]:
# Align columns with the first training set in one step: columns missing from
# the second file are created and filled with 0, columns the first training set
# does not have are dropped, and the column order follows taxonomic_features.
# reindex returns a new frame, so nothing is assigned onto the slice taken from
# raw_training2 (the source of the pandas warning the old approach produced).
taxonomic_features2 = taxonomic_features2.reindex(
    columns=taxonomic_features.columns, fill_value=0
)
taxonomic_features2

  self[col] = value


Unnamed: 0_level_0,Unnamed: 1_level_0,UNKNOWN,k__Archaea,k__Archaea|p__Euryarchaeota,k__Archaea|p__Euryarchaeota|c__Methanobacteria,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera|s__Methanosphaera_stadtmanae,...,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Hordeivirus|s__Barley_stripe_mosaic_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Cactus_mild_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Cucumber_green_mottle_mosaic_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Paprika_mild_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Pepper_mild_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Tobacco_mild_green_mosaic_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Deep_sea_thermophilic_phage_D6E,k__Viruses|p__Viruses_unclassified|c__Viruse
s_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Loktanella_phage_pCB2051_A,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Tetraselmis_viridis_virus_S1
Study,Sample_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Ventura_2019,SAMEA4888521,78.07851,0.053285,0.053285,0.053285,0.053285,0.053285,0.053285,0.053285,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
Ventura_2019,SAMEA4888522,66.55423,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
Ventura_2019,SAMEA4888523,69.16581,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
Ventura_2019,SAMEA4888524,63.79696,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
Ventura_2019,SAMEA4888525,64.84315,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yang_2020,SRR6456373,42.34475,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
Yang_2020,SRR6456374,40.76344,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
Yang_2020,SRR6456375,36.81596,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
Yang_2020,SRR6456376,47.26255,0.127371,0.127371,0.127371,0.127371,0.127371,0.127371,0.127371,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


# Combine two csv files

In [17]:
# Stack the second training set underneath the first, frame by frame, so the
# three frames remain row-aligned with each other.
taxonomic_features, is_healthy, phenotype = (
    pd.concat([first_part, second_part])
    for first_part, second_part in (
        (taxonomic_features, taxonomic_features2),
        (is_healthy, is_healthy2),
        (phenotype, phenotype2),
    )
)

# Remove samples w/ high unknown and low taxa count

In [18]:
# Quality thresholds for a sample to stay in the training set.
MAX_UNKNOWN = 90         # largest allowed value in the UNKNOWN column
MIN_TAXA_COUNT = 100     # fewest taxa that must be detected
MIN_ABUNDANCE = 0.00001  # a taxon counts as detected only above this abundance

# The three frames have been filtered and concatenated in lockstep; verify that
# before applying one positional mask to all of them.
assert taxonomic_features.index.equals(is_healthy.index), "is_healthy is not row-aligned"
assert taxonomic_features.index.equals(phenotype.index), "phenotype is not row-aligned"

# Count detected taxa per sample, leaving the UNKNOWN column out by name.
detected_taxa = (
    taxonomic_features.drop(labels=["UNKNOWN"], axis=1) > MIN_ABUNDANCE
).sum(axis=1)
passes_qc = (
    (taxonomic_features["UNKNOWN"] <= MAX_UNKNOWN) & (detected_taxa >= MIN_TAXA_COUNT)
).values

# A positional mask (rather than a label lookup on the filtered index) keeps
# exactly one row per retained sample even if an index key occurs more than once.
taxonomic_features = taxonomic_features[passes_qc]
is_healthy = is_healthy[passes_qc]
phenotype = phenotype[passes_qc]

# Normalize relative abundances and delete unknown column 

In [19]:
# Rescale each sample (row) by the share that is not UNKNOWN, i.e. divide every
# value in the row by 100 minus that row's UNKNOWN entry.
classified_share = 100 - taxonomic_features["UNKNOWN"]
taxonomic_features = taxonomic_features.div(classified_share, axis=0)

In [20]:
# UNKNOWN has served its purpose (filtering and normalisation); remove it.
taxonomic_features = taxonomic_features.drop(columns=["UNKNOWN"])

# Report the final size of the training set.
n_samples, n_features = taxonomic_features.shape
n_studies = len(np.unique(taxonomic_features.index.get_level_values(0)))
print(f"Our dataset has {n_samples} samples with {n_features} unique taxnomic features")
print(f"Our dataset has {n_studies} independent studies")


Our dataset has 8069 samples with 3200 unique taxnomic features
Our dataset has 54 independent studies


In [21]:
# Sanity check: there are 7 taxonomic ranks and each one should sum to 1, so
# every sample's row total should be close to 7 (up to floating point error).
taxonomic_features.sum(axis="columns")

Study_ID   Sample Accession
GMHI-10    SAMN03283239        6.998433
           SAMN03283266        6.999368
           SAMN03283281        6.774279
           SAMN03283294        6.997221
           SAMN03283288        6.985597
                                 ...   
Yang_2020  SRR6456373          6.993197
           SRR6456374          6.922800
           SRR6456375          6.862885
           SRR6456376          6.864952
           SRR6456377          6.980490
Length: 8069, dtype: float64

# Standardize names for diseased phenotypes

In [22]:
# Sorted list of the distinct raw phenotype labels before standardisation.
np.unique(phenotype.values)

array(['ACVD', 'CD', 'CRC', 'Cancer', "Crohn''s disease",
       'Crohns disease', 'Graves’ disease', 'Healthy', 'Liver Cirrhosis',
       'MS', 'NAFLD', 'Rheumatoid Arthritis', 'Rheumatoid arthritis',
       'T2D', 'Ulcerative colitis', 'ankylosing spondylitis', 'carcinoma',
       'ulcerative colitis'], dtype=object)

In [23]:
# Standard phenotype name -> every raw spelling that should collapse into it.
standard_names = {
    "Colorectal adenoma": [
        "Adenoma (MP)",
        "Large adenoma",
        "Small adenoma",
        "adenoma",
        "advanced adenoma",
    ],
    "Colorectal cancer": ["carcinoma", "Cancer", "CRC"],
    "Crohn's Disease": ["Crohn''s disease", "CD", "Crohns disease"],
    "Rheumatoid arthritis": ["Rheumatoid Arthritis"],
    "Impaired glucose tolerance": ["IGT"],
    "Type 2 diabetes": ["T2D"],
    "Schizophrenia": ["schizophrenia"],
    "Ulcerative colitis": ["ulcerative colitis"],
    "Atherosclerotic cardiovascular disease": ["ACVD"],
    "End-stage renal disease": ["End-stage renal disease (ESRD)"],
    "Hypertension": ["Hypertension (HTN)"],
    "Ankylosing spondylitis": ["ankylosing spondylitis"],
}

# Invert into the raw -> standard lookup that DataFrame.replace expects.
dic = {
    raw_name: standard
    for standard, raw_names in standard_names.items()
    for raw_name in raw_names
}

# Labels without an entry in the lookup are left unchanged.
phenotype = phenotype.replace({"PHENOTYPE_Disease": dic})

In [24]:
# For each standardised phenotype, count the distinct studies and the distinct
# samples it appears in. reset_index turns the two index levels into ordinary
# columns so they can be counted per phenotype group.
phenotype_flat = phenotype.reset_index()
per_phenotype = phenotype_flat.groupby("PHENOTYPE_Disease")
phenotype_info = per_phenotype.agg(lambda column: column.nunique())
phenotype_info.columns = ["# of studies", "# of samples"]
phenotype_info

Unnamed: 0_level_0,# of studies,# of samples
PHENOTYPE_Disease,Unnamed: 1_level_1,Unnamed: 2_level_1
Ankylosing spondylitis,1,95
Atherosclerotic cardiovascular disease,1,214
Colorectal cancer,9,789
Crohn's Disease,7,284
Graves’ disease,1,100
Healthy,46,5547
Liver Cirrhosis,1,152
MS,1,24
NAFLD,1,86
Rheumatoid arthritis,2,151


# Yay we are done, export stuff

In [25]:
# Write the cleaned training set: the abundance table and, side by side, the
# boolean health label plus the standardised phenotype.
features_out = os.path.join(CLEAN_DIR, "taxonomic_features.csv")
metadata_out = os.path.join(CLEAN_DIR, "metadata.csv")

taxonomic_features.to_csv(features_out)

metadata = pd.concat([is_healthy, phenotype], axis=1)
metadata.to_csv(metadata_out)

# Clean validation data

## Clean first longitudinal data file

In [26]:
# Load the first longitudinal validation file; CSV columns 0 and 3 form the row index.
raw_val1_path = os.path.join(RAW_DIR, "raw_validation.csv")
raw_val1 = pd.read_csv(raw_val1_path, index_col=[0, 3])

In [27]:
# Split the first validation file: the first 14 columns are kept as metadata,
# the remaining columns (which include UNKNOWN) are the abundance table.
taxonomic_features_val1 = raw_val1.iloc[:, 14:]
meta_val1 = raw_val1.iloc[:, :14]

# Normalize each sample by (100 - UNKNOWN), then drop the UNKNOWN column.
# NOTE(review): unlike the training set, no MAX_UNKNOWN filter runs first, so a
# sample with UNKNOWN == 100 would divide by zero — confirm none exist.
taxonomic_features_val1 = taxonomic_features_val1.divide(
    (100 - taxonomic_features_val1["UNKNOWN"]), axis="rows"
)
taxonomic_features_val1 = taxonomic_features_val1.drop(labels=["UNKNOWN"], axis=1)

# Align columns with the training set in one step: missing taxa are added and
# filled with 0, taxa the training set lacks are dropped, and the training
# column order is used. A single reindex replaces assigning many new columns
# one batch at a time, which is what produced the pandas warning here.
taxonomic_features_val1 = taxonomic_features_val1.reindex(
    columns=taxonomic_features.columns, fill_value=0
)

# sanity check: there are 7 taxonomic ranks, each one should sum to 1, so sum of each sample should be 7 (w/ floating point error)
taxonomic_features_val1.sum(axis=1)

  self[col] = value


Study_ID  Sample Accession
P65       SAMEA104062441      6.999077
          SAMEA104062442      6.999857
          SAMEA104062443      7.000002
          SAMEA104062444      6.991888
          SAMEA104062445      6.996988
                                ...   
P116      SAMN12478559        6.999367
          SAMN12478560        6.999259
          SAMN12478561        6.995951
          SAMN12478591        6.999739
          SAMN12478594        6.996203
Length: 691, dtype: float64

## Clean second longitudinal data file

In [28]:
# Load the second longitudinal validation file; CSV columns 0 and 4 form the
# row index. low_memory=False makes pandas infer each column's dtype from the
# whole column, avoiding the mixed-type DtypeWarning this read otherwise emits.
raw_val2 = pd.read_csv(
    os.path.join(RAW_DIR, "raw_validation2.csv"),
    index_col=[0, 4],
    low_memory=False,
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [29]:
# Split the second validation file at column 13: everything before it is
# metadata, everything from it onward is the abundance table.
n_meta_cols_val2 = 13
meta_val2 = raw_val2.iloc[:, :n_meta_cols_val2]
taxonomic_features_val2 = raw_val2.iloc[:, n_meta_cols_val2:]

In [30]:
# Normalize each sample by (100 - UNKNOWN), then drop the UNKNOWN column.
# NOTE(review): unlike the training set, no MAX_UNKNOWN filter runs first, so a
# sample with UNKNOWN == 100 would divide by zero — confirm none exist.
taxonomic_features_val2 = taxonomic_features_val2.divide(
    (100 - taxonomic_features_val2["UNKNOWN"]), axis="rows"
)
taxonomic_features_val2 = taxonomic_features_val2.drop(labels=["UNKNOWN"], axis=1)

# Align columns with the training set in one step: missing taxa are added and
# filled with 0, taxa the training set lacks are dropped, and the training
# column order is used.
taxonomic_features_val2 = taxonomic_features_val2.reindex(
    columns=taxonomic_features.columns, fill_value=0
)

# sanity check: there are 7 taxonomic ranks, each one should sum to 1, so sum of each sample should be 7 (w/ floating point error)
taxonomic_features_val2.sum(axis=1)

Study_ID  BioSample   
P107      SAMEA6512889    6.999663
          SAMEA6513017    6.996898
          SAMEA6513054    6.998531
          SAMEA6513021    6.994938
          SAMEA6512856    6.992086
                            ...   
P90       SAMN16701308    6.999814
          SAMN16701460    6.997841
          SAMN16701459    6.846649
          SAMN16701458    6.977201
          SAMN16701457    6.966260
Length: 672, dtype: float64

In [31]:
# Merge taxonomic features.
# The two validation frames name their second index level differently
# ("Sample Accession" vs "BioSample"). Give the second frame the first frame's
# level names before stacking so the merged index keeps consistent, named
# levels; the validation metadata gets the same renaming before it is merged.
taxonomic_features_val = pd.concat([
    taxonomic_features_val1,
    taxonomic_features_val2.rename_axis(index=list(taxonomic_features_val1.index.names)),
])

In [32]:
# From each validation metadata table keep three columns: the subject
# identifier, the health status (file 1) or host diet (file 2), and the time point.
val1_meta_columns = ["Subject_ID", "Health status", "Time point"]
val2_meta_columns = ["patient_ID", "Host_diet", "timepoint"]
meta_val1_cropped = meta_val1.loc[:, val1_meta_columns]
meta_val2_cropped = meta_val2.loc[:, val2_meta_columns]

In [33]:
# Give both cropped metadata tables the same column names so they can be stacked.
shared_meta_columns = ["Subject_ID", "health_status/host_diet", "timepoint"]
meta_val1_cropped.columns = list(shared_meta_columns)
meta_val2_cropped.columns = list(shared_meta_columns)

# The second table's index levels are renamed to the names used by the first.
shared_index_names = ["Study_ID", "Sample Accession"]
meta_val2_cropped.index = meta_val2_cropped.index.rename(shared_index_names)

In [34]:
# Stack the two validation metadata tables, show which studies they cover
# (index level 0), then display the merged table.
metadata_val = pd.concat([meta_val1_cropped, meta_val2_cropped])
validation_study_ids = metadata_val.index.get_level_values(0)
display(np.unique(validation_study_ids))
metadata_val

array(['P106', 'P107', 'P116', 'P37', 'P43', 'P65', 'P89', 'P90'],
      dtype=object)

Unnamed: 0_level_0,Unnamed: 1_level_0,Subject_ID,health_status/host_diet,timepoint
Study_ID,Sample Accession,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P65,SAMEA104062441,ERAS10,Healthy,D0
P65,SAMEA104062442,ERAS10,Healthy,D180
P65,SAMEA104062443,ERAS10,Healthy,D42
P65,SAMEA104062444,ERAS10,Healthy,D8
P65,SAMEA104062445,ERAS11,Healthy,D0
...,...,...,...,...
P90,SAMN16701308,9003,EEN,3
P90,SAMN16701460,9017,Vegan,1
P90,SAMN16701459,9016,EEN,15
P90,SAMN16701458,9016,EEN,14


# Yay we are done, export stuff

In [35]:
# Write the cleaned validation abundance table and its metadata to CLEAN_DIR.
val_features_out = os.path.join(CLEAN_DIR, "taxonomic_features_val.csv")
val_metadata_out = os.path.join(CLEAN_DIR, "metadata_val.csv")
taxonomic_features_val.to_csv(val_features_out)
metadata_val.to_csv(val_metadata_out)