# Import libraries

In [1]:
import pandas as pd
from config import RAW_DIR, CLEAN_DIR
import numpy as np
import os

# Extract features and labels from the raw meta-analysis file

In [2]:
# Load the raw meta-analysis table. CSV columns 0 and 3 (displayed as
# "Study_ID" and "Sample Accession" in the preview) form a two-level row index.
#
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so no column ends up with mixed types.
# NOTE(review): the stray warning text recorded under this cell looks like the
# tail of a pandas DtypeWarning, which is what this option addresses - confirm
# on the next full run.
raw = pd.read_csv(
    os.path.join(RAW_DIR, "raw.csv"),
    index_col=[0, 3],
    low_memory=False,
)

# Preview: first 5 samples, first 20 columns
raw.iloc[:5, :20]

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0_level_0,Unnamed: 1_level_0,Author (year),BoiProject ID,RunID,Sequencing Platform,Library layout (SE/PE),"Sample origin? (e.g., stool, mucosa)",Disease status,Sample_ID,Subject Id (If available),Age (Years),Gender,Geographical Region or Population,BMI (kg/m²),PHENOTYPE_Disease,PHENOTYPE:Healthy_Nonhealthy,UNKNOWN,k__Archaea,k__Archaea|p__Euryarchaeota,k__Archaea|p__Euryarchaeota|c__Methanobacteria,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales
Study_ID,Sample Accession,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GMHI-23,SAMEA3879547,Vogtmann (2016),PRJEB12449,ERR1293689,Illumina HiSeq 2000,PAIRED,stool,Control,MMRS84159866ST,MMRS84159866ST,67.0,Female,USA,18.63574,Healthy,Healthy,56.451568,0.861124,0.861124,0.861124,0.861124
GMHI-23,SAMEA3879551,Vogtmann (2016),PRJEB12449,ERR1293705,Illumina HiSeq 2000,PAIRED,stool,Control,MMRS25211151ST,MMRS25211151ST,60.0,Female,USA,20.466036,Healthy,Healthy,39.905873,0.0,0.0,0.0,0.0
GMHI-23,SAMEA3879543,Vogtmann (2016),PRJEB12449,ERR1293673,Illumina HiSeq 2000,PAIRED,stool,Control,MMRS32573774ST,MMRS32573774ST,82.0,Male,USA,20.524438,Healthy,Healthy,53.274915,0.151996,0.151996,0.151996,0.151996
GMHI-23,SAMEA3879565,Vogtmann (2016),PRJEB12449,ERR1293761,Illumina HiSeq 2000,PAIRED,stool,Control,MMRS20257302ST,MMRS20257302ST,62.0,Female,USA,20.665713,Healthy,Healthy,59.044005,0.362985,0.362985,0.362985,0.362985
GMHI-23,SAMEA3879546,Vogtmann (2016),PRJEB12449,ERR1293685,Illumina HiSeq 2000,PAIRED,stool,Control,MMRS86168210ST,MMRS86168210ST,79.0,Male,USA,21.284602,Healthy,Healthy,75.424715,0.001181,0.001181,0.001181,0.001181


In [3]:
# Position (among the non-index columns of `raw`) where the abundance columns
# begin; every column before it is sample metadata.
FIRST_ABUNDANCE_COLUMN = 15

# Abundance matrix: the "UNKNOWN" column followed by the taxonomic clades
taxonomic_features = raw.iloc[:, FIRST_ABUNDANCE_COLUMN:]

# Full phenotype label of every sample (kept as a one-column DataFrame)
phenotype = raw[["PHENOTYPE_Disease"]]

# Boolean health label: True where the sample is annotated as "Healthy"
health_status = raw[["PHENOTYPE:Healthy_Nonhealthy"]]
is_healthy = health_status == "Healthy"

# Remove bad samples 

In [4]:
# Values of the "Sequencing Platform" column that count as non-Illumina;
# samples sequenced on any of these are filtered out later in the notebook.
non_illumina = [
    "454 GS FLX Titanium",
    "Ion Torrent PGM", "Ion Torrent Proton",
    "BGISEQ-500",
]

In [5]:
# Build a boolean NumPy mask (one entry per row of `raw`) that is True for every
# sample we would like to remove.
#
# Everything is derived from `raw` itself rather than from `phenotype`:
# `phenotype` is rebound to its filtered version further down, so reading it
# here would make this cell fail with a length mismatch when it is re-run.

disease = raw["PHENOTYPE_Disease"]
study_id = raw.index.get_level_values(0)  # first index level identifies the study

# we don't need a biomarker for obesity/underweight
weight_phenotypes = ["Underweight", "Overweight", "Obesity", "Obese"]

# all of these studies have some issue
excluded_studies = [
    "P4",       # P4 treats the poop for extracting viral DNA
    "P86",      # Healthy at baseline but half develop T2D
    "GMHI-19",  # Outlier study
    "P59",      # Are all technically healthy, but half are in heavily urbanized areas
                # and "Microbes with higher relative abundance in Chinese urban samples
                # have been associated with disease in other studies"
    "P63",      # Deals with semisupercentenarians, i.e., 105 to 109 years old
]

remove = (
    disease.isin(weight_phenotypes).values
    # to reduce batch effects, remove samples sequenced with non-illumina machines
    | raw["Sequencing Platform"].isin(non_illumina).values
    | study_id.isin(excluded_studies)
    # P48: only its "Healthy" samples are dropped (alcohol or smoking)
    | ((study_id == "P48") & (disease == "Healthy").values)
)

In [6]:
# Drop the flagged samples. The same positional mask is applied to all three
# tables so that they stay row-aligned.
# NOTE: the tables are rebound in place, so this cell cannot be re-run on its
# own - `remove` has one entry per row of the unfiltered tables.
keep_mask = ~remove
taxonomic_features = taxonomic_features.iloc[keep_mask]
is_healthy = is_healthy.iloc[keep_mask]
phenotype = phenotype.iloc[keep_mask]

In [7]:
# Re-normalize the abundances to the classified part of each sample: every row
# is divided by (100 - UNKNOWN) of that same row.
# NOTE: this rebinds `taxonomic_features`, so running the cell twice divides twice.
classified_share = 100 - taxonomic_features["UNKNOWN"]
taxonomic_features = taxonomic_features.divide(classified_share, axis="index")

In [8]:
# The "UNKNOWN" column was only needed as the normalization denominator
taxonomic_features = taxonomic_features.drop(columns=["UNKNOWN"])

# Summary of what is left: rows are samples, columns are taxonomic features,
# and the first index level identifies the study each sample came from.
n_samples, n_features = taxonomic_features.shape
n_studies = len(np.unique(taxonomic_features.index.get_level_values(0)))

print(f"Our dataset has {n_samples} samples with {n_features} unique taxonomic features")
print(f"Our dataset has {n_studies} independent studies")


Our dataset has 9045 samples with 3200 unique taxnomic features
Our dataset has 61 independent studies


In [9]:
# Sanity check: with 7 taxonomic ranks that should each sum to 1 after
# normalization, every sample's row total is expected to be ~7 (up to
# floating point error).
# NOTE(review): the recorded output contains at least one sample whose total is
# 1.0, so not every sample meets this expectation - worth checking before export.
rank_totals = taxonomic_features.sum(axis="columns")
rank_totals

Study_ID  Sample Accession
GMHI-23   SAMEA3879547        6.998069
          SAMEA3879551        6.999311
          SAMEA3879543        6.996683
          SAMEA3879565        6.996989
          SAMEA3879546        6.975655
                                ...   
P140      SAMN07509557        7.000000
          SAMN07509558        1.000000
          SAMN07509546        6.999737
          SAMN07509552        7.000000
          SAMN07509921        6.998958
Length: 9045, dtype: float64

# Yay we are done, export stuff

In [10]:
# Write the three row-aligned tables to CLEAN_DIR as CSV files.
# The directory is created first because to_csv does not create missing
# directories, which would make the export fail on a fresh checkout.
os.makedirs(CLEAN_DIR, exist_ok=True)

exports = {
    "taxonomic_features.csv": taxonomic_features,
    "is_healthy.csv": is_healthy,
    "phenotype.csv": phenotype,
}
for filename, table in exports.items():
    table.to_csv(os.path.join(CLEAN_DIR, filename))