This notebook prepares data from 2 different MGnify studies that were curated on GMrepo. The phenotype information (IBD or healthy) is used as the biological group of interest.

# Obtaining data

### Loading packages

In [2]:
import pandas as pd
import numpy as np

### Loading data

In [31]:
# Metadata columns kept for both studies, in the order used downstream.
META_COLUMNS = ["run ID", "associated phenotype", "project ID", "sex"]


def _load_count_table(path):
    """Read a taxonomy-abundance TSV and return it with one row per run.

    The file is read with the "#SampleID" column as index (one row per taxon)
    and transposed, so each row becomes one run and each taxon one column.
    The run identifiers are exposed as a regular "run ID" column so the table
    can be merged with the metadata.

    Parameters
    ----------
    path : str
        Location of the tab-separated abundance table.

    Returns
    -------
    pandas.DataFrame
        Transposed table with a leading "run ID" column.
    """
    return (
        pd.read_csv(path, sep="\t", index_col="#SampleID")
        .transpose()
        .rename_axis("run ID")  # name the run index so reset_index yields "run ID"
        .reset_index()
    )


# --- Study 1 metadata (PRJNA389280) ---------------------------------------
study_1_meta = pd.read_csv(
    "metadata/all_runs_in_project_PRJNA389280.tsv", sep="\t", index_col=False
)
# .copy() gives an independent frame, so the column assignment below does not
# raise SettingWithCopyWarning.
study_1_meta = study_1_meta[META_COLUMNS].copy()

# Phenotype codes found in the study 1 metadata and the labels used here.
# NOTE(review): codes look like MeSH identifiers - confirm against GMrepo.
phenotypes = {
    "D003424": "CD",
    "D003093": "UC",
    "D006262": "nonIBD",
}

study_1_meta["associated phenotype"] = (
    study_1_meta["associated phenotype"]
    .map(phenotypes)
    .fillna(study_1_meta["associated phenotype"])  # keep originals if not in mapping
)

# --- Study 2 metadata (PRJNA398089) ---------------------------------------
# Rename the columns so both studies share the same metadata schema.
study_2_meta = pd.read_csv(
    "metadata/sra_run_PRJNA398089.csv", index_col=False
).rename(
    columns={
        "Run": "run ID",
        "Host_disease": "associated phenotype",
        "BioProject": "project ID",
        "host_sex": "sex",
    }
)

# Capitalise the sex labels (e.g. "male" -> "Male").
study_2_meta["sex"] = study_2_meta["sex"].str.capitalize()

study_2_meta = study_2_meta[META_COLUMNS].copy()

# --- Count tables ---------------------------------------------------------
study_1_count = _load_count_table(
    "count_data/PRJNA389280_taxonomy_abundances_SSU_v4.1.tsv"
)
study_2_count = _load_count_table(
    "count_data/PRJNA398089_taxonomy_abundances_SSU_v5.0.tsv"
)

### Merging dataframes

In [43]:
# Attach the metadata to the counts of each study. pd.merge defaults to an
# inner join, so only runs present in both the metadata and the count table
# are kept.
merged_df_1 = pd.merge(study_1_meta, study_1_count, on="run ID")
merged_df_2 = pd.merge(study_2_meta, study_2_count, on="run ID")

# Column bookkeeping: what both studies share and what is unique to each.
shared_columns = merged_df_1.columns.intersection(merged_df_2.columns)
only_df_1 = merged_df_1.columns.difference(merged_df_2.columns)
only_df_2 = merged_df_2.columns.difference(merged_df_1.columns)

# Stack the two studies row-wise; join="inner" keeps only the columns
# present in both studies.
merged_ibd_df = pd.concat(
    [merged_df_1, merged_df_2],
    axis=0,
    ignore_index=True,
    sort=False,
    join="inner",
)

# Persist the merged table: the genus-level filtering section reads
# "IBD_dataset.csv", so it has to be (re)written here for the notebook to
# run top to bottom on a fresh kernel.
merged_ibd_df.to_csv("IBD_dataset.csv", index=False)

shared_columns

Index(['run ID', 'associated phenotype', 'project ID', 'sex',
       'sk__Archaea;k__;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter',
       'sk__Archaea;k__;p__Euryarchaeota;c__Thermoplasmata;o__Methanomassiliicoccales;f__Methanomassiliicoccaceae;g__Methanomassiliicoccus',
       'sk__Bacteria', 'sk__Bacteria;k__;p__Acidobacteria',
       'sk__Bacteria;k__;p__Acidobacteria;c__Acidobacteriia',
       'sk__Bacteria;k__;p__Acidobacteria;c__Acidobacteriia;o__Acidobacteriales',
       ...
       'sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales',
       'sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Akkermansiaceae',
       'sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Akkermansiaceae;g__Akkermansia',
       'sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Akkermansiaceae;g__Akkermansia;s__Akkermansia_m

# Filtering for genus level taxa

### Loading data

In [3]:
# Location of the merged IBD table (metadata + taxonomy counts).
path = "IBD_dataset.csv"

# Read the table back from disk.
merged_dataset = pd.read_csv(filepath_or_buffer=path)

# Display the loaded frame as the cell output.
merged_dataset

Unnamed: 0,run ID,associated phenotype,project ID,sex,sk__Archaea;k__;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter,sk__Archaea;k__;p__Euryarchaeota;c__Thermoplasmata;o__Methanomassiliicoccales;f__Methanomassiliicoccaceae;g__Methanomassiliicoccus,sk__Bacteria,sk__Bacteria;k__;p__Acidobacteria,sk__Bacteria;k__;p__Acidobacteria;c__Acidobacteriia,sk__Bacteria;k__;p__Acidobacteria;c__Acidobacteriia;o__Acidobacteriales,...,sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales,sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Akkermansiaceae,sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Akkermansiaceae;g__Akkermansia,sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Akkermansiaceae;g__Akkermansia;s__Akkermansia_muciniphila,sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Verrucomicrobiaceae,sk__Eukaryota;k__Fungi;p__Basidiomycota,sk__Eukaryota;k__Fungi;p__Basidiomycota;c__Malasseziomycetes;o__Malasseziales;f__Malasseziaceae;g__Malassezia,sk__Eukaryota;k__Metazoa,sk__Eukaryota;k__Metazoa;p__Chordata;c__Mammalia,sk__Eukaryota;k__Viridiplantae
0,SRR5947807,CD,PRJNA389280,Female,0.0,0.0,2620.0,0.0,0.0,0.0,...,4.0,0.0,103.0,13.0,0.0,0.0,0.0,3.0,3.0,0.0
1,SRR5947808,UC,PRJNA389280,Female,0.0,0.0,5927.0,0.0,0.0,0.0,...,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,SRR5947809,CD,PRJNA389280,Male,0.0,0.0,2120.0,2.0,0.0,0.0,...,17.0,0.0,31.0,3.0,0.0,0.0,0.0,2.0,0.0,0.0
3,SRR5947810,nonIBD,PRJNA389280,Female,0.0,0.0,2523.0,0.0,0.0,0.0,...,42.0,0.0,141.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0
4,SRR5947811,CD,PRJNA389280,Male,0.0,0.0,706.0,0.0,0.0,0.0,...,0.0,0.0,28.0,7.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
519,SRR6704409,UC,PRJNA398089,Male,0.0,0.0,6.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
520,SRR6704410,CD,PRJNA398089,Male,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0
521,SRR6704411,nonIBD,PRJNA398089,Female,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
522,SRR6704412,CD,PRJNA398089,Male,0.0,0.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Extracting only taxa resolved to the genus level

In [4]:
# Minimum fraction of samples in which a taxon must be detected (count > 0)
# to be kept. 0.005 corresponds to 0.5% of the samples.
# NOTE(review): set this to 0.01 if a 1% prevalence cut-off was intended.
MIN_PREVALENCE = 0.005

# Keep only the numeric columns (the taxa counts).
taxa_dataset = merged_dataset.select_dtypes(include="number")

# Keep taxa whose lineage string contains a genus rank ("g__").
genus_dataset = taxa_dataset.filter(regex=r'g__')

# Drop lineages that go down to species level ("s__" at the start of the
# name or right after a ";"), leaving genus-level columns only.
genus_cols = genus_dataset.columns
mask_cols = ~genus_cols.str.contains(r'(?:^|;)s__')

genus_dataset_no_s = genus_dataset.loc[:, mask_cols].copy()

# Prevalence = fraction of samples with a non-zero count for each taxon;
# keep taxa that reach MIN_PREVALENCE.
prevalence = (genus_dataset_no_s > 0).sum(axis=0) / genus_dataset_no_s.shape[0]
df_genus = genus_dataset_no_s.loc[:, prevalence >= MIN_PREVALENCE]

# Remove samples whose retained genus counts sum to zero.
non_zero_sample = df_genus.sum(axis=1) > 0
df_genus = df_genus.loc[non_zero_sample]

# Re-attach the non-numeric (metadata) columns for the retained samples;
# both pieces share the same row index, so concat aligns them row by row.
metadata_kept = merged_dataset.loc[non_zero_sample].select_dtypes(exclude="number").copy()
df_genus_pd = pd.concat([metadata_kept, df_genus], axis=1)

df_genus_pd.to_csv("IBD_dataset_genus.csv", index=False)