This notebook prepares data from two different MGnify studies that were curated on GMrepo. The phenotype information (IBD or healthy) is used as the biological group of interest.

### Loading packages

In [1]:
import pandas as pd
import numpy as np

### Loading data

In [31]:
# Study 1 (PRJNA389280): tab-separated run metadata table.
study_1_meta = pd.read_csv(
    "metadata/all_runs_in_project_PRJNA389280.tsv", sep="\t", index_col=False
)
# Keep only the columns shared with the second study. The explicit .copy()
# matters: the phenotype column is overwritten below, and assigning into a
# bare `df[[...]]` selection triggers SettingWithCopyWarning on pandas < 3.
study_1_meta = study_1_meta[
    ["run ID", "associated phenotype", "project ID", "sex"]
].copy()

# Phenotype codes used in this metadata file -> short group labels.
# NOTE(review): the keys look like MeSH descriptor IDs — confirm against GMrepo.
phenotypes = {
    "D003424": "CD",
    "D003093": "UC",
    "D006262": "nonIBD",
}

# Relabel known codes; any value missing from `phenotypes` maps to NaN and is
# then filled back in with the original entry, so nothing is silently lost.
study_1_meta["associated phenotype"] = (
    study_1_meta["associated phenotype"]
    .map(phenotypes)
    .fillna(study_1_meta["associated phenotype"])
)

# Study 2 (PRJNA398089): comma-separated run table. Rename its columns to the
# names used for study 1, capitalise the sex labels, and keep only the four
# metadata columns shared by both studies.
study_2_meta = (
    pd.read_csv("metadata/sra_run_PRJNA398089.csv", index_col=False)
    .rename(
        columns={
            "Run": "run ID",
            "Host_disease": "associated phenotype",
            "BioProject": "project ID",
            "host_sex": "sex",
        }
    )
    .assign(sex=lambda meta: meta["sex"].str.capitalize())
)[["run ID", "associated phenotype", "project ID", "sex"]]

def load_run_abundances(path):
    """Load a taxonomy-abundance TSV and return it with one row per run.

    The file is read with its ``#SampleID`` column as the index and then
    transposed, so the entries of that column become the column labels and
    the file's remaining column headers become rows. Those headers are
    exposed as a regular ``run ID`` column so the table can be merged with
    the metadata on that key.

    Parameters
    ----------
    path : str or path-like
        Location of the tab-separated abundance table.

    Returns
    -------
    pandas.DataFrame
        Transposed table with a leading ``run ID`` column and a default
        integer index.
    """
    return (
        pd.read_csv(path, sep="\t", index_col="#SampleID")
        .transpose()
        .rename_axis("run ID")  # name the index so reset_index yields "run ID"
        .reset_index()
    )


# Same loading/reshaping steps for both studies; only the input file differs.
study_1_count = load_run_abundances(
    "count_data/PRJNA389280_taxonomy_abundances_SSU_v4.1.tsv"
)
study_2_count = load_run_abundances(
    "count_data/PRJNA398089_taxonomy_abundances_SSU_v5.0.tsv"
)

### Merging dataframes

In [43]:
# Join each study's metadata with its abundance table on the run identifier.
# The default inner join keeps only runs present in both tables.
merged_df_1 = study_1_meta.merge(study_1_count, on="run ID")
merged_df_2 = study_2_meta.merge(study_2_count, on="run ID")

# Stack the two studies row-wise; join="inner" keeps only the columns that
# occur in both merged frames.
merged_ibd_df = pd.concat(
    [merged_df_1, merged_df_2],
    axis=0,
    join="inner",
    ignore_index=True,
    sort=False,
)

# Column bookkeeping: labels common to both studies and labels unique to each.
shared_columns = merged_df_1.columns.intersection(merged_df_2.columns)
only_df_1 = merged_df_1.columns.difference(merged_df_2.columns)
only_df_2 = merged_df_2.columns.difference(merged_df_1.columns)

# Export is currently disabled; uncomment to write the combined table.
# merged_ibd_df.to_csv("IBD_dataset.csv", index=False)
shared_columns

Index(['run ID', 'associated phenotype', 'project ID', 'sex',
       'sk__Archaea;k__;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter',
       'sk__Archaea;k__;p__Euryarchaeota;c__Thermoplasmata;o__Methanomassiliicoccales;f__Methanomassiliicoccaceae;g__Methanomassiliicoccus',
       'sk__Bacteria', 'sk__Bacteria;k__;p__Acidobacteria',
       'sk__Bacteria;k__;p__Acidobacteria;c__Acidobacteriia',
       'sk__Bacteria;k__;p__Acidobacteria;c__Acidobacteriia;o__Acidobacteriales',
       ...
       'sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales',
       'sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Akkermansiaceae',
       'sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Akkermansiaceae;g__Akkermansia',
       'sk__Bacteria;k__;p__Verrucomicrobia;c__Verrucomicrobiae;o__Verrucomicrobiales;f__Akkermansiaceae;g__Akkermansia;s__Akkermansia_m