# Import libraries

In [10]:
import pandas as pd
import numpy as np
import os
import config
from joblib import dump, load
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, accuracy_score
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib import rc
from GMHI import GMHI

# Set constants

In [11]:
# Figure geometry and resolution (presumably consumed by plotting cells
# further down the notebook -- not used in the cells visible here).
figsize, dpi = (10, 10), 200

# Name of the SVG file; presumably the output target of the final figure.
filename = "performance_per_study_per_phenotype.svg"

# Typeset all figure text through LaTeX with a Computer Modern serif face.
# NOTE: usetex=True needs a working LaTeX installation on the machine.
rc('font', family='serif', serif=['Computer Modern'])
rc('text', usetex=True)

# Load data

In [12]:
# Every table uses its first two CSV columns as a two-level row index
# (shown as Study_ID / Sample Accession in the previews below).
index_cols = [0, 1]

# Inputs read from the cleaned-data directory.
X = pd.read_csv(os.path.join(config.CLEAN_DIR, "taxonomic_features.csv"), index_col=index_cols)
y = pd.read_csv(os.path.join(config.CLEAN_DIR, "is_healthy.csv"), index_col=index_cols)
phenotype = pd.read_csv(os.path.join(config.CLEAN_DIR, "phenotype.csv"), index_col=index_cols)

# Scores read from the prediction directory (file name suggests they are
# cross-validated GMHI2 scores).
GMHI2_scores_cv = pd.read_csv(
    os.path.join(config.PREDICTION_DIR, "GMHI2_scores_cv.csv"),
    index_col=index_cols,
)

# Preview the first rows of the features, the labels and the scores.
for preview in (X, y, GMHI2_scores_cv):
    display(preview.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,k__Archaea,k__Archaea|p__Euryarchaeota,k__Archaea|p__Euryarchaeota|c__Methanobacteria,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera|s__Methanosphaera_stadtmanae,k__Archaea|p__Euryarchaeota|c__Thermoplasmata,...,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Hordeivirus|s__Barley_stripe_mosaic_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Cactus_mild_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Cucumber_green_mottle_mosaic_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Paprika_mild_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Pepper_mild_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Tobacco_mild_green_mosaic_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Deep_sea_thermophilic_phage_D6E,k__Vir
uses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Loktanella_phage_pCB2051_A,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Tetraselmis_viridis_virus_S1
Study_ID,Sample Accession,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
GMHI-23,SAMEA3879547,0.019774,0.019774,0.019774,0.019774,0.019774,0.019774,0.019774,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GMHI-23,SAMEA3879551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GMHI-23,SAMEA3879543,0.003253,0.003253,0.003253,0.003253,0.003253,0.003253,0.003253,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GMHI-23,SAMEA3879565,0.008863,0.008863,0.008863,0.008863,0.008863,0.008863,0.008863,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GMHI-23,SAMEA3879546,4.8e-05,4.8e-05,4.8e-05,4.8e-05,4.8e-05,4.8e-05,4.8e-05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,Unnamed: 1_level_0,PHENOTYPE:Healthy_Nonhealthy
Study_ID,Sample Accession,Unnamed: 2_level_1
GMHI-23,SAMEA3879547,True
GMHI-23,SAMEA3879551,True
GMHI-23,SAMEA3879543,True
GMHI-23,SAMEA3879565,True
GMHI-23,SAMEA3879546,True


Unnamed: 0_level_0,Unnamed: 1_level_0,GMHI2_cv
Study_ID,Sample Accession,Unnamed: 2_level_1
GMHI-23,SAMEA3879547,1.884788
GMHI-23,SAMEA3879551,-0.062656
GMHI-23,SAMEA3879543,-0.890399
GMHI-23,SAMEA3879565,0.468921
GMHI-23,SAMEA3879546,1.171292


# Rank order and plot

In [14]:
# Sorted array of the distinct values found anywhere in the phenotype table.
np.unique(phenotype.values)

array(['Advanced Dementia', 'Ankylosing spondylitis',
       'Atherosclerotic cardiovascular disease', 'Behcet’s disease',
       'Breast Cancer', 'Colorectal adenoma', 'Colorectal cancer',
       "Crohn's Disease", 'End-stage renal disease', 'Graves’ disease',
       'Healthy', 'Hypertension', 'Impaired glucose tolerance',
       'Liver Cirrhosis', 'NAFLD', 'Pancreatic cancer',
       'Rheumatoid arthritis', 'Schizophrenia', 'Type 2 diabetes',
       'Ulcerative colitis'], dtype=object)

In [18]:
# A sample is predicted True when its GMHI2 score is strictly positive.
positive_score = (GMHI2_scores_cv > 0).values
info = pd.DataFrame(positive_score, index=GMHI2_scores_cv.index, columns=["predicted"])

# Attach the phenotype label and the ground-truth flag from the loaded tables.
info["phenotype"] = phenotype
info["y"] = y

# Drop samples whose score magnitude is at or below the cutoff. The mask is
# positional, which is safe here because `info` was built on the index of
# GMHI2_scores_cv and therefore has the same row order.
cutoff = 0.1
above_cutoff = (GMHI2_scores_cv.abs() > cutoff).values
info = info[above_cutoff]
info

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,phenotype,y
Study_ID,Sample Accession,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GMHI-23,SAMEA3879547,True,Healthy,True
GMHI-23,SAMEA3879543,False,Healthy,True
GMHI-23,SAMEA3879565,True,Healthy,True
GMHI-23,SAMEA3879546,True,Healthy,True
GMHI-23,SAMEA3879522,True,Healthy,True
...,...,...,...,...
P140,SAMN07509562,False,Ulcerative colitis,False
P140,SAMN07509555,False,Ulcerative colitis,False
P140,SAMN07509557,True,Ulcerative colitis,False
P140,SAMN07509546,False,Ulcerative colitis,False


In [25]:
# For the first ten phenotypes (in sorted order), print the phenotype name
# followed by the sorted study IDs that still have samples in `info`,
# with a blank line after each group.
for disease in np.unique(phenotype)[:10]:
    disease_rows = info.loc[(info["phenotype"] == disease).values]
    # Level 0 of the row index is the study identifier.
    study_ids = np.unique(disease_rows.index.get_level_values(0))
    print(disease, *study_ids, sep="\n")
    print()

Advanced Dementia
P113

Ankylosing spondylitis
GMHI-V-41

Atherosclerotic cardiovascular disease
GMHI-2

Behcet’s disease
P15

Breast Cancer
P32

Colorectal adenoma
GMHI-11
GMHI-3
GMHI-V-38
P48

Colorectal cancer
GMHI-23
GMHI-27
GMHI-3
GMHI-V-38
GMHI-V-40
P48

Crohn's Disease
GMHI-28
GMHI-31
GMHI-9
P135
P140
P56
P57

End-stage renal disease
P132

Graves’ disease
P39

