# Import libraries

In [1]:
import pandas as pd
import numpy as np
import os
import config
from joblib import dump, load
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, accuracy_score
from GMHI import GMHI

# Load model

In [2]:
# Restore the persisted GMHI2 model from the project's model directory and
# echo its repr so the hyper-parameters are visible in the notebook.
gmhi2_model_path = os.path.join(config.MODEL_DIR, "gmhi2_model.joblib")
gmhi2 = load(gmhi2_model_path)
gmhi2

LogisticRegression(C=0.03, class_weight='balanced', penalty='l1',
                   random_state=42, solver='liblinear')

# Load validation data

In [11]:
# Read the validation feature table. The first two CSV columns become the
# row MultiIndex (displayed below as Study_ID / Sample Accession).
features_cv_path = os.path.join(config.CLEAN_DIR, "taxonomic_features_cv.csv")
X_cv = pd.read_csv(features_cv_path, index_col=[0, 1])
X_cv

Unnamed: 0_level_0,Unnamed: 1_level_0,k__Archaea,k__Archaea|p__Euryarchaeota,k__Archaea|p__Euryarchaeota|c__Methanobacteria,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera|s__Methanosphaera_stadtmanae,k__Archaea|p__Euryarchaeota|c__Thermoplasmata,...,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Hordeivirus|s__Barley_stripe_mosaic_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Cactus_mild_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Cucumber_green_mottle_mosaic_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Paprika_mild_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Pepper_mild_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Tobacco_mild_green_mosaic_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Deep_sea_thermophilic_phage_D6E,k__Vir
uses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Loktanella_phage_pCB2051_A,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Tetraselmis_viridis_virus_S1
Study_ID,Sample Accession,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
P65,SAMEA104062441,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
P65,SAMEA104062442,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
P65,SAMEA104062443,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
P65,SAMEA104062444,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
P65,SAMEA104062445,0.02755,0.02755,0.02755,0.02755,0.02755,0.026885,0.026885,0.000665,0.000665,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P90,SAMN16701308,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
P90,SAMN16701460,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
P90,SAMN16701459,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
P90,SAMN16701458,0.00000,0.00000,0.00000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Collect the distinct labels of the outer index level (one entry per study).
# np.unique returns them sorted.
study_level_values = X_cv.index.get_level_values(0)
studies = np.unique(study_level_values)
studies

array(['P106', 'P107', 'P116', 'P37', 'P43', 'P65', 'P89', 'P90'],
      dtype=object)

# P107

In [37]:
# Study label analysed in the cells below; it is used as the outer-level key
# when slicing both the feature table and the metadata table.
study = "P107"

In [38]:
# Cross-section of the feature table for the selected study. The outer index
# level is dropped, leaving rows keyed by sample accession only.
X = X_cv.xs(study)
X

Unnamed: 0_level_0,k__Archaea,k__Archaea|p__Euryarchaeota,k__Archaea|p__Euryarchaeota|c__Methanobacteria,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera|s__Methanosphaera_stadtmanae,k__Archaea|p__Euryarchaeota|c__Thermoplasmata,...,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Hordeivirus|s__Barley_stripe_mosaic_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Cactus_mild_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Cucumber_green_mottle_mosaic_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Paprika_mild_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Pepper_mild_mottle_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Virgaviridae|g__Tobamovirus|s__Tobacco_mild_green_mosaic_virus,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Deep_sea_thermophilic_phage_D6E,k__Viruses|p__Viruses_unc
lassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Loktanella_phage_pCB2051_A,k__Viruses|p__Viruses_unclassified|c__Viruses_unclassified|o__Viruses_unclassified|f__Viruses_unclassified|g__Viruses_unclassified|s__Tetraselmis_viridis_virus_S1
Sample Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMEA6512889,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
SAMEA6513017,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
SAMEA6513054,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
SAMEA6513021,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
SAMEA6512856,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMEA6512989,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
SAMEA6512906,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
SAMEA6513071,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
SAMEA6512988,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
# Load the sample metadata (two-level row index, like the feature table) and
# keep only the rows belonging to the selected study.
meta_cv_path = os.path.join(config.CLEAN_DIR, "meta_cv2.csv")
meta = pd.read_csv(meta_cv_path, index_col=[0, 1]).xs(study)
meta

Unnamed: 0_level_0,patient_ID,Host_diet,timepoint
BioSample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SAMEA6512889,P3,MED DIET,baseline
SAMEA6513017,D20,CONTROL DIET,8 weeks
SAMEA6513054,P3,MED DIET,8 weeks
SAMEA6513021,D25,MED DIET,8 weeks
SAMEA6512856,D24,CONTROL DIET,baseline
...,...,...,...
SAMEA6512989,P22,MED DIET,4 weeks
SAMEA6512906,P21,MED DIET,baseline
SAMEA6513071,P21,MED DIET,8 weeks
SAMEA6512988,P21,MED DIET,4 weeks


In [62]:
# Score every sample with the GMHI2 model. Features are converted to booleans
# by comparing each abundance against config.PRESENCE_CUTOFF before scoring.
#
# decision_function returns a bare array in X's row order. Wrapping it in a
# Series keyed by X.index makes pandas attach each score to the matching
# sample label in `meta`, instead of relying on both tables happening to be
# in the same row order (which would silently mislabel scores otherwise).
gmhi2_scores = pd.Series(
    gmhi2.decision_function(X > config.PRESENCE_CUTOFF),
    index=X.index,
    name="GMHI2",
)

# Fail loudly if any metadata row has no feature row; otherwise it would
# quietly receive NaN after alignment.
unscored_samples = meta.index.difference(gmhi2_scores.index)
assert unscored_samples.empty, (
    f"{len(unscored_samples)} sample(s) in meta have no features, "
    f"e.g. {list(unscored_samples[:5])}"
)

# assign() builds a new frame (no write into a .loc slice) and aligns on index.
meta = meta.assign(GMHI2=gmhi2_scores)
meta

Unnamed: 0_level_0,patient_ID,Host_diet,timepoint,GMHI2
BioSample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SAMEA6512889,P3,MED DIET,baseline,1.723964
SAMEA6513017,D20,CONTROL DIET,8 weeks,1.280643
SAMEA6513054,P3,MED DIET,8 weeks,1.485948
SAMEA6513021,D25,MED DIET,8 weeks,1.819939
SAMEA6512856,D24,CONTROL DIET,baseline,0.843216
...,...,...,...,...
SAMEA6512989,P22,MED DIET,4 weeks,0.255275
SAMEA6512906,P21,MED DIET,baseline,2.036473
SAMEA6513071,P21,MED DIET,8 weeks,1.306591
SAMEA6512988,P21,MED DIET,4 weeks,1.970512


In [68]:
# Order the metadata rows by patient so each patient's samples are adjacent.
# NOTE(review): the name `sorted` shadows Python's builtin sorted() for the
# rest of the session; it is kept here because later cells reference it.
sorted = meta.sort_values(by="patient_ID")

In [71]:
# Keep only the samples whose Host_diet is exactly "MED DIET".
med_diet = sorted.loc[sorted["Host_diet"].eq("MED DIET")]

In [90]:
# Split the MED DIET samples into one frame per timepoint, each re-indexed by
# patient_ID so the timepoints can be compared patient-by-patient.
baseline, four, eight = (
    med_diet.loc[med_diet["timepoint"].eq(timepoint_label)].set_index("patient_ID")
    for timepoint_label in ("baseline", "4 weeks", "8 weeks")
)

In [96]:
# Per-patient change in GMHI2 between the 8-week sample and the baseline
# sample. Baseline rows are looked up for exactly the patients present at
# week 8 (a patient without a baseline row raises KeyError).
eight["GMHI2"].sub(baseline.loc[eight.index, "GMHI2"])

patient_ID
D1    -0.114054
D12    0.193077
D16   -0.639492
D21    0.547342
D22    0.597073
D25    1.314283
D26    0.384448
D27   -0.956227
D28    0.434767
D3     0.866691
D31   -0.189374
D32    0.701172
D35   -1.358728
D36    0.536719
D37   -0.278429
D38    0.373596
D4    -0.211432
D40   -0.542609
D42   -0.085396
D43   -0.272961
D46    0.679868
D5     0.136538
D51    0.277368
D52    0.201581
D53    0.406437
D6     0.312948
D9     0.089914
P10    1.076862
P11   -0.672249
P12   -0.789627
P13   -0.977734
P18    0.461843
P19    1.074823
P21   -0.729883
P22   -0.916986
P25   -0.275060
P26    0.707742
P27    1.784478
P3    -0.238016
P30    0.068248
P4    -0.603747
P5     1.085256
P6     0.581193
Name: GMHI2, dtype: float64

In [78]:
# Display the 4-week MED DIET subset.
# NOTE(review): the saved output for this cell shows a BioSample index with
# patient_ID as a regular column, i.e. it appears to predate the cell that
# re-indexes `four` by patient_ID (execution count 78 vs 90) — re-run to refresh.
four

Unnamed: 0_level_0,patient_ID,Host_diet,timepoint,GMHI2
BioSample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SAMEA6512916,D1,MED DIET,4 weeks,0.426413
SAMEA6512927,D12,MED DIET,4 weeks,1.366266
SAMEA6512931,D16,MED DIET,4 weeks,-0.004236
SAMEA6512936,D21,MED DIET,4 weeks,1.641205
SAMEA6512937,D22,MED DIET,4 weeks,1.452229
SAMEA6512939,D25,MED DIET,4 weeks,1.132065
SAMEA6512940,D26,MED DIET,4 weeks,1.332377
SAMEA6512941,D27,MED DIET,4 weeks,1.815695
SAMEA6512942,D28,MED DIET,4 weeks,-1.408881
SAMEA6512918,D3,MED DIET,4 weeks,-0.251437
