In [108]:
import pandas as pd
import config
import os
from joblib import load
import numpy as np

In [109]:
# List the per-sample result files in the kefir MetaPhlAn3 directory that the cells below score.
! ls ../data/raw/kefir_metaphlan3_results/

Subj001_time1_S1_metaphlan3.txt
Subj001_time2_S2_metaphlan3.txt
Subj003_time1_S3_metaphlan3.txt
Subj004_time1_S4_metaphlan3.txt
Subj007_time1_S5_metaphlan3.txt
Subj008_time1_S6_metaphlan3.txt
Subj008_time2_S7_metaphlan3.txt
Subj011_time1_S8_metaphlan3.txt
Subj011_time2_S9_metaphlan3.txt
Subj013_time1_S10_metaphlan3.txt
Subj014_time1_S11_metaphlan3.txt
Subj015_time2_S12_metaphlan3.txt
Subj016_time1_S13_metaphlan3.txt
Subj016_time2_S14_metaphlan3.txt
Subj017_time1_ReExtract_S44_metaphlan3.txt
Subj020_time1_S15_metaphlan3.txt
Subj021_time1_S16_metaphlan3.txt
Subj021_time2_S17_metaphlan3.txt
Subj030_time1_S18_metaphlan3.txt
Subj030_time2_S19_metaphlan3.txt
Subj030_time3_S20_metaphlan3.txt
Subj031_time1_S21_metaphlan3.txt
Subj032_time1_S22_metaphlan3.txt
Subj032_time2_S23_metaphlan3.txt
Subj034_time1_S24_metaphlan3.txt
Subj034_time2_S25_metaphlan3.txt
Subj035_time1_S26_metaphlan3.txt
Subj039_time1_S27_metaphlan3.txt
Subj039_time2_S28_metaphlan3.txt
Subj040_time1_S29_metaphlan3.txt
Subj041_t

In [110]:
def compute_GMWI2(file, model=None, presence_cutoff=0.00001):
    """Score a single taxonomic profile with the GMWI2 model.

    Parameters
    ----------
    file : str or path-like
        Tab-separated profile table. The first three lines are skipped, the
        fourth is used as the header, column 0 supplies the taxon names and
        column 2 the abundance values (the layout of the MetaPhlAn3 outputs
        this notebook reads).
    model : object, optional
        Fitted estimator exposing ``feature_names_in_`` and
        ``decision_function``. When omitted, ``GMHI2_model.joblib`` is loaded
        from ``config.MODEL_DIR``. Pass an already-loaded model when scoring
        many files so the model is not re-read from disk on every call.
    presence_cutoff : float, optional
        A taxon counts as present when its normalized abundance is strictly
        greater than this value.

    Returns
    -------
    The first element of ``model.decision_function(...)`` evaluated on the
    boolean presence/absence row for this profile.
    """
    profile = pd.read_csv(file, sep="\t", skiprows=3, usecols=[0, 2], index_col=0).T

    if model is None:
        # NOTE(review): joblib files are pickle-based; only load model files from a trusted source.
        model = load(os.path.join(config.MODEL_DIR, "GMHI2_model.joblib"))
    features = list(model.feature_names_in_)

    # Align to the model's feature order. Taxa missing from the profile become
    # 0, and so does a missing "UNKNOWN" row (nothing unclassified). Reindexing
    # keeps the profile's own row label, so no row name has to be assumed.
    profile = profile.reindex(columns=["UNKNOWN"] + features, fill_value=0.0)

    # Renormalize over the classified part only: divide by (100 - UNKNOWN).
    # NOTE(review): this assumes abundances are percentages — confirm against the profiler output.
    classified_total = 100 - profile["UNKNOWN"]
    normalized = profile.drop(labels=["UNKNOWN"], axis=1).divide(classified_total, axis="rows")

    # The model is evaluated on presence/absence, not on the abundances themselves.
    return model.decision_function(normalized > presence_cutoff)[0]

In [111]:
kefir_dir = "../data/raw/kefir_metaphlan3_results/"

# Map each file name in the directory to its GMWI2 score, skipping any
# entry whose name contains "all".
gmwi2 = {
    profile_name: compute_GMWI2(os.path.join(kefir_dir, profile_name))
    for profile_name in os.listdir(kefir_dir)
    if "all" not in profile_name
}

In [112]:
# One row per scored file, sorted by file name, with a single "GMWI2" column.
pd.Series(gmwi2, name="GMWI2").sort_index().to_frame()

Unnamed: 0,GMWI2
Subj001_time1_S1_metaphlan3.txt,-0.574863
Subj001_time2_S2_metaphlan3.txt,0.176184
Subj003_time1_S3_metaphlan3.txt,-0.059041
Subj004_time1_S4_metaphlan3.txt,0.219228
Subj007_time1_S5_metaphlan3.txt,-1.794307
Subj008_time1_S6_metaphlan3.txt,-1.168975
Subj008_time2_S7_metaphlan3.txt,-0.804171
Subj011_time1_S8_metaphlan3.txt,-1.200323
Subj011_time2_S9_metaphlan3.txt,0.146706
Subj013_time1_S10_metaphlan3.txt,-1.005974


In [113]:
def compute_GMWI(file, model=None):
    """Score a single taxonomic profile with the original GMWI (GMHI) model.

    Parameters
    ----------
    file : str or path-like
        Tab-separated profile table. The first three lines are skipped, the
        fourth is used as the header, column 0 supplies the taxon names and
        column 2 the abundance values (the layout of the MetaPhlAn3 outputs
        this notebook reads).
    model : object, optional
        Fitted model exposing ``features`` and ``decision_function``. When
        omitted, ``GMHI_model.joblib`` is loaded from ``config.MODEL_DIR``.
        Pass an already-loaded model when scoring many files so the model is
        not re-read from disk on every call.

    Returns
    -------
    The first element of ``model.decision_function(...)`` evaluated on the
    normalized abundance row for this profile.
    """
    profile = pd.read_csv(file, sep="\t", skiprows=3, usecols=[0, 2], index_col=0).T

    if model is None:
        # NOTE(review): joblib files are pickle-based; only load model files from a trusted source.
        model = load(os.path.join(config.MODEL_DIR, "GMHI_model.joblib"))
    features = list(model.features)

    # Align to the model's feature order. Taxa missing from the profile become
    # 0, and so does a missing "UNKNOWN" row (nothing unclassified). Reindexing
    # keeps the profile's own row label, so no row name has to be assumed.
    profile = profile.reindex(columns=["UNKNOWN"] + features, fill_value=0.0)

    # Renormalize over the classified part only: divide by (100 - UNKNOWN).
    # NOTE(review): this assumes abundances are percentages — confirm against the profiler output.
    classified_total = 100 - profile["UNKNOWN"]
    normalized = profile.drop(labels=["UNKNOWN"], axis=1).divide(classified_total, axis="rows")

    # Unlike GMWI2, this model receives the abundances themselves, not presence/absence.
    return model.decision_function(normalized)[0]

In [114]:
kefir_dir = "../data/raw/kefir_metaphlan3_results/"

# Map each file name in the directory to its GMWI score, skipping any
# entry whose name contains "all".
gmwi = {
    profile_name: compute_GMWI(os.path.join(kefir_dir, profile_name))
    for profile_name in os.listdir(kefir_dir)
    if "all" not in profile_name
}

In [120]:
# Combine both score dictionaries into one table: rows are file names
# (sorted), columns are the two indices.
results = pd.DataFrame({"GMWI2": gmwi2, "GMWI": gmwi}).sort_index()
results

Unnamed: 0,GMWI2,GMWI
Subj001_time1_S1_metaphlan3.txt,-0.574863,-0.184521
Subj001_time2_S2_metaphlan3.txt,0.176184,1.512431
Subj003_time1_S3_metaphlan3.txt,-0.059041,0.355143
Subj004_time1_S4_metaphlan3.txt,0.219228,-1.004163
Subj007_time1_S5_metaphlan3.txt,-1.794307,-0.884695
Subj008_time1_S6_metaphlan3.txt,-1.168975,-0.334715
Subj008_time2_S7_metaphlan3.txt,-0.804171,0.141104
Subj011_time1_S8_metaphlan3.txt,-1.200323,-3.858764
Subj011_time2_S9_metaphlan3.txt,0.146706,2.706158
Subj013_time1_S10_metaphlan3.txt,-1.005974,-1.255278


In [116]:
# Persist the score table, as it exists at this point, to the log directory.
kefir_results_path = os.path.join(config.LOG_DIR, "kefir_results.csv")
results.to_csv(kefir_results_path)

In [88]:
# File names are split on "_": the first field is taken as the subject
# label and the second as the time-point label.
time = list(results.index.map(lambda sample_name: sample_name.split("_")[1]))
subject = list(results.index.map(lambda sample_name: sample_name.split("_")[0]))

results["time"] = time
results["subject"] = subject

In [89]:
# Keep every sample except those labelled "time3".
results = results.loc[results["time"].ne("time3")]
results

Unnamed: 0,GMWI2,GMWI,time,subject
Subj001_time1_S1_metaphlan3.txt,-0.574863,-0.184521,time1,Subj001
Subj001_time2_S2_metaphlan3.txt,0.176184,1.512431,time2,Subj001
Subj003_time1_S3_metaphlan3.txt,-0.059041,0.355143,time1,Subj003
Subj004_time1_S4_metaphlan3.txt,0.219228,-1.004163,time1,Subj004
Subj007_time1_S5_metaphlan3.txt,-1.794307,-0.884695,time1,Subj007
Subj008_time1_S6_metaphlan3.txt,-1.168975,-0.334715,time1,Subj008
Subj008_time2_S7_metaphlan3.txt,-0.804171,0.141104,time2,Subj008
Subj011_time1_S8_metaphlan3.txt,-1.200323,-3.858764,time1,Subj011
Subj011_time2_S9_metaphlan3.txt,0.146706,2.706158,time2,Subj011
Subj013_time1_S10_metaphlan3.txt,-1.005974,-1.255278,time1,Subj013


In [90]:
from scipy.stats import wilcoxon

In [95]:
# Non-null entry counts per subject; subjects with exactly two "time"
# entries are the ones kept for the paired comparison.
count = results.groupby("subject").agg("count")
in_both = count.loc[count["time"].eq(2)].index

In [99]:
# Restrict the table to samples from the subjects selected in `in_both`.
wilc_df = results.loc[results["subject"].isin(in_both)]

In [118]:
import matplotlib.pyplot as plt

# Pair the time1/time2 GMWI2 scores explicitly by subject. The Wilcoxon
# signed-rank test matches x[i] with y[i] by position, so two independently
# filtered slices are only paired correctly if their row order happens to
# agree. Pivoting puts each subject's two scores on one row, and raises if
# a subject has more than one sample for the same time point.
gmwi2_paired = (
    wilc_df
    .pivot(index="subject", columns="time", values="GMWI2")
    .dropna(subset=["time1", "time2"])
)

# One-sided test: are time1 scores lower than time2 scores?
wilcoxon(gmwi2_paired["time1"], gmwi2_paired["time2"], alternative='less')


WilcoxonResult(statistic=15.0, pvalue=0.029869507727696205)

In [119]:
# Pair the time1/time2 GMWI scores explicitly by subject. The Wilcoxon
# signed-rank test matches x[i] with y[i] by position, so two independently
# filtered slices are only paired correctly if their row order happens to
# agree. Pivoting puts each subject's two scores on one row, and raises if
# a subject has more than one sample for the same time point.
gmwi_paired = (
    wilc_df
    .pivot(index="subject", columns="time", values="GMWI")
    .dropna(subset=["time1", "time2"])
)

# One-sided test: are time1 scores lower than time2 scores?
wilcoxon(gmwi_paired["time1"], gmwi_paired["time2"], alternative='less')

WilcoxonResult(statistic=10.0, pvalue=0.0052490234375)