In [366]:
import biom
from biom import Table
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score

In [285]:
# Load the OTU count table and the sample metadata sheet, keep only the
# metadata rows for samples present in the table (in the table's sample
# order), and drop the two studies that are excluded from this analysis.
OTUs_table = biom.load_table("../../data/projects/table_6721_2378.biom")
metadata = (
    pd.read_excel("../../data/metadata.xls", index_col=0)
    .loc[OTUs_table.ids(axis="sample")]
    .loc[lambda frame: ~frame.study.isin(["PRJEB2165", "PRJNA385004"])]
)

In [286]:
# metadata = metadata.loc[metadata.fber_type != "starch-entrapped-microspheres"]

In [287]:
# Distinct study accessions remaining after filtering, in order of first appearance.
study_list = metadata["study"].unique()

In [288]:
# Display the studies that every validation scheme below iterates over.
study_list

array(['PRJEB41443', 'PRJNA560950', 'PRJNA780023', 'PRJNA891951',
       'PRJNA293971', 'PRJNA306884', 'PRJNA428736'], dtype=object)

# Species

### Within-study cross validation

In [421]:
# Classifier shared by the validation loops below; the fixed seed makes every
# refit reproducible, and 'balanced' reweights classes by their frequency.
rf_model = RandomForestClassifier(
    n_estimators=500,
    class_weight='balanced',
    random_state=42,
)

In [224]:
# Shuffled 5-fold splitter with a fixed seed; applied to subject IDs in the
# within-study loop so folds are drawn over subjects.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [225]:
# Within-study cross-validation.
# For each study, the folds produced by `kf` are drawn over *subjects* rather
# than samples, so every sample of a given subject falls on the same side of a
# split. The forest is refit on each training fold and scored on the held-out
# subjects by ROC AUC. One row per (study, fold) ends up in `results_cv`.
study_names = []
fold_list = []
auc_list = []
for study in study_list:
    metadata_pick = metadata.loc[metadata.study == study]
    inter_sid = np.intersect1d(OTUs_table.ids(axis="sample"), metadata_pick.index.values)
    OTUs_table_pick = OTUs_table.filter(inter_sid, axis="sample", inplace=False)
    # Per-sample normalisation (biom `Table.norm`); done once per study because
    # it only uses each sample's own counts, so no information crosses folds.
    OTUs_table_pick = OTUs_table_pick.norm(axis='sample', inplace=False)
    # biom tables are observations x samples; transpose to samples x features.
    OTUs_table_pick = OTUs_table_pick.to_dataframe().T

    host_id = metadata_pick.subject_id.unique()
    for fold, (train_idx, test_idx) in enumerate(kf.split(host_id)):
        # Vectorised subject membership; the test set is the exact complement
        # of the training set, so every sample is used in each fold.
        in_train = metadata_pick.subject_id.isin(host_id[train_idx]).values
        train_sid = metadata_pick.index.values[in_train]
        test_sid = metadata_pick.index.values[~in_train]

        X_train = OTUs_table_pick.loc[train_sid]
        X_test = OTUs_table_pick.loc[test_sid]
        y_train = metadata_pick.loc[train_sid, "group"].values
        y_test = metadata_pick.loc[test_sid, "group"].values

        rf_model.fit(X_train, y_train)
        # Only class probabilities are needed for the AUC; column 1 is the
        # second entry of `rf_model.classes_`. This assumes a binary `group`
        # with both classes present in the training and test subjects of the
        # fold (roc_auc_score raises otherwise).
        y_prob = rf_model.predict_proba(X_test)

        auc = roc_auc_score(y_test, y_prob[:, 1])

        study_names.append(study)
        fold_list.append(f"fold_{fold+1}")
        auc_list.append(auc)

results_cv = pd.DataFrame({"study": study_names, "fold": fold_list, "auc": auc_list})

In [226]:
# Persist the per-fold within-study AUC table (row index omitted).
cv_output_path = "../../data/classification/results_cv.csv"
results_cv.to_csv(cv_output_path, index=False)

### Cross-study validation (CSV)

In [229]:
# Fresh classifier for the cross-study experiments, configured identically to
# the within-study one (500 trees, fixed seed, balanced class weights).
rf_model = RandomForestClassifier(
    n_estimators=500,
    class_weight='balanced',
    random_state=42,
)

In [227]:
# Cross-study validation: train on one study, test on each of the others.
#
# The feature matrix and labels of a study do not depend on which study it is
# paired with, so they are built once per study up front. Likewise the forest
# is fit once per *training* study and reused for all of its test studies;
# with the fixed `random_state` a refit on the same data would reproduce the
# same model, so the reported AUCs are unchanged while the redundant refits
# are avoided.
study_features = {}
study_labels = {}
for study in study_list:
    study_sid = metadata.index.values[metadata.study.values == study]
    study_table = OTUs_table.filter(study_sid, axis="sample", inplace=False)
    # Per-sample normalisation, written with an explicit `inplace=False` to
    # match the within-study cell instead of relying on the in-place default.
    study_table = study_table.norm(axis='sample', inplace=False)
    # biom tables are observations x samples; transpose to samples x features.
    features = study_table.to_dataframe().T
    study_features[study] = features
    # Take labels in the row order of the feature matrix so X and y stay
    # aligned regardless of how `metadata` happens to be ordered.
    study_labels[study] = metadata.loc[features.index, "group"].values

train_study = []
test_study = []
auc_list = []
for i in study_list:
    rf_model.fit(study_features[i], study_labels[i])
    for j in study_list:
        if i == j:
            continue
        # Column 1 is the second entry of `rf_model.classes_`; assumes a
        # binary `group` with both classes present in train and test study.
        y_prob = rf_model.predict_proba(study_features[j])
        auc = roc_auc_score(study_labels[j], y_prob[:, 1])

        train_study.append(i)
        test_study.append(j)
        auc_list.append(auc)

results_csv = pd.DataFrame({"train_study": train_study, "test_study": test_study, "auc": auc_list})
results_csv.to_csv("../../data/classification/results_csv.csv", index=False)

### Leave-one-study-out validation (LOSO)

In [276]:
# Leave-one-study-out validation: for each study, train on the samples of all
# other studies and test on the held-out study, scoring by ROC AUC.
test_study = []
auc_list = []
for held_out in study_list:
    is_held_out = metadata.study.values == held_out
    train_sid = metadata.index.values[~is_held_out]
    test_sid = metadata.index.values[is_held_out]

    train_table = OTUs_table.filter(train_sid, axis="sample", inplace=False)
    # Mutates `train_table` in place (biom default). With the default axis
    # this may drop empty samples as well as empty observations, which is why
    # the labels below are looked up from the surviving matrix rows.
    train_table.remove_empty()
    # Features that are non-empty in the training studies; the test table is
    # restricted to the same set so both matrices share their columns.
    fid = train_table.ids(axis="observation")
    X_train = train_table.norm(axis='sample', inplace=False).to_dataframe().T

    test_table = OTUs_table.filter(test_sid, axis="sample", inplace=False)
    test_table = test_table.filter(fid, axis="observation", inplace=False)
    # Normalised after the feature restriction, as in the original cell.
    X_test = test_table.norm(axis='sample', inplace=False).to_dataframe().T

    # Align labels to the actual rows of each matrix rather than to the ID
    # lists, so X and y cannot drift apart in length or order.
    y_train = metadata.loc[X_train.index, "group"].values
    y_test = metadata.loc[X_test.index, "group"].values

    rf_model.fit(X_train, y_train)
    # Column 1 is the second entry of `rf_model.classes_`; assumes a binary
    # `group` with both classes present in the held-out study.
    y_prob = rf_model.predict_proba(X_test)

    auc = roc_auc_score(y_test, y_prob[:, 1])

    test_study.append(held_out)
    auc_list.append(auc)

results_loso = pd.DataFrame({"test_study": test_study, "auc": auc_list})
results_loso.to_csv("../../data/classification/results_loso.csv", index=False)

In [277]:
# Display the per-study AUC obtained when that study is held out.
results_loso

Unnamed: 0,test_study,auc
0,PRJEB41443,0.750678
1,PRJNA560950,0.701942
2,PRJNA780023,0.508412
3,PRJNA891951,0.559547
4,PRJNA293971,0.731875
5,PRJNA306884,0.82547
6,PRJNA428736,0.6522
