In [1]:
import biom
from biom import Table
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score

In [167]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [225]:
rf_model = RandomForestClassifier(n_estimators=500, max_depth=5, max_features="log2", random_state=42)

### synthetic data

In [229]:
metadata_filename = "/home/dongbiao/software/Phylo-Spec/data/Synthetic_Dataset_1/metadata.txt"

In [234]:
fold = []
pred_prob = []
sampel_id = []
for i in range(1, 6):
    train_table = f"/home/dongbiao/software/Phylo-Spec/data/Synthetic_Dataset_1/data/train_{i}.biom"
    test_table = f"//home/dongbiao/software/Phylo-Spec/data/Synthetic_Dataset_1/data/test_{i}.biom"
    X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
    X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
    metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)
    y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
    y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)
    
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    y_prob = rf_model.predict_proba(X_test)
    
    pred_prob = pred_prob + list(y_prob[:, 1])
    sampel_id = sampel_id + list(X_test.index.values)
    fold = fold + [f"fold_{i}"] * len(y_prob[:, 1])
    
    auc = roc_auc_score(y_test, y_prob[:, 1])
    print(auc)

pred_res = pd.DataFrame({"sampel_id": sampel_id, "fold": fold, "pred_prob": pred_prob})
pred_res.to_csv("/home/dongbiao/GCN/data/synthetic_data/results/RF_results.csv", index=False)

  return fit_method(estimator, *args, **kwargs)


0.8888888888888888


  return fit_method(estimator, *args, **kwargs)


0.9151785714285714


  return fit_method(estimator, *args, **kwargs)


0.88


  return fit_method(estimator, *args, **kwargs)


0.8285714285714286


  return fit_method(estimator, *args, **kwargs)


0.7740384615384616


### IBD_16S

In [319]:
metadata_filename = "/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_16S_IBD/metadata.tsv"

fold = []
pred_prob = []
sampel_id = []
for i in range(1, 6):
    train_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_16S_IBD/data/train_{i}.biom"
    test_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_16S_IBD/data/test_{i}.biom"
    X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
    X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
    metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)

    test_sid = X_test.index.values
    y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
    y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)

    X_train = X_train.values
    X_test = X_test.values
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    y_prob = rf_model.predict_proba(X_test)

    pred_prob = pred_prob + list(y_prob[:, 1])
    sampel_id = sampel_id + list(test_sid)
    fold = fold + [f"fold_{i}"] * len(y_prob[:, 1])
    
    auc = roc_auc_score(y_test, y_prob[:, 1])
    print(auc)

pred_res = pd.DataFrame({"sampel_id": sampel_id, "fold": fold, "pred_prob": pred_prob})
pred_res.to_csv("/home/dongbiao/GCN/data/IBD_16S/results/RF_results.csv", index=False)



0.9325412368890628




0.9380045205037134




0.8468944099378882




0.8926795816903823




0.9372976155431263


In [338]:
metadata_filename = "/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_16S_IBD/metadata.tsv"
metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)

auc = []
group_1 = []
group_2 = []
for n in ["without_low", "without_high"]:
    for m in ["0.1", "0.2", "0.4", "0.5", "0.6", "0.8"]:
        temp_auc = []
        for i in [1, 2, 3, 4, 5]:
            train_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_16S_IBD/data/train_{n}_{m}_{i}.biom"
            test_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_16S_IBD/data/test_{n}_{m}_{i}.biom"
            X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
            X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
        
            test_sid = X_test.index.values
            y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
            y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)
        
            X_train = X_train.values
            X_test = X_test.values
            smote = SMOTE(random_state=42)
            X_train, y_train = smote.fit_resample(X_train, y_train)
        
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        
            rf_model.fit(X_train, y_train)
            y_pred = rf_model.predict(X_test)
            y_prob = rf_model.predict_proba(X_test)
            
            temp_auc.append(roc_auc_score(y_test, y_prob[:, 1]))
        auc.append(np.mean(temp_auc))
        group_1.append(n)
        group_2.append(m)

pred_res = pd.DataFrame({"group_1": group_1, "group_2": group_2, "value": auc})
pred_res.to_csv("/home/dongbiao/GCN/data/IBD_16S/results/RF_results_biomark.csv", index=False)



### CRC_16S

In [243]:
metadata_filename = "/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_16S_CRC/metadata.tsv"

fold = []
pred_prob = []
sampel_id = []
for i in range(1, 6):
    train_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_16S_CRC/data/train_{i}.biom"
    test_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_16S_CRC/data/test_{i}.biom"
    X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
    X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
    metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)
    y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
    y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)

    test_sid = X_test.index.values
    X_train = X_train.values
    X_test = X_test.values
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    y_prob = rf_model.predict_proba(X_test)

    pred_prob = pred_prob + list(y_prob[:, 1])
    sampel_id = sampel_id + list(test_sid)
    fold = fold + [f"fold_{i}"] * len(y_prob[:, 1])
    
    auc = roc_auc_score(y_test, y_prob[:, 1])
    print(auc)

pred_res = pd.DataFrame({"sampel_id": sampel_id, "fold": fold, "pred_prob": pred_prob})
pred_res.to_csv("/home/dongbiao/GCN/data/CRC_16S/results/RF_results.csv", index=False)



0.8837555886736215




0.8232704402515724




0.8961538461538461




0.895




0.8307692307692308


In [336]:
metadata_filename = "/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_16S_CRC/metadata.tsv"
metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)

auc = []
group_1 = []
group_2 = []
for n in ["without_low", "without_high"]:
    for m in ["0.1", "0.2", "0.4", "0.5", "0.6", "0.8"]:
        temp_auc = []
        for i in [1, 2, 3, 4, 5]:
            train_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_16S_CRC/data/train_{n}_{m}_{i}.biom"
            test_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_16S_CRC/data/test_{n}_{m}_{i}.biom"
            X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
            X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
        
            test_sid = X_test.index.values
            y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
            y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)
        
            X_train = X_train.values
            X_test = X_test.values
            smote = SMOTE(random_state=42)
            X_train, y_train = smote.fit_resample(X_train, y_train)
        
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        
            rf_model.fit(X_train, y_train)
            y_pred = rf_model.predict(X_test)
            y_prob = rf_model.predict_proba(X_test)
            
            temp_auc.append(roc_auc_score(y_test, y_prob[:, 1]))
        auc.append(np.mean(temp_auc))
        group_1.append(n)
        group_2.append(m)

pred_res = pd.DataFrame({"group_1": group_1, "group_2": group_2, "value": auc})
pred_res.to_csv("/home/dongbiao/GCN/data/CRC_16S/results/RF_results_biomark.csv", index=False)



### diatery fiber 16S

In [277]:
metadata_filename = "/home/dongbiao/GCN/data/dietary_fiber/metadata.tsv"

fold = []
pred_prob = []
sampel_id = []
for i in range(1, 6):
    train_table = f"/home/dongbiao/GCN/data/dietary_fiber/data/train_{i}.biom"
    test_table = f"/home/dongbiao/GCN/data/dietary_fiber/data/test_{i}.biom"
    X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
    X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
    metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)

    test_sid = X_test.index.values
    y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
    y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)

    X_train = X_train.values
    X_test = X_test.values
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    y_prob = rf_model.predict_proba(X_test)

    pred_prob = pred_prob + list(y_prob[:, 1])
    sampel_id = sampel_id + list(test_sid)
    fold = fold + [f"fold_{i}"] * len(y_prob[:, 1])
    
    auc = roc_auc_score(y_test, y_prob[:, 1])
    print(auc)

pred_res = pd.DataFrame({"sampel_id": sampel_id, "fold": fold, "pred_prob": pred_prob})
pred_res.to_csv("/home/dongbiao/GCN/data/dietary_fiber/results/RF_results.csv", index=False)



0.708914030692139




0.6635896214241758




0.6767662801951451




0.6695281522601111




0.6990250361097736


In [326]:
metadata_filename = "/home/dongbiao/GCN/data/dietary_fiber/metadata.tsv"
metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)

auc = []
group_1 = []
group_2 = []
for n in ["without_low", "without_high"]:
    for m in ["0.1", "0.2", "0.4", "0.5", "0.6", "0.8"]:
        temp_auc = []
        for i in [1, 2, 3, 4, 5]:
            train_table = f"/home/dongbiao/GCN/data/dietary_fiber/data/train_{n}_{m}_{i}.biom"
            test_table = f"/home/dongbiao/GCN/data/dietary_fiber/data/test_{n}_{m}_{i}.biom"
            X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
            X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
        
            test_sid = X_test.index.values
            y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
            y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)

            X_train = X_train.values
            X_test = X_test.values
            smote = SMOTE(random_state=42)
            X_train, y_train = smote.fit_resample(X_train, y_train)
        
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
    
            rf_model.fit(X_train, y_train)
            y_pred = rf_model.predict(X_test)
            y_prob = rf_model.predict_proba(X_test)
            
            temp_auc.append(roc_auc_score(y_test, y_prob[:, 1]))
        auc.append(np.mean(temp_auc))
        group_1.append(n)
        group_2.append(m)

pred_res = pd.DataFrame({"group_1": group_1, "group_2": group_2, "value": auc})
pred_res.to_csv("/home/dongbiao/GCN/data/dietary_fiber/results/RF_results_biomark.csv", index=False)



### CRC WGS

In [244]:
metadata_filename = "/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_WGS_CRC/metadata.tsv"

fold = []
pred_prob = []
sampel_id = []
for i in range(1, 6):
    train_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_WGS_CRC/data/train_{i}.biom"
    test_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_WGS_CRC/data/test_{i}.biom"
    X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
    X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
    metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)

    test_sid = X_test.index.values
    y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
    y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)

    X_train = X_train.values
    X_test = X_test.values
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    y_prob = rf_model.predict_proba(X_test)

    pred_prob = pred_prob + list(y_prob[:, 1])
    sampel_id = sampel_id + list(test_sid)
    fold = fold + [f"fold_{i}"] * len(y_prob[:, 1])
    
    auc = roc_auc_score(y_test, y_prob[:, 1])
    print(auc)

pred_res = pd.DataFrame({"sampel_id": sampel_id, "fold": fold, "pred_prob": pred_prob})
pred_res.to_csv("/home/dongbiao/GCN/data/CRC_WGS/results/RF_results.csv", index=False)



0.9571428571428571




0.7055555555555556




0.859375




0.8288770053475936




0.903061224489796


In [327]:
metadata_filename = "/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_WGS_CRC/metadata.tsv"
metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)

auc = []
group_1 = []
group_2 = []
for n in ["without_low", "without_high"]:
    for m in ["0.1", "0.2", "0.4", "0.5", "0.6", "0.8"]:
        temp_auc = []
        for i in [1, 2, 3, 4, 5]:
            train_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_WGS_CRC/data/train_{n}_{m}_{i}.biom"
            test_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_WGS_CRC/data/test_{n}_{m}_{i}.biom"
            X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
            X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
        
            test_sid = X_test.index.values
            y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
            y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)
        
            X_train = X_train.values
            X_test = X_test.values
            smote = SMOTE(random_state=42)
            X_train, y_train = smote.fit_resample(X_train, y_train)
        
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        
            rf_model.fit(X_train, y_train)
            y_pred = rf_model.predict(X_test)
            y_prob = rf_model.predict_proba(X_test)
            
            temp_auc.append(roc_auc_score(y_test, y_prob[:, 1]))
        auc.append(np.mean(temp_auc))
        group_1.append(n)
        group_2.append(m)

pred_res = pd.DataFrame({"group_1": group_1, "group_2": group_2, "value": auc})
pred_res.to_csv("/home/dongbiao/GCN/data/CRC_WGS/results/RF_results_biomark.csv", index=False)



### T2D WGS

In [245]:
metadata_filename = "/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_WGS_T2D/metadata.tsv"

fold = []
pred_prob = []
sampel_id = []
for i in range(1, 6):
    train_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_WGS_T2D/data/train_{i}.biom"
    test_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_WGS_T2D/data/test_{i}.biom"
    X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
    X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
    metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)

    test_sid = X_test.index.values
    y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
    y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)

    X_train = X_train.values
    X_test = X_test.values
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    y_prob = rf_model.predict_proba(X_test)

    pred_prob = pred_prob + list(y_prob[:, 1])
    sampel_id = sampel_id + list(test_sid)
    fold = fold + [f"fold_{i}"] * len(y_prob[:, 1])
    
    auc = roc_auc_score(y_test, y_prob[:, 1])
    print(auc)

pred_res = pd.DataFrame({"sampel_id": sampel_id, "fold": fold, "pred_prob": pred_prob})
pred_res.to_csv("/home/dongbiao/GCN/data/T2D_WGS/results/RF_results.csv", index=False)



0.8303571428571428




0.8482142857142856




0.725




0.8888888888888888




0.7727272727272727


In [333]:
metadata_filename = "/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_WGS_T2D/metadata.tsv"
metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)

auc = []
group_1 = []
group_2 = []
for n in ["without_low", "without_high"]:
    for m in ["0.1", "0.2", "0.4", "0.5", "0.6", "0.8"]:
        temp_auc = []
        for i in [1, 2, 3, 4, 5]:
            train_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_WGS_T2D/data/train_{n}_{m}_{i}.biom"
            test_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_WGS_T2D/data/test_{n}_{m}_{i}.biom"
            X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
            X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
        
            test_sid = X_test.index.values
            y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
            y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)
        
            X_train = X_train.values
            X_test = X_test.values
            smote = SMOTE(random_state=42)
            X_train, y_train = smote.fit_resample(X_train, y_train)
        
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        
            rf_model.fit(X_train, y_train)
            y_pred = rf_model.predict(X_test)
            y_prob = rf_model.predict_proba(X_test)
            
            temp_auc.append(roc_auc_score(y_test, y_prob[:, 1]))
        auc.append(np.mean(temp_auc))
        group_1.append(n)
        group_2.append(m)

pred_res = pd.DataFrame({"group_1": group_1, "group_2": group_2, "value": auc})
pred_res.to_csv("/home/dongbiao/GCN/data/T2D_WGS/results/RF_results_biomark.csv", index=False)



### Multi-status classification

In [250]:
metadata_filename = "/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_Multi-classification/metadata.tsv"

In [248]:
fold = []
pred_prob = []
sampel_id = []
for i in range(1, 6):
    train_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_Multi-classification/data/train_{i}.biom"
    test_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_Multi-classification/data/test_{i}.biom"
    X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
    X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
    metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)

    test_sid = X_test.index.values
    y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
    y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)

    X_train = X_train.values
    X_test = X_test.values
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    y_prob = rf_model.predict_proba(X_test)

    pred_prob = pred_prob + list(y_prob[:, 1])
    sampel_id = sampel_id + list(test_sid)
    fold = fold + [f"fold_{i}"] * len(y_prob[:, 1])
    
    auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
    print(auc)

pred_res = pd.DataFrame({"sampel_id": sampel_id, "fold": fold, "pred_prob": pred_prob})
pred_res.to_csv("/home/dongbiao/GCN/data/Multi_classification/results/RF_results.csv", index=False)



0.8988951575635363




0.9091547261590236




0.8986092882744199




0.9165710677702043




0.8789807486757713


In [312]:
results_list = []
for i in range(1, 6):
    train_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_Multi-classification/data/train_{i}.biom"
    test_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_Multi-classification/data/test_{i}.biom"
    
    X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
    X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
    
    metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)

    test_sid = X_test.index.values
    
    y_train = metadata.loc[X_train.index.values, "group"].values
    y_test = metadata.loc[X_test.index.values, "group"].values

    X_train = X_train.values
    X_test = X_test.values
    
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    rf_model.fit(X_train, y_train)
    
    y_prob = rf_model.predict_proba(X_test)
    class_names = rf_model.classes_
    fold_df = pd.DataFrame(y_prob, columns=class_names)

    fold_df["sample_id"] = test_sid
    fold_df["fold"] = f"fold_{i}"

    results_list.append(fold_df)
    
    auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
    print(f"Fold {i} AUC: {auc}")

final_results = pd.concat(results_list, axis=0)
cols = ["sample_id", "fold"] + [c for c in final_results.columns if c not in ["sample_id", "fold"]]
final_results = final_results[cols]
save_path = "/home/dongbiao/GCN/data/Multi_classification/results/RF_results.csv"
final_results.to_csv(save_path, index=False)

KeyError: "None of [Index(['ERR1368879', 'ERR1368880', 'ERR1368881', 'ERR1368882', 'ERR1368885',\n       'ERR1368886', 'ERR1368887', 'ERR1368889', 'ERR1368891', 'ERR1368894',\n       ...\n       'SRR9160350', 'SRR9160352', 'SRR9160353', 'SRR9160354', 'SRR9160355',\n       'SRR9160356', 'SRR9160357', 'SRR9160358', 'SRR9160359', 'SRR9160360'],\n      dtype='object', name='Run', length=1585)] are in the [index]"

In [339]:
metadata_filename = "/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_Multi-classification/metadata.tsv"
metadata = pd.read_csv(metadata_filename, sep="\t", index_col=0)

auc = []
group_1 = []
group_2 = []
for n in ["without_low", "without_high"]:
    for m in ["0.1", "0.2", "0.4", "0.5", "0.6", "0.8"]:
        temp_auc = []
        for i in [1, 2, 3, 4, 5]:
            train_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_Multi-classification/data/train_{n}_{m}_{i}.biom"
            test_table = f"/home/dongbiao/software/Phylo-Spec/data/Real_Dateset_Multi-classification/data/test_{n}_{m}_{i}.biom"
            X_train = biom.load_table(train_table).norm(axis='sample', inplace=False).to_dataframe().T
            X_test = biom.load_table(test_table).norm(axis='sample', inplace=False).to_dataframe().T
        
            test_sid = X_test.index.values
            y_train = np.array(metadata.loc[X_train.index.values, "group"]).reshape(-1, 1)
            y_test = np.array(metadata.loc[X_test.index.values, "group"]).reshape(-1, 1)
        
            X_train = X_train.values
            X_test = X_test.values
            smote = SMOTE(random_state=42)
            X_train, y_train = smote.fit_resample(X_train, y_train)
        
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        
            rf_model.fit(X_train, y_train)
            y_pred = rf_model.predict(X_test)
            y_prob = rf_model.predict_proba(X_test)
            
            temp_auc.append(roc_auc_score(y_test, y_prob, multi_class='ovr'))
        auc.append(np.mean(temp_auc))
        group_1.append(n)
        group_2.append(m)

pred_res = pd.DataFrame({"group_1": group_1, "group_2": group_2, "value": auc})
pred_res.to_csv("/home/dongbiao/GCN/data/Multi_classification/results/RF_results_biomark.csv", index=False)

