## Random Forest for Prediction

In [41]:
import pandas as pd

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# ignore warnings
# !!!!!!!!!!!!
# COMMENT OUT WHEN CHANGING THE CODE
import warnings
warnings.filterwarnings("ignore")

In [23]:
de_comps = pd.read_csv('results/multi/de_compounds.csv')
de_exp = pd.read_csv('SourceData/combined_exp_data.csv')[pd.concat([de_comps.Compound, pd.Series(['RID', 'DX.bl', 'AGE', 'PTEDUCAT', 'PTGENDER', 'APOE4'])])]

de_exp['PTGENDER'] = de_exp['PTGENDER'].map({'Male': 1, 'Female': 0})
comp_info = pd.read_csv('Results/multi/de_compounds.csv')[['Compound', 'annotation']]

In [24]:
de_exp

Unnamed: 0,X1084,X1090,X100002154,X100020361,X1084.1,X100001851,X100002154.1,X10479.18,X12414.31,X14156.33,...,X9266.1,X9599.6,X9884.8,X9900.36,RID,DX.bl,AGE,PTEDUCAT,PTGENDER,APOE4
0,1.468697,0.102706,1.156543,1.131452,1.468697,0.770269,1.156543,2.139829e+00,1.072448,1.139351,...,-0.377059,0.475920,0.747135,1.122776,4521,MCI,70.0,18.0,1,1.0
1,0.276340,-1.585510,-0.129522,0.619385,0.276340,-0.116378,-0.129522,1.926272e-01,-1.175869,-0.566141,...,0.296053,-0.055658,-0.885049,-1.318552,4488,CN,72.6,18.0,1,0.0
2,-0.386183,-1.585510,-0.534253,-0.720177,-0.386183,-0.207632,-0.534253,-2.155047e-01,-1.275524,-1.024589,...,-0.537343,0.598309,-1.341615,-0.859730,4516,CN,71.4,20.0,1,0.0
3,-1.745379,-1.585510,1.746855,-1.173428,-1.745379,-1.864430,1.746855,-5.546286e-01,0.276579,0.222636,...,0.168596,-1.141330,-0.019081,-0.098586,4530,MCI,75.9,18.0,1,1.0
4,-0.596943,-1.585510,0.738124,-0.540534,-0.596943,-0.401417,0.738124,3.385407e-01,-0.012317,-0.028570,...,1.400633,-0.842683,-0.253674,-0.405918,4462,MCI,71.3,18.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600,2.718196,1.269160,-1.890028,0.525402,2.718196,1.517968,-1.890028,1.868216e+00,1.417988,1.086680,...,-0.059231,0.513284,1.714269,0.736459,892,MCI,72.8,12.0,0,1.0
601,0.477322,-1.585510,-0.108782,0.194513,0.477322,0.652311,-0.108782,-8.121225e-16,0.040038,-0.290838,...,-0.444387,-0.286562,-0.444569,0.051159,1290,AD,79.3,8.0,0,1.0
602,-0.457112,0.656772,0.429408,1.929625,-0.457112,0.612991,0.429408,2.068961e-01,1.958468,1.552488,...,-0.537343,0.340201,1.510293,1.161533,438,AD,81.9,11.0,1,0.0
603,2.749958,0.623277,0.097312,1.456274,2.749958,2.532086,0.097312,4.125520e-01,0.486410,0.223092,...,-0.896680,1.045482,0.659391,0.440209,459,CN,72.9,16.0,1,0.0


In [25]:
cols = [col for col in de_exp.columns if col.startswith('X')]

### Functions

In [26]:
# Get subset of data according to condition
def cond_filter(data, cond1, cond2):

    subset = data[data['DX.bl'].isin([cond1, cond2])]
    subset['DX.bl'] = subset['DX.bl'].map({cond1: 0, cond2: 1})

    return(subset)

In [39]:
# Apply Random Forest Classifier for each compound in a list for given Diagnoses
def rf_apply(data, cols, test_size = 0.2):
    accuracies = []
    for i in cols:
        X = data.drop(columns=['DX.bl'])
        y = data[['DX.bl']]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

        rf = RandomForestClassifier()
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append([i, accuracy])
    return(accuracies)


## Subsetting Data

In [42]:
cn_ad = cond_filter(de_exp, 'CN', 'AD')

mci_ad = cond_filter(de_exp, 'MCI', 'AD')

cn_mci = cond_filter(de_exp, 'CN', 'MCI')

## Simple Random Forest Classifier

In [43]:
acc_cn_ad = rf_apply(cn_ad, cols)
acc_cn_ad

[['X1084', 0.9444444444444444],
 ['X1090', 0.9629629629629629],
 ['X100002154', 0.8518518518518519],
 ['X100020361', 0.8703703703703703],
 ['X1084', 0.8703703703703703],
 ['X100001851', 0.9074074074074074],
 ['X100002154', 0.9629629629629629],
 ['X10479.18', 0.8888888888888888],
 ['X12414.31', 0.8888888888888888],
 ['X14156.33', 0.8888888888888888],
 ['X15333.11', 0.8518518518518519],
 ['X21721.6', 0.9259259259259259],
 ['X4179.57', 0.7777777777777778],
 ['X4272.46', 0.9629629629629629],
 ['X5858.6', 0.8703703703703703],
 ['X8479.4', 0.8518518518518519],
 ['X9900.36', 0.8888888888888888],
 ['X10082.251', 0.8703703703703703],
 ['X4179.57', 0.9629629629629629],
 ['X9900.36', 0.9259259259259259],
 ['X10082.251', 0.8518518518518519],
 ['X10351.51', 0.8703703703703703],
 ['X10479.18', 0.9444444444444444],
 ['X11681.8', 0.8888888888888888],
 ['X12358.6', 0.8888888888888888],
 ['X12387.7', 0.8703703703703703],
 ['X12414.31', 0.9259259259259259],
 ['X12853.112', 0.8518518518518519],
 ['X12861.

In [44]:
acc_mci_ad = rf_apply(mci_ad, cols)
acc_mci_ad

[['X1084', 0.7608695652173914],
 ['X1090', 0.7608695652173914],
 ['X100002154', 0.8043478260869565],
 ['X100020361', 0.8043478260869565],
 ['X1084', 0.8586956521739131],
 ['X100001851', 0.8586956521739131],
 ['X100002154', 0.7934782608695652],
 ['X10479.18', 0.8478260869565217],
 ['X12414.31', 0.8369565217391305],
 ['X14156.33', 0.8369565217391305],
 ['X15333.11', 0.7608695652173914],
 ['X21721.6', 0.7934782608695652],
 ['X4179.57', 0.8043478260869565],
 ['X4272.46', 0.8152173913043478],
 ['X5858.6', 0.7391304347826086],
 ['X8479.4', 0.7934782608695652],
 ['X9900.36', 0.8478260869565217],
 ['X10082.251', 0.8369565217391305],
 ['X4179.57', 0.7282608695652174],
 ['X9900.36', 0.8478260869565217],
 ['X10082.251', 0.8152173913043478],
 ['X10351.51', 0.8260869565217391],
 ['X10479.18', 0.8152173913043478],
 ['X11681.8', 0.8043478260869565],
 ['X12358.6', 0.782608695652174],
 ['X12387.7', 0.8586956521739131],
 ['X12414.31', 0.7282608695652174],
 ['X12853.112', 0.8043478260869565],
 ['X12861.1

In [45]:
acc_cn_mci = rf_apply(cn_mci, cols)
acc_cn_mci

[['X1084', 0.7938144329896907],
 ['X1090', 0.7216494845360825],
 ['X100002154', 0.7010309278350515],
 ['X100020361', 0.6907216494845361],
 ['X1084', 0.7319587628865979],
 ['X100001851', 0.711340206185567],
 ['X100002154', 0.7835051546391752],
 ['X10479.18', 0.7525773195876289],
 ['X12414.31', 0.7731958762886598],
 ['X14156.33', 0.8041237113402062],
 ['X15333.11', 0.711340206185567],
 ['X21721.6', 0.711340206185567],
 ['X4179.57', 0.7525773195876289],
 ['X4272.46', 0.7010309278350515],
 ['X5858.6', 0.7216494845360825],
 ['X8479.4', 0.7525773195876289],
 ['X9900.36', 0.7525773195876289],
 ['X10082.251', 0.7938144329896907],
 ['X4179.57', 0.7216494845360825],
 ['X9900.36', 0.6804123711340206],
 ['X10082.251', 0.7422680412371134],
 ['X10351.51', 0.6804123711340206],
 ['X10479.18', 0.7216494845360825],
 ['X11681.8', 0.7628865979381443],
 ['X12358.6', 0.7319587628865979],
 ['X12387.7', 0.6907216494845361],
 ['X12414.31', 0.6185567010309279],
 ['X12853.112', 0.7422680412371134],
 ['X12861.13'

In [46]:
acc_cn_ad = pd.DataFrame(acc_cn_ad, columns=['Compound', 'accuracy_CNvsAD'])
acc_cn_mci = pd.DataFrame(acc_cn_mci, columns=['Compound', 'accuracy_CnvsMCI'])
acc_mci_ad = pd.DataFrame(acc_mci_ad, columns=['Compound', 'accuracy_MCIvsAD'])
results = pd.concat([acc_cn_ad, acc_cn_mci, acc_mci_ad, comp_info], axis=1)[['annotation', 'accuracy_CNvsAD', 'accuracy_CnvsMCI', 'accuracy_MCIvsAD']]

In [47]:
results.to_csv('results/multi/rf_de_comp_accuracy.csv', index=False)