# Classification for UKB data

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import math, boto3, tempfile
import scipy.stats as sp
from sklearn import manifold

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score
from sklearn.utils import resample

In [None]:
# Root of the analysis folder on S3; datasets are read from, and result tables
# written to, paths built on top of this prefix.
s3_path = 's3://ukb-colorectal-cancer/analysis/'

# Load the colorectal-cancer dataset and show its column names as the cell output.
df = pd.read_csv(f'{s3_path}crc_dataset.csv')
df.columns

## Decision Trees

In [None]:
# Single-feature decision-tree screen.
# For every column except the last one (expected to be the 'label_crc' target),
# fit a decision tree on a class-balanced subset of the rows that have a value
# for that column, and record hold-out precision / recall / accuracy / F1.
precision = []
recall = []
accuracy = []
f1 = []
bio_name = []

for col in df.columns[:-1]:
    df2 = df[[col, 'label_crc']].dropna()
    p_idx = df2['label_crc'] == True
    n_pos = int(p_idx.sum())
    if n_pos == 0:
        # No positive case has a value for this column: nothing to train on.
        continue

    # Create a balanced set: draw n_pos rows from each label group, re-seeding
    # per group. The groups are sampled explicitly instead of through
    # groupby(...).apply(...), because apply() on a frame that still contains
    # the grouping column is deprecated and newer pandas hides that column
    # from the applied function, which would break the pop() below.
    df2 = pd.concat([
        grp.sample(n=n_pos, replace=False, random_state=0)
        for _, grp in df2.groupby('label_crc')
    ])

    y = df2.pop('label_crc')
    X = np.array(df2).reshape(-1, 1)  # single feature -> column vector

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    clf = DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    precision.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    accuracy.append(accuracy_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred))
    bio_name.append(col)

In [None]:
# Render every row when the table is displayed.
pd.set_option('display.max_rows', None)

# One row per screened biomarker, ordered from highest to lowest accuracy.
clf_df = (
    pd.DataFrame()
    .assign(biomarker=bio_name, precision=precision, recall=recall, accuracy=accuracy, f1=f1)
    .sort_values(by='accuracy', ascending=False, ignore_index=True)
)

# Persist the table next to the other analysis outputs, then echo it.
clf_df.to_csv(s3_path+'tables/decision_trees_results.csv', index=False)
clf_df

In [None]:
import os

# Bar chart of hold-out accuracy per biomarker, in the order of clf_df.
fig1, ax1 = plt.subplots(figsize=(20,4))

# Draw on the axes object directly so `ax1` keeps referring to the Axes
# (it used to be overwritten with the BarContainer returned by plt.bar).
ax1.bar(np.arange(len(clf_df)), clf_df['accuracy'], tick_label=clf_df['biomarker'], color='navy')
ax1.axhline(y=0.5, color='r', linestyle=':')  # 0.5 reference line for the class-balanced sets
plt.xticks(rotation=80)
ax1.set_ylabel('Accuracy')
ax1.set_ylim((0.2,0.7))

# savefig does not create missing directories; make sure the target exists.
os.makedirs('./figures', exist_ok=True)
plt.savefig('./figures/decision_trees_results.jpg', dpi=150)
plt.show()


# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [None]:
from sklearn.model_selection import cross_validate

# Single-feature random-forest screen.
# For every column except the last one (expected to be the 'label_crc' target),
# cross-validate a random forest on a class-balanced subset and record the mean
# precision / recall / accuracy / F1 (rounded to 2 decimals).
precision = []
recall = []
accuracy = []
f1 = []
bio_name = []

cv_metrics = ['precision', 'recall', 'accuracy', 'f1']

for col in df.columns[:-1]:
    df2 = df[[col, 'label_crc']].dropna()
    p_idx = df2['label_crc'] == True
    n_pos = int(p_idx.sum())
    if n_pos == 0:
        # No positive case has a value for this column: nothing to evaluate.
        continue

    # Create a balanced set: draw n_pos rows from each label group, re-seeding
    # per group. Explicit per-group sampling replaces groupby(...).apply(...),
    # which is deprecated for frames that contain the grouping column and, on
    # newer pandas, no longer hands that column to the applied function.
    df2 = pd.concat([
        grp.sample(n=n_pos, replace=False, random_state=0)
        for _, grp in df2.groupby('label_crc')
    ])
    y = df2.pop('label_crc')
    X = np.array(df2).reshape(-1, 1)  # single feature -> column vector

    clf = RandomForestClassifier(n_estimators=100, max_depth=5, criterion="entropy", random_state=0)
    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)

    # One cross-validation pass scores all four metrics on the same fitted
    # folds; the seeded splitter and estimator make the numbers identical to
    # running cross_val_score once per metric, at a quarter of the fits.
    cv_scores = cross_validate(clf, X, y, cv=splitter, scoring=cv_metrics)
    precision.append(np.round(cv_scores['test_precision'].mean(), 2))
    recall.append(np.round(cv_scores['test_recall'].mean(), 2))
    accuracy.append(np.round(cv_scores['test_accuracy'].mean(), 2))
    f1.append(np.round(cv_scores['test_f1'].mean(), 2))
    bio_name.append(col)

In [None]:
# Render every row when the table is displayed.
pd.set_option('display.max_rows', None)

# One row per screened biomarker, ordered from highest to lowest accuracy.
clf_df = (
    pd.DataFrame()
    .assign(biomarker=bio_name, precision=precision, recall=recall, accuracy=accuracy, f1=f1)
    .sort_values(by='accuracy', ascending=False, ignore_index=True)
)

# Persist the table next to the other analysis outputs, then echo it.
clf_df.to_csv(s3_path+'tables/random_forests_results.csv', index=False)
clf_df

In [None]:
import os

# Bar chart of cross-validated accuracy per biomarker, in the order of clf_df.
fig1, ax1 = plt.subplots(figsize=(20,4))

# Draw on the axes object directly so `ax1` keeps referring to the Axes
# (it used to be overwritten with the BarContainer returned by plt.bar).
ax1.bar(np.arange(len(clf_df)), clf_df['accuracy'], tick_label=clf_df['biomarker'], color='mediumturquoise')
ax1.axhline(y=0.5, color='k', linestyle='-')  # 0.5 reference line for the class-balanced sets
plt.xticks(rotation=80)
ax1.set_ylabel('Accuracy')
ax1.set_ylim((0.3,0.6))
ax1.set_title('Random forest classification accuracy - single measures')
ax1.grid(axis='y')

# savefig does not create missing directories; make sure the target exists.
os.makedirs('./figures', exist_ok=True)
plt.savefig('./figures/random_forests_results.jpg', dpi=150)
plt.show()


# Recursive feature elimination

In [None]:
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Root of the analysis folder on S3.
s3_path = 's3://ukb-colorectal-cancer/analysis/'

# Replace the working frame with the all-cancer label table.
df = pd.read_csv(f'{s3_path}labels/all_cancer_labels.csv')

In [None]:
# Candidate predictor columns followed by the 'label_crc' target.
# NOTE(review): `df` was just read from labels/all_cancer_labels.csv, whose header is
# used further down as the list of cancer sites; confirm that file really carries
# these columns (crc_dataset.csv may have been the intended input).
rfe_columns = [
    'sex', 'age', 'ethnicity', 'townsend', 'bmi', 'pulse', 'dbp', 'sbp',
    'wbc', 'rbc', 'hgb', 'hct', 'plt', 'lym', 'ualb', 'cr_urine', 'potas',
    'sodium', 'apoa', 'apob', 'urea', 'chol', 'crp', 'cysc', 'hdl', 'igf1',
    'ldl', 'shbg', 'tst', 'tprotein', 'tgly', 'vitd',
    'label_crc',
]

# Keep only complete rows: every listed column must be present.
df2 = df[rfe_columns].dropna()

In [None]:
# X and df2 are the same object: popping the target off X leaves only the
# predictor columns in both names.
X = df2
y = X.pop('label_crc')

In [None]:
# Rank the predictors with recursive feature elimination wrapped around a
# seeded random forest, then display them ordered by their RFE ranking.
clf = RandomForestClassifier(n_estimators=100, max_depth=5, criterion="entropy", random_state=0)
selector = RFE(clf).fit(X, y)

selected_features = pd.DataFrame({
    'Feature': list(X.columns),
    'Ranking': selector.ranking_,
})
selected_features.sort_values(by='Ranking')

# Random Forest classification of all cancer sites

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [None]:
# Root of the analysis folder on S3.
s3_path = 's3://ukb-colorectal-cancer/analysis/'

# The header of the label table supplies the cancer-site names; each name is
# later used to build that site's '<name>_dataset.csv' path.
df = pd.read_csv(f'{s3_path}labels/all_cancer_labels.csv')
ca_sites = list(df.columns)

In [None]:
# Column names used for the per-site analysis; the final entry ('label') is the
# target, everything before it is treated as a feature.
# NOTE(review): presumably mirrors the column order of the per-site dataset
# files, since results are stored by column position - confirm.
biomarkers = """
    sex age ethnicity townsend bmi pulse dbp sbp
    wbc rbc hgb hct plt lym ualb cr_urine potas
    sodium apoa apob urea chol crp cysc hdl igf1
    ldl shbg tst tprotein tgly vitd age_at_diagnosis
    n_cancer_dx n_cancer_occs age_at_death tumour_hist
    tumour_beh baso eos mchc mcv mono np ret pdw
    pct alt alb alp ast calc cr_blood dbi ggt
    glu hgba1c e2 phos rf tbil phbv cit gln
    gly his ile bla leu mufa phe sph tyr val
    dha_tfa grip_l grip_r trunk_fat bmr adj_ts_ratio
    ts_ratio_reg unadj_ts_ratio z_ts_ratio label
""".split()

In [None]:
# Result grids: one row per feature (every biomarker except the trailing
# 'label'), one column per cancer site.
# They start as NaN rather than np.empty's uninitialised memory, so any cell
# that is never written stays recognisably "missing" and is skipped by the
# nan-aware reductions used on these grids.
grid_shape = (len(biomarkers[:-1]), len(ca_sites))
count_arr = np.full(grid_shape, np.nan, dtype=float)
acc_arr = np.full(grid_shape, np.nan, dtype=float)

In [None]:
# Single-feature random-forest screen, repeated for every cancer site.
# For site j and dataset column i (all columns except the last, the 'label'
# target): count_arr[i, j] holds the number of positive cases that have a value
# for the column, acc_arr[i, j] the mean cross-validated accuracy on a
# class-balanced subset (NaN when there are too few samples).
for j, ca in enumerate(ca_sites):
    print('Working on ', ca)
    df = pd.read_csv(s3_path+ca+'_dataset.csv')

    # Reset this site's column so rows that are never written cannot carry
    # stale or uninitialised values into the summary below.
    count_arr[:, j] = np.nan
    acc_arr[:, j] = np.nan

    for i, col in enumerate(df.columns[:-1]):
        df2 = df[[col, 'label']].dropna()
        p_idx = df2['label'] == True
        n_pos = int(p_idx.sum())
        count_arr[i, j] = n_pos
        if n_pos == 0:
            # Nothing to balance against; the accuracy stays NaN.
            continue

        # Create a balanced set: draw n_pos rows from each label group,
        # re-seeding per group. Explicit per-group sampling replaces
        # groupby(...).apply(...), which is deprecated for frames containing
        # the grouping column and no longer passes it on newer pandas.
        df2 = pd.concat([
            grp.sample(n=n_pos, replace=False, random_state=0)
            for _, grp in df2.groupby('label')
        ])
        y = df2.pop('label')
        X = np.array(df2).reshape(-1, 1)  # single feature -> column vector

        # Only cross-validate when the balanced set has more than 20 rows.
        if len(X) > 20:
            clf = RandomForestClassifier(n_estimators=100, max_depth=5, criterion="entropy", random_state=0)
            splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)
            acc_arr[i, j] = np.round(cross_val_score(clf, X, y, cv=splitter, scoring='accuracy').mean(), 2)

    # nanargmax raises on an all-NaN column, so report that case separately.
    if np.isnan(acc_arr[:, j]).all():
        print('No biomarker had enough samples for ', ca)
    else:
        print('Max accuracy is,', np.nanmax(acc_arr[:,j]), ' for ', biomarkers[np.nanargmax(acc_arr[:,j])])

In [None]:
# Render every row when the table is displayed.
pd.set_option('display.max_rows', None)

# Accuracy grid as a table: one row per feature, one column per cancer site.
clf_df = pd.DataFrame({'biomarker': biomarkers[:-1]})
for site_pos, site in enumerate(ca_sites):
    clf_df[site] = acc_arr[:, site_pos]

# Persist the table next to the other analysis outputs, then echo it.
clf_df.to_csv(s3_path+'tables/random_forests_by_cancer_site_accuracy.csv', index=False)
clf_df

In [None]:
import os

# Display names for the panels, matched to ca_sites by position.
ca_names = ['CRC', 'breast','prostate', 'lung', 'bladder', 'kidney', 'brain', 'lymphoma', 'thyroid', 'stomach', 'liver', 'pancreas', 'cervix']

# Features are everything in `biomarkers` except the trailing 'label' target,
# plotted at x = 1..n_features.
feature_names = biomarkers[:-1]
positions = range(1, len(biomarkers))

# One stacked panel per cancer site (sized from ca_sites instead of a fixed 13).
fig, axes = plt.subplots(len(ca_sites), 1, figsize=(20,15), squeeze=False)
axes = axes[:, 0]

for i, ax in enumerate(axes):
    ax.plot(positions, acc_arr[:,i], color='navy')
    ax.set_ylim((0.5,0.75))
    ax.set_xticks(positions)
    # Fall back to the raw site name if no display name was provided for it.
    panel_name = ca_names[i] if i < len(ca_names) else ca_sites[i]
    ax.set_ylabel(panel_name, rotation=0, labelpad=35)
    ax.fill_between(positions, acc_arr[:,i], color='blue', alpha=0.2)
    if i == 0:
        ax.set_title('Random forest classification accuracy - single measures')

# Only the bottom panel gets named ticks. The labels must match the tick
# positions one-to-one, so 'label' is excluded (passing the full list gives
# one label too many, which matplotlib rejects).
axes[-1].set_xticks(positions)
axes[-1].set_xticklabels(feature_names, rotation=80)

# savefig does not create missing directories; make sure the target exists.
os.makedirs('./figures', exist_ok=True)
plt.savefig('./figures/random_forests_by_cancer_site_accuracy.jpg', dpi=150)
plt.show()


In [None]:
# Render every row when the table is displayed.
pd.set_option('display.max_rows', None)

# Positive-case counts as a table: one row per feature, one column per cancer site.
clf_df = pd.DataFrame({'biomarker': biomarkers[:-1]})
for site_pos, site in enumerate(ca_sites):
    clf_df[site] = count_arr[:, site_pos]

# Persist the table next to the other analysis outputs, then echo it.
clf_df.to_csv(s3_path+'tables/random_forests_by_cancer_site_count.csv', index=False)
clf_df

# XGBoost

In [None]:
# Notebook-only install step for the xgboost package imported below. This line is
# not Python syntax; it presumably relies on IPython's automagic `pip`
# (explicit form: `%pip install xgboost`) - TODO confirm.
pip install xgboost

In [None]:
# Notebook-only install step for the hyperopt package imported below. This line is
# not Python syntax; it presumably relies on IPython's automagic `pip`
# (explicit form: `%pip install hyperopt`) - TODO confirm.
pip install hyperopt

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [None]:
# Root of the analysis folder on S3.
s3_path = 's3://ukb-colorectal-cancer/analysis/'

# The header of the label table supplies the cancer-site names; each name is
# later used to build that site's '<name>_dataset.csv' path.
df = pd.read_csv(f'{s3_path}labels/all_cancer_labels.csv')
ca_sites = list(df.columns)

In [None]:
# Columns selected from each per-site dataset for the XGBoost models; the final
# entry ('label') is the target, everything before it is used as a feature.
biomarkers = """
    sex age ethnicity townsend bmi pulse dbp sbp
    wbc rbc hgb hct plt lym ualb cr_urine potas
    sodium apoa apob urea chol crp cysc hdl igf1
    ldl shbg tst tprotein tgly vitd age_at_diagnosis
    n_cancer_dx n_cancer_occs age_at_death tumour_hist
    tumour_beh baso eos mchc mcv mono np ret pdw
    pct alt alb alp ast calc cr_blood dbi ggt
    glu hgba1c e2 phos rf tbil phbv cit gln
    gly his ile bla leu mufa phe sph tyr val
    dha_tfa grip_l grip_r trunk_fat bmr adj_ts_ratio
    ts_ratio_reg unadj_ts_ratio z_ts_ratio label
""".split()

In [None]:
# Run xgboost hyperparameter optimisation
#
# Builds a class-balanced train/test split from the first cancer site's dataset
# and defines the hyperopt search space used by the objective function.
# NOTE(review): `biomarkers` includes columns such as 'age_at_diagnosis',
# 'age_at_death' and 'tumour_hist'; if those are only populated for cancer
# cases they leak the label into the features - confirm before trusting scores.

df = pd.read_csv(s3_path+ca_sites[0]+'_dataset.csv')
df2 = df[biomarkers].copy(deep=True)
df2 = df2[df2['label'].notnull()]  # rows without a label cannot be used
p_idx = df2['label'] == True
n_pos = int(p_idx.sum())
if n_pos == 0:
    raise ValueError('No positive cases in the ' + str(ca_sites[0]) + ' dataset; cannot build a balanced set')

# Create a balanced set: draw n_pos rows from each label group, re-seeding per
# group. Explicit per-group sampling replaces groupby(...).apply(...), which is
# deprecated for frames that contain the grouping column and, on newer pandas,
# no longer hands that column to the applied function (breaking the pop below).
df2 = pd.concat([
    grp.sample(n=n_pos, replace=False, random_state=0)
    for _, grp in df2.groupby('label')
])

y = df2.pop('label')
X = df2

# Seed the split so the search is reproducible, like every other random step in this file.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Search space: hp.* entries are sampled by hyperopt; 'n_estimators' and 'seed'
# are constants handed to the objective unchanged.
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0}

def objective(space):
    """Hyperopt objective: fit one XGBoost classifier and score it on the hold-out split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    space : dict
        One sample of the search space. Must provide 'n_estimators',
        'max_depth', 'gamma', 'reg_alpha', 'min_child_weight' and
        'colsample_bytree'; 'reg_lambda' and 'seed' are used when present.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}`` - hyperopt minimises the
        loss, so accuracy is negated.
    """
    params = dict(
        n_estimators=space['n_estimators'],
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        min_child_weight=int(space['min_child_weight']),
        # Keep the sampled fraction as a float: truncating it with int() turned
        # every value below 1 into 0, which is outside the valid (0, 1] range.
        colsample_bytree=float(space['colsample_bytree']),
        # Early stopping is configured on the estimator; recent XGBoost
        # releases no longer accept these as fit() arguments.
        eval_metric="auc",
        early_stopping_rounds=10,
    )
    # Forward the remaining search-space entries so that what is sampled is
    # actually what gets evaluated.
    if 'reg_lambda' in space:
        params['reg_lambda'] = space['reg_lambda']
    if 'seed' in space:
        params['random_state'] = int(space['seed'])

    clf = xgb.XGBClassifier(**params)
    evaluation = [(X_train, y_train), (X_test, y_test)]
    clf.fit(X_train, y_train, eval_set=evaluation, verbose=False)

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred > 0.5)
    return {'loss': -accuracy, 'status': STATUS_OK}

# Run 100 TPE-guided evaluations of the objective; `trials` keeps the history
# of every evaluation.
trials = Trials()
best_hyperparams = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
# Run xgboost on CRC and check feature importance
#
# Re-builds the class-balanced set for the first cancer site, fits XGBoost with
# the tuned hyperparameters (requires `best_hyperparams` from the search cell),
# reports hold-out and 5-fold CV accuracy, and tabulates feature importances.

df = pd.read_csv(s3_path+ca_sites[0]+'_dataset.csv')
df2 = df[biomarkers].copy(deep=True)
df2 = df2[df2['label'].notnull()]  # rows without a label cannot be used
p_idx = df2['label'] == True
n_pos = int(p_idx.sum())
if n_pos == 0:
    raise ValueError('No positive cases in the ' + str(ca_sites[0]) + ' dataset; cannot build a balanced set')

# Create a balanced set: draw n_pos rows from each label group, re-seeding per
# group. Explicit per-group sampling replaces groupby(...).apply(...), which is
# deprecated for frames that contain the grouping column and, on newer pandas,
# no longer hands that column to the applied function (breaking the pop below).
df2 = pd.concat([
    grp.sample(n=n_pos, replace=False, random_state=0)
    for _, grp in df2.groupby('label')
])

y = df2.pop('label')
X = df2

# Override the tree count and cast max_depth to int before it is passed to XGBoost.
best_hyperparams['n_estimators'] = 100
best_hyperparams['max_depth'] = int(best_hyperparams['max_depth'])

# Seeded so the reported accuracy is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
xgb_clf = xgb.XGBClassifier(**best_hyperparams)
xgb_clf.fit(X_train, y_train)
preds = xgb_clf.predict(X_test)
acc_xgb = (preds == y_test).sum().astype(float) / len(preds)*100
print("XGBoost's prediction accuracy is: %3.2f" % (acc_xgb))

# Cross-validated accuracy on the training portion only.
xgb_cv = xgb.XGBClassifier(**best_hyperparams)
scores = cross_val_score(xgb_cv, X_train, y_train, cv=5, scoring = "accuracy")
print("XGBoost's CV prediction accuracy is: %3.2f" % (scores.mean()*100))

# Feature importances of the hold-out model, largest first (df2 now holds only
# the feature columns because the label was popped above).
importance_df = pd.DataFrame({
    'Feature': df2.columns,
    'Importance': xgb_clf.feature_importances_
})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df = importance_df.set_index('Feature')

importance_df

In [None]:
# Run xgboost on balanced datasets for each cancer site
#
# For every site: load its dataset, balance the classes, and record the mean
# cross-validated precision / recall / accuracy / F1 (rounded to 2 decimals) of
# a default XGBoost classifier. Each list gets exactly one entry per site, in
# ca_sites order.
# NOTE(review): `biomarkers` includes columns such as 'age_at_diagnosis',
# 'age_at_death' and 'tumour_hist'; if those are only populated for cancer
# cases they leak the label into the features - confirm before trusting scores.

from sklearn.model_selection import cross_validate

precision = []; recall = []
accuracy = []; f1 = []

cv_metrics = ['precision', 'recall', 'accuracy', 'f1']

for ca in ca_sites:
    print('Working on ', ca)

    df = pd.read_csv(s3_path+ca+'_dataset.csv')
    df2 = df[biomarkers].copy(deep=True)
    df2 = df2[df2['label'].notnull()]  # rows without a label cannot be used
    p_idx = df2['label'] == True
    n_pos = int(p_idx.sum())
    if n_pos == 0:
        # Keep the result lists aligned with ca_sites even when a site cannot be scored.
        print('No positive cases for ', ca)
        precision.append(np.nan); recall.append(np.nan)
        accuracy.append(np.nan); f1.append(np.nan)
        continue

    # Create a balanced set: draw n_pos rows from each label group, re-seeding
    # per group. Explicit per-group sampling replaces groupby(...).apply(...),
    # which is deprecated for frames that contain the grouping column and, on
    # newer pandas, no longer hands that column to the applied function.
    df2 = pd.concat([
        grp.sample(n=n_pos, replace=False, random_state=0)
        for _, grp in df2.groupby('label')
    ])

    y = df2.pop('label')
    X = df2
    model = xgb.XGBClassifier()
    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)

    # One cross-validation pass scores all four metrics on the same fitted
    # folds instead of refitting every fold once per metric.
    cv_scores = cross_validate(model, X, y, cv=splitter, scoring=cv_metrics)
    precision.append(np.round(cv_scores['test_precision'].mean(), 2))
    recall.append(np.round(cv_scores['test_recall'].mean(), 2))
    k = np.round(cv_scores['test_accuracy'].mean(), 2)
    accuracy.append(k)
    print('Accuracy: ', k)
    f1.append(np.round(cv_scores['test_f1'].mean(), 2))

In [None]:
# Render every row when the table is displayed.
pd.set_option('display.max_rows', None)

# One row per cancer site, ordered from highest to lowest accuracy.
clf_df = (
    pd.DataFrame()
    .assign(cancer=ca_sites, precision=precision, recall=recall, accuracy=accuracy, f1=f1)
    .sort_values(by='accuracy', ascending=False, ignore_index=True)
)

# Persist the table next to the other analysis outputs, then echo it.
clf_df.to_csv(s3_path+'tables/xgboost_by_cancer_site_results.csv', index=False)
clf_df