# Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from collections import Counter
from matplotlib_venn import venn2, venn3
from scipy import stats
UKBB_PATH = '../glaucoma_project/UKBB_Data/'
UKBB_GENOME_PATH = '../glaucoma_project/UKBB_Data_Basket2/raw_genome/merged_raw_genome/'
UKBB_IMPUTED_GENOME_PATH = '../glaucoma_project/UKBB_Data_Basket2/imputed_genome/merged_imputed_genome/'
GWAS_PATH = '../glaucoma_project/GWAS/'
from sklearn.model_selection import train_test_split
from tableone import TableOne
import statsmodels.api as sm
import re
import seaborn as sns
import gzip
import time
import venn
tqdm.pandas()
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import shap
from joblib import dump, load

UKBB_PATH = '../glaucoma_project/UKBB_Data/'
HTML_FILE_NAME = 'ukb49508.html'
TOTAL_POP = 502419
TEST_SET_RANDOM_SPLIT = 42
CHUNK_SIZE = 10000

In [2]:
df_genes = pd.read_pickle(UKBB_PATH+'processed_data/extracted_genes.pkl')

In [3]:
train_idx, test_idx = train_test_split(range(TOTAL_POP),random_state=TEST_SET_RANDOM_SPLIT, test_size=0.1)

In [4]:
def get_dataframe_for_analysis(column_names, idx=train_idx):
    print('Preparing dataframe...')
    df_selected_cols = pd.DataFrame()
    for chunk in tqdm(pd.read_csv(UKBB_PATH+'ukb49508.tab', sep='\t', chunksize=CHUNK_SIZE, usecols=column_names), total=TOTAL_POP // CHUNK_SIZE):
        df_selected_cols = pd.concat([df_selected_cols, chunk])
    return df_selected_cols.iloc[idx].reset_index(drop=True)

# Processing and imputation

In [5]:
id_fields = get_dataframe_for_analysis(['f.eid'])

Preparing dataframe...


51it [05:48,  6.83s/it]                                                                                                       


In [6]:
id_fields

Unnamed: 0,f.eid
0,3869176
1,2586731
2,3806758
3,3200572
4,2560773
...,...
452172,3591797
452173,4658390
452174,2319334
452175,2468682


In [7]:
df_genes['Participant'] = df_genes['Participant'].apply(lambda x: x.split('_')[0])

In [8]:
df_genes['f.eid'] = df_genes['Participant'].astype(int)

In [9]:
df_selected_genes = pd.merge(id_fields, df_genes, on='f.eid', how='left')

In [10]:
from sklearn.impute import SimpleImputer

In [11]:
smp_imputer = SimpleImputer(strategy='most_frequent',verbose=2)

In [12]:
t0 = time.time()
df_selected_genes_imputed = pd.DataFrame(smp_imputer.fit_transform(df_selected_genes), columns=df_selected_genes.columns)
print(time.time() - t0)

101.91621923446655


In [14]:
dump(smp_imputer, '../glaucoma_project/Models/smp_imputer_genetic.joblib')

['../glaucoma_project/Models/smp_imputer_genetic.joblib']

In [52]:
df_gene_data = pd.read_pickle(UKBB_PATH+'processed_data/gene_data.pkl')

In [53]:
df_gene_data = df_gene_data[~((df_gene_data['ID in UKBB'] == 'Absent') | (df_gene_data['ID in UKBB'] == 'Wrong SNP found at right position'))].reset_index(drop=True)

In [54]:
df_glaucoma_data = pd.read_pickle(UKBB_PATH+'processed_data/Clinical Feature Selection/glaucoma_data_checkpoint.pkl')

In [55]:
df_selected_genes['Total Glaucoma'] = df_glaucoma_data['Total Glaucoma']

In [59]:
df_selected_genes_imputed = df_selected_genes_imputed[~df_selected_genes['Participant'].isna()].reset_index(drop=True)
df_selected_genes = df_selected_genes[~df_selected_genes['Participant'].isna()].reset_index(drop=True)

In [62]:
df_selected_genes.to_pickle(UKBB_PATH+'processed_data/unimputed_genes.pkl')
df_selected_genes_imputed.to_pickle(UKBB_PATH+'processed_data/imputed_genes.pkl')

# Initial model

In [2]:
df_selected_genes_imputed = pd.read_pickle(UKBB_PATH+'processed_data/imputed_genes.pkl')

In [3]:
df_selected_genes = pd.read_pickle(UKBB_PATH+'processed_data/unimputed_genes.pkl')

In [4]:
glaucoma_diagnosis = df_selected_genes['Total Glaucoma'].astype(int)

In [5]:
glaucoma_diagnosis.to_pickle(UKBB_PATH+'processed_data/glaucoma_diagnosis_for_genetic_ml.pkl')

In [2]:
glaucoma_diagnosis = pd.read_pickle(UKBB_PATH+'processed_data/glaucoma_diagnosis_for_genetic_ml.pkl')

In [4]:
glaucoma_diagnosis.value_counts()

0    426407
1     12098
Name: Total Glaucoma, dtype: int64

In [8]:
int_train_idx, int_val_idx = train_test_split(range(len(df_selected_genes_imputed)),random_state=TEST_SET_RANDOM_SPLIT, test_size=0.1)

In [9]:
X_train = df_selected_genes_imputed.iloc[int_train_idx].copy().reset_index(drop=True)
X_val = df_selected_genes_imputed.iloc[int_val_idx].copy().reset_index(drop=True)

In [12]:
X_train.to_pickle(UKBB_PATH+'processed_data/X_train_initial_genetic_only.pkl')
X_val.to_pickle(UKBB_PATH+'processed_data/X_val_initial_genetic_only.pkl')

In [10]:
X_train = X_train[X_train.columns[2:]]
X_val = X_val[X_val.columns[2:]]

In [11]:
y_train = glaucoma_diagnosis.iloc[int_train_idx].copy().reset_index(drop=True)
y_val = glaucoma_diagnosis.iloc[int_val_idx].copy().reset_index(drop=True)

In [13]:
y_train.to_pickle(UKBB_PATH+'processed_data/y_train_initial_genetic_only.pkl')
y_val.to_pickle(UKBB_PATH+'processed_data/y_val_initial_genetic_only.pkl')

# Model selection

In [2]:
X_train = pd.read_pickle(UKBB_PATH+'processed_data/X_train_initial_genetic_only.pkl')
X_val = pd.read_pickle(UKBB_PATH+'processed_data/X_val_initial_genetic_only.pkl')
y_train = pd.read_pickle(UKBB_PATH+'processed_data/y_train_initial_genetic_only.pkl')
y_val = pd.read_pickle(UKBB_PATH+'processed_data/y_val_initial_genetic_only.pkl')

In [3]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, accuracy_score, confusion_matrix

In [4]:
def specificity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn+fp)
    return specificity

In [5]:
def sensitivity(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp+fn)
    return sensitivity

In [6]:
def cutoff_youdens_j(fpr,tpr,thresholds):
    j_scores = tpr-fpr
    j_ordered = sorted(zip(j_scores,thresholds))
    return j_ordered[-1][1]

In [7]:
def evaluate_model(y_true, y_pred_proba):
    if len(y_pred_proba.shape) == 2:
        probs = y_pred_proba[:,1]
    else:
        probs = y_pred_proba
    auc = roc_auc_score(y_true,probs)
    print('AUC',auc)
    fpr, tpr, thresholds = roc_curve(y_true, probs)
    yj_threshold = cutoff_youdens_j(fpr, tpr, thresholds)
    print('Youden\'s J threshold',yj_threshold)
    acc = accuracy_score(y_true,(probs > yj_threshold).astype(int))
    print('Accuracy',acc)
    sens = sensitivity(y_true,(probs > yj_threshold).astype(int))
    print('Sensitivity',sens)
    spec = specificity(y_true,(probs > yj_threshold).astype(int))
    print('Specificity',spec)
    return auc,yj_threshold,acc,sens,spec

# Optimising hyperparameters

In [8]:
models = [LogisticRegression(class_weight='balanced',n_jobs=-1, max_iter=300),
SGDClassifier(class_weight='balanced',n_jobs=-1, random_state=42,loss='modified_huber'),
RandomForestClassifier(class_weight='balanced',max_depth=10, random_state=42, n_jobs=-1),
AdaBoostClassifier(DecisionTreeClassifier(class_weight='balanced',max_depth=10,random_state=42)),
GradientBoostingClassifier(random_state=42),
BaggingClassifier(DecisionTreeClassifier(class_weight='balanced',max_depth=10,random_state=42),n_jobs=1,random_state=42),
LinearSVC(class_weight='balanced',random_state=42),
MLPClassifier(random_state=42),
KNeighborsClassifier(n_jobs=-1),
LinearDiscriminantAnalysis(),
QuadraticDiscriminantAnalysis()]

In [9]:
model_names = ['LogisticRegression',
               'SGDClassifier',
               'RandomForestClassifier',
               'AdaBoostClassifier',
               'GradientBoostingClassifier',
               'BaggingClassifier',
               'LinearSVC',
               'MLPClassifier',
               'KNeighborsClassifier',
               'LinearDiscriminantAnalysis',
               'QuadraticDiscriminantAnalysis'
]

In [29]:
for model, model_name in zip(models, model_names):
    print('Training',model_name)
    t0 = time.time()
    model.fit(X_train,y_train)
    print('Training took',time.time()-t0)
    dump(model,'../glaucoma_project/Models/'+model_name+'_genetic_initial_model.joblib')

Training SGDClassifier
Training took 401.5163297653198
Training RandomForestClassifier
Training took 40.92296648025513
Training AdaBoostClassifier
Training took 1973.4057693481445
Training GradientBoostingClassifier
Training took 1704.020925283432
Training BaggingClassifier
Training took 221.86514902114868
Training LinearSVC


Liblinear failed to converge, increase the number of iterations.


Training took 989.9864153862
Training MLPClassifier
Training took 3248.861126422882
Training KNeighborsClassifier
Training took 17.420405387878418
Training LinearDiscriminantAnalysis
Training took 128.38816952705383
Training QuadraticDiscriminantAnalysis


Variables are collinear


Training took 87.32210516929626


In [10]:
models = []
for model_name in model_names:
    models.append(load('../glaucoma_project/Models/'+model_name+'_genetic_initial_model.joblib'))

In [6]:
param = {'max_depth':3, 'eta':1, 'objective':'binary:logistic' }
num_round = 20

In [7]:
dtrain = xgb.DMatrix(X_train.astype(int), label=y_train)
dval = xgb.DMatrix(X_val.astype(int), label=y_val)

In [8]:
bst = xgb.train(param, dtrain, num_round)

In [9]:
y_pred = bst.predict(dtrain)

In [17]:
xgb_train_stats = evaluate_model(y_train, y_pred)

AUC 0.6541007723298712
Youden's J threshold 0.029095747
Accuracy 0.663393757569922
Sensitivity 0.555952380952381
Specificity 0.6664512396608067


In [18]:
y_pred = bst.predict(dval)

In [19]:
xgb_val_stats = evaluate_model(y_val, y_pred)

AUC 0.5874733736401155
Youden's J threshold 0.03032571
Accuracy 0.684385760872044
Sensitivity 0.4405772495755518
Specificity 0.6911161624446371


## Validate models

In [None]:
train_stats = {}
val_stats = {}
for model, model_name in zip(models, model_names):
    print(model_name)
    if model_name != 'KNeighborsClassifier':
        print('Training:')
        try:
            y_train_pred_proba = model.predict_proba(X_train)
        except:
            y_train_pred_proba = model._predict_proba_lr(X_train)
        train_stats[model_name] = evaluate_model(y_train,y_train_pred_proba)
    print('Validation:')
    try:
        y_val_pred_proba = model.predict_proba(X_val)
    except:
        y_val_pred_proba = model._predict_proba_lr(X_val)
    val_stats[model_name] = evaluate_model(y_val,y_val_pred_proba)
    print('==========')

LogisticRegression
Training:
AUC 0.6929987853021127
Youden's J threshold 0.4974438124952202
Accuracy 0.6361192335564824
Sensitivity 0.6437728937728938
Specificity 0.6359014317209317
Validation:
AUC 0.6626246295067273
Youden's J threshold 0.4839183484254699
Accuracy 0.6078994777770176
Sensitivity 0.6392190152801358
Specificity 0.6070348932580321
SGDClassifier
Training:
AUC 0.5968509968395957
Youden's J threshold 0.9997619960218707
Accuracy 0.4472880041758097
Sensitivity 0.737912087912088
Specificity 0.4390176528532786
Validation:
AUC 0.5793063983194027
Youden's J threshold 0.9917219247738345
Accuracy 0.4449613463775056
Sensitivity 0.7088285229202037
Specificity 0.43767721978768775
RandomForestClassifier
Training:
AUC 0.9164380645878571
Youden's J threshold 0.4696678037030685
Accuracy 0.8464376390458478
Sensitivity 0.8258241758241758
Specificity 0.847024240750103
Validation:
AUC 0.5930216467894575
Youden's J threshold 0.4354877182956549
Accuracy 0.5583909146883765
Sensitivity 0.581494057

In [8]:
model = load('../glaucoma_project/Models/QuadraticDiscriminantAnalysis_genetic_initial_model.joblib')

In [10]:
y_train_pred_proba = model.predict_proba(X_train)

In [12]:
evaluate_model(y_train, y_train_pred_proba)

AUC 0.5462131551616065
Youden's J threshold 1.0
Accuracy 0.9723301930298439
Sensitivity 0.0
Specificity 1.0


(0.5462131551616065, 1.0, 0.9723301930298439, 0.0, 1.0)

In [13]:
y_val_pred_proba = model.predict_proba(X_val)

In [14]:
evaluate_model(y_val, y_val_pred_proba)

AUC 0.4751080163172404
Youden's J threshold 2.0
Accuracy 0.9731363024788489
Sensitivity 0.0
Specificity 1.0


(0.4751080163172404, 2.0, 0.9731363024788489, 0.0, 1.0)

In [125]:
df_model_stats = pd.DataFrame()

In [126]:
df_model_stats['Model Names'] = model_names

In [127]:
df_model_stats['Train AUC'] = df_model_stats['Model Names'].apply(lambda x: train_stats[x][0] if x in train_stats.keys() else np.nan)
df_model_stats['Train Threshold'] = df_model_stats['Model Names'].apply(lambda x: train_stats[x][1] if x in train_stats.keys() else np.nan)
df_model_stats['Train Accuracy'] = df_model_stats['Model Names'].apply(lambda x: train_stats[x][2] if x in train_stats.keys() else np.nan)
df_model_stats['Train Sensitivity'] = df_model_stats['Model Names'].apply(lambda x: train_stats[x][3] if x in train_stats.keys() else np.nan)
df_model_stats['Train Specificity'] = df_model_stats['Model Names'].apply(lambda x: train_stats[x][4] if x in train_stats.keys() else np.nan)

In [128]:
df_model_stats['Val AUC'] = df_model_stats['Model Names'].apply(lambda x: val_stats[x][0] if x in val_stats.keys() else np.nan)
df_model_stats['Val Threshold'] = df_model_stats['Model Names'].apply(lambda x: val_stats[x][1] if x in val_stats.keys() else np.nan)
df_model_stats['Val Accuracy'] = df_model_stats['Model Names'].apply(lambda x: val_stats[x][2] if x in val_stats.keys() else np.nan)
df_model_stats['Val Sensitivity'] = df_model_stats['Model Names'].apply(lambda x: val_stats[x][3] if x in val_stats.keys() else np.nan)
df_model_stats['Val Specificity'] = df_model_stats['Model Names'].apply(lambda x: val_stats[x][4] if x in val_stats.keys() else np.nan)

In [133]:
df_model_stats = df_model_stats.append({stat_name: stat for stat_name, stat in zip(df_model_stats.columns, ('XGBoost',) + xgb_train_stats + xgb_val_stats)}, ignore_index=True)

In [135]:
df_model_stats.sort_values('Val AUC', ascending=False).to_csv('../glaucoma_project/UKBB_Data/processed_data/initial_model_selection_genetic_data_only.csv', index=False)

## Retry models that overfit

In [8]:
df_model_stats = pd.read_csv('../glaucoma_project/UKBB_Data/processed_data/initial_model_selection_genetic_data_only.csv')

In [9]:
df_model_stats['Train AUC - Val AUC'] = df_model_stats['Train AUC'] - df_model_stats['Val AUC']

In [10]:
df_model_stats[df_model_stats['Train AUC - Val AUC'] > 0.05]

Unnamed: 0,Model Names,Train AUC,Train Threshold,Train Accuracy,Train Sensitivity,Train Specificity,Val AUC,Val Threshold,Val Accuracy,Val Sensitivity,Val Specificity,Train AUC - Val AUC
2,MLPClassifier,0.710363,0.033121,0.644577,0.657143,0.64422,0.652525,0.031751,0.618276,0.617997,0.618283,0.057838
4,RandomForestClassifier,0.916438,0.469668,0.846438,0.825824,0.847024,0.593022,0.435488,0.558391,0.581494,0.557753,0.323416
5,XGBoost,0.654101,0.029096,0.663394,0.555952,0.666451,0.587473,0.030326,0.684386,0.440577,0.691116,0.066627
8,BaggingClassifier,0.867731,0.456208,0.773447,0.800275,0.772684,0.534819,0.396948,0.516271,0.54584,0.515455,0.332911
10,AdaBoostClassifier,1.0,0.517762,0.999995,0.999908,0.999997,0.475547,0.463976,0.942578,0.039049,0.96752,0.524453
11,QuadraticDiscriminantAnalysis,0.546213,1.0,0.97233,0.0,1.0,0.475108,2.0,0.973136,0.0,1.0,0.071105


In [11]:
models = [ RandomForestClassifier(class_weight='balanced',max_depth=5, random_state=42, n_jobs=-1),
AdaBoostClassifier(DecisionTreeClassifier(class_weight='balanced',max_depth=1,random_state=42)),
BaggingClassifier(DecisionTreeClassifier(class_weight='balanced',max_depth=5,random_state=42),n_jobs=-1,random_state=42),
MLPClassifier(random_state=42,alpha=0.1),
         QuadraticDiscriminantAnalysis(reg_param=1)]

In [12]:
model_names = ['RandomForestClassifier',
               'AdaBoostClassifier',
               'BaggingClassifier',
               'MLPClassifier',
               'QuadraticDiscriminantAnalysis'
]

In [13]:
for model, model_name in zip(models, model_names):
    print('Training',model_name)
    t0 = time.time()
    model.fit(X_train,y_train)
    print('Training took',time.time()-t0)
    dump(model,'../glaucoma_project/Models/'+model_name+'_genetic_model_v2.joblib')

Training RandomForestClassifier
Training took 30.020475149154663
Training AdaBoostClassifier
Training took 2263.958827018738
Training BaggingClassifier


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [11]:
models = [ 
BaggingClassifier(DecisionTreeClassifier(class_weight='balanced',max_depth=5,random_state=42),n_jobs=1,random_state=42),
MLPClassifier(random_state=42,alpha=0.1),
         QuadraticDiscriminantAnalysis(reg_param=1)
]

In [12]:
model_names = [
               'BaggingClassifier',
               'MLPClassifier',
               'QuadraticDiscriminantAnalysis'
]

In [13]:
for model, model_name in zip(models, model_names):
    print('Training',model_name)
    t0 = time.time()
    model.fit(X_train,y_train)
    print('Training took',time.time()-t0)
    dump(model,'../glaucoma_project/Models/'+model_name+'_genetic_model_v2.joblib')

Training BaggingClassifier
Training took 439.0493040084839
Training MLPClassifier
Training took 822.6835014820099
Training QuadraticDiscriminantAnalysis
Training took 88.72513175010681


Variables are collinear


In [14]:
model_names = ['RandomForestClassifier',
               'AdaBoostClassifier',
               'BaggingClassifier',
               'MLPClassifier',
               'QuadraticDiscriminantAnalysis'
]

In [15]:
models = [load('../glaucoma_project/Models/'+model_name+'_genetic_model_v2.joblib') for model_name in model_names]

In [17]:
train_stats = {}
val_stats = {}
for model, model_name in zip(models, model_names):
    print(model_name)
    if model_name != 'KNeighborsClassifier':
        print('Training:')
        try:
            y_train_pred_proba = model.predict_proba(X_train)
        except:
            y_train_pred_proba = model._predict_proba_lr(X_train)
        train_stats[model_name] = evaluate_model(y_train,y_train_pred_proba)
    print('Validation:')
    try:
        y_val_pred_proba = model.predict_proba(X_val)
    except:
        y_val_pred_proba = model._predict_proba_lr(X_val)
    val_stats[model_name] = evaluate_model(y_val,y_val_pred_proba)
    print('==========')

RandomForestClassifier
Training:
AUC 0.6133029716374234
Youden's J threshold 0.4914206318251086
Accuracy 0.5674312182316662
Sensitivity 0.593040293040293
Specificity 0.5667024553466724
Validation:
AUC 0.5880708616164534
Youden's J threshold 0.5091571483602827
Accuracy 0.7192310323595813
Sensitivity 0.399830220713073
Specificity 0.7280481803482296
AdaBoostClassifier
Training:
AUC 0.6281103647356376
Youden's J threshold 0.4999106818429564
Accuracy 0.586050059039057
Sensitivity 0.5991758241758242
Specificity 0.5856765363507013
Validation:
AUC 0.6158494074872772
Youden's J threshold 0.5005670383999075
Accuracy 0.682880664066954
Sensitivity 0.4898132427843803
Specificity 0.6882103437770956
BaggingClassifier
Training:
AUC 0.6137078714105052
Youden's J threshold 0.48504598288670564
Accuracy 0.5673197281669513
Sensitivity 0.5967032967032967
Specificity 0.5664835537116857
Validation:
AUC 0.5702928142656456
Youden's J threshold 0.47921021717669954
Accuracy 0.5228387037923878
Sensitivity 0.588285

In [20]:
df_model_stats = pd.DataFrame()
df_model_stats['Model Names'] = model_names
df_model_stats['Train AUC'] = df_model_stats['Model Names'].apply(lambda x: train_stats[x][0] if x in train_stats.keys() else np.nan)
df_model_stats['Train Threshold'] = df_model_stats['Model Names'].apply(lambda x: train_stats[x][1] if x in train_stats.keys() else np.nan)
df_model_stats['Train Accuracy'] = df_model_stats['Model Names'].apply(lambda x: train_stats[x][2] if x in train_stats.keys() else np.nan)
df_model_stats['Train Sensitivity'] = df_model_stats['Model Names'].apply(lambda x: train_stats[x][3] if x in train_stats.keys() else np.nan)
df_model_stats['Train Specificity'] = df_model_stats['Model Names'].apply(lambda x: train_stats[x][4] if x in train_stats.keys() else np.nan)
df_model_stats['Val AUC'] = df_model_stats['Model Names'].apply(lambda x: val_stats[x][0] if x in val_stats.keys() else np.nan)
df_model_stats['Val Threshold'] = df_model_stats['Model Names'].apply(lambda x: val_stats[x][1] if x in val_stats.keys() else np.nan)
df_model_stats['Val Accuracy'] = df_model_stats['Model Names'].apply(lambda x: val_stats[x][2] if x in val_stats.keys() else np.nan)
df_model_stats['Val Sensitivity'] = df_model_stats['Model Names'].apply(lambda x: val_stats[x][3] if x in val_stats.keys() else np.nan)
df_model_stats['Val Specificity'] = df_model_stats['Model Names'].apply(lambda x: val_stats[x][4] if x in val_stats.keys() else np.nan)

In [21]:
df_model_stats.sort_values('Val AUC', ascending=False).to_csv('../glaucoma_project/UKBB_Data/processed_data/initial_model_selection_genetic_data_only_v2.csv', index=False)

Best model was MLPClassifier

In [13]:
best_model = MLPClassifier(random_state=42,alpha=0.1)

In [17]:
X_train = pd.read_pickle(UKBB_PATH+'processed_data/imputed_genes.pkl')
X_train = X_train[X_train.columns[2:]]

In [9]:
y_train = pd.read_pickle(UKBB_PATH+'processed_data/glaucoma_diagnosis_for_genetic_ml.pkl')

In [19]:
X_train

Unnamed: 0,rs71628956,rs78389809,rs6704012,rs12024620,rs12021948,rs302714,rs172531,rs534975221,rs28991009,rs143038218,...,rs181331502,rs1047922,rs41506447,rs2107482,rs66819623,rs12558081,rs17146835,rs67499600,rs138274479,rs12013156
0,0,0,0,0,0,2,2,0,0,0,...,0,0,0,1,1,1,1,1,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,1,1,2,1,0,0,0,0,0
2,0,0,0,0,0,1,1,0,0,0,...,0,1,1,1,0,0,0,0,0,0
3,1,1,0,0,0,1,1,0,0,0,...,0,0,0,0,1,1,1,1,1,0
4,1,1,1,0,0,1,1,0,0,0,...,0,0,0,1,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438500,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
438501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,1,0,0
438502,0,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
438503,0,0,0,0,0,1,1,0,0,0,...,0,0,0,2,1,2,2,2,0,0


In [None]:
best_model.fit(X_train,y_train)