In [25]:
import pandas as pd
pd.options.display.max_columns=1000
pd.options.display.width=200
pd.options.display.min_rows=60
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, RandomizedSearchCV

from datetime import datetime
import xgboost as xgb
from xgboost import XGBClassifier

In [9]:
# first, define some functions we can use to run the models

def preprocess_data(df):
    '''
    function to remove non-numeric features and null values
    input: dataframe
    outputs: df & var_class=Series with class labels popped from df
    '''
    
    #select only columns with int or float data types
    df = df.select_dtypes(['number'])
    #drop any columns with null values
    df.dropna(axis=1,inplace=True)
    #remove the class series
    var_class = df.pop('CLASS')
    #create a Dmatrix (specific for xgboost)
    data_dmatrix = xgb.DMatrix(data=df, label=var_class)
    
    return df, var_class, data_dmatrix

def compare_dicts(a,b,ignore=['test_score', 'train_score', 'tn', 'fn', 'tp', 'fp',
                              'f1_score', 'precision', 'recall', 'feature_importances']):
    '''
    function to compare if current hyperparameters have already been run in a model
    inputs: a=hyperparameter entry, b=current hyperparameters, ignore=hyperparameters to ignore in comparison
    output: boolean, True if the current hyperparameters have been run already, and False if they have not
    '''
    
    a = dict(a)
    b = dict(b)
    for k in ignore:
        a.pop(k,None)
        b.pop(k,None)
        
    return tuple(a.items()) == tuple(b.items())

def make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts):
    '''
    function to compare current hyperparameters (hyper_dict) to existing hyperparam_table
    inputs: hyperparam_table, hyper_dict
    outputs: exists=True if hyper_dict has been run before and False if it hasn't & hyper_dict
    '''
    
    exists = any([compare_func(a, b=hyper_dict) for a in hyperparam_table])
    return exists, hyper_dict
    
def train_test_write(x_train,x_test,y_train,y_test, filename, scaled=False):
    '''
    function to write train and test sets to files
    inputs: train and test dfs & scaled=False if no scaling, True if scaling
    output: None
    '''
    
    if scaled:
        x_train.to_csv(filename[:-4]+'_scaledxtrain.csv')
        x_test.to_csv(filename[:-4]+'_scaledxtest.csv')
        y_train.to_csv(filename[:-4]+'_scaledytrain.csv',header=False)
        y_test.to_csv(filename[:-4]+'_scaledytest.csv',header=False)
    else:
        x_train.to_csv(filename[:-4]+'_xtrain.csv')
        x_test.to_csv(filename[:-4]+'_xtest.csv')
        y_train.to_csv(filename[:-4]+'_ytrain.csv',header=False)
        y_test.to_csv(filename[:-4]+'_ytest.csv',header=False)

def train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table):
    '''
    function to train model with given training/test sets and hyperparameters
    inputs: x_train, y_train, x_test, y_test, hyper_dict=dict of current hyperparameters to be run, hyperparam_table=table of hyperparameters already run
    outputs: clf=classifier trained & hyperparam_table updated
    '''
    
    hyperparam_table += [hyper_dict]
    clf = hyper_dict['model'](class_weight=hyper_dict['class_weight'], random_state=hyper_dict['random_state'])
    clf.fit(x_train, y_train)
    
    predictions_test = [round(x) for x in clf.predict(x_test)]
    predictions_train = [round(x) for x in clf.predict(x_train)]
    
    score = clf.score(x_test,y_test)
    hyperparam_table[-1]['test_score'] = score
    training_score = clf.score(x_train,y_train)
    hyperparam_table[-1]['train_score'] = training_score
    
    tn, fp, fn, tp = confusion_matrix(y_test,predictions_test).ravel()
    hyperparam_table[-1]['tn'] = tn
    hyperparam_table[-1]['fp'] = fp
    hyperparam_table[-1]['fn'] = fn
    hyperparam_table[-1]['tp'] = tp
    
    f1 = f1_score(y_test,predictions_test)
    hyperparam_table[-1]['f1_score'] = f1
    precision = precision_score(y_test,predictions_test)
    hyperparam_table[-1]['precision'] = precision
    recall = recall_score(y_test,predictions_test)
    hyperparam_table[-1]['recall'] = recall
    
    hyperparam_table[-1]['feature_importances'] = clf.feature_importances_
    
    return clf, hyperparam_table

In [5]:
# create empty hyperparameter table
hyperparam_table = []

In [10]:
# run a xgboost model on raw data before any data cleaning/feature engineering
filename = 'data/clinvar_conflicting.csv'
df =  pd.read_csv(filename)
df, var_class, data_dmatrix = preprocess_data(df)

hyper_dict = {'test_size': 0.05, 
              'random_state': 0, 
              'data_size': str(df.shape),
              'scaling': None,
              'filename': filename,
              'model': XGBClassifier,
              'class_weight': None
             }

x_train, x_test, y_train, y_test = train_test_split(df,var_class,
                                                 test_size=hyper_dict['test_size'],
                                                 random_state=hyper_dict['random_state'])

train_test_write(x_train,x_test,y_train,y_test, filename, scaled=False)
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts)
if not exists: 
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [11]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head()

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'xgboost.sklearn.XGBClassifier'>,,0.761656,0.79326,2275,147,630,208,0.348701,0.585915,0.24821,"[0.13841417, 0.11646682, 0.33226815, 0.4128508]"


In [12]:
# balance the class weights
filename = 'data/clinvar_conflicting.csv'
df =  pd.read_csv(filename)
df, var_class, data_dmatrix = preprocess_data(df)

hyper_dict = {'test_size': 0.05, 
              'random_state': 0, 
              'data_size': str(df.shape),
              'scaling': None,
              'filename': filename,
              'model': XGBClassifier,
              'class_weight': 'balanced'
             }

x_train, x_test, y_train, y_test = train_test_split(df,var_class,
                                                 test_size=hyper_dict['test_size'],
                                                 random_state=hyper_dict['random_state'])

train_test_write(x_train,x_test,y_train,y_test, filename, scaled=False)
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts)
if not exists: 
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [13]:
# Use cleaned data file with NLP features
filename = 'data/data_cleanednlp4.csv'
df =  pd.read_csv(filename)
df, var_class, data_dmatrix = preprocess_data(df)

hyper_dict = {'test_size': 0.05, 
              'random_state': 0, 
              'data_size': str(df.shape),
              'scaling': None,
              'filename': filename,
              'model': XGBClassifier,
              'class_weight': None
             }

x_train, x_test, y_train, y_test = train_test_split(df,var_class,
                                                 test_size=hyper_dict['test_size'],
                                                 random_state=hyper_dict['random_state'])

train_test_write(x_train,x_test,y_train,y_test, filename, scaled=False)
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts)
if not exists: 
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [14]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head()

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'xgboost.sklearn.XGBClassifier'>,,0.761656,0.79326,2275,147,630,208,0.348701,0.585915,0.24821,"[0.13841417, 0.11646682, 0.33226815, 0.4128508]"
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'xgboost.sklearn.XGBClassifier'>,balanced,0.761656,0.79326,2275,147,630,208,0.348701,0.585915,0.24821,"[0.13841417, 0.11646682, 0.33226815, 0.4128508]"
2,0.05,0,"(65188, 61)",,data/data_cleanednlp4.csv,<class 'xgboost.sklearn.XGBClassifier'>,,0.765031,0.818208,2227,195,571,267,0.410769,0.577922,0.318616,"[0.01426685, 0.022147033, 0.055169754, 0.07037..."


In [26]:
# Use RandomSearchCV to find the best hyperparameters for this model

param_dist = {'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
              "max_depth": [5, 7, 10, None],
              "colsample_bytree": [0.5, 0.7, 0.9, 1.0],
              "n_estimators": [5, 10, 15],
              "gamma": [0, 0.1, 0.2, 0.3, 0.4, 0.5]}

# Instantiate a XGBoost classifier
clf = XGBClassifier(random_state=0)

# Instantiate the RandomizedSearchCV object
clf_cv = GridSearchCV(clf, param_dist, scoring='balanced_accuracy', cv=5)

# Fit it to the data
clf_cv.fit(x_train, y_train)

KeyboardInterrupt: 

In [19]:
# preview the best parameters and score
print(f"Tuned XGBoost Parameters: {clf_cv.best_params_}")
print(f"Best score is {clf_cv.best_score_}")

Tuned XGBoost Parameters: {'n_estimators': 15, 'max_depth': 10, 'learning_rate': 0.25, 'gamma': 0.4, 'colsample_bytree': 0.5}
Best score is 0.5936359691318951


In [None]:
cv_params = clf_cv.best_params_

In [None]:
# Using this model with the best params, predict labels on test data
hyper_dict = {'test_size': 0.05, 
              'random_state': 0, 
              'data_size': str(df.shape),
              'scaling': None,
              'filename': filename,
              'model': XGBClassifier
             }
clf = XGBClassifier(random_state=0)
clf.set_params(**cv_params)
hyper_dict.update(cv_params)
hyperparam_table += [hyper_dict]
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
score = clf.score(x_test,y_test)
hyperparam_table[-1]['test_score'] = score
training_score = clf.score(x_train,y_train)
hyperparam_table[-1]['train_score'] = training_score

tn, fp, fn, tp = confusion_matrix(y_test,pred).ravel()
hyperparam_table[-1]['tn'] = tn
hyperparam_table[-1]['fp'] = fp
hyperparam_table[-1]['fn'] = fn
hyperparam_table[-1]['tp'] = tp

f1 = f1_score(y_test,pred)
hyperparam_table[-1]['f1_score'] = f1
precision = precision_score(y_test,pred)
hyperparam_table[-1]['precision'] = precision
recall = recall_score(y_test,pred)
hyperparam_table[-1]['recall'] = recall

hyperparam_table[-1]['feature_importances'] = clf.feature_importances_

In [None]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head(10)