In [21]:
import pandas as pd
pd.options.display.max_columns=1000
pd.options.display.width=200
pd.options.display.min_rows=60
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV 
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
from sklearn import preprocessing
from sklearn.tree import export_graphviz
from sklearn.tree import plot_tree

from datetime import datetime

In [2]:
def preprocess_data(df):
    '''
    function to remove non-numeric features and null values
    input: dataframe
    outputs: df & var_class=Series with class labels popped from df
    '''
    
    #select only columns with int or float data types
    df = df.select_dtypes(['number'])
    #drop any columns with null values
    df.dropna(axis=1,inplace=True)
    #remove the class series
    var_class = df.pop('CLASS')
    
    return df, var_class


In [3]:
def compare_dicts(a,b,ignore=['test_score', 'train_score', 'tn', 'fn', 'tp', 'fp',
                              'f1_score', 'precision', 'recall', 'feature_importances']):
    '''
    function to compare if current hyperparameters have already been run in a model
    inputs: a=hyperparameter entry, b=current hyperparameters, ignore=hyperparameters to ignore in comparison
    output: boolean, True if the current hyperparameters have been run already, and False if they have not
    '''
    
    a = dict(a)
    b = dict(b)
    for k in ignore:
        a.pop(k,None)
        b.pop(k,None)
        
    return tuple(a.items()) == tuple(b.items())

In [4]:
def make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts):
    '''
    function to compare current hyperparameters (hyper_dict) to existing hyperparam_table
    inputs: hyperparam_table, hyper_dict
    outputs: exists=True if hyper_dict has been run before and False if it hasn't & hyper_dict
    '''
    
    exists = any([compare_func(a, b=hyper_dict) for a in hyperparam_table])
    return exists, hyper_dict
    

In [5]:
def train_test_write(x_train,x_test,y_train,y_test, filename, scaled=False):
    '''
    function to write train and test sets to files
    inputs: train and test dfs & scaled=False if no scaling, True if scaling
    output: None
    '''
    
    if scaled:
        x_train.to_csv(filename[:-4]+'_scaledxtrain.csv')
        x_test.to_csv(filename[:-4]+'_scaledxtest.csv')
        y_train.to_csv(filename[:-4]+'_scaledytrain.csv',header=False)
        y_test.to_csv(filename[:-4]+'_scaledytest.csv',header=False)
    else:
        x_train.to_csv(filename[:-4]+'_xtrain.csv')
        x_test.to_csv(filename[:-4]+'_xtest.csv')
        y_train.to_csv(filename[:-4]+'_ytrain.csv',header=False)
        y_test.to_csv(filename[:-4]+'_ytest.csv',header=False)

In [9]:
def train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table):
    '''
    function to train model with given training/test sets and hyperparameters
    inputs: x_train, y_train, x_test, y_test, hyper_dict=dict of current hyperparameters to be run, hyperparam_table=table of hyperparameters already run
    outputs: clf=classifier trained & hyperparam_table updated
    '''
    
    hyperparam_table += [hyper_dict]
    clf = hyper_dict['model'](class_weight=hyper_dict['class_weight'], random_state=hyper_dict['random_state'])
    clf.fit(x_train, y_train)
    
    predictions_test = clf.predict(x_test)
    predictions_train = clf.predict(x_train)
    
    score = clf.score(x_test,y_test)
    hyperparam_table[-1]['test_score'] = score
    training_score = clf.score(x_train,y_train)
    hyperparam_table[-1]['train_score'] = training_score
    
    tn, fp, fn, tp = confusion_matrix(y_test,predictions_test).ravel()
    hyperparam_table[-1]['tn'] = tn
    hyperparam_table[-1]['fp'] = fp
    hyperparam_table[-1]['fn'] = fn
    hyperparam_table[-1]['tp'] = tp
    
    f1 = f1_score(y_test,predictions_test)
    hyperparam_table[-1]['f1_score'] = f1
    precision = precision_score(y_test,predictions_test)
    hyperparam_table[-1]['precision'] = precision
    recall = recall_score(y_test,predictions_test)
    hyperparam_table[-1]['recall'] = recall
    
    hyperparam_table[-1]['feature_importances'] = clf.feature_importances_
    
    return clf, hyperparam_table

In [10]:
# instantiate empty hyperparameter table
hyperparam_table = []

In [11]:
# Let's start with the raw data file, before any data cleaning
filename = 'data/clinvar_conflicting.csv'
df =  pd.read_csv(filename)
df, var_class = preprocess_data(df)

hyper_dict = {'test_size': 0.05, 
              'random_state': 0, 
              'data_size': str(df.shape),
              'scaling': None,
              'filename': filename,
              'model': RandomForestClassifier,
              'class_weight': None
             }

x_train, x_test, y_train, y_test = train_test_split(df,var_class,
                                                 test_size=hyper_dict['test_size'],
                                                 random_state=hyper_dict['random_state'])

train_test_write(x_train,x_test,y_train,y_test, filename, scaled=False)
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts)
if not exists: 
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [12]:
# Balance the class_weight because the data is imbalanced (see EDA notebook)
filename = 'data/clinvar_conflicting.csv'
df =  pd.read_csv(filename)
df, var_class = preprocess_data(df)

hyper_dict = {'test_size': 0.05, 
              'random_state': 0, 
              'data_size': str(df.shape),
              'scaling': None,
              'filename': filename,
              'model': RandomForestClassifier,
              'class_weight': 'balanced'
             }

x_train, x_test, y_train, y_test = train_test_split(df,var_class,
                                                 test_size=hyper_dict['test_size'],
                                                 random_state=hyper_dict['random_state'])

train_test_write(x_train,x_test,y_train,y_test, filename, scaled=False)
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts)
if not exists: 
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [13]:
# look at hyperparameter table so far to compare model performance
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head()

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,,0.721166,0.966025,2095,327,582,256,0.36031,0.439108,0.305489,"[0.613457760979103, 0.09308880732967509, 0.187..."
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.711963,0.965524,2083,339,600,238,0.336396,0.412478,0.28401,"[0.6150616711866501, 0.09225693788412763, 0.17..."


In [14]:
# look at cleaned data: data_cleaned4.csv
filename = 'data/data_cleaned4.csv'
df =  pd.read_csv(filename)
df, var_class = preprocess_data(df)

hyper_dict = {'test_size': 0.05, 
              'random_state': 0, 
              'data_size': str(df.shape),
              'scaling': None,
              'filename': filename,
              'model': RandomForestClassifier,
              'class_weight': 'balanced'
             }

x_train, x_test, y_train, y_test = train_test_split(df,var_class,
                                                 test_size=hyper_dict['test_size'],
                                                 random_state=hyper_dict['random_state'])

train_test_write(x_train,x_test,y_train,y_test, filename, scaled=False)
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts)
if not exists: 
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [15]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head()

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,,0.721166,0.966025,2095,327,582,256,0.36031,0.439108,0.305489,"[0.613457760979103, 0.09308880732967509, 0.187..."
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.711963,0.965524,2083,339,600,238,0.336396,0.412478,0.28401,"[0.6150616711866501, 0.09225693788412763, 0.17..."
2,0.05,0,"(65188, 53)",,data/data_cleaned4.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.745399,0.980187,2221,201,629,209,0.334936,0.509756,0.249403,"[0.13076954550169534, 0.07782536291451711, 0.1..."


In [16]:
# finally, let's run the data file with NLP engineered features: data_cleanednlp4.csv
filename = 'data/data_cleanednlp4.csv'
df =  pd.read_csv(filename)
df, var_class = preprocess_data(df)

hyper_dict = {'test_size': 0.05, 
              'random_state': 0, 
              'data_size': str(df.shape),
              'scaling': None,
              'filename': filename,
              'model': RandomForestClassifier,
              'class_weight': 'balanced'
             }

x_train, x_test, y_train, y_test = train_test_split(df,var_class,
                                                 test_size=hyper_dict['test_size'],
                                                 random_state=hyper_dict['random_state'])

train_test_write(x_train,x_test,y_train,y_test, filename, scaled=False)
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts)
if not exists: 
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [17]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head()

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,,0.721166,0.966025,2095,327,582,256,0.36031,0.439108,0.305489,"[0.613457760979103, 0.09308880732967509, 0.187..."
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.711963,0.965524,2083,339,600,238,0.336396,0.412478,0.28401,"[0.6150616711866501, 0.09225693788412763, 0.17..."
2,0.05,0,"(65188, 53)",,data/data_cleaned4.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.745399,0.980187,2221,201,629,209,0.334936,0.509756,0.249403,"[0.13076954550169534, 0.07782536291451711, 0.1..."
3,0.05,0,"(65188, 61)",,data/data_cleanednlp4.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.751534,0.980623,2246,176,634,204,0.334975,0.536842,0.243437,"[0.11052723789618309, 0.06990488965469807, 0.1..."


In [18]:
filename = 'data/data_cleanednlp4.csv'
df =  pd.read_csv(filename)
df, var_class = preprocess_data(df)

hyper_dict = {'test_size': 0.05, 
              'random_state': 0, 
              'data_size': str(df.shape),
              'scaling': None,
              'filename': filename,
              'model': RandomForestClassifier,
              'class_weight': None
             }

x_train, x_test, y_train, y_test = train_test_split(df,var_class,
                                                 test_size=hyper_dict['test_size'],
                                                 random_state=hyper_dict['random_state'])

train_test_write(x_train,x_test,y_train,y_test, filename, scaled=False)
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts)
if not exists: 
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [19]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head()

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,,0.721166,0.966025,2095,327,582,256,0.36031,0.439108,0.305489,"[0.613457760979103, 0.09308880732967509, 0.187..."
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.711963,0.965524,2083,339,600,238,0.336396,0.412478,0.28401,"[0.6150616711866501, 0.09225693788412763, 0.17..."
2,0.05,0,"(65188, 53)",,data/data_cleaned4.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.745399,0.980187,2221,201,629,209,0.334936,0.509756,0.249403,"[0.13076954550169534, 0.07782536291451711, 0.1..."
3,0.05,0,"(65188, 61)",,data/data_cleanednlp4.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.751534,0.980623,2246,176,634,204,0.334975,0.536842,0.243437,"[0.11052723789618309, 0.06990488965469807, 0.1..."
4,0.05,0,"(65188, 61)",,data/data_cleanednlp4.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,,0.749693,0.981785,2232,190,626,212,0.341935,0.527363,0.252983,"[0.11449647228835245, 0.06922918417085917, 0.1..."


In [22]:
# Use RandomSearchCV to find the best hyperparameters for this model

param_dist = {"class_weight": [None, 'balanced'],
              "max_depth": [5, 10, 15, None],
              "max_features": ['sqrt', 'log2', None],
              "min_samples_leaf": np.arange(1, 9),
              'n_estimators': [10, 100, 500],
              'min_samples_split': [2, 4, 6, 8, 10],
              'min_samples_leaf': [1, 3, 5, 7, 9]
             }


# Instantiate a Random Forest classifier
clf = RandomForestClassifier(random_state=0)

# Instantiate the RandomizedSearchCV object
clf_cv = RandomizedSearchCV(clf, param_dist, scoring='balanced_accuracy', cv=5)

# Fit it to the data
clf_cv.fit(x_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [28]:
# preview the best parameters and score
print(f"Tuned Random Forest Parameters: {clf_cv.best_params_}")
print(f"Best score is {clf_cv.best_score_}")
cv_params = clf_cv.best_params_

Tuned Random Forest Parameters: {'n_estimators': 500, 'min_samples_split': 6, 'min_samples_leaf': 9, 'max_features': 'sqrt', 'max_depth': None, 'class_weight': 'balanced'}
Best score is 0.7063652458893677


In [25]:
# Using this model with the best params, predict labels on test data
hyper_dict = {'test_size': 0.05, 
              'random_state': 0, 
              'data_size': str(df.shape),
              'scaling': None,
              'filename': filename,
              'model': RandomForestClassifier
             }
clf = RandomForestClassifier(random_state=0)
clf.set_params(**cv_params)
hyper_dict.update(cv_params)
hyperparam_table += [hyper_dict]
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
score = clf.score(x_test,y_test)
hyperparam_table[-1]['test_score'] = score
training_score = clf.score(x_train,y_train)
hyperparam_table[-1]['train_score'] = training_score

tn, fp, fn, tp = confusion_matrix(y_test,pred).ravel()
hyperparam_table[-1]['tn'] = tn
hyperparam_table[-1]['fp'] = fp
hyperparam_table[-1]['fn'] = fn
hyperparam_table[-1]['tp'] = tp

f1 = f1_score(y_test,pred)
hyperparam_table[-1]['f1_score'] = f1
precision = precision_score(y_test,pred)
hyperparam_table[-1]['precision'] = precision
recall = recall_score(y_test,pred)
hyperparam_table[-1]['recall'] = recall

hyperparam_table[-1]['feature_importances'] = clf.feature_importances_

In [26]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head(10)

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances,n_estimators,min_samples_split,min_samples_leaf,max_features,max_depth
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,,0.721166,0.966025,2095,327,582,256,0.36031,0.439108,0.305489,"[0.613457760979103, 0.09308880732967509, 0.187...",,,,,
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.711963,0.965524,2083,339,600,238,0.336396,0.412478,0.28401,"[0.6150616711866501, 0.09225693788412763, 0.17...",,,,,
2,0.05,0,"(65188, 53)",,data/data_cleaned4.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.745399,0.980187,2221,201,629,209,0.334936,0.509756,0.249403,"[0.13076954550169534, 0.07782536291451711, 0.1...",,,,,
3,0.05,0,"(65188, 61)",,data/data_cleanednlp4.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.751534,0.980623,2246,176,634,204,0.334975,0.536842,0.243437,"[0.11052723789618309, 0.06990488965469807, 0.1...",,,,,
4,0.05,0,"(65188, 61)",,data/data_cleanednlp4.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,,0.749693,0.981785,2232,190,626,212,0.341935,0.527363,0.252983,"[0.11449647228835245, 0.06922918417085917, 0.1...",,,,,
5,0.05,0,"(65188, 61)",,data/data_cleanednlp4.csv,<class 'sklearn.ensemble.forest.RandomForestCl...,balanced,0.710736,0.783055,1759,663,280,558,0.542011,0.457002,0.665871,"[0.08079377798860113, 0.12654824999305472, 0.1...",500.0,6.0,9.0,sqrt,


- Looking at the hyperparameter table above, the `data_cleanednlp4.csv` dataset produces the best test score accuracy as well as f1 score (before hyperparameter tuning). However, training accuracy is nearly perfect compared to test accuracy, indicating that this model is overfit.
- Using this dataset for hyperparameter tuning, we are able to close the gap between the two accuracy scores. While the accuracy drops by a bit, more importantly the f1 score is increased fro 0.34 to 0.54. 

In [27]:
# save the hyperparameter table to a .pickle file
time = datetime.now().strftime("%Y%m%d_%H:%M:%S")
filename = 'hyperparameter_tables/hyperparameter_table'+time+'.pkl'
df_hyp.to_pickle(filename)