In [1]:
# Imports and display configuration for the whole notebook.
import pandas as pd
# Widen pandas' display limits so the hyperparameter table is shown without
# column truncation.
pd.options.display.max_columns=1000
pd.options.display.width=200
pd.options.display.min_rows=60
import numpy as np

from sklearn.model_selection import train_test_split
# Import from the public package path: the private module
# `sklearn.linear_model.logistic` was deprecated and later removed from
# scikit-learn, while `sklearn.linear_model` exposes the same class.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, KFold

# Custom helpers from hyperparamfuncs.py (this notebook uses train_eval,
# preprocess_data and scale_data).
# NOTE(review): a star import hides where names come from and may rebind names
# imported above; prefer explicit imports once the full list of helpers used by
# the notebook is confirmed.
from hyperparamfuncs import *
# Kept after the star import on purpose so that `datetime` is guaranteed to be
# the class from the standard library.
from datetime import datetime

In [2]:
# Start with an empty hyperparameter table; each train_eval call below receives
# it and returns the table that is rebound to this name.
hyperparam_table = list()

In [3]:
# Baseline run: logistic regression on the raw data, before any data
# cleaning/feature engineering, with no scaling and no class weighting.
filename = 'data/clinvar_conflicting.csv'
hyper_dict = dict(
    test_size=0.05,
    random_state=0,
    data_size=None,
    scaling=None,
    filename=filename,
    model=LogisticRegression,
    class_weight=None,
)
# train_eval is imported from hyperparamfuncs; its two return values are bound
# to the classifier and the hyperparameter table.
clf, hyperparam_table = train_eval(filename, hyper_dict, hyperparam_table)

  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=1, inplace=True)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [4]:
# Turn the list of per-run records into a DataFrame and show its first rows.
df_hyp = pd.DataFrame(data=hyperparam_table)
df_hyp.head(n=5)

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,,0.742945,0.748159,2422,0,838,0,0.0,0.0,0.0,"[[-9.233498049854785e-09, -8.542404895192725e-..."


In [5]:
# The data looks imbalanced, so repeat the raw-data run with
# class_weight='balanced' in the model.
filename = 'data/clinvar_conflicting.csv'
hyper_dict = dict(
    test_size=0.05,
    random_state=0,
    data_size=None,
    scaling=None,
    filename=filename,
    model=LogisticRegression,
    class_weight='balanced',
)
clf, hyperparam_table = train_eval(filename, hyper_dict, hyperparam_table)

  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=1, inplace=True)


In [6]:
# Rebuild the DataFrame view of hyperparam_table and display up to ten runs.
df_hyp = pd.DataFrame(data=hyperparam_table)
df_hyp.head(n=10)

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,,0.742945,0.748159,2422,0,838,0,0.0,0.0,0.0,"[[-9.233498049854785e-09, -8.542404895192725e-..."
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,"[[1.7341144526616175e-10, -3.476104502147056e-..."


In [7]:
# Raw data again, this time requesting MinMax scaling before the model is run
# (balanced class weights kept).
filename = 'data/clinvar_conflicting.csv'
hyper_dict = dict(
    test_size=0.05,
    random_state=0,
    data_size=None,
    scaling='MinMax',
    filename=filename,
    model=LogisticRegression,
    class_weight='balanced',
)
clf, hyperparam_table = train_eval(filename, hyper_dict, hyperparam_table)

  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=1, inplace=True)


In [8]:
# Refresh the DataFrame view of the runs so far and show its first rows.
df_hyp = pd.DataFrame(data=hyperparam_table)
df_hyp.head(n=5)

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,,0.742945,0.748159,2422,0,838,0,0.0,0.0,0.0,"[[-9.233498049854785e-09, -8.542404895192725e-..."
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,"[[1.7341144526616175e-10, -3.476104502147056e-..."
2,0.05,0,"(65188, 4)",MinMax,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.395399,0.393457,512,1910,61,777,0.440851,0.28917,0.927208,"[[0.13284429181162538, -5.064108331797677, -0...."


- Looking at the models run so far, balancing the `class_weight` in our model is important because our dataset is imbalanced.
- We want to increase both `precision` and `recall`, so we want a higher `f1 score`. The first row of the hyperparameter table shows that without balancing our classes we get a high accuracy but an f1 score of 0; this is a bad model.
- Additionally, `MinMax` scaling seems to improve performance, so I will continue to use both the scaling and the class-weight hyperparameters in future models.

In [9]:
# Run the model on the cleaned dataset, without scaling and with balanced
# class weights.
filename = 'data/data_cleaned4.csv'
hyper_dict = dict(
    test_size=0.05,
    random_state=0,
    data_size=None,
    scaling=None,
    filename=filename,
    model=LogisticRegression,
    class_weight='balanced',
)
clf, hyperparam_table = train_eval(filename, hyper_dict, hyperparam_table)

  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=1, inplace=True)


In [10]:
# Cleaned dataset with MinMax scaling and balanced class weights.
filename = 'data/data_cleaned4.csv'
hyper_dict = dict(
    test_size=0.05,
    random_state=0,
    data_size=None,
    scaling='MinMax',
    filename=filename,
    model=LogisticRegression,
    class_weight='balanced',
)
clf, hyperparam_table = train_eval(filename, hyper_dict, hyperparam_table)

  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=1, inplace=True)


In [11]:
# Refresh the DataFrame view after the cleaned-data runs and show its first rows.
df_hyp = pd.DataFrame(data=hyperparam_table)
df_hyp.head(n=5)

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,,0.742945,0.748159,2422,0,838,0,0.0,0.0,0.0,"[[-9.233498049854785e-09, -8.542404895192725e-..."
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,"[[1.7341144526616175e-10, -3.476104502147056e-..."
2,0.05,0,"(65188, 4)",MinMax,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.395399,0.393457,512,1910,61,777,0.440851,0.28917,0.927208,"[[0.13284429181162538, -5.064108331797677, -0...."
3,0.05,0,"(65188, 53)",,data/data_cleaned4.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,"[[1.7341144657191668e-10, -3.4762990112364325e..."
4,0.05,0,"(65188, 53)",MinMax,data/data_cleaned4.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.515951,0.528598,1072,1350,228,610,0.436026,0.311224,0.727924,"[[0.027245693225715493, -5.70248924868861, -0...."


In [12]:
# Final dataset: the cleaned data plus engineered NLP features for the REF and
# ALT alleles, run with MinMax scaling and balanced class weights.
filename = 'data/data_cleanednlp4.csv'
hyper_dict = dict(
    test_size=0.05,
    random_state=0,
    data_size=None,
    scaling='MinMax',
    filename=filename,
    model=LogisticRegression,
    class_weight='balanced',
)
clf, hyperparam_table = train_eval(filename, hyper_dict, hyperparam_table)

  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=1, inplace=True)


In [13]:
# Refresh the DataFrame view after the NLP-feature run and display up to ten runs.
df_hyp = pd.DataFrame(data=hyperparam_table)
df_hyp.head(n=10)

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,,0.742945,0.748159,2422,0,838,0,0.0,0.0,0.0,"[[-9.233498049854785e-09, -8.542404895192725e-..."
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,"[[1.7341144526616175e-10, -3.476104502147056e-..."
2,0.05,0,"(65188, 4)",MinMax,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.395399,0.393457,512,1910,61,777,0.440851,0.28917,0.927208,"[[0.13284429181162538, -5.064108331797677, -0...."
3,0.05,0,"(65188, 53)",,data/data_cleaned4.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,"[[1.7341144657191668e-10, -3.4762990112364325e..."
4,0.05,0,"(65188, 53)",MinMax,data/data_cleaned4.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.515951,0.528598,1072,1350,228,610,0.436026,0.311224,0.727924,"[[0.027245693225715493, -5.70248924868861, -0...."
5,0.05,0,"(65188, 61)",MinMax,data/data_cleanednlp4.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.528221,0.536106,1120,1302,236,602,0.439096,0.316176,0.718377,"[[0.026377656517386078, -5.7429100422809505, -..."


- Looking at the hyperparameter table, the cleaned data drastically improves accuracy over the raw dataset, with only a minimal sacrifice in the f1_score.
- MinMax scaling seems to be important for balancing class predictions: without it, the class-weighted models predict the positive class for every test sample.
- The NLP feature engineering appears to produce a small increase in the model's accuracy while maintaining the f1_score, so we will perform cross-validation on this dataset.

In [16]:
# 3-fold cross validation: tune the C hyperparameter using our best model so far
# (NLP-feature dataset, MinMax scaling, balanced class weights).

filename = 'data/data_cleanednlp4.csv'
df = pd.read_csv(filename)
# preprocess_data / scale_data come from hyperparamfuncs; the first returns the
# feature frame together with the target that is used as `y` below.
df, var_class = preprocess_data(df)
df = scale_data(df, scaler=MinMaxScaler)

hyper_dict = {'test_size': 0.05,
              'random_state': 0,
              'data_size': str(df.shape),
              'scaling': 'MinMax',
              'filename': filename,
              'model': LogisticRegression,
              'class_weight': 'balanced'
             }

x_train, x_test, y_train, y_test = train_test_split(df, var_class,
                                                    test_size=hyper_dict['test_size'],
                                                    random_state=hyper_dict['random_state'])

# the grid of parameters to search over
Cs = [0.01, 0.1, 1, 10, 100]

# Find the best value for C. Every candidate is built with the same
# class_weight and random_state as the final classifier below; otherwise C
# would be tuned for an unweighted model and then applied to a weighted one.
best_C = None
maxscore = -np.inf
for C in Cs:
    clf = hyper_dict['model'](C=C,
                              class_weight=hyper_dict['class_weight'],
                              random_state=hyper_dict['random_state'])
    # mean accuracy over the 3 folds of the training split only
    score = np.mean(cross_val_score(clf, x_train, y_train, cv=3, scoring='accuracy'))

    # strict '>' keeps the first (smallest) C when scores tie
    if score > maxscore:
        maxscore = score
        best_C = C

# Record the chosen C on the most recent run.
# NOTE(review): assumes the last entry of hyperparam_table is the run on this
# same dataset/configuration - confirm before reordering cells.
hyperparam_table[-1]['C'] = best_C  # update hyperparam_table

# Refit the best configuration on the full training split and report the
# held-out test metrics.
bestclf = hyper_dict['model'](C=best_C,
                              class_weight=hyper_dict['class_weight'],
                              random_state=hyper_dict['random_state'])
bestclf.fit(x_train, y_train)
pred = bestclf.predict(x_test)
print(f'Logistic Regression Model\n'
      f'Accuracy: {accuracy_score(y_test, pred):0.3f}\n'
      f'f1 score: {f1_score(y_test, pred):0.3f}\n'
      f'precision: {precision_score(y_test, pred):0.3f}\n'
      f'recall: {recall_score(y_test, pred):0.3f}')
print(f'best C: {best_C}')
        

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=1, inplace=True)


Logistic Regression Model
Accuracy: 0.528
f1 score: 0.439
precision: 0.316
recall: 0.718
best C: 1


In [55]:
# save the hyperparameter table to a .pickle file
# Rebuild the DataFrame from hyperparam_table first: entries of
# hyperparam_table may have been updated after df_hyp was last created (the
# cross-validation cell adds a 'C' key), and a stale df_hyp would be saved
# without those updates.
df_hyp = pd.DataFrame(hyperparam_table)
# NOTE(review): the timestamp contains ':' characters, which are not valid in
# file names on Windows, and the 'hyperparameter_tables/' directory must
# already exist - confirm both for the environments this notebook runs in.
timestamp = datetime.now().strftime("%Y%m%d_%H:%M:%S")
filename = 'hyperparameter_tables/hyperparameter_table'+timestamp+'.pkl'
df_hyp.to_pickle(filename)