In [35]:
# import necessary modules
import pandas as pd
pd.options.display.max_columns=1000
pd.options.display.width=200
pd.options.display.min_rows=60
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, KFold

from datetime import datetime

In [36]:
def preprocess_data(df):
    '''
    function to remove non-numeric features and null values

    Keeps only the numeric columns, drops every column that contains at
    least one null value, and separates the 'CLASS' column from the features.
    The dataframe passed in is not modified.

    input: dataframe
    outputs: df & var_class=Series with class labels popped from df
    raises: KeyError if no 'CLASS' column is left after the filtering
            (i.e. it is missing, non-numeric, or contains nulls)
    '''

    # select only int/float columns, then drop any columns with null values.
    # dropna() is chained (not in-place) so it returns a new frame instead of
    # mutating the select_dtypes result, which raised SettingWithCopyWarning
    df = df.select_dtypes(['number']).dropna(axis=1)
    # remove the class series
    var_class = df.pop('CLASS')

    return df, var_class

In [37]:
def scale_data(df, scaler=MinMaxScaler):
    '''
    function to scale the input dataframe and return scaled dataframe

    inputs: df=dataframe of features,
            scaler=scaler class (not an instance); it is instantiated with no
            arguments and must provide fit_transform
    output: scaled_df=dataframe with the same columns and the same index as df

    note: the scaler is fit on every row of df, so if df is split into
    train/test sets afterwards the test rows have influenced the scaling
    '''

    fitted_scaler = scaler()
    scaled_values = fitted_scaler.fit_transform(df)
    # keep the original index (not only the columns) so the scaled features
    # stay aligned with any label Series that shares df's index
    scaled_df = pd.DataFrame(scaled_values, columns=df.columns, index=df.index)
    return scaled_df

In [38]:
def compare_dicts(a,b,ignore=('test_score', 'train_score', 'tn', 'fn', 'tp', 'fp',
                              'f1_score', 'precision', 'recall', 'feature_importances')):
    '''
    function to compare if current hyperparameters have already been run in a model

    The comparison looks only at keys that are not listed in ignore and does
    not depend on the order in which the keys were inserted. Neither a nor b
    is modified.

    inputs: a=hyperparameter entry, b=current hyperparameters, ignore=hyperparameters to ignore in comparison
    output: boolean, True if the current hyperparameters have been run already, and False if they have not
    '''

    skipped = set(ignore)
    # build filtered copies so the caller's dictionaries are left untouched
    kept_a = {key: value for key, value in dict(a).items() if key not in skipped}
    kept_b = {key: value for key, value in dict(b).items() if key not in skipped}

    # dict equality ignores insertion order, unlike comparing tuples of items
    return kept_a == kept_b

In [39]:
def make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts):
    '''
    function to check the current hyperparameters (hyper_dict) against every
    entry already stored in hyperparam_table
    inputs: hyperparam_table=iterable of previous entries, hyper_dict=current hyperparameters,
            compare_func=callable invoked as compare_func(entry, b=hyper_dict)
    outputs: exists=True if any entry compares as truthy and False otherwise & hyper_dict (returned unchanged)
    '''

    # compare_func is evaluated for every stored entry before the results are reduced
    matches = []
    for previous_run in hyperparam_table:
        matches.append(compare_func(previous_run, b=hyper_dict))

    return any(matches), hyper_dict
    

In [40]:
def train_test_write(x_train,x_test,y_train,y_test, filename, scaled=False):
    '''
    function to write train and test sets to files

    The output paths are derived from filename by removing its extension and
    appending '_xtrain.csv', '_xtest.csv', '_ytrain.csv', '_ytest.csv'
    (or the '_scaled...' variants when scaled=True). The y files are written
    without a header row.

    inputs: train and test dfs, filename=path of the source data file
            & scaled=False if no scaling, True if scaling
    output: None
    '''

    import os  # local import: the notebook's import cell does not import os

    # splitext removes whatever extension the file has, rather than assuming
    # it is exactly four characters long (e.g. '.csv')
    stem = os.path.splitext(filename)[0]
    prefix = stem + ('_scaled' if scaled else '_')

    x_train.to_csv(prefix + 'xtrain.csv')
    x_test.to_csv(prefix + 'xtest.csv')
    y_train.to_csv(prefix + 'ytrain.csv', header=False)
    y_test.to_csv(prefix + 'ytest.csv', header=False)

In [41]:
def train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table):
    '''
    function to train model with given training/test sets and hyperparameters

    The model class stored in hyper_dict['model'] is instantiated with
    hyper_dict['class_weight'] and hyper_dict['random_state'], fit on the
    training set and evaluated on the test set. The metrics are written into
    hyper_dict (keys: test_score, train_score, tn, fp, fn, tp, f1_score,
    precision, recall, feature_importances) and hyper_dict itself is appended
    to hyperparam_table. Nothing is recorded if training or evaluation raises,
    so a failed run is not later mistaken for one that has already been run.

    inputs: x_train, y_train, x_test, y_test, hyper_dict=dict of current hyperparameters to be run, hyperparam_table=table of hyperparameters already run
    outputs: clf=classifier trained & hyperparam_table updated
    '''

    clf = hyper_dict['model'](class_weight=hyper_dict['class_weight'],
                              random_state=hyper_dict['random_state'])
    clf.fit(x_train, y_train)

    predictions_test = clf.predict(x_test)

    # ravel() flattens the confusion matrix; unpacking into four values
    # requires a 2x2 matrix, i.e. a two-class problem
    tn, fp, fn, tp = confusion_matrix(y_test,predictions_test).ravel()

    # collect every metric first; key order here determines the order in
    # which the result keys are added to hyper_dict
    results = {
        'test_score': clf.score(x_test,y_test),
        'train_score': clf.score(x_train,y_train),
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'f1_score': f1_score(y_test,predictions_test),
        'precision': precision_score(y_test,predictions_test),
        'recall': recall_score(y_test,predictions_test),
        # the fitted coefficients are stored under 'feature_importances';
        # the model must therefore expose coef_
        'feature_importances': clf.coef_,
    }

    # record the run only after everything above succeeded
    hyper_dict.update(results)
    hyperparam_table.append(hyper_dict)

    return clf, hyperparam_table

In [42]:
# start with an empty hyperparameter table; each trained run appends one dict to it
hyperparam_table = list()

In [43]:
# baseline: logistic regression on the raw data, before any data cleaning/feature engineering
filename = 'data/clinvar_conflicting.csv'
df, var_class = preprocess_data(pd.read_csv(filename))

# settings that identify this run in the hyperparameter table
hyper_dict = dict(
    test_size=0.05,
    random_state=0,
    data_size=str(df.shape),
    scaling=None,
    filename=filename,
    model=LogisticRegression,
    class_weight=None,
)

x_train, x_test, y_train, y_test = train_test_split(
    df,
    var_class,
    test_size=hyper_dict['test_size'],
    random_state=hyper_dict['random_state'],
)

# persist the split next to the source file (unscaled variant)
train_test_write(x_train, x_test, y_train, y_test, filename, scaled=False)

# train only when this configuration is not in the table yet
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts)
if not exists:
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [45]:
# the data looks imbalanced, so this run balances class_weight in the model
filename = 'data/clinvar_conflicting.csv'
df, var_class = preprocess_data(pd.read_csv(filename))

# same settings as the baseline run except for class_weight
hyper_dict = dict(
    test_size=0.05,
    random_state=0,
    data_size=str(df.shape),
    scaling=None,
    filename=filename,
    model=LogisticRegression,
    class_weight='balanced',
)

x_train, x_test, y_train, y_test = train_test_split(
    df,
    var_class,
    test_size=hyper_dict['test_size'],
    random_state=hyper_dict['random_state'],
)
train_test_write(x_train, x_test, y_train, y_test, filename, scaled=False)

# skip training if an identical configuration was already recorded
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict, compare_func=compare_dicts)
if not exists:
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [46]:
# build a dataframe from the runs recorded so far and display up to the first 10 rows
df_hyp = pd.DataFrame(data=hyperparam_table)
df_hyp.head(n=10)

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,,0.742945,0.748159,2422,0,838,0,0.0,0.0,0.0,"[[-9.233498049854785e-09, -8.542404895192725e-..."
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,"[[1.7341144526616175e-10, -3.476104502147056e-..."


In [47]:
# apply MinMax scaling to the raw numeric features, then run the model

filename = 'data/clinvar_conflicting.csv'
df, var_class = preprocess_data(pd.read_csv(filename))
# NOTE(review): the scaler is fit on the whole dataset before the train/test
# split below, so the test rows influence the scaling
df = scale_data(df, MinMaxScaler)

hyper_dict = dict(
    test_size=0.05,
    random_state=0,
    data_size=str(df.shape),
    scaling='MinMax',
    filename=filename,
    model=LogisticRegression,
    class_weight='balanced',
)

x_train, x_test, y_train, y_test = train_test_split(
    df,
    var_class,
    test_size=hyper_dict['test_size'],
    random_state=hyper_dict['random_state'],
)
# scaled=True selects the '_scaled...' output file names
train_test_write(x_train, x_test, y_train, y_test, filename, scaled=True)
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict)

# train only when this configuration is not in the table yet
if not exists:
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [48]:
# refresh the dataframe view of the hyperparameter table and show its first 5 rows
df_hyp = pd.DataFrame(data=hyperparam_table)
df_hyp.head(n=5)

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,,0.742945,0.748159,2422,0,838,0,0.0,0.0,0.0,"[[-9.233498049854785e-09, -8.542404895192725e-..."
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,"[[1.7341144526616175e-10, -3.476104502147056e-..."
2,0.05,0,"(65188, 4)",MinMax,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.395399,0.393457,512,1910,61,777,0.440851,0.28917,0.927208,"[[0.13284429181162538, -5.064108331797677, -0...."


- Looking at the models run so far, balancing the class_weight in our model is important due to the imbalanced nature of our dataset. Additionally, MinMax scaling seems to improve performance, so I will continue to use both of these hyperparameters in future models.

In [49]:
# run the model on the cleaned dataset (no scaling)
filename = 'data/data_cleaned4.csv'
df, var_class = preprocess_data(pd.read_csv(filename))

hyper_dict = dict(
    test_size=0.05,
    random_state=0,
    data_size=str(df.shape),
    scaling=None,
    filename=filename,
    model=LogisticRegression,
    class_weight='balanced',
)

x_train, x_test, y_train, y_test = train_test_split(
    df,
    var_class,
    test_size=hyper_dict['test_size'],
    random_state=hyper_dict['random_state'],
)
train_test_write(x_train, x_test, y_train, y_test, filename, scaled=False)

# skip training if an identical configuration was already recorded
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict)
if not exists:
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [50]:
# cleaned dataset again, this time with MinMax scaling applied to the features
filename = 'data/data_cleaned4.csv'
df, var_class = preprocess_data(pd.read_csv(filename))
# NOTE(review): the scaler is fit on the whole dataset before the train/test
# split below, so the test rows influence the scaling
df = scale_data(df, MinMaxScaler)

hyper_dict = dict(
    test_size=0.05,
    random_state=0,
    data_size=str(df.shape),
    scaling='MinMax',
    filename=filename,
    model=LogisticRegression,
    class_weight='balanced',
)

x_train, x_test, y_train, y_test = train_test_split(
    df,
    var_class,
    test_size=hyper_dict['test_size'],
    random_state=hyper_dict['random_state'],
)
train_test_write(x_train, x_test, y_train, y_test, filename, scaled=True)

# train only when this configuration is not in the table yet
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict)
if not exists:
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [51]:
# refresh the dataframe view of the hyperparameter table and show its first 5 rows
df_hyp = pd.DataFrame(data=hyperparam_table)
df_hyp.head(n=5)

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,,0.742945,0.748159,2422,0,838,0,0.0,0.0,0.0,"[[-9.233498049854785e-09, -8.542404895192725e-..."
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,"[[1.7341144526616175e-10, -3.476104502147056e-..."
2,0.05,0,"(65188, 4)",MinMax,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.395399,0.393457,512,1910,61,777,0.440851,0.28917,0.927208,"[[0.13284429181162538, -5.064108331797677, -0...."
3,0.05,0,"(65188, 53)",,data/data_cleaned4.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,"[[1.7341144657191668e-10, -3.4762990112364325e..."
4,0.05,0,"(65188, 53)",MinMax,data/data_cleaned4.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.515951,0.528598,1072,1350,228,610,0.436026,0.311224,0.727924,"[[0.027245693225715493, -5.70248924868861, -0...."


In [52]:
# final dataset: cleaned data plus the engineered NLP features for the REF and ALT alleles
filename = 'data/data_cleanednlp4.csv'
df, var_class = preprocess_data(pd.read_csv(filename))
# NOTE(review): the scaler is fit on the whole dataset before the train/test
# split below, so the test rows influence the scaling
df = scale_data(df, MinMaxScaler)

hyper_dict = dict(
    test_size=0.05,
    random_state=0,
    data_size=str(df.shape),
    scaling='MinMax',
    filename=filename,
    model=LogisticRegression,
    class_weight='balanced',
)

x_train, x_test, y_train, y_test = train_test_split(
    df,
    var_class,
    test_size=hyper_dict['test_size'],
    random_state=hyper_dict['random_state'],
)
train_test_write(x_train, x_test, y_train, y_test, filename, scaled=True)

# skip training if an identical configuration was already recorded
exists, hyper_dict = make_comparison(hyperparam_table, hyper_dict)
if not exists:
    clf, hyperparam_table = train_model(x_train, y_train, x_test, y_test, hyper_dict, hyperparam_table)


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [53]:
# refresh the dataframe view of the hyperparameter table and show up to the first 10 rows
df_hyp = pd.DataFrame(data=hyperparam_table)
df_hyp.head(n=10)

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,class_weight,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,,0.742945,0.748159,2422,0,838,0,0.0,0.0,0.0,"[[-9.233498049854785e-09, -8.542404895192725e-..."
1,0.05,0,"(65188, 4)",,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,"[[1.7341144526616175e-10, -3.476104502147056e-..."
2,0.05,0,"(65188, 4)",MinMax,data/clinvar_conflicting.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.395399,0.393457,512,1910,61,777,0.440851,0.28917,0.927208,"[[0.13284429181162538, -5.064108331797677, -0...."
3,0.05,0,"(65188, 53)",,data/data_cleaned4.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,"[[1.7341144657191668e-10, -3.4762990112364325e..."
4,0.05,0,"(65188, 53)",MinMax,data/data_cleaned4.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.515951,0.528598,1072,1350,228,610,0.436026,0.311224,0.727924,"[[0.027245693225715493, -5.70248924868861, -0...."
5,0.05,0,"(65188, 61)",MinMax,data/data_cleanednlp4.csv,<class 'sklearn.linear_model.logistic.Logistic...,balanced,0.528221,0.536106,1120,1302,236,602,0.439096,0.316176,0.718377,"[[0.026377656517386078, -5.7429100422809505, -..."


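- Looking at the hyperparameter table, the cleaned data combined with MinMax scaling drastically improves accuracy over the scaled raw dataset (0.395 → 0.516) with a minimal sacrifice in the f1_score (0.441 → 0.436). Without scaling, the cleaned data gives the same result as the raw data.
- MinMax scaling seems to be important for balancing class predictions: without it, the class-weighted models predict the positive class for every test sample (tn = 0, fn = 0).
- It looks like the NLP feature engineering produces a small increase in accuracy (0.516 → 0.528) while maintaining the f1_score (0.436 → 0.439), so we will perform cross validation on this dataset.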

In [54]:
# 3-Fold cross validation: tuning the C hyperparameter using our best model so far

filename = 'data/data_cleanednlp4.csv'
df =  pd.read_csv(filename)
df, var_class = preprocess_data(df)
# NOTE(review): the scaler is fit on the whole dataset before the train/test
# split below, so the test rows influence the scaling
df = scale_data(df, scaler=MinMaxScaler)

hyper_dict = {'test_size': 0.05, 
              'random_state': 0, 
              'data_size': str(df.shape),
              'scaling': 'MinMax',
              'filename': filename,
              'model': LogisticRegression,
              'class_weight': 'balanced'
             }

x_train, x_test, y_train, y_test = train_test_split(df, var_class,
                                                 test_size=hyper_dict['test_size'],
                                                 random_state=hyper_dict['random_state'])
#the grid of parameters to search over
Cs = [0.01, 0.1, 1, 10, 100]

#Find the best value for C, and the best classifier
best_C = None
maxscore=-np.inf
for C in Cs:        

    # cross-validate the same configuration (class_weight, random_state) that
    # the final classifier below uses, so the selected C applies to that model
    clf = hyper_dict['model'](C=C,
                              class_weight=hyper_dict['class_weight'],
                              random_state=hyper_dict['random_state'])
    # mean accuracy over the 3 folds of the training set
    score = np.mean(cross_val_score(clf, x_train, y_train, cv=3, scoring='accuracy'))

    if score > maxscore:
        maxscore = score
        best_C = C

# NOTE(review): this writes C into the most recent table entry; it assumes that
# entry is the run for this dataset/configuration - confirm before re-running
hyperparam_table[-1]['C'] = best_C  # update hyperparam_table

# run and print best classifier (refit on the full training set with the selected C)
bestclf = hyper_dict['model'](class_weight=hyper_dict['class_weight'],
                              C=best_C,
                              random_state=hyper_dict['random_state'])
bestclf.fit(x_train, y_train)
pred = bestclf.predict(x_test)
print(f'Logistic Regression Model\n'
     f'Accuracy: {bestclf.score(x_test, y_test):0.3f}\n'
     f'f1 score: {f1_score(y_test, pred):0.3f}\n'
     f'precision: {precision_score(y_test, pred):0.3f}\n'
     f'recall: {recall_score(y_test, pred):0.3f}')
print(f'best C: {best_C}')
        

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Logistic Regression Model
Accuracy: 0.528
f1 score: 0.439
precision: 0.316
recall: 0.718
best C: 1


In [55]:
# save the hyperparameter table to a .pickle file
# rebuild the dataframe first: the df_hyp created in an earlier cell predates
# the 'C' value that the cross-validation cell adds to hyperparam_table
df_hyp = pd.DataFrame(hyperparam_table)
# timestamp without ':' so the file name is also valid on Windows
time = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = 'hyperparameter_tables/hyperparameter_table'+time+'.pkl'
# the 'hyperparameter_tables' directory must already exist
df_hyp.to_pickle(filename)