In [2]:
import pandas as pd
pd.options.display.max_columns=1000
pd.options.display.width=200
pd.options.display.min_rows=60
import numpy as np

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn import preprocessing
from sklearn.tree import export_graphviz
import pydotplus

from datetime import datetime

In [3]:
def compare_dicts(a,b,ignore=['test_score','train_score','tn','fn','tp','fp','f1_score','precision','recall']):
    a = dict(a)
    b = dict(b)
    for k in ignore:
        a.pop(k,None)
        b.pop(k,None)
        
    return tuple(a.items()) == tuple(b.items())

In [4]:
hyperparam_table = []

In [5]:
filename = 'clinvar_conflicting.csv'
df =  pd.read_csv(filename)

#select only columns with int or float data types
df = df.select_dtypes(['number'])
#drop any columns with null values
df.dropna(axis=1,inplace=True)
#remove the class series
var_class = df.pop('CLASS')

x_train,x_test,y_train,y_test = train_test_split(df,var_class,test_size=0.05,random_state=0)
x_train.to_csv(filename[:-4]+'_xtrain.csv',index=False)
x_test.to_csv(filename[:-4]+'_xtest.csv',index=False)
y_train.to_csv(filename[:-4]+'_ytrain.csv',index=False,header=False)
y_test.to_csv(filename[:-4]+'_ytest.csv',index=False,header=False)

exists = any([compare_dicts(a,b={'test_size': 0.05, 
                                'random_state': 0, 
                                'data_size': str(df.shape),
                                'scaling':'no_scaling',
                                'filename': filename,
                                'model': 'sklearn.RandomForestClassifier'
                                }) 
              for a in hyperparam_table])

clf = RandomForestClassifier(random_state=0)

if not exists:
    hyperparam_table += [{'test_size': 0.05, 
                          'random_state': 0, 
                          'data_size': str(df.shape),
                          'scaling':'no_scaling',
                          'filename': filename,
                          'model': 'sklearn.DecisionTreeClassifier'
                         }]
        
    clf = clf.fit(x_train,y_train)
    
    predictions_test = clf.predict(X=x_test)
    predictions_train = clf.predict(X=x_train)
    
    score = clf.score(x_test,y_test)
    hyperparam_table[-1]['test_score'] = score
    
    training_score = clf.score(x_train,y_train)
    hyperparam_table[-1]['train_score'] = training_score
    
    tn, fp, fn, tp = confusion_matrix(y_test,predictions_test).ravel()
    hyperparam_table[-1]['tn'] = tn
    hyperparam_table[-1]['fp'] = fp
    hyperparam_table[-1]['fn'] = fn
    hyperparam_table[-1]['tp'] = tp
    
    f1 = f1_score(y_test,predictions_test)
    hyperparam_table[-1]['f1_score'] = f1
    precision = precision_score(y_test,predictions_test)
    hyperparam_table[-1]['precision'] = precision
    recall = recall_score(y_test,predictions_test)
    hyperparam_table[-1]['recall'] = recall
    hyperparam_table[-1]['feature_importances'] = clf.feature_importances_




  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head()

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",no_scaling,clinvar_conflicting.csv,sklearn.DecisionTreeClassifier,0.721166,0.966025,2095,327,582,256,0.36031,0.439108,0.305489,"[0.613457760979103, 0.09308880732967509, 0.187..."


In [7]:
print(clf.feature_importances_)

[0.61345776 0.09308881 0.18779581 0.10565762]


In [8]:
filename = 'data_cleanednlp2.csv'
df =  pd.read_csv(filename, index_col=0)

#select only columns with int or float data types
df = df.select_dtypes(['number'])
#drop any columns with null values
df.dropna(axis=1,inplace=True)
#remove the class series
var_class = df.pop('CLASS')

x_train,x_test,y_train,y_test = train_test_split(df,var_class,test_size=0.05,random_state=0)
x_train.to_csv(filename[:-4]+'_xtrain.csv',index=False)
x_test.to_csv(filename[:-4]+'_xtest.csv',index=False)
y_train.to_csv(filename[:-4]+'_ytrain.csv',index=False,header=False)
y_test.to_csv(filename[:-4]+'_ytest.csv',index=False,header=False)

exists = any([compare_dicts(a,b={'test_size': 0.05, 
                                'random_state': 0, 
                                'data_size': str(df.shape),
                                'scaling':'no_scaling',
                                'filename': filename,
                                'model': 'sklearn.RandomForestClassifier'
                                }) 
              for a in hyperparam_table])

clf = RandomForestClassifier(random_state=0)

if not exists:
    hyperparam_table += [{'test_size': 0.05, 
                          'random_state': 0, 
                          'data_size': str(df.shape),
                          'scaling':'no_scaling',
                          'filename': filename,
                          'model': 'sklearn.DecisionTreeClassifier'
                         }]
        
    clf = clf.fit(x_train,y_train)
    
    predictions_test = clf.predict(X=x_test)
    predictions_train = clf.predict(X=x_train)
    
    score = clf.score(x_test,y_test)
    hyperparam_table[-1]['test_score'] = score
    
    training_score = clf.score(x_train,y_train)
    hyperparam_table[-1]['train_score'] = training_score
    
    tn, fp, fn, tp = confusion_matrix(y_test,predictions_test).ravel()
    hyperparam_table[-1]['tn'] = tn
    hyperparam_table[-1]['fp'] = fp
    hyperparam_table[-1]['fn'] = fn
    hyperparam_table[-1]['tp'] = tp
    
    f1 = f1_score(y_test,predictions_test)
    hyperparam_table[-1]['f1_score'] = f1
    precision = precision_score(y_test,predictions_test)
    hyperparam_table[-1]['precision'] = precision
    recall = recall_score(y_test,predictions_test)
    hyperparam_table[-1]['recall'] = recall
    hyperparam_table[-1]['feature_importances'] = clf.feature_importances_


  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head()

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,feature_importances
0,0.05,0,"(65188, 4)",no_scaling,clinvar_conflicting.csv,sklearn.DecisionTreeClassifier,0.721166,0.966025,2095,327,582,256,0.36031,0.439108,0.305489,"[0.613457760979103, 0.09308880732967509, 0.187..."
1,0.05,0,"(65188, 55)",no_scaling,data_cleanednlp2.csv,sklearn.DecisionTreeClassifier,0.737117,0.980041,2210,212,645,193,0.310539,0.476543,0.23031,"[0.1479280709198763, 0.07932871247184145, 0.12..."
