In [1]:
import pandas as pd
pd.options.display.max_columns=1000
pd.options.display.width=200
pd.options.display.min_rows=60
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn import preprocessing

from datetime import datetime

In [2]:
def compare_dicts(a,b,ignore=['test_score','train_score','tn','fn','tp','fp','f1_score','precision','recall']):
    a = dict(a)
    b = dict(b)
    for k in ignore:
        a.pop(k,None)
        b.pop(k,None)
        
    return tuple(a.items()) == tuple(b.items())

In [6]:
hyperparam_table = []

In [7]:
filename = 'data_cleanednlp4.csv'
df =  pd.read_csv(filename)

#select only columns with int or float data types
df = df.select_dtypes(['number'])
#drop any columns with null values
df.dropna(axis=1,inplace=True)
#remove the class series
var_class = df.pop('CLASS')

scaler = preprocessing.MinMaxScaler()
scaled_df = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_df,columns=df.columns)

x_train,x_test,y_train,y_test = train_test_split(scaled_df,var_class,test_size=0.05,random_state=0)
x_train.to_csv(filename[:-4]+'_scaledxtrain.csv',index=False)
x_test.to_csv(filename[:-4]+'_scaledxtest.csv',index=False)
y_train.to_csv(filename[:-4]+'_scaledytrain.csv',index=False,header=False)
y_test.to_csv(filename[:-4]+'_scaledytest.csv',index=False,header=False)


exists = any([compare_dicts(a,b={'test_size': 0.05, 
                                'random_state': 0, 
                                'data_size': str(df.shape),
                                'scaling':'min_max',
                                'filename': filename,
                                'model': 'sklearn.LogisticRegression'}) 
              for a in hyperparam_table])

logisticRegr = LogisticRegression()
if not exists:
    hyperparam_table += [{'test_size': 0.05, 
                          'random_state': 0, 
                          'data_size': str(df.shape),
                          'scaling':'min_max',
                         'filename': filename,
                         'model': 'sklearn.LogisticRegression'}]
        
    logisticRegr.fit(x_train,y_train)
    
    predictions_test = logisticRegr.predict(x_test)
    predictions_train = logisticRegr.predict(x_train)
    
    score = logisticRegr.score(x_test,y_test)
    hyperparam_table[-1]['test_score'] = score
    
    training_score = logisticRegr.score(x_train,y_train)
    hyperparam_table[-1]['train_score'] = training_score
    
    tn, fp, fn, tp = confusion_matrix(y_test,predictions_test).ravel()
    hyperparam_table[-1]['tn'] = tn
    hyperparam_table[-1]['fp'] = fp
    hyperparam_table[-1]['fn'] = fn
    hyperparam_table[-1]['tp'] = tp
    
    f1 = f1_score(y_test,predictions_test)
    hyperparam_table[-1]['f1_score'] = f1
    precision = precision_score(y_test,predictions_test)
    hyperparam_table[-1]['precision'] = precision
    recall = recall_score(y_test,predictions_test)
    hyperparam_table[-1]['recall'] = recall

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head()

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall
0,0.05,0,"(65188, 61)",min_max,data_cleanednlp4.csv,sklearn.LogisticRegression,0.743252,0.748224,2422,0,837,1,0.002384,1.0,0.001193


In [9]:
filename = 'data_cleanednlp4.csv'
df1 =  pd.read_csv(filename)

#select only columns with int or float data types
df1 = df1.select_dtypes(['number'])
#drop any columns with null values
df1.dropna(axis=1,inplace=True)
#remove the class series
var_class1 = df1.pop('CLASS')

x_train,x_test,y_train,y_test = train_test_split(df1,var_class1,test_size=0.05,random_state=0)
x_train.to_csv(filename[:-4]+'_xtrain.csv',index=False)
x_test.to_csv(filename[:-4]+'_xtest.csv',index=False)
y_train.to_csv(filename[:-4]+'_ytrain.csv',index=False,header=False)
y_test.to_csv(filename[:-4]+'_ytest.csv',index=False,header=False)


exists = any([compare_dicts(a,b={'test_size': 0.05, 
                                'random_state': 0, 
                                'data_size': str(df1.shape),
                                'scaling':'no_scaling',
                                'filename': filename,
                                 'class_weight':'balanced',
                                'model': 'sklearn.LogisticRegression'}) 
              for a in hyperparam_table])

logisticRegr = LogisticRegression(class_weight='balanced')
if not exists:
    hyperparam_table += [{'test_size': 0.05, 
                          'random_state': 0, 
                          'data_size': str(df1.shape),
                          'scaling':'no_scaling',
                          'filename': filename,
                          'class_weight':'balanced',
                        'model': 'sklearn.LogisticRegression'}]
        
    logisticRegr.fit(x_train,y_train)
    
    predictions_test = logisticRegr.predict(x_test)
    predictions_train = logisticRegr.predict(x_train)
    
    score = logisticRegr.score(x_test,y_test)
    hyperparam_table[-1]['test_score'] = score
    
    training_score = logisticRegr.score(x_train,y_train)
    hyperparam_table[-1]['train_score'] = training_score
    
    tn, fp, fn, tp = confusion_matrix(y_test,predictions_test).ravel()
    hyperparam_table[-1]['tn'] = tn
    hyperparam_table[-1]['fp'] = fp
    hyperparam_table[-1]['fn'] = fn
    hyperparam_table[-1]['tp'] = tp
    
    f1 = f1_score(y_test,predictions_test)
    hyperparam_table[-1]['f1_score'] = f1
    precision = precision_score(y_test,predictions_test)
    hyperparam_table[-1]['precision'] = precision
    recall = recall_score(y_test,predictions_test)
    hyperparam_table[-1]['recall'] = recall

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head()

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,class_weight
0,0.05,0,"(65188, 61)",min_max,data_cleanednlp4.csv,sklearn.LogisticRegression,0.743252,0.748224,2422,0,837,1,0.002384,1.0,0.001193,
1,0.05,0,"(65188, 61)",no_scaling,data_cleanednlp4.csv,sklearn.LogisticRegression,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,balanced


In [11]:
filename = 'data_cleaned4.csv'
df1 =  pd.read_csv(filename)

#select only columns with int or float data types
df1 = df1.select_dtypes(['number'])
#drop any columns with null values
df1.dropna(axis=1,inplace=True)
#remove the class series
var_class1 = df1.pop('CLASS')

x_train,x_test,y_train,y_test = train_test_split(df1,var_class1,test_size=0.05,random_state=0)
x_train.to_csv(filename[:-4]+'_xtrain.csv',index=False)
x_test.to_csv(filename[:-4]+'_xtest.csv',index=False)
y_train.to_csv(filename[:-4]+'_ytrain.csv',index=False,header=False)
y_test.to_csv(filename[:-4]+'_ytest.csv',index=False,header=False)


exists = any([compare_dicts(a,b={'test_size': 0.05, 
                                'random_state': 0, 
                                'data_size': str(df1.shape),
                                'scaling':'no_scaling',
                                'filename': filename,
                                 'class_weight':'balanced',
                                'model': 'sklearn.LogisticRegression'}) 
              for a in hyperparam_table])

logisticRegr = LogisticRegression(class_weight='balanced')
if not exists:
    hyperparam_table += [{'test_size': 0.05, 
                          'random_state': 0, 
                          'data_size': str(df1.shape),
                          'scaling':'no_scaling',
                          'filename': filename,
                          'class_weight':'balanced',
                        'model': 'sklearn.LogisticRegression'}]
        
    logisticRegr.fit(x_train,y_train)
    
    predictions_test = logisticRegr.predict(x_test)
    predictions_train = logisticRegr.predict(x_train)
    
    score = logisticRegr.score(x_test,y_test)
    hyperparam_table[-1]['test_score'] = score
    
    training_score = logisticRegr.score(x_train,y_train)
    hyperparam_table[-1]['train_score'] = training_score
    
    tn, fp, fn, tp = confusion_matrix(y_test,predictions_test).ravel()
    hyperparam_table[-1]['tn'] = tn
    hyperparam_table[-1]['fp'] = fp
    hyperparam_table[-1]['fn'] = fn
    hyperparam_table[-1]['tp'] = tp
    
    f1 = f1_score(y_test,predictions_test)
    hyperparam_table[-1]['f1_score'] = f1
    precision = precision_score(y_test,predictions_test)
    hyperparam_table[-1]['precision'] = precision
    recall = recall_score(y_test,predictions_test)
    hyperparam_table[-1]['recall'] = recall

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head()

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,class_weight
0,0.05,0,"(65188, 61)",min_max,data_cleanednlp4.csv,sklearn.LogisticRegression,0.743252,0.748224,2422,0,837,1,0.002384,1.0,0.001193,
1,0.05,0,"(65188, 61)",no_scaling,data_cleanednlp4.csv,sklearn.LogisticRegression,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,balanced
2,0.05,0,"(65188, 53)",no_scaling,data_cleaned4.csv,sklearn.LogisticRegression,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,balanced


In [22]:
x_train.head(20)

Unnamed: 0,POS,AF_ESP,AF_EXAC,AF_TGP,ORIGIN,SSR,STRAND,LoFtool,CADD_PHRED,CADD_RAW,CHROM_1,CHROM_2,CHROM_3,CHROM_4,CHROM_5,CHROM_6,CHROM_7,CHROM_8,CHROM_9,CHROM_10,CHROM_11,CHROM_12,CHROM_13,CHROM_14,CHROM_15,CHROM_16,CHROM_16.1,CHROM_17,CHROM_18,CHROM_19,CHROM_20,CHROM_21,CHROM_22,CHROM_MT,CHROM_X,CLNVC_Deletion,CLNVC_Duplication,CLNVC_Indel,CLNVC_Insertion,CLNVC_Inversion,CLNVC_Microsatellite,CLNVC_single_nucleotide_variant,INT,EX,SIFT_deleterious,SIFT_deleterious_low_confidence,SIFT_tolerated,SIFT_tolerated_low_confidence,SIFT_unknown,BIOTYPE_misc_RNA,BIOTYPE_protein_coding,BIOTYPE_unknown,Relative_Location
16318,84205724,0.0027,0.00204,0.0062,1.0,0.0,-1.0,0.523,3.778,0.115872,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0.309973
39036,103310860,0.0,1e-05,0.0,1.0,0.0,-1.0,0.0132,24.0,4.310761,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0.035398
52550,39912056,0.0005,0.00024,0.0006,1.0,0.0,-1.0,0.752,14.14,1.65257,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,1.025503
19664,131939132,0.0001,0.0,0.0,1.0,0.0,1.0,0.987,24.3,4.550019,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0.596799
58457,11230784,0.0,0.0,0.0,1.0,0.0,1.0,0.0737,26.0,5.396334,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0.722093
7982,152522801,0.0011,0.00054,0.0006,1.0,0.0,-1.0,0.995,34.0,7.291316,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0.241715
48346,23641441,0.0,0.0,0.0,1.0,0.0,-1.0,0.965,0.048,-0.777735,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0.571669
54578,59853875,0.0,0.0,0.0002,1.0,0.0,-1.0,0.64,33.0,6.999475,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0.97929
3437,215847775,0.0002,0.00053,0.0014,0.0,0.0,-1.0,0.924,10.23,0.926353,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0.863706
16769,184587424,0.0006,0.00425,0.0052,1.0,0.0,1.0,0.345058,10.66,0.99946,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0.064431


In [21]:
y_train.head(20)

16318    1
39036    0
52550    0
19664    0
58457    0
7982     1
48346    0
54578    0
3437     0
16769    0
40953    0
59651    1
41649    1
15824    0
36681    0
29340    1
35838    0
1132     0
17692    0
58953    0
Name: CLASS, dtype: int64

In [23]:
x_test.head(20)

Unnamed: 0,POS,AF_ESP,AF_EXAC,AF_TGP,ORIGIN,SSR,STRAND,LoFtool,CADD_PHRED,CADD_RAW,CHROM_1,CHROM_2,CHROM_3,CHROM_4,CHROM_5,CHROM_6,CHROM_7,CHROM_8,CHROM_9,CHROM_10,CHROM_11,CHROM_12,CHROM_13,CHROM_14,CHROM_15,CHROM_16,CHROM_16.1,CHROM_17,CHROM_18,CHROM_19,CHROM_20,CHROM_21,CHROM_22,CHROM_MT,CHROM_X,CLNVC_Deletion,CLNVC_Duplication,CLNVC_Indel,CLNVC_Insertion,CLNVC_Inversion,CLNVC_Microsatellite,CLNVC_single_nucleotide_variant,INT,EX,SIFT_deleterious,SIFT_deleterious_low_confidence,SIFT_tolerated,SIFT_tolerated_low_confidence,SIFT_unknown,BIOTYPE_misc_RNA,BIOTYPE_protein_coding,BIOTYPE_unknown,Relative_Location
35291,108121511,0.0,2e-05,0.0,1.0,0.0,1.0,0.782,0.018,-0.973085,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0.143979
17020,1279430,0.0012,0.00115,0.0008,1.0,0.0,-1.0,0.345058,11.68,1.186965,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0.620141
35770,108150311,0.0002,5e-05,0.0,1.0,0.0,1.0,0.782,3.043,0.046114,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0.368455
24790,128037043,0.0023,0.00223,0.0004,1.0,0.0,-1.0,0.0617,24.1,4.413189,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0.719844
38688,58145038,0.0,0.0,0.0,1.0,0.0,-1.0,0.303,0.041,-0.809819,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0.336634
40564,32906558,0.0,0.00037,0.0016,1.0,0.0,1.0,0.0896,0.098,-0.642699,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0.092159
26258,90967593,0.0,1e-05,0.0,1.0,0.0,-1.0,0.811,0.001,-2.362717,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0.582228
43759,68229462,0.0014,0.00148,0.001,1.0,0.0,-1.0,0.892,25.5,5.185729,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0.799134
42677,113760228,0.1628,0.20961,0.1909,1.0,0.0,1.0,0.0655,1.262,-0.170605,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0.0
13096,242707175,0.0009,0.0006,0.0002,1.0,0.0,1.0,0.0903,16.81,2.093566,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0.919386


In [24]:
y_test.head(20)

35291    0
17020    0
35770    0
24790    0
38688    1
40564    1
26258    1
43759    0
42677    0
13096    0
11868    0
30706    1
44731    0
51419    0
1279     0
59588    0
30080    1
15584    1
46461    0
19784    0
Name: CLASS, dtype: int64

In [25]:
filename = 'data_cleaned3.csv'
df1 =  pd.read_csv(filename)

#select only columns with int or float data types
df1 = df1.select_dtypes(['number'])
#drop any columns with null values
df1.dropna(axis=1,inplace=True)
#remove the class series
var_class1 = df1.pop('CLASS')

x_train,x_test,y_train,y_test = train_test_split(df1,var_class1,test_size=0.05,random_state=0)
x_train.to_csv(filename[:-4]+'_xtrain.csv',index=False)
x_test.to_csv(filename[:-4]+'_xtest.csv',index=False)
y_train.to_csv(filename[:-4]+'_ytrain.csv',index=False,header=False)
y_test.to_csv(filename[:-4]+'_ytest.csv',index=False,header=False)


exists = any([compare_dicts(a,b={'test_size': 0.05, 
                                'random_state': 0, 
                                'data_size': str(df1.shape),
                                'scaling':'no_scaling',
                                'filename': filename,
                                 'class_weight':'balanced',
                                'model': 'sklearn.LogisticRegression'}) 
              for a in hyperparam_table])

logisticRegr = LogisticRegression(class_weight='balanced')
if not exists:
    hyperparam_table += [{'test_size': 0.05, 
                          'random_state': 0, 
                          'data_size': str(df1.shape),
                          'scaling':'no_scaling',
                          'filename': filename,
                          'class_weight':'balanced',
                        'model': 'sklearn.LogisticRegression'}]
        
    logisticRegr.fit(x_train,y_train)
    
    predictions_test = logisticRegr.predict(x_test)
    predictions_train = logisticRegr.predict(x_train)
    
    score = logisticRegr.score(x_test,y_test)
    hyperparam_table[-1]['test_score'] = score
    
    training_score = logisticRegr.score(x_train,y_train)
    hyperparam_table[-1]['train_score'] = training_score
    
    tn, fp, fn, tp = confusion_matrix(y_test,predictions_test).ravel()
    hyperparam_table[-1]['tn'] = tn
    hyperparam_table[-1]['fp'] = fp
    hyperparam_table[-1]['fn'] = fn
    hyperparam_table[-1]['tp'] = tp
    
    f1 = f1_score(y_test,predictions_test)
    hyperparam_table[-1]['f1_score'] = f1
    precision = precision_score(y_test,predictions_test)
    hyperparam_table[-1]['precision'] = precision
    recall = recall_score(y_test,predictions_test)
    hyperparam_table[-1]['recall'] = recall

  interactivity=interactivity, compiler=compiler, result=result)


In [26]:
filename = 'data_cleaned4.csv'
df1 =  pd.read_csv(filename)

#select only columns with int or float data types
df1 = df1.select_dtypes(['number'])
#drop any columns with null values
df1.dropna(axis=1,inplace=True)
#remove the class series
var_class1 = df1.pop('CLASS')

x_train,x_test,y_train,y_test = train_test_split(df1,var_class1,test_size=0.05,random_state=0)
x_train.to_csv(filename[:-4]+'_xtrain.csv',index=False)
x_test.to_csv(filename[:-4]+'_xtest.csv',index=False)
y_train.to_csv(filename[:-4]+'_ytrain.csv',index=False,header=False)
y_test.to_csv(filename[:-4]+'_ytest.csv',index=False,header=False)


exists = any([compare_dicts(a,b={'test_size': 0.05, 
                                'random_state': 0, 
                                'data_size': str(df1.shape),
                                'scaling':'no_scaling',
                                'filename': filename,
                                'model': 'sklearn.LogisticRegression'}) 
              for a in hyperparam_table])

logisticRegr = LogisticRegression()
if not exists:
    hyperparam_table += [{'test_size': 0.05, 
                          'random_state': 0, 
                          'data_size': str(df1.shape),
                          'scaling':'no_scaling',
                          'filename': filename,
                        'model': 'sklearn.LogisticRegression'}]
        
    logisticRegr.fit(x_train,y_train)
    
    predictions_test = logisticRegr.predict(x_test)
    predictions_train = logisticRegr.predict(x_train)
    
    score = logisticRegr.score(x_test,y_test)
    hyperparam_table[-1]['test_score'] = score
    
    training_score = logisticRegr.score(x_train,y_train)
    hyperparam_table[-1]['train_score'] = training_score
    
    tn, fp, fn, tp = confusion_matrix(y_test,predictions_test).ravel()
    hyperparam_table[-1]['tn'] = tn
    hyperparam_table[-1]['fp'] = fp
    hyperparam_table[-1]['fn'] = fn
    hyperparam_table[-1]['tp'] = tp
    
    f1 = f1_score(y_test,predictions_test)
    hyperparam_table[-1]['f1_score'] = f1
    precision = precision_score(y_test,predictions_test)
    hyperparam_table[-1]['precision'] = precision
    recall = recall_score(y_test,predictions_test)
    hyperparam_table[-1]['recall'] = recall

  interactivity=interactivity, compiler=compiler, result=result)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [27]:
df_hyp = pd.DataFrame(hyperparam_table)
df_hyp.head()

Unnamed: 0,test_size,random_state,data_size,scaling,filename,model,test_score,train_score,tn,fp,fn,tp,f1_score,precision,recall,class_weight
0,0.05,0,"(65188, 61)",min_max,data_cleanednlp4.csv,sklearn.LogisticRegression,0.743252,0.748224,2422,0,837,1,0.002384,1.0,0.001193,
1,0.05,0,"(65188, 61)",no_scaling,data_cleanednlp4.csv,sklearn.LogisticRegression,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,balanced
2,0.05,0,"(65188, 53)",no_scaling,data_cleaned4.csv,sklearn.LogisticRegression,0.257055,0.251841,0,2422,0,838,0.40898,0.257055,1.0,balanced
3,0.05,0,"(65188, 47)",no_scaling,data_cleaned3.csv,sklearn.LogisticRegression,0.544479,0.532021,1344,1078,407,431,0.367277,0.28562,0.51432,balanced
4,0.05,0,"(65188, 53)",no_scaling,data_cleaned4.csv,sklearn.LogisticRegression,0.742945,0.748159,2422,0,838,0,0.0,0.0,0.0,


In [31]:
df3 = pd.read_csv('data_cleaned3.csv')
df3.describe(include='all')

Unnamed: 0,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDISDBINCL,CLNDN,CLNDNINCL,CLNHGVS,CLNSIGINCL,CLNVI,MC,ORIGIN,SSR,CLASS,Allele,Consequence,IMPACT,SYMBOL,Feature_type,Feature,BIOTYPE,EXON,INTRON,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,DISTANCE,STRAND,BAM_EDIT,PolyPhen,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62,CHROM_1,CHROM_2,CHROM_3,CHROM_4,CHROM_5,CHROM_6,CHROM_7,CHROM_8,CHROM_9,CHROM_10,CHROM_11,CHROM_12,CHROM_13,CHROM_14,CHROM_15,CHROM_16,CHROM_16.1,CHROM_17,CHROM_18,CHROM_19,CHROM_20,CHROM_21,CHROM_22,CHROM_MT,CHROM_X,CLNVC_Deletion,CLNVC_Duplication,CLNVC_Indel,CLNVC_Insertion,CLNVC_Inversion,CLNVC_Microsatellite,CLNVC_single_nucleotide_variant,SIFT_deleterious,SIFT_deleterious_low_confidence,SIFT_tolerated,SIFT_tolerated_low_confidence,SIFT_unknown,Length,Relative_Location
count,65188.0,65188,65188,65188.0,65188.0,65188.0,65188,76,65188,76,65188,76,27659,58219,65188.0,65188.0,65188.0,65188,65188,65188,65172,65174,65174,65188,65188.0,65188.0,56304.0,55233.0,65188.0,55184,55184,108.0,65188.0,31969,24796,2,2.0,2,2.0,60975.0,64096.0,64096.0,25593.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0
unique,,866,458,,,,9234,48,9260,54,65188,68,26289,89,,,,374,48,4,2328,2,2369,3,3265.0,1930.0,13970.0,13663.0,,1262,2220,,,2,4,2,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,,C,T,,,,MedGen:CN169374,MedGen:CN169374,not_specified,not_specified,NC_000014.8:g.102486364A>G,424764:Likely_pathogenic,UniProtKB_(protein):P04637,SO:0001583|missense_variant,,,,T,missense_variant,MODERATE,TTN,Transcript,NM_001267550.1,protein_coding,0.0,0.0,852.0,1.0,,A,cGg/cAg,,,OK,benign,FOXA1:MA0546.1,,N,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,,21798,20409,,,,5344,7,5344,7,1,2,124,23034,,,,19991,31444,33212,2765,65172,2765,65158,8893.0,56385.0,31.0,36.0,,2005,915,,,31707,13329,1,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,77575940.0,,,0.014511,0.014492,0.015263,,,,,,,,,1.15331,0.003666,0.252102,,,,,,,,,,,,1639.317971,,,825.731481,-0.006658,,,,1.0,,-0.08,0.345058,15.685616,2.554131,-0.402258,0.068325,0.132616,0.038795,0.0185,0.055885,0.03312,0.04398,0.02841,0.042094,0.035912,0.074339,0.042186,0.04059,0.030282,0.026462,0.042508,0.017733,0.082745,0.013745,0.059996,0.011643,0.012932,0.017411,0.000245,0.029545,0.038489,0.015862,0.003789,0.001457,0.000261,7.7e-05,0.940066,0.176413,0.011889,0.176167,0.016521,0.61901,3014.104434,0.796283
std,59740510.0,,,0.057795,0.059542,0.059527,,,,,,,,,5.099663,0.191803,0.434223,,,,,,,,,,,,4017.94395,,,1069.363315,0.999878,,,,0.0,,0.024042,0.361238,10.83635,2.961553,1.872684,0.252306,0.339162,0.193109,0.134753,0.2297,0.17895,0.205053,0.166143,0.200804,0.186071,0.262324,0.201014,0.197341,0.171363,0.160506,0.201746,0.131982,0.275499,0.116431,0.237481,0.107275,0.112981,0.130799,0.015665,0.169331,0.192374,0.124942,0.061439,0.038147,0.016147,0.008758,0.237367,0.381174,0.108386,0.380965,0.127471,0.485634,6739.867879,1.092935
min,961.0,,,0.0,0.0,0.0,,,,,,,,,0.0,0.0,0.0,,,,,,,,,,,,1.0,,,1.0,-1.0,,,,1.0,,-0.097,6.9e-05,0.001,-5.477391,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.0,0.000262
25%,32541790.0,,,0.0,0.0,0.0,,,,,,,,,1.0,0.0,0.0,,,,,,,,,,,,291.0,,,55.5,-1.0,,,,1.0,,-0.0885,0.0243,7.141,0.462951,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,655.0,0.304388
50%,57970210.0,,,0.0,4e-05,0.0,,,,,,,,,1.0,0.0,0.0,,,,,,,,,,,,743.0,,,469.0,-1.0,,,,1.0,,-0.08,0.157,14.09,1.642948,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1208.0,0.582577
75%,112745400.0,,,0.0012,0.00123,0.0016,,,,,,,,,1.0,0.0,1.0,,,,,,,,,,,,1639.317971,,,1415.0,1.0,,,,1.0,,-0.0715,0.71,24.1,4.381392,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2647.0,0.871622


In [32]:
df4 = pd.read_csv('data_cleaned4.csv')
df4.describe(include='all')

Unnamed: 0,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDISDBINCL,CLNDN,CLNDNINCL,CLNHGVS,CLNSIGINCL,CLNVI,MC,ORIGIN,SSR,CLASS,Allele,Consequence,IMPACT,SYMBOL,Feature_type,Feature,EXON,INTRON,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,DISTANCE,STRAND,BAM_EDIT,PolyPhen,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62,CHROM_1,CHROM_2,CHROM_3,CHROM_4,CHROM_5,CHROM_6,CHROM_7,CHROM_8,CHROM_9,CHROM_10,CHROM_11,CHROM_12,CHROM_13,CHROM_14,CHROM_15,CHROM_16,CHROM_16.1,CHROM_17,CHROM_18,CHROM_19,CHROM_20,CHROM_21,CHROM_22,CHROM_MT,CHROM_X,CLNVC_Deletion,CLNVC_Duplication,CLNVC_Indel,CLNVC_Insertion,CLNVC_Inversion,CLNVC_Microsatellite,CLNVC_single_nucleotide_variant,INT,EX,SIFT_deleterious,SIFT_deleterious_low_confidence,SIFT_tolerated,SIFT_tolerated_low_confidence,SIFT_unknown,BIOTYPE_misc_RNA,BIOTYPE_protein_coding,BIOTYPE_unknown,Length,Relative_Location
count,65188.0,65188,65188,65188.0,65188.0,65188.0,65188,76,65188,76,65188,76,27659,58219,65188.0,65188.0,65188.0,65188,65188,65188,65172,65174,65174,56295,8803,56304.0,55233.0,54027.0,55184,55184,108.0,65188.0,31969,24796,2,2.0,2,2.0,65188.0,65188.0,65188.0,25593.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,65188.0,64995.0,65188.0
unique,,866,458,,,,9234,48,9260,54,65188,68,26289,89,,,,374,48,4,2328,2,2369,3264,1929,13970.0,13663.0,,1262,2220,,,2,4,2,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,,C,T,,,,MedGen:CN169374,MedGen:CN169374,not_specified,not_specified,NC_000014.8:g.102486364A>G,424764:Likely_pathogenic,UniProtKB_(protein):P04637,SO:0001583|missense_variant,,,,T,missense_variant,MODERATE,TTN,Transcript,NM_001267550.1,16/16,47/362,852.0,1.0,,A,cGg/cAg,,,OK,benign,FOXA1:MA0546.1,,N,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,,21798,20409,,,,5344,7,5344,7,1,2,124,23034,,,,19991,31444,33212,2765,65172,2765,1129,93,31.0,36.0,,2005,915,,,31707,13329,1,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,77575940.0,,,0.014511,0.014492,0.015263,,,,,,,,,1.15331,0.003666,0.252102,,,,,,,,,,,1639.317971,,,825.731481,-0.006658,,,,1.0,,-0.08,0.345058,15.685616,2.554131,-0.402258,0.068325,0.132616,0.038795,0.0185,0.055885,0.03312,0.04398,0.02841,0.042094,0.035912,0.074339,0.042186,0.04059,0.030282,0.026462,0.042508,0.017733,0.082745,0.013745,0.059996,0.011643,0.012932,0.017411,0.000245,0.029545,0.038489,0.015862,0.003789,0.001457,0.000261,7.7e-05,0.940066,0.13504,0.863579,0.176413,0.011889,0.176167,0.016521,0.61901,0.000215,0.99954,0.000245,3014.104434,0.43145
std,59740510.0,,,0.057795,0.059542,0.059527,,,,,,,,,5.099663,0.191803,0.434223,,,,,,,,,,,4413.498307,,,1069.363315,0.999878,,,,0.0,,0.024042,0.34937,10.745203,2.936643,1.872684,0.252306,0.339162,0.193109,0.134753,0.2297,0.17895,0.205053,0.166143,0.200804,0.186071,0.262324,0.201014,0.197341,0.171363,0.160506,0.201746,0.131982,0.275499,0.116431,0.237481,0.107275,0.112981,0.130799,0.015665,0.169331,0.192374,0.124942,0.061439,0.038147,0.016147,0.008758,0.237367,0.341769,0.343238,0.381174,0.108386,0.380965,0.127471,0.485634,0.014653,0.021448,0.015665,6749.867496,0.340927
min,961.0,,,0.0,0.0,0.0,,,,,,,,,0.0,0.0,0.0,,,,,,,,,,,1.0,,,1.0,-1.0,,,,1.0,,-0.097,6.9e-05,0.001,-5.477391,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.0,0.0
25%,32541790.0,,,0.0,0.0,0.0,,,,,,,,,1.0,0.0,0.0,,,,,,,,,,,237.0,,,55.5,-1.0,,,,1.0,,-0.0885,0.0305,7.304,0.483561,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,655.0,0.096284
50%,57970210.0,,,0.0,4e-05,0.0,,,,,,,,,1.0,0.0,0.0,,,,,,,,,,,543.0,,,469.0,-1.0,,,,1.0,,-0.08,0.209,14.325,1.685374,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1196.0,0.414178
75%,112745400.0,,,0.0012,0.00123,0.0016,,,,,,,,,1.0,0.0,1.0,,,,,,,,,,,1231.0,,,1415.0,1.0,,,,1.0,,-0.0715,0.656,24.0,4.313898,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2607.5,0.723922


In [33]:
xtrain3 = pd.read_csv('data_cleaned3_xtrain.csv')
ytrain3 = pd.read_csv('data_cleaned3_ytrain.csv')
xtest3 = pd.read_csv('data_cleaned3_xtest.csv')
ytest3 = pd.read_csv('data_cleaned3_ytest.csv')

In [34]:
xtrain3.head()

Unnamed: 0,POS,AF_ESP,AF_EXAC,AF_TGP,ORIGIN,SSR,Protein_position,STRAND,CHROM_1,CHROM_2,CHROM_3,CHROM_4,CHROM_5,CHROM_6,CHROM_7,CHROM_8,CHROM_9,CHROM_10,CHROM_11,CHROM_12,CHROM_13,CHROM_14,CHROM_15,CHROM_16,CHROM_16.1,CHROM_17,CHROM_18,CHROM_19,CHROM_20,CHROM_21,CHROM_22,CHROM_MT,CHROM_X,CLNVC_Deletion,CLNVC_Duplication,CLNVC_Indel,CLNVC_Insertion,CLNVC_Inversion,CLNVC_Microsatellite,CLNVC_single_nucleotide_variant,SIFT_deleterious,SIFT_deleterious_low_confidence,SIFT_tolerated,SIFT_tolerated_low_confidence,SIFT_unknown,Length,Relative_Location
0,84205724,0.0027,0.00204,0.0062,1.0,0.0,115.0,-1.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,371.0,0.309973
1,103310860,0.0,1e-05,0.0,1.0,0.0,16.0,-1.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,452.0,0.035398
2,39912056,0.0005,0.00024,0.0006,1.0,0.0,764.0,-1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,745.0,1.025503
3,131939132,0.0001,0.0,0.0,1.0,0.0,783.0,1.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1312.0,0.596799
4,11230784,0.0,0.0,0.0,1.0,0.0,621.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,860.0,0.722093


In [36]:
ytrain3.head()

Unnamed: 0,1
0,0
1,0
2,0
3,0
4,1


In [37]:
xtest3.head()

Unnamed: 0,POS,AF_ESP,AF_EXAC,AF_TGP,ORIGIN,SSR,Protein_position,STRAND,CHROM_1,CHROM_2,CHROM_3,CHROM_4,CHROM_5,CHROM_6,CHROM_7,CHROM_8,CHROM_9,CHROM_10,CHROM_11,CHROM_12,CHROM_13,CHROM_14,CHROM_15,CHROM_16,CHROM_16.1,CHROM_17,CHROM_18,CHROM_19,CHROM_20,CHROM_21,CHROM_22,CHROM_MT,CHROM_X,CLNVC_Deletion,CLNVC_Duplication,CLNVC_Indel,CLNVC_Insertion,CLNVC_Inversion,CLNVC_Microsatellite,CLNVC_single_nucleotide_variant,SIFT_deleterious,SIFT_deleterious_low_confidence,SIFT_tolerated,SIFT_tolerated_low_confidence,SIFT_unknown,Length,Relative_Location
0,108121511,0.0,2e-05,0.0,1.0,0.0,440.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,3056.0,0.143979
1,1279430,0.0012,0.00115,0.0008,1.0,0.0,702.0,-1.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1132.0,0.620141
2,108150311,0.0002,5e-05,0.0,1.0,0.0,1126.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,3056.0,0.368455
3,128037043,0.0023,0.00223,0.0004,1.0,0.0,370.0,-1.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,514.0,0.719844
4,58145038,0.0,0.0,0.0,1.0,0.0,102.0,-1.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,303.0,0.336634


In [38]:
ytest3.head()

Unnamed: 0,0
0,0
1,0
2,0
3,1
4,1
