In [158]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

import pickle

In [159]:
# Read the dataset, taking the CSV's first column as the row index.
df = pd.read_csv('encoded.csv', index_col=0)

# Plain-text preview of the first five rows.
print(df.head(n=5))

   site_id  patient_id    image_id laterality view   age  cancer  biopsy   
0        1        8785  1050149859          L   CC  63.0       0       1  \
1        2       17834  2118622197          L  MLO  47.0       0       1   
2        1       32614  2016537179          R   CC  41.0       0       1   
3        2       41936  1901427644          L  MLO  53.0       0       1   
4        1       51622  1419380850          L  MLO  70.0       0       1   

   invasive  BIRADS  implant density  machine_id  difficult_negative_case   
0         0     0.0        0       B          49                     True  \
1         0     NaN        0     NaN          29                     True   
2         0     0.0        0       B          49                     True   
3         0     NaN        0     NaN          29                     True   
4         0     0.0        0       C          49                     True   

             path  
0  [[0.51231754]]  
1  [[0.34303793]]  
2   [[0.5469943]]  


In [160]:
# Remove the columns that are not used in the rest of this notebook.
df = df.drop(
    columns=[
        'site_id',
        'patient_id',
        'image_id',
        'machine_id',
        'BIRADS',
        'difficult_negative_case',
    ]
)

In [161]:
# Encode the `density` column numerically: letter grades A-D become 1-4 and
# missing values become 0.
# The explicit float cast pins the resulting dtype instead of relying on
# pandas silently downcasting the object column after `replace` (that
# downcasting is deprecated in recent pandas releases). It also raises right
# here if a category that is not in the mapping is still present.
df['density'] = (
    df['density']
    .replace({'B': 2, 'A': 1, 'C': 3, 'D': 4, np.nan: 0})
    .astype(float)
)
# Show the distinct encoded values as a sanity check.
df['density'].unique()

array([2., 0., 3., 1., 4.])

In [162]:
def _bracketed_to_float(text):
    """Convert a string such as '[[0.5]]' to a float by slicing off two characters at each end."""
    return float(text[2:-2])


# Replace the bracketed string representation in `path` with a plain float.
df['path'] = df['path'].map(_bracketed_to_float)
df.head()

Unnamed: 0,laterality,view,age,cancer,biopsy,invasive,implant,density,path
0,L,CC,63.0,0,1,0,0,2.0,0.512318
1,L,MLO,47.0,0,1,0,0,0.0,0.343038
2,R,CC,41.0,0,1,0,0,2.0,0.546994
3,L,MLO,53.0,0,1,0,0,0.0,0.194642
4,L,MLO,70.0,0,1,0,0,3.0,0.680472


In [163]:
# Integer-encode the two categorical string columns.
# `pd.to_numeric` makes the numeric dtype explicit rather than depending on
# pandas silently downcasting the object column after `replace` (deprecated
# in recent pandas releases). It also raises here, instead of later during
# model fitting, if a code that is missing from the mapping is still present.
df['laterality'] = pd.to_numeric(
    df['laterality'].replace({'L': 0, 'R': 1})
)
df['view'] = pd.to_numeric(
    df['view'].replace({'CC': 0, 'MLO': 1, 'ML': 2, 'AT': 3})
)

In [164]:
# Display the column labels that remain after the preprocessing cells above.
df.columns

Index(['laterality', 'view', 'age', 'cancer', 'biopsy', 'invasive', 'implant',
       'density', 'path'],
      dtype='object')

In [165]:
# Hold out 30% of the rows for evaluation. The split is stratified on the
# `cancer` column and uses a fixed seed so it is repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.30,
    random_state=2018,
    stratify=df[['cancer']],
)

In [166]:
# Features are every column except the `cancer` label; the label is kept
# as a separate Series for each partition.
# NOTE(review): `biopsy` and `invasive` stay in the feature set — they may
# encode information about the label (possible target leakage); confirm
# they are available at prediction time.
X_train = train_df.drop(columns='cancer')
Y_train = train_df['cancer']

X_test = val_df.drop(columns='cancer')
Y_test = val_df['cancer']

In [167]:
# Fit a decision tree on the training partition.
# A fixed `random_state` makes the fitted tree (and therefore the reported
# accuracy and the pickled model) reproducible across runs; without it,
# scikit-learn permutes features randomly at each split and ties can be
# broken differently. The seed matches the one used for the train/val split.
classifier = DecisionTreeClassifier(random_state=2018)
classifier.fit(X_train, Y_train)

# Predict labels for the held-out rows.
pred = classifier.predict(X_test)


In [168]:
# Fraction of held-out rows whose predicted label equals the true label.
acc = accuracy_score(y_true=Y_test, y_pred=pred)
acc

accuracy :  0.9280575539568345


In [169]:
# Serialize the fitted classifier to disk.
# The `with` block guarantees the file handle is flushed and closed even if
# pickling fails; the original left the handle returned by `open` unclosed.
filename = 'decision_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)