In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# ------------------------------ OutcomeG part ------------------------------
# Load the OutcomeG training table and, in the same expression, discard the
# identifier, status-flag and raw text/age columns that are not used below.
train = pd.read_csv("/Users/derri/Onedrive/ml/train2_outcomeG.csv").drop(
    columns=[
        'AnimalID',
        'Aggressive', 'At Vet', 'Barn', 'Behavior',
        'Court/Investigation', 'Enroute', 'Foster', 'In Foster', 'In Kennel',
        'In Surgery', 'Medical', 'Offsite', 'Partner', 'Rabies Risk', 'SCRP',
        'Suffering',
        'BreedName', 'color',
        'agenumber', 'ageperiod', 'age',
        'outcome',
    ]
)

from sklearn.model_selection import train_test_split
# Feature matrix: the first 312 columns of the frame, taken positionally.
# NOTE(review): 312 is a hard-coded column count; confirm it still matches the CSV layout.
X = train.values[:, :312]
# Label vector: the 'Target' column.
Y = train['Target'].values
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)


from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
#---------------------------Grid Search---------------------------------------------------
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the decision tree (candidate values unchanged).
# NOTE(review): min_samples_leaf skips 18 -- confirm whether that is intentional.
tree_para = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50,
                  70, 80, 90, 100, 110, 120, 130, 140, 150],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                         16, 17, 19, 20],
}

# Tune on the training split only. Fitting the search on the full X/Y would let
# the held-out test rows influence the chosen hyper-parameters, which makes the
# test-set metrics reported later optimistically biased.
# random_state pins the tree's internal tie-breaking so the search is reproducible
# (same seed as the final entropy model).
clf = GridSearchCV(DecisionTreeClassifier(random_state=100), tree_para, cv=5)
clf.fit(X_train, Y_train)
clf.best_params_
# A previous run (fitted on the full data set) reported:
# criterion=entropy, max_depth=15, min_samples_leaf=11.
# Re-check this output before reusing those values in the model cell.

In [None]:
#-----------------------------------------Entropy Decision Tree-------------------------------
# Train a decision tree with the entropy criterion on the training split,
# evaluate it on the held-out split, and plot its confusion matrix.
clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100, max_depth=15, min_samples_leaf=11)
clf_entropy.fit(X_train, Y_train)

#%%-----------------------------------------------------------------------
# Predictions on the held-out test split.
Y_pred_entropy = clf_entropy.predict(X_test)

#%%-----------------------------------------------------------------------
# Metrics for the entropy model.
print("\n")
print("Results Using Entropy: \n")
print("Classification Report: ")
print(classification_report(Y_test,Y_pred_entropy))
print("\n")
print("Accuracy : ", accuracy_score(Y_test, Y_pred_entropy) * 100)
print ('-'*80 + '\n')

#%%-----------------------------------------------------------------------
# Confusion matrix for the entropy model.
# Rows/columns are pinned to the classifier's own (sorted) class order so the
# axis labels built below are guaranteed to line up with the matrix.
conf_matrix = confusion_matrix(Y_test, Y_pred_entropy, labels=clf_entropy.classes_)

# Display names must follow the same sorted class order as the matrix (and as
# export_graphviz expects later). `unique()` returns values in order of first
# appearance, which can mislabel the axes, so map each encoded class to its name.
# NOTE(review): assumes the 'target' column holds the display name for each
# 'Target' value (one name per class) -- confirm against the CSV.
label_to_name = dict(zip(train['Target'], train['target']))
class_names = [label_to_name[label] for label in clf_entropy.classes_]
df_cm = pd.DataFrame(conf_matrix, index=class_names, columns=class_names )

plt.figure(figsize=(5,5))
hm = sns.heatmap(df_cm, cbar=False, annot=True, square=True, fmt='d', annot_kws={'size': 20},
                 yticklabels=df_cm.columns, xticklabels=df_cm.columns)
hm.yaxis.set_ticklabels(hm.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=20)
hm.xaxis.set_ticklabels(hm.xaxis.get_ticklabels(), rotation=0, ha='right', fontsize=20)
plt.ylabel('True label',fontsize=20)
plt.xlabel('Predicted label',fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
# display decision tree
from pydotplus import graph_from_dot_data
from sklearn.tree import export_graphviz
import webbrowser

In [None]:
from IPython.display import Image

In [None]:
# Export the fitted tree to Graphviz DOT and render it inline as a PNG.
# Feature names are taken from the same leading columns that were sliced into X,
# so their count always matches the number of features the tree was fitted on
# (the previous `iloc[:, :-2]` slice only agreed with X for one specific frame width).
# Both name sequences are passed as plain lists, the documented input type.
dot_data = export_graphviz(clf_entropy, filled=True, rounded=True, class_names=list(class_names),
                           feature_names=list(train.columns[:X.shape[1]]), out_file=None)

graph = graph_from_dot_data(dot_data)
Image(graph.create_png())

In [None]:
# List every feature with its importance (rounded to 5 decimals), highest first.
importances = clf_entropy.feature_importances_
# zip() stops at the shorter input, so only the leading columns that have an
# importance score are paired up.
ranked = [(round(score, 5), column) for score, column in zip(importances, train.columns)]
ranked.sort(reverse=True)
for score, column in ranked:
    print(f"{column}: {score!s}")

from sklearn.metrics import roc_curve,auc

# ROC curve for the entropy tree on the held-out split.
# Column 1 of predict_proba is used as the score passed to roc_curve.
predictions = clf_entropy.predict_proba(X_test)
false_positive_rate, true_positive_rate, _ = roc_curve(Y_test, predictions[:, 1])
roc_auc = auc(false_positive_rate, true_positive_rate)

# Draw on the current axes (the same target the pyplot state machine would use).
ax = plt.gca()
ax.set_title('Receiver Operating Characteristic')
ax.plot(false_positive_rate, true_positive_rate, 'b', label=f'AUC = {roc_auc:0.2f}')
ax.legend(loc='lower right')
# Dashed diagonal as a visual reference line.
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([-0.1, 1.2])
ax.set_ylim([-0.1, 1.2])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()