# Decision Tree for p2p lending data

Entropy Formula:             $ Entropy(p) = -\sum_{i=1}^{N}p_{i}\log_{2}{p_{i}} $  
CART Gini Index              $Gini(D)=1-\sum_{i=1}^{N}p_{i}^{2} $

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import manifold, datasets
from sklearn import tree 
import graphviz 
from sklearn.metrics import accuracy_score as acc_rate 
from sklearn.model_selection import train_test_split

In [2]:
p2p_data = pd.read_csv('./p2p.csv', index_col=0)

In [3]:
p2p_data.head()

Unnamed: 0,ratio001,ratio002,ratio003,ratio004,ratio005,ratio006,ratio008,ratio011,ratio012,ratio017,...,DIO,DPO,DSO,turnover,status,nace,ratio036,ratio037,ratio039,ratio040
1,-0.238174,-0.155186,-0.124784,-0.422984,-0.441998,-0.281685,-0.476657,-3.467956,-3.405227,-1.297426,...,-0.297901,-0.462928,-0.722769,-0.381662,1,4635,1,1,1,1
2,-0.758472,-0.356497,-0.65906,-0.076745,0.260873,-0.515998,-1.119889,1.75463,0.921839,4.328932,...,-0.317154,-0.279493,-0.198444,4.591706,1,4791,1,1,1,1
3,-0.333616,-0.207378,0.508892,-0.263181,-0.49691,-0.276421,-0.372174,-0.547247,-0.088445,1.102322,...,-0.224098,-0.268703,-0.603186,0.408384,1,4752,1,1,1,1
4,-0.444811,-0.478278,-1.05666,0.145204,-0.255298,-0.239247,-0.424368,-3.467956,0.06405,1.449654,...,-0.166338,-0.398186,-0.685974,0.094786,1,4771,1,1,1,1
5,-0.479096,-0.523014,-1.156061,-0.902393,-0.749504,-0.371168,-0.400809,1.75463,-0.107507,0.039276,...,-0.320363,-0.1608,0.638637,0.0409,1,3212,1,1,1,1


In [4]:
X = p2p_data.drop("status",axis=1)
y = p2p_data["status"]

In [5]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
clf = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', ) 
clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier(criterion='entropy')

In [6]:
import os
os.environ["PATH"] += os.pathsep + 'D:/Program Files (x86)/Graphviz2.38/bin/'

In [7]:
# make prediction 
print('\nThe target test data set is:\n', y_test) 
print('\nThe predicted result is:\n', clf.predict(X_test)) 
print('\nAccuracy rate is:\n',acc_rate(y_test, clf.predict(X_test))) 
# visualizing the tree 
dot_data = tree.export_graphviz(clf, out_file=None,feature_names=list(X_train.columns.values), class_names=list("status"), filled=True, rounded=True, impurity=False, special_characters=True) 
graph = graphviz.Source(dot_data) 
graph.render("p2p_decision tree") 


The target test data set is:
 928      1
8848     0
12383    0
8237     0
10986    0
        ..
13690    0
13540    0
1959     0
10725    0
6760     0
Name: status, Length: 4965, dtype: int64

The predicted result is:
 [0 0 0 ... 0 0 0]

Accuracy rate is:
 0.8702920443101712


'p2p_decision tree.pdf'

# Decision Tree for mnist data

In [8]:
import tensorflow as tf 
(train_x, train_y), (test_x, test_y) = tf.keras.datasets.mnist.load_data(path='mnist.npz')


In [9]:
train_x.shape

(60000, 28, 28)

In [10]:
test_x.shape

(10000, 28, 28)

In [11]:
train_x = train_x.reshape(60000,28*28)
test_x = test_x.reshape(10000,28*28)
clf = tree.DecisionTreeClassifier(criterion='entropy',splitter='best')
clf.fit(X=train_x, y=train_y)

DecisionTreeClassifier(criterion='entropy')

In [12]:
print('\nThe target test data set is:\n', test_x)
print('\nThe predicted result is:\n', clf.predict(test_x))
print('\nAccuracy rate is:\n', acc_rate(test_y, clf.predict(test_x)))
dot_data = tree.export_graphviz(clf,
                                filled=True,
                                rounded=True,
                                impurity=False,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("mnist", view = True)


The target test data set is:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

The predicted result is:
 [7 8 1 ... 4 6 6]

Accuracy rate is:
 0.8849


'mnist.pdf'

In [13]:
train_x = train_x.reshape(60000,28*28)
test_x = test_x.reshape(10000,28*28)
clf = tree.DecisionTreeClassifier(criterion='gini',splitter='best')
clf.fit(X=train_x, y=train_y)
print('\nThe target test data set is:\n', test_x)
print('\nThe predicted result is:\n', clf.predict(test_x))
print('\nAccuracy rate is:\n', acc_rate(test_y, clf.predict(test_x)))
dot_data = tree.export_graphviz(clf,
                                filled=True,
                                rounded=True,
                                impurity=False,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("mnist1", view = True)


The target test data set is:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

The predicted result is:
 [7 2 1 ... 4 5 6]

Accuracy rate is:
 0.8796


'mnist1.pdf'