# Decision trees

The purpose of this chapter is to use the Python libraries provided by scikit-learn to solve the Adventure Works customer-targeting problem.
DATA SET: Bikebuyer.csv
The company's data warehouse, Adventure Works DW, contains a list of past customers with their demographic data (TargetMail), with attributes such as 'BikeBuyer', 'MaritalStatus', 'YearlyIncome', 'TotalChildren', 'ChildrenAtHome', 'HouseOwnerFlag', 'NumberCarsOwned' and 'Age'.

CLASS WORK
• Develop a decision tree using the CART algorithm with Python libraries. Determine a suitable ccp_alpha for pruning the tree.
• Browse the tree to see how records are classified according to their probability of buying a bicycle.
• Analyse the prediction performance of the tree model using a confusion matrix.
• Build and test a random forest model.
• Compare the performance of the two models using ROC and PR curves.


In [None]:
#Install updates (-U)

In [None]:
pip install -U scikit-learn

In [None]:
pip install -U yellowbrick

In [None]:
pip install -U graphviz

In [None]:
#IMPORT PYTHON MODULES

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
import pandas as pd
import numpy as np
import graphviz
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification #for bootstrapping
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler 

In [None]:
#READ AND EXPLORE DATA
#Load the Bikebuyer data with pandas read_csv. The path below points to a local (Mac) folder; for other operating systems see Chapter 6 of Wes McKinney's book "Python for Data Analysis".
#Alternative: upload the data file to the Jupyter home page and call pd.read_csv("Bikebuyer.csv")
csv_path="/Users/sajimathew/Documents/DoMS/Teaching/EMBA/DMBI/DMBI-2020/Data/Bikebuyer.csv"
data=pd.read_csv(csv_path)
#Predictor columns used for modelling.
#'MaritalStatus' is left out because the scikit-learn CART implementation does not support categorical variables.
feature_columns=['YearlyIncome','TotalChildren','ChildrenAtHome','HouseOwnerFlag','NumberCarsOwned','Age']
X=data[feature_columns]
#Optional sanity checks (uncomment as needed):
#type(X)
#np.shape(X) # dimension of dataframe
#X.dtypes
#X.head()
#data.isnull().sum().sum()
#Summary statistics of the loaded data (displayed as the cell output)
data.describe()

In [None]:
#DATA PREPARATION
#scikit-learn implements an optimised version of the CART algorithm, but that implementation does not support categorical variables for now.
#[https://scikit-learn.org/stable/modules/tree.html#tree-algorithms-id3-c4-5-c5-0-and-cart]

#Target variable y: the BikeBuyer column
y=data['BikeBuyer']
#Standardising the predictors is kept disabled (not recommended for this algorithm; integer data is preferred).
#scaler = StandardScaler()
#scaler.fit(X)
#X=scaler.transform(X)
#Hold out 33% of the rows as a test set; random_state fixes the shuffle so the split is repeatable
split_options=dict(test_size=0.33, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
#MODEL BUILDING
#Baseline decision tree "dt" built with scikit-learn's DecisionTreeClassifier: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
#Pre-pruning comes from the two min_samples_* limits; ccp_alpha=0.0 means no cost-complexity (post) pruning yet.
dt_params={
    'criterion': 'gini',
    'min_samples_split': 1200,
    'min_samples_leaf': 500,
    'ccp_alpha': 0.0,
}
dt=DecisionTreeClassifier(**dt_params)
#Train on the training split; the fitted estimator is shown as the cell output
dt.fit(X_train, y_train)

In [None]:
#TREE PRUNING USING CCP ALPHA
#Determining the Cost Complexity Parameter (ccp_alpha) for post-pruning the tree: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py
from sklearn.base import clone

#Candidate alphas (and the total leaf impurity at each) along the pruning path of the baseline tree dt
path = dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

#Fit one tree per candidate alpha.
#Each tree is a clone of dt, so it keeps dt's other hyper-parameters: the alphas above are the
#effective alphas of THAT tree and would not line up with a differently configured tree.
#The comprehension variable is deliberately not named dt, so the baseline model fitted earlier is not overwritten.
dts = [
    clone(dt).set_params(ccp_alpha=alpha, random_state=0).fit(X_train, y_train)
    for alpha in ccp_alphas
]

print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        dts[-1].tree_.node_count, ccp_alphas[-1]
    )
)
#The largest alpha usually prunes the tree down to a single node (the root).
#If the printed node count is 1, drop that trivial tree before plotting by uncommenting the two lines below:
#dts = dts[:-1]
#ccp_alphas = ccp_alphas[:-1]

In [None]:
#Tree size against alpha: helps to decide on the max depth parameter
node_counts = [fitted.tree_.node_count for fitted in dts]
depth = [fitted.tree_.max_depth for fitted in dts]
fig, ax = plt.subplots(2, 1)
#One panel per size measure: (axis, values, y-axis label, title)
panels = (
    (ax[0], node_counts, "number of nodes", "Number of nodes vs alpha"),
    (ax[1], depth, "depth of tree", "Depth vs alpha"),
)
for panel, sizes, y_label, heading in panels:
    panel.plot(ccp_alphas, sizes, marker="o", drawstyle="steps-post")
    panel.set_xlabel("alpha")
    panel.set_ylabel(y_label)
    panel.set_title(heading)
fig.tight_layout()

In [None]:
#Decide on ccp alpha based on training and test accuracy
#Mean accuracy of every pruned tree on each split
train_scores = [fitted.score(X_train, y_train) for fitted in dts]
test_scores = [fitted.score(X_test, y_test) for fitted in dts]

fig, ax = plt.subplots()
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
#Same step-style line for both splits so the two curves are directly comparable
for split_name, accuracies in (("train", train_scores), ("test", test_scores)):
    ax.plot(ccp_alphas, accuracies, marker="o", label=split_name, drawstyle="steps-post")
ax.legend()
plt.show()

In [None]:
#FINAL MODEL BUILDING
#Final CART model: same split limits as the baseline tree plus a depth cap of 10; ccp_alpha stays at 0.0 (no post-pruning)
cart_params={
    'criterion': 'gini',
    'max_depth': 10,
    'min_samples_split': 1200,
    'min_samples_leaf': 500,
    'ccp_alpha': 0.0,
}
dt_cart=DecisionTreeClassifier(**cart_params)
#Train on the training split; the fitted estimator is shown as the cell output
dt_cart.fit(X_train, y_train)

In [None]:
#VISUALIZING THE TREE USING GRAPHVIZ
#Export the fitted tree as DOT text (out_file=None returns the text as a string instead of writing a file)
dot_data = tree.export_graphviz(
    dt_cart,
    out_file=None,
    feature_names=X_train.columns,
    class_names=['NonBuyer','Buyer'],
    filled=True,
    rounded=True,
    special_characters=True,
)
#Render the DOT text; leaving the Source object as the last expression displays the tree in the notebook
graph = graphviz.Source(dot_data)
graph
#Note: the scikit-learn CART algorithm treats ordinal data as plain integers and splits on numeric thresholds. Categorical data is not advisable.

In [None]:
#MODEL PREDICTION AND PERFORMANCE
from sklearn.metrics import classification_report
#Predicted class for every row of the test split
y_pred=dt_cart.predict(X_test)
#ravel() flattens the 2x2 confusion matrix row by row: true negatives, false positives, false negatives, true positives
cart_confusion = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cart_confusion.ravel()
tn, fp, fn, tp
#Optional text report; target_names follow ascending class order, matching class_names=['NonBuyer','Buyer'] used for the tree plot
#print(classification_report(y_test, y_pred, target_names=['NonBuyer','Buyer']))

In [None]:
#RANDOM FOREST
from sklearn.metrics import classification_report

#No bootstrap data has to be generated by hand: RandomForestClassifier draws its own bootstrap
#sample of the training rows for every tree (bootstrap=True is its default).
#Do NOT call make_classification here and assign the result to X, y - that replaces the real
#Bikebuyer predictors/target with random synthetic rows for every later cell.
dt_rf=RandomForestClassifier(n_estimators=5, criterion='gini',
                          max_depth=None, min_samples_split=1200,
                          min_samples_leaf=500,
                          ccp_alpha=0.0)
dt_rf.fit(X_train, y_train)
#Predict the test split and unpack the 2x2 confusion matrix (row by row: tn, fp, fn, tp)
y_pred=dt_rf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp
#Optional text report; target_names follow ascending class order (0 = NonBuyer, 1 = Buyer)
#print(classification_report(y_test, y_pred, target_names=['NonBuyer','Buyer']))
#The F1 score can be interpreted as a harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. The relative contribution of precision and recall to the F1 score are equal. The formula for the F1 score is:

#F1 = 2 * (precision * recall) / (precision + recall) [Harmonic mean]

In [None]:
#ROC Curve and PR curves
#References
# 1. https://www.kaggle.com/code/nicholasgah/obtain-optimal-probability-threshold-using-roc
# 2. https://sklearn-evaluation.ploomber.io/en/latest/classification/micro_macro.html#:~:text=%2C%20%22C%22%5D-,Micro%2Daverage%20approach,(FNs)%20of%20the%20model.
# 3. https://scikit-learn.org/stable/modules/model_evaluation.html#roc-metrics
# 4. https://scikit-learn.org/stable/visualizations.html


from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
#metrics.plot_roc_curve / metrics.plot_precision_recall_curve no longer exist in current scikit-learn
#(deprecated in 1.0, removed in 1.2). The Display.from_estimator constructors replace them: they draw
#the curve and return a display object whose .plot(ax=...) can be reused in another figure.
roc_cart=RocCurveDisplay.from_estimator(dt_cart, X_test, y_test)
roc_rf=RocCurveDisplay.from_estimator(dt_rf, X_test, y_test)
#plt.plot([0,1], [0,1], linestyle="--")
pr_cart=PrecisionRecallDisplay.from_estimator(dt_cart, X_test, y_test)
pr_rf=PrecisionRecallDisplay.from_estimator(dt_rf, X_test, y_test)
#Raw ROC points for the CART model, scored with the predicted probability of class 1
fpr, tpr, cutoffs = roc_curve(y_test, dt_cart.predict_proba(X_test)[:, 1])
cutoffs
#Each threshold is a leaf-node probability, so there is roughly one threshold per distinct leaf probability
#(roc_curve also prepends one starting threshold and may drop intermediate points).

In [None]:
#Side-by-side view: ROC curves on the left axis, PR curves on the right axis

fig, axes = plt.subplots(1, 2, figsize=(12, 8))
ax1, ax2 = axes
#Redraw the stored CART and random-forest curves onto the shared axes, CART first
for roc_curve_plot in (roc_cart, roc_rf):
    roc_curve_plot.plot(ax=ax1)
for pr_curve_plot in (pr_cart, pr_rf):
    pr_curve_plot.plot(ax=ax2)
plt.show()

In [None]:
#CROSS VALIDATION

#k-fold cross-validation with cross_validate: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
#by default splits into five folds
from sklearn import model_selection
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
#Cross-validate the final CART model (dt_cart) on the TRAINING split, so the held-out test split stays unseen.
#cross_validate fits a fresh clone of the estimator on each fold, so dt_cart itself is not refitted.
scores = cross_validate(dt_cart, X_train, y_train)
#One score per fold (the estimator's default score, i.e. accuracy for a classifier)
k=scores['test_score']
k


In [None]:
#K-fold
from sklearn.base import clone

#Rebuild predictors and target from the raw data frame, so this cell always cross-validates the real
#Bikebuyer records regardless of what X and y currently hold.
X_all=data[['YearlyIncome','TotalChildren','ChildrenAtHome','HouseOwnerFlag','NumberCarsOwned','Age']]
y_all=data['BikeBuyer']

#NOTE(review): 500 folds means 500 model fits and very small test folds - confirm this is intended (5 or 10 is the usual choice).
kf = KFold(n_splits=500)
scores=[]
for train_index, test_index in kf.split(X_all, y_all):
    #KFold yields integer row positions, so select with .iloc (plain [] on a DataFrame looks up column labels).
    #Fold-local names keep the X_train/X_test/y_train/y_test hold-out split from earlier cells intact.
    X_fold_train, X_fold_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_fold_train, y_fold_test = y_all.iloc[train_index], y_all.iloc[test_index]
    #Fit a fresh copy of the final CART model on every fold instead of refitting a shared estimator in place
    fold_model = clone(dt_cart).fit(X_fold_train, y_fold_train)
    scores.append(fold_model.score(X_fold_test, y_fold_test))
#Accuracy of each fold (displayed as the cell output)
scores