# Decision Tree

In [2]:
# Import necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score  ## evaluate the model

## Import data

In [5]:
# Read file
from deezerData import readData
df, X, y, X_train, X_test, y_train, y_test = readData()
# Making arrays
X, y, X_train, X_test, y_train, y_test = X.values, y.values, X_train.values, X_test.values, y_train.values, y_test.values

## Hyperparameter tunning

For the decision tree technique, we tune these following parameters:
   * Maximum depth of the tree
   * Maximum features
   * Minimum samples leaf
   * Criterion

In [10]:
# Setup the parameters and distributions to sample from: param_dist
param_dist = {"max_depth": [5, None],
              "max_features": randint(1, 15),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}

# Instantiate a Decision Tree classifier: tree
tree = DecisionTreeClassifier()

# Instantiate the RandomizedSearchCV object: tree_cv
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5)

# Fit it to the data
tree_cv.fit(X_train, y_train)

# Print the tuned parameters and score
print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))

# Predict
y_pred = tree_cv.predict(X_test)

Tuned Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_leaf': 5}
Best score is 0.7527570114884445


## Evaluation

In [8]:
# Compute and print the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Accuracy of decision tree: {}".format(tree_cv.score(X_test, y_test)))

# Compute and print AUC score
print("AUC of decision tree: {}".format(roc_auc_score(y_test, y_pred)))

[[ 551  296]
 [ 302 1436]]
              precision    recall  f1-score   support

           0       0.65      0.65      0.65       847
           1       0.83      0.83      0.83      1738

    accuracy                           0.77      2585
   macro avg       0.74      0.74      0.74      2585
weighted avg       0.77      0.77      0.77      2585

Accuracy of decision tree: 0.7686653771760155
AUC of decision tree: 0.7383841704900393
