In [1]:
import os
import warnings
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from matplotlib import pyplot as plt
from sklearn import tree
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
destination =  os.environ.get('LR_Destination')
ChurnData = pd.read_csv(str(destination + '/' + 'WA_Fn-UseC_-Telco-Customer-Churn.csv'))

Although Decision Trees can train on complex non-linear relationships to determine the relationship between a feature and a  target variable, it can't understand categorical features that are expressed through strings like 'Yes' or 'No' / 'Female' or 'Male' so we must map to these to numerical values without necessarily needing to create indicator variables unlike with building a logistic regression model.

### Data Transformation

In [3]:
#First drop off customerID column
if 'customerID' in ChurnData.columns.values:
     ChurnData = ChurnData.drop(['customerID'],axis=1)
else:
     1==1

#Converting Total Charges to Float
ChurnData['TotalCharges'] = pd.to_numeric(ChurnData['TotalCharges'], errors='coerce').fillna(pd.to_numeric(ChurnData['TotalCharges'], errors='coerce').mean())


unique_values_dict = {}
for cols in ChurnData.columns.values:
    if ChurnData[cols].dtypes != 'int64' and ChurnData[cols].dtypes != 'float64':
          unique_values_dict[cols] = ChurnData[cols].unique()

In [4]:
for i in unique_values_dict:
    remapped_values = {}
    names_and_remapped_values = {}

    #for i in range of length of values in key:values for gender
    for iterator in range(len(unique_values_dict[i])):
        remapped_values[unique_values_dict[i][iterator]] = iterator

        names_and_remapped_values[i] = remapped_values 

    for key in names_and_remapped_values:
        ChurnData[key] = ChurnData[key].map(names_and_remapped_values[key])

### Hyperparameter Tuning (Helps us find optimal parameters for the ML Model, can be beneficial for potential overfitting)

In [5]:
from sklearn.datasets import make_classification

# ChurnData = ChurnData.dropna()
Churn_X = ChurnData.iloc[:, 0:19].to_numpy()
Churn_y = ChurnData['Churn'].to_numpy()

CD_X_train, CD_X_test, CD_y_train, CD_y_test = train_test_split(
    Churn_X, Churn_y, test_size=0.2, random_state=42
)

array = [i for i in range(1,11)]
 
param_dist = {
    "max_depth": array,
    "max_features": array,
    "min_samples_leaf": array,
    "criterion": ["gini", "entropy"]
}

model = DecisionTreeClassifier()

rs = RandomizedSearchCV(model, param_dist, cv=5)
rs.fit(CD_X_train, CD_y_train)

print("Tuned Decision Tree Parameters (RandomizedSearchCV): {}".format(rs.best_params_))
print("Best score is {}".format(rs.best_score_))

grid_search = GridSearchCV(estimator= DecisionTreeClassifier(), param_grid= param_dist, cv=3, scoring='accuracy')
grid_search.fit(CD_X_train, CD_y_train)

print("Tuned Decision Tree Parameters (Grid_Search): {}".format(grid_search.best_params_))
print("Best score is {}".format(grid_search.best_score_))

model.fit(CD_X_train, CD_y_train)
Churn_y_pred = model.predict(CD_X_test)
print('Accuracy for default (Decision Tree): ' + str(accuracy_score(CD_y_test, Churn_y_pred)))

Tuned Decision Tree Parameters (RandomizedSearchCV): {'min_samples_leaf': 6, 'max_features': 4, 'max_depth': 9, 'criterion': 'gini'}
Best score is 0.779730370795318
Tuned Decision Tree Parameters (Grid_Search): {'criterion': 'entropy', 'max_depth': 6, 'max_features': 10, 'min_samples_leaf': 10}
Best score is 0.7921547745828897
Accuracy for default (Decision Tree): 0.7388218594748048


### Decision Tree Building

In [6]:
CD_X_train, CD_X_test, CD_y_train, CD_y_test = train_test_split(
    Churn_X, Churn_y, test_size=0.2, random_state=42
)

model = DecisionTreeClassifier()

model.fit(CD_X_train, CD_y_train)

Churn_y_pred = model.predict(CD_X_test)

round(accuracy_score(CD_y_test, Churn_y_pred)*100, 2)

print('Accuracy: ' + str(round(accuracy_score(CD_y_test, Churn_y_pred)*100, 2)) + '%')

Accuracy: 73.24%


In [7]:
print("Confusion matrix: \n", confusion_matrix(CD_y_test, Churn_y_pred))
print("\nClassification report: \n", classification_report(CD_y_test, Churn_y_pred))

Confusion matrix: 
 [[843 193]
 [184 189]]

Classification report: 
               precision    recall  f1-score   support

           0       0.82      0.81      0.82      1036
           1       0.49      0.51      0.50       373

    accuracy                           0.73      1409
   macro avg       0.66      0.66      0.66      1409
weighted avg       0.73      0.73      0.73      1409

