In [63]:
import os
import warnings
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from matplotlib import pyplot as plt
from sklearn import tree
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

In [64]:
destination =  os.environ.get('LR_Destination')
ChurnData = pd.read_csv(str(destination + '/' + 'WA_Fn-UseC_-Telco-Customer-Churn.csv'))

Although Decision Trees can train on complex non-linear relationships to determine the relationship between a feature and a  target variable, it can't understand categorical features that are expressed through strings like 'Yes' or 'No' / 'Female' or 'Male' so we must map to these to numerical values without necessarily needing to create indicator variables unlike with building a logistic regression model.

### Data Transformation

In [65]:
#First drop off customerID column
if 'customerID' in ChurnData.columns.values:
     ChurnData = ChurnData.drop(['customerID'],axis=1)
else:
     1==1

#Converting Total Charges to Float
ChurnData['TotalCharges'] = pd.to_numeric(ChurnData['TotalCharges'], errors='coerce').fillna(pd.to_numeric(ChurnData['TotalCharges'], errors='coerce').mean())


unique_values_dict = {}
for cols in ChurnData.columns.values:
    if ChurnData[cols].dtypes != 'int64' and ChurnData[cols].dtypes != 'float64':
          unique_values_dict[cols] = ChurnData[cols].unique()

In [66]:
for i in unique_values_dict:
    remapped_values = {}
    names_and_remapped_values = {}

    #for i in range of length of values in key:values for gender
    for iterator in range(len(unique_values_dict[i])):
        remapped_values[unique_values_dict[i][iterator]] = iterator

        names_and_remapped_values[i] = remapped_values 

    for key in names_and_remapped_values:
        ChurnData[key] = ChurnData[key].map(names_and_remapped_values[key])

### Hyperparameter Tuning (Helps us find optimal parameters for the ML Model, can be beneficial for potential overfitting)

In [67]:
from sklearn.datasets import make_classification

# ChurnData = ChurnData.dropna()
Churn_X = ChurnData.iloc[:, 0:19].to_numpy()
Churn_y = ChurnData['Churn'].to_numpy()
 
param_dist = {
    "max_depth": [3, None],
    "max_features": randint(1, 9),
    "min_samples_leaf": randint(1, 9),
    "criterion": ["gini", "entropy"]
}

model = DecisionTreeClassifier()
model_cv = RandomizedSearchCV(model, param_dist, cv=5)
 
model_cv.fit(Churn_X, Churn_y)

print("Tuned Decision Tree Parameters: {}".format(model_cv.best_params_))
print("Best score is {}".format(model_cv.best_score_))


Tuned Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': 3, 'max_features': 6, 'min_samples_leaf': 4}
Best score is 0.7743874604813213


### Decision Tree Building

In [68]:
CD_X_train, CD_X_test, CD_y_train, CD_y_test = train_test_split(
    Churn_X, Churn_y, test_size=0.2, random_state=42
)

model = DecisionTreeClassifier(criterion = 'gini', max_depth=None, max_features = 7, min_samples_leaf = 6).fit(CD_X_train, CD_y_train)

Churn_y_pred = model.predict(CD_X_test)

round(accuracy_score(CD_y_test, Churn_y_pred)*100, 2)

print('Accuracy: ' + str(round(accuracy_score(CD_y_test, Churn_y_pred)*100, 2)) + '%')

Accuracy: 78.42%


In [69]:
print("Confusion matrix: \n", confusion_matrix(CD_y_test, Churn_y_pred))
print("\nClassification report: \n", classification_report(CD_y_test, Churn_y_pred))

Confusion matrix: 
 [[934 102]
 [202 171]]

Classification report: 
               precision    recall  f1-score   support

           0       0.82      0.90      0.86      1036
           1       0.63      0.46      0.53       373

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.69      1409
weighted avg       0.77      0.78      0.77      1409

