In [102]:
import os
import warnings
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split
from matplotlib import pyplot as plt
from sklearn import tree

In [103]:
destination =  os.environ.get('LR_Destination')
ChurnData = pd.read_csv(str(destination + '/' + 'WA_Fn-UseC_-Telco-Customer-Churn.csv'))

Although Decision Trees can train on complex non-linear relationships to determine the relationship between a feature and a  target variable, it can't understand categorical features that are expressed through strings like 'Yes' or 'No' / 'Female' or 'Male' so we must map to these to numerical values without necessarily needing to create indicator variables unlike with building a logistic regression model.

### Data Transformation

In [104]:
#First drop off customerID column
if 'customerID' in ChurnData.columns.values:
     ChurnData = ChurnData.drop(['customerID'],axis=1)
else:
     1==1

#Converting Total Charges to Float
ChurnData['TotalCharges'] = pd.to_numeric(ChurnData['TotalCharges'], errors='coerce').fillna(pd.to_numeric(ChurnData['TotalCharges'], errors='coerce').mean())


unique_values_dict = {}
for cols in ChurnData.columns.values:
    if ChurnData[cols].dtypes != 'int64' and ChurnData[cols].dtypes != 'float64':
          unique_values_dict[cols] = ChurnData[cols].unique()

In [105]:
for i in unique_values_dict:
    remapped_values = {}
    names_and_remapped_values = {}
    #  print(i, unique_values_dict[i])

    #for i in range of length of values in key:values for gender
    for iterator in range(len(unique_values_dict[i])):
        remapped_values[unique_values_dict[i][iterator]] = iterator

        names_and_remapped_values[i] = remapped_values 

    for key in names_and_remapped_values:
        ChurnData[key] = ChurnData[key].map(names_and_remapped_values[key])

### Decision Tree Building

In [106]:
Churn_X = ChurnData.iloc[:, 0:19]
Churn_y = ChurnData['Churn']
scaler = MinMaxScaler()
model = DecisionTreeClassifier(max_depth=5)

CD_X_train, CD_X_test, CD_y_train, CD_y_test = train_test_split(
    Churn_X, Churn_y, test_size=0.2, random_state=42
)

#rescaling features so that no feature becomes overtrained for overrepresented features preventing a feature from dominating.
CD_X_train = scaler.fit_transform(CD_X_train)
CD_X_test = scaler.fit_transform(CD_X_test)

model = model.fit(CD_X_train,CD_y_train)

In [107]:
#Model makes predictions on testing subset of the data
predictions = model.predict(CD_X_test)
accuracy_score(CD_y_test, predictions)


0.7977288857345636

In [108]:
print("Confusion matrix: \n", confusion_matrix(CD_y_test, predictions))
print("\nClassification report: \n", classification_report(CD_y_test, predictions))

Confusion matrix: 
 [[888 148]
 [137 236]]

Classification report: 
               precision    recall  f1-score   support

           0       0.87      0.86      0.86      1036
           1       0.61      0.63      0.62       373

    accuracy                           0.80      1409
   macro avg       0.74      0.74      0.74      1409
weighted avg       0.80      0.80      0.80      1409

