In [45]:
import boto3
import pandas as pd; pd.set_option('display.max_column', 100)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, classification_report

## define bucket in which you are trying to reach
s3 = boto3.resource('s3')
bucket_name = 'daltondencklau-data445-bucket'
bucket = s3.Bucket(bucket_name)

## define csv file to read in the bucket
file_key= 'telecom_churn.csv'

## syntax to allow us to read the file
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## reading the data file
churn_data = pd.read_csv(file_content_stream)
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [46]:
## reporting the relative frequency of Churn variable (frequency table)
churn_data['Churn'].value_counts() / churn_data.shape[0]

0    0.855086
1    0.144914
Name: Churn, dtype: float64

In [47]:
## defining the input and target variables
x = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
y = churn_data['Churn']

## splitting the data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)

In [48]:
y_train.value_counts() / y_train.shape[0]

0    0.855214
1    0.144786
Name: Churn, dtype: float64

In [49]:
y_test.value_counts() / y_test.shape[0]

0    0.854573
1    0.145427
Name: Churn, dtype: float64

In [50]:
## building random forest classifier
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(x_train, y_train)

## predicting on the testing dataset
rf_preds = rf_md.predict_proba(x_test)[:, 1]

## calculating the ROC AUC curve
tpr, fpr, threshold = roc_curve(y_test, rf_preds)

In [51]:
## creating a dataframe to store roc_curve results
cutoffs = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'threshold': threshold})
cutoffs

Unnamed: 0,fpr,tpr,threshold
0,0.000000,0.000000,1.660974
1,0.010309,0.000000,0.660974
2,0.164948,0.000000,0.597617
3,0.164948,0.001754,0.596624
4,0.175258,0.001754,0.596094
...,...,...,...
145,0.989691,0.989474,0.056099
146,0.989691,0.994737,0.056060
147,0.989691,0.998246,0.056048
148,1.000000,0.998246,0.056045


In [52]:
## removing the first observation because it doesn't make sense and could skew the results
cutoffs = cutoffs.drop(cutoffs.index[0], axis = 0)

In [53]:
cutoffs

Unnamed: 0,fpr,tpr,threshold
1,0.010309,0.000000,0.660974
2,0.164948,0.000000,0.597617
3,0.164948,0.001754,0.596624
4,0.175258,0.001754,0.596094
5,0.175258,0.003509,0.595839
...,...,...,...
145,0.989691,0.989474,0.056099
146,0.989691,0.994737,0.056060
147,0.989691,0.998246,0.056048
148,1.000000,0.998246,0.056045


In [54]:
## calculating euclidean distance
cutoffs['euclidean_dist'] = np.sqrt(cutoffs['fpr']**2 + (1 - cutoffs['tpr'])**2)
cutoffs

Unnamed: 0,fpr,tpr,threshold,euclidean_dist
1,0.010309,0.000000,0.660974,1.000053
2,0.164948,0.000000,0.597617,1.013513
3,0.164948,0.001754,0.596624,1.011782
4,0.175258,0.001754,0.596094,1.013513
5,0.175258,0.003509,0.595839,1.011786
...,...,...,...,...
145,0.989691,0.989474,0.056099,0.989747
146,0.989691,0.994737,0.056060,0.989705
147,0.989691,0.998246,0.056048,0.989692
148,1.000000,0.998246,0.056045,1.000002


In [55]:
## sorting smallest to largest bc smaller euclidean distances = better AUC ROC curve result
cutoffs = cutoffs.sort_values(by = 'euclidean_dist')
cutoffs

Unnamed: 0,fpr,tpr,threshold,euclidean_dist
130,0.958763,0.882456,0.056971,0.965941
129,0.958763,0.878947,0.056978,0.966375
128,0.958763,0.861404,0.057169,0.968729
127,0.958763,0.856140,0.057175,0.969496
126,0.958763,0.852632,0.057185,0.970023
...,...,...,...,...
86,0.917526,0.205263,0.108743,1.213862
81,0.896907,0.180702,0.130263,1.214781
84,0.907216,0.184211,0.125641,1.220063
83,0.907216,0.182456,0.125723,1.221237


In [56]:
## changing likelihoods to labels
rf_labels = np.where(rf_preds < 0.056971, 0, 1)

print(classification_report(y_test, rf_labels))

              precision    recall  f1-score   support

           0       0.94      0.12      0.21       570
           1       0.16      0.96      0.27        97

    accuracy                           0.24       667
   macro avg       0.55      0.54      0.24       667
weighted avg       0.83      0.24      0.22       667

