In [1]:
pip install imblearn

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.10.1 imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_curve, classification_report

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'craig-shaffer-data-445-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'telecom_churn.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the datafile
churn_data = pd.read_csv(file_content_stream)
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [3]:
#defining the input and target variables
x=churn_data[['AccountWeeks','ContractRenewal','CustServCalls','MonthlyCharge','DayMins']]
y=churn_data['Churn']

#splitting the data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = .2,stratify=y)

In [4]:
#running SMOTE
x_SMOTE,y_SMOTE = SMOTE().fit_resample(x_train,y_train)

**Random Forest**

In [5]:
#building the model
rf_md=RandomForestClassifier(n_estimators=500,max_depth=3).fit(x_SMOTE,y_SMOTE)

#predict on test
rf_pred = rf_md.predict_proba(x_test)[:,1]

#ROC
fpr,tpr,threshold = roc_curve(y_test,rf_pred)

#finding optimal cutoff from ROC
rf_cutoff=pd.DataFrame({'fpr':fpr,'tpr':tpr,'cutoff':threshold})

#computing distance to perfect model
rf_cutoff['distance']=np.sqrt(rf_cutoff['fpr']**2 + (1-rf_cutoff['tpr'])**2)

#sorting based on distance
rf_cutoff = rf_cutoff.sort_values(by='distance').reset_index(drop=True)

#changing likelihoods to labels
rf_pred_label = np.where(rf_pred<rf_cutoff['cutoff'][0],0,1)
                         
#classification report
print(classification_report(y_test,rf_pred_label))

              precision    recall  f1-score   support

           0       0.97      0.88      0.92       570
           1       0.55      0.86      0.67        97

    accuracy                           0.88       667
   macro avg       0.76      0.87      0.80       667
weighted avg       0.91      0.88      0.89       667



**AdaBoost**

In [6]:
#building the model
ada_md=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),n_estimators=500,learning_rate=0.001).fit(x_SMOTE,y_SMOTE)

#predict on test
ada_pred = ada_md.predict_proba(x_test)[:,1]

#ROC
fpr,tpr,threshold = roc_curve(y_test,ada_pred)

#finding optimal cutoff from ROC
ada_cutoff=pd.DataFrame({'fpr':fpr,'tpr':tpr,'cutoff':threshold})

#computing distance to perfect model
ada_cutoff['distance']=np.sqrt(ada_cutoff['fpr']**2 + (1-ada_cutoff['tpr'])**2)

#sorting based on distance
ada_cutoff = ada_cutoff.sort_values(by='distance').reset_index(drop=True)

#changing likelihoods to labels
ada_pred_label = np.where(ada_pred<ada_cutoff['cutoff'][0],0,1)
                         
#classification report
print(classification_report(y_test,ada_pred_label))

              precision    recall  f1-score   support

           0       0.97      0.84      0.90       570
           1       0.47      0.85      0.61        97

    accuracy                           0.84       667
   macro avg       0.72      0.84      0.75       667
weighted avg       0.90      0.84      0.86       667



Based on my results, I would use the Random Forest Model to predict customer churn