In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv("telco_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [3]:
df = df.drop('Unnamed: 0',axis=1)

In [4]:
df.columns

Index(['SeniorCitizen', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'gender_Female', 'gender_Male', 'Partner_No', 'Partner_Yes',
       'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'P

In [5]:
# Creating X & y variables
X = df.drop('Churn',axis=1)
y = df['Churn']

In [6]:
X.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0


#### Train Test Split

In [7]:
X = StandardScaler().fit_transform(X)

In [8]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

#### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)

LogisticRegression()

In [10]:
# Evaluate the model
predictions_lr = model_lr.predict(X_test)
print(predictions_lr)

[0 0 0 ... 0 0 0]


In [11]:
#Checking precision, recall, f1-score
print( classification_report(y_test, predictions_lr) )

              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1049
           1       0.64      0.52      0.58       358

    accuracy                           0.81      1407
   macro avg       0.75      0.71      0.73      1407
weighted avg       0.80      0.81      0.80      1407



 - As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
 - Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

#### Imbalanced Classification SMOTEENN (UpSampling + ENN)

In [12]:
from imblearn.combine import SMOTEENN

In [13]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_sample(X,y)

In [14]:
Xr_train,Xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [15]:
model_lr_sm = LogisticRegression()
model_lr_sm.fit(Xr_train, yr_train)

LogisticRegression()

In [16]:
predictions_lr_sm = model_lr_sm.predict(Xr_test)
print(predictions_lr_sm)

[0 1 0 ... 0 1 1]


In [17]:
print( classification_report(yr_test, predictions_lr_sm) )

              precision    recall  f1-score   support

           0       0.92      0.91      0.91       588
           1       0.92      0.94      0.93       695

    accuracy                           0.92      1283
   macro avg       0.92      0.92      0.92      1283
weighted avg       0.92      0.92      0.92      1283



- The results are better, i.e. Accuracy: 92 %, and a very good recall, precision & f1 score for minority class.

#### Random Forest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
model_rf= RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [20]:
model_rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [21]:
predictions_rf = model_rf.predict(X_test)
print(predictions_rf)

[0 0 0 ... 0 0 0]


In [22]:
model_rf.score(X_test,y_test)

0.7967306325515281

In [23]:
print( classification_report(y_test, predictions_rf) )

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1049
           1       0.65      0.44      0.52       358

    accuracy                           0.80      1407
   macro avg       0.74      0.68      0.70      1407
weighted avg       0.78      0.80      0.78      1407



#### Imbalanced Classification SMOTEENN (UpSampling + ENN)

In [24]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_sample(X,y)

In [25]:
Xr_train1,Xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [26]:
model_rf_sm=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [27]:
model_rf_sm.fit(Xr_train1,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [28]:
predictions_rf_sm = model_rf_sm.predict(Xr_test1)
print(predictions_rf_sm)

[1 1 1 ... 0 1 1]


In [29]:
print( classification_report(yr_test1, predictions_rf_sm) )

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       540
           1       0.93      0.93      0.93       737

    accuracy                           0.92      1277
   macro avg       0.91      0.91      0.91      1277
weighted avg       0.92      0.92      0.92      1277



With RF Classifier, also we are able to get quite good results

In [30]:
from sklearn.model_selection import RandomizedSearchCV

In [31]:
random_grid = {'n_estimators': [int(x) for x in np.linspace(10, 50, num = 5)],
               'max_features': ['auto','sqrt'],
               'max_depth': [int(x) for x in np.linspace(10, 50, num = 10)],
               'min_samples_split': [int(x) for x in np.linspace(2, 11, num = 9)],
               'min_samples_leaf': [int(x) for x in np.linspace(2, 11, num = 9)],
               'bootstrap': [True, False]}

rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), 
                               param_distributions = random_grid, 
                               n_iter=500, 
                               cv=3, 
                               verbose=2, 
                               random_state=42, 
                               scoring='accuracy',
                               n_jobs = -1)

In [32]:
rf_random.fit(Xr_train1,yr_train1)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=500,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 14, 18, 23, 27, 32,
                                                      36, 41, 45, 50],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [2, 3, 4, 5, 6, 7,
                                                             8, 9, 11],
                                        'min_samples_split': [2, 3, 4, 5, 6, 7,
                                                              8, 9, 11],
                                        'n_estimators': [10, 20, 30, 40, 50]},
                   random_state=42, scoring='accuracy', verbose=2)

In [33]:
rf_random.best_score_

0.9559341950646298

In [34]:
pred = rf_random.predict(Xr_test1)

In [35]:
print( classification_report(yr_test1, pred) )

              precision    recall  f1-score   support

           0       0.96      0.95      0.96       540
           1       0.97      0.97      0.97       737

    accuracy                           0.96      1277
   macro avg       0.96      0.96      0.96      1277
weighted avg       0.96      0.96      0.96      1277



We'll pick this model

#### Performing PCA

In [36]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(0.9)
Xr_train_pca = pca.fit_transform(Xr_train1)
Xr_test_pca = pca.transform(Xr_test1)
explained_variance = pca.explained_variance_ratio_

In [37]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [38]:
model.fit(Xr_train_pca,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [39]:
predictions_pca = model.predict(Xr_test_pca)

In [40]:
model_score_r_pca = model.score(Xr_test_pca, yr_test1)

In [41]:
print( classification_report(yr_test1, predictions_pca))

              precision    recall  f1-score   support

           0       0.89      0.90      0.90       540
           1       0.93      0.92      0.92       737

    accuracy                           0.91      1277
   macro avg       0.91      0.91      0.91      1277
weighted avg       0.91      0.91      0.91      1277



- With PCA, the results are not better

### Pickling the model

In [48]:
import pickle

In [49]:
filename = 'model.sav'

In [54]:
pickle.dump(rf_random, open(filename, 'wb'))

In [55]:
load_model = pickle.load(open(filename, 'rb'))

In [56]:
model_score_r1 = load_model.score(Xr_test1, yr_test1)

In [57]:
model_score_r1

0.9647611589663273