In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, plot_confusion_matrix
from sklearn.tree import plot_tree

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("Data/DATA_Customer-Churn.csv")
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


### check datatypes of all columns

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7032 non-null   float64
 15  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

### Convert column `TotalCharges` to numeric and replace null values in dataframe

In [10]:
df["TotalCharges"] = np.nanmedian(pd.to_numeric(df["TotalCharges"], errors="coerce"))

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   float64
 15  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(12)
memory

### Create dataframe with subset of features

In [13]:
X = df[["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]]
X

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0,1,29.85,1397.475
1,0,34,56.95,1397.475
2,0,2,53.85,1397.475
3,0,45,42.30,1397.475
4,0,2,70.70,1397.475
...,...,...,...,...
7038,0,24,84.80,1397.475
7039,0,72,103.20,1397.475
7040,0,11,29.60,1397.475
7041,1,4,74.40,1397.475


In [14]:
y = df["Churn"]
y

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object

### split data into training and test set

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20,random_state=42)
X_train.shape, \
X_test.shape, \
y_train.shape, \
y_test.shape

((5634, 4), (1409, 4), (5634,), (1409,))

### scale features

In [24]:
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.fit_transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((5634, 4), (1409, 4))

### KNN Classifier

In [78]:
neigh = KNeighborsClassifier(n_neighbors = 23)
neigh.fit(X_train_scaled, y_train)
y_pred_train = neigh.predict(X_train_scaled)
y_pred_test = neigh.predict(X_test_scaled)

print("Accuracy train:",accuracy_score(y_train, y_pred_train))
print("Accuracy test:",accuracy_score(y_test, y_pred_test))

print()
print("Precision train:",precision_score(y_train, y_pred_train, pos_label="Yes"))
print("Precision test:",precision_score(y_test, y_pred_test, pos_label="Yes"))
print()
print("Recall train:",recall_score(y_train, y_pred_train, pos_label="Yes"))
print("Recall test:",recall_score(y_test, y_pred_test, pos_label="Yes"))

scores=cross_val_score(neigh, X_train_scaled, y_train, cv=10)
print()
print("CROSS VALIDATION")
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Accuracy train: 0.8036918707845225
Accuracy test: 0.7934705464868701

Precision train: 0.685361216730038
Precision test: 0.66015625

Recall train: 0.481951871657754
Recall test: 0.45308310991957107

CROSS VALIDATION
0.79 accuracy with a standard deviation of 0.01


### Decision Tree Classifier

In [77]:
dectree = DecisionTreeClassifier(max_depth = 6)
dectree.fit(X_train_scaled, y_train)
y_pred_train = dectree.predict(X_train_scaled)
y_pred_test = dectree.predict(X_test_scaled)

print("Accuracy train:",accuracy_score(y_train, y_pred_train))
print("Accuracy test:",accuracy_score(y_test, y_pred_test))

print()
print("Precision train:",precision_score(y_train, y_pred_train, pos_label="Yes"))
print("Precision test:",precision_score(y_test, y_pred_test, pos_label="Yes"))
print()
print("Recall train:",recall_score(y_train, y_pred_train, pos_label="Yes"))
print("Recall test:",recall_score(y_test, y_pred_test, pos_label="Yes"))

scores=cross_val_score(dectree, X_train_scaled, y_train, cv=10)
print()
print("CROSS VALIDATION")
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Accuracy train: 0.7969471068512602
Accuracy test: 0.7920511000709723

Precision train: 0.6654135338345865
Precision test: 0.6515151515151515

Recall train: 0.4732620320855615
Recall test: 0.46112600536193027

CROSS VALIDATION
0.78 accuracy with a standard deviation of 0.01


### Random Forest Classifier

In [76]:
rand_for = RandomForestClassifier(max_depth = 6,
                             min_samples_leaf = 50,
                             max_features = None,
                             n_estimators = 100,
                             bootstrap = True, 
                             oob_score = True, 
                             random_state = 42).fit(X_train_scaled, y_train)

y_pred_train = dectree.predict(X_train_scaled)
y_pred_test = dectree.predict(X_test_scaled)

print("Accuracy train:",accuracy_score(y_train, y_pred_train))
print("Accuracy test:",accuracy_score(y_test, y_pred_test))

print()
print("Precision train:",precision_score(y_train, y_pred_train, pos_label="Yes"))
print("Precision test:",precision_score(y_test, y_pred_test, pos_label="Yes"))
print()
print("Recall train:",recall_score(y_train, y_pred_train, pos_label="Yes"))
print("Recall test:",recall_score(y_test, y_pred_test, pos_label="Yes"))
print()

scores = cross_val_score(rand_for, X_train_scaled, y_train, cv = 10)
print("CROSS VALIDATION")
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Accuracy train: 0.7969471068512602
Accuracy test: 0.7913413768630234

Precision train: 0.6654135338345865
Precision test: 0.6490566037735849

Recall train: 0.4732620320855615
Recall test: 0.46112600536193027

CROSS VALIDATION
0.79 accuracy with a standard deviation of 0.01


### Hyperparameter tuning with gridsearch

In [82]:
param_grid = {
    'n_estimators': [50, 100, 250, 500, 750],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf' : [1, 2, 3, 4],
    "n_jobs" : [-1],
    #'max_features': ['sqrt']
    ##'max_samples' : ['None', 0.5],
    ##'max_depth':[3,5,10],
    ## 'bootstrap':[True,False] 
    }

grid_search = GridSearchCV(rand_for, param_grid, cv=5,return_train_score=True,n_jobs=-1,)
grid_search.fit(X_train_scaled,y_train)
grid_search.best_params_ #To check the best set of parameters returned

{'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 500,
 'n_jobs': -1,
 'random_state': 42}

In [83]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_min_samples_split,param_n_estimators,param_n_jobs,param_random_state,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.387212,0.010276,0.048033,0.013573,1,2,50,-1,42,"{'min_samples_leaf': 1, 'min_samples_split': 2...",...,0.787894,0.010096,49,0.803195,0.810295,0.810961,0.805414,0.810781,0.808129,0.003208
1,0.356039,0.055055,0.102617,0.086156,1,2,50,-1,69,"{'min_samples_leaf': 1, 'min_samples_split': 2...",...,0.785765,0.010168,149,0.802751,0.806967,0.808742,0.805858,0.812112,0.807286,0.003102
2,0.684415,0.072547,0.104363,0.047943,1,2,100,-1,42,"{'min_samples_leaf': 1, 'min_samples_split': 2...",...,0.786120,0.010272,139,0.802086,0.810739,0.808964,0.804304,0.809894,0.807197,0.003389
3,0.647442,0.114826,0.225937,0.093219,1,2,100,-1,69,"{'min_samples_leaf': 1, 'min_samples_split': 2...",...,0.786298,0.010737,131,0.800754,0.808076,0.808076,0.806967,0.813221,0.807419,0.003979
4,1.769519,0.071339,0.263567,0.083041,1,2,250,-1,42,"{'min_samples_leaf': 1, 'min_samples_split': 2...",...,0.787895,0.009711,47,0.801420,0.809408,0.809186,0.804083,0.812112,0.807242,0.003900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,1.725840,0.119772,0.247881,0.061341,4,8,250,-1,69,"{'min_samples_leaf': 4, 'min_samples_split': 8...",...,0.786297,0.010434,134,0.800976,0.808742,0.809186,0.804748,0.809894,0.806709,0.003380
156,3.680646,0.273457,0.273504,0.063808,4,8,500,-1,42,"{'min_samples_leaf': 4, 'min_samples_split': 8...",...,0.788427,0.009716,1,0.800533,0.809408,0.808964,0.804970,0.810559,0.806887,0.003693
157,3.545543,0.212371,0.315716,0.035327,4,8,500,-1,69,"{'min_samples_leaf': 4, 'min_samples_split': 8...",...,0.786475,0.009946,127,0.800533,0.808964,0.807633,0.804083,0.809894,0.806221,0.003462
158,5.479307,0.143066,0.444341,0.077935,4,8,750,-1,42,"{'min_samples_leaf': 4, 'min_samples_split': 8...",...,0.788072,0.010346,21,0.800089,0.809408,0.807854,0.804748,0.812112,0.806842,0.004132


In [85]:
grid_search.best_params_

{'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 500,
 'n_jobs': -1,
 'random_state': 42}

In [86]:
rand_for_opt = RandomForestClassifier(min_samples_leaf = 4, min_samples_split = 2, n_estimators = 500,
                                      n_jobs = -1, random_state = 42, oob_score = True)

scores = cross_val_score(rand_for_opt, X_train_scaled, y_train, cv = 10)

In [87]:
print("Accuracy train:",accuracy_score(y_train, y_pred_train))
print("Accuracy test:",accuracy_score(y_test, y_pred_test))

print()
print("Precision train:",precision_score(y_train, y_pred_train, pos_label="Yes"))
print("Precision test:",precision_score(y_test, y_pred_test, pos_label="Yes"))
print()
print("Recall train:",recall_score(y_train, y_pred_train, pos_label="Yes"))
print("Recall test:",recall_score(y_test, y_pred_test, pos_label="Yes"))
print()

print("CROSS VALIDATION")
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Accuracy train: 0.8036918707845225
Accuracy test: 0.7934705464868701

Precision train: 0.685361216730038
Precision test: 0.66015625

Recall train: 0.481951871657754
Recall test: 0.45308310991957107

CROSS VALIDATION
0.78 accuracy with a standard deviation of 0.02


## Managing imbalance in the dataset


### Check for the imbalance

In [88]:
y.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

### Downsampling to create a balance between the two classes

In [101]:
churn_yes = df[df['Churn']=="Yes"].sample(len(df[df['Churn']=="Yes"]))
churn_no = df[df['Churn']=="No"].sample(len(df[df['Churn']=="Yes"]))
churn_yes.shape, churn_no.shape

In [108]:
df_ds = pd.concat([churn_yes, churn_no], axis = 0)
df_ds = df_ds.sample(frac=1)
df_ds = df_ds.reset_index(drop=True)
df_ds.shape

(3738, 16)

#### X y split

In [116]:
y = df_ds['Churn']
X = df_ds.drop(['Churn'], axis=1)[["SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"]] ### split data into training and test set

#### split data into training and test set

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20,random_state=42)
X_train.shape, \
X_test.shape, \
y_train.shape, \
y_test.shape

((2990, 4), (748, 4), (2990,), (748,))

#### scale features

In [118]:
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.fit_transform(X_test)
X_train_scaled.shape, X_test_scaled.shape

((2990, 4), (748, 4))

#### KNN Classifier

In [119]:
neigh = KNeighborsClassifier(n_neighbors = 23)
neigh.fit(X_train_scaled, y_train)
y_pred_train = neigh.predict(X_train_scaled)
y_pred_test = neigh.predict(X_test_scaled)

print("Accuracy train:",accuracy_score(y_train, y_pred_train))
print("Accuracy test:",accuracy_score(y_test, y_pred_test))

print()
print("Precision train:",precision_score(y_train, y_pred_train, pos_label="Yes"))
print("Precision test:",precision_score(y_test, y_pred_test, pos_label="Yes"))
print()
print("Recall train:",recall_score(y_train, y_pred_train, pos_label="Yes"))
print("Recall test:",recall_score(y_test, y_pred_test, pos_label="Yes"))

scores=cross_val_score(neigh, X_train_scaled, y_train, cv=10)
print()
print("CROSS VALIDATION")
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Accuracy train: 0.7642140468227425
Accuracy test: 0.7419786096256684

Precision train: 0.7527018436109345
Precision test: 0.7268041237113402

Recall train: 0.7893333333333333
Recall test: 0.7642276422764228

CROSS VALIDATION
0.74 accuracy with a standard deviation of 0.02


#### Decision Tree Classifier

In [120]:
dectree = DecisionTreeClassifier(max_depth = 6)
dectree.fit(X_train_scaled, y_train)
y_pred_train = dectree.predict(X_train_scaled)
y_pred_test = dectree.predict(X_test_scaled)

print("Accuracy train:",accuracy_score(y_train, y_pred_train))
print("Accuracy test:",accuracy_score(y_test, y_pred_test))

print()
print("Precision train:",precision_score(y_train, y_pred_train, pos_label="Yes"))
print("Precision test:",precision_score(y_test, y_pred_test, pos_label="Yes"))
print()
print("Recall train:",recall_score(y_train, y_pred_train, pos_label="Yes"))
print("Recall test:",recall_score(y_test, y_pred_test, pos_label="Yes"))

scores=cross_val_score(dectree, X_train_scaled, y_train, cv=10)
print()
print("CROSS VALIDATION")
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Accuracy train: 0.7775919732441472
Accuracy test: 0.732620320855615

Precision train: 0.7777777777777778
Precision test: 0.7241379310344828

Recall train: 0.7793333333333333
Recall test: 0.7398373983739838

CROSS VALIDATION
0.74 accuracy with a standard deviation of 0.02


#### Random Forest Classifier

In [121]:
rand_for = RandomForestClassifier(max_depth = 6,
                             min_samples_leaf = 50,
                             max_features = None,
                             n_estimators = 100,
                             bootstrap = True, 
                             oob_score = True, 
                             random_state = 42).fit(X_train_scaled, y_train)

y_pred_train = dectree.predict(X_train_scaled)
y_pred_test = dectree.predict(X_test_scaled)

print("Accuracy train:",accuracy_score(y_train, y_pred_train))
print("Accuracy test:",accuracy_score(y_test, y_pred_test))

print()
print("Precision train:",precision_score(y_train, y_pred_train, pos_label="Yes"))
print("Precision test:",precision_score(y_test, y_pred_test, pos_label="Yes"))
print()
print("Recall train:",recall_score(y_train, y_pred_train, pos_label="Yes"))
print("Recall test:",recall_score(y_test, y_pred_test, pos_label="Yes"))
print()

scores = cross_val_score(rand_for, X_train_scaled, y_train, cv = 10)
print("CROSS VALIDATION")
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Accuracy train: 0.7775919732441472
Accuracy test: 0.732620320855615

Precision train: 0.7777777777777778
Precision test: 0.7241379310344828

Recall train: 0.7793333333333333
Recall test: 0.7398373983739838

CROSS VALIDATION
0.75 accuracy with a standard deviation of 0.02


`precision and recall improved after downsampling!`