### Importing Required Library

In [1]:
# All purpose library
import pandas as pd
import numpy as np

# Preprocessing library
import sklearn.utils
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# ML library
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Performance metrics library
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix

# Visualization library
import matplotlib.pyplot as plt
import seaborn as sns

# Ignoring Warning during trainings 
import warnings
warnings.filterwarnings('ignore')

### Loading dataset

In [2]:
df = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

### Analysing dataset

In [3]:
# Description of dataset
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SeniorCitizen,7043.0,0.162147,0.368612,0.0,0.0,0.0,0.0,1.0
tenure,7043.0,32.371149,24.559481,0.0,9.0,29.0,55.0,72.0
MonthlyCharges,7043.0,64.761692,30.090047,18.25,35.5,70.35,89.85,118.75


In [4]:
# Displaying dataset information
display(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


None

In [5]:
# Displaying the first and last few rows
print('First few rows of data')
display(df.head())
print('Last few rows of data')
display(df.tail())

First few rows of data


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


Last few rows of data


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [6]:
# Displaying the dimension of the dataset
df.shape

(7043, 21)

In [7]:
# Check for null values in dataset
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

### Preprocessing dataset

In [8]:
# Convert 'TotalCharges' column to numeric values and filling missing values with 0
df['TotalCharges'].replace(' ', '0', inplace=True)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [9]:
# Convert the 'Churn' column to binary values, where 'No' is mapped to 0 and 'Yes' is mapped to 1
df['Churn'] = df['Churn'].map({'No':0, 'Yes':1})

### Separate the dependent and independent columns

In [10]:
x = df.drop(['Churn', 'customerID'], axis=1)
y = df['Churn']

In [11]:
# Split the data into an 80-20 train-test split with a random state of "1"
x_train, x_test, y_train, y_test = train_test_split(x ,y ,random_state=1,test_size=0.20)

In [12]:
# Identify the categorical and numeric features
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 
               'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 
               'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

### Feature Engineering

Standardize the numeric features of the dataset

In [13]:
# Scale numeric features using StandardScaler, convert the output back to a dataframe 
# and put back the column names
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train[numerical])
x_train_scaled = pd.DataFrame(x_train_scaled, columns=numerical)

x_test = x_test.reset_index(drop=True)
x_test_scaled = scaler.transform(x_test[numerical])
x_test_scaled = pd.DataFrame(x_test_scaled, columns=numerical)

Encode the categorical features of the dataset

In [14]:
# Encode categorical features are one-hot encoded using OneHotEncoder(set sparse_output to false), 
# convert the output back to a dataframe and put back the column names
encoder = OneHotEncoder(sparse_output=False)
x_train_encoded = encoder.fit_transform(x_train[categorical])
x_train_encoded = pd.DataFrame(x_train_encoded, columns=encoder.get_feature_names_out())

x_test_encoded = encoder.transform(x_test[categorical])
x_test_encoded = pd.DataFrame(x_test_encoded, columns=encoder.get_feature_names_out())

In [15]:
# Combine scaled numerical and one-hot encoded categorical features into train and test set dataframes
x_train_combine = pd.concat([x_train_encoded, x_train_scaled], axis=1)
x_test_combine = pd.concat([x_test_encoded, x_test_scaled], axis=1)

### Model and performance metrics

#### Question 14

In [16]:
# Train a Random Forest classifier
RF_model = RandomForestClassifier(random_state=1)
RF_model.fit(x_train_combine, y_train)

# Prediction
y_pred = RF_model.predict(x_test_combine)
print (accuracy_score(y_test, y_pred))

0.794180269694819


In [17]:
# Train Extra Trees classifier
ET_model = ExtraTreesClassifier(random_state=1)
ET_model.fit(x_train_combine, y_train)

# Prediction
y_pred = ET_model.predict(x_test_combine)
print (accuracy_score(y_test, y_pred))

0.7686302342086586


#### Question 15

In [18]:
# Train xgboost
XGB_model = XGBClassifier()
XGB_model.fit(x_train_combine, y_train)

# Prediction
y_pred = XGB_model.predict(x_test_combine)
print (accuracy_score(y_test, y_pred))

0.7877927608232789


#### Question 16

In [19]:
# Train lightgbm
LGB_model = LGBMClassifier()
LGB_model.fit(x_train_combine, y_train)

# Prediction
y_pred = LGB_model.predict(x_test_combine)
print (accuracy_score(y_test, y_pred))

0.8034066713981547


#### Question 17

In [20]:
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

rscv_clf = RandomizedSearchCV(ExtraTreesClassifier(), hyperparameter_grid, cv=5, n_iter=10, scoring='accuracy', 
                              n_jobs=-1, verbose=1, random_state=1)
rscv_clf.fit(x_train_combine, y_train)
print(rscv_clf.best_estimator_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
ExtraTreesClassifier(min_samples_leaf=8, min_samples_split=9, n_estimators=1000)


#### Question 18
Accuracy of new optimal model is higher at 0.80 compared to the initial at 0.77

In [21]:
# Train Extra Trees classifier with new hyperparameters from RandomizedSearchCV
ET2_model = ExtraTreesClassifier(min_samples_leaf=8, min_samples_split=9, n_estimators=1000, random_state=1)
ET2_model.fit(x_train_combine, y_train)

# Prediction
y_pred = ET2_model.predict(x_test_combine)
print (accuracy_score(y_test, y_pred))

0.8026969481902059


#### Question 19
From the parameters of ExtraTreesClassifier, min_weight_fraction_leaf is another hyperparameter that 
can be tuned as below with a value of 0.01

In [22]:
ET3_model = ExtraTreesClassifier(min_weight_fraction_leaf=0.01, min_samples_leaf=8, min_samples_split=9,
                                  n_estimators=1000, random_state=1)
ET3_model.fit(x_train_combine, y_train)

# Prediction
y_pred = ET3_model.predict(x_test_combine)
print (accuracy_score(y_test, y_pred))

0.8105039034776437


#### Question 20

In [23]:
feature_importances = pd.DataFrame(ET2_model.feature_importances_,
                                   index = x_test_combine.columns,
                                    columns=['Importance Score']).sort_values('Importance Score',
                                                                        ascending=False)
feature_importances.head(2)

Unnamed: 0,Importance Score
Contract_Month-to-month,0.152171
tenure,0.089643
