# Telco Customer Churn

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
from xgboost.sklearn import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Loading data into dataframe
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
n_rows, n_columns = df.shape
print(f"The shape of the dataframe is {n_rows} rows and {n_columns} columns")

The shape of the dataframe is 7043 rows and 21 columns


In [5]:
duplicate = df.duplicated().sum()
print (f"There are {duplicate} dupliacte(s) in the dataset")

There are 0 dupliacte(s) in the dataset


In [6]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [7]:
# Checking distribution of the target variable
df_churn= df["Churn"].value_counts()
df_churn

Churn
No     5174
Yes    1869
Name: count, dtype: int64

# Data Preprocessing

In [8]:
df["TotalCharges"].sample(20)

2642    4045.65
7036      743.3
6226    1654.75
5778       20.3
1335      96.45
4951    2357.75
5671     1554.9
2115     8477.6
3569      310.6
5585       19.3
1571      20.05
1117    4577.75
268      1099.6
3436      516.3
5908       20.2
2150    1737.45
873      3027.4
3996     487.05
559        49.3
940      531.55
Name: TotalCharges, dtype: object

In [9]:
# Converting the 'TotalCharges' column to numeric values and filling missing values with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce', downcast='float')
df["TotalCharges"].dtype

dtype('float32')

In [10]:
df["TotalCharges"].isnull().sum()

11

In [11]:
# Replace the missing values with zero
df["TotalCharges"].fillna(0, inplace = True)
df["TotalCharges"].isnull().sum()

0

In [12]:
# Converting the Churn(target variable) to binary
y = df["Churn"]
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = pd.DataFrame(y, columns =["Churn"])
y
#y = pd.DataFrame(y, columns=['stabf'])

Unnamed: 0,Churn
0,0
1,0
2,1
3,0
4,1
...,...
7038,0
7039,0
7040,0
7041,1


categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 
               'PhoneService', 'MultipleLines', 'InternetService',
               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
               'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [13]:
# Splitting data 
X = df.drop(columns =["customerID","Churn"])

## Feature Engineering

In [14]:
df["tenure"].unique()

array([ 1, 34,  2, 45,  8, 22, 10, 28, 62, 13, 16, 58, 49, 25, 69, 52, 71,
       21, 12, 30, 47, 72, 17, 27,  5, 46, 11, 70, 63, 43, 15, 60, 18, 66,
        9,  3, 31, 50, 64, 56,  7, 42, 35, 48, 29, 65, 38, 68, 32, 55, 37,
       36, 41,  6,  4, 33, 67, 23, 57, 61, 14, 20, 53, 40, 59, 24, 44, 19,
       54, 51, 26,  0, 39], dtype=int64)

In [15]:
#Scaling the numerical columns using standard scaler
numerical = df[["tenure", "MonthlyCharges", "TotalCharges"]]



In [16]:
scaler = StandardScaler()
scaled_num_df = scaler.fit_transform(numerical)
scaled_num_df = pd.DataFrame(scaled_num_df, columns = numerical.columns)
scaled_num_df

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.277445,-1.160323,-0.992611
1,0.066327,-0.259629,-0.172165
2,-1.236724,-0.362660,-0.958066
3,0.514251,-0.746535,-0.193672
4,-1.236724,0.197365,-0.938874
...,...,...,...
7038,-0.340876,0.665992,-0.127605
7039,1.613701,1.277533,2.242606
7040,-0.870241,-1.168632,-0.852932
7041,-1.155283,0.320338,-0.870513


In [17]:
# categorical values
categorical = df[['gender', 'SeniorCitizen', 'Partner', 'Dependents', 
               'PhoneService', 'MultipleLines', 'InternetService',
               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport', 'StreamingTV', 'StreamingMovies',
                'Contract', 'PaperlessBilling', 'PaymentMethod']]
categorical

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic)
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check
7039,Female,0,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic)
7040,Female,0,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check
7041,Male,1,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check


In [18]:
# Encoding categorical values using OneHotEncoder
encoder = OneHotEncoder(sparse = False)
cat_encoded = pd.DataFrame(encoder.fit_transform(categorical))

cat_encoded.columns = encoder.get_feature_names_out(categorical.columns)
cat_encoded

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
7039,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
7040,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7041,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [19]:
# Combining the numerical and categorical columns
X_normalized = pd.concat([cat_encoded,scaled_num_df], axis = 1)
X_normalized.head(5)

Unnamed: 0,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,...,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges
0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.277445,-1.160323,-0.992611
1,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.066327,-0.259629,-0.172165
2,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.236724,-0.36266,-0.958066
3,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.514251,-0.746535,-0.193672
4,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-1.236724,0.197365,-0.938874


In [20]:
#Splitting into train and test data
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=1) 

## Model training: Random Forest

In [21]:
clf = RandomForestClassifier(random_state=1)


In [22]:
clf.fit(X_train, y_train)

## Model Evaluation


In [23]:
def evaluate_model(model, X_test, y_test):
    
    
    #Predict Test Data
    y_predict = model.predict(X_test)
    
    #Calculate the accuracy, precision, recall, f1-score 
    acc = metrics.accuracy_score(y_test,y_predict)
    pre = metrics.precision_score(y_test,y_predict)
    rec = metrics.recall_score(y_test,y_predict)
    f1 = metrics.f1_score(y_test,y_predict)
    
    #Display confusion matrix
    cm = metrics.confusion_matrix(y_test, y_predict)
    
    return {"acc": acc, "pre":pre, "rec": rec, "f1": f1, "cm":cm}

### Question 14

In [24]:
clf_eval = evaluate_model(clf, X_test, y_test)

#Print result
print ("Accuracy:", clf_eval["acc"])
print ("Precision:", clf_eval["pre"])
print ("Recall:", clf_eval["rec"])
print ("f1 score:", clf_eval["f1"])
print ("Confusion matrix:\n", clf_eval["cm"])

Accuracy: 0.794180269694819
Precision: 0.5941558441558441
Recall: 0.5258620689655172
f1 score: 0.5579268292682926
Confusion matrix:
 [[936 125]
 [165 183]]


In [25]:
#Cross validation
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_macro')
scores

array([0.71503942, 0.71589234, 0.69809084, 0.71507196, 0.70045225])

## Model training: Extra Trees 

In [26]:
xtc = ExtraTreesClassifier(random_state=1)

In [27]:

xtc.fit(X_train, y_train)

In [28]:
xtc_eval = evaluate_model(xtc, X_test, y_test)

#Print result
print ("Accuracy:", xtc_eval["acc"])
print ("Precision:", xtc_eval["pre"])
print ("Recall:", xtc_eval["rec"])
print ("f1 score:", xtc_eval["f1"])
print ("Confusion matrix:\n", xtc_eval["cm"])

Accuracy: 0.7679205110007097
Precision: 0.5337620578778135
Recall: 0.47701149425287354
f1 score: 0.503793626707132
Confusion matrix:
 [[916 145]
 [182 166]]


In [29]:
#Cross validation
scores = cross_val_score(xtc, X_train, y_train, cv=5, scoring='f1_macro')
scores

array([0.69365895, 0.6874623 , 0.67932217, 0.70344897, 0.68075392])

## XGB Boost

In [30]:
xgb = XGBClassifier(random_state=1)
xgb.fit(X_train,y_train)

### Question 15

In [31]:
xgb_eval = evaluate_model(xgb, X_test, y_test)

#Print result
print ("Accuracy:", xgb_eval["acc"])
print ("Precision:", xgb_eval["pre"])
print ("Recall:", xgb_eval["rec"])
print ("f1 score:", xgb_eval["f1"])
print ("Confusion matrix:\n", xgb_eval["cm"])

Accuracy: 0.7934705464868701
Precision: 0.5861027190332326
Recall: 0.5574712643678161
f1 score: 0.5714285714285715
Confusion matrix:
 [[924 137]
 [154 194]]


In [32]:
scores = cross_val_score(xgb, X_train, y_train, cv=5, scoring='f1_macro')
scores

array([0.69368322, 0.71755802, 0.70699226, 0.72567004, 0.67121523])

## LGBM Classifier

In [33]:
lgbm = LGBMClassifier(random_state=1)
lgbm.fit(X_train,y_train)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


### Quesion 16

In [34]:
lgbm_eval = evaluate_model(lgbm, X_test, y_test)

#Print result
print ("Accuracy:", lgbm_eval["acc"])
print ("Precision:", lgbm_eval["pre"])
print ("Recall:", lgbm_eval["rec"])
print ("f1 score:", lgbm_eval["f1"])
print ("Confusion matrix:\n", lgbm_eval["cm"])

Accuracy: 0.8133427963094393
Precision: 0.6299694189602446
Recall: 0.5919540229885057
f1 score: 0.6103703703703703
Confusion matrix:
 [[940 121]
 [142 206]]


In [35]:
scores = cross_val_score(lgbm, X_train, y_train, cv=5, scoring='f1_macro')
scores

[LightGBM] [Info] Number of positive: 1217, number of negative: 3290
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000863 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 4507, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.270024 -> initscore=-0.994499
[LightGBM] [Info] Start training from score -0.994499
[LightGBM] [Info] Number of positive: 1217, number of negative: 3290
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000909 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 4507, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.270024 -> initscore=-0.994499
[LightGBM]

array([0.70676906, 0.71081001, 0.71844559, 0.74344853, 0.68648264])

In [36]:
from sklearn.model_selection import RandomizedSearchCV
xt_clf = ExtraTreesClassifier(verbose=1,
                              random_state=1,
                              n_jobs=-1)

In [37]:
from random import randint
import numpy as np

parameters = {
    'n_estimators': np.arange(100,1000, 100),
    'min_samples_split' : [2,5,7],
    'min_samples_leaf' : [4,6,8],
    'max_features' : ['auto', 'sqrt', 'log2', None]
}

#### Question 17

In [45]:


def hypertuning(xt_clf, parameters, n_iter, X_train, y_train):
    rdm_search = RandomizedSearchCV(xt_clf, parameters, n_jobs=-1, n_iter=10, cv=5, 
                                    verbose =1, random_state=1, scoring ='accuracy')
    rdm_search.fit(X_train, y_train)
    ht_param = rdm_search.best_params_
    ht_score = rdm_search.best_score_
    return ht_param, ht_score

xtc_param, xtc_ht_score = hypertuning(xtc, parameters, 10, X_train, y_train)

xtc_param, xtc_ht_score



Fitting 5 folds for each of 10 candidates, totalling 50 fits


({'n_estimators': 400,
  'min_samples_split': 2,
  'min_samples_leaf': 8,
  'max_features': 'sqrt'},
 0.792509074059773)

#### Question 18


In [39]:


xt_clf_new = ExtraTreesClassifier(n_estimators=500,min_samples_split= 2,min_samples_leaf= 4,max_features= None, random_state= 1)

xt_clf_new.fit(X_train, y_train)

xt_clf_new_predictions = xt_clf_new.predict(X_test)



acc = metrics.accuracy_score(y_test, xt_clf_new_predictions)

# Print the accuracy with four decimal places
print('Accuracy: {:.4f}'.format(acc))

Accuracy: 0.8041


### Question 20

In [74]:

#### Feature importance using the optimal ExtraTreesClassifier model

feature_importance = pd.DataFrame(xt_clf_new.feature_importances_,
                            
                                  columns=['importance']).sort_values('importance', ascending=False)

top_10_fi = feature_importance[0:10]
top_10_fi

Unnamed: 0,importance
34,0.288434
43,0.107757
14,0.097041
45,0.074608
44,0.031749
41,0.026719
19,0.021989
0,0.018759
1,0.018666
10,0.015557


In [76]:
X_normalized.columns[0:10]


Index(['gender_Female', 'gender_Male', 'SeniorCitizen_0', 'SeniorCitizen_1',
       'Partner_No', 'Partner_Yes', 'Dependents_No', 'Dependents_Yes',
       'PhoneService_No', 'PhoneService_Yes'],
      dtype='object')