In [None]:
import numpy as np
import pandas as pd


from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import lightgbm as lgb
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score,classification_report

In [2]:
d = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
d.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


**PREPROCESSING**

In [4]:
d.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [5]:
d['TotalCharges'] = d['TotalCharges'].replace(r'^\s*$', np.nan, regex=True)
d['TotalCharges'] = d['TotalCharges'].fillna(0)
d.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
d['TotalCharges'] = d['TotalCharges'].astype('float')
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
dic = {'No':0,'Yes':1}
d['Churn'] = d['Churn'].map(dic)

In [8]:
d.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


**SPLITTING THE DATASET**

In [9]:
d.drop(['customerID'],axis= 1,inplace=True)
d.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [10]:
di = {0:'No',1:'Yes'}
d['SeniorCitizen'] = d['SeniorCitizen'].map(di)

In [11]:
x = d.iloc[:,:19]
y = d.iloc[:,19]

In [12]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 1)

**ENCODING THE CATAGORICAL COLUMNS WITH ONE-HOT ENCODING**

In [13]:
x_train= pd.get_dummies(x_train,dtype=int)
x_test= pd.get_dummies(x_test,dtype=int)

In [14]:
x_train

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
1814,12,19.70,258.35,0,1,1,0,0,1,0,...,0,0,0,1,1,0,0,0,0,1
5946,42,73.90,3160.55,1,0,1,0,1,0,1,...,1,0,1,0,1,0,0,1,0,0
3881,71,65.15,4681.75,0,1,1,0,0,1,1,...,0,0,0,1,1,0,1,0,0,0
2389,71,85.45,6300.85,0,1,1,0,0,1,0,...,1,0,1,0,1,0,0,0,1,0
3676,30,70.40,2044.75,0,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905,9,100.50,918.60,0,1,0,1,1,0,1,...,1,1,0,0,0,1,0,0,1,0
5192,60,19.95,1189.90,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,1
3980,28,105.70,2979.50,0,1,1,0,1,0,1,...,1,1,0,0,0,1,0,0,1,0
235,2,54.40,114.10,0,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1


In [15]:
x_train.columns

Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'gender_Female',
       'gender_Male', 'SeniorCitizen_No', 'SeniorCitizen_Yes', 'Partner_No',
       'Partner_Yes', 'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
     

In [16]:
x_test.columns

Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'gender_Female',
       'gender_Male', 'SeniorCitizen_No', 'SeniorCitizen_Yes', 'Partner_No',
       'Partner_Yes', 'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
     

In [17]:
print(x_test.columns == x_train.columns)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True]


**SCALING THE NUMERICAL COLUMNS WITH STANDARDSCALER**

In [18]:
sc = StandardScaler()

In [19]:
x_train[['tenure', 'MonthlyCharges', 'TotalCharges']] = sc.fit_transform(x_train[['tenure', 'MonthlyCharges', 'TotalCharges']])
x_test[['tenure', 'MonthlyCharges', 'TotalCharges']] = sc.fit_transform(x_test[['tenure', 'MonthlyCharges', 'TotalCharges']])

In [20]:
x_train

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
1814,-0.825884,-1.497530,-0.890947,0,1,1,0,0,1,0,...,0,0,0,1,1,0,0,0,0,1
5946,0.395961,0.302996,0.389693,1,0,1,0,1,0,1,...,1,0,1,0,1,0,0,1,0,0
3881,1.577078,0.012320,1.060945,0,1,1,0,0,1,1,...,0,0,0,1,1,0,1,0,0,0
2389,1.577078,0.686687,1.775397,0,1,1,0,0,1,0,...,1,0,1,0,1,0,0,0,1,0
3676,-0.092777,0.186726,-0.102671,0,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905,-0.948068,1.186648,-0.599602,0,1,0,1,1,0,1,...,1,1,0,0,0,1,0,0,1,0
5192,1.129068,-1.489225,-0.479886,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,1
3980,-0.174233,1.359393,0.309802,0,1,1,0,1,0,1,...,1,1,0,0,0,1,0,0,1,0
235,-1.233166,-0.344795,-0.954599,0,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1


**MACHINE LEARNING MODELING**

RANDOM FOREST CLASSIFIER

In [27]:
rm = RandomForestClassifier()

In [33]:
rm_model = rm.fit(x_train,y_train)

In [42]:
rm_pred = rm_model.predict(x_test)

In [46]:
print(classification_report(y_test, rm_pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1061
           1       0.61      0.52      0.56       348

    accuracy                           0.80      1409
   macro avg       0.73      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



In [49]:
accuracy_score(y_test, rm_pred)

0.8005677785663591

EXTRA TREES CLASSIFIER

In [28]:
et = ExtraTreesClassifier()

In [32]:
et_model = et.fit(x_train,y_train)

In [41]:
et_pred = et_model.predict(x_test)

In [53]:
print(classification_report(y_test, et_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      1061
           1       0.54      0.53      0.53       348

    accuracy                           0.77      1409
   macro avg       0.69      0.69      0.69      1409
weighted avg       0.77      0.77      0.77      1409



In [54]:
accuracy_score(y_test, et_pred)

0.7700496806245565

LightGBM CLASSIFIER 

In [29]:
lg = lgb.LGBMClassifier()

In [34]:
lg_model = lg.fit(x_train,y_train)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001336 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [40]:
lg_pred = lg_model.predict(x_test)

In [57]:
print(classification_report(y_test, lg_pred))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87      1061
           1       0.61      0.59      0.60       348

    accuracy                           0.80      1409
   macro avg       0.74      0.73      0.73      1409
weighted avg       0.80      0.80      0.80      1409



In [55]:
accuracy_score(y_test, lg_pred)

0.8041163946061036

XGBoost CLASSIFIER

In [36]:
xg = XGBClassifier()

In [37]:
xg_model = xg.fit(x_train,y_train)

In [39]:
xg_pred = xg_model.predict(x_test)

In [58]:
print(classification_report(y_test, xg_pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86      1061
           1       0.58      0.58      0.58       348

    accuracy                           0.79      1409
   macro avg       0.72      0.72      0.72      1409
weighted avg       0.79      0.79      0.79      1409



In [56]:
accuracy_score(y_test, xg_pred)

0.7920511000709723

**QUIZ**

In [64]:
# (17) To improve the Extra Trees Classifier, you will use the following parameters (number of estimators,
# minimum number of samples, minimum number of samples for leaf node and the number of features to consider when
# looking for the best split) for the hyperparameter grid needed to run a Randomized Cross Validation Search (RandomizedSearchCV). 

n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None] 
hyperparameter_grid = {'n_estimators': n_estimators,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

In [65]:
model = RandomizedSearchCV(estimator = et,
                                param_distributions = hyperparameter_grid,
                                scoring = "accuracy",
                                cv = 5,
                                verbose= 1,
                                random_state = 1,
                                n_jobs = 5)

In [66]:
model.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ASUS\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ASUS\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\ASUS\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\ASUS\AppData\Roaming\Python\Python311\site-packages\sklearn\utils\_param_validation.py

In [67]:
model.best_params_

{'n_estimators': 1000,
 'min_samples_split': 9,
 'min_samples_leaf': 8,
 'max_features': 'sqrt'}

In [68]:
# (18) Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV 
# (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier
# model with no hyperparameter tuning?

y_pred = model.predict(x_test)

In [69]:
accuracy_score(y_test,y_pred)

0.8048261178140526