# TUNNING Hiper Parameters - Evaluation metrics - Classification

accuracy, balanced accuracy
recall
precision
f1-Score
specificity
area under ROC

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_excel('churn_data.xlsx')
df.head()

Unnamed: 0,customerID,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,45,No,One year,No,Bank transfer,42.3,1840.75,No
4,9237-HQITU,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"],errors='coerce')

In [4]:
df.dtypes

customerID           object
tenure                int64
PhoneService         object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [5]:
# are there any null? 
df.isnull().any()

customerID          False
tenure              False
PhoneService        False
Contract            False
PaperlessBilling    False
PaymentMethod       False
MonthlyCharges      False
TotalCharges         True
Churn               False
dtype: bool

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7042 entries, 0 to 7041
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7042 non-null   object 
 1   tenure            7042 non-null   int64  
 2   PhoneService      7042 non-null   object 
 3   Contract          7042 non-null   object 
 4   PaperlessBilling  7042 non-null   object 
 5   PaymentMethod     7042 non-null   object 
 6   MonthlyCharges    7042 non-null   float64
 7   TotalCharges      7031 non-null   float64
 8   Churn             7042 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 495.3+ KB


In [7]:
#Drop nulls

df= df.dropna()

# Train-Test Split

In [8]:
cat_vars = ['PhoneService',"Contract","PaperlessBilling","PaymentMethod"]
num_vars = ["tenure","MonthlyCharges","TotalCharges"]
modeling_vars = cat_vars + num_vars

In [9]:
X = df.filter(modeling_vars).copy()
y = df['Churn'].copy()
y = pd.DataFrame(y) # better to manipulate

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

le.fit(y.Churn) 
y.Churn = le.transform(y.Churn)

In [11]:
X = pd.get_dummies(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Feature Engineering -  Decision Tree

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

arvore = DecisionTreeClassifier()

parameters = {
    "max_depth":[1,2,3,4,5,6,7,8,9,10]
}

grid_search = GridSearchCV(arvore, parameters, scoring="roc_auc", cv=5, n_jobs=-1)

In [14]:
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             scoring='roc_auc')

In [15]:
results = pd.DataFrame(grid_search.cv_results_)

In [16]:
# Classified by rank_test_score
results.sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.021219,0.003703,0.00802,0.002112,4,{'max_depth': 4},0.826964,0.829731,0.832914,0.816734,0.806075,0.822483,0.009836,1
4,0.021402,0.003445,0.008205,0.000397,5,{'max_depth': 5},0.816209,0.828215,0.827852,0.821989,0.803665,0.819586,0.009093,2
5,0.023176,0.001988,0.011519,0.001347,6,{'max_depth': 6},0.812136,0.817685,0.83693,0.81147,0.804103,0.816465,0.011107,3
2,0.023204,0.008058,0.011544,0.00542,3,{'max_depth': 3},0.817728,0.818505,0.831114,0.807923,0.795237,0.814101,0.011967,4
6,0.023513,0.0024,0.008002,0.001412,7,{'max_depth': 7},0.793308,0.804975,0.826117,0.81147,0.78542,0.804258,0.014178,5
7,0.027409,0.004027,0.009395,0.002415,8,{'max_depth': 8},0.771399,0.795889,0.816179,0.789998,0.769194,0.788532,0.017252,6
1,0.024123,0.005586,0.012238,0.002718,2,{'max_depth': 2},0.773193,0.791156,0.786073,0.768403,0.755439,0.774853,0.012751,7
8,0.028613,0.004755,0.007021,0.000555,9,{'max_depth': 9},0.756468,0.773561,0.793618,0.756829,0.754408,0.766977,0.014997,8
9,0.026031,0.00266,0.006313,0.000661,10,{'max_depth': 10},0.733571,0.760977,0.764704,0.741159,0.729129,0.745908,0.014399,9
0,0.022945,0.003344,0.012517,0.003631,1,{'max_depth': 1},0.732952,0.733239,0.741631,0.714413,0.710664,0.72658,0.01194,10


# Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Instantiate the RandomForestClassifier
rf = RandomForestClassifier()

parameters = {
    "max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "n_estimators": [100, 300, 500]
}

grid_search = GridSearchCV(rf, parameters, scoring="roc_auc", cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)


  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'n_estimators': [100, 300, 500]},
             scoring='roc_auc')

In [18]:
# Classified by rank_test_score
results=pd.DataFrame(grid_search.cv_results_)
results.sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
16,1.676493,0.029764,0.094274,0.010204,6,300,"{'max_depth': 6, 'n_estimators': 300}",0.833017,0.846881,0.856898,0.833918,0.819343,0.838011,0.01285,1
19,1.641794,0.031005,0.110302,0.012722,7,300,"{'max_depth': 7, 'n_estimators': 300}",0.830741,0.847765,0.855673,0.834535,0.819055,0.837554,0.012886,2
17,2.658634,0.087015,0.157618,0.013124,6,500,"{'max_depth': 6, 'n_estimators': 500}",0.831399,0.847419,0.855351,0.833404,0.819595,0.837434,0.012583,3
20,2.644599,0.020536,0.156382,0.013589,7,500,"{'max_depth': 7, 'n_estimators': 500}",0.829725,0.847994,0.85391,0.834333,0.819149,0.837022,0.012534,4
15,0.585876,0.050994,0.039381,0.005574,6,100,"{'max_depth': 6, 'n_estimators': 100}",0.830644,0.846856,0.854357,0.833594,0.817906,0.836672,0.012763,5
18,0.616191,0.048939,0.050729,0.010044,7,100,"{'max_depth': 7, 'n_estimators': 100}",0.829855,0.846138,0.853229,0.832825,0.820807,0.836571,0.011633,6
23,2.776868,0.062262,0.180907,0.016341,8,500,"{'max_depth': 8, 'n_estimators': 500}",0.828,0.848344,0.851383,0.834009,0.819909,0.836329,0.011961,7
22,1.680121,0.048793,0.112727,0.021388,8,300,"{'max_depth': 8, 'n_estimators': 300}",0.828146,0.84783,0.852401,0.832654,0.819505,0.836107,0.012279,8
13,1.76116,0.132782,0.091921,0.004391,5,300,"{'max_depth': 5, 'n_estimators': 300}",0.8298,0.844617,0.855422,0.833278,0.816895,0.836002,0.013133,9
14,2.668133,0.18491,0.153994,0.017545,5,500,"{'max_depth': 5, 'n_estimators': 500}",0.829496,0.845105,0.85557,0.832654,0.816739,0.835913,0.013344,10


In [19]:
grid_search.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 6,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 300,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

# Randomized Search

In [20]:
!pip install lightgbm




[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier

# Define the parameter grid
parameters = {
    "learning_rate": [0.001, 0.01],
    "num_leaves": [2, 128],
    "min_child_samples": [1, 100],
    "subsample": [0.05, 1],
    "colsample_bytree": [0.1, 1.0]
}

# Instantiate the LightGBM classifier
lgbm = LGBMClassifier()

# Perform randomized hyperparameter search
random_search = RandomizedSearchCV(lgbm, parameters, scoring="roc_auc", cv=5, n_iter=5, n_jobs=-1)

# Fit the randomized search to your training data
random_search.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Number of positive: 1495, number of negative: 4129
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000225 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 605
[LightGBM] [Info] Number of data points in the train set: 5624, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265825 -> initscore=-1.015909
[LightGBM] [Info] Start training from score -1.015909


RandomizedSearchCV(cv=5, estimator=LGBMClassifier(), n_iter=5, n_jobs=-1,
                   param_distributions={'colsample_bytree': [0.1, 1.0],
                                        'learning_rate': [0.001, 0.01],
                                        'min_child_samples': [1, 100],
                                        'num_leaves': [2, 128],
                                        'subsample': [0.05, 1]},
                   scoring='roc_auc')

In [25]:
results = pd.DataFrame(random_search.cv_results_)
results.sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_num_leaves,param_min_child_samples,param_learning_rate,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.581486,0.04836,0.006602,0.00049,1.0,128,100,0.001,1.0,"{'subsample': 1, 'num_leaves': 128, 'min_child...",0.833782,0.842291,0.840584,0.827059,0.809705,0.830684,0.011795,1
0,0.607919,0.074093,0.013311,0.006611,1.0,2,1,0.01,0.1,"{'subsample': 1, 'num_leaves': 2, 'min_child_s...",0.810144,0.827988,0.841384,0.81244,0.797722,0.817936,0.015168,2
2,0.547869,0.003959,0.007801,0.000398,0.05,2,100,0.01,0.1,"{'subsample': 0.05, 'num_leaves': 2, 'min_chil...",0.810144,0.827988,0.841384,0.81244,0.797722,0.817936,0.015168,2
1,0.46729,0.055232,0.007604,0.000488,0.05,2,100,0.01,1.0,"{'subsample': 0.05, 'num_leaves': 2, 'min_chil...",0.798963,0.819135,0.846622,0.788767,0.788963,0.80849,0.022038,4
3,0.319969,0.08622,0.007599,0.002801,1.0,2,100,0.01,1.0,"{'subsample': 1, 'num_leaves': 2, 'min_child_s...",0.798963,0.819135,0.846622,0.788767,0.788963,0.80849,0.022038,4
