In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the dataset from a CSV file located relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Telco_Customer_Churn_clean.csv')

#### Extract features and target and then split into train and test data

In [3]:
# The 'Churn' column is the prediction target; every remaining column is a feature.
y = df['Churn']
X = df.drop(columns=['Churn'])

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible across runs.
# NOTE(review): the split is not stratified. The notebook describes the data as
# imbalanced, so consider passing stratify=y to keep the class ratio equal in
# both partitions (this would change every score recorded below).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
import xgboost as xgb

#### Let us first fit an XGBoost model with no hyperparameter tuning to get baseline values for accuracy and ROC AUC. We set `eval_metric` to `auc` because the dataset is imbalanced, so accuracy is not a good choice.

In [6]:
# Baseline classifier: 1000 boosting rounds at a learning rate of 0.1, with every
# other hyperparameter left at the library default.
# NOTE(review): eval_metric is presumably only consulted when an eval_set is
# supplied to fit(); the baseline fit in the next cell passes none -- confirm
# whether it has any effect here.
xgb_model_1 = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=1000,
    learning_rate=0.1,
    verbosity=1,
)

In [7]:
# Train the baseline on the training split; verbose=False silences per-round logging.
xgb_model_1.fit(X=X_train, y=y_train, verbose=False)

In [8]:
# Predicted class labels for the held-out test rows.
y_pred = xgb_model_1.predict(X=X_test)

In [9]:
from sklearn.metrics import *

In [10]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7938659058487875

In [11]:
# Keep only column index 1 of the predicted probabilities
# (presumably the positive / churn class -- verify against xgb_model_1.classes_).
y_test_prob = xgb_model_1.predict_proba(X=X_test)[:, 1]

In [12]:
# Baseline ROC AUC on the test split, computed from probabilities rather than hard labels.
roc_auc_score(y_true=y_test, y_score=y_test_prob)

0.8209572883075265

#### So we got a baseline roc_auc_score = 0.820957. We will now do hyperparameter tuning to improve on this.

#### Parameters we will tune
- max depth
- learning rate
- gamma - the minimum loss reduction required to make a further split on a leaf node; larger values make the trees more conservative (less complex)
- subsample - what fraction of training data is randomly selected prior to growing a new tree

In [13]:
from scipy.stats import uniform

# Search space for the random search. List entries are sampled from directly;
# scipy's uniform(loc, scale) draws from the interval [loc, loc + scale].
param_grid = dict(
    max_depth=list(range(2, 8)),                  # integers 2..7
    gamma=uniform(loc=0.0, scale=3),              # [0.0, 3.0]
    learning_rate=uniform(loc=0.01, scale=0.5),   # [0.01, 0.51]
    subsample=uniform(loc=0.5, scale=0.5),        # [0.5, 1.0]
)

In [14]:
from sklearn.model_selection import ParameterSampler

# Number of random hyperparameter combinations to try.
n_iter = 1000

# Draw every combination up front (seeded, so the same candidates are produced
# on each run) and materialise them as a list of dicts.
param_list = [
    *ParameterSampler(param_grid, n_iter=n_iter, random_state=42)
]

In [15]:
# Inspect the first sampled combination: a dict mapping hyperparameter name -> value.
param_list[0]

{'gamma': 1.1236203565420875,
 'learning_rate': 0.4853571532049581,
 'max_depth': 4,
 'subsample': 0.8898455001363846}

In [16]:
# Model reused for every tuning trial. Up to 1000 boosting rounds are allowed,
# but training stops once the 'auc' evaluation metric has not improved for 30
# consecutive rounds on the evaluation data passed to fit().
xgb_model_2 = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=1000,
    early_stopping_rounds=30,
    verbosity=1,
)

In [17]:
# Random search: fit one model per sampled hyperparameter set and record the
# ROC AUC it reaches on the test split.
#
# NOTE(review): the test split is the last entry in eval_set, so it is the data
# that drives early stopping, and it is also the data the recorded AUC is
# computed on. Scores chosen this way are optimistically biased. For tuning,
# carve a validation split out of X_train (or use cross-validation) and keep
# X_test for a single final evaluation of the chosen model.
val_aucs = []
eval_set = [(X_train, y_train), (X_test, y_test)]
for counter, params in enumerate(param_list, start=1):
    # Apply this trial's hyperparameters to the shared estimator, then refit.
    xgb_model_2.set_params(**params)
    xgb_model_2.fit(X_train, y_train, eval_set=eval_set, verbose=False)
    # y_test_prob is reassigned on every trial, replacing the baseline model's
    # probabilities computed in an earlier cell.
    y_test_prob = xgb_model_2.predict_proba(X_test)[:, 1]
    val_aucs.append(roc_auc_score(y_test, y_test_prob))
    # Progress message every 50 trials.
    if counter % 50 == 0:
        print(f'Done with {counter} of {n_iter} iterations')


Done with 50 of 1000 iterations
Done with 100 of 1000 iterations
Done with 150 of 1000 iterations
Done with 200 of 1000 iterations
Done with 250 of 1000 iterations
Done with 300 of 1000 iterations
Done with 350 of 1000 iterations
Done with 400 of 1000 iterations
Done with 450 of 1000 iterations
Done with 500 of 1000 iterations
Done with 550 of 1000 iterations
Done with 600 of 1000 iterations
Done with 650 of 1000 iterations
Done with 700 of 1000 iterations
Done with 750 of 1000 iterations
Done with 800 of 1000 iterations
Done with 850 of 1000 iterations
Done with 900 of 1000 iterations
Done with 950 of 1000 iterations
Done with 1000 of 1000 iterations


In [18]:
# One row per sampled hyperparameter combination, one column per hyperparameter.
xgb_param_df = pd.DataFrame(data=param_list)
xgb_param_df.head(n=5)

Unnamed: 0,gamma,learning_rate,max_depth,subsample
0,1.12362,0.485357,4,0.889846
1,1.79055,0.232916,4,0.729624
2,1.001126,0.081433,4,0.510292
3,2.90973,0.426221,7,0.500389
4,2.976635,0.318741,3,0.762378


In [19]:
# Attach each trial's test ROC AUC as a new column; val_aucs is in the same
# order as param_list, so rows and scores line up positionally.
xgb_param_df = xgb_param_df.assign(**{'Test ROC_AUC': val_aucs})
xgb_param_df.head(n=5)

Unnamed: 0,gamma,learning_rate,max_depth,subsample,Test ROC_AUC
0,1.12362,0.485357,4,0.889846,0.849
1,1.79055,0.232916,4,0.729624,0.849006
2,1.001126,0.081433,4,0.510292,0.851173
3,2.90973,0.426221,7,0.500389,0.840606
4,2.976635,0.318741,3,0.762378,0.851181


In [20]:
# Highest test ROC AUC reached by any of the sampled combinations.
max_auc = xgb_param_df.loc[:, 'Test ROC_AUC'].max()
max_auc

0.8565494047567588

#### Parameters for best roc_auc_score

In [21]:
# Show the hyperparameters of the trial(s) that reached the best test ROC AUC.
# The score column is dropped by name rather than by position, so the cell keeps
# returning only hyperparameters even if more columns are later added to
# xgb_param_df.
xgb_param_df.loc[xgb_param_df['Test ROC_AUC'] == max_auc].drop(columns='Test ROC_AUC')

Unnamed: 0,gamma,learning_rate,max_depth,subsample
488,1.350373,0.123482,2,0.533326


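#### We have improved from our baseline roc_auc_score = 0.820957 to 0.856549

Note: the tuned score is the maximum over 1000 trials that were early-stopped and compared on the test set itself, so it is an optimistic estimate; confirm it on data that was not used for tuning.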