In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix


seed = 1337
df = pd.read_csv('input/telco_customer_churn.csv')

# Drop customerID, since it is not related to customer churn.
df.drop(['customerID', ], axis=1, inplace=True) # axis=0 for rows, axis=1 for columns

# Remove white space in columns for later plotting of tree.
df.replace(' ', '_', regex=True, inplace=True)

# Look into the dtype == object columns
for col in df.select_dtypes(include='object'):
    if len(col) < 7:
        tabs = '\t\t\t'
    elif len(col) < 15:
        tabs = '\t\t'
    else:
        tabs = '\t'
    # print(f'{col}:{tabs}{df[col].unique()}')

df.loc[(df['TotalCharges'] == '_'), 'TotalCharges'] = 0.0
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

# Let's convert all 'Yes'/'No' columns to 1/0 instead.
cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for col in cols:
    df[col] = pd.Series(np.where(df[col].values == 'Yes', 1, 0), df.index)
    df[col] = pd.to_numeric(df[col])

# Convert Male / Female to 1 / 0 similar to above.
df['gender'] = pd.Series(np.where(df['gender'].values == 'Male', 1, 0), df.index)
df['gender'] = pd.to_numeric(df['gender'])

# Divide the data into independent variables X and dependent variable y (Churn).
X = df.drop('Churn', axis=1).copy()
y = df['Churn'].copy()

# Convert to one-hot encoding as this is suitable for trees.
# We see that there are a couple of dtype == object columns that are not binary. These, we will one-hot encode.
cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']
X_encoded = pd.get_dummies(X, columns=cols)

# Check how many Churn == 1 samples there are compared to total.
print(f'Churn percentage: {round((sum(y) / len(y) * 100), 2)}%')

# Since we only have 26.54% of Churn == 1 samples, we need to use stratification
# when splitting out data into a training and a testing dataset. This way, we
# ensure that the ratio of 1 / 0 will be the same in the training and test datasets.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=seed, stratify=y)

print(f'Churn percentage train:\t{round((sum(y_train) / len(y_train) * 100), 2)}%')
print(f'Churn percentage test:\t{round((sum(y_test) / len(y_test) * 100), 2)}%\n')

# In sparse matrices, 0 is used for missing data. I explicitly state missing=None, even
# though that is the default. But if our missing data points were '?', we would have said
# missing='?'.
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', missing=None, seed=seed)

# For creating our forest of XGBoosted trees, run fit(). Set early_stopping so we don't
# need to wait if the model has stopped improving. Use Area under the PR Curve for evaluating.
clf_xgb.fit(X_train,
            y_train,
            verbose=True,
            early_stopping_rounds=10,
            eval_metric='aucpr',
            eval_set=[(X_test, y_test)])

Churn percentage: 26.54%
Churn percentage train:	26.54%
Churn percentage test:	26.52%

[0]	validation_0-aucpr:0.61852
Will train until validation_0-aucpr hasn't improved in 10 rounds.
[1]	validation_0-aucpr:0.63184
[2]	validation_0-aucpr:0.64216
[3]	validation_0-aucpr:0.64429
[4]	validation_0-aucpr:0.64326
[5]	validation_0-aucpr:0.64245
[6]	validation_0-aucpr:0.64733
[7]	validation_0-aucpr:0.64619
[8]	validation_0-aucpr:0.64595
[9]	validation_0-aucpr:0.64346
[10]	validation_0-aucpr:0.64230
[11]	validation_0-aucpr:0.64418
[12]	validation_0-aucpr:0.64327
[13]	validation_0-aucpr:0.64389
[14]	validation_0-aucpr:0.64245
[15]	validation_0-aucpr:0.64187
[16]	validation_0-aucpr:0.64169
Stopping. Best iteration:
[6]	validation_0-aucpr:0.64733



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              random_state=1337, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=1337, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [13]:
# For imbalanced datasets, the XGBoost manual says that one should balance the positive
# and negative weights via scale_pos_weight and that one should use AUC for evaluation.

# Use GridSearchCV() for optimizing the hyperparameters. Reduce total optimization time
# by dividing the optimization into several rounds.

# Optimization round 1
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.05],
    'gamma': [0, 0.25, 1.0],
    'reg_lambda': [0, 1.0, 10.0],
    'scale_pos_weight': [1, 3, 5]
}

# Use random 90% subset of the data and a random 50% subset of the features
# (i.e. independent variables / columns) per tree in the random forest. This
# is for speeding up the cross-validation and for preventing overfitting.
optimal_params = GridSearchCV(
    estimator=xgb.XGBClassifier(objective='binary:logistic',
                                seed=seed,
                                subsample=0.9,
                                colsample_bytree=0.5),
    param_grid=param_grid,
    scoring='roc_auc',
    verbose=2,
    n_jobs=10,
    cv=3
)

optimal_params.fit(X_train,
                   y_train,
                   early_stopping_rounds=10,
                   eval_metric='auc',
                   eval_set=[(X_test, y_test)],
                   verbose=1)
print(optimal_params.best_params_)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:    4.2s
[Parallel(n_jobs=10)]: Done 142 tasks      | elapsed:   16.4s
[Parallel(n_jobs=10)]: Done 345 tasks      | elapsed:   42.7s
[Parallel(n_jobs=10)]: Done 628 tasks      | elapsed:  1.3min


[0]	validation_0-auc:0.80109
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.81668
[2]	validation_0-auc:0.82163
[3]	validation_0-auc:0.82501
[4]	validation_0-auc:0.82622
[5]	validation_0-auc:0.82660
[6]	validation_0-auc:0.82591
[7]	validation_0-auc:0.83048
[8]	validation_0-auc:0.83109
[9]	validation_0-auc:0.83326
[10]	validation_0-auc:0.83260
[11]	validation_0-auc:0.83337
[12]	validation_0-auc:0.83443


[Parallel(n_jobs=10)]: Done 729 out of 729 | elapsed:  1.6min finished


[13]	validation_0-auc:0.83485
[14]	validation_0-auc:0.83493
[15]	validation_0-auc:0.83465
[16]	validation_0-auc:0.83484
[17]	validation_0-auc:0.83481
[18]	validation_0-auc:0.83434
[19]	validation_0-auc:0.83424
[20]	validation_0-auc:0.83397
[21]	validation_0-auc:0.83405
[22]	validation_0-auc:0.83429
[23]	validation_0-auc:0.83437
[24]	validation_0-auc:0.83435
Stopping. Best iteration:
[14]	validation_0-auc:0.83493

{'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'reg_lambda': 10.0, 'scale_pos_weight': 3}


In [14]:
# Round 1 yielded:
# {'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'reg_lambda': 10.0, 'scale_pos_weight': 3}

# Optimization round 2 - building on results of optimization round 1.
param_grid = {
    'max_depth': [3, 4],
    'learning_rate': [0.05, 0.1, 0.5],
    'gamma': [0, 0.25],
    'reg_lambda': [10.0, 20.0, 100],
    'scale_pos_weight': [3]
}

optimal_params = GridSearchCV(
    estimator=xgb.XGBClassifier(objective='binary:logistic',
                                seed=seed,
                                subsample=0.9,
                                colsample_bytree=0.5),
    param_grid=param_grid,
    scoring='roc_auc',
    verbose=2,
    n_jobs=10,
    cv=3
)

optimal_params.fit(X_train,
                   y_train,
                   early_stopping_rounds=10,
                   eval_metric='auc',
                   eval_set=[(X_test, y_test)],
                   verbose=1)
print(optimal_params.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:    2.9s


[0]	validation_0-auc:0.80109
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.81668
[2]	validation_0-auc:0.82163
[3]	validation_0-auc:0.82501
[4]	validation_0-auc:0.82622
[5]	validation_0-auc:0.82660
[6]	validation_0-auc:0.82591
[7]	validation_0-auc:0.83048
[8]	validation_0-auc:0.83109
[9]	validation_0-auc:0.83326
[10]	validation_0-auc:0.83260
[11]	validation_0-auc:0.83337
[12]	validation_0-auc:0.83443
[13]	validation_0-auc:0.83485


[Parallel(n_jobs=10)]: Done 108 out of 108 | elapsed:   13.2s finished


[14]	validation_0-auc:0.83493
[15]	validation_0-auc:0.83465
[16]	validation_0-auc:0.83484
[17]	validation_0-auc:0.83481
[18]	validation_0-auc:0.83434
[19]	validation_0-auc:0.83424
[20]	validation_0-auc:0.83397
[21]	validation_0-auc:0.83405
[22]	validation_0-auc:0.83429
[23]	validation_0-auc:0.83437
[24]	validation_0-auc:0.83435
Stopping. Best iteration:
[14]	validation_0-auc:0.83493

{'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'reg_lambda': 10.0, 'scale_pos_weight': 3}


In [None]:
# {'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'reg_lambda': 10.0, 'scale_pos_weight': 3}