In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv('../../data/telecom_churn.csv')

In [3]:
train_df.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
train_df.drop(['State', 'Voice mail plan'], axis=1, inplace=True)

In [5]:
train_df['International plan'] = \
    pd.factorize(train_df['International plan'])[0]

In [6]:
y = train_df['Churn'].astype('int')

In [7]:
train_df.drop(['Churn'], axis=1, inplace=True)

In [8]:
from xgboost import XGBClassifier

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_holdout, y_train, y_holdout = train_test_split(train_df, y, 
                                                          test_size=0.3,
                                                         random_state=7)

In [11]:
clf = XGBClassifier()

In [12]:
%%time
clf.fit(X_train, y_train)

CPU times: user 303 ms, sys: 83.4 ms, total: 386 ms
Wall time: 286 ms


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [13]:
from sklearn.metrics import roc_auc_score

In [14]:
roc_auc_score(y_holdout, clf.predict_proba(X_holdout)[:, 1])

0.92842095515892298

In [15]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [16]:
xgb_params = {'n_estimators': [30],
              'max_depth': [2, 4, 6, 8],
              'gamma': [0, .2, .4],
              'colsample_bytree': [.5, .75, 1]}

In [17]:
xgb_grid = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=xgb_params,
    cv=3,
    scoring='accuracy',
    verbose=True
)

In [18]:
xgb_grid.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:   10.0s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [30], 'gamma': [0, 0.2, 0.4], 'max_depth': [2, 4, 6, 8], 'colsample_bytree': [0.5, 0.75, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=True)

In [19]:
xgb_grid.best_params_

{'colsample_bytree': 1, 'gamma': 0.2, 'max_depth': 6, 'n_estimators': 30}

In [20]:
xgb_grid.best_score_

0.94642091727389632

In [21]:
xgb_params2 = {'n_estimators': [150],
              'max_depth': [2, 4, 6, 8],
              'gamma': [0, .2, .4],
              'colsample_bytree': [.5, .75, 1]}

In [22]:
clf2 = XGBClassifier(colsample_bytree=1, gamma=0.2, max_depth=6, 
                     n_estimators=300).fit(X_train, y_train)

In [23]:
roc_auc_score(y_holdout, clf2.predict_proba(X_holdout)[:, 1])

0.93525784969635239