In [119]:
import pandas as pd
import xgboost as xgb
from collections import Counter
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
import warnings
from sklearn.ensemble import RandomForestClassifier

In [108]:
data = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.csv', index_col='customerID')

In [109]:
data.head(3)

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [110]:
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
target = ['Churn']
cat_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 
                        'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                       'Contract', 'PaperlessBilling', 'PaymentMethod']
cat_features_processed = ['SeniorCitizen']

In [111]:
imp = SimpleImputer(missing_values=-1, strategy='mean')

In [112]:
X = pd.get_dummies(data.drop(target, axis=1), columns=cat_features)

In [113]:
# note -1 im reshape; this means infer dimension from the length of the array
X['TotalCharges'] = imp.fit_transform(X['TotalCharges'] \
                            .apply(lambda x: float(str(x)) if str(x).strip() else -1) \
                            .values.reshape(-1,1))

In [114]:
y = data[target]['Churn'].apply(lambda x: 1 if x == "Yes" else 0 if x == 'No' else None)

In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4232)

In [129]:
# set up a warning filter to suppress annoying future warnings
with warnings.catch_warnings():
    warnings.filterwarnings('ignore',category=FutureWarning)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    
    param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
    param['nthread'] = 4
    param['eval_metric'] = 'auc'
    
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    
    num_round = 16
    bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=3)
    
    bst.dump_model(fout='dump.raw.json', dump_format='json')

[0]	eval-auc:0.793749	train-auc:0.78886
Multiple eval metrics have been passed: 'train-auc' will be used for early stopping.

Will train until train-auc hasn't improved in 3 rounds.
[1]	eval-auc:0.828415	train-auc:0.825413
[2]	eval-auc:0.834881	train-auc:0.833907
[3]	eval-auc:0.83703	train-auc:0.840716
[4]	eval-auc:0.841446	train-auc:0.845377
[5]	eval-auc:0.842991	train-auc:0.850008
[6]	eval-auc:0.840912	train-auc:0.851972
[7]	eval-auc:0.839741	train-auc:0.853403
[8]	eval-auc:0.840938	train-auc:0.85542
[9]	eval-auc:0.84117	train-auc:0.857284
[10]	eval-auc:0.841492	train-auc:0.85833
[11]	eval-auc:0.843172	train-auc:0.859596
[12]	eval-auc:0.843012	train-auc:0.860528
[13]	eval-auc:0.840828	train-auc:0.8619
[14]	eval-auc:0.840358	train-auc:0.862554
[15]	eval-auc:0.840057	train-auc:0.863628


In [120]:
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)

In [121]:
scores = cross_val_score(clf, X, y, cv=5)

In [122]:
scores

array([0.77004968, 0.77927608, 0.761533  , 0.78424414, 0.7938877 ])