In [1]:
import os
import numpy as np
import scipy as sp
import matplotlib as mpl
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [2]:
url = "http://www.dataminingconsultant.com/data/churn.txt"
data = pd.read_csv(url)

In [3]:
data.shape

(3333, 21)

In [4]:
data.apply(lambda x: sum(x.isnull()))

State             0
Account Length    0
Area Code         0
Phone             0
Int'l Plan        0
VMail Plan        0
VMail Message     0
Day Mins          0
Day Calls         0
Day Charge        0
Eve Mins          0
Eve Calls         0
Eve Charge        0
Night Mins        0
Night Calls       0
Night Charge      0
Intl Mins         0
Intl Calls        0
Intl Charge       0
CustServ Calls    0
Churn?            0
dtype: int64

In [5]:
data.drop('Phone', axis = 1, inplace = True)

In [6]:
data['Churn?'].value_counts()

False.    2850
True.      483
Name: Churn?, dtype: int64

In [7]:
cols = [col for col in data]
cols[3] = 'Intl Plan'
data.columns = cols

In [8]:
cols_to_encode = ['State','Area Code','Intl Plan','VMail Plan']
data = pd.get_dummies(data, columns = cols_to_encode)                   

In [9]:
data.shape

(3333, 74)

In [10]:
data['Churn'] = data['Churn?'].apply(lambda x: 1 if x == 'True.' else 0)

In [11]:
data.drop('Churn?', axis = 1, inplace = True)

In [12]:
y = data.pop('Churn')
X = data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [13]:
print (X_train.shape)
print (X_test.shape)

(2666, 73)
(667, 73)


In [15]:
print(y_train.mean())
print(y_test.mean())

0.147411852963
0.134932533733


In [16]:
xgtrain = xgb.DMatrix(X_train, label=y_train)
xgtest = xgb.DMatrix(X_test)

In [46]:
params = {}
params["objective"] = "binary:logistic"
params["eta"] = 0.01
params["min_child_weight"] = 10
params["subsample"] = 0.6
params["scale_pos_weight"] = 1.0
params["silent"] = 1
params["max_depth"] = 4
params["eval_metric"] = 'auc'
params["lambda"] = 10

In [47]:
plst = list(params.items())
num_rounds = 1000
model = xgb.train(plst, xgtrain, num_rounds)

In [48]:
train_pred = model.predict(xgtrain)

In [49]:
train_pred

array([ 0.07406547,  0.17701364,  0.07374127, ...,  0.01523155,
        0.07288183,  0.02742501], dtype=float32)

In [50]:
train_pred = train_pred >= 0.5
train_pred = train_pred.astype(int)

In [51]:
test_pred = model.predict(xgtest)
test_pred = test_pred >= 0.5
test_pred = test_pred.astype(int)

In [52]:
print (accuracy_score(y_train, train_pred))
print (accuracy_score(y_test, test_pred))

0.960615153788
0.943028485757


In [53]:
print (roc_auc_score(y_train, train_pred))
print (roc_auc_score(y_test, test_pred))

0.88640574327
0.826400924321


In [54]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [26]:
regmodel = logreg.fit(X_train, y_train)
train_regpred = regmodel.predict(X_train)
test_regpred = regmodel.predict(X_test)

In [27]:
print (accuracy_score(y_train, train_regpred))
print (accuracy_score(y_test, test_regpred))

0.868717179295
0.857571214393


In [28]:
print (roc_auc_score(y_train, train_regpred))
print (roc_auc_score(y_test, test_regpred))

0.627211221395
0.562288930582


In [55]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, train_pred)

array([[2254,   19],
       [  86,  307]])

In [56]:
confusion_matrix(y_test, test_pred)

array([[569,   8],
       [ 30,  60]])