In [110]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.externals import joblib
from sklearn.linear_model import LassoCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
pth = './datafiles/'
inputfile = 'data'
colfile = 'column.names.txt'

In [3]:
df1 = pd.read_csv(os.path.join(pth, inputfile), header=None)
df2 = pd.read_csv(os.path.join(pth, colfile), sep=":", skiprows=0)
df2 = df2.reset_index()
df2.columns = ['variable', 'type']

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df1.columns = df2['variable'].tolist() + ['ad']

In [5]:
for col in df1.columns:
    if df1[col].dtype not in (float, int) and col!='ad':
        df1.ix[df1[col].str.strip()=='?', col] = np.nan
        df1[col] = df1[col].astype(float)

In [29]:
rans = np.random.rand(df1.shape[0]) 
trainfilt = rans <= 0.33
validfilt = (rans > 0.33) & (rans <= 0.66)
testfilt = rans > 0.66

In [7]:
for col in df1.columns:
    if df1[col].dtype in (float, int) and col!='ad':
        df1[col+'missing'] = df1[col].isnull().astype(int)
        if df1[col].nunique() ==2:
            df1[col] = df1[col].fillna(0)
        else:
            meanval = df1.ix[df1[col].notnull() & trainfilt, col].mean()
            df1[col] = df1[col].fillna(meanval)

In [44]:
ys = (df1['ad'].str.strip() == 'ad.' ).astype(int).values
xcols = [col for col in df1.columns if col !='ad']
X_train, X_valid, X_test = df1.ix[trainfilt, xcols].values, df1.ix[validfilt, xcols].values, df1.ix[testfilt, xcols].values
y_train, y_valid, y_test = ys[trainfilt], ys[validfilt], ys[testfilt]

In [95]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid  = xgb.DMatrix(X_valid, label=y_valid)
dtest  = xgb.DMatrix(X_test, label=y_test)

In [96]:
param = {'max_depth':2, 'eta': 1, 'silent':1, 'objective':'binary:logistic'}
model = xgb.train(param, dtrain, num_boost_round=10)

xgbpred = model.predict(dtrain)
xgbproba = 1/ (1. + np.exp( - model.predict(dtrain) ))

xgbpredvalid = model.predict(dvalid)
xgbprobavalid = 1/ (1. + np.exp( - model.predict(dvalid) ))

In [47]:
glmmodel  = LassoCV()
glmmodel.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [91]:
glmpred = glmmodel.predict(X_train)
glmproba = 1/ (1. + np.exp( - glmpred ))

glmpredvalid = glmmodel.predict(X_valid)
glmprobavalid = 1/ (1. + np.exp( - glmpredvalid ))

In [49]:
nbmodel = BernoulliNB() 
nbmodel.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [102]:
nbclass = nbmodel.predict(X_train)  
nbclassvalid = nbmodel.predict(X_valid)

nbproba = nbmodel.predict_proba(X_train)[:,1]
nbprobavalid = nbmodel.predict_proba(X_valid)[:,1]
nbprobatest = nbmodel.predict_proba(X_test)[:,1]

In [98]:
print("Train LASSO Logistic AUC = %.6f" % roc_auc_score(y_train, glmproba))
print("Valid LASSO Logistic AUC = %.6f" % roc_auc_score(y_valid, glmprobavalid))

print("Train XGBoost AUC = %.6f" % roc_auc_score(y_train, xgbproba))
print("Valid XGBoost AUC = %.6f" % roc_auc_score(y_valid, xgbprobavalid))

print("Train Naive Bayes AUC = %.6f" % roc_auc_score(y_train, nbproba))
print("Valid Naive Bayes AUC = %.6f" % roc_auc_score(y_valid, nbprobavalid))

Train LASSO Logistic AUC = 0.883916
Valid LASSO Logistic AUC = 0.889502
Train XGBoost AUC = 0.978075
Valid XGBoost AUC = 0.966410
Train Naive Bayes AUC = 0.968103
Valid Naive Bayes AUC = 0.959572


In [114]:
# Confusion Matrix
print("Training Confusion Matrix")
print(confusion_matrix(y_train, nbclass))
print("Validation Confusion Matrix")
print(confusion_matrix(y_valid, nbclassvalid))

Training Confusion Matrix
[[930   0]
 [ 52 103]]
Validation Confusion Matrix
[[926   2]
 [ 45  98]]


In [115]:
# Confusion Matrix
print("Training Confusion Matrix")
print(confusion_matrix(y_train, glmproba >= 0.5))
print("Validation Confusion Matrix")
print(confusion_matrix(y_valid, glmprobavalid >= 0.5))

Training Confusion Matrix
[[220 710]
 [  2 153]]
Validation Confusion Matrix
[[207 721]
 [  4 139]]


In [116]:
# Confusion Matrix
print("Training Confusion Matrix")
print(confusion_matrix(y_train, xgbproba >= 0.5))
print("Validation Confusion Matrix")
print(confusion_matrix(y_valid, xgbprobavalid >= 0.5))

Training Confusion Matrix
[[  0 930]
 [  0 155]]
Validation Confusion Matrix
[[  0 928]
 [  0 143]]


In [104]:
# Choosing the model with the best performance on the validation data

print("Train Naive Bayes AUC = %.6f" % roc_auc_score(y_train, nbproba))
print("Valid Naive Bayes AUC = %.6f" % roc_auc_score(y_valid, nbprobavalid))
print("Test Naive Bayes AUC = %.6f" % roc_auc_score(y_test, nbprobatest))

Train Naive Bayes AUC = 0.968103
Valid Naive Bayes AUC = 0.959572
Test Naive Bayes AUC = 0.947379


In [118]:
# Confusion Matrix
print("Training Confusion Matrix")
print(confusion_matrix(y_train, nbclass))
print("Validation Confusion Matrix")
print(confusion_matrix(y_valid, nbclassvalid))
print("Test Confusion Matrix")
print(confusion_matrix(y_test, nbclasstest))

Training Confusion Matrix
[[930   0]
 [ 52 103]]
Validation Confusion Matrix
[[926   2]
 [ 45  98]]
Test Confusion Matrix
[[958   4]
 [ 58 103]]


In [107]:
joblib.dump(nbmodel, 'ad_classifier.pkl') 

['ad_classifier.pkl']