In [1]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn
from sklearn.externals import joblib
from sklearn.linear_model import LassoCV
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

%matplotlib inline



In [2]:
pth = './datafiles/'
inputfile = 'data'
colfile = 'column.names.txt'

In [3]:
df1 = pd.read_csv(os.path.join(pth, inputfile), header=None)
df2 = pd.read_csv(os.path.join(pth, colfile), sep=":", skiprows=0)
df2 = df2.reset_index()
df2.columns = ['variable', 'type']

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df1.columns = df2['variable'].tolist() + ['ad']

In [5]:
for col in df1.columns:
    if df1[col].dtype not in (float, int) and col!='ad':
        df1.ix[df1[col].str.strip()=='?', col] = np.nan
        df1[col] = df1[col].astype(float)

In [6]:
rans = np.random.rand(df1.shape[0]) 
trainfilt = rans <= 0.33
validfilt = (rans > 0.33) & (rans <= 0.66)
testfilt = rans > 0.66

In [7]:
for col in df1.columns:
    if df1[col].dtype in (float, int) and col!='ad':
        df1[col+'missing'] = df1[col].isnull().astype(int)
        if df1[col].nunique() ==2:
            df1[col] = df1[col].fillna(0)
        else:
            meanval = df1.ix[df1[col].notnull() & trainfilt, col].mean()
            df1[col] = df1[col].fillna(meanval)

meanvals = df1[trainfilt].describe()
meanvals.to_csv('./inputmeanvalues.csv', index=False)

In [13]:
meanvals.to_csv('./inputmeanvalues.csv')

In [16]:
meanvals = pd.read_csv('./inputmeanvalues.csv', index_col=0)

In [18]:
meanvals.head()

Unnamed: 0,height,width,aratio,local,url*images+buttons,url*likesbooks.com,url*www.slake.com,url*hydrogeologist,url*oso,url*media,...,caption*ofmissing,caption*homemissing,caption*mymissing,caption*yourmissing,caption*inmissing,caption*bytesmissing,caption*heremissing,caption*clickmissing,caption*formissing,caption*youmissing
count,1074.0,1074.0,1074.0,1074.0,1074.0,1074.0,1074.0,1074.0,1074.0,1074.0,...,1074,1074,1074,1074,1074,1074,1074,1074,1074,1074
mean,65.320051,155.861716,3.922461,0.772812,0.006518,0.007449,0.004655,0.002793,0.004655,0.012104,...,0,0,0,0,0,0,0,0,0,0
std,45.197272,108.099162,5.010114,0.41921,0.080506,0.086024,0.068104,0.052802,0.068104,0.109403,...,0,0,0,0,0,0,0,0,0,0
min,10.0,10.0,0.2142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
25%,33.0,94.0,1.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
ys = (df1['ad'].str.strip() == 'ad.' ).astype(int).values
xcols = [col for col in df1.columns if col !='ad']
X_train, X_valid, X_test = df1.ix[trainfilt, xcols].values, df1.ix[validfilt, xcols].values, df1.ix[testfilt, xcols].values
y_train, y_valid, y_test = ys[trainfilt], ys[validfilt], ys[testfilt]

In [9]:
xgbmodel = GridSearchCV(xgb.XGBClassifier(),
                   {'max_depth': [1, 2, 4, 6],
                    'n_estimators': [10, 50, 100]}, verbose=1)
xgbmodel.fit(X_train, y_train)
print(xgbmodel.best_score_)
print(xgbmodel.best_params_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   25.7s finished


0.967619047619
{'n_estimators': 100, 'max_depth': 6}


In [10]:
xgbpred = xgbmodel.predict(X_train)
xgbproba = 1/ (1. + np.exp( - xgbmodel.predict(X_train) ))

xgbpredvalid = xgbmodel.predict(X_valid)
xgbprobavalid = 1/ (1. + np.exp( - xgbmodel.predict(X_valid) ))

In [11]:
glmmodel  = LassoCV()
glmmodel.fit(X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [12]:
glmpred = glmmodel.predict(X_train)
glmproba = 1/ (1. + np.exp( - glmpred ))

glmpredvalid = glmmodel.predict(X_valid)
glmprobavalid = 1/ (1. + np.exp( - glmpredvalid ))

In [9]:
nbmodel = BernoulliNB() 
nbmodel.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [10]:
nbclass = nbmodel.predict(X_train)  
nbclassvalid = nbmodel.predict(X_valid)
nbclasstest = nbmodel.predict(X_test)

nbproba = nbmodel.predict_proba(X_train)[:,1]
nbprobavalid = nbmodel.predict_proba(X_valid)[:,1]
nbprobatest = nbmodel.predict_proba(X_test)[:,1]

In [15]:
print("Train LASSO Logistic AUC = %.6f" % roc_auc_score(y_train, glmproba))
print("Valid LASSO Logistic AUC = %.6f" % roc_auc_score(y_valid, glmprobavalid))

print("Train XGBoost AUC = %.6f" % roc_auc_score(y_train, xgbproba))
print("Valid XGBoost AUC = %.6f" % roc_auc_score(y_valid, xgbprobavalid))

print("Train Naive Bayes AUC = %.6f" % roc_auc_score(y_train, nbproba))
print("Valid Naive Bayes AUC = %.6f" % roc_auc_score(y_valid, nbprobavalid))

Train LASSO Logistic AUC = 0.891316
Valid LASSO Logistic AUC = 0.862454
Train XGBoost AUC = 0.964184
Valid XGBoost AUC = 0.924961
Train Naive Bayes AUC = 0.968006
Valid Naive Bayes AUC = 0.943938


In [16]:
# Confusion Matrix
print("Training Confusion Matrix")
print(confusion_matrix(y_train, nbclass))
print("Validation Confusion Matrix")
print(confusion_matrix(y_valid, nbclassvalid))

Training Confusion Matrix
[[894   0]
 [ 53 103]]
Validation Confusion Matrix
[[929   3]
 [ 56 102]]


In [17]:
# Confusion Matrix
print("Training Confusion Matrix")
print(confusion_matrix(y_train, glmproba >= 0.5))
print("Validation Confusion Matrix")
print(confusion_matrix(y_valid, glmprobavalid >= 0.5))

Training Confusion Matrix
[[202 692]
 [  3 153]]
Validation Confusion Matrix
[[212 720]
 [  6 152]]


In [18]:
# Confusion Matrix
print("Training Confusion Matrix")
print(confusion_matrix(y_train, xgbproba >= 0.5))
print("Validation Confusion Matrix")
print(confusion_matrix(y_valid, xgbprobavalid >= 0.5))

Training Confusion Matrix
[[  0 894]
 [  0 156]]
Validation Confusion Matrix
[[  0 932]
 [  0 158]]


In [11]:
# Choosing the model with the best performance on the validation data, which happens to be the Naive Bayes

print("Train Naive Bayes AUC = %.6f" % roc_auc_score(y_train, nbproba))
print("Valid Naive Bayes AUC = %.6f" % roc_auc_score(y_valid, nbprobavalid))
print("Test Naive Bayes AUC = %.6f" % roc_auc_score(y_test, nbprobatest))

Train Naive Bayes AUC = 0.954917
Valid Naive Bayes AUC = 0.961046
Test Naive Bayes AUC = 0.954096


In [22]:
# Confusion Matrix
print("Training Confusion Matrix")
print(confusion_matrix(y_train, nbclass))
print("Validation Confusion Matrix")
print(confusion_matrix(y_valid, nbclassvalid))
print("Test Confusion Matrix")
print(confusion_matrix(y_test, nbclasstest))

Training Confusion Matrix
[[894   0]
 [ 53 103]]
Validation Confusion Matrix
[[929   3]
 [ 56 102]]
Test Confusion Matrix
[[992   2]
 [ 45 100]]


In [23]:
joblib.dump(nbmodel, 'ad_classifier.pkl') 

['ad_classifier.pkl']