# Problem
Using a dataset of past advertisements on the Internet, can we accurately predict what image will be an advertisement based on attributes of that image?

# Project
The features encode the geometry of the image (if available) as well as phrases occurring in the URL, 
the image's URL and alt text, the anchor text, and words occurring near the anchor text.

Number of Instances: 3,279 (2,821 non ads, 458 ads)
Number of Attributes: 1,558 (3 continous; others binary)

28% of instances are missing some of the continuous attributes.

Missing values should be interpreted as "unknown"

Class Distribution- number of instances per class: 2,821 non ads, 458 ads.

The task is to predict whether an image is an advertisement ("ad") or not ("non ad").

Deliverables

Please send us the following:

Code, and associated files, used for the project. You can send us a zipfile, or upload the project to a public github repo.

The algorithm you developed to make your predictions

How we can run the algorithm on a test data set

The process you used to analyze the data and came to your conclusions

In [77]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn
from sklearn.linear_model import LassoCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
pth = './datafiles/'
inputfile = 'data'
colfile = 'column.names.txt'

In [31]:
df1 = pd.read_csv(os.path.join(pth, inputfile), header=None)
df2 = pd.read_csv(os.path.join(pth, colfile), sep=":", skiprows=0)
df2 = df2.reset_index()
df2.columns = ['variable', 'type']

In [33]:
df1.columns = df2['variable'].tolist() + ['ad']

In [44]:
for col in df1.columns:
    if df1[col].dtype not in (float, int) and col!='ad':
        df1.ix[df1[col].str.strip()=='?', col] = np.nan
        df1[col] = df1[col].astype(float)

In [190]:
rans = np.random.rand(df1.shape[0]) 
trainfilt = rans <= 0.33
validfilt = (rans > 0.33) & (rans <= 0.66)
testfilt = rans > 0.66

In [191]:
for col in df1.columns:
    if df1[col].dtype in (float, int) and col!='ad':
        df1[col+'missing'] = df1[col].isnull().astype(int)
        if df1[col].nunique() ==2:
            df1[col] = df1[col].fillna(0)
        else:
            meanval = df1.ix[df1[col].notnull() & trainfilt, col].mean()
            df1[col] = df1[col].fillna(meanval)

In [207]:
ys = (df1['ad']=='ad').astype(int).values
xcols = [col for col in df1.columns if col !='ad']
X_train, X_valid, X_test = df1.ix[trainfilt, xcols].values, df1.ix[validfilt, xcols].values, df1.ix[testfilt, xcols].values
y_train, y_valid, y_test = ys[trainfilt], ys[validfilt], ys[testfilt]

In [208]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid  = xgb.DMatrix(X_test, label=y_valid)
dtest  = xgb.DMatrix(X_test, label=y_test)

In [167]:
param = {'max_depth':2, 'eta': 1, 'silent':1, 'objective':'binary:logistic'}
model = xgb.train(param, dtrain, num_boost_round=10)

xgbpred = model.predict(dtrain)
xgbproba = 1/ (1. + np.exp( - model.predict(dtrain) ))

xgbpredtest = model.predict(dtest)
xgbprobatest = 1/ (1. + np.exp( - model.predict(dtest) ))

In [172]:
glmmodel  = LassoCV()
glmmodel.fit(X_train, y_train)

In [174]:
glmpred = model.predict(X_train)
glmproba = 1/ (1. + np.exp( - glmpred ))

glmpredtest = model.predict(X_test)
glmprobatest = 1/ (1. + np.exp( - glmpredtest ))

In [175]:
# Confusion Matrix
print(pd.crosstab(y_train, glmproba >= .5))

print("AUC = %.3f" % roc_auc_score(y_train, proba))

col_0  False  True 
row_0              
0        263   1195
1         10    209
AUC = 0.870


In [176]:
fullpred  = (xgbpred + pred)/2.
fullproba = 1/ (1. + np.exp( - fullpred ))

In [177]:
# Confusion Matrix
print(pd.crosstab(y_train, fullproba >= .5))

print("AUC = %.3f" % roc_auc_score(y_train, fullproba))

col_0  False  True 
row_0              
0        166   1292
1          0    219
AUC = 0.960


In [160]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(X_train, y_train)
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [178]:
nbclass = clf.predict(X_train)  
nbclasstest = clf.predict(X_test)

In [182]:
# Confusion Matrix
print(pd.crosstab(y_train, nbclass))
print(pd.crosstab(y_test, nbclasstest ))

col_0     0    1
row_0           
0      1451    7
1        57  162
col_0     0    1
row_0           
0      1357    5
1        54  186


In [184]:
glmprobatest

array([ 0.70922805,  0.64847951,  0.64847951, ...,  0.48652719,
        0.52276597,  0.52730148])

In [185]:
# Confusion Matrix
print(pd.crosstab(y_train, glmproba >= 0.5))
print(pd.crosstab(y_test, glmprobatest >= 0.5 ))

col_0  False  True 
row_0              
0        263   1195
1         10    209
col_0  False  True 
row_0              
0        258   1104
1          2    238


In [186]:
# Confusion Matrix
print(pd.crosstab(y_train, xgbproba >= 0.5))
print(pd.crosstab(y_test, xgbprobatest >= 0.5 ))

col_0  True
row_0      
0      1458
1       219
col_0  True
row_0      
0      1362
1       240
