# Internet Advertisements Predictor
## Dataset Source
http://archive.ics.uci.edu/ml/datasets/Internet+Advertisements

### Import Libraries

In [99]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_svmlight_file
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.externals import joblib
import requests, zipfile, StringIO
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense
from keras.regularizers import l2, l1
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical

### Read Data

In [17]:
# Fix NumPy's global random seed so NumPy-driven randomness repeats across runs.
np.random.seed(1)

In [25]:
# Feature names: first column of the header-less names file, one name per row.
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# the escaped-backslash form only resolves on Windows.
column_names = pd.read_csv('data/ad.names.csv', header=None)[0]

In [26]:
# Sanity check: number of feature names that were loaded.
len(column_names)

1558

In [91]:
# Load the feature matrix: the leading columns of ad.data, one per feature name.
# - 'unknown' and the (space-padded) '?' tokens are parsed as NaN.
# - The column count comes from column_names rather than a hard-coded 1558, so
#   the selected columns and the names assigned below cannot drift apart.
# - Forward slashes keep the path valid outside Windows as well.
X = pd.read_csv('data/ad.data', header=None,
                na_values=['unknown', '?', '   ?', '     ?'],
                usecols=range(len(column_names)))
X.columns = column_names
X.head()

Unnamed: 0,height,width,aratio,local,url*images+buttons,url*likesbooks.com,url*www.slake.com,url*hydrogeologist,url*oso,url*media,...,caption*of,caption*home,caption*my,caption*your,caption*in,caption*bytes,caption*here,caption*click,caption*for,caption*you
0,125.0,125.0,1.0,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,57.0,468.0,8.2105,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,33.0,230.0,6.9696,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,60.0,468.0,7.8,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,60.0,468.0,7.8,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [161]:
# Sanity check: (rows, feature columns) of the loaded feature matrix.
X.shape

(3279, 1558)

In [158]:
# Load the class label (column 1558 of ad.data) and encode it as integers:
# 'ad.' -> 1, 'nonad.' -> 0.
# Mapping through a dict yields a true integer array instead of an object array
# that mixes ints with any leftover strings, and an unexpected label now fails
# loudly instead of slipping through unencoded.
LABEL_CODES = {'ad.': 1, 'nonad.': 0}
labels = pd.read_csv('data/ad.data', header=None, na_values='unknown', usecols=[1558])[1558]
unexpected_labels = set(labels.unique()) - set(LABEL_CODES)
if unexpected_labels:
    raise ValueError('Unexpected class labels in ad.data: %r'
                     % sorted(unexpected_labels, key=str))
y = np.asarray(labels.map(LABEL_CODES), dtype=int)
set(y)

{0, 1}

In [159]:
# Report the shapes of the feature matrix and the label vector.
# A parenthesised single-argument print emits the same text on Python 2 and
# also parses on Python 3, matching the print(...) calls used later on.
print('X Shape: %s, y Shape: %s' % (str(X.shape), str(y.shape)))

X Shape: (3279, 1558), y Shape: (3279L,)


### Split Data into Training and Test Sets

In [95]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X,
    y,
    random_state=5,
    test_size=0.2,
)

In [96]:
# Preview the first training rows (they keep their original row labels).
X_train.head()

Unnamed: 0,height,width,aratio,local,url*images+buttons,url*likesbooks.com,url*www.slake.com,url*hydrogeologist,url*oso,url*media,...,caption*of,caption*home,caption*my,caption*your,caption*in,caption*bytes,caption*here,caption*click,caption*for,caption*you
1986,90.0,65.0,0.7222,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
496,,,,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2765,36.0,114.0,3.1666,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2495,24.0,120.0,5.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
342,,,,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
# Report the shapes of the train and test partitions.
# Parenthesised single-argument prints behave identically on Python 2 and
# also parse on Python 3.
print('X_train Shape: %s, y_train Shape: %s' % (str(X_train.shape), str(y_train.shape)))
print('X_test Shape: %s, y_test Shape: %s' % (str(X_test.shape), str(y_test.shape)))

X_train Shape: (2623, 1558), y_train Shape: (2623L,)
X_test Shape: (656, 1558), y_test Shape: (656L,)


### Impute Data

In [100]:
imp = Imputer(missing_values='NaN', strategy='mean')
%time X_train_imp = pd.DataFrame(imp.fit_transform(X_train), columns = X_train.columns.values)
%time X_test_imp = pd.DataFrame(imp.transform(X_test), columns = X_test.columns.values)

Wall time: 174 ms
Wall time: 13 ms


In [103]:
# Preview the imputed training data (rebuilt frame, so rows are renumbered from 0).
X_train_imp.head()

Unnamed: 0,height,width,aratio,local,url*images+buttons,url*likesbooks.com,url*www.slake.com,url*hydrogeologist,url*oso,url*media,...,caption*of,caption*home,caption*my,caption*your,caption*in,caption*bytes,caption*here,caption*click,caption*for,caption*you
0,90.0,65.0,0.7222,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,64.350656,154.504193,3.872688,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,36.0,114.0,3.1666,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,24.0,120.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,64.350656,154.504193,3.872688,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
# Confirm that imputation left the train/test shapes unchanged.
# Parenthesised single-argument prints behave identically on Python 2 and
# also parse on Python 3.
print('X_train Shape: %s, y_train Shape: %s' % (str(X_train_imp.shape), str(y_train.shape)))
print('X_test Shape: %s, y_test Shape: %s' % (str(X_test_imp.shape), str(y_test.shape)))

X_train Shape: (2623, 1558), y_train Shape: (2623L,)
X_test Shape: (656, 1558), y_test Shape: (656L,)


### Standardize Data

In [106]:
# Peek at the three continuous columns before scaling. Only the first rows are
# shown; rendering the full 2623-row frame adds nothing to the narrative.
X_train_imp[['height', 'width', 'aratio']].head(10)

Unnamed: 0,height,width,aratio
0,90.000000,65.000000,0.722200
1,64.350656,154.504193,3.872688
2,36.000000,114.000000,3.166600
3,24.000000,120.000000,5.000000
4,64.350656,154.504193,3.872688
5,22.000000,204.000000,9.272700
6,64.350656,154.504193,3.872688
7,35.000000,135.000000,3.857100
8,133.000000,200.000000,1.503700
9,64.350656,154.504193,3.872688


In [136]:
stdsc = StandardScaler()
%time X_train_std_tmp = pd.DataFrame(stdsc.fit_transform(X_train_imp.ix[:,0:3]), columns = ['height','width','aratio'])
%time X_test_std_tmp = pd.DataFrame(stdsc.transform(X_test_imp.ix[:,0:3]), columns = ['height','width','aratio'])

Wall time: 1 ms
Wall time: 1e+03 µs


In [137]:
# Preview the standardized continuous columns.
X_train_std_tmp.head()

Unnamed: 0,height,width,aratio
0,0.539681,-0.808092,-0.612159
1,0.0,0.0,0.0
2,-0.596519,-0.365693,-0.137197
3,-0.849008,-0.311522,0.219044
4,0.0,0.0,0.0


In [130]:
# Preview the columns that are left unscaled (position 3 onward).
# `.iloc` replaces the deprecated `.ix` indexer (removed in pandas 1.0).
X_train_imp.iloc[:, 3:].head()

Unnamed: 0,local,url*images+buttons,url*likesbooks.com,url*www.slake.com,url*hydrogeologist,url*oso,url*media,url*peace+images,url*blipverts,url*tkaine+kats,...,caption*of,caption*home,caption*my,caption*your,caption*in,caption*bytes,caption*here,caption*click,caption*for,caption*you
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [138]:
# Reassemble the feature matrices: scaled continuous columns followed by the
# untouched remaining columns (position 3 onward). Both pieces carry the same
# default 0..n-1 row index, so the column-wise concat lines rows up one-to-one.
# `.iloc` replaces the deprecated `.ix` indexer (removed in pandas 1.0).
X_train_std = pd.concat([X_train_std_tmp, X_train_imp.iloc[:, 3:]], axis=1)
X_test_std = pd.concat([X_test_std_tmp, X_test_imp.iloc[:, 3:]], axis=1)
X_train_std.shape, X_test_std.shape

((2623, 1558), (656, 1558))

In [139]:
# Preview the recombined training matrix (scaled + unscaled columns).
X_train_std.head()

Unnamed: 0,height,width,aratio,local,url*images+buttons,url*likesbooks.com,url*www.slake.com,url*hydrogeologist,url*oso,url*media,...,caption*of,caption*home,caption*my,caption*your,caption*in,caption*bytes,caption*here,caption*click,caption*for,caption*you
0,0.539681,-0.808092,-0.612159,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.596519,-0.365693,-0.137197,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.849008,-0.311522,0.219044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Select Features Using Principal Component Analysis

In [205]:
# Project the features onto principal components. A float n_components in
# (0, 1) asks scikit-learn to keep as many components as are needed to explain
# that fraction of the variance (0.9 here). The projection is fitted on the
# training split only and then applied unchanged to the test split.
pca = PCA(n_components=0.9)
%time X_train_pca = pca.fit_transform(X_train_std)
%time X_test_pca = pca.transform(X_test_std)

Wall time: 3.47 s
Wall time: 21 ms


In [206]:
# Fraction of total variance captured by each retained component.
pca.explained_variance_ratio_

array([ 0.11169111,  0.0927008 ,  0.05902631,  0.03728182,  0.02471957,
        0.02431037,  0.02124087,  0.01768988,  0.01525561,  0.01342111,
        0.0118184 ,  0.01143971,  0.01128154,  0.01022271,  0.01000152,
        0.00954137,  0.00941227,  0.00904128,  0.00887915,  0.00847654,
        0.00843625,  0.00798719,  0.00768672,  0.00748282,  0.00718921,
        0.00697742,  0.00686913,  0.00672539,  0.00653582,  0.00639065,
        0.00615715,  0.00610442,  0.00597399,  0.0058998 ,  0.00580447,
        0.00564036,  0.00559523,  0.00536031,  0.0051561 ,  0.00511048,
        0.00489415,  0.00463656,  0.00461056,  0.00440129,  0.00429042,
        0.00424552,  0.0041283 ,  0.00394329,  0.00373857,  0.00370432,
        0.00364843,  0.00353301,  0.00350704,  0.00338787,  0.00332829,
        0.00329412,  0.00325485,  0.00321134,  0.00317517,  0.00311247,
        0.00310291,  0.00306282,  0.00301341,  0.00299648,  0.00294408,
        0.00289051,  0.00285884,  0.00281343,  0.00278288,  0.00

In [207]:
# Report the shapes of the PCA-reduced train and test matrices.
# Parenthesised single-argument prints behave identically on Python 2 and
# also parse on Python 3.
print('X_train_pca Shape: %s, y_train Shape: %s' % (str(X_train_pca.shape), str(y_train.shape)))
print('X_test_pca Shape: %s, y_test Shape: %s' % (str(X_test_pca.shape), str(y_test.shape)))

X_train_pca Shape: (2623L, 156L), y_train Shape: (2623L,)
X_test_pca Shape: (656L, 156L), y_test Shape: (656L,)


### Logistic Regression

In [208]:
lr = LogisticRegression()
%time lr.fit(X_train_pca, list(y_train))

Wall time: 177 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [209]:
# Hard class predictions for the test set.
y_predicted_lr = lr.predict(X_test_pca)
# predict_proba returns one column per class; column index 1 is the second
# class (label 1, "ad"). Slicing the array replaces the former zip(*...)[1]
# transpose trick, which only works on Python 2 where zip returns a list.
y_predicted_proba_lr = lr.predict_proba(X_test_pca)[:, 1]

In [210]:
# Sanity check: labels, predictions and probabilities all cover the same test rows.
y_test.shape, y_predicted_lr.shape, y_predicted_proba_lr.shape, X_test_pca.shape

((656L,), (656L,), (656L,), (656L, 156L))

In [211]:
# Sanity check: distinct label values in the ground truth and in the predictions.
set(y_test), set(y_predicted_lr)

({0, 1}, {0, 1})

In [212]:
# Evaluate logistic regression on the held-out test set.
# The prints are single-argument calls, so they emit the same text on Python 2
# as the former print statements and also parse on Python 3.
y_true_lr = list(y_test)
print('accuracy %s' % (accuracy_score(y_true_lr, y_predicted_lr),))
print('confusion matrix\n%s' % (confusion_matrix(y_true_lr, y_predicted_lr),))
print('(row=expected, col=predicted)')
print(classification_report(y_true_lr, y_predicted_lr))

accuracy 0.966463414634
confusion matrix
[[561   5]
 [ 17  73]]
(row=expected, col=predicted)
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       566
          1       0.94      0.81      0.87        90

avg / total       0.97      0.97      0.97       656



### Random Forests

In [213]:
# Sanity check: shapes of the training inputs handed to the grid search below.
X_train_pca.shape, y_train.shape

((2623L, 156L), (2623L,))

In [214]:
n_estimators_list = [5,10,50,100,200,300,400,1000]

rfc = RandomForestClassifier(random_state=47)
rfc_grid = GridSearchCV(estimator=rfc, param_grid=dict(n_estimators=n_estimators_list))
%time rfc_grid.fit(X_train_pca, list(y_train))

print(rfc_grid)
# summarize the results of the grid search
print(rfc_grid.best_score_)
print(rfc_grid.best_estimator_.n_estimators)

Wall time: 2min 51s
GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=47, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 50, 100, 200, 300, 400, 1000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
0.973694243233
200


In [215]:
# Evaluate the tuned random forest on the held-out test set.
y_predicted_rfc = rfc_grid.predict(X_test_pca)
# Column index 1 of predict_proba is the second class (label 1, "ad"). Slicing
# replaces the former zip(*...) transpose, which only works on Python 2.
y_predicted_proba_rfc = rfc_grid.predict_proba(X_test_pca)[:, 1]

# Single-argument prints keep the Python 2 output text and also parse on
# Python 3. The section header now names the model actually evaluated (it
# previously said "Decision Tree"), and "Classifcation" is spelled correctly.
y_true_rfc = list(y_test)
print('Accuracy:  %s' % (accuracy_score(y_true_rfc, y_predicted_rfc),))
print('\n Random Forest Results\nConfusion Matrix: ')
print(confusion_matrix(y_true_rfc, y_predicted_rfc))
print('\n Classification Report')
print(classification_report(y_true_rfc, y_predicted_rfc))

Accuracy:  0.97256097561

 Decision Tree Results
Confusion Matrix: 
[[563   3]
 [ 15  75]]

 Classifcation Report
             precision    recall  f1-score   support

          0       0.97      0.99      0.98       566
          1       0.96      0.83      0.89        90

avg / total       0.97      0.97      0.97       656



In [216]:
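# Random-forest test accuracy per PCA n_components setting (setting - accuracy).
# The 0.9 row matches the run shown above; the other rows appear to be earlier runs.
#0.8 - 0.975609756098
#0.9 - 0.97256097561
#0.95 - 0.971036585366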

In [217]:
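# NOTE: `rfc` is only the unfitted template handed to GridSearchCV; the fitted
# model lives in rfc_grid (rfc_grid.best_estimator_).
#joblib.dump(rfc, 'model\\rfc\\rfc.pkl')
#joblib.dump(rfc_grid, 'model\\rfc\\rfc_grid.pkl')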

In [218]:
#rfc = joblib.load('model\\rfc\\rfc.pkl')
#rfc_grid = joblib.load('model\\rfc\\rfc_grid.pkl')

### Gradient Boosting Classifier

In [219]:
gbc = GradientBoostingClassifier(random_state=47)

n_estimators_list = [5,10,50,100,200,300,400,1000]
gbc_grid = GridSearchCV(estimator=gbc, param_grid=dict(n_estimators=n_estimators_list))
%time gbc_grid.fit(X_train_pca, list(y_train))

print(gbc_grid)
# summarize the results of the grid search
print(gbc_grid.best_score_)
print(gbc_grid.best_estimator_.n_estimators)

Wall time: 3min 58s
GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=47, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 50, 100, 200, 300, 400, 1000]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
0.973313000381
1000


In [220]:
# Evaluate the tuned gradient-boosting model on the held-out test set.
y_predicted_gbc = gbc_grid.predict(X_test_pca)
# Column index 1 of predict_proba is the second class (label 1, "ad"). Slicing
# replaces the former zip(*...) transpose, which only works on Python 2.
y_predicted_proba_gbc = gbc_grid.predict_proba(X_test_pca)[:, 1]

# Single-argument prints keep the Python 2 output text and also parse on
# Python 3. The section header now names the model actually evaluated (it
# previously said "Decision Tree"), and "Classifcation" is spelled correctly.
y_true_gbc = list(y_test)
print('Accuracy:  %s' % (accuracy_score(y_true_gbc, y_predicted_gbc),))
print('\n Gradient Boosting Results\nConfusion Matrix: ')
print(confusion_matrix(y_true_gbc, y_predicted_gbc))
print('\n Classification Report')
print(classification_report(y_true_gbc, y_predicted_gbc))

Accuracy:  0.981707317073

 Decision Tree Results
Confusion Matrix: 
[[562   4]
 [  8  82]]

 Classifcation Report
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       566
          1       0.95      0.91      0.93        90

avg / total       0.98      0.98      0.98       656



In [186]:
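# Gradient-boosting test accuracy per PCA n_components setting (setting - accuracy).
# The 0.9 row matches the run shown above; the 0.8 row appears to be an earlier run.
#0.8 - 0.978658536585
#0.9 - 0.981707317073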

In [187]:
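# NOTE: `gbc` is only the unfitted template handed to GridSearchCV; the fitted
# model lives in gbc_grid (gbc_grid.best_estimator_).
#joblib.dump(gbc, 'model\\gbc\\gbc.pkl')
#joblib.dump(gbc_grid, 'model\\gbc\\gbc_grid.pkl')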

In [25]:
#gbc = joblib.load('model\\gbc\\gbc.pkl')
#gbc_grid = joblib.load('model\\gbc\\gbc_grid.pkl')