In [1]:
import pandas as pd

# Read at most n_rows rows of the ad click log into a DataFrame
n_rows = 300_000
df = pd.read_csv('ad-click.csv', nrows=n_rows)

In [2]:
# Show the first five rows as plain text to check the columns that were loaded
first_rows = df.head(5)
print(first_rows)

             id  click      hour    C1  banner_pos   site_id site_domain  \
0  1.000009e+18      0  14102100  1005           0  1fbe01fe    f3845767   
1  1.000017e+19      0  14102100  1005           0  1fbe01fe    f3845767   
2  1.000037e+19      0  14102100  1005           0  1fbe01fe    f3845767   
3  1.000064e+19      0  14102100  1005           0  1fbe01fe    f3845767   
4  1.000068e+19      0  14102100  1005           1  fe8cc448    9166c161   

  site_category    app_id app_domain  ... device_type device_conn_type    C14  \
0      28905ebd  ecad2386   7801e8d9  ...           1                2  15706   
1      28905ebd  ecad2386   7801e8d9  ...           1                0  15704   
2      28905ebd  ecad2386   7801e8d9  ...           1                0  15704   
3      28905ebd  ecad2386   7801e8d9  ...           1                0  15706   
4      0569f928  ecad2386   7801e8d9  ...           1                0  18993   

   C15  C16   C17  C18  C19     C20  C21  
0  320   50  

In [3]:
# Target is the click column; features are every column not listed below
excluded_columns = ['click', 'id', 'hour', 'device_id', 'device_ip']
X = df.drop(columns=excluded_columns).to_numpy()
Y = df['click'].to_numpy()
print(X.shape, Y.shape)

(300000, 19) (300000,)


In [4]:
# Split data into training and testing sets.
# Samples are in chronological order (ordered by hour), so the split is
# positional rather than random: we can't use future samples to predict older ones.

# Size the split from the rows actually loaded. The `nrows` argument given to
# read_csv is only an upper bound, so a shorter file would otherwise leave the
# test set smaller than intended (or empty).
n_samples = len(X)
n_train = int(n_samples * 0.9)
X_train, Y_train = X[:n_train], Y[:n_train]
X_test, Y_test = X[n_train:], Y[n_train:]

In [5]:
# One-hot encode the categorical features, e.g.
#   news, education, sports -> is_news, is_education, is_sports
# handle_unknown="ignore" keeps transform() from raising on a category that
# was not present when the encoder was fitted: for instance a sample with the
# value "movie" gets 0 in all three of is_news, is_education and is_sports.
# Without "ignore", such a sample would raise an error.

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown="ignore")

# Learn the categories from the training rows only, then encode those rows
enc.fit(X_train)
X_train_enc = enc.transform(X_train)
X_train_enc[0]

<1x8204 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [6]:
# List the stored (row, column) entries of the first encoded training sample
first_encoded_row = X_train_enc[0]
print(first_encoded_row)

  (0, 2)	1.0
  (0, 6)	1.0
  (0, 188)	1.0
  (0, 2608)	1.0
  (0, 2679)	1.0
  (0, 3771)	1.0
  (0, 3885)	1.0
  (0, 3929)	1.0
  (0, 4879)	1.0
  (0, 7315)	1.0
  (0, 7319)	1.0
  (0, 7475)	1.0
  (0, 7824)	1.0
  (0, 7828)	1.0
  (0, 7869)	1.0
  (0, 7977)	1.0
  (0, 7982)	1.0
  (0, 8021)	1.0
  (0, 8189)	1.0


In [7]:
# Encode the test rows with the encoder fitted on the training rows
X_test_enc = enc.transform(X_test)

# Both matrices must have the same number of columns
for encoded in (X_test_enc, X_train_enc):
    print(encoded.shape)

(30000, 8204)
(270000, 8204)


In [8]:
from sklearn.tree import DecisionTreeClassifier

# Candidate tree depths for the grid search; None places no limit on depth
parameters = dict(max_depth=[3, 10, None])

decision_tree = DecisionTreeClassifier(min_samples_split=30, criterion='gini')

In [9]:
from sklearn.model_selection import GridSearchCV

# 3-fold cross-validated search over `parameters`, scored by ROC AUC
grid_search = GridSearchCV(
    estimator=decision_tree,
    param_grid=parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)

In [10]:
# Run the search on the encoded training data, then report the winning setting
grid_search.fit(X_train_enc, Y_train)
best_params = grid_search.best_params_
print(best_params)

{'max_depth': 10}


In [11]:
from sklearn.metrics import roc_auc_score

# Take the best tree found by the grid search
decision_tree_best = grid_search.best_estimator_
# Column 1 of predict_proba holds the predicted probability of the positive class
pos_prob = decision_tree_best.predict_proba(X_test_enc)[:, 1]

test_auc = roc_auc_score(Y_test, pos_prob)
print(f"The ROC AUC on testing set is: {test_auc:.3f}")

The ROC AUC on testing set is: 0.719


In [14]:
# Baseline for comparison: mark a random subset of test samples as clicks.
# The subset size follows the click rate observed in the training labels
# (this replaces a hard-coded 51211/300000, i.e. roughly 17%, which was only
# valid for one particular n_rows). Scores that carry no information should
# give a ROC AUC close to 0.5, well below the tree model.
import numpy as np

rng = np.random.default_rng(42)  # fixed seed so the baseline score is reproducible
click_rate = Y_train.mean()      # fraction of positive labels in the training set
pos_prob = np.zeros(len(Y_test))
click_index = rng.choice(len(Y_test), int(len(Y_test) * click_rate), replace=False)

pos_prob[click_index] = 1

print(f'The ROC AUC on testing set is {roc_auc_score(Y_test, pos_prob):.3f}')

The ROC AUC on testing set is 0.501


In [15]:
# A decision tree is built by a sequence of greedy searches for the best splitting point at each step, based on the training set
# This tends to cause overfitting
# Ensembling is a technique to correct this
# Random forest is an ensemble tree model that usually outperforms a simple decision tree

In [16]:
# Ensemble technique of bagging (bootstrap aggregating) can overcome overfitting
# Different sets of training samples are randomly drawn with replacement from the original training data
# Each set is used to fit an individual classification model
# Results are combined together through majority vote to make final decision

In [None]:
# Random forest is a variant of the tree bagging model with additional feature-based bagging

from sklearn.ensemble import RandomForestClassifier

# Fix the seed: bootstrap sampling and per-split feature sampling are random,
# so without it the selected max_depth and the reported AUC can change between runs.
random_forest = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    min_samples_split=30,
    n_jobs=-1,
    random_state=42,
)

# Reuses the `parameters` grid (max_depth candidates) defined for the decision tree
grid_search = GridSearchCV(random_forest, parameters, n_jobs=-1, cv=3, scoring='roc_auc')
grid_search.fit(X_train_enc, Y_train)
print(grid_search.best_params_)

In [None]:
# Take the best forest found by the grid search and score it on the held-out set
random_forest_best = grid_search.best_estimator_
# Column 1 of predict_proba holds the predicted probability of the positive class
pos_prob = random_forest_best.predict_proba(X_test_enc)[:, 1]
forest_auc = roc_auc_score(Y_test, pos_prob)
print(f'The ROC AUC on testing set is {forest_auc:.3f}')

In [None]:
# Critical hyperparameters:
# 1. max_depth: the maximum depth of an individual tree. It tends to overfit if too deep or underfit if too shallow
# 2. min_samples_split: the minimum number of samples required to split a node further.
# Too small causes overfitting, too large causes underfitting. 10, 30, 50 may be good values to start with

# Hyperparameters for the forest / collection of trees
# 1. max_features: the number of features to consider when searching for the best splitting point.
# Typically, for an m-dimensional dataset, sqrt(m) is the recommended value for max_features.
# This can be specified as max_features='sqrt'
# 2. n_estimators: the number of trees considered for majority voting.
# More trees generally give better performance but also take longer to compute. Typical values are 100, 200, 300 and so on

In [None]:
# GRADIENT BOOSTED TREES
# boosting, another ensemble technique
# in boosted trees, individual trees are no longer trained separately
# trees are trained in succession, where a tree aims to correct the errors made by the previous tree


In [None]:
pip install xgboost

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels, then map them to integer class ids
le = LabelEncoder().fit(Y_train)
Y_train_enc = le.transform(Y_train)

In [None]:
import xgboost as xgb

# Boosted-tree settings: learning rate, depth of each tree, number of trees
xgb_params = {
    'learning_rate': 0.1,
    'max_depth': 10,
    'n_estimators': 1000,
}
model = xgb.XGBClassifier(**xgb_params)

model.fit(X_train_enc, Y_train_enc)

In [None]:
# Score the boosted model on the held-out set.
# Column 1 of predict_proba holds the predicted probability of the positive class.
pos_prob = model.predict_proba(X_test_enc)[:, 1]
# Evaluate against Y_test and the pos_prob computed above, as the earlier
# evaluation cells do; roc_auc_score accepts the raw binary labels directly.
print(f'The ROC AUC on testing set is: {roc_auc_score(Y_test, pos_prob):.3f}')