In [1]:
import math
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import collections as col
import re
import random
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB

In [2]:
"""Web economics assignment 1
   pCTR with Naive Bayes in sklearn
"""

def load_data(train='Yes', test='No', validation='No'):
    """Read the requested dataset splits from the ``dataset/`` directory.

    Each flag is compared against the exact string 'Yes'. Any other value
    leaves the corresponding split as an empty list instead of a DataFrame.

    Prints the length of every split, then returns the tuple
    ``(df_train, df_test, df_validation)``.
    """
    # (flag, csv path) pairs, in the same order the splits are returned.
    requested = (
        (train, 'dataset/train.csv'),
        (test, 'dataset/test.csv'),
        (validation, 'dataset/validation.csv'),
    )
    df_train, df_test, df_validation = [
        pd.read_csv(csv_path, sep=',') if flag == 'Yes' else []
        for flag, csv_path in requested
    ]
    print('Data loaded', len(df_train), len(df_test), len(df_validation))
    return df_train, df_test, df_validation


def le_non_integers(df_data, column_name='adexchange', le_old=None):
    """Label-encode one column of ``df_data``.

    Parameters
    ----------
    df_data : mapping of column name -> column values (a DataFrame here).
    column_name : name of the column to encode.
    le_old : an already fitted encoder to reuse, or None to fit a fresh
        ``LabelEncoder`` on the column's unique values.

    Returns
    -------
    (le, column_le) : the encoder that was used (needed to encode further
        data consistently or to decode) and the encoded column as a numpy
        array.

    NOTE: per scikit-learn's documentation, ``LabelEncoder.transform`` raises
    ``ValueError`` for labels it was not fitted on, so a reused encoder must
    already have seen every value present in this column.
    """
    # Identity check: `== None` would dispatch to the encoder's own __eq__,
    # which could wrongly discard a caller-supplied encoder.
    if le_old is None:
        le = LabelEncoder()
        le.fit(df_data[column_name].unique())
    else:
        le = le_old
    column_le = le.transform(df_data[column_name])
    return le, np.asarray(column_le)


def build_NB_model(df_train):
    """Assemble the feature matrix from ``df_train`` and fit a GaussianNB.

    Features are seven columns used as-is ('weekday', 'hour', 'region',
    'city', 'slotwidth', 'slotheight', 'advertiser') followed by four
    label-encoded columns ('adexchange', 'slotformat', 'slotvisibility',
    'useragent'). The target is the 'click' column.

    Returns
    -------
    (NB_model, adexchange_le, slotformat_le, slotvisibility_le, useragent_le)
        The fitted model and the four fitted label encoders, which must be
        reused to encode any data passed to the model later.
    """
    # Target as a flat 1-D vector; a column vector makes scikit-learn emit a
    # DataConversionWarning before flattening it itself.
    array_y = df_train['click'].values

    # Features used as-is. `.values` replaces DataFrame.as_matrix(), which
    # has been removed from pandas.
    array_x_i = df_train[['weekday', 'hour', 'region', 'city', 'slotwidth', 'slotheight', 'advertiser']].values

    # Non-integer features, label-encoded with freshly fitted encoders.
    adexchange_le, col_adexchange_le = le_non_integers(df_train, 'adexchange')
    slotformat_le, col_slotformat_le = le_non_integers(df_train, 'slotformat')
    slotvisibility_le, col_slotvisibility_le = le_non_integers(df_train, 'slotvisibility')
    useragent_le, col_useragent_le = le_non_integers(df_train, 'useragent')

    array_x_ni = np.column_stack((array_x_i, col_adexchange_le, col_slotformat_le, col_slotvisibility_le, col_useragent_le))

    # Model
    NB_model = GaussianNB()
    NB_model.fit(array_x_ni, array_y)

    return NB_model, adexchange_le, slotformat_le, slotvisibility_le, useragent_le

def pred_NB_model(NB_model, df_test, adexchange_le, slotformat_le, slotvisibility_le, useragent_le):
    """Score every row of ``df_test`` with a fitted model.

    The feature matrix is built with the same column order used for
    training. The four ``*_le`` arguments are the encoders fitted on the
    training data; passing them to ``le_non_integers`` reuses them instead
    of fitting new ones.

    Returns
    -------
    (lst_predict_log_proba, lst_predict)
        Two lists with one entry per row of ``df_test``:
        each ``lst_predict_log_proba`` entry is an array of shape
        (1, n_classes) with the log-probabilities, and each ``lst_predict``
        entry is an array of shape (1,) with the predicted class.
    """
    # Features used as-is. `.values` replaces DataFrame.as_matrix(), which
    # has been removed from pandas.
    array_x_i = df_test[['weekday', 'hour', 'region', 'city', 'slotwidth', 'slotheight', 'advertiser']].values

    # Non-integer features, encoded with the supplied (already fitted) encoders.
    adexchange_le, t_col_adexchange_le = le_non_integers(df_test, 'adexchange', adexchange_le)
    slotformat_le, t_col_slotformat_le = le_non_integers(df_test, 'slotformat', slotformat_le)
    slotvisibility_le, t_col_slotvisibility_le = le_non_integers(df_test, 'slotvisibility', slotvisibility_le)
    useragent_le, t_col_useragent_le = le_non_integers(df_test, 'useragent', useragent_le)

    array_x_ni = np.column_stack((array_x_i, t_col_adexchange_le, t_col_slotformat_le, t_col_slotvisibility_le, t_col_useragent_le))

    # Nothing to score: keep returning two empty lists, as the row loop did.
    if array_x_ni.shape[0] == 0:
        return [], []

    # Score the whole matrix in one call each. The estimator expects 2-D
    # (n_samples, n_features) input, and batching avoids one Python-level
    # call per row.
    log_proba = NB_model.predict_log_proba(array_x_ni)
    predictions = NB_model.predict(array_x_ni)

    # Re-split per row so callers still get lists of (1, n_classes) and (1,)
    # arrays, i.e. np.asarray(...) of them keeps its previous shape.
    lst_predict_log_proba = list(log_proba[:, np.newaxis, :])
    lst_predict = list(predictions[:, np.newaxis])

    return lst_predict_log_proba, lst_predict

In [3]:
# Load the train and test splits; validation keeps its default and is not read.
df_train, df_test, df_validation = load_data(train='Yes', test='Yes')

Data loaded 2697738 299749 0


In [26]:
# Fit the model (and its label encoders) on the first 2000 training rows only.
(NB_model,
 adexchange_le,
 slotformat_le,
 slotvisibility_le,
 useragent_le) = build_NB_model(df_train[:2000])

  y = column_or_1d(y, warn=True)


In [28]:
# Inspect the per-class sample counts of the model fitted on the 2000-row subset.
NB_model.class_count_

array([  1.99900000e+03,   1.00000000e+00])

In [44]:
# Keep only the clicked rows among the first 2000 training rows.
first_rows = df_train[:2000]
data_point = first_rows[first_rows['click'] == 1]

In [50]:
# Show the as-is (integer) feature columns of the selected row(s).
int_feature_columns = ['weekday', 'hour', 'region', 'city', 'slotwidth', 'slotheight', 'advertiser']
np.asarray(data_point[int_feature_columns])

array([[   6,   16,  333,  334,  728,   90, 3358]])

In [53]:
# Encode the categorical columns of data_point with the already fitted
# encoders (passing an encoder means no new one is fitted).
adexchange_le, t_col_adexchange_le = le_non_integers(data_point, 'adexchange', adexchange_le)
slotformat_le, t_col_slotformat_le = le_non_integers(data_point, 'slotformat', slotformat_le)
slotvisibility_le, t_col_slotvisibility_le = le_non_integers(data_point, 'slotvisibility', slotvisibility_le)
useragent_le, t_col_useragent_le = le_non_integers(data_point, 'useragent', useragent_le)

# As-is feature columns first, then the four encoded columns.
int_features = np.asarray(
    data_point[['weekday', 'hour', 'region', 'city', 'slotwidth', 'slotheight', 'advertiser']]
)
array_x_ni = np.column_stack((
    int_features,
    t_col_adexchange_le,
    t_col_slotformat_le,
    t_col_slotvisibility_le,
    t_col_useragent_le,
))

In [54]:
# Display the assembled feature row(s) for the selected data point.
array_x_ni

array([[   6,   16,  333,  334,  728,   90, 3358,    1,    0,    0,    9]])

In [64]:
# predict_proba expects 2-D input of shape (n_samples, n_features), so the
# single feature vector is wrapped in an outer list to form one sample.
NB_model.predict_proba([[6, 16, 333, 334, 728, 90, 3358, 1, 0, 0, 9]])



array([[  7.51805703e-31,   1.00000000e+00]])

In [6]:
# Exploratory: list every attribute and method name available on the model object.
dir(NB_model)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_estimator_type',
 '_get_param_names',
 '_joint_log_likelihood',
 '_partial_fit',
 '_update_mean_variance',
 'class_count_',
 'class_prior_',
 'classes_',
 'fit',
 'get_params',
 'partial_fit',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'score',
 'set_params',
 'sigma_',
 'theta_']

In [None]:
# Train on the full training set, then score every row of the test set.
(NB_model,
 adexchange_le,
 slotformat_le,
 slotvisibility_le,
 useragent_le) = build_NB_model(df_train)
lst_predict_log_proba, lst_predict = pred_NB_model(
    NB_model, df_test, adexchange_le, slotformat_le, slotvisibility_le, useragent_le
)

# Persist both result lists as arrays (np.save appends the .npy extension).
for out_name, out_values in (('lst_predict', lst_predict),
                             ('lst_predict_log_proba', lst_predict_log_proba)):
    np.save(out_name, np.asarray(out_values))

print('Script end')

In [67]:
# Reload the log-probabilities written by the pipeline cell.
# NOTE(review): this cell used to read 'predict_log_proba.npy' into pred_lst,
# but no cell visible in this notebook writes a file with that name (only
# 'lst_predict.npy' and 'lst_predict_log_proba.npy' are saved). pred_lst is
# consumed downstream as 2-column log-probabilities, so it now refers to the
# array this notebook actually produces. Confirm that no separate file from
# an earlier run was intended.
pred_log_proba = np.load('lst_predict_log_proba.npy')
pred_lst = pred_log_proba

print('Script end')

Script end


In [78]:
# How many leading rows to scan.
N_ROWS_TO_SCAN = 10000

pred_lst_r = np.reshape(pred_lst, [-1, 2])

# Smallest negative value in column 1 among the first N_ROWS_TO_SCAN rows,
# or 0 when none is negative. Slicing (instead of indexing a fixed range)
# keeps this working when fewer rows are available.
# NOTE: despite its name, max_val holds a minimum; the name is kept so any
# other cell that reads it keeps working.
scanned = pred_lst_r[:N_ROWS_TO_SCAN, 1]
negatives = scanned[scanned < 0]
max_val = negatives.min() if negatives.size else 0
print(max_val)

-9.48707869195


In [71]:
# Display the loaded array (numpy abbreviates the middle of large arrays).
pred_log_proba

array([[[ -2.78080751e-04,  -8.18773805e+00]],

       [[ -3.78923482e-04,  -7.87836572e+00]],

       [[ -3.19964895e-04,  -8.04745925e+00]],

       ..., 
       [[ -2.28439448e-04,  -8.38435360e+00]],

       [[ -4.31542543e-04,  -7.74836022e+00]],

       [[ -3.00092381e-04,  -8.11157024e+00]]])