In [1]:
import math
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import collections as col
import re
import random
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
#from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
#from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import auc,roc_curve

In [2]:
def load_data(train='Yes', test='No', validation='No', data_dir='dataset'):
    """Load the requested dataset splits from CSV files.

    A split is read only when its flag is exactly the string 'Yes'; any
    other value skips it.

    Parameters
    ----------
    train, test, validation : str
        'Yes' to read ``train.csv`` / ``test.csv`` / ``validation.csv``.
    data_dir : str, optional
        Directory that holds the CSV files (default ``'dataset'``).

    Returns
    -------
    tuple
        ``(df_train, df_test, df_validation)``. A split that was not
        requested is returned as an empty list, so ``len()`` still works.
    """
    def _read_if_requested(flag, filename):
        # One code path for all three splits instead of three copied branches.
        if flag == 'Yes':
            return pd.read_csv(os.path.join(data_dir, filename), sep=',')
        return []

    df_train = _read_if_requested(train, 'train.csv')
    df_test = _read_if_requested(test, 'test.csv')
    df_validation = _read_if_requested(validation, 'validation.csv')

    print('Data loaded', len(df_train), len(df_test), len(df_validation))
    return df_train, df_test, df_validation

In [3]:
# Read the training and validation splits; the test split is skipped here.
df_train, df_test, df_validation = load_data(train='Yes', test='No', validation='Yes')

Data loaded 2697738 0 299749


## pCTR, Logistic regression and Naive Bayes

In [4]:
# Feature engineering
def label_encoder(df_column_nint, column_le= None):
    """Map the values of a column to integer codes.

    Parameters
    ----------
    df_column_nint : pandas.Series
        Column to encode. Its ``.unique()`` values are used to fit a new
        encoder when none is supplied.
    column_le : fitted encoder, optional
        Encoder to reuse (e.g. the one returned for the training column).
        When ``None`` a new ``LabelEncoder`` is fitted on this column.

    Returns
    -------
    tuple
        ``(codes, encoder)`` where ``codes`` is a one-column DataFrame of
        the transformed values and ``encoder`` is the encoder that was used.
    """
    # Identity check: ``== None`` would call a custom __eq__ on the encoder.
    if column_le is None:
        column_le = LabelEncoder()
        column_le.fit(df_column_nint.unique())

    df_column_int = column_le.transform(df_column_nint)
    return pd.DataFrame(df_column_int), column_le

def onehot_encoder(df_column_nohe, OHE= None):
    """One-hot encode a single column of values.

    Parameters
    ----------
    df_column_nohe : array-like
        Column to encode (numpy array or pandas Series).
    OHE : fitted encoder, optional
        Encoder to reuse. When ``None`` a new ``OneHotEncoder`` is fitted on
        the distinct values of this column.

    Returns
    -------
    tuple
        ``(encoded, encoder)`` where ``encoded`` is a dense DataFrame of the
        transformed column and ``encoder`` is the encoder that was used.
    """
    # np.asarray makes this work for Series too; Series.reshape was removed
    # from pandas, so calling .reshape on the input directly fails for Series.
    ar = np.asarray(df_column_nohe).reshape((-1, 1))

    if OHE is None:
        OHE = OneHotEncoder()
        # Fit on the distinct values only, as a single-feature column.
        OHE.fit(np.unique(ar).reshape((-1, 1)))

    return pd.DataFrame(OHE.transform(ar).toarray()), OHE

In [5]:
# Build model
def AdaBoost_Classifier(array_x, array_y):
    """Fit an ``AdaBoostClassifier`` with default settings and return it.

    ``array_x`` holds the features and ``array_y`` the labels passed to
    ``fit``.
    """
    booster = AdaBoostClassifier()
    booster.fit(array_x, array_y)
    return booster

def AUC_accuracy(XX_model, array_x, array_y):
    """Return the area under the ROC curve of a fitted classifier.

    The score for each row is column 1 of ``XX_model.predict_proba(array_x)``;
    ``array_y`` holds the true labels.
    """
    positive_scores = XX_model.predict_proba(array_x)[:, 1]
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(array_y, positive_scores)
    return metrics.auc(false_pos_rate, true_pos_rate)

#AB_model_all = AdaBoost_Classifier(LR_OHE_array_x_all, array_y_r)
#AUC_accuracy(AB_model_all, LR_OHE_array_x_all_val, array_y_r)

In [4]:
# Columns removed from both feature frames. Kept in one place so the training
# and validation frames can never drift apart.
NON_FEATURE_COLUMNS = ['click','bidid','logtype','userid','IP','domain',
                       'url','urlid','slotid','creative','bidprice','payprice','keypage']

X_train = df_train.drop(NON_FEATURE_COLUMNS, axis=1)
y_train = df_train.click

X_val = df_validation.drop(NON_FEATURE_COLUMNS, axis=1)
y_val = df_validation.click
#X_test = df_test.drop(['bidid','logtype','userid','IP','domain',
#                'url','urlid','slotid','creative','keypage'], axis=1)

In [5]:
# Handling categorical data with one hot encoding

# 1. Encode day of week
def enc_day(X):
    """Replace the ``weekday`` column with one ``day_<value>`` indicator column per value.

    Returns a new frame; the input frame is left unchanged.
    """
    weekday_dummies = pd.get_dummies(X['weekday'], prefix='day')
    widened = pd.concat([X, weekday_dummies], axis=1)
    return widened.drop('weekday', axis=1)

# 2. Encode hours
def enc_hrs(X):
    """Replace the ``hour`` column with one ``hour_<value>`` indicator column per value.

    Returns a new frame; the input frame is left unchanged.
    """
    hour_dummies = pd.get_dummies(X['hour'], prefix='hour')
    return pd.concat([X, hour_dummies], axis=1).drop('hour', axis=1)

# Split user agent into 2 ~ OS and browser
def enc_OS_browser(X):
    """Split ``useragent`` into OS and browser and one-hot encode both parts.

    ``useragent`` is split on the first ``'_'``: the part before it becomes
    the OS, the part after it the browser. The ``useragent`` column is
    replaced by ``OS_<value>`` and ``browser_<value>`` indicator columns.

    Returns a new frame with the same index as ``X``.
    """
    # expand=True keeps X's index, so the later concat lines rows up correctly
    # even when X does not have a default 0..n-1 index. ``n`` is passed by
    # keyword because newer pandas rejects it as a positional argument.
    parts = X['useragent'].str.split('_', n=1, expand=True)
    # Guarantee both columns exist even if no value contains an underscore.
    parts = parts.reindex(columns=[0, 1])
    parts.columns = ['OS', 'browser']

    os_dummies = pd.get_dummies(parts['OS'], prefix='OS')
    browser_dummies = pd.get_dummies(parts['browser'], prefix='browser')

    return pd.concat([X.drop('useragent', axis=1), os_dummies, browser_dummies], axis=1)

# 5. Encode region
def enc_region(X):
    """Replace the ``region`` column with one ``region_<value>`` indicator column per value.

    Returns a new frame; the input frame is left unchanged.
    """
    region_dummies = pd.get_dummies(X['region'], prefix='region')
    with_dummies = pd.concat([X, region_dummies], axis=1)
    return with_dummies.drop('region', axis=1)

# 6. Encode adexchange
def enc_adexchange(X):
    """Replace the ``adexchange`` column with one ``adexchange_<value>`` indicator column per value.

    Returns a new frame; the input frame is left unchanged.
    """
    exchange_dummies = pd.get_dummies(X['adexchange'], prefix='adexchange')
    return pd.concat([X, exchange_dummies], axis=1).drop('adexchange', axis=1)

# 7. Encode slotwidth
def enc_slotwidth(X):
    """Replace the ``slotwidth`` column with one ``slotwidth_<value>`` indicator column per value.

    Returns a new frame; the input frame is left unchanged.
    """
    width_dummies = pd.get_dummies(X['slotwidth'], prefix='slotwidth')
    combined = pd.concat([X, width_dummies], axis=1)
    return combined.drop('slotwidth', axis=1)

# 8. Encode slotheight
def enc_slotheight(X):
    """Replace the ``slotheight`` column with one ``slotheight_<value>`` indicator column per value.

    Returns a new frame; the input frame is left unchanged.
    """
    height_dummies = pd.get_dummies(X['slotheight'], prefix='slotheight')
    return pd.concat([X, height_dummies], axis=1).drop('slotheight', axis=1)

# 9. Encode slotvisibility
def enc_slotvisibility(X):
    """Replace the ``slotvisibility`` column with one ``slotvisibility_<value>`` indicator column per value.

    Returns a new frame; the input frame is left unchanged.
    """
    visibility_dummies = pd.get_dummies(X['slotvisibility'], prefix='slotvisibility')
    expanded = pd.concat([X, visibility_dummies], axis=1)
    return expanded.drop('slotvisibility', axis=1)

# 10. Encode slotformat
def enc_slotformat(X):
    """Replace the ``slotformat`` column with one ``slotformat_<value>`` indicator column per value.

    Returns a new frame; the input frame is left unchanged.
    """
    format_dummies = pd.get_dummies(X['slotformat'], prefix='slotformat')
    return pd.concat([X, format_dummies], axis=1).drop('slotformat', axis=1)

# 11. Encode advertiser
def enc_advertiser(X):
    """Replace the ``advertiser`` column with one ``advertiser_<value>`` indicator column per value.

    Returns a new frame; the input frame is left unchanged.
    """
    advertiser_dummies = pd.get_dummies(X['advertiser'], prefix='advertiser')
    joined = pd.concat([X, advertiser_dummies], axis=1)
    return joined.drop('advertiser', axis=1)

# 12. Encoding slotprice into buckets
def enc_slotprice(X, bins=5):
    """Bucket ``slotprice`` and replace it with ``slotprice_<bucket>`` indicator columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing a numeric ``slotprice`` column.
    bins : int or sequence of numbers, optional
        Passed to ``pandas.cut``. An int (default 5) requests that many
        equal-width buckets over the range of ``X.slotprice``; a sequence
        gives explicit bin edges, which lets several frames share the same
        buckets. Buckets are labelled 1, 2, ... in increasing order.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as ``X``, without ``slotprice`` and with
        one indicator column per bucket (empty buckets included).
    """
    bucket_count = bins if np.isscalar(bins) else len(bins) - 1
    # Cut the Series itself (not ``.values``) so the buckets keep X's index and
    # the concat below cannot misalign rows.
    bucket = pd.cut(X['slotprice'], bins, labels=list(range(1, bucket_count + 1)))
    bucket_dummies = pd.get_dummies(bucket, prefix='slotprice')
    return pd.concat([X.drop('slotprice', axis=1), bucket_dummies], axis=1)

# 13. Encoding user tags
def enc_usertag(X):
    """Replace the comma-separated ``usertag`` column with ``usertag_<tag>`` indicator columns.

    Each distinct tag gets one column that is 1 for rows whose ``usertag``
    list contains the tag and 0 otherwise.

    Returns a new frame with the same index as ``X``.
    """
    # str.get_dummies splits and encodes in one step, keeps X's index (so the
    # concat below lines rows up), and avoids the deprecated column-wise
    # groupby that was needed to merge per-position dummy columns.
    tag_dummies = X['usertag'].str.get_dummies(sep=',').add_prefix('usertag_')
    return pd.concat([X.drop('usertag', axis=1), tag_dummies], axis=1)

def encode_labels(X):
    """Run every categorical encoder except the user-tag one over ``X``.

    The encoders are applied one after another in the order listed below,
    each receiving the frame returned by the previous one.
    """
    encoding_steps = (
        enc_day,
        enc_hrs,
        enc_OS_browser,
        enc_region,
        enc_adexchange,
        enc_slotwidth,
        enc_slotheight,
        enc_slotvisibility,
        enc_slotformat,
        enc_advertiser,
        enc_slotprice,
    )
    for apply_step in encoding_steps:
        X = apply_step(X)
    return X

In [None]:
# Encode the training frame with the same steps, in the same order, as the
# validation frame; with encode_labels disabled the two frames went through
# different pipelines.
# NOTE(review): get_dummies derives its columns from each frame's own values,
# so train and validation can still end up with different column sets —
# align the columns before fitting a model.
X_train = enc_usertag(X_train)
X_train = encode_labels(X_train)

In [None]:
# Validation features: expand the user tags first, then run the remaining encoders.
X_val = encode_labels(enc_usertag(X_val))

In [None]:
# Optional cross-validated search over the regularisation strength C.
# Set RUN_C_SEARCH to True to run it.
RUN_C_SEARCH = False

if RUN_C_SEARCH:
    # Imported here because the notebook's import cell has this import commented out.
    from sklearn.model_selection import cross_val_score

    C_s = np.logspace(-10, 1, 11)
    scores = list()
    scores_std = list()
    # The bare name LogisticRegression is never imported; use the linear_model module.
    lr = linear_model.LogisticRegression(class_weight='balanced')

    for C in C_s:
        lr.set_params(C=C)
        this_scores = cross_val_score(lr, X_train, y_train, cv=4, scoring='roc_auc')
        scores.append(np.mean(this_scores))
        scores_std.append(np.std(this_scores))

    lr_results = pd.DataFrame({'score':scores, 'score_std':scores_std, 'C':C_s})
    # A bare expression inside an ``if`` block is not displayed by the notebook.
    print(lr_results)

In [None]:
# The bare name LogisticRegression is never imported in this notebook; it is
# reached through the imported ``linear_model`` module instead.
clf_l2_LR = linear_model.LogisticRegression(class_weight='balanced')
# X_train_1 is not defined anywhere in the notebook, so the model is fitted on
# the X_train frame built above.
# NOTE(review): X_train and X_val must have identical columns for predict to
# be meaningful — confirm both were encoded the same way.
clf_l2_LR.fit(X_train, y_train)
y_pred = clf_l2_LR.predict(X_val)

In [None]:
# Predicted class probabilities for the validation rows, wrapped in a
# DataFrame so later cells can select a column by position (pCTR[1]).
predprobs = clf_l2_LR.predict_proba(X_val)
pCTR = pd.DataFrame(data=predprobs)

In [None]:
# Re-scale the predicted probabilities with q = p / (p + (1 - p) / w).
# `train` is not defined in this notebook; the training labels come from df_train.
# The original multiplied both class counts by len(train) / 2 before taking
# their ratio; that factor cancels, so w is simply count(click=1) / count(click=0).
class_counts = np.bincount(df_train.click)
w = class_counts[1] / class_counts[0]

# Vectorised over the whole column instead of appending in a Python loop;
# kept as a list, as before.
raw_p = pCTR[1]
pred = (raw_p / (raw_p + ((1 - raw_p) / w))).tolist()

In [None]:
# `validation` is not defined in this notebook; the validation labels are y_val
# (df_validation.click).
fpr, tpr, thresholds = metrics.roc_curve(y_val, pred)
print('AUC accuracy:',metrics.auc(fpr, tpr))

## Linear bidding strategy

In [None]:
# Work on a copy: plain assignment only aliases df_validation, so adding the
# pCTR columns would silently modify the validation frame as well.
df_int2 = df_validation.copy()
# NB_model_allpCTR_v is not defined anywhere in this notebook; the re-scaled
# logistic-regression predictions (`pred`) are the pCTR estimate computed
# above. Swap in another pCTR source here if required.
df_int2['pCTR'] = pred
# Normalise by the mean so that a value of 1.0 corresponds to the average pCTR.
df_int2['pCTR_norm'] = df_int2['pCTR'].values / df_int2['pCTR'].mean()

In [None]:
def linear_bid(baseline_bid, imp_pCTR_norm):
    """Return the baseline bid scaled linearly by the normalised pCTR.

    Works element-wise when ``imp_pCTR_norm`` is a numpy array.
    """
    return baseline_bid * imp_pCTR_norm

def nlinear_bid(baseline_bid, imp_pCTR_norm):
    """Return the baseline bid scaled by the square of the normalised pCTR.

    Works element-wise when ``imp_pCTR_norm`` is a numpy array.
    """
    # The original stored the result in ``lin_bid`` but returned ``nlin_bid``,
    # which raised NameError on every call.
    nlin_bid = baseline_bid * (imp_pCTR_norm ** 2)
    return nlin_bid

def bid_run(df_data_pCTR, baseline_bid, bid_strat= linear_bid, test_run= False, save_run= False, budget= 25000000):
    """Price every impression and, unless ``test_run``, replay the auction under a budget.

    Parameters
    ----------
    df_data_pCTR : pandas.DataFrame
        Needs a ``pCTR_norm`` column, plus ``bidid`` when ``test_run`` is true
        or ``bidprice`` when it is false. The frame is modified in place.
    baseline_bid : number
        Base bid handed to the bidding strategy.
    bid_strat : callable, optional
        ``bid_strat(baseline_bid, pCTR_norm_values)`` returning one bid per row.
    test_run : bool, optional
        True: only compute bids and build the submission frame. False: also
        decide which impressions are won.
    save_run : bool, optional
        In a test run, write the submission frame to ``Submission/Val/Group_XX.csv``.
    budget : number, optional
        Spending limit for the replay (default 25000000).

    Returns
    -------
    pandas.DataFrame
        ``df_data_pCTR`` with ``imp_bid`` added, and for a non-test run also
        ``imp_h_bid`` ('Y'/'N': bid is at least ``bidprice``) and ``imp_win``
        ('Y' won, 'N' bid too low, 'Insuf/b' skipped because the cost would
        reach the budget).
    """
    # Honour the requested strategy; previously linear_bid was always used.
    df_data_pCTR['imp_bid'] = bid_strat(baseline_bid, df_data_pCTR['pCTR_norm'].values)

    if test_run:
        df_output = df_data_pCTR[['bidid', 'imp_bid']].rename(columns={'imp_bid': 'bidprice'})

        if save_run:
            output_directory= 'Submission/Val/'
            output_filename= 'Group_XX.csv'
            # Make sure the target directory exists before writing.
            os.makedirs(output_directory, exist_ok=True)
            df_output.to_csv(output_directory + output_filename)
            print('Submission file saved: ', os.getcwd(), output_directory, output_filename)
        else:
            print('Submission file not saved')

        return df_data_pCTR

    # NOTE(review): both the comparison and the cost use the `bidprice` column,
    # as in the original code — confirm this is the intended price column.
    outbid = (df_data_pCTR['imp_bid'] >= df_data_pCTR['bidprice']).values
    df_data_pCTR['imp_h_bid'] = np.where(outbid, 'Y', 'N')

    # The budget check is sequential: a skipped impression costs nothing, so a
    # later cheaper one may still fit. Outcomes are collected in a list and
    # assigned once, replacing per-cell writes through DataFrame.set_value,
    # which no longer exists in current pandas.
    total_cost = 0
    outcomes = []
    for is_high_bid, price in zip(outbid, df_data_pCTR['bidprice'].values):
        if not is_high_bid:
            outcomes.append('N')
        elif (total_cost + price) < budget:
            outcomes.append('Y')
            total_cost += price
        else:
            outcomes.append('Insuf/b')
    df_data_pCTR['imp_win'] = outcomes

    return df_data_pCTR

In [None]:
# Replay the auction on the pCTR frame with a baseline bid of 1000.
df_data_pCTR_bid = bid_run(df_int2, 1000, test_run=False)

In [None]:
# Budget that CTR_evaluation divides total spend by when reporting '% budget'.
budget= 25000000

def CTR_evaluation(df_data_pCTR_bid):
    """Print click-through-rate statistics and spend KPIs for a bid run.

    Expects the columns ``click``, ``imp_h_bid`` ('Y'/'N') and ``bidprice``.
    Rows are grouped by ``imp_h_bid``; spend is the sum of ``bidprice`` over
    the 'Y' rows and is compared with the module-level ``budget``.
    Nothing is returned; all results are printed.
    """
    click_col = df_data_pCTR_bid['click']
    was_clicked = df_data_pCTR_bid.click == 1

    clicks = click_col[click_col == 1].count()
    # ``!= 10`` matches every row, so this counts all non-null click values.
    total = click_col[click_col != 10].count()
    total_CTR = clicks / total
    print('Sample average', total_CTR)

    high_bid = df_data_pCTR_bid['imp_h_bid'] == 'Y'
    win = click_col[high_bid].count()
    win_click = click_col[high_bid & was_clicked].count()
    won_CTR = win_click / win
    print('Win bids', won_CTR)

    low_bid = df_data_pCTR_bid['imp_h_bid'] == 'N'
    loss = click_col[low_bid].count()
    loss_click = click_col[low_bid & was_clicked].count()
    loss_CTR = loss_click / loss
    print('Not won bids', loss_CTR)
    print('Won: Average ratio', won_CTR / total_CTR)

    print('KPIs---')
    print('Number of clicks bid (NCB)', win_click)
    spend = df_data_pCTR_bid['bidprice'][high_bid].sum()
    print('Total spend', spend)
    print('Total spend (% budget)', (spend / budget))
    print('Average CPM (Cost Per Mille)', '???')
    print('Average CPC (Cost Per Click)', spend / win_click)
    
# Print the CTR statistics and spend KPIs for the bid run computed above.
CTR_evaluation(df_data_pCTR_bid)

In [None]:
# Find other evaluation metric
# Optimise for baseline_bid
# Optimise for other pCTR