In [1]:
# CONSIDER WHETHER SAMPLING REQUIRED BECAUSE OHC WILL INCREASE DATA SIZE SIGNIFICANTLY
import os
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd
# try with my custom encoder for user tags
from sklearn.base import BaseEstimator, TransformerMixin

In [40]:
# Locations are resolved relative to the current working directory, so the notebook
# must be run from its own folder (the 'code' folder). The 'data' and 'output' folders
# are expected to sit next to it under the same parent directory.
scriptDir = os.getcwd()
relPath = r"../data/"
relPathOutput = r"../output/"

# Input CSVs: training set, validation set, small sample for quick experiments, test set
trainFilePath, valFilePath, sampleFilePath, testFilePath = (
    os.path.join(scriptDir, relPath, csvName)
    for csvName in ("train.csv", "validation.csv", "sample.csv", "test.csv")
)

# Folder that results are written to
outputFolderPath = os.path.join(scriptDir, relPathOutput)

In [3]:
# Load data
# Reads the small sample file (not the full training set): comma-separated, first row is
# the header, double quotes used as the quoting character.
# NOTE(review): the saved preview shows an 'Unnamed: 0' column, presumably an index that was
# written out with the CSV - confirm whether it should be read with index_col=0 instead.
df = pd.read_csv(sampleFilePath, sep=',',header=0, quotechar='"')

In [15]:
# Preview the first two rows of the loaded sample.
# NOTE(review): the saved output already lists OS, browser and slotsize, which are only
# created by a cell further down, so this preview was produced after that cell had run.
df.head(2)

Unnamed: 0,click,weekday,hour,bidid,userid,useragent,IP,region,city,adexchange,...,slotprice,creative,bidprice,payprice,keypage,advertiser,usertag,OS,browser,slotsize
0,0,5,22,b7bea80521fdecd95d2d761a38c91c3f09618066,2e880fb7d690cf7377b2e42e701728e3f3c0e4c1,windows_ie,125.37.175.*,2,2,2.0,...,5,a4f763f78ef3eedfe614263b94a8924e,238,5,0f951a030abdaedd733ee8d114ce2944,3427,,windows,ie,40000
1,0,1,20,4f51205475678f5a124bc76b2c54163bf8eaa7eb,3a1fe01360ff8100e7d006b83b77a3e4c01d928c,windows_chrome,171.36.92.*,238,239,1.0,...,0,10722,294,23,,2821,,windows,chrome,75000


In [4]:
# Split into features and labels
# List all columns and summarise what will be done with them
# In general, numerical will be scaled and categorical will be one-hot-encoded
# Some will be dropped
# Some additional features need to be created

In [5]:
# click - LABEL
# weekday - categorical, encode
# hour - categorical, encode
# bidid - ignore
# userid - ignore
# useragent - ignore (but see OS and browser)
# IP - ignore
# region - categorical, encode
# city - categorical, encode
# adexchange - categorical, encode
# domain - ignore
# url - ignore
# urlid - ignore
# slotid - ignore
# slotwidth - numeric, scaled
# slotheight - numeric, scaled
# slotvisibility - categorical, encode
# slotformat - categorical, encode
# slotprice - numeric, scaled
# creative - ?
# bidprice - ignore (train/val only)
# payprice - ignore (train/val only)
# keypage - ?
# advertiser - categorical, encode
# usertag - NEED TO REVIEW
# OS - derived, categorical, encode
# browser - derived, categorical, encode
# slotsize - derived, numerical, scaled

In [6]:
# reduce dataset for testing
# head() returns a slice of the loaded frame; take an explicit copy so that the derived
# columns added afterwards are written to an independent frame rather than to a slice
# (which is what pandas' SettingWithCopyWarning guards against).
df = df.head(10).copy()

In [7]:
# Derive 'OS' and 'browser' from 'useragent': the value is split on '_' and the first
# piece becomes the OS, the second the browser (e.g. windows_ie -> windows / ie)
dfn = df['useragent'].str.split(pat='_', expand=True)
df['OS'], df['browser'] = dfn[0], dfn[1]
# 'slotsize' is the product of the slot height and the slot width
df['slotsize'] = df['slotheight'].mul(df['slotwidth'])

In [8]:
# convert numeric (non categorical) fields from int to float (to avoid warnings later)
# TODO

In [16]:
# Separate the model inputs (X_train) from the click label (y_train)
X_train = df[
    # categorical columns
    ['weekday', 'hour', 'region', 'city', 'adexchange',
     'slotvisibility', 'slotformat', 'advertiser', 'OS', 'browser']
    # numeric columns
    + ['slotwidth', 'slotheight', 'slotprice', 'slotsize']
    # comma-separated tag string, handled by the custom encoder
    + ['usertag']
]
y_train = df['click']

   weekday  hour  region  city  adexchange slotvisibility slotformat  \
0        5    22       2     2         2.0              2          0   
1        1    20     238   239         1.0     FourthView         Na   
2        3    13      40    41         2.0              2          0   
3        6    23       1     1         1.0              2          1   
4        5     6     216   233         2.0      OtherView         Na   
5        4    17     276   277         1.0              2          1   
6        3    12      80    89         3.0              0          0   
7        3    16      15    19         1.0              2          1   
8        5    14     333   342         3.0              0          0   
9        5     0       2     2         3.0              0          0   

   advertiser       OS browser  slotwidth  slotheight  slotprice  slotsize  \
0        3427  windows      ie        200         200          5     40000   
1        2821  windows  chrome        300         2

In [30]:
class UserTagEncoder(BaseEstimator, TransformerMixin):
    """Multi-hot encoder for a single column of comma-separated tag strings.

    Each cell of the input column is either a string such as ``"10024,10077"`` or a
    non-string value (e.g. NaN for a missing entry). ``fit`` collects every distinct tag
    seen in the column; ``transform`` returns one 0/1 indicator column per collected tag.
    Non-string cells and tags that were not seen during ``fit`` contribute nothing, so
    such rows/tags simply produce zeros.

    Attributes set by ``fit``:
        tags    -- list of the distinct tags, in sorted order (one output column each)
        columns -- list of output column names, ``'u0_' + tag`` for each entry of ``tags``
        fitted  -- True once ``fit`` has been called
    """

    def __init__(self):
        self.fitted = False

    @staticmethod
    def _first_column(X):
        """Return the tag column of X as a 1-D object array.

        Accepts 2-D input (the first column is used) as well as 1-D input. Converting
        with dtype=object keeps string cells as plain Python/NumPy strings instead of
        letting NumPy coerce mixed str/NaN data.
        """
        values = np.asarray(X, dtype=object)
        if values.ndim == 1:
            return values
        return values[:, 0]

    @staticmethod
    def _split_tags(cell):
        """Return the list of tags in one cell; non-string cells (e.g. NaN) have none."""
        # isinstance (rather than an exact type comparison) also accepts numpy.str_,
        # which is what NumPy yields when the input is a string-typed array
        if not isinstance(cell, str):
            return []
        return cell.split(',')

    def fit(self, X, y=None):
        """Learn the set of distinct tags present in X.

        Parameters:
            X -- array-like whose first column holds the comma-separated tag strings
            y -- ignored; present for scikit-learn API compatibility

        Returns:
            self
        """
        usertagsUnique = set()
        for cell in self._first_column(X):
            usertagsUnique.update(self._split_tags(cell))

        # Sort so that the output column order is reproducible: iteration order of a set
        # of strings can differ from one Python process to the next
        self.tags = sorted(usertagsUnique)
        self.columns = ['u0_' + tag for tag in self.tags]
        self.fitted = True

        # Return the transformer
        return self

    def transform(self, X):
        """Encode X as a dense 0/1 matrix with one column per tag learned in ``fit``.

        Parameters:
            X -- array-like whose first column holds the comma-separated tag strings

        Returns:
            numpy array of shape (number of rows in X, len(self.tags))

        Raises:
            ValueError -- if ``fit`` has not been called
        """
        # Check that fit has been called
        if not self.fitted:
            raise ValueError("Estimator not fitted")

        tagColumn = self._first_column(X)

        # Map each known tag to its output column so each row is encoded with direct
        # lookups instead of scanning the row's tag list once per known tag
        tagPosition = {tag: m for m, tag in enumerate(self.tags)}

        X_new = np.zeros((len(tagColumn), len(self.tags)))
        for n, cell in enumerate(tagColumn):
            for tag in self._split_tags(cell):
                m = tagPosition.get(tag)
                # tags not seen during fit have no column and are ignored
                if m is not None:
                    X_new[n, m] = 1

        return X_new

In [52]:
# Pre-processing
# NOTE: adexchange removed from catgColumns since contains NaNs. TODO properly
# catg
# The dense-output keyword of OneHotEncoder is 'sparse_output' in newer scikit-learn
# releases and 'sparse' in older ones (the old keyword is rejected by newer releases).
# Try the new spelling first and fall back so the cell runs on either.
try:
    oneHotEnc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    oneHotEnc = OneHotEncoder(sparse=False, handle_unknown='ignore')
catgColumns = ['weekday', 'hour', 'region', 'city', 'slotvisibility', 'slotformat', 'advertiser', 'OS', 'browser']
catgTransform_info = ('catg', oneHotEnc, catgColumns)
# num
scale = StandardScaler()
numColumns = ['slotwidth', 'slotheight', 'slotprice', 'slotsize']
numTransform_info = ('num', scale, numColumns)
# usertags
tagEncoder = UserTagEncoder()
tagColumn = ['usertag']
tagTransform_info = ('tag', tagEncoder, tagColumn)
# column transformer for all three groups; columns not listed above (e.g. adexchange)
# are not passed to any transformer here
allPreProc = ColumnTransformer([catgTransform_info, numTransform_info, tagTransform_info])
# fit on the training features and transform them in one step
X_train_pp = allPreProc.fit_transform(X_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [49]:
# example processed record: the full encoded feature vector for row index 3 of the
# training sample (one-hot columns, then the scaled numeric columns, then the tag columns,
# following the order of the transformers in allPreProc)
X_train_pp[3,:]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  1.        , -0.99925267,
        1.84775919, -0.69959743,  0.90641259,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        1.        ,  0.        ,  0.        ,  0.        ])

In [11]:
# Load and run on test data (will need to do other steps first too)

In [41]:
# Load the test data: comma-separated, first row is the header, double quotes as the
# quoting character
dfTest = pd.read_csv(testFilePath, sep=',',header=0, quotechar='"')

In [42]:
# Preview the first two rows of the test data (the saved output shows no click,
# bidprice or payprice columns in this file)
dfTest.head(2)

Unnamed: 0,weekday,hour,bidid,userid,useragent,IP,region,city,adexchange,domain,...,slotid,slotwidth,slotheight,slotvisibility,slotformat,slotprice,creative,keypage,advertiser,usertag
0,0,12,366c563de7d90feb9d4dab53e795a93fb3157387,75045dd2f2136c93fe55fe6c446ec1527ed8f0bb,windows_chrome,27.197.36.*,146,159,3.0,DDTSQuf0MTTNaqKIvMpENpn,...,Astro_F_Rectangle,300,250,0,0,10,00fccc64a1ee2809348509b7ac2a97a5,b2e35064f3549d447edbbdfb1f707c8c,3427,"10024,10077,10075,10063,10031,10102,10006,1304..."
1,3,14,29167d4caa719788b5a342dbaa25151d53121f80,11279eb1f8f7a88f877db911673522b6ff202aa7,android_safari,124.126.227.*,1,1,,d63d96468f66986d5a8eb5225ad0b6f0,...,2745306850,320,50,FirstView,Na,5,11908,,2997,


In [43]:
# Derive 'OS' and 'browser' for the test data from 'useragent': the value is split on
# '_' and the first piece becomes the OS, the second the browser
dfn = dfTest['useragent'].str.split(pat='_', expand=True)
dfTest['OS'], dfTest['browser'] = dfn[0], dfn[1]
# 'slotsize' is the product of the slot height and the slot width
dfTest['slotsize'] = dfTest['slotheight'].mul(dfTest['slotwidth'])

In [44]:
# convert numeric (non categorical) fields from int to float (to avoid warnings later)
# TODO

In [46]:
# Select the model inputs from the test data (there is no label to split off here)
X_test = dfTest[
    # categorical columns
    ['weekday', 'hour', 'region', 'city', 'adexchange',
     'slotvisibility', 'slotformat', 'advertiser', 'OS', 'browser']
    # numeric columns
    + ['slotwidth', 'slotheight', 'slotprice', 'slotsize']
    # comma-separated tag string, handled by the custom encoder
    + ['usertag']
]

In [53]:
# Apply the pre-processing that was fitted on the training features to the test features
X_test_pp = allPreProc.transform(X_test) # note, transform only (no fit)
# adexchange has NaNs in the test dataset (none in train)
# I have removed this column from the encoded features for now, but need to address it
# properly later with an additional pre-processing step



  res = transformer.transform(X)


In [54]:
# example transformed test record: the full encoded feature vector for the first test row
X_test_pp[0,:]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  1.        ,  1.        ,  0.        , -0.60339706,
       -0.02136138, -0.46870719, -0.27659949,  1.        ,  1.        ,
        1.        ,  1.        ,  0.        ,  1.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ])