In [125]:
# TODO: consider whether sampling is required, because one-hot encoding (OHC) will increase the data size significantly
import os
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin # for custom encoder for user tags
import gc # force garbage collection to free memory

In [126]:
# Switch for the "save results" cells: when True, the processed arrays are written to CSV with np.savetxt
SAVE_PROCESSED_DATA = True

In [127]:
# Input/output locations, all resolved against the current working directory.
# Assumes the notebook is executed from its own folder, and that the 'data' and
# 'output' folders sit next to the 'code' folder under the same parent directory.
scriptDir = os.getcwd()
relPath = r"../data/"
relPathOutput = r"../output/"

# One path per input CSV, each located in the data folder
trainFilePath, valFilePath, sampleFilePath, testFilePath = (
    os.path.join(scriptDir, relPath, fileName)
    for fileName in ("train.csv", "validation.csv", "sample.csv", "test.csv")
)
outputFolderPath = os.path.join(scriptDir, relPathOutput)

In [128]:
# Read the training set: comma-separated, header on the first row, double-quote quoting
df = pd.read_csv(
    trainFilePath,
    sep=',',
    header=0,
    quotechar='"',
)

In [129]:
# Print the (rows, columns) of the raw training frame, then display its first 5 rows
print(df.shape)
df.head(5)

(2430981, 25)


Unnamed: 0,click,weekday,hour,bidid,userid,useragent,IP,region,city,adexchange,...,slotheight,slotvisibility,slotformat,slotprice,creative,bidprice,payprice,keypage,advertiser,usertag
0,0,5,22,b7bea80521fdecd95d2d761a38c91c3f09618066,2e880fb7d690cf7377b2e42e701728e3f3c0e4c1,windows_ie,125.37.175.*,2,2,2.0,...,200,2,0,5,a4f763f78ef3eedfe614263b94a8924e,238,5,0f951a030abdaedd733ee8d114ce2944,3427,NaN
1,0,1,20,4f51205475678f5a124bc76b2c54163bf8eaa7eb,3a1fe01360ff8100e7d006b83b77a3e4c01d928c,windows_chrome,171.36.92.*,238,239,1.0,...,250,FourthView,Na,0,10722,294,23,,2821,NaN
2,0,3,13,b604e3fd054a658ab7ced4285ebf2ef54d2bd890,801d18a056b6fe6b06a794aef17fb0d6daff2414,windows_ie,59.46.106.*,40,41,2.0,...,250,2,0,5,798b2d49952d77f1eace9f23c210d0b5,238,24,0f951a030abdaedd733ee8d114ce2944,3427,10052100061386610110
3,0,6,23,0348beeae93e561584c3b50fc9e7746a33048ad7,0d6eaf2259699990e38a1fc5116f112070b9ecdc,windows_ie,114.250.226.*,1,1,1.0,...,600,2,1,0,cb7c76e7784031272e37af8e7e9b062c,300,25,bebefa5efe83beee17a3d245e7c5085b,1458,138661006310111
4,0,5,6,268149c1789bce2bc9798ffd97ec431219bafeb3,a239d9bb642460d974ba67f85e63b8d3e214da0e,windows_ie,183.63.192.*,216,233,2.0,...,90,OtherView,Na,133,7330,277,133,,2259,NaN


In [130]:
# Split into features and labels
# List all columns and summarise what will be done with them
# In general, numerical columns will be scaled and categorical columns will be one-hot-encoded
# Some additional (derived) features need to be created
# Some columns will be dropped (ignored)

In [131]:
# click - LABEL
# weekday - categorical, encode
# hour - categorical, encode
# bidid - ignore
# userid - ignore
# useragent - ignore (but see OS and browser)
# IP - ignore
# region - categorical, encode
# city - categorical, encode
# adexchange - categorical, encode
# domain - ignore
# url - ignore
# urlid - ignore
# slotid - ignore
# slotwidth - numeric, scaled
# slotheight - numeric, scaled
# slotvisibility - categorical, encode
# slotformat - categorical, encode
# slotprice - numeric, scaled
# creative - ?
# bidprice - ignore (train/val only)
# payprice - ignore (train/val only)
# keypage - ?
# advertiser - categorical, encode
# usertag - categorical, parse+separate and encode
# OS - derived, categorical, encode
# browser - derived, categorical, encode
# slotsize - derived, numerical, scaled

In [132]:
# reduce dataset for testing
#df = df.head(10)

In [133]:
# Derive 'OS' and 'browser' from 'useragent' by splitting on '_':
# the first piece becomes the OS, the second piece the browser
dfn = df['useragent'].str.split('_', expand = True)
df['OS'], df['browser'] = dfn[0], dfn[1]
# Derived numeric feature: slot area (height x width)
df['slotsize'] = df['slotheight'].mul(df['slotwidth'])

In [134]:
# convert numeric (non categorical) fields from int to float (to avoid warnings later)
# TODO

In [135]:
# Features: categorical columns first, then the numeric columns, then the raw usertag string.
# Label: the 'click' column.
X_train = df.loc[:, [
    'weekday', 'hour', 'region', 'city', 'adexchange', 'slotvisibility', 'slotformat', 'advertiser', 'OS', 'browser',
    'slotwidth', 'slotheight', 'slotprice', 'slotsize',
    'usertag',
]]
y_train = df['click']
# print(X_train.head(5))
# print(y_train.head(5))

In [136]:
class UserTagEncoder(BaseEstimator, TransformerMixin):
    """Multi-hot encoder for a column of comma-separated user tags.

    ``fit`` collects the distinct tags found in the first column of ``X``.
    ``transform`` returns a dense float array with one 0/1 indicator column
    per learned tag. Cells that are not strings (e.g. NaN where the tag list
    is missing) contribute no tags and encode as an all-zero row. Tags that
    were not seen during ``fit`` are ignored by ``transform``.

    Attributes set by ``fit``:
        tags: list of learned tags, sorted so that the column order is the
            same on every run (set iteration order of strings is not).
        columns: output column names, each tag prefixed with ``'u0_'``.
        fitted: True once ``fit`` has been called.
    """

    def __init__(self):
        self.fitted = False

    @staticmethod
    def _first_column(X):
        """Return the tag cells of ``X`` as a 1-D object array.

        2-D input (DataFrame / array with one column) yields its first
        column; 1-D input is used as-is. ``dtype=object`` keeps strings as
        Python ``str`` and leaves NaN cells as floats.
        """
        values = np.asarray(X, dtype=object)
        if values.ndim == 1:
            return values
        return values[:, 0]

    @staticmethod
    def _split_tags(cell):
        """Split one cell into its tags; non-string cells (e.g. NaN) yield no tags."""
        return cell.split(',') if isinstance(cell, str) else []

    def fit(self, X, y=None):
        """Learn the set of tags present in ``X``.

        Parameters:
            X: array-like whose first column holds comma-separated tag strings.
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            self, so that calls can be chained.
        """
        uniqueTags = set()
        for cell in self._first_column(X):
            uniqueTags.update(self._split_tags(cell))

        # Sorted: gives a reproducible column layout across runs
        self.tags = sorted(uniqueTags)
        self.columns = ['u0_' + tag for tag in self.tags]
        self.fitted = True

        # Return the transformer
        return self

    def transform(self, X):
        """Encode ``X`` as 0/1 indicators over the tags learned by ``fit``.

        Parameters:
            X: array-like whose first column holds comma-separated tag strings.

        Returns:
            numpy array of shape (number of rows, number of learned tags).

        Raises:
            ValueError: if ``fit`` has not been called.
        """
        # Check if fit has been called
        if not self.fitted:
            raise ValueError("Estimator not fitted")

        cells = self._first_column(X)

        # Tag -> output column lookup, so each row costs only as much as its own tag count
        columnOf = {tag: col for col, tag in enumerate(self.tags)}

        X_new = np.zeros((len(cells), len(self.tags)))
        for row, cell in enumerate(cells):
            for tag in self._split_tags(cell):
                col = columnOf.get(tag)
                if col is not None:  # tags unseen during fit are skipped
                    X_new[row, col] = 1

        return X_new

In [137]:
# Pre-processing: scale the numeric columns, one-hot-encode the categorical columns,
# and multi-hot-encode the user tags, all combined in a single ColumnTransformer.
# NOTE: adexchange removed from catgColumns since contains NaNs. TODO properly
# catg
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`, and 1.4
# removed the old name. Try the current spelling first and fall back for older installs.
oneHotKwargs = dict(dtype=int, handle_unknown='ignore')
try:
    oneHotEnc = OneHotEncoder(sparse_output=False, **oneHotKwargs)
except TypeError:
    oneHotEnc = OneHotEncoder(sparse=False, **oneHotKwargs)
catgColumns = ['weekday', 'hour', 'region', 'city', 'slotvisibility', 'slotformat', 'advertiser', 'OS', 'browser']
catgTransform_info = ('catg', oneHotEnc, catgColumns)
# num
scale = StandardScaler()
numColumns = ['slotwidth', 'slotheight', 'slotprice', 'slotsize']
numTransform_info = ('num', scale, numColumns)
# usertags
tagEncoder = UserTagEncoder()
tagColumn = ['usertag']
tagTransform_info = ('tag', tagEncoder, tagColumn)
# column transformer for all
# The numeric block must stay first: the save cells format the first len(numColumns)
# output columns as floats and every remaining column as an integer.
allPreProc = ColumnTransformer([numTransform_info, catgTransform_info, tagTransform_info])
X_train_pp = allPreProc.fit_transform(X_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [138]:
# example processed record
#X_train_pp[0,:]

In [139]:
# check dimensions: (rows, encoded feature columns) of the processed training array
X_train_pp.shape

(2430981, 547)

In [140]:
# save results
if SAVE_PROCESSED_DATA:
    # Write under outputFolderPath (defined with the other paths) and create the folder
    # on first use, so np.savetxt does not fail when 'output' does not exist yet.
    os.makedirs(outputFolderPath, exist_ok=True)
    numNumeric = len(numColumns)
    numCatg = X_train_pp.shape[1] - numNumeric
    # Row format: the leading scaled numeric columns as floats, all remaining 0/1 columns as integers
    rowFormat = ','.join(['%12.10f']*numNumeric + ['%i']*numCatg)
    np.savetxt(os.path.join(outputFolderPath, "y_train.csv"), y_train, fmt='%i')
    np.savetxt(os.path.join(outputFolderPath, "X_train_pp.csv"), X_train_pp, fmt=rowFormat)

In [113]:
# TODO save column layout

In [141]:
# *** clear training data from memory before continuing ***
# Drop every reference to the training frames/arrays, then force a collection pass
del df, X_train, X_train_pp, y_train
gc.collect()

83

In [142]:
# Load and run on test data

In [143]:
# Read the test set with the same CSV settings as the training set
dfTest = pd.read_csv(
    testFilePath,
    sep=',',
    header=0,
    quotechar='"',
)

In [144]:
# Display the first 5 rows of the raw test frame
dfTest.head(5)

Unnamed: 0,weekday,hour,bidid,userid,useragent,IP,region,city,adexchange,domain,...,slotid,slotwidth,slotheight,slotvisibility,slotformat,slotprice,creative,keypage,advertiser,usertag
0,0,12,366c563de7d90feb9d4dab53e795a93fb3157387,75045dd2f2136c93fe55fe6c446ec1527ed8f0bb,windows_chrome,27.197.36.*,146,159,3.0,DDTSQuf0MTTNaqKIvMpENpn,...,Astro_F_Rectangle,300,250,0,0,10,00fccc64a1ee2809348509b7ac2a97a5,b2e35064f3549d447edbbdfb1f707c8c,3427,"10024,10077,10075,10063,10031,10102,10006,1304..."
1,3,14,29167d4caa719788b5a342dbaa25151d53121f80,11279eb1f8f7a88f877db911673522b6ff202aa7,android_safari,124.126.227.*,1,1,,d63d96468f66986d5a8eb5225ad0b6f0,...,2745306850,320,50,FirstView,Na,5,11908,,2997,
2,5,19,ff8bc3f4d44a3ea60c5f3a3a8fbe7cd98fb2966e,4771a819a3f5b86776d8a9456f4f2506578f78d8,windows_ie,116.116.104.*,27,34,1.0,trqRTJkrBoq7JsNr5SqfNX,...,mm_34022157_3445226_11175100,336,280,2,1,0,77819d3e0b3467fe5c7b16d68ad923a1,bebefa5efe83beee17a3d245e7c5085b,1458,"13866,10006,10024,10059,10048,10063,10067,1008..."
3,0,21,844c2da00d45315f20b748ec131c26ee99a7cbc7,5360671379f3204afa9cc0edd8fd2e2096cb09eb,windows_ie,115.46.145.*,238,245,4.0,6d27caf5a202ec085f80af7163ef7cb0,...,210842,960,90,FirstView,Na,30,10717,,2821,10075130421000610110137761003110063
4,2,20,c6017f0ad0c44d7d0c9b62583ea863f28941c0ca,d791b5f6306b9d299f81daa5448ced7e5bfa010f,windows_ie,221.199.203.*,27,35,3.0,3FKElpuEMusyJqKbuKz,...,Fashion_F_Rectangle,300,250,0,0,50,44966cc8da1ed40c95d59e863c8c75f0,43f4a37f42a7c5e6219e2601b26c6976,3386,10063


In [145]:
# Derive 'OS' and 'browser' from 'useragent' by splitting on '_':
# the first piece becomes the OS, the second piece the browser
dfn = dfTest['useragent'].str.split('_', expand = True)
dfTest['OS'], dfTest['browser'] = dfn[0], dfn[1]
# Derived numeric feature: slot area (height x width)
dfTest['slotsize'] = dfTest['slotheight'].mul(dfTest['slotwidth'])

In [146]:
# convert numeric (non categorical) fields from int to float (to avoid warnings later)
# TODO

In [147]:
# Select the feature columns for the test set (same column list as used for training)
X_test = dfTest.loc[:, [
    'weekday', 'hour', 'region', 'city', 'adexchange', 'slotvisibility', 'slotformat', 'advertiser', 'OS', 'browser',
    'slotwidth', 'slotheight', 'slotprice', 'slotsize',
    'usertag',
]]
# print(X_train.head(5))

In [148]:
X_test_pp = allPreProc.transform(X_test) # note, transform only (no fit): reuses the transformer already fitted on the training data
# adexchange has NaNs in the test dataset (none in train)
#  I have removed this for now but need to address it properly later with an additional pre-processing step



  res = transformer.transform(X)


In [149]:
# example transformed test record: display every encoded value of the first row
X_test_pp[0,:]

array([-0.86243235,  0.46715048, -0.45129118, -0.11576527,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

In [150]:
# check correct size: (rows, encoded feature columns) of the processed test array
X_test_pp.shape

(303375, 547)

In [154]:
# save results
if SAVE_PROCESSED_DATA:
    # Write under outputFolderPath (defined with the other paths) and create the folder
    # on first use, so np.savetxt does not fail when 'output' does not exist yet.
    os.makedirs(outputFolderPath, exist_ok=True)
    numNumeric = len(numColumns)
    numCatg = X_test_pp.shape[1] - numNumeric
    # Row format: the leading scaled numeric columns as floats, all remaining 0/1 columns as integers
    rowFormat = ','.join(['%12.10f']*numNumeric + ['%i']*numCatg)
    np.savetxt(os.path.join(outputFolderPath, "X_test_pp.csv"), X_test_pp, fmt=rowFormat)

In [113]:
# TODO save column layout

In [159]:
# Drop every reference to the test frames/arrays, then force a collection pass
del dfTest, X_test, X_test_pp
gc.collect()

112