In [1]:
from sklearn.linear_model import LogisticRegression
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import string
import math
import csv
import random

In [2]:
class Bid:
    """A single record of the bid log, holding the columns used downstream.

    Constructor arguments map to CSV columns as follows (0-based index):
    weekday=1, hour=2, bidid=3, useragent=6, region=8, city=9, adexchange=10,
    slotwidth=15, slotheight=16, slotvisibility=17, slotformat=18,
    slotprice=19, bidprice=21, payprice=22, advertiser=24, usertag=25.

    slotwidth, slotheight, slotprice, bidprice and payprice are converted with
    int(); every other value is stored exactly as passed in. The IP, domain,
    url, urlid, slotid, creative and keypage columns are not stored.
    """

    def __init__(self,weekday,hour,bidid,useragent,region,city,adexchange,slotwidth,slotheight,
                 slotvisibility,slotformat,slotprice,bidprice, payprice,advertiser,usertag):
        unchanged = lambda value: value

        # (attribute name, raw value, converter), in the order the attributes are set.
        layout = (
            ('weekday', weekday, unchanged),
            ('hour', hour, unchanged),
            ('bidid', bidid, unchanged),
            ('useragent', useragent, unchanged),
            ('region', region, unchanged),
            ('city', city, unchanged),
            ('adexchange', adexchange, unchanged),
            ('slotwidth', slotwidth, int),
            ('slotheight', slotheight, int),
            ('slotvisibility', slotvisibility, unchanged),
            ('slotformat', slotformat, unchanged),
            ('slotprice', slotprice, int),
            ('bidprice', bidprice, int),
            ('payprice', payprice, int),
            ('advertiser', advertiser, unchanged),
            ('usertag', usertag, unchanged),
        )
        for attribute, raw, convert in layout:
            setattr(self, attribute, convert(raw))
        
def load_data(filepath, max_rows=10000):
    """Read the bid log at `filepath` into Bid objects grouped by advertiser.

    Parameters
    ----------
    filepath : str
        Path to a comma-separated log whose first line is a header.
    max_rows : int or None, optional
        Maximum number of data rows to read (default 10000, the limit that
        used to be hard-coded). Pass None to read the whole file.

    Returns
    -------
    collections.defaultdict(list)
        Maps the advertiser id (column 24) to a list of (Bid, label) tuples,
        where label is int(column 0).
    """
    data = defaultdict(list)

    # newline='' is what the csv module expects, so that line endings inside
    # quoted fields are handled by the reader rather than by open().
    with open(filepath, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header; an empty file yields no rows

        for count, row in enumerate(reader):
            if max_rows is not None and count >= max_rows:
                break
            instance = Bid(row[1],row[2],row[3],row[6],row[8],row[9],row[10],row[15],row[16],row[17],
                           row[18],row[19],row[21],row[22],row[24],row[25])
            data[row[24]].append((instance, int(row[0])))

    return data

def load_data2(filepath, max_rows=None):
    """Read the bid log at `filepath` into feature dicts and labels per advertiser.

    Each data row becomes a dict with the keys 'weekday', 'hour', 'useragent',
    'region', 'city', 'adexchange', 'slotwidth', 'slotheight',
    'slotvisibility', 'slotformat', 'slotprice', 'bidprice', 'payprice' and
    'advertiser' (the slot size and the three price fields are converted with
    int()), plus one "tag <t>": True entry for every comma-separated value in
    column 25.

    Parameters
    ----------
    filepath : str
        Path to a comma-separated log whose first line is a header.
    max_rows : int or None, optional
        Maximum number of data rows to read. The default None reads the
        whole file.

    Returns
    -------
    (data, labels) : tuple of collections.defaultdict(list)
        Both are keyed by the advertiser id (column 24). data[adv][k] is the
        feature dict of that advertiser's k-th row and labels[adv][k] is
        int(column 0) of the same row.
    """
    data = defaultdict(list)
    labels = defaultdict(list)

    # newline='' is what the csv module expects, so that line endings inside
    # quoted fields are handled by the reader rather than by open().
    with open(filepath, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header; an empty file yields no rows

        for count, row in enumerate(reader):
            if max_rows is not None and count >= max_rows:
                break

            instance = {'weekday':row[1],'hour':row[2],'useragent':row[6],'region':row[8],
                        'city':row[9],'adexchange':row[10],'slotwidth':int(row[15]),'slotheight':int(row[16]),
                        'slotvisibility':row[17],'slotformat':row[18],'slotprice':int(row[19]),
                        'bidprice':int(row[21]),'payprice':int(row[22]),'advertiser':row[24]}

            # One boolean feature per user tag.
            for tag in row[25].split(','):
                instance["tag " + tag] = True

            advertiser = row[24]
            data[advertiser].append(instance)
            labels[advertiser].append(int(row[0]))

    return data,labels

In [3]:
# Locations of the training and validation logs.
# NOTE(review): these are absolute, machine-specific paths; adjust them when
# running the notebook elsewhere.
training_path, validation_path = (
    r"/home/jovyan/work/Desktop/Desktop/UCL/WebEcon/dataset/train.csv",
    r"/home/jovyan/work/Desktop/Desktop/UCL/WebEcon/dataset/validation.csv",
)

# Per-advertiser feature dicts and labels parsed from the training log.
training_events, labels = load_data2(training_path)

In [33]:
# Peek at one parsed training event. The loaded data lives in
# `training_events`; no `training_data` variable is defined in this notebook.
print(training_events['3358'][0]['useragent'])

windows_ie


In [4]:
# converts labels into integers, and vice versa, needed by scikit-learn.
#label_encoder = LabelEncoder()
# encodes feature dictionaries as numpy vectors, needed by scikit-learn.
#vectorizer = DictVectorizer()

def predict_event_labels(instance, advertiser, models:dict):
    """Predict the label of one event with the given advertiser's model.

    `models[advertiser]` is indexed as (classifier, label encoder,
    vectorizer). The event dict is wrapped in a one-element list, vectorized,
    classified, and the prediction is mapped back through the label encoder.

    Returns whatever the encoder's inverse_transform returns for that
    single-event batch.
    """
    bundle = models[advertiser]
    classifier, encoder, dict_vectorizer = bundle[0], bundle[1], bundle[2]

    features = dict_vectorizer.transform([instance])
    encoded_prediction = classifier.predict(features)
    return encoder.inverse_transform(encoded_prediction)

In [8]:
def train(data, labels, C=0.13):
    """Fit one logistic-regression click model per advertiser.

    Parameters
    ----------
    data : dict
        Maps an advertiser id to a list of feature dicts; every dict must
        contain a numeric 'payprice'.
    labels : dict
        Maps the same advertiser ids to the list of labels of those events.
    C : float, optional
        Value passed to LogisticRegression(C=...). Defaults to 0.13, the
        value that used to be hard-coded.

    Returns
    -------
    (models, bidprices) : tuple of dict
        models[adv] is (fitted LogisticRegression, fitted LabelEncoder,
        fitted DictVectorizer); bidprices[adv] is the mean 'payprice' of the
        advertiser's events, used as its constant bid.

    Advertisers with no events are skipped in both dicts. Such empty entries
    appear, for example, when a defaultdict is indexed with an unknown key.

    NOTE(review): fitting presumably still fails for an advertiser whose
    labels contain a single class - confirm against the scikit-learn docs.
    """
    bidprices = {}
    models = {}
    for key, current_data in data.items():
        if not current_data:
            # Nothing to average or fit on; avoids a ZeroDivisionError below.
            continue

        # Constant price to bid: the advertiser's average pay price.
        bidprices[key] = sum(item['payprice'] for item in current_data) / len(current_data)

        label_encoder = LabelEncoder()
        vectorizer = DictVectorizer()

        train_event_x = vectorizer.fit_transform(current_data)
        train_event_y = label_encoder.fit_transform(labels[key])

        # Create and train the model.
        lr = LogisticRegression(C=C)
        lr.fit(train_event_x, train_event_y)
        models[key] = (lr,label_encoder,vectorizer)

    return models,bidprices

def process_event(row):
    """Turn one bid-log CSV row (a list of strings) into a feature dict.

    Columns 1, 2, 6, 8, 9, 10, 17, 18 and 24 are copied as strings; columns
    15, 16, 19, 21 and 22 (slot width/height and the three price fields) are
    converted with int(). Every comma-separated value of column 25 adds a
    "tag <value>": True entry.
    """
    event = dict(
        weekday=row[1],
        hour=row[2],
        useragent=row[6],
        region=row[8],
        city=row[9],
        adexchange=row[10],
        slotwidth=int(row[15]),
        slotheight=int(row[16]),
        slotvisibility=row[17],
        slotformat=row[18],
        slotprice=int(row[19]),
        bidprice=int(row[21]),
        payprice=int(row[22]),
        advertiser=row[24],
    )

    # One boolean flag per user tag, appended after the fixed fields.
    tag_flags = dict.fromkeys(("tag " + tag for tag in row[25].split(',')), True)
    event.update(tag_flags)

    return event

In [10]:
def RTB_simulation(mode, filepath, models, param=None, budget=25000):
    """Replay a bid log and report the click-through rate of the impressions won.

    Parameters
    ----------
    mode : str
        'constant': for each row, bid param[advertiser], but only when the
        advertiser's model labels the event as a click.
        'random': bid random.randrange(param) on every row.
        Any other value processes no rows.
    filepath : str
        Comma-separated log with a header line. Column 0 is the click flag
        ("1" counts as a click), column 22 the pay price and column 24 the
        advertiser id.
    models : dict
        advertiser id -> (classifier, label encoder, vectorizer), as consumed
        by predict_event_labels. Only used in 'constant' mode.
    param : dict or int, optional
        'constant' mode: dict with the bid price per advertiser.
        'random' mode: exclusive upper bound of the random bid.
        When None, no rows are processed.
    budget : int or float, optional
        Starting budget; defaults to 25000, the value that used to be
        hard-coded.

    Returns
    -------
    float or int
        (clicks / impressions) * 100, or -1 when no impression was won.

    Impression and click counts are printed before returning.

    NOTE(review): a won impression is charged the bid, not the logged pay
    price - confirm this matches the intended auction rule.
    """
    impressions = 0
    clicks = 0

    if param is not None and mode in ('constant', 'random'):
        # newline='' is what the csv module expects when reading files.
        with open(filepath, 'r', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader, None)  # skip the header; an empty file yields no rows

            for row in reader:
                if mode == 'constant':
                    advertiser = row[24]
                    if advertiser not in param or advertiser not in models:
                        # No bid price or model for this advertiser (not seen
                        # during training): it cannot take part in the auction.
                        continue
                    current_bid = param[advertiser]
                    if budget <= current_bid:
                        continue
                    instance = process_event(row)
                    # A single event goes in, so a single label comes out.
                    predicted_label = predict_event_labels(instance, advertiser, models)[0]
                    if not (predicted_label > 0.5):
                        continue
                    payprice = instance['payprice']
                else:
                    current_bid = random.randrange(param)
                    if budget <= current_bid:
                        continue
                    payprice = int(row[22])

                # The auction is won only when the bid exceeds the logged pay price.
                if current_bid > payprice:
                    impressions += 1
                    # Subtract the bid; the old `budget =- current_bid` in
                    # random mode overwrote the budget with the negated bid.
                    budget -= current_bid
                    if row[0] == "1":
                        clicks += 1

    print("Impressions:{0}".format(impressions))
    print("Clicks:{0}".format(clicks))
    if impressions > 0:
        return (clicks/impressions)*100
    else:
        return -1

In [7]:
# Fit one click model per advertiser and derive each advertiser's constant bid.
models, bidprices = train(training_events, labels)
print('Training done')

Trainin done


In [11]:
# Replay the validation log with the per-advertiser constant bids and report
# the resulting click-through rate (RTB_simulation returns -1 when nothing was won).
print("Simulation")
CTR = RTB_simulation("constant", validation_path, models, param=bidprices)
print("CTR:{0}%".format(CTR))

Simulation
Impressions:4
Clicks:2
CTR:50.0%
