In [0]:
# Authenticate the Colab user for this runtime.
# NOTE(review): presumably required so the gcloud/gsutil commands in later
# cells can reach the project bucket — confirm.
from google.colab import auth
auth.authenticate_user()

In [2]:
# List the working directory (including hidden entries) to see which data
# files are already present on the runtime.
! ls -al

total 4688696
drwxr-xr-x 1 root root       4096 Apr 25 21:36 .
drwxr-xr-x 1 root root       4096 Apr 25 21:15 ..
drwx------ 4 root root       4096 Apr 25 21:16 .cache
-rw-r--r-- 1 root root 1486734654 Oct  3  2016 clicks_train.csv
-rw-r--r-- 1 root root  408685316 Apr 25 21:17 clicks_train.csv.zip
drwxr-xr-x 3 root root       4096 Apr 25 21:16 .config
drwxr-xr-x 3 root root       4096 Apr 25 21:37 datalab
-rw-r--r-- 1 root root  118017029 Oct  3  2016 documents_categories.csv
-rw-r--r-- 1 root root   33912256 Apr 25 21:17 documents_categories.csv.zip
-rw-r--r-- 1 root root  324096832 Oct  3  2016 documents_entities.csv
-rw-r--r-- 1 root root  131770515 Apr 25 21:17 documents_entities.csv.zip
-rw-r--r-- 1 root root   89380566 Oct  3  2016 documents_meta.csv
-rw-r--r-- 1 root root   16268071 Apr 25 21:17 documents_meta.csv.zip
-rw-r--r-- 1 root root  339473038 Oct  3  2016 documents_topics.csv
-rw-r--r-- 1 root root  126782416 Apr 25 21:17 documents_topics.csv.zip
-rw-r--

In [3]:
# Google Cloud project that the gcloud/gsutil commands should operate on.
project_id = 'leftover-199123'
# `{project_id}` is interpolated by IPython before the shell command runs.
!gcloud config set project {project_id}

Updated property [core/project].


In [3]:
# Copy the two input artifacts from the bucket into the working directory:
# the pickle read by loadAdWeightsAndIds() and the events CSV read into eventsDf.
! gsutil cp gs://advml-bucket/ads.pickle .
! gsutil cp gs://advml-bucket/filtered_events.csv .



Updates are available for some Cloud SDK components.  To install them,
please run:
  $ gcloud components update

Copying gs://advml-bucket/ads.pickle...
/ [1 files][ 19.2 MiB/ 19.2 MiB]                                                
Operation completed over 1 objects/19.2 MiB.                                     
Copying gs://advml-bucket/filtered_events.csv...
- [1 files][102.3 MiB/102.3 MiB]                                                
Operation completed over 1 objects/102.3 MiB.                                    


In [4]:
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
import keras
import scipy

Using TensorFlow backend.


In [0]:
def getAdIdConverterFunction(adIds):
    """Build lookup helpers that translate ad ids into their position in ``adIds``.

    Parameters
    ----------
    adIds : iterable of hashable
        Ad identifiers. The position of an identifier in this iterable is its
        ``ad_inx``. If an identifier occurs more than once, the last position
        wins (plain dict semantics).

    Returns
    -------
    tuple ``(adIdsRev, convertToAdInx)``
        ``adIdsRev`` is a dict mapping ad_id -> ad_inx.
        ``convertToAdInx`` is a vectorized callable that maps a scalar or
        array of ad ids to an integer array of ad_inx values with the same
        shape. It raises ``KeyError`` for an id that is not in ``adIds``.
    """
    # adIdsRev is a dict mapping from ad_id to ad_inx (i.e. the inx of such ad in adIds)
    adIdsRev = {adId: inx for inx, adId in enumerate(adIds)}
    # Vectorized function to convert ad_id into ad_inx.
    # The output type is fixed up front: without `otypes`, np.vectorize has to
    # call the function once on the first element just to infer a dtype, and
    # it raises on empty input because there is nothing to infer from.
    convertToAdInx = np.vectorize(lambda adId: adIdsRev[adId], otypes=[int])
    return (adIdsRev,convertToAdInx)

def loadAdWeightsAndIds(path='ads.pickle'):
    """Load the pickled ad-weights/ad-ids object from disk.

    Parameters
    ----------
    path : str, optional
        Location of the pickle file. Defaults to ``'ads.pickle'`` in the
        current working directory, which is where the notebook downloads it.

    Returns
    -------
    object
        Whatever object was pickled. The caller in this notebook unpacks it
        as ``(sparseAdWeights, adIds)``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only use this on pickles produced by a trusted source.
    with open(path,'rb') as f:
        return pickle.load(f)

# Ad weight matrix and the list of ad ids, as stored in the pickle.
# NOTE(review): sparseAdWeights is later used via .shape and .toarray(), so it
# is presumably a scipy sparse matrix with one row per entry of adIds — confirm.
sparseAdWeights,adIds = loadAdWeightsAndIds()
# ad_id -> ad_inx dict, plus the vectorized form of the same lookup.
adIdsRev,convertToAdInx = getAdIdConverterFunction(adIds)

# Event log; the columns used later are user_inx, ad_inx and clicked.
eventsDf = pd.read_csv('filtered_events.csv')

# Distinct user ids; the count sizes the user embedding in createProbitModel().
uniqUser = np.unique(eventsDf.uuid)

In [13]:
# Inspect which columns the events frame provides.
eventsDf.columns
# uniqUser

Index(['display_id', 'uuid', 'ad_id', 'clicked', 'ad_inx', 'user_inx'], dtype='object')

In [7]:
from keras.layers import *


def probit_activation(x):
    """Probit link: evaluate the standard normal CDF at ``x``.

    ``x`` is passed straight to the ``cdf`` method of a TensorFlow
    ``Normal(loc=0, scale=1)`` distribution and the result is returned.
    """
    standardNormal = tf.distributions.Normal(loc=0., scale=1.)
    return standardNormal.cdf(x)

def createProbitModel():
    """Assemble and compile the click model: probit(<ad vector, user vector>).

    Reads two module-level names:
      * ``sparseAdWeights`` -- supplies the frozen ad embedding table (its
        dense form is used as the initial, non-trainable weights);
      * ``uniqUser`` -- its length is the number of rows of the trainable
        user embedding table.

    Returns
    -------
    keras Model
        Takes ``(user_inx, ad_inx)`` as inputs, outputs one value per sample,
        and is compiled with MSE loss and the Adam optimizer.
    """
    numAds, numFeatures = sparseAdWeights.shape
    numUsers = uniqUser.shape[0]

    # NOTE(review): layers are created in the same order as before so Keras'
    # auto-generated layer names stay the same — keep it stable if saved
    # checkpoints depend on it (confirm).
    userInxInput = Input(shape=(1,))
    adInxInput = Input(shape=(1,))

    # Frozen lookup table: row i holds the weight vector of ad i.
    adEmbedding = Embedding(
        numAds,
        numFeatures,
        input_length=1,
        trainable=False,
        weights=[sparseAdWeights.toarray()],
    )
    adVector = adEmbedding(adInxInput)

    # Trainable per-user vector with the same width as the ad vectors.
    userEmbedding = Embedding(numUsers, numFeatures, input_length=1)
    userVector = userEmbedding(userInxInput)

    # Inner product over the feature axis, flattened to one score per sample.
    dotProduct = Dot(-1)([adVector, userVector])
    flatScore = Flatten()(dotProduct)

    # Squash the score through the probit activation.
    clickProb = Activation(probit_activation)(flatScore)

    probitModel = keras.models.Model(
        inputs=(userInxInput, adInxInput),
        outputs=clickProb,
    )
    probitModel.compile(loss='mse', optimizer='adam') ## Maybe another optimizer?

    return probitModel

# Build the compiled probit click model; `model` is reused by the training,
# weight-loading and SubModDiv cells.
model = createProbitModel()

# Print the layer table to check shapes and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 96)        53719968    input_2[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 96)        9205824     input_1[0][0]                    
__________________________________________________________________________________________________
dot_1 (Dot

In [11]:
from keras.callbacks import *

# File that ModelCheckpoint writes the best weights to.
weights_filename = 'probit.h5'

# Fit on the whole events frame: inputs are the (user_inx, ad_inx) columns and
# the target is the `clicked` column.
# NOTE(review): no validation data is passed, so EarlyStopping and
# ModelCheckpoint both monitor the *training* loss. That means "best" is the
# lowest training loss, not a held-out score — confirm this is intended.
model.fit(
    [eventsDf.user_inx,eventsDf.ad_inx],
    eventsDf.clicked,
    epochs = 60,
    shuffle=True,
    batch_size=2000,
    callbacks=[
        # Stop once the monitored loss has not improved for 2 epochs.
        EarlyStopping(monitor='loss', patience=2),
        # Keep only the weights (not the full model) of the best epoch so far.
        ModelCheckpoint(weights_filename, monitor='loss', save_best_only=True, save_weights_only=True),
    ]
)

Epoch 1/30
Epoch 2/30

Epoch 3/30
Epoch 4/30

Epoch 5/30
Epoch 6/30
 340000/2405074 [===>..........................] - ETA: 19s - loss: 0.1561

Epoch 7/30
Epoch 8/30
 208000/2405074 [=>............................] - ETA: 21s - loss: 0.1330

Epoch 9/30
Epoch 10/30
 178000/2405074 [=>............................] - ETA: 21s - loss: 0.1159

Epoch 11/30
Epoch 12/30
 152000/2405074 [>.............................] - ETA: 22s - loss: 0.1034

Epoch 13/30
Epoch 14/30
 146000/2405074 [>.............................] - ETA: 22s - loss: 0.0922

Epoch 15/30
Epoch 16/30
 152000/2405074 [>.............................] - ETA: 22s - loss: 0.0840

Epoch 17/30
Epoch 18/30
 166000/2405074 [=>............................] - ETA: 21s - loss: 0.0775

Epoch 19/30
Epoch 20/30
 170000/2405074 [=>............................] - ETA: 21s - loss: 0.0719

Epoch 21/30
Epoch 22/30
 180000/2405074 [=>............................] - ETA: 21s - loss: 0.0672

Epoch 23/30
Epoch 24/30
 170000/2405074 [=>............................] - ETA: 21s - loss: 0.0634

Epoch 25/30
Epoch 26/30
 178000/2405074 [=>............................] - ETA: 21s - loss: 0.0605

Epoch 27/30
Epoch 28/30
 166000/2405074 [=>............................] - ETA: 21s - loss: 0.0578

Epoch 29/30
Epoch 30/30
 170000/2405074 [=>............................] - ETA: 21s - loss: 0.0552



<keras.callbacks.History at 0x7f9556b5dcf8>

In [16]:
# Upload the checkpointed weights to the bucket so they can be fetched again later.
! gsutil cp probit.h5 gs://advml-bucket/

Copying file://probit.h5 [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/240.1 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

-
Operation completed over 1 objects/240.1 MiB.                                    


In [8]:
# Fetch the previously uploaded weights from the bucket ...
! gsutil cp gs://advml-bucket/probit.h5 .
    
# ... and load them into the already-built model (the file holds weights only,
# so `model` must have been created by createProbitModel() first).
model.load_weights('probit.h5')

Copying gs://advml-bucket/probit.h5...
\ [1 files][240.1 MiB/240.1 MiB]                                                
Operation completed over 1 objects/240.1 MiB.                                    


## Simulation

In [15]:



class SubModDiv():
    """Greedy ad-subset selector combining click probability and ad weights.

    For a given user the selector repeatedly picks the ad that maximizes

        sum_j w_j * log(a_ij + 1 + sum of a_kj over already chosen ads k)
        + (click probability of ad i + click probabilities of chosen ads)

    where ``w = (c + alpha) / (v + alpha + beta)``.

    Attributes
    ----------
    regrModel : object
        Click model; must offer ``predict([userInxArray, adInxArray],
        batch_size=...)``.
    wAds : ndarray, shape (numAds, numFeatures)
        Dense ad weight matrix ("a_i" in the paper).
    c, v : ndarray, shape (numFeatures,)
        Accumulated weights of clicked ads (``c``) and of shown ads (``v``).
    a, b : float
        The ``alpha`` / ``beta`` constants used by ``getW``.
    device : str
        TensorFlow device string used when building and running the graph.
    """

    def __init__(self,wAds,regrModel,alpha=1.0,beta=1.0,device='/gpu:0'):
        """Store the model and ad weights and build the TensorFlow scoring graph.

        Parameters
        ----------
        wAds : sparse matrix
            Must provide ``.toarray()`` and ``.shape``; one row per ad.
        regrModel : object
            Click model used by ``getSubSet`` (see class docstring).
        alpha, beta : float, optional
            Constants of the weight formula in ``getW``.
        device : str, optional
            TensorFlow device for the scoring graph. Defaults to ``'/gpu:0'``
            (the previously hard-coded value); pass e.g. ``'/cpu:0'`` on a
            runtime without a GPU.
        """
        self.regrModel = regrModel

        self.a = alpha
        self.b = beta
        self.device = device
        self.wAds = wAds.toarray() # "a_i" in paper
        self.c = np.zeros(wAds.shape[1]) # "c" in section 4.1 of paper
        self.v = np.zeros(wAds.shape[1]) # "v" in section 4.1 of paper

        with tf.device(self.device):
            self.initTensorflowOp()

    def initTensorflowOp(self):
        """Build the TensorFlow ops that pick the next ad (one greedy step).

        Creates four placeholders (chosen ad indices, click probabilities, ad
        weight matrix, weight vector ``w``) and an argmax op over the per-ad
        objective, and stores them on ``self`` as ``t_prevAdInx``,
        ``t_probs``, ``t_wAds``, ``t_w`` and ``t_maxInx``.
        """
        ## Tensorflow code for subsetiteration
        # `(None)` is just None, i.e. the index placeholder has no fixed shape.
        t_prevAdInx = tf.placeholder(tf.int32,shape=(None))
        t_probs = tf.placeholder(tf.float32,shape=(self.wAds.shape[0]))
        t_wAds = tf.placeholder(tf.float32,shape=self.wAds.shape)
        t_w = tf.placeholder(tf.float32,shape=(self.wAds.shape[1]))

        # 1 + per-feature sum of the weights of the already chosen ads.
        t_prevAdSum = tf.constant(1.0)+tf.reduce_sum(tf.gather(t_wAds,t_prevAdInx),axis=0)

        # Total click probability of the already chosen ads.
        t_prevProbSum = tf.reduce_sum(tf.gather(t_probs,t_prevAdInx),axis=0)

        # Log feature mass each candidate ad would give if added to the set.
        t_newAs = tf.log(t_wAds + t_prevAdSum)

        # Weighted sum over features: one value per candidate ad.
        t_dotProds = tf.reduce_sum(t_w * t_newAs,axis=1)

        t_scores = t_prevProbSum + t_probs

        # 1.0 at every already chosen index, 0.0 elsewhere; the empty case is
        # handled explicitly with an all-zero mask.
        t_prevAdMask = tf.cond(
            tf.equal( tf.shape(t_prevAdInx)[0], tf.constant(0,dtype=tf.int32) ),
            lambda: tf.zeros(self.wAds.shape[0]),
            lambda: tf.reduce_sum(tf.one_hot(t_prevAdInx,tf.constant(self.wAds.shape[0])),axis=0)
        )

        # A large penalty (instead of -inf) keeps chosen ads from being re-picked.
        t_rho = t_dotProds + t_scores - ( t_prevAdMask * tf.constant(1e5))

        t_maxInx = tf.argmax(t_rho)

        self.t_prevAdInx = t_prevAdInx
        self.t_probs = t_probs
        self.t_wAds = t_wAds
        self.t_w = t_w

        self.t_maxInx = t_maxInx

    def getW(self):
        """Return the current weight vector ``(c + alpha) / (v + alpha + beta)``."""
        return (self.c + self.a)/(self.v + self.a + self.b)

    def resetW(self):
        """Zero the click (``c``) and exposure (``v``) accumulators in place."""
        self.c[:] = 0
        self.v[:] = 0

    def subSetIteration(self,probs,prevAdInx):
        """NumPy version of one greedy step: index of the best ad to add next.

        Parameters
        ----------
        probs : ndarray, shape (numAds,)
            Click probability of every ad for the current user.
        prevAdInx : integer ndarray
            Indices of the ads chosen so far (may be empty).

        Returns
        -------
        int-like
            Index of the ad with the highest objective among ads not in
            ``prevAdInx`` (those are scored ``-inf``).
        """
        w = self.getW()

        prevAdSum = 1+self.wAds[prevAdInx].sum(axis=0)
        prevProbSum = probs[prevAdInx].sum()

        newAs = np.log(self.wAds + prevAdSum)

        dotProds = (w * newAs).sum(axis=1)

        scores = prevProbSum + probs

        rho = dotProds + scores
        rho[prevAdInx] = -np.inf

        maxInx = np.argmax(rho)

        return maxInx

    def getSubSet(self,userInx,n=6,useTensorflow=True):
        """Greedily choose ``n`` ads for a user and record them as shown.

        Parameters
        ----------
        userInx : int
            Index of the user, as understood by ``regrModel``.
        n : int, optional
            Number of ads to select.
        useTensorflow : bool, optional
            When True (default) each greedy step runs the TensorFlow graph
            built in ``initTensorflowOp``. When False the NumPy implementation
            ``subSetIteration`` is used instead, which needs no TensorFlow
            session. The two paths differ in float precision and in how chosen
            ads are excluded (1e5 penalty vs. -inf).

        Returns
        -------
        ndarray of int
            Indices of the selected ads, in selection order.

        Side effects
        ------------
        Adds the summed weights of the selected ads to ``self.v``.
        """
        numAds = self.wAds.shape[0]
        # Click probability of every ad for this one user.
        probs = self.regrModel.predict([
            np.full(numAds,userInx),
            np.arange(numAds)
        ],batch_size=50000).ravel()

        # `np.int` was only an alias of the builtin `int` and has been removed
        # from NumPy; the builtin gives the same dtype.
        currAdSet = np.empty(0,dtype=int)

        if useTensorflow:
            # w depends only on c and v, which do not change while selecting,
            # so compute it once rather than on every step.
            w = self.getW()
            with tf.device(self.device):
                with tf.Session() as sess:
                    while len(currAdSet) < n:
                        newAd = sess.run(self.t_maxInx,feed_dict={
                            self.t_prevAdInx: currAdSet,
                            self.t_probs: probs,
                            self.t_wAds: self.wAds,
                            self.t_w: w
                        })
                        currAdSet = np.append(currAdSet,newAd)
        else:
            while len(currAdSet) < n:
                newAd = self.subSetIteration(probs,currAdSet)
                currAdSet = np.append(currAdSet,newAd)

        # Update v
        self.v += self.wAds[currAdSet].sum(axis=0)

        return currAdSet

    def registerClick(self,adInx):
        """Record a click on ad ``adInx`` by adding its weights to ``self.c``."""
        self.c += self.wAds[adInx]
        
# Build the selector from the ad weight matrix and the trained click model
# (alpha and beta keep their defaults of 1.0).
smd = SubModDiv(sparseAdWeights,model)

import time

# Time a single recommendation of the default 6 ads for user index 1.
# Note that getSubSet also adds the chosen ads' weights to smd.v.
t = time.time()
print(smd.getSubSet(1))
print(time.time()-t)
    

[ 25614    554 143927  97239    714 446097]
1.8968541622161865


In [20]:
# print(smd.getSubSet(4))
# print(smd.getSubSet(40))
# print(smd.getSubSet(420))

# Show the current weight vector w = (c + alpha) / (v + alpha + beta).
# Entries below 0.5 are features for which v is non-zero, i.e. features
# carried by ads that getSubSet has already returned.
smd.getW()

array([0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.33333333, 0.5       , 0.33333333, 0.5       , 0.33333333,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.5       , 0.33333333, 0.5       ,
       0.5       , 0.33333333, 0.5       , 0.5       , 0.5       ,
       0.5       , 0.5       , 0.33333333, 0.5       , 0.5       ,
       0.5       , 0.33333333, 0.5       , 0.5       , 0.33333333,
       0.5       , 0.25      , 0.33333333, 0.5       , 0.5       ,
       0.25      , 0.5       , 0.5       , 0.5       , 0.33333333,
       0.5       , 0.33333333, 0.5       , 0.5       , 0.33333333,
       0.33333333, 0.5       , 0.5       , 0.2       , 0.5       ,
       0.5       , 0.5       , 0.16666667, 0.2       , 0.5       ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
       0.33333333, 0.33333333, 0.5       , 0.5       , 0.5       ,
       0.33333333, 0.25      , 0.5       , 0.5       , 0.33333