In [1]:
import gzip
import gc
import math
import json
from collections import Counter, defaultdict
import random
from tqdm.notebook import tqdm
import numpy as np
import string

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

def jl_to_list(fname):
    output = []
    with gzip.open(fname, "rb") as f:
        for line in f:
            output.append(json.loads(line))
    return output

## Load user data

In [2]:
samples = None
test_size = .2
rows = jl_to_list("data/train_dataset.jl.gz")
if samples:
    rows = rows[:samples]

## Split

In [3]:
val_size = .15
rows_train, rows_test = train_test_split(rows, test_size = test_size, random_state = 42)
print(f"Train data: {len(rows_train)}.")
print(f"Test data: {len(rows_test)}.")

Train data: 330530.
Test data: 82633.


## Load items

In [4]:
item_data = jl_to_list("data/item_data.jl.gz")
metadata = {x["item_id"] : x for x in item_data}
all_items = list(metadata.keys())

## Get queries and items bought

In [5]:
def get_query_item(row):
    queries = [ev["event_info"] for ev in row["user_history"]
               if ev["event_type"] == "search"]
    item_bought = row["item_bought"]
    return queries, item_bought

In [6]:
queries = []
items = []
for row in tqdm(rows_train):
    q, i = get_query_item(row)
    if q != []:
        queries.append(q)
        items.append(i)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=330530.0), HTML(value='')))




## Get word vocabulary

In [7]:
def get_words(queries, length_t = 4):
    words = []
    for q in queries:
        words_q = list(q.lower().split())
        words += words_q
    return words

In [8]:
word_counter = Counter()
for query_list in tqdm(queries):
    words = get_words(query_list)
    for w in list(set(words)):
        word_counter[w] += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=272629.0), HTML(value='')))




In [9]:
word_counter_sorted = word_counter.most_common()
word_rank = {w[0] : i for i, w in enumerate(word_counter_sorted, start = 1)}
inv_word = {v : k for k, v in word_rank.items()}

## Delete low frequency words

In [10]:
def drop_words(counter_sorted, thresh = .99):
    i = 0
    buf = 0
    total = sum([w[1] for w in counter_sorted])
    while buf < thresh*total:
        buf += counter_sorted[i][1]
        i += 1
    return i

In [11]:
vocab_word = drop_words(word_counter_sorted);
print("vocab size: ", vocab_word)

vocab size:  81500


## Get domain vocabulary

In [12]:
dom_counter = Counter()
for item in item_data:
    dom = item["domain_id"]
    dom_counter[dom] += 1
    
print("total domains: ", len(dom_counter.keys()))

total domains:  7894


In [13]:
dom_counter_sorted = dom_counter.most_common()
dom_rank = {w[0] : i for i, w in enumerate(dom_counter_sorted, start = 1)}
inv_dom = {v : k for k, v in dom_rank.items()}

## Delete low frequency domains

In [14]:
vocab_dom = drop_words(dom_counter_sorted);
print("vocab size: ", vocab_dom)

vocab size:  4802


## Generate samples

In [18]:
def sample(query_list, item_bought, num_words = 4, num_ns = 4):
    sample_words = []
    for q in query_list:
        words_q = list(q.lower().split())[:num_words]
        words_int = []
        for w in words_q:
            w_int = word_rank[w] if word_rank[w] < vocab_word else 0
            words_int.append(w_int)
        if len(words_int) < num_words:
            words_int += [0]*(num_words - len(words_int))
        if words_int not in sample_words:
            sample_words.append(words_int)
    
    # positive domain
    dom = dom_rank[metadata[item_bought]["domain_id"]]
    if dom > vocab_dom: dom = 0
    sample_doms = []
    ys = []
    negatives = np.arange(vocab_dom + 1).tolist()
    negatives.remove(dom)
    for i in range(len(sample_words)):
        doms = [dom]
        # negative domains
        negative_doms = np.random.choice(negatives, size = num_ns, replace = False)
        doms += negative_doms.tolist()
        sample_doms.append(doms)
    
        # y (output)
        y = [1] + [0]*num_ns
        ys.append(y)
    
    return sample_words, sample_doms, ys

In [19]:
s, d, y = sample(queries[0], 77)
print(s)
print(d)
print(y)

[[673, 602, 47, 655], [47, 641, 0, 0], [47, 3541, 12, 3681]]
[[2577, 801, 2230, 2528, 4266], [2577, 3431, 3780, 1712, 1229], [2577, 4471, 4564, 1423, 4581]]
[[1, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 0, 0, 0, 0]]


In [20]:
ws = []
ds = []
ys = []
for q, i in tqdm(zip(queries, items)):
    w, d, y = sample(q, i)
    ws += w
    ds += d
    ys += y

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [21]:
assert len(ws) == len(ds) == len(ys)

In [None]:
word_in = np.array(ws)
dom_in = np.array(ds)
y = np.array(ys)

print("word_in: ", word_in.shape)
print("dom_in: ", dom_in.shape)
print("y: ", y.shape)

## Build model

In [29]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dot
from tensorflow.keras.layers import Average
from tensorflow.keras.layers import Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

In [88]:
emb_dim = 32
num_ns = 4
# inputs
w_in = Input((word_in.shape[1], ))
d_in = Input((dom_in.shape[1], ))

# embeddings
w_emb = Embedding(input_dim = vocab_word + 1, output_dim = emb_dim, input_length= 4)(w_in)
d_emb = Embedding(input_dim = vocab_dom + 1, output_dim = emb_dim, input_length = num_ns + 1)(d_in)

# average word embeddings
w_avg = Average()([w_emb[:,i,:] for i in range(w_emb.shape[1])])
w_avg = Reshape((emb_dim, 1))(w_avg)

# dot
dot = Dot(axes = (2,1), normalize = True)([d_emb, w_avg])
output = Flatten()(dot)
#dot = Reshape((1,))(dot)

# sigmoid output
#output = Activation("sigmoid")(dot)

In [89]:
# create the primary training model
model = Model(inputs = [w_in, d_in], outputs = output)
model.compile(loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
              optimizer = 'rmsprop',
              metrics = ["accuracy"])

model.summary()

Model: "functional_20"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_27 (InputLayer)           [(None, 4)]          0                                            
__________________________________________________________________________________________________
embedding_24 (Embedding)        (None, 4, 32)        2608032     input_27[0][0]                   
__________________________________________________________________________________________________
tf_op_layer_strided_slice_44 (T [(None, 32)]         0           embedding_24[0][0]               
__________________________________________________________________________________________________
tf_op_layer_strided_slice_45 (T [(None, 32)]         0           embedding_24[0][0]               
______________________________________________________________________________________

## Train model

In [None]:
epochs = 5
bs = 512
model.fit(
    x = [word_in, dom_in], 
    y = y,
    epochs = epochs, 
    batch_size = bs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

In [73]:
indx = 0
print(word_in[indx])
print(y[indx])
print(model.predict([word_in[indx][np.newaxis], dom_in[indx][np.newaxis]]))

[673 602  47 655]
[1 0 0 0 0]
[[ 0.99628973 -0.05130254 -0.9883058   0.99766225 -0.98217404]]


'MLB-DECORATIVE_PAINTINGS'

In [75]:
word_emb = model.get_layer('embedding_16').get_weights()[0]
dom_emb = model.get_layer('embedding_17').get_weights()[0]

In [76]:
dom_emb_norm = np.zeros(dom_emb.shape)
for i, d in enumerate(dom_emb):
    dom_emb_norm[i] = d/np.linalg.norm(d)

In [77]:
def predict_query(query, num_words = 4, word_dim = 64, k = 10):
    words_q = list(query.lower().split())[:num_words]
    words_int = []
    for w in words_q:
        w_int = word_rank[w] if word_rank[w] < vocab_word else 0
        words_int.append(w_int)
    if len(words_int) < num_words:
        words_int += [0]*(num_words - len(words_int))
    
    word_embeds = np.zeros((num_words, word_dim))
    for i, indx in enumerate(words_int):
        word_embeds[i] = word_emb[indx]
    word_avg = np.mean(word_embeds, axis = 0).reshape((-1, 1))
    #word_avg /= np.linalg.norm(word_avg)
    dots = (dom_emb @ word_avg).flatten()
    top_doms = np.argsort(dots)[::-1][:k]
    return [inv_dom[t] for t in top_doms]

In [78]:
def get_queries_dom(queries):
    doms = Counter()
    for q in queries:
        top_d = predict_query(q, k = 1)
        doms[top_d[0]] += 1
    
    return doms.most_common()[0][0]

In [79]:
suc = 0
for q, i in tqdm(zip(queries, items)):
    true_dom = metadata[i]["domain_id"]
    print(true_dom + " => ", end = "")
    pred_dom = get_queries_dom(q)
    print(pred_dom)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

MLB-SURVEILLANCE_CAMERAS => MLB-VEHICLE_LED_BULBS
MLB-LOAFERS_AND_OXFORDS => MLB-FACIAL_SKIN_CARE_PRODUCTS
MLM-LAPTOP_CHARGERS => MLM-COSTUMES
MLB-THERMOSES => MLB-VEHICLE_LED_BULBS
MLB-CELLPHONES => MLB-VEHICLE_LED_BULBS
MLB-TV_AND_MONITOR_MOUNTS => MLB-VEHICLE_LED_BULBS
MLB-CELLPHONE_BATTERIES => MLB-VEHICLE_LED_BULBS
MLB-CELLPHONES => MLB-VEHICLE_LED_BULBS
MLB-JUMP_ROPES => MLB-SKIN_CARE_SUPPLIES
MLB-CELLPHONE_AND_TABLET_SKINS => MLB-VEHICLE_LED_BULBS
MLB-CAMPING_AND_FISHING_EQUIPMENT => MLB-VEHICLE_LED_BULBS
MLB-AIR_FRESHENERS => MLB-VEHICLE_LED_BULBS
MLB-CELLPHONES => MLB-FACIAL_SKIN_CARE_PRODUCTS
MLB-SPEAKERS => MLB-VEHICLE_LED_BULBS
MLB-SNEAKERS => MLB-VEHICLE_LED_BULBS
MLM-HATS_AND_CAPS => MLB-WALKIE_TALKIES
MLB-BABY_TOOTH_BOXES => MLB-VEHICLE_LED_BULBS
MLB-MICRONEEDLING_CARTRIDGES => MLB-VEHICLE_LED_BULBS
MLB-SNEAKERS => MLB-VEHICLE_LED_BULBS
MLB-BODY_SHAPERS => MLB-VEHICLE_LED_BULBS
MLB-MANNEQUINS => MLB-VEHICLE_LED_BULBS
MLB-SOUVENIRS => MLB-VEHICLE_LED_BULBS
MLB-CELLPHONES 

KeyboardInterrupt: 

In [67]:
train_counter = Counter()
for d in tqdm(dom_in):
    true_d = d[0]
    if true_d != 0: train_counter[inv_dom[true_d]] += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1172164.0), HTML(value='')))




In [68]:
train_counter.most_common()

[('MLB-CELLPHONES', 87103),
 ('MLB-SNEAKERS', 42248),
 ('MLB-HEADPHONES', 27460),
 ('MLB-SUPPLEMENTS', 25399),
 ('MLB-SMARTWATCHES', 24625),
 ('MLM-HEADPHONES', 15205),
 ('MLB-DOLLS', 13544),
 ('MLB-STREAMING_MEDIA_DEVICES', 13370),
 ('MLB-VIDEO_GAMES', 12249),
 ('MLB-T_SHIRTS', 11626),
 ('MLB-BOOTS_AND_BOOTIES', 11381),
 ('MLB-PANTS', 11366),
 ('MLB-SURVEILLANCE_CAMERAS', 10735),
 ('MLB-HARD_DRIVES_AND_SSDS', 9683),
 ('MLB-WALL_AND_CEILING_LIGHTS', 9081),
 ('MLB-CELLPHONE_SCREENS', 8722),
 ('MLB-HAIR_TREATMENTS', 8604),
 ('MLB-SHORTS', 8555),
 ('MLM-CELLPHONE_COVERS', 7750),
 ('MLB-SPEAKERS', 7615),
 ('MLB-SANDALS_AND_FLIP_FLOPS', 7361),
 ('MLM-CELLPHONES', 7231),
 ('MLB-CARD_PAYMENT_TERMINALS', 6997),
 ('MLB-MICROWAVES', 6798),
 ('MLB-MALE_UNDERWEAR', 6690),
 ('MLB-GEL_NAIL_KITS', 6523),
 ('MLB-HAIR_CLIPPERS', 6453),
 ('MLB-WRISTWATCHES', 6379),
 ('MLB-FANS', 6324),
 ('MLB-DRESSES', 6245),
 ('MLB-TELEVISIONS', 5826),
 ('MLB-ACTION_FIGURES', 5755),
 ('MLM-SMARTWATCHES', 5562),
 ('MLB-