In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

[K     |████████████████████████████████| 89 kB 8.4 MB/s 
[K     |████████████████████████████████| 5.2 MB 28.2 MB/s 
[?25h

### Prepare training data
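- Keep only rows whose `query_locale` is `us`. (TODO: also filter out queries that aren't made up mostly of English words; that step is not implemented in this notebook yet.)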

In [None]:
import pandas as pd
import nltk


In [None]:
# Path of the CSV holding the (query, product, esci_label) judgements.
TRAIN_DATA = "./drive/MyDrive/data/train-v0.2.csv"

# Read every row, then keep only those whose query_locale is "us".
train_data = pd.read_csv(TRAIN_DATA).loc[lambda rows: rows["query_locale"] == "us"]
print("length of train data before filtering: ", len(train_data))
train_data.head()

length of train data before filtering:  419730


Unnamed: 0,query_id,query,query_locale,product_id,esci_label
0,0,!awnmower tires without rims,us,B00004RA3F,irrelevant
1,0,!awnmower tires without rims,us,B0018TWDOI,exact
2,0,!awnmower tires without rims,us,B005O5Y3QI,exact
3,0,!awnmower tires without rims,us,B0089RA0HY,substitute
4,0,!awnmower tires without rims,us,B0089RNSNM,exact


In [None]:
# Distinct query strings and distinct product ids among the training rows.
# dropna=False makes a missing value count as one distinct entry.
n_unique_queries = train_data["query"].nunique(dropna=False)
n_unique_products = train_data["product_id"].nunique(dropna=False)
print("unique queries:", n_unique_queries)
print("unique products returned:", n_unique_products)

unique queries: 20888
unique products returned: 352028


In [None]:
# Non-null entry counts per column, grouped by esci_label value.
label_counts = train_data.groupby("esci_label").count()
label_counts

Unnamed: 0_level_0,query_id,query,query_locale,product_id
esci_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
complement,15408,15408,15408,15408
exact,149634,149634,149634,149634
irrelevant,58273,58273,58273,58273
substitute,122687,122687,122687,122687


## Preparing positive interactions

In [None]:
# Path of the product catalogue CSV.
PRODUCT_DATA = "./drive/MyDrive/data/product_catalogue-v0.2.csv"

# Malformed CSV lines are skipped instead of aborting the load.
product_data = pd.read_csv(PRODUCT_DATA, on_bad_lines='skip')

# Keep US-locale products that have a title, then blank out the remaining
# missing values. fillna returns a new frame here, so nothing is mutated in
# place on a filtered slice of the raw frame.
has_title = product_data["product_title"].notnull()
is_us_locale = product_data["product_locale"] == "us"
product_data = product_data[has_title & is_us_locale].fillna("")
print("total products: ", len(product_data["product_id"].drop_duplicates()))
product_data.head()

total products:  482099


Unnamed: 0,product_id,product_title,product_description,product_bullet_point,product_brand,product_color_name,product_locale
0,B0188A3QRM,"Amazon Basics Woodcased #2 Pencils, Unsharpene...",,144 woodcase #2 HB pencils made from high-qual...,Amazon Basics,Yellow,us
1,B075VXJ9VG,"BAZIC Pencil #2 HB Pencils, Latex Free Eraser,...",<p><strong>BACK TO BAZIC</strong></p><p>Our go...,&#11088; UN-SHARPENED #2 PREMIUM PENCILS. Each...,BAZIC Products,12-count,us
2,B07G7F6JZ6,Emraw Pre Sharpened Round Primary Size No 2 Ju...,<p><b>Emraw Pre-Sharpened #2 HB Wood Pencils -...,✓ PACK OF 8 NUMBER 2 PRESHARPENED BEGINNERS PE...,Emraw,Yellow,us
3,B07JZJLHCF,Emraw Pre Sharpened Triangular Primary Size No...,<p><b>Emraw Pre-Sharpened #2 HB Wood Pencils -...,✓ PACK OF 6 NUMBER 2 PRESHARPENED BEGINNERS PE...,Emraw,Yellow,us
4,B07MGKC3DD,"BIC Evolution Cased Pencil, #2 Lead, Gray Barr...",,Premium #2 HB lead pencils with break-resistan...,Design House,Gray,us


In [None]:
# Columns carried forward once judgements are joined with catalogue entries.
features = [
    "query",
    "product_id",
    "product_title",
    "product_description",
    "product_brand",
    "esci_label",
]

# Join each judgement to its catalogue row on (locale, product_id).
# NOTE: this rebinds train_data to a frame without query_locale, so the cell
# cannot be re-run without first re-running the cell that loads train_data.
judgements_with_products = train_data.merge(
    product_data,
    left_on=["query_locale", "product_id"],
    right_on=["product_locale", "product_id"],
)
train_data = judgements_with_products[features]

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
# Every judgement other than "irrelevant" is treated as a positive interaction.
positive_labels = train_data.loc[train_data["esci_label"].ne("irrelevant")]

print("number of positive labels:", len(positive_labels))
print("number of unique queries:", positive_labels["query"].nunique(dropna=False))
print("number of unique products:", positive_labels["product_id"].nunique(dropna=False))

# One dataset element per positive row, keyed by column name.
train_ds = tf.data.Dataset.from_tensor_slices(dict(positive_labels))

# Select the basic features.
BASIC_FEATURES = ("product_id", "product_title", "product_description", "product_brand", "query")
train_ds = train_ds.map(lambda row: {name: row[name] for name in BASIC_FEATURES})
# TODO: add additional product fields
# Title-only and query-only views of the positive interactions.
products = train_ds.map(lambda row: row["product_title"])
queries = train_ds.map(lambda row: row["query"])

number of positive labels: 348535
number of unique queries: 20888
number of unique products: 301436


In [None]:
# Catalogue-side dataset: one element per product_data row, exposing the title
# and description columns (only the title is read from it for now).
catalogue_text = product_data[["product_title", "product_description"]]
product_ds = tf.data.Dataset.from_tensor_slices(dict(catalogue_text))


### Setup Model
- We tokenize the query and use a bag-of-words approach, averaging the embedding vectors of its tokens.
- The product title is tokenized the same way, and a bag-of-words average is used on the product side as well.

In [None]:
from typing import Dict, Text
import tensorflow_recommenders as tfrs

class QueryProductsModel(tfrs.Model):
  """Retrieval model pairing a query encoder with a product encoder.

  Built on tfrs.Model so that only the loss computation has to be written
  here; the two encoders and the retrieval task are supplied by the caller.
  """

  def __init__(
      self,
      query_model: tf.keras.Model,
      products_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    """Stores the two encoders and the retrieval task.

    Args:
      query_model: model applied to the "query" feature.
      products_model: model applied to the "product_title" feature.
      task: retrieval task called with both embeddings to produce the loss.
    """
    super().__init__()

    # Encoders for the two sides of the retrieval problem.
    self.query_model, self.products_model = query_model, products_model

    # Task that turns a pair of embeddings into a loss value.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Embeds the query and the product title, then delegates to the task.

    Args:
      features: batch dict; the "query" and "product_title" entries are read.
      training: accepted as part of the signature; not used in this body.

    Returns:
      Whatever the task returns for the two embeddings.
    """
    query_vectors = self.query_model(features["query"])
    title_vectors = self.products_model(features["product_title"])
    return self.task(query_vectors, title_vectors)

In [None]:
# Upper bound on the vocabulary size; also used as the embedding table size.
MAX_TOKENS = 5000

# Shared text-to-token-id layer: lower-cases, strips punctuation, emits
# integer ids for up-to-5-grams and yields exactly 5 ids per input string.
# NOTE(review): with only 5 output positions, most of the higher-order
# n-grams are presumably truncated away for multi-word inputs - confirm
# this is the intended trade-off.
text_vectorization_layer = tf.keras.layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    standardize='lower_and_strip_punctuation',
    ngrams=5,
    output_mode='int',
    output_sequence_length=5,
)

In [None]:
# adapt() resets any previously learned state each time it is called, so two
# back-to-back adapt() calls would keep only the vocabulary of the second
# dataset (the product titles). Learn the vocabulary from queries and titles
# together in a single call instead.
product_titles = product_ds.map(lambda x: x["product_title"])
text_vectorization_layer.adapt(queries.concatenate(product_titles).batch(64))
len(text_vectorization_layer.get_vocabulary())

5000

In [None]:
import keras.backend as K

# Width of every token embedding and therefore of each tower's output vector.
EMBEDDING_DIM  = 64


def _bag_of_words_tower():
  """Builds one encoder: token ids -> embeddings -> mean over token positions.

  Each call creates a fresh Embedding table; the text vectorization layer is
  the same shared instance in every tower.

  NOTE(review): the mean runs over all output positions of the vectorizer,
  so padding positions appear to be averaged in as well - confirm whether a
  masked average is wanted.
  """
  return tf.keras.Sequential([
      text_vectorization_layer,
      tf.keras.layers.Embedding(MAX_TOKENS, EMBEDDING_DIM),
      tf.keras.layers.Lambda(
          lambda x: tf.reduce_mean(x, axis=1),
          output_shape=(EMBEDDING_DIM,)),
  ])


# Define query and product models.
query_model = _bag_of_words_tower()
products_model = _bag_of_words_tower()

# Retrieval task with its default configuration.
task = tfrs.tasks.Retrieval()

In [None]:
import numpy as np
from datetime import datetime
from tensorflow import keras

import tensorboard

In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [None]:
# Define the Keras TensorBoard callback.
logdir="logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

# Create a retrieval model.
model = QueryProductsModel(query_model, products_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Train for 20 epochs.
model.fit(train_ds.batch(4096), epochs=20, callbacks=[tensorboard_callback])

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
index.index_from_dataset(
    products.batch(100).map(lambda title: (title, model.products_model(title))))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f1f52365f40>

In [None]:
# Sanity check: are these sample query words part of the learned vocabulary?
# NOTE: this rebinds `queries`, which earlier cells use for the tf.data dataset
# of training queries; re-run those cells before adapting the vectorizer again.
queries = ["airpods", "wine", "shoes", "tree"]
# Build the vocabulary once and keep it as a set, instead of rebuilding the
# full list and scanning it linearly for every query.
vocabulary = set(text_vectorization_layer.get_vocabulary())
[q in vocabulary for q in queries]

[True, True, True, True]

In [None]:
# Silences pandas' chained-assignment warning for the whole session.
pd.options.mode.chained_assignment = None

def query_index(query_string):
  """Returns the catalogue rows matching the top retrieved titles for a query.

  Args:
    query_string: raw query text; sent to `index` as a one-element batch.

  Returns:
    A new DataFrame with the `product_data` rows whose title is among the
    first 10 titles returned by `index`, a `score` column added, sorted by
    score in descending order. When a title is returned more than once, the
    score of its first occurrence is used.
  """
  scores, product_ids = index(np.array([query_string]))
  # Row 0 is the only query in the batch; keep at most its first 10 results.
  top_titles = [raw.decode("utf-8") for raw in product_ids.numpy()[0, :10]]
  top_scores = scores.numpy()[0, :10]

  # First occurrence wins, so a later duplicate cannot overwrite the score
  # recorded for a title that already appeared.
  score_by_title = {}
  for title, score in zip(top_titles, top_scores):
    score_by_title.setdefault(title, score)

  matches = product_data[product_data["product_title"].isin(list(score_by_title))]
  # assign/sort_values build new frames, so neither product_data nor a slice
  # of it is modified in place.
  ranked = (
      matches
      .assign(score=matches["product_title"].map(score_by_title))
      .sort_values("score", ascending=False)
  )
  return ranked[["score", "product_id", "product_title", "product_description", "product_color_name", "product_brand", "product_bullet_point"]]

In [None]:
# Retrieve and display the catalogue rows for a single-word product query.
query = "airpods"
query_index(query)

Unnamed: 0,score,product_id,product_title,product_description,product_color_name,product_brand,product_bullet_point
797894,8.24618,B07ZPC9QD4,Apple AirPods Pro,,White,Apple,Active noise cancellation for immersive sound\...


In [None]:
# Retrieve and display the catalogue rows for a two-word query.
query = "black shoes"
query_index(query)

Unnamed: 0,score,product_id,product_title,product_color_name,product_brand,product_bullet_point,product_description
0,11.855474,B07HG1TCT4,New Balance Men's Ml801v1 Black Running Shoe S...,Black,New Balance,,<p>New Balance is an icon when it comes to foo...
1,11.823403,B07D9N9GBB,"adidas Men's Streetfire, Black/Black/White, 10...",Black/Black/White,adidas,Regular fit; Lace closure with forefoot webbin...,
2,11.804886,B006IYQYA2,"DC Men's Court Graffik Skate Shoe, Black/Black...",Black/Black/Black,DC,"COMFORT: The lightweight, padded tongue and co...",
3,11.804886,B006IYTFBM,"DC Men's Court Graffik Skate Shoe, Black/Black...",Black/Black/Black,DC,"COMFORT: The lightweight, padded tongue and co...",
4,11.705588,B00NQY20VO,Skechers Men's Black Flex Advantage Slip Resis...,Black,Skechers,Slip-on shoe work shoe featuring mesh fabric u...,
5,11.705588,B00X454T6Y,Skechers Men's Black Flex Advantage Slip Resis...,Black,Skechers,Slip-on shoe work shoe featuring mesh fabric u...,
6,11.705588,B00NQY208W,Skechers Men's Black Flex Advantage Slip Resis...,Black,Skechers,Slip-on shoe work shoe featuring mesh fabric u...,
7,11.705588,B00NQY20VY,Skechers Men's Black Flex Advantage Slip Resis...,Black,Skechers,Slip-on shoe work shoe featuring mesh fabric u...,
8,11.704123,B000G3910G,"Skechers Men's Verdict Men's Boot,Dark Brown,8...",Dark Brown,Skechers,Air Cooled Memory Foam\nWaterproof\nUtility\nP...,
9,11.704123,B003BLPFXC,"Skechers Men's Verdict Men's Boot,Dark Brown,1...",Dark Brown,Skechers,"Work boot featuring waterproof leather upper, ...",


In [None]:
# Retrieve and display the catalogue rows for the query "wine".
query = "wine"
query_index(query)

Unnamed: 0,product_id,product_title,product_description,product_bullet_point,product_brand,product_color_name,product_locale,score
295130,B08S6PPNXW,Wine Racks Wall Mounted-Hanging Wine Bottle Ho...,<p> <p> Wall-Mounted Wine Rack </p> <p> Materi...,STYLISH：To keep your favorite vintages decorat...,KATDANS,,us,8.37324
74482,B0818KKWDR,"Coasters for Drinks, Funny Drink Coasters Abso...",<p>Adorever coasters are a wonderful addiction...,【NOVELTY COASTERS FOR DRINKS】Witty conversatio...,Adorever,Beige,us,8.312427
97911,B00CHGVS06,"Wine Things 6-Piece Equestrian Wine Charms, Pa...","Wine Things WT-1476P Equestrian, Painted Wine ...",Size: Set of 6. Fits neatly around stem.\nMate...,Supreme Housewares,Wine,us,8.203733
97912,B00D85HQVA,"Wine Things Equestrian, Painted Wine Charms, F...",,Size: Set of 6. Fits neatly around stem.\nMate...,Wine Things,Multicolor,us,8.203733
30114,B07WCHCRGK,Hterepi Gift Gift Gift Gift Gift Gift Gift Gif...,Bernat Pipsqueak Stripes is a luxuriously soft...,Bernat Pipsqueak Stripes is a luxuriously soft...,Bernat Pipsqueak Stripes,,us,8.01849
297986,B002LIJOHO,Wine Enthusiast Wine Cork Lazy Susan Kit,,Holds 300 of your favorite corks\nFill this la...,Wine Enthusiast,Light Brown,us,7.85341
270666,B07TF9BCWM,"Beverage Refrigerator and Cooler, Drink Fridge...",<p><b>Feature:</b><br> No Noise: <br>Low noise...,Modern Electric Beverage Cooler: This 2.3 cubi...,TAVATA,,us,7.808433
229594,B08PC4ZRSY,"TYLZA Wine and Beverage Refrigerator, 24 Inch ...","Type of Cooling: Compressor<br> Input: AC115V,...",[2-IN-1 Smart Independent Cooling Zone] You ca...,TYLZA,,us,7.794359
303985,B08DNW4ZFH,Wine Bottle Accessories Gift Set,Wine Kit - Wine Opener Set - Perfect Gift for ...,ALL-IN-ONE WINE KIT SET - This wine kit has yo...,Lulu & You,,us,7.743504


In [None]:
# Retrieve and display the catalogue rows for the query "tree".
query = "tree"
query_index(query)

Unnamed: 0,product_id,product_title,product_description,product_bullet_point,product_brand,product_color_name,product_locale,score
739952,B07R6663QG,53 Inches Multi-Level Cat Tree Stand House Fur...,Your feline friends need a cozy home of their ...,Your feline friends need a cozy home of their ...,Nova Microdermabrasion,Grey,us,13.406267
32328,B000IL9TMQ,"Willow Tree Angel of The Spirit, Sculpted Hand...",,"Sentiment: ”Nurturing the spirit, inside and o...",Willow Tree,Natural,us,13.395191
107400,B07N3RL8JZ,John Wick / John Wick: Chapter 2 (Double feature),,,,,us,13.328344
61311,B01BLSMQT2,Deadpool #7 John Tyler Christopher Action Figu...,,,,,us,13.079531
107260,1938221214,John Cage: Diary: How to Improve the World (Yo...,,,Siglio Press,,us,12.804401
124203,B07Y8Z6GHN,Angel Soft Toilet Paper with Fresh Lavender Sc...,,Angel Soft with Fresh Lavender Scented Tube to...,Angel Soft,White,us,12.174142
148103,B07Y8ZK1Y4,Angel Soft Toilet Paper with Fresh Linen Scent...,,"The softness and strength you already love, wi...",Angel Soft,White,us,12.174142
289328,B00MS7WMY0,National Tree Company 'Feel Real' Artificial C...,,This Downswept Douglas Fir Full Artificial Gre...,National Tree Company,Green,us,12.16424


In [None]:
# Open TensorBoard on the logs directory that the training callback writes under.
%tensorboard --logdir logs