# MIT-GSL Uruguay 

## January 2020

-----

# Week - 2 | Lesson - 03 
# NLP: Product embeddings

1. After introducing the concept of embeddings through NLP, we extend the notion of embeddings to other settings
2. Note that embeddings are dense continuous representations for discrete, sparse tokens - this makes embeddings widely applicable
3. We will use the concept of embeddings to understand the world of e-commerce better 

---

# Word2Vec and its Applications to Market-Basket Data

# Instacart Grocery Dataset

### Source: https://www.instacart.com/datasets/grocery-shopping-2017

1. Instacart is an online grocery delivery service
2. They have made available 3M grocery orders for over 200K users
3. They provide between 4 to 100 orders for each user and each order contains the sequence of products purchased
4. We also have a brief description of the products

### Overview:
1. We will use this data to build an understanding of word embeddings and investigate their application to downstream tasks
2. For this purpose, we will consider each purchase basket to be a sentence with an unordered sequence of words

---

# 0. Import Modules

In [1]:
# ==============================================
# 0. Module imports
# ==============================================

import pandas as pd
pd.options.display.max_colwidth = 100
import numpy as np
from itertools import product
import csv

# w2v
import gensim

# text processing
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize import ToktokTokenizer
import string
import re # regular expressions
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity

# T-Sne
#import umap
#from openTSNE import TSNE, TSNEEmbedding, affinity, initialization
#from openTSNE import initialization
#from openTSNE.callbacks import ErrorLogger
from sklearn.manifold import TSNE


# utils
from sklearn import utils


# parallel processing
import multiprocessing
from joblib import delayed, Parallel

# time code
import time

# 2-d visualiztion
%matplotlib inline
from ggplot import *
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

import os
# os.chdir("/pool001/madhavk/gsl-uruguay/W-02-NLP/")

In [2]:
# =========================================================
# set directories
# =========================================================

import os
EC2 = True  # If using EC2 (for data directory/paths)

# Select path based off of local or remote
if not EC2:
    wd = "/pool001/madhavk/gsl-uruguay/W-02-NLP/"
else:
    wd = "/home/ubuntu/machine_learning_aws/"
os.chdir(wd)

EC2 = True  # If using EC2 (for data directory/paths)
if not EC2:
    # raw data
    raw_data_dir = "nlp-data/in-grocery/instacart_2017_05_01/"
    # processed data
    process_dir = "nlp-data/in-grocery/prepared-data/"  
else:
    # raw data
    raw_data_dir = "data/in-grocery/instacart_2017_05_01"
    # processed data
    process_dir = "data/in-grocery/prepared-data/"

In [3]:
# =========================================================
# processed files
# =========================================================

os.listdir(process_dir)

['orders-split-v1.csv',
 'all-orders-wide-v1.csv',
 'all-orders-long-v1.csv',
 'products-merged-v1.csv']

In [4]:
# =========================================================
# global parameters
# =========================================================

# show entire value of cell in pandas
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 500)

# number of cpus
cpus = multiprocessing.cpu_count()
f"Number of CPUs: {cpus}"

'Number of CPUs: 4'

------

# 1. Import grocery data

## 1.1. Order level data

In [5]:
# =========================================================
# order-level data
# =========================================================

orders_wide = pd.read_csv(process_dir + "all-orders-wide-v1.csv")
# This data set has one row per order with the products ordered in the product_id column. 
# Products are separated by space.
print(orders_wide.shape)

(3152555, 4)


In [6]:
display(orders_wide.head(10))

Unnamed: 0,order_id,product_id,num_products,eval
0,2,33120 28985 9327 45918 30035 17794 40141 1819,8,prior
1,3,33754 24838 17704 21903 17668 46667 17461 32665,8,prior
2,4,46842 39758 27761 10054 21351 22598 34862 40285 17616 25146 32645 41276,12,prior
3,5,13176 47329 27966 23909 48370 13245 27360 6348 40878 6184 48002 20914 37011 12962 45698 41176 48366 47209 46522 38693 48825 8479,22,prior
4,7,34050 46802,2,prior
5,9,21405 47890 11182 2014 29193 34203 14992 31506 23288 44533 18362 432 3990 14183,14,prior
6,10,24852 4796 31717 47766 4605 1529 21137 22122 34134 27156 14992 49235 26842 3464 25720,15,prior
7,11,30162 5994 1313 31506,4,prior
8,12,30597 15221 43772 37886 37215 34335 26910 38888 38050 29471,10,prior
9,13,17330 27407 35419 196 44635 26878 25783 41290 33198 23020 36086 3800 25952,13,prior


In [7]:
# =========================================================
# orders meta-data
# =========================================================

orders_meta = pd.read_csv(process_dir + "orders-split-v1.csv")
# This dataset includes the meta data for each order, i.e., the user who ordered it, order day of the week, order time
print(orders_meta.shape)

(3346083, 7)


In [8]:
display(orders_meta.head(10))

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,eval
0,2539329,1,1,2,8,,prior
1,2398795,1,2,3,7,15.0,prior
2,473747,1,3,3,12,21.0,prior
3,2254736,1,4,4,7,29.0,prior
4,431534,1,5,4,15,28.0,prior
5,3367565,1,6,2,7,19.0,prior
6,550135,1,7,1,9,20.0,prior
7,3108588,1,8,1,14,14.0,prior
8,2295261,1,9,1,16,0.0,prior
9,2550362,1,10,4,8,30.0,prior


----

## 1.2. Merge train-val-test split

In [9]:
# =========================================================
# merge orders-wide and orders-meta
# =========================================================

orders_wide.drop(labels = "eval", axis = 1, inplace = True)
orders_wide = pd.merge(orders_wide, # data - 1
                       orders_meta[["order_id", "user_id", "eval"]], # data - 2
                       on = "order_id", # merge key
                       how = "left") # left join
print(orders_wide.shape)

(3152555, 5)


In [10]:
display(orders_wide.head(10))

Unnamed: 0,order_id,product_id,num_products,user_id,eval
0,2,33120 28985 9327 45918 30035 17794 40141 1819,8,202279,prior
1,3,33754 24838 17704 21903 17668 46667 17461 32665,8,205970,prior
2,4,46842 39758 27761 10054 21351 22598 34862 40285 17616 25146 32645 41276,12,178520,prior
3,5,13176 47329 27966 23909 48370 13245 27360 6348 40878 6184 48002 20914 37011 12962 45698 41176 48366 47209 46522 38693 48825 8479,22,156122,prior
4,7,34050 46802,2,142903,prior
5,9,21405 47890 11182 2014 29193 34203 14992 31506 23288 44533 18362 432 3990 14183,14,139016,prior
6,10,24852 4796 31717 47766 4605 1529 21137 22122 34134 27156 14992 49235 26842 3464 25720,15,135442,prior
7,11,30162 5994 1313 31506,4,143742,prior
8,12,30597 15221 43772 37886 37215 34335 26910 38888 38050 29471,10,152610,prior
9,13,17330 27407 35419 196 44635 26878 25783 41290 33198 23020 36086 3800 25952,13,45082,prior


In [13]:
# =========================================================
# eval-set distribution
# =========================================================

orders_wide["eval"].value_counts()

prior    2959079
train    116333 
val      38701  
test     38442  
Name: eval, dtype: int64

In [12]:
orders_wide["aisle"].value_counts()

KeyError: 'aisle'

In [84]:
# clear some space
del orders_meta

----

## 1.3. Import product info data

In [14]:
products = pd.read_csv(process_dir + "products-merged-v1.csv")
print(products.shape)

(49688, 6)


In [15]:
display(products.head(10))

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,2,All-Seasons Salt,104,13,spices seasonings,pantry
2,3,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce,38,1,frozen meals,frozen
4,5,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry
5,6,Dry Nose Oil,11,11,cold flu allergy,personal care
6,7,Pure Coconut Water With Orange,98,7,juice nectars,beverages
7,8,Cut Russet Potatoes Steam N' Mash,116,1,frozen produce,frozen
8,9,Light Strawberry Blueberry Yogurt,120,16,yogurt,dairy eggs
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,water seltzer sparkling water,beverages


In [16]:
#===============================================
# top departments
#===============================================
products["aisle"].value_counts()

missing                          1258
candy chocolate                  1246
ice cream ice                    1091
vitamins supplements             1038
yogurt                           1026
chips pretzels                   989 
tea                              894 
packaged cheese                  891 
frozen meals                     880 
cookies cakes                    874 
energy granola bars              832 
hair care                        816 
spices seasonings                797 
juice nectars                    792 
crackers                         747 
soup broth bouillon              737 
baby food formula                718 
coffee                           680 
refrigerated                     675 
cleaning products                655 
baking ingredients               623 
packaged vegetables fruits       615 
asian foods                      605 
nuts seeds dried fruit           582 
fresh vegetables                 569 
oral hygiene                     565 
salad dressi

-----

### In-class exercise

In [88]:
#===============================================
# top aisles
#===============================================
# can you figure out which aisles host the most number of products?

---

# 2. Data exploration

In [17]:
# long-form of orders
orders_long = pd.read_csv(process_dir + "all-orders-long-v1.csv")
print(orders_long.shape)

(32019330, 5)


In [19]:
# top-20 observations in the data frame
display(orders_long.head(20))

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,eval
0,2,33120,1,1,prior
1,2,28985,2,1,prior
2,2,9327,3,0,prior
3,2,45918,4,1,prior
4,2,30035,5,0,prior
5,2,17794,6,1,prior
6,2,40141,7,1,prior
7,2,1819,8,1,prior
8,3,33754,1,1,prior
9,3,24838,2,1,prior


## 2.1. Most frequently purchased products

In [24]:
# most frequently bought products
most_freq_purchased = pd.DataFrame(orders_long["product_id"].value_counts()) # count the number times each product-id appears in data frame
most_freq_purchased.reset_index(drop = False, inplace = True) # complying with pandas indexing 
most_freq_purchased.columns = ["product_id", "freq"] # assign column names
display(most_freq_purchased.head(10))

Unnamed: 0,product_id,freq
0,24852,491291
1,13176,394930
2,21137,275577
3,21903,251705
4,47209,220877
5,47766,184224
6,47626,160792
7,16797,149445
8,26209,146660
9,27845,142813


In [25]:
# merge with product info from the meta dataset
most_freq_purchased = pd.merge(most_freq_purchased, 
                               products, 
                               on = "product_id", 
                               how = "left")
display(most_freq_purchased.head(20))

Unnamed: 0,product_id,freq,product_name,aisle_id,department_id,aisle,department
0,24852,491291,Banana,24,4,fresh fruits,produce
1,13176,394930,Bag of Organic Bananas,24,4,fresh fruits,produce
2,21137,275577,Organic Strawberries,24,4,fresh fruits,produce
3,21903,251705,Organic Baby Spinach,123,4,packaged vegetables fruits,produce
4,47209,220877,Organic Hass Avocado,24,4,fresh fruits,produce
5,47766,184224,Organic Avocado,24,4,fresh fruits,produce
6,47626,160792,Large Lemon,24,4,fresh fruits,produce
7,16797,149445,Strawberries,24,4,fresh fruits,produce
8,26209,146660,Limes,24,4,fresh fruits,produce
9,27845,142813,Organic Whole Milk,84,16,milk,dairy eggs


----

### In-class exercise

In [26]:
display(most_freq_purchased.tail(20))

Unnamed: 0,product_id,freq,product_name,aisle_id,department_id,aisle,department
14320,15612,206,Unwrapped Bites,45,19,candy chocolate,snacks
14321,20434,206,Sardines in Olive Oil,95,15,canned meat seafood,canned goods
14322,22315,206,Slow Churned Chocolate Light Ice Cream,37,1,ice cream ice,frozen
14323,39328,206,Organic Mexican Chocolate Ice Cream,37,1,ice cream ice,frozen
14324,12549,206,"Salad Dressing, Bleu Cheese, Vegan",89,13,salad dressing toppings,pantry
14325,35355,205,94% Fat Free Microwave Butter Popcorn,23,19,popcorn jerky,snacks
14326,5634,205,Organic Pasture Raised Local Eggs,86,16,eggs,dairy eggs
14327,18975,205,Uncured Cherry Smoked Bacon,106,12,hot dogs bacon sausage,meat seafood
14328,20814,205,Sensitive with Iron Infant Formula,92,18,baby food formula,babies
14329,46815,205,Ultimate Omega 1280mg omega-3,47,11,vitamins supplements,personal care


In [28]:
# most popular aisle
aisle = most_freq_purchased.groupby("aisle").agg({"freq": "sum"})
aisle = aisle.reset_index(drop = False, inplace = False).sort_values("freq", ascending = False)
display(aisle)

Unnamed: 0,aisle,freq
50,fresh fruits,3782826
53,fresh vegetables,3553596
98,packaged vegetables fruits,1823815
133,yogurt,1475395
93,packaged cheese,990483
83,milk,918611
131,water seltzer sparkling water,868265
25,chips pretzels,717827
119,soy lactosefree,655592
11,bread,588261


----

### In-class exercise

In [30]:
aisle = most_freq_purchased.groupby("aisle").agg({"freq": "sum"})
aisle = aisle.reset_index(drop = False, inplace = False).sort_values("freq", ascending = False)
display(aisle)

Unnamed: 0,aisle,freq
50,fresh fruits,3782826
53,fresh vegetables,3553596
98,packaged vegetables fruits,1823815
133,yogurt,1475395
93,packaged cheese,990483
83,milk,918611
131,water seltzer sparkling water,868265
25,chips pretzels,717827
119,soy lactosefree,655592
11,bread,588261


In [33]:
department = most_freq_purchased.groupby("department").agg({"freq": "sum"})
department = department.reset_index(drop = False, inplace = False).sort_values("freq", ascending = False)
display(department)

Unnamed: 0,department,freq
19,produce,9840710
7,dairy eggs,5522579
20,snacks,2767610
3,beverages,2641843
10,frozen,2178717
16,pantry,1763171
2,bakery,1168021
8,deli,1046768
6,canned goods,1032645
9,dry goods pasta,833181


-----

## 2.2. Co-purchased products

In [34]:
# ===========================================
# Generate co-purchase matrix
# ===========================================

def CoPurchaseMatrix(orders_wide, product_info = True):
    '''
    orders_wide is order-level data with one row per order
    '''
    count_vec = CountVectorizer(ngram_range = (1,1), binary = True, 
                            token_pattern = "\\b\\w+\\b") # sku counts
    pur_mat = count_vec.fit_transform(orders_wide["product_id"])
    co_pur_mat = (pur_mat.T * pur_mat) #copurchase matrix
    co_pur_mat.setdiag(0) # set diagonal to 0
    co_pur_mat_df = pd.DataFrame(co_pur_mat.todense()) # convert to data frame
    co_pur_mat_df.index = count_vec.vocabulary_ # row names from sku-ids
    co_pur_mat_df.columns = count_vec.vocabulary_ # column names from sku-ids
    co_pur_mat_df = co_pur_mat_df.where(np.triu(np.ones(co_pur_mat_df.shape)).astype(np.bool)) # consider the uppre tri
    co_pur_mat_df = co_pur_mat_df.stack().reset_index() # melt to sku-1 and sku-2 per row
    co_pur_mat_df.columns = ["product_id_1", "product_id_2", "copur"]
    co_pur_mat_df = co_pur_mat_df.loc[co_pur_mat_df["copur"] > 0, :] # subset for copur > 0
    co_pur_mat_df_top = co_pur_mat_df.sort_values(["product_id_1", "copur"], ascending = False)
    co_pur_mat_df_top = co_pur_mat_df_top.drop_duplicates(["product_id_1"], keep = "first")
    co_pur_mat_df_top["product_id_1"] = co_pur_mat_df_top["product_id_1"].astype(int) # fix data types
    co_pur_mat_df_top["product_id_2"] = co_pur_mat_df_top["product_id_2"].astype(int) # fix data types
    co_pur_mat_df_top = co_pur_mat_df_top.sort_values(["copur"], ascending = False).reset_index(drop = True)
    if product_info:
        co_pur_mat_df_top = pd.merge(co_pur_mat_df_top, products, how = "left", 
                                     left_on = "product_id_1", right_on = "product_id")
        co_pur_mat_df_top.drop("product_id", axis = 1, inplace = True)
        co_pur_mat_df_top = pd.merge(co_pur_mat_df_top, products, how = "left",
                                     left_on = "product_id_2", right_on = "product_id", 
                                     suffixes = ["_1", "_2"])
        co_pur_mat_df_top.drop("product_id", axis = 1, inplace = True)
        col_order = ['product_id_1', 'product_id_2', 'copur', 'product_name_1', 'product_name_2',
                             'aisle_1', 'aisle_2', 'department_1', 'department_2', 
                             'aisle_id_1', 'aisle_id_2', 'department_id_1', 'department_id_2']
        co_pur_mat_df_top = co_pur_mat_df_top[col_order]
    return(co_pur_mat_df_top)

In [35]:
# calculate copurchases
copur = CoPurchaseMatrix(orders_wide = orders_wide, product_info = True)
print(copur.shape)

(14332, 13)


In [36]:
display(copur[["product_name_1", "product_name_2", "copur", "aisle_1", "aisle_2"]].head(20))

Unnamed: 0,product_name_1,product_name_2,copur,aisle_1,aisle_2
0,Organic Lacinato (Dinosaur) Kale,Soft Eating Strawberry Flavored Licorice,64761.0,fresh vegetables,candy chocolate
1,"Mighty 4 Sweet Potato, Blueberry, Millet & Greek Yogurt Tots Snack",Nut Delight Fruit & Nut Bar,58330.0,baby food formula,energy granola bars
2,Nut Delight Fruit & Nut Bar,Carrot Bunch,55611.0,energy granola bars,fresh vegetables
3,Natural Finely Shredded Triple Cheddar Cheese,Nut Delight Fruit & Nut Bar,53395.0,packaged cheese,energy granola bars
4,Italian Style Meatballs & Mozzarella Sandwiches,Nut Delight Fruit & Nut Bar,43180.0,frozen meals,energy granola bars
5,Organic Granny Smith Apples,Garlic Spice Blend Paste,28998.0,fresh fruits,packaged vegetables fruits
6,Pecan Pie Fruit & Nut Food Bar,Soft Eating Strawberry Flavored Licorice,26812.0,energy granola bars,candy chocolate
7,Garlic Spice Blend Paste,Carrot Bunch,25766.0,packaged vegetables fruits,fresh vegetables
8,"Almond Coconut Bar, Organic",2nd Foods Organic Pear and Spinach Baby Food,22923.0,energy granola bars,baby food formula
9,Soft Eating Strawberry Flavored Licorice,Spicy Minis Guacamole,22027.0,candy chocolate,fresh dips tapenades


----

# 3. Split train-val-test datasets


In [37]:
#===============================================
# split train-val-test
#===============================================

train = orders_wide.loc[orders_wide["eval"].isin(["prior", "train"]), :]
val = orders_wide.loc[orders_wide["eval"] == "val", :]
test = orders_wide.loc[orders_wide["eval"] == "test", :]
print("train size:", train.shape)
print("val size:", val.shape)
print("test size:", test.shape)

train size: (3075412, 5)
val size: (38701, 5)
test size: (38442, 5)


In [101]:
# clear more space
# del orders_wide, orders_long

In [38]:
display(train.head())

Unnamed: 0,order_id,product_id,num_products,user_id,eval
0,2,33120 28985 9327 45918 30035 17794 40141 1819,8,202279,prior
1,3,33754 24838 17704 21903 17668 46667 17461 32665,8,205970,prior
2,4,46842 39758 27761 10054 21351 22598 34862 40285 17616 25146 32645 41276,12,178520,prior
3,5,13176 47329 27966 23909 48370 13245 27360 6348 40878 6184 48002 20914 37011 12962 45698 41176 48366 47209 46522 38693 48825 8479,22,156122,prior
4,7,34050 46802,2,142903,prior


## 3.1. Random sample for faster processing

In [39]:
#===============================================
# randomly sample training data
#===============================================

sample_size = 1000000
train = train.sample(n = sample_size)
train = train.reset_index(drop = True)
print(train.shape)

(1000000, 5)


In [40]:
display(train.head(10))

Unnamed: 0,order_id,product_id,num_products,user_id,eval
0,2713434,36127 260 38739 9839 31717 5212 16797 26209 5652,9,39434,prior
1,2564998,34126 34065 405 43772 43129 27966 24964 33731 49520 42736,10,123901,prior
2,390578,7677 27086 46175 4421 12276 9339 19678 42701 4605 24964,10,52624,prior
3,3171317,48364 37158 34358,3,102586,prior
4,1649445,14161 47357 20345 17341 11281 38849 25544 27676 14144 20520 36227 37722 1543,13,11004,prior
5,3409448,31506 46667 21847 20588 20638 49424 23516 48679,8,52968,prior
6,3032813,26282 10804 21344 24583 5479 28745 28928 18594 3896 37220 30391 40706 2091 810 5134 36772,16,73394,prior
7,230563,43279 7676 27796 31964 3298 31992 17224 38656 594 5322 23909,11,49744,prior
8,2882950,22922 46802 24489 24695 43295 5876 45002,7,127529,prior
9,817584,11182 46667,2,173746,prior


## 3.2. Tokenize sentences

In [46]:
# start pool process for parallel procressing
pool = multiprocessing.Pool(processes = cpus)
cpus

4

In [51]:
# text processing
import nltk 
nltk.download("punkt")
from nltk import sent_tokenize, word_tokenize


[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [52]:
# training sentences
t0 = time.time()
train_orders = pool.map(word_tokenize, train["product_id"])
t1 = time.time()
print(len(train_orders))

1000000


In [53]:
# time taken
print(f"Time Taken: {t1 - t0}")

Time Taken: 63.66749143600464


In [54]:
# training orders
train_orders[0:3]

[['36127', '260', '38739', '9839', '31717', '5212', '16797', '26209', '5652'],
 ['34126',
  '34065',
  '405',
  '43772',
  '43129',
  '27966',
  '24964',
  '33731',
  '49520',
  '42736'],
 ['7677',
  '27086',
  '46175',
  '4421',
  '12276',
  '9339',
  '19678',
  '42701',
  '4605',
  '24964']]

In [55]:
# validation sentences
t0 = time.time()
val_orders = pool.map(word_tokenize, val["product_id"])
t1 = time.time()
print(len(val_orders))

38701


----

### In-class exercise

In [111]:
# what do the first three validation orders look like?

In [112]:
# Can you similarly tokenize the test sentences?
#print(len(test_orders))

----

# 4. Word2Vec sample model

## 4.1. Define and train model

In [56]:
#===============================================
# define and train model
#===============================================

w2v_1 = gensim.models.Word2Vec(sentences = train_orders,
                               workers = multiprocessing.cpu_count(),
                               seed = 1234)

## Model properties

In [57]:
#===============================================
# vocabulary
#===============================================

# vocabulary length
f"Vocab length: {len(w2v_1.wv.vocab)}"

'Vocab length: 14340'

In [60]:
# sample vocabulary
list(w2v_1.wv.vocab.keys())[0:5]

['36127', '260', '38739', '9839', '31717']

In [61]:
# length of corpus
print(w2v_1.corpus_count)

1000000


In [62]:
# number of iterations
w2v_1.iter

5

In [118]:
#===============================================
# product vectors
#===============================================

# enter product-id
prod_id = "1"
print(w2v_1[prod_id].shape)
print("------------")
print("------------")
print(w2v_1[prod_id])

(100,)
------------
------------
[ 0.33256823 -0.8674035   0.78945756 -0.8326404  -0.35906547  1.0120238
 -0.06898001  0.2782327  -0.17833115  0.87456757 -0.02812021  1.8168643
  0.30041236  0.3711286  -0.20275462 -0.23503277  1.3704741  -0.5922891
  1.0876763   1.792287   -0.612754   -1.331187    0.64436466 -0.20730683
  1.269803   -1.0397085   0.9416942   1.3022867   0.11127125 -1.0850266
 -0.56653965 -0.06770324 -0.8991397   0.21280771  0.09323756  2.894438
  0.42654473 -1.4698052  -0.5661502   0.37270918  0.13584968  1.2705847
  0.0365853  -0.9683485  -1.5081807  -0.51652515  0.2912822   0.12690805
  0.29624075  0.5983758   1.0610453   0.74076706  0.9236454   0.18861865
 -0.06008022  0.34126574 -0.44573402 -0.26175475  0.20415974  0.77992284
 -0.33681318 -0.11273185  0.07486103 -1.496057   -0.6121371   0.33590174
  0.7643749   0.5597887  -1.2259272  -1.067092    0.7632559   0.09929376
 -0.39279962  0.20848511 -0.3468598   0.23529449  1.0096282  -0.45790723
 -0.37464425 -0.01287276 

## 4.2. Update embeddings

In [63]:
#===============================================
# update model weights
#===============================================
w2v_1.train(sentences = train_orders, total_examples = w2v_1.corpus_count, epochs = 2)

(19307842, 20190222)

## 4.3. Inspect model output

In [64]:
#===============================================
# upadted product vectors
#===============================================

# enter product-id
prod_id = "1"
print(w2v_1[prod_id].shape)
print("------------")
print("------------")
print(w2v_1[prod_id])

(100,)
------------
------------
[-0.15897858  1.3138765  -0.4458094  -1.1327949  -0.52582616 -0.8279934
  0.15774423  0.25105605 -0.4333221  -0.08082795  0.86285216 -0.70886874
  0.7045319  -0.2238739  -0.1918213   0.72482157 -0.8717197  -0.5153289
  0.83717036 -0.31201124 -0.82710207 -0.08100231 -0.69025236  0.7978647
 -1.2918801   0.12263945 -0.4358498   0.64619535  0.26545691  0.18796475
 -1.0406277  -0.95823187  1.5320331  -0.6381243   0.55556875  0.81362915
  0.725703   -0.59123546  1.5706028   1.0049564   0.73284686  0.38947198
  0.5179963  -1.1165116  -0.5719591  -0.5047214   0.19921948 -1.0657848
 -0.6876577  -0.30792212  0.03496341  1.1194674  -0.6617525  -0.35422048
 -0.25476882 -0.01683265 -0.23153007 -0.1554616   1.59474     2.0798566
 -0.91083187 -0.01950055 -0.59591573 -0.82973194 -0.53231645  0.93636
  0.17496532  1.453717    1.6236032   0.0149919   0.9578527  -0.25117806
  0.59807855 -0.7706782   0.5603701   2.2639897  -0.8622636  -0.20782329
 -1.5082802   1.3553026   

--------

# 5. Improve W2V model

In [68]:
#===============================================
# model parameters
#===============================================

# size of embedding matrix
emb_size = 100

# context window size
cxt_window = 10

# batch size for gradient update
batch_size = 10000

# down-sample high frequency words
hfs = 0.001

# learning rate
lr = 0.05

In [69]:
#===============================================
# define model
#===============================================

w2v_1 = gensim.models.Word2Vec(sentences = train_orders,
                               size = emb_size, # number of columns in embedding matrix
                               hs = 1, # hierarchical softmax
                               negative = 0, # negative sampling
                               window = cxt_window, # context window
                               min_count = 1, # minimum frequency count
                               batch_words = batch_size, # batch size for update
                               alpha = lr, # learning rate
                               sample = hfs, # down sample high frequency words 
                               workers = cpus,
                               seed = 1234)

## 5.1. Score on validation and test

In [70]:
#===============================================
# Score model to get log-likelihood
#===============================================

def ScoreW2V(test_sent, model, normalize = True, avg_over_sent = True):
    test_score = model.score(test_sent, total_sentences = len(test_sent), 
                             chunksize = 100,
                             queue_factor = 2,
                             report_delay = 1)
    if normalize:
        test_score = [test_score[x]/len(test_sent[x]) for x in range(len(test_sent))]
    else:
        test_score = list(test_score)
    if avg_over_sent:
        test_score = np.mean(test_score)
    return test_score

In [71]:
w2v_1_val = ScoreW2V(test_sent = val_orders, model = w2v_1, normalize = True, avg_over_sent = True)

In [72]:
w2v_1_val

-7.532278745791191

## 5.2. Similar products

In [73]:
#===============================================
# similar products
#===============================================

# sample product
prod_id = "10"

# product info for sample product
display(products.loc[products["product_id"].isin([prod_id]), :])

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department


In [74]:
#===============================================
# similarity from model
#===============================================

w2v_1.wv.most_similar(prod_id)

[('44375', 0.7115499973297119),
 ('6475', 0.6679061055183411),
 ('4138', 0.5879172086715698),
 ('4493', 0.5435665845870972),
 ('34024', 0.4911191463470459),
 ('28004', 0.478809118270874),
 ('32380', 0.4643350839614868),
 ('39921', 0.4568260908126831),
 ('15327', 0.4303375482559204),
 ('6474', 0.4241175651550293)]

In [75]:
#===============================================
# lookup product info
#===============================================
most_similar_prods = [y[0] for y in w2v_1.wv.most_similar(positive = prod_id)]
most_similar_prods = products.loc[products["product_id"].isin(most_similar_prods), :]
display(most_similar_prods)

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department


In [76]:
#===============================================
# most dissimilar products
#===============================================
most_dissimilar_prods = [y[0] for y in w2v_1.wv.most_similar(negative = [prod_id])]
most_dissimilar_prods = products.loc[products["product_id"].isin(most_dissimilar_prods), :]
display(most_dissimilar_prods)

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department


----

### In-class exercise

In [77]:
# find the most similar products to 
prod_id = "100"

# product info for sample product
display(products.loc[products["product_id"].isin([prod_id]), :])

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department


In [131]:
# which products are most similar to peanut better and strawberry jam sandwich

In [132]:
# what about the most dissimilar products?

----

# 5. Product maps using t-sne 

## 5.1. Extract all product vectors

In [78]:
#===============================================
# product vectors
#===============================================

items = [key for key in w2v_1.wv.vocab.keys()] # all product-ids
word_vec = [list(w2v_1[item]) for item in items] # "word" vector for each product id
word_vec = np.array(word_vec) # convert to array
word_vec_df = pd.DataFrame(word_vec) # convert to data frame
w2v_vec_names = ["wv" + str(x + 1) for x in range(word_vec_df.shape[1])] # column names
word_vec_df.columns = w2v_vec_names # assign column names
word_vec_df["product_id"] = items # include product id in data frame
word_vec_df["product_id"] = word_vec_df["product_id"].astype(int) # convert to type integer for later merge
word_vec_df = word_vec_df[["product_id"] + w2v_vec_names] # re-order columns
print(word_vec_df.shape)

(14340, 101)


In [79]:
display(word_vec_df.head(10))

Unnamed: 0,product_id,wv1,wv2,wv3,wv4,wv5,wv6,wv7,wv8,wv9,wv10,wv11,wv12,wv13,wv14,wv15,wv16,wv17,wv18,wv19,wv20,wv21,wv22,wv23,wv24,wv25,wv26,wv27,wv28,wv29,wv30,wv31,wv32,wv33,wv34,wv35,wv36,wv37,wv38,wv39,wv40,wv41,wv42,wv43,wv44,wv45,wv46,wv47,wv48,wv49,wv50,wv51,wv52,wv53,wv54,wv55,wv56,wv57,wv58,wv59,wv60,wv61,wv62,wv63,wv64,wv65,wv66,wv67,wv68,wv69,wv70,wv71,wv72,wv73,wv74,wv75,wv76,wv77,wv78,wv79,wv80,wv81,wv82,wv83,wv84,wv85,wv86,wv87,wv88,wv89,wv90,wv91,wv92,wv93,wv94,wv95,wv96,wv97,wv98,wv99,wv100
0,36127,-1.323354,0.612752,-0.398217,0.192804,0.471136,-0.007201,0.234444,-0.119251,-0.10956,0.434752,-0.114712,0.604006,-0.584084,-0.729728,-0.668062,0.166142,-0.731873,-0.589978,0.534785,-0.345381,-1.276464,0.126199,-1.374089,1.094494,1.086177,0.045582,-0.749013,0.529887,-0.170947,0.448088,0.277849,0.226307,-0.537723,-0.606602,0.178671,-0.1419,0.641265,0.336838,-0.728748,0.169845,-0.161269,-0.258954,0.624895,-1.670931,-1.957299,-0.4841,-0.220584,0.740474,-0.045081,0.310734,-0.571723,-0.402898,-0.23551,0.055262,-0.920894,-1.011401,-0.57518,-0.533943,0.26943,0.25972,-0.450105,-0.867342,0.311568,1.371603,-0.230373,1.032849,1.345813,-0.169027,1.641874,0.549985,0.704802,-0.688428,-0.241989,-1.421533,0.102905,-0.460917,0.182365,-0.017828,-0.248468,0.04415,-0.321015,-0.567452,-1.468887,1.221046,-0.237686,0.487675,0.233861,0.031875,0.556341,-0.805869,-1.055952,0.386513,-0.37329,-1.271031,-0.508794,0.389508,0.278921,-0.882582,0.721076,0.30944
1,260,-1.004508,-0.630055,0.464731,-0.591676,-0.101519,-0.439685,-0.097783,0.182098,-0.035338,-0.470347,0.360966,-0.373011,0.011665,-0.576346,-0.447226,-0.281747,-0.492026,-1.521371,0.568475,0.019473,0.580504,0.235527,-0.148362,0.054256,0.81316,0.685138,-0.728926,-0.729078,0.351365,0.543991,1.201986,0.257552,-0.44451,-0.157458,0.909715,0.316515,0.091883,-0.221406,0.584473,-0.137114,0.909496,0.69757,-0.163526,0.69553,-0.214333,0.237807,0.133612,-0.515258,-0.303973,0.452173,-0.476972,-1.096938,-0.075441,-0.537928,0.244431,-0.168813,-0.091763,0.077005,-0.285658,-0.402576,0.643075,-0.3541,-0.20881,0.983676,0.497485,0.433619,0.062218,0.421122,0.220806,0.252234,0.134303,-0.352079,1.236468,0.996296,-0.369578,-0.506816,0.665192,0.667269,0.127533,0.448892,-0.436633,0.080658,0.438812,0.260601,-0.137237,-0.604359,-0.425628,-0.234384,-0.319999,0.088704,0.591365,0.419638,1.360114,-0.381888,-0.196939,-0.270422,0.737956,-0.170715,-0.289386,0.099266
2,38739,0.117583,-0.18622,0.670645,0.761699,-0.36801,-0.634671,0.324184,-0.363019,-0.495294,-0.007327,0.582694,-0.094889,-0.1565,0.732701,-0.30363,-0.683933,-0.33077,-0.002054,0.035253,1.271901,0.70463,-0.170999,-1.28101,-0.255003,-0.023034,0.08385,1.093869,-0.638069,-0.021917,0.981816,0.981029,-1.598578,0.678937,0.18338,-0.220802,-0.11604,-0.756623,0.174813,-0.201381,0.545292,1.924214,-0.225527,-0.845195,0.016864,0.113818,-0.280119,0.30654,-1.224081,-0.01797,-0.399201,0.584661,0.35541,1.078108,0.093555,0.049144,0.566148,-0.2116,0.067576,0.343647,-0.126946,-0.115765,0.234162,1.215726,1.065178,-0.525049,0.044128,-0.600766,-0.792392,-0.249204,0.334837,0.529613,0.099851,0.167005,-0.93414,1.91565,-0.472308,-0.402012,0.923185,0.648542,0.800638,-0.84496,-0.268484,-0.532,-0.825948,-0.122777,-0.139188,0.59838,-0.699188,-0.439457,0.148125,-0.08411,0.155991,-0.529889,0.23542,0.361776,-0.426244,-0.072735,-0.07356,-0.155613,-0.262718
3,9839,0.317504,0.583372,0.504759,-0.539479,0.068734,-0.038911,0.660003,0.769504,0.683357,0.445805,-0.783957,-0.420226,0.384448,0.23511,-0.094279,-0.78108,-1.061411,-1.024354,-0.333595,0.69494,-0.291977,-0.498779,0.159861,-0.284094,-0.215898,0.047123,-0.69241,-0.034591,0.402449,0.812948,1.02875,-0.089558,-0.363315,0.235936,0.25911,-0.240004,-0.08726,-0.075477,-0.364875,-1.188952,-0.703688,-0.501646,-0.345118,-0.289469,-1.236472,1.303083,0.638172,0.15857,0.495495,1.44069,0.656163,0.091888,0.230711,-0.248618,-0.575258,-0.364714,-0.030669,-0.615998,0.079768,-1.014509,-0.384285,-0.218831,0.494353,0.093158,-0.798852,0.405772,0.829207,-0.625638,-0.041349,-0.447457,0.188692,-0.05065,-0.307453,0.740972,-0.613796,-0.150296,-0.568961,-0.459989,-0.017533,-0.441938,0.27458,0.144471,0.375467,0.298864,0.14701,-0.199594,0.47273,-0.17427,0.99963,-0.81136,0.276606,0.403559,-0.103758,0.801768,0.250866,0.302406,0.562349,-0.293643,-0.087884,-0.037337
4,31717,-0.373762,0.435795,1.142431,-0.862603,-1.278272,0.215065,-1.287513,0.285897,0.047191,-0.184114,-1.091585,0.943866,-0.003419,0.713798,0.667865,-0.655189,-0.117985,-0.791433,0.137488,0.929038,0.695396,0.711686,-1.074966,0.566319,0.667925,0.887162,0.094221,0.608371,-1.418125,-0.587344,0.086177,-0.399253,1.071235,1.363468,0.080687,-1.19435,0.398083,-1.456245,-0.003016,-0.139117,0.233109,0.034521,0.013685,0.39892,-0.594234,-1.511846,0.200493,0.210373,-0.420258,1.294862,0.112224,-1.015286,1.506298,-0.882786,-2.260687,-0.587217,0.285004,-0.342228,-0.288684,-0.852069,0.690514,-0.03256,0.152792,-1.327897,-0.205222,0.809507,0.182102,-0.29443,-0.648796,-0.15142,-0.036266,1.671798,-1.533812,-0.833977,-0.247758,-0.460273,-0.743436,-1.042352,0.592273,-0.238199,1.051929,0.926751,-0.339278,0.058365,-1.058145,0.055896,-2.220438,-0.453299,-0.754881,-0.522198,-0.587561,1.776467,0.714092,1.044143,-0.041489,0.264331,-0.202571,0.547653,-0.845983,0.875648
5,5212,-0.818815,0.180607,0.136177,0.612281,0.600557,0.238149,0.163759,0.061738,0.285364,0.024532,-0.605987,0.800362,-0.483818,0.149021,-0.451635,0.172221,-0.405496,-0.643536,1.179748,0.148758,-0.277999,0.623272,-0.511321,-0.659431,0.427368,0.257491,-0.290088,0.170167,-0.219357,0.193384,0.761138,-0.595859,0.464823,-0.052496,0.066465,0.304421,0.183906,-0.777056,-0.180334,-0.362663,0.91164,0.126569,0.108248,-0.589616,-1.129117,0.110202,0.550834,0.896855,-0.291394,-0.705634,-0.530012,-0.804641,-1.060862,-0.01016,0.06441,-1.186585,-0.102499,0.166178,-0.542839,0.962054,-0.121408,-0.760859,-0.175007,0.537475,-0.219366,0.60996,1.129205,0.413566,0.666322,-0.453705,1.808612,-0.706619,0.175343,-0.134272,0.143524,-0.30309,0.225452,0.193177,-0.468822,-0.796135,-0.245802,0.04453,0.400368,-0.160361,0.649751,-0.451736,0.848711,0.283787,-0.776243,0.429563,-0.603455,-0.20568,-0.638819,-0.090085,0.764611,0.675595,0.785366,-0.7106,0.064751,0.761582
6,16797,-0.166064,-0.317955,0.104622,-0.82526,-0.378327,0.242934,0.234317,0.592944,-0.097499,0.328691,0.331864,0.081258,0.123241,0.168444,-0.007696,0.401575,0.910677,-0.233831,0.26286,0.148762,0.597213,0.594735,-0.602418,-0.261591,0.82852,0.227614,-0.328182,-0.539235,0.367265,1.043073,0.353008,0.264609,-0.102158,-0.098942,-0.554873,-0.002079,-0.312151,-0.635685,-0.119457,0.375286,0.591662,0.366868,0.527536,-0.179372,-0.094412,-0.092562,-0.574574,-0.236148,-0.628508,-0.092695,-0.621274,-0.520392,0.237285,-0.363692,0.219033,0.439922,0.014253,0.36089,0.470978,0.594837,-0.258583,-0.500768,-0.308255,0.69023,0.504381,0.131782,-0.751477,-0.0924,0.297822,-0.159396,0.319916,-0.296032,0.25446,-0.443031,0.975631,-0.780605,0.651818,0.049639,0.316448,-0.62572,0.379214,-0.198846,0.000499,-0.104608,0.63117,-0.885987,0.788903,-0.693761,-0.630054,0.483273,0.118588,0.209591,0.768779,-0.562258,-0.043281,-0.471579,0.931629,-0.38318,-0.288198,-0.011025
7,26209,-0.35476,-0.644864,0.227268,-0.06587,-1.306665,-0.880987,-0.282387,0.695682,-0.926289,0.424724,-0.139752,-0.093642,-0.113732,-0.020063,-1.208746,0.419861,-0.415695,-0.675471,0.305806,0.392596,0.173758,-0.185888,-1.057713,0.964265,-1.008959,-0.333944,-0.605124,0.702796,0.055801,-0.531045,0.295447,-0.288786,0.316021,0.216399,0.730186,0.299407,-0.768576,0.043248,-1.727868,1.273195,0.138486,-0.82314,-0.157186,0.539678,0.455704,0.542845,-0.136866,-0.163003,-0.57061,0.429627,-0.967747,0.062889,-0.131935,0.12584,1.595353,-0.438772,-0.07926,-0.316565,0.110196,-0.280957,-0.262761,0.151496,-0.127971,-1.394867,0.156899,-0.113941,0.069785,0.547024,-0.551614,0.587624,0.305145,-0.107341,1.109838,-0.211961,0.749383,0.248738,-0.820243,0.222949,-0.364877,-0.665637,0.162979,0.560009,-0.531751,1.219871,-1.367162,1.027175,0.218629,0.143873,-0.838338,-0.95037,0.309982,-0.821145,0.17783,-0.161209,-0.452098,0.466275,1.40907,-0.899138,0.742926,1.147942
8,5652,-0.626611,-0.536736,0.699321,0.220304,-1.064125,0.26915,-0.069648,0.609917,0.548155,-0.424771,-0.622606,-1.035554,0.128773,-0.551777,0.590342,-0.779317,-0.115073,-1.065807,-0.245104,-0.055572,-0.267911,-0.484808,0.286527,0.560205,0.447987,-0.128526,-0.39231,0.188325,0.345459,1.159879,0.534522,0.691841,-0.085189,-0.412941,0.672474,-0.076099,0.680544,0.375949,0.302549,0.507145,-0.542728,0.111691,-0.665785,-0.211821,0.618541,0.271815,0.8662,0.119346,0.126814,0.570069,-0.605416,-0.847013,0.902905,0.032831,0.114133,0.223572,-0.587772,-0.427074,0.05092,-0.287895,0.010505,-0.054615,0.228011,0.329422,-0.103972,0.206701,0.43917,0.25604,-0.234481,0.912599,-0.604322,0.223599,-0.073678,0.461801,-0.597029,0.552328,-0.659357,-0.304025,0.075541,0.110497,-0.1843,0.23861,-0.863934,0.159016,-0.112514,0.069451,-0.263913,-1.12116,0.429359,-0.248246,-0.403404,0.758858,0.546652,0.374721,-0.417174,-0.867844,0.491607,0.119946,0.367245,0.624874
9,34126,-0.425449,-0.683473,0.548868,-0.183955,-0.337578,0.054791,-0.614078,0.361305,1.044114,-0.325436,-0.468478,-1.93915,0.77794,0.220814,-0.377226,0.465752,-0.222034,-0.506737,0.779743,-0.333091,0.42681,-0.693962,-0.908174,-0.441424,-0.259904,-1.644247,-0.235217,0.788804,0.42938,1.316897,0.564673,1.196751,0.171457,0.276649,0.732731,0.974457,0.227458,0.4896,-0.609282,1.315817,-0.502203,-0.640549,-1.605252,-0.2643,0.797594,0.764003,0.653687,-0.261085,0.537445,0.117384,-0.540298,0.380481,0.687863,0.555034,1.103621,0.324118,-0.762527,-1.35504,0.843354,-0.414133,0.531828,-0.443177,0.672226,-0.860262,-0.214442,0.060679,0.37348,0.041324,-1.305113,0.441053,-0.203223,0.333599,0.016116,0.247823,0.423366,0.108296,-1.296999,-0.330721,-1.065882,-0.37689,-0.018798,-0.599894,-1.531332,1.227854,-0.545032,0.04835,-0.057489,-0.356748,-0.637578,-0.301199,-1.254392,0.39577,0.344005,1.986016,0.550523,-0.504566,0.155512,-0.603835,-0.175887,1.105543


## Merge product vectors with product info

In [80]:
#===============================================
# merge relevant columns
#===============================================

# relevant columns from product info
prod_info_cols = ["product_id", "product_name", "department", "aisle"]

# merge
word_vec_df = pd.merge(products[prod_info_cols], word_vec_df, on = "product_id", how = "inner")
print(word_vec_df.shape)

(14340, 104)


In [136]:
display(word_vec_df.head(10))

Unnamed: 0,product_id,product_name,department,aisle,wv1,wv2,wv3,wv4,wv5,wv6,wv7,wv8,wv9,wv10,wv11,wv12,wv13,wv14,wv15,wv16,wv17,wv18,wv19,wv20,wv21,wv22,wv23,wv24,wv25,wv26,wv27,wv28,wv29,wv30,wv31,wv32,wv33,wv34,wv35,wv36,wv37,wv38,wv39,wv40,wv41,wv42,wv43,wv44,wv45,wv46,wv47,wv48,wv49,wv50,wv51,wv52,wv53,wv54,wv55,wv56,wv57,wv58,wv59,wv60,wv61,wv62,wv63,wv64,wv65,wv66,wv67,wv68,wv69,wv70,wv71,wv72,wv73,wv74,wv75,wv76,wv77,wv78,wv79,wv80,wv81,wv82,wv83,wv84,wv85,wv86,wv87,wv88,wv89,wv90,wv91,wv92,wv93,wv94,wv95,wv96,wv97,wv98,wv99,wv100
0,1,Chocolate Sandwich Cookies,snacks,cookies cakes,0.744708,1.295512,-1.193265,0.598794,-0.641248,1.223336,0.919235,0.809076,0.790224,-1.406452,0.579844,-0.417504,0.320172,-1.006021,-1.247783,0.728549,0.282868,-0.163652,0.442921,0.128002,1.018741,-1.040465,0.126721,0.412622,0.333163,0.891142,-0.438378,-0.414526,-0.543069,-1.434892,-0.36965,-0.234173,-0.450169,0.144263,-0.889828,-0.092579,0.095353,-0.893399,-0.008923,-0.70394,-0.921108,-1.429784,0.343726,-0.504728,-1.58324,0.781951,-0.361618,-0.487692,0.108233,-1.003854,0.44551,0.412661,2.036789,0.664207,-0.936389,0.153528,-0.561427,0.708672,-1.219241,-0.778229,0.468867,-0.237254,-0.688795,0.463018,0.221844,-1.234477,-0.520427,0.230692,-1.529698,0.867693,-0.80247,-0.222265,-0.507642,0.082833,-0.856651,0.124394,-0.133245,-1.038114,-0.809275,-0.205799,-0.915241,-0.091222,-0.302496,0.530782,0.226141,-0.134771,-0.599206,-0.370918,0.447913,0.08971,-0.735691,-0.266758,-1.274789,-1.471167,0.414221,-0.482969,-0.722963,-0.789088,-0.890457,0.141746
1,3,Robust Golden Unsweetened Oolong Tea,beverages,tea,-0.156999,-0.097948,1.033049,-0.075712,-0.156379,0.859535,-0.202249,-0.69284,1.155985,-0.154656,1.157791,-0.287568,0.291294,0.716055,-0.463947,-0.26647,-0.43225,-0.462481,-0.609656,0.9397,0.484884,-0.209098,0.338069,1.126593,-0.584675,-1.76212,-1.089806,1.540631,0.531554,0.231101,0.813328,0.308757,-1.116319,0.098051,0.718829,0.123319,-0.361023,-0.134135,0.584352,-0.445583,0.308385,0.375197,-0.108583,-1.362428,0.132718,-1.448387,-0.245276,1.475555,1.410261,-0.275512,0.245381,-0.178379,0.160996,-0.317731,-0.28679,-0.621966,1.150026,-1.059867,-1.085093,-0.115807,0.323545,-1.090988,0.613781,0.329271,1.433473,0.560549,-0.153129,-0.580917,-1.053341,0.052376,-0.34375,1.419617,0.731421,0.625508,0.882422,-0.762834,-0.411799,0.003922,-1.166005,-0.027614,0.316082,-0.358464,-0.382854,1.93014,-0.251959,0.190719,0.46801,0.38275,0.392147,-0.921297,0.853766,0.890511,0.992793,0.780575,0.511955,0.541127,-0.39919,0.485569,-0.989308,-0.412728
2,4,Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce,frozen,frozen meals,0.632169,-0.042714,0.447641,0.565454,-0.173308,-1.259952,-0.374098,0.565182,-0.501129,-0.326884,-1.161234,0.58826,1.618659,0.294504,0.15479,0.866748,-1.177649,-0.134442,-0.567716,-0.921026,0.309276,-0.598171,2.971751,-0.787572,-0.64975,-0.231039,1.806901,1.559718,0.50179,0.281565,0.014173,0.633412,0.325695,0.359038,-0.0701,0.124627,-0.962404,-0.134815,-0.145717,1.183958,0.241399,0.964502,1.174005,1.029604,0.15526,-0.624118,-0.044555,0.265823,1.074033,-1.058899,-0.206956,0.068395,1.224924,0.802075,-0.673816,2.257773,-1.068509,0.063531,1.522198,0.739853,1.355033,0.423484,0.501739,0.359911,0.319578,-0.454862,0.050564,0.328704,0.205272,-0.856989,0.670041,1.876261,1.597733,1.334862,-0.317515,-0.957292,-1.040504,-0.442202,-0.018599,0.2461,0.799682,-0.482413,-1.251701,-1.626282,-0.359774,0.296589,-0.049335,0.718455,1.660907,-0.985967,2.556258,-0.344786,-0.415751,-1.505674,-0.344438,0.000373,-0.931319,2.007142,-0.076104,-0.533512
3,10,Sparkling Orange Juice & Prickly Pear Beverage,beverages,water seltzer sparkling water,-0.253488,0.374834,-1.06959,0.956164,-0.311371,1.789404,0.26726,0.387516,-0.654127,1.027248,1.398794,0.410207,0.221405,1.188907,0.842219,0.270739,-1.056333,-1.88476,1.011292,0.223562,-0.034032,-0.055429,0.418091,0.480409,1.238376,0.885619,0.360691,3.544943,-0.051665,-0.017406,-0.987518,0.134707,-0.523529,0.258269,-0.30471,-0.891016,-0.382807,-0.938065,-0.871738,-0.025178,-0.21168,-0.965195,-0.190047,-1.046221,0.048076,-0.790697,1.053874,1.348278,1.777241,-0.112067,0.000398,-0.285349,0.411035,-1.454198,-0.068388,-0.343851,0.311053,-0.464327,-0.959483,1.488464,-1.142036,-1.981344,0.644177,-0.701146,0.385863,0.544416,-0.302204,0.667317,0.317814,-0.475939,0.729246,0.753712,0.588491,-0.187501,0.174661,-0.5897,1.184333,-0.253011,-1.038517,0.01551,-0.417327,-0.130255,0.523872,-0.082021,0.122164,0.55126,-0.024936,-0.428157,1.421382,0.789992,1.751679,-0.564681,-0.353809,0.96465,-0.701595,0.662892,-0.655238,0.252089,0.170364,-0.982335
4,12,Chocolate Fudge Layer Cake,frozen,frozen dessert,1.327038,1.344844,-0.715764,-0.140443,1.049643,-1.912242,0.343646,0.159571,-0.79021,0.403136,-0.281968,0.412603,0.347124,1.009087,0.802364,-0.692366,1.123618,-0.177917,0.107383,-1.384778,0.550639,-0.357951,-0.501989,0.804199,0.894451,-1.266077,-1.041235,-0.659717,-0.556226,-0.151734,-0.490867,-0.322606,0.701626,-0.448401,0.353176,-0.590934,-0.581699,-0.513539,0.366323,1.075075,0.502353,-0.451745,-0.928443,0.407792,0.541153,-1.002005,0.241243,-0.807081,1.255953,-0.177165,-0.706863,-0.288796,-0.049234,0.116173,-0.272984,0.631281,-0.386951,-2.198615,-0.302028,-0.46028,0.325847,0.030885,-2.105733,-0.270276,1.108249,0.379686,-0.683982,0.882951,0.176824,-0.823371,0.993661,-0.008259,0.203629,1.187102,0.553098,-0.10743,0.448566,0.580543,-0.688241,1.708184,0.44948,-1.134583,1.173021,-0.158643,0.477433,0.029551,0.278665,-0.013011,-1.108271,0.093632,0.029749,0.551239,-0.421093,-1.260726,0.405004,-0.380647,0.492613,0.691278,0.682424,-0.482249
5,23,Organic Turkey Burgers,meat seafood,packaged poultry,1.281689,1.048959,-0.087085,-0.259186,-0.41775,0.409531,-0.595253,0.745968,-0.450151,-0.047111,0.056733,1.116437,0.861204,-0.395207,-0.584873,-0.651139,-0.617091,1.195289,0.56252,0.331357,0.763006,0.769858,0.069841,0.29765,0.836339,-0.241667,0.974203,-0.348714,-0.601945,0.630662,0.56235,-1.417201,0.23381,-0.579337,1.458099,-0.150728,-0.004849,0.582879,-0.113335,0.627711,-1.29425,1.653909,-0.254007,0.560506,-0.912542,-0.60036,-1.010657,1.168578,-0.796146,-0.259093,0.277253,-0.89189,0.277,-0.817577,0.596441,-0.051523,-0.921772,-0.592638,1.776701,0.159778,0.066023,0.366919,0.43034,-0.610881,-0.48897,0.391611,-0.859604,-0.577904,0.62155,-0.338586,-0.246057,-1.836237,-0.957246,0.825464,0.366084,-0.917018,0.076385,-1.110705,0.380992,1.050979,-0.236705,0.103676,-0.513434,-0.679433,0.222142,0.004132,-0.127225,-0.501494,0.646623,0.50386,0.890724,-0.595622,-0.338392,0.451462,1.401664,-0.631341,-1.221392,0.302681,-0.342512,-0.041452
6,25,Salted Caramel Lean Protein & Fiber Bar,snacks,energy granola bars,0.704936,0.527594,-0.475766,0.067871,0.984812,1.075257,-1.00641,-0.84898,0.164817,-0.694122,-0.858648,0.779288,3.092391,1.050593,-0.034361,0.696025,-1.684608,0.124925,-1.535254,-0.288634,0.666161,-0.630741,0.411031,0.133784,-0.341806,-2.152892,-0.865416,-0.224923,0.07162,-0.099577,-0.056801,0.544065,2.118198,-0.337014,-0.159262,0.275371,0.672277,1.183733,0.063948,0.093446,0.352131,-1.317648,-0.495123,0.29075,0.283799,-0.911738,0.052114,-1.439304,-1.440612,-1.647701,-1.04306,-0.065286,-1.257465,0.521029,0.712134,-0.327804,-0.797467,1.466682,0.300995,1.538655,0.157716,0.329096,0.350415,-0.46552,0.136987,1.000664,1.11993,-2.255718,0.922869,-1.994733,1.333807,1.17683,-0.992273,-0.330782,-0.411566,-1.977853,0.125064,0.05945,-0.455411,0.003345,1.619707,0.247498,0.861991,0.500372,1.801972,2.886704,0.790417,-1.375822,-1.843633,-0.772261,0.662342,-0.105223,-0.373791,0.486574,0.021705,0.158155,0.534222,0.6782,-0.169715,0.545667
7,26,Fancy Feast Trout Feast Flaked Wet Cat Food,pets,cat food care,-0.25993,0.479054,0.024661,-0.974695,0.613846,-0.443279,1.217526,-0.139749,0.159328,-0.975453,2.365123,-0.199709,2.725972,-1.561348,0.871912,-1.512866,-0.64549,0.299155,0.716994,1.545311,0.166697,-1.288145,0.09195,1.136037,-0.337469,-1.059111,-0.069138,0.499691,-0.078959,2.206135,0.659091,-0.456524,0.315582,-2.157763,0.435822,0.659313,0.482266,-0.927864,-0.003133,0.881113,-1.201548,-1.370942,2.1467,-0.676909,1.899045,1.216807,-0.021967,0.233578,1.347754,0.147646,-1.287271,1.272144,0.970496,-0.846075,-1.04565,1.422048,1.107175,-1.341389,0.613931,0.832464,-0.10832,1.837682,0.063495,-0.459702,1.215817,-1.332855,0.222142,0.482275,-0.245079,-0.826059,0.196069,1.595432,0.019567,0.848055,1.254083,-0.293945,-0.856024,0.362408,-1.621047,-1.857358,0.807549,-0.243153,2.209505,3.598671,0.766132,0.634362,0.015468,0.892359,-0.313393,-0.777657,-0.091984,0.608246,-0.604637,0.547842,2.109301,-0.441909,-3.178446,0.897954,-0.31346,1.518307
8,28,Wheat Chex Cereal,breakfast,cereal,1.551934,-0.063192,-0.126314,0.054126,0.358737,0.194197,0.906389,-0.212437,0.634078,1.009118,-0.537664,0.097468,-1.044648,-1.020048,-0.417367,0.381773,-0.746397,-0.231332,0.096689,0.362068,-0.743048,-1.272645,-0.303648,-0.632498,-0.216862,0.003694,1.527987,0.264572,-0.626162,-0.461936,2.192944,0.356213,-0.532494,-0.400109,0.382776,0.305221,-0.500394,0.495915,0.660836,-0.011022,-0.511125,0.236746,-0.917441,0.863931,0.456143,0.281974,-0.844871,0.272576,-0.276962,-0.586735,-1.545401,-0.888963,0.950916,-0.252548,-0.188981,-0.302334,1.324622,0.278279,1.409999,-0.299727,0.169171,-1.258821,-0.724491,0.393844,0.586699,-0.548479,0.298617,0.291076,-0.070926,0.54922,-0.177431,0.396786,-0.235213,-1.852426,-0.833704,-0.13567,-0.221717,0.793202,-0.122024,0.621324,0.93873,-0.354383,-0.082007,0.200955,1.726734,0.172192,0.423436,1.599597,-0.599566,-0.550859,0.551727,0.831676,0.88752,-1.81873,0.17402,-0.159686,-0.213589,-0.185836,1.071417,-0.431435
9,29,Fresh Cut Golden Sweet No Salt Added Whole Kernel Corn,canned goods,canned jarred vegetables,-0.099535,0.026925,0.092476,0.040248,0.219087,-0.41171,-0.085742,0.118383,0.354839,0.216073,-0.419646,0.034342,0.322212,-0.962055,0.497153,-1.257611,-0.352518,-1.605562,-0.774982,0.082881,0.671315,-0.919024,0.647485,-0.317767,0.762318,-0.529241,0.406091,-0.115517,0.162307,0.384497,-0.206539,-0.28806,0.029108,-0.197216,0.607624,-1.182837,-1.029887,-0.082268,0.226504,0.352733,-0.039552,-0.759861,-0.079501,1.087697,-0.711835,-0.78451,0.125496,0.870484,-1.085535,0.384175,-0.641595,-0.037602,0.701198,-0.254137,-0.837875,1.1407,1.041522,0.189193,0.11809,-0.194093,0.120151,0.595512,-0.175523,1.014627,-0.866555,0.73841,-0.175047,-0.672686,1.470878,0.179683,0.132148,0.016398,-0.031339,-0.717995,-0.359915,-0.58154,0.036269,0.42813,0.577695,0.297692,1.562186,-0.416386,0.191496,-0.393401,0.312084,0.145899,0.273367,0.32845,-0.346311,-0.187861,0.156926,-0.040625,-0.490074,-0.517148,-0.40848,1.033066,0.542465,0.262616,-0.649964,-0.491701


## 5.2. Fit t-sne

In [81]:
# =================================================
# fit t-sne
# =================================================
tsne = TSNE(n_components = 2, verbose = 1, perplexity = 35, n_iter = 400)

In [None]:
# fit
t0 = time.time()
tsne_fit = tsne.fit_transform(word_vec_df[w2v_vec_names])
t1 = time.time()

[t-SNE] Computing 106 nearest neighbors...
[t-SNE] Indexed 14340 samples in 0.242s...


In [139]:
# time taken
f"Time Taken: {t1 - t0}"

'Time Taken: 187.86533784866333'

## T-sne component data frame

In [140]:
#===============================================
# create t-sne data frame
#===============================================

tsne_df = word_vec_df[["product_name", "department", "aisle"]]

# extract t-sne dimensions
tsne_df["x_tsne"] = tsne_fit[:,0]
tsne_df["y_tsne"] = tsne_fit[:,1]
print(tsne_df.describe())

             x_tsne        y_tsne
count  14340.000000  14340.000000
mean   0.081110     -0.013953    
std    10.179037     7.783475    
min   -22.075830    -19.978592   
25%   -7.855527     -5.532616    
50%   -0.820201     -0.028690    
75%    8.695611      5.579124    
max    22.841021     21.091007   


## Plot with ggplot

In [141]:
#===============================================
# subset data for plot
#===============================================

# select only top departments
select_dept = ["produce", "babies", "beverages"]
tsne_plot_df = tsne_df.loc[tsne_df["department"].isin(select_dept), :]
print(tsne_plot_df.shape)

(2726, 5)


In [None]:
tsne_plot = ggplot(tsne_plot_df, aes(x = "x_tsne", y = "y_tsne", color = "department") ) \
        + geom_point(size = 70, alpha = 0.5) \
        + ggtitle("T-sne on product vectors") \
        + xlab(" ") + ylab(" ")
tsne_plot

---

## In-class exercise

## Learning embeddings using Skip-gram

- We will now use another method to train embeddings called skip-gram

In [None]:
#===============================================
# model parameters
#===============================================

# size of embedding matrix
emb_size  # set between 50-100

# context window size
cxt_window # set between 2-10 

# batch size for gradient update
batch_size # set between 2000 to 10000

# learning rate
lr # set between 0.001 to 0.01

In [None]:
#===============================================
# define and train model
#===============================================

t0 = time.time()
### Write model code here
t1 = time.time()

In [None]:
# time taken
f"Time Taken: {t1 - t0}"

In [None]:
# find the most similar products to 
prod_id = "1000"

# product info for sample product
display(products.loc[products["product_id"].isin([prod_id]), :])