# Text Representation

In [2]:
# Load data generated in Session 1 or the provided data splits (see Absalon, W7 Lab)
import pandas as pd

df_train = pd.read_pickle("train_dataframe.pkl")
df_test = pd.read_pickle("test_dataframe.pkl")

# In this session, we will also need to load the metadata file (see Absalon, W9 Lab)
meta_file = 'meta_All_Beauty.json'

# Exercise 1 {-}

Load the [metadata file](https://absalon.ku.dk/courses/80396/files/9386857?module_item_id=2657111) from Absalon and discard any item that was not rated by our subset of users (not in training or test sets). You can refer to the [original metadata file](https://nijianmo.github.io/amazon/index.html) if you want to look up more explanations about the columns of the metada file. Apply preprocessing in this order: lowercasing, tokenizing, stemming, and stopwords removal (including punctuation) to clean up the text from the `title`. Report the vocabulary size before and after the preprocessing. You may have to specify the language for these steps.

In [3]:
import os
import sys
sys.path.append('../')
import pickle
import pandas as pd

# Load the metadata (items)
item_metadata = pd.read_json("meta_All_Beauty.json", lines=True)
all_rated_items_set = set(df_train.asin.tolist() + df_test.asin.tolist())

# Discard items that weren't rated by our subset of users
filtered_item_metadata = item_metadata[item_metadata.asin.isin(all_rated_items_set)].drop_duplicates(['asin', 'title'])
item_titles = filtered_item_metadata.title.tolist()


In [4]:
filtered_item_metadata.shape

(84, 19)

In [5]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
import string

corpus =  " ".join([item_descr for item_descr in filtered_item_metadata.title.tolist()])
#lower_case_corpus = corpus.lower()
tokenized_corpus = word_tokenize(corpus, language="english")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danielpenchev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/danielpenchev/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielpenchev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
len(set(tokenized_corpus))

545

In [9]:
stemmer = PorterStemmer()
stemmed_tokenized_corpus = [stemmer.stem(token) for token in tokenized_corpus]
stops_words_english = set(stopwords.words('english')+ list(string.punctuation)) 
corpus_filtered_tokens = set([token for token in stemmed_tokenized_corpus if token not in stops_words_english])

In [10]:
len(corpus_filtered_tokens)

471

# Exercise 2

Representation in vector spaces.

## 2.1

Represent all the items from Exercise 1 in a TF-IDF space. Interpret the meaning of the TF-IDF matrix dimensions. Be careful with multiple instances of preprocessing in the process, as default settings for creating the TF-IDF space may include some.

Tip: You may use the library [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) 

In [None]:
def preprocessing(text):
    lower_case_corpus = text.lower()
    tokenized_corpus = word_tokenize(lower_case_corpus)
    stemmer = PorterStemmer()
    stemmed_tokenized_corpus = [stemmer.stem(token) for token in tokenized_corpus]
    stops_words_english = set(stopwords.words('english')) | set(string.punctuation)
    return " ".join([token for token in stemmed_tokenized_corpus if token not in stops_words_english])

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(vocabulary=corpus_filtered_tokens, tokenizer=lambda x: x.split(' '))
processed_titles = [preprocessing(title) for title in item_titles]
X = vectorizer.fit_transform(processed_titles)

# (84, 471) is interpreted as for each document we have entry for each word from our vocab. Each entry uses the tf-idf formula for the calculating of the significance of each word.

['aqua', 'velva', 'after', 'shave', ',', 'classic', 'ice', 'blue', ',', '7', 'ounce']
['citre', 'shine', 'moisture', 'burst', 'shampoo', '-', '16', 'fl', 'oz']
['nars', 'blush', ',', 'taj', 'mahal']
['avalon', 'organics', 'wrinkle', 'therapy', 'coq10', 'cleansing', 'milk', ',', '8.50', 'oz']
['zum', 'zum', 'bar', 'anise', 'lavender', ',', '3', 'ounce']
['yardley', 'by', 'yardley', 'of', 'london', 'unisexs', 'lay', 'it', 'on', 'thick', 'hand', '&', 'amp', ';', 'foot', 'cream', '5.3', 'oz']
['fruits', '&', 'amp', ';', 'passion', 'blue', 'refreshing', 'shower', 'gel', '-', '6.7', 'fl', '.', 'oz', '.']
['waterpik', 'ultra', 'water', 'flosser']
['aqua', 'velva', 'after', 'shave', ',', 'classic', 'ice', 'blue', ',', '3.5', 'ounce']
['waterpik', 'ultra', 'water', 'flosser']
['fresh', 'eau', 'de', 'parfum', ',', 'sugar', 'lemon', ',', '3.4', 'oz']
['crest', 'pro-health', 'multi-protection', 'rinse', ',', 'cool', 'wintergreen', ',', '33.8', 'fluid', 'ounce']
['philips', 'norelco', 'arcitec', '1



In [20]:
processed_titles[6]

'fruit amp passion blue refresh shower gel 6.7 fl oz'

In [21]:
X.shape

(84, 471)

## 2.2

Using the TF-IDF representation, compute the cosine similarity between products with asin `B000FI4S1E`, `B000LIBUBY` and `B000W0C07Y`. Take a look at their features to see whether results make sense with their characteristics. Round your final answer to 3 decimal places.

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

asin1 = 'B000FI4S1E' 
asin2 = 'B000LIBUBY'
asin3 = 'B000W0C07Y'

asin1_title = filtered_item_metadata[filtered_item_metadata.asin == asin1].title.tolist()[0]
asin2_title = filtered_item_metadata[filtered_item_metadata.asin == asin2].title.tolist()[0]
asin3_title = filtered_item_metadata[filtered_item_metadata.asin == asin3].title.tolist()[0]

asin1_title_index = item_titles.index(asin1_title)
asin2_title_index = item_titles.index(asin2_title)
asin3_title_index = item_titles.index(asin3_title)

cosine_similarity(X[[asin1_title_index,asin2_title_index,asin3_title_index]],X[[asin1_title_index,asin2_title_index,asin3_title_index]])


array([[1.        , 0.03075944, 0.02377955],
       [0.03075944, 1.        , 0.50060985],
       [0.02377955, 0.50060985, 1.        ]])

# Exercise 3

Representation in vector spaces with contextual Word Embeddings.

## 3.1.

Represent all the products from Exercise 1 in a vector space using embeddings from a pre-trained BERT model. The final embedding of a product should be the average of the word embeddings from all the words in the 'title'. Critically evaluate this procedure.

What is the vocabulary size of the model? What are the dimensions of the last hidden state?

Tip: you may install the transformers library and use their pretrained [BERT model uncased](https://huggingface.co/bert-base-uncased).

In [None]:
#Uncomment and run the following line to install the transformers library
# ! pip install transformers

In [24]:
# LOAD TRANSFORMER
"""
If you plan on using a pretrained model, it’s important to use the associated 
pretrained tokenizer: it will split the text you give it in tokens the same way
for the pretraining corpus, and it will use the same correspondence
token to index (that we usually call a vocab) as during pretraining.
"""

# % pip install transformers
import torch
import transformers
assert transformers.__version__ > '4.0.0'

from transformers import BertModel, BertTokenizerFast

# set-up environment
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")


modelname = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(modelname)
model = BertModel.from_pretrained(modelname).to(DEVICE)

# Print out the vocabulary size
print(tokenizer.vocab_size)
print(f"Max input size: {tokenizer.model_max_length}")
print(model.config.hidden_size)
# YOUR CODE HERE

  Referenced from: <0B7EB158-53DC-3403-8A49-22178CAB4612> /opt/anaconda3/envs/recsys/lib/python3.10/site-packages/torchvision/image.so
  warn(


Using device: cpu
30522
Max input size: 512
768


In [29]:
# Represent products in a vector space
"""
When using pre-trained models, it is always advised to feed it data similar to what it was trained with. 
Basically, it doesn't hurt to keep all the words in.
However, the effect (or the lack of it) will vary based on corpus and task. 
Decision here: keep them all since pretraining was done that way.
"""

def batch_encoding(sentences):
    # Since we're using padding, we need to provide the attention masks to our
    # model. Otherwise it doesn't know which tokens it should not attend to. 
    inputs = tokenizer(sentences, padding=True, return_tensors='pt') # Make the tokenizer return encoded sequences with padding in pytorch tensor format
    print(inputs[0].attention_mask) # Look at the padding and attention_mask
    print(inputs[0].ids) # Look at the padding and attention_mask


    outputs = model(**inputs)

    last_hidden_states = outputs.last_hidden_state

    return inputs, inputs["attention_mask"], last_hidden_states
  
encoded_inputs, attention_masks, title_last_hidden_states = batch_encoding(
                                                          filtered_item_metadata.title.tolist()  
                                                            )
# Note that the control token [CLS] has been added at the beginning of each sentence,
# and [SEP] at the end

print(f"last_hidden_states: {title_last_hidden_states.shape}")
# Let's mask out the padding tokens 

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[101, 28319, 2310, 22144, 2044, 27545, 1010, 4438, 3256, 2630, 1010, 1021, 19471, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
last_hidden_states: torch.Size([84, 52, 768])


In [30]:
attention_masks.shape

torch.Size([84, 52])

In [26]:
title_last_hidden_states

tensor([[[-0.5125, -0.6032, -0.0640,  ..., -0.0243, -0.5474,  0.0653],
         [ 0.6216, -0.2028,  0.0255,  ..., -0.1496,  0.1825,  0.6196],
         [-0.1194, -1.0340,  0.9036,  ...,  0.2274, -0.3123,  0.1688],
         ...,
         [ 0.3163, -0.2145,  0.4271,  ..., -0.0255, -0.2990,  0.1332],
         [ 0.5335, -0.2718,  0.4228,  ..., -0.2108, -0.2156, -0.0760],
         [ 0.2494, -0.0511,  0.2910,  ..., -0.0092, -0.2357,  0.0571]],

        [[-0.8001, -0.3167,  0.2363,  ..., -0.3909, -0.4204,  0.2607],
         [ 0.5632, -0.2454,  0.4371,  ...,  0.1749,  0.0928,  0.3471],
         [-0.0806, -0.2830,  0.6450,  ...,  0.3591, -0.4115, -0.0125],
         ...,
         [-0.2381,  0.2319,  0.3110,  ...,  0.2153, -0.3523,  0.3913],
         [ 0.1254,  0.3215,  0.1888,  ...,  0.2091, -0.0579,  0.3087],
         [-0.0851,  0.3480,  0.2515,  ...,  0.1133, -0.2591,  0.1225]],

        [[-0.5316, -0.2055, -0.1992,  ..., -0.4426,  0.3019,  0.4350],
         [ 0.2200, -0.7475,  0.0666,  ..., -1

In [28]:
title_last_hidden_states

torch.Size([84, 52, 768])

In [34]:
avg_embeddings = torch.sum(title_last_hidden_states * attention_masks.unsqueeze(-1), dim=1)
sentence_embeddings = (avg_embeddings / attention_masks.sum(dim=1, keepdim=True)).detach().numpy()

## 3.2.

Using the representation obtained from Exercise 3.1., compute the cosine similarity between items with asin `B000FI4S1E`, `B000LIBUBY` and `B000W0C07Y`.
Round your final answer to 3 decimal places.

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

asin1 = 'B000FI4S1E' 
asin2 = 'B000LIBUBY'
asin3 ='B000W0C07Y'

asin1_title = filtered_item_metadata[filtered_item_metadata.asin == asin1].title.tolist()[0]
asin2_title = filtered_item_metadata[filtered_item_metadata.asin == asin2].title.tolist()[0]
asin3_title = filtered_item_metadata[filtered_item_metadata.asin == asin3].title.tolist()[0]

asin1_title_index = item_titles.index(asin1_title)
asin2_title_index = item_titles.index(asin2_title)
asin3_title_index = item_titles.index(asin3_title)

cosine_similarity(sentence_embeddings[[asin1_title_index,asin2_title_index,asin3_title_index]],sentence_embeddings[[asin1_title_index,asin2_title_index,asin3_title_index]])

array([[0.9999997 , 0.73359346, 0.6593505 ],
       [0.73359346, 1.0000006 , 0.7475123 ],
       [0.6593505 , 0.7475123 , 1.0000002 ]], dtype=float32)