# Text Representation

Please, note that this notebook is intended to be run in Google Colab.

In [1]:
# Mount drive and define path to the data folder (from your Google Drive)
# from google.colab import drive
# drive.mount('/content/drive')

# datapath = 'drive/MyDrive/data/amazon_reviews/All_Beauty/'
train_file = 'training.pkl'
test_file = 'test.pkl'
meta_file = 'meta_All_Beauty.json'

# Exercise 1

Load the [metadata file](https://nijianmo.github.io/amazon/index.html) and discard any item that was not rated by our subset of users (nor in training or test sets). Apply preprocessing (stemming and stopwords removal) to clean up the text from the "title". Report the vocabulary size before and after the preprocessing.

In [3]:
import os
import sys
sys.path.append('../')
import pickle
import pandas as pd

# Load TRAIN and TEST sets 
test_data = pickle.load( open( "test.pkl", "rb" ) )
train_data = pickle.load( open( "training.pkl", "rb" ) )

# Load the METADATA (ITEMS)
df = pd.read_json('meta_All_Beauty.json', lines=True)

# Discard duplicates
df = df.sort_values(by=['asin'])
cleaned_dataset = df.drop_duplicates(subset=['asin'], keep = 'last').reset_index(drop=True)

# Discard items that weren't rated by our subset of users
item_in_subset = list(test_data.loc[:,'asin'])+list(train_data.loc[:,'asin'])
# print(list(item_in_subset))
cleaned_dataset = cleaned_dataset.loc[cleaned_dataset['asin'].isin(item_in_subset)]
cleaned_dataset = cleaned_dataset.drop_duplicates(subset=['asin'], keep = 'last')


In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

[nltk_data] Downloading package punkt to /Users/lwk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lwk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
def isfloat(input:str):
    try:
        float(input)
    except ValueError:
        return False
    else:
        return True

# <YOUR CODE HERE>
porter_stemmer = PorterStemmer()
len_words = 0
len_filter_words = 0
title_list = []
temp_list = []
for title in cleaned_dataset['title']:
    # print(title)
    word_list = [word for word in word_tokenize(title)]
    temp_list += word_list
    # temp_list.append(word_list)
len_words = len(set(temp_list))
temp_list_ = []
# print(temp_list.count('3.5'))
for title in cleaned_dataset['title']:
    filter_list = [porter_stemmer.stem(word.lower()) for word in word_tokenize(title) if word not in stopwords.words("english") and word.isalpha()]
    len_filter_words += len(filter_list)
    temp_list_ += filter_list
    title_list.append(TreebankWordDetokenizer().detokenize(filter_list))
    # title_list.append(" ".join(filter))
print(len_words)
print(len_filter_words)
print(len(set(temp_list_)))

545
712
402


# Exercise 2

Representation in vector spaces.

## 2.1

Represent all the products from Exercise 1 in a TF-IDF space. Interpret the meaning of the TF-IDF matrix dimensions.

Tip: You may use the library [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) 

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vectorizer = TfidfVectorizer()

# <YOUR CODE HERE>
X = tfidf_vectorizer.fit_transform(title_list)
# print(tfidf_vectorizer.get_feature_names_out())
print(X.shape)

(84, 398)


## 2.2

Compute and the cosine similarity between products with asin 'B000FI4S1E', 'B000LIBUBY' and 'B000W0C07Y'. Take a look at their features to see whether results make sense with their characteristics. 

In [7]:
import numpy as np

In [8]:
def cos_sim(np_arrays):
    output = np.zeros((len(np_arrays),len(np_arrays)))
    for i,np_array_i  in enumerate(np_arrays):
        for j,np_array_j  in enumerate(np_arrays):
            output[i,j] = (np_array_i.T.dot(np_array_j))/(np.linalg.norm(np_array_i)*np.linalg.norm(np_array_j))
    return output

In [9]:
vector_np = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names()).to_numpy()
vector_list = []
item_list = ['B000FI4S1E', 'B000LIBUBY', 'B000W0C07Y']
for i, item in enumerate(cleaned_dataset['asin']):
    if item in item_list:
        vector_list.append(vector_np[i,:])
        print(i)

6
10
17




In [10]:
out = cos_sim(vector_list)
print(out)

[[1.         0.03382796 0.02339948]
 [0.03382796 1.         0.40755955]
 [0.02339948 0.40755955 1.        ]]


In [11]:
out = pd.DataFrame(cosine_similarity(vector_list))
print(out)

          0         1         2
0  1.000000  0.033828  0.023399
1  0.033828  1.000000  0.407560
2  0.023399  0.407560  1.000000


# Exercise 3

Representation in vector spaces with contextual Word Embeddings.

## 3.1.

Represent all the products from Exercise 1 in a vector space using embeddings from a pre-trained BERT model. The final embedding of a product should be the average of the word embeddings from all the words in the 'title'. What is the vocabulary size of the model? What are the dimensions of the last hidden state?

Tip: you may install the transformers library and use their pretrained [BERT model uncased](https://huggingface.co/bert-base-uncased).

In [12]:
# LOAD TRANSFORMER
"""
If you plan on using a pretrained model, it’s important to use the associated 
pretrained tokenizer: it will split the text you give it in tokens the same way
for the pretraining corpus, and it will use the same correspondence
token to index (that we usually call a vocab) as during pretraining.
"""

# % pip install transformers
import torch
import transformers
assert transformers.__version__ > '4.0.0'

from transformers import BertModel, BertTokenizerFast, BertConfig

# set-up environment
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")


modelname = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(modelname)
model = BertModel.from_pretrained(modelname).to(DEVICE)

# Print out the vocabulary size
# <YOUR CODE HERE>

Using device: cpu


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
text = list(cleaned_dataset['title'])
# encoded_input = tokenizer(text[:4], max_length=100,
#                           add_special_tokens=True, truncation=True,
#                           padding=True, return_tensors="pt")
encoded_input = tokenizer(text, padding=True, return_tensors="pt")
output = model(**encoded_input)
c = model(**encoded_input)
#last_hidden_state, pooler_output = output[0], output[1]

In [15]:
print(output[0].shape)

torch.Size([84, 52, 768])


In [17]:
print(BertConfig().vocab_size)
print(BertConfig().hidden_size)

30522
768


In [16]:
# REPRESENT PRODUCTS IN A VECTOR SPACE


def batch_encoding(sentences):
    # Since we're using padding, we need to provide the attention masks to our
    # model. Otherwise it doesn't know which tokens it should not attend to. 
    inputs = tokenizer(sentences,padding=True,return_tensors="pt",return_attention_mask=True)
    # print(inputs) # Look at the padding and attention_mask

    outputs = model(**inputs)

    last_hidden_states = outputs[0]

    return inputs, last_hidden_states
  
encoded_inputs, title_last_hidden_states = batch_encoding(list(cleaned_dataset['title']))

"""
Note that the control token [CLS] has been added 
at the beginning of each sentence, and [SEP] at the end. 
"""

# Now, let's mask out the padding tokens and compute the embedding vector of each product

# <YOUR CODE HERE>

'\nNote that the control token [CLS] has been added \nat the beginning of each sentence, and [SEP] at the end. \n'

## 3.2.

Compute and the cosine similarity between products with asin 'B000FI4S1E', 'B000LIBUBY' and 'B000W0C07Y'.

In [19]:
import numpy as np
item_list = ['B000FI4S1E', 'B000LIBUBY', 'B000W0C07Y']
mask_list = []
vector_list = []
return_list = []
for i, item in enumerate(cleaned_dataset['asin']):
    if item in item_list:
        mask_list.append(encoded_inputs["attention_mask"][i])
        vector_list.append(title_last_hidden_states[i])
        return_list.append(encoded_inputs["attention_mask"][i].detach().numpy().dot(
            title_last_hidden_states[i].detach().numpy())/np.sum(encoded_inputs["attention_mask"][i].detach().numpy()))


In [22]:
print(cosine_similarity(return_list))

[[1.         0.73359339 0.65935059]
 [0.73359339 1.         0.74751203]
 [0.65935059 0.74751203 1.        ]]
