In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
import random
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
import string



  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lucasduport/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    x = torch.ones(1, device=device)
    print(x)
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")

tensor([1.], device='mps:0')


In [3]:
ds = load_dataset("jniimi/tripadvisor-review-rating")
raw_data = pd.DataFrame(ds['train'])

display(raw_data.head())

# Remove the System Prompt
df = raw_data.drop(columns=['stay_year', 'post_date', 'freq', 'lang'])

# Drop the rows with missing User Prompt
df = df.dropna()

# Drop the duplicates
df = df.drop_duplicates()

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

# FIXME: The dataset is not balanced

# FIXME: Remove this later, sampling only 10% of the dataset while developing
df = df.sample(frac=0.001).reset_index(drop=True)
print("Number of samples:", len(df))


Unnamed: 0,hotel_id,user_id,title,text,overall,cleanliness,value,location,rooms,sleep_quality,stay_year,post_date,freq,review,char,lang
0,127781101,2262DCBFC351F42A9DD30AC8BAD24686,Really excellent Hilton,Stayed here on business trips and the hotel is...,5.0,4.0,5.0,4.0,5.0,4.0,2012,2012-04-13,1,Really excellent Hilton\nStayed here on busine...,204,__label__en
1,137380592,8477E11DABF4D6743885E401BB4C8CCF,Exceptional service and comfort,Spent two nights here for a wedding in Brookly...,5.0,5.0,4.0,5.0,4.0,5.0,2012,2012-08-16,1,Exceptional service and comfort\nSpent two nig...,621,__label__en
2,129673371,483A193B7113ADFFD5CE30849564F69C,Nice room and five star service,Great place for a 3-night stay. Our king room ...,5.0,5.0,5.0,3.0,5.0,4.0,2012,2012-05-09,1,Nice room and five star service\nGreat place f...,1259,__label__en
3,129006626,E5A63DD7239A7057746D4644A5C986EB,"BRILLIANT hotel, my #1 Chicago pick for busine...","This is my favorite hotel in Chicago, and I've...",5.0,5.0,5.0,5.0,5.0,5.0,2012,2012-04-28,1,"BRILLIANT hotel, my #1 Chicago pick for busine...",2242,__label__en
4,139168159,CBFE281C9386225267BC52518836A6C2,Convenient and comfortable,BEST. BREAKFAST. EVER. Couldn't have been happ...,5.0,5.0,4.0,5.0,4.0,5.0,2012,2012-09-02,1,Convenient and comfortable\nBEST. BREAKFAST. E...,511,__label__en


Number of samples: 201


In [4]:
reviews = list(df['text'])
tokens = [word_tokenize(sentence.lower()) for sentence in reviews]

In [5]:
# Load stopwords
stopwords_list = set(stopwords.words('english'))  # Use set for faster lookups

# Remove stopwords function
def remove_stopwords(tokens):
    return [[word for word in sentence if word.lower() not in stopwords_list] for sentence in tokens]

# Punctuation removal function with sentence boundaries
def remove_punct(words):
    sentence = []
    processed = []
    
    for word in words:
        if word in ".!?":  # Sentence-ending punctuation
            if sentence:  # Avoid empty sentences
                processed.append(["[START]"] + sentence + ["[END]"])
                sentence = []  # Reset for next sentence
        elif word not in string.punctuation:  # Remove other punctuation
            sentence.append(word)
    
    if sentence:  # Edge case: last sentence without ending punctuation
        processed.append(["[START]"] + sentence + ["[END]"])

    return processed

reviews = remove_stopwords(tokens)
print(reviews[0:5])

reviews = [remove_punct(review) for review in reviews]
# Flatten the list of lists
reviews = [sentence for review in reviews for sentence in review]
# Remove empty sentences
reviews = [sentence for sentence in reviews if sentence]  # Remove empty sentences
print(reviews[0:5])


[['stayed', 'week', ',', 'part', 'holiday', 'uk', '..', 'really', 'impressed', 'hotel', ',', 'thought', 'fantastic', 'value', 'money', '.', 'room', 'king', 'bed', 'modern', 'building', ',', 'spacious', 'well-decorated', ',', 'huge', 'comfortable', 'bed', '.', 'large', 'studio', 'room', ',', 'separate', 'seating', 'area', 'kitchenette', '.', 'kitchenette', 'fridge', ',', 'microwave', 'coffee-maker', ',', "n't", 'planning', 'cook', '.', "'s", 'kettle', ',', 'bring', 'travel', 'one', 'want', 'make', 'tea', '.', 'thought', 'free', 'breakfast', 'excellent', ',', 'wide', 'choice', 'hot', 'cold', 'food', ',', 'fruit', ',', 'yoghurts', 'etc', '..', 'get', 'busy', 'breakfast', 'room', 'weekends', ',', "'s", 'expected', '.', "'ll", 'try', 'repeat', 'reviewers', 'said', ',', 'pick', 'things', '.', 'one', 'side', 'hotel', 'overlooks', 'square', ',', 'attractive', ',', 'trolleys', ',', 'great', 'inevitably', 'noisy', '.', 'outside', 'noise', 'really', 'bothers', ',', 'suggest', 'ringing', 'advance'

In [6]:
# Flatten review in a single list
flattened_reviews = [word for review in reviews for word in review]
print(flattened_reviews[0:5])

['[START]', 'stayed', 'week', 'part', 'holiday']


# Word 2 Vec Implemtation from scratch with pytorch

## Params

In [None]:

CONTEXT_SIZE = 3  # Window size
EMBEDDING_DIM = 100  # Dimension of word vectors
EPOCHS = 10
LEARNING_RATE = 0.01

### Build Vocabulary

In [8]:
word_counts = Counter(flattened_reviews)

vocab = {word: i for i, word in enumerate(word_counts.keys())}
rev_vocab = {i: word for word, i in vocab.items()}
vocab_size = len(vocab)

### Prepare Skip-gram Training Data

### Define Skip-gram Model

In [9]:
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_layer = nn.Linear(embedding_dim, vocab_size)
    
    def forward(self, target):
        embed = self.embeddings(target)
        out = self.out_layer(embed)
        return out

### Train

In [10]:
flattened_reviews

['[START]',
 'stayed',
 'week',
 'part',
 'holiday',
 'uk',
 '..',
 'really',
 'impressed',
 'hotel',
 'thought',
 'fantastic',
 'value',
 'money',
 '[END]',
 '[START]',
 'room',
 'king',
 'bed',
 'modern',
 'building',
 'spacious',
 'well-decorated',
 'huge',
 'comfortable',
 'bed',
 '[END]',
 '[START]',
 'large',
 'studio',
 'room',
 'separate',
 'seating',
 'area',
 'kitchenette',
 '[END]',
 '[START]',
 'kitchenette',
 'fridge',
 'microwave',
 'coffee-maker',
 "n't",
 'planning',
 'cook',
 '[END]',
 '[START]',
 "'s",
 'kettle',
 'bring',
 'travel',
 'one',
 'want',
 'make',
 'tea',
 '[END]',
 '[START]',
 'thought',
 'free',
 'breakfast',
 'excellent',
 'wide',
 'choice',
 'hot',
 'cold',
 'food',
 'fruit',
 'yoghurts',
 'etc',
 '..',
 'get',
 'busy',
 'breakfast',
 'room',
 'weekends',
 "'s",
 'expected',
 '[END]',
 '[START]',
 "'ll",
 'try',
 'repeat',
 'reviewers',
 'said',
 'pick',
 'things',
 '[END]',
 '[START]',
 'one',
 'side',
 'hotel',
 'overlooks',
 'square',
 'attractive',

In [11]:
from tqdm import tqdm

model = Word2Vec(vocab_size, EMBEDDING_DIM).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    total_loss = 0
    progress_bar = tqdm(range(CONTEXT_SIZE, len(flattened_reviews) - CONTEXT_SIZE), desc=f"Epoch {epoch + 1}")
    for i in progress_bar:
        target = vocab[flattened_reviews[i]]
        context = [vocab[flattened_reviews[i - j]] for j in range(1, CONTEXT_SIZE + 1)] + \
                  [vocab[flattened_reviews[i + j]] for j in range(1, CONTEXT_SIZE + 1)]
                          
        target_tensor = torch.tensor([target], dtype=torch.long, device=device)
        context_tensor = torch.tensor(context, dtype=torch.long, device=device)
        
        optimizer.zero_grad()
        output = model(target_tensor)
        loss = 0
        for context_word in context_tensor:
            loss += criterion(output, context_word.unsqueeze(0))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    
    print(f"Epoch {epoch + 1}, Total Loss: {total_loss:.4f}")

Epoch 1: 100%|██████████| 21459/21459 [01:28<00:00, 243.23it/s, loss=38.3]


Epoch 1, Total Loss: 1175360.5721


Epoch 2: 100%|██████████| 21459/21459 [01:32<00:00, 233.18it/s, loss=38]  


Epoch 2, Total Loss: 1063005.9597


Epoch 3: 100%|██████████| 21459/21459 [01:29<00:00, 239.77it/s, loss=37.8]


Epoch 3, Total Loss: 1029614.0552


Epoch 4: 100%|██████████| 21459/21459 [01:31<00:00, 234.28it/s, loss=38.7]


Epoch 4, Total Loss: 1013291.5632


Epoch 5: 100%|██████████| 21459/21459 [01:30<00:00, 236.18it/s, loss=38.8]


Epoch 5, Total Loss: 1002871.4357


Epoch 6: 100%|██████████| 21459/21459 [01:30<00:00, 236.74it/s, loss=38.4]


Epoch 6, Total Loss: 998209.5808


Epoch 7: 100%|██████████| 21459/21459 [01:32<00:00, 232.43it/s, loss=36.7]


Epoch 7, Total Loss: 994583.1732


Epoch 8: 100%|██████████| 21459/21459 [01:32<00:00, 231.69it/s, loss=37.9]


Epoch 8, Total Loss: 992004.4136


Epoch 9: 100%|██████████| 21459/21459 [01:30<00:00, 236.40it/s, loss=37.7]


Epoch 9, Total Loss: 991881.4691


Epoch 10: 100%|██████████| 21459/21459 [01:30<00:00, 236.57it/s, loss=37.8]

Epoch 10, Total Loss: 990362.5364





## Save the model

In [12]:
# Save the model
torch.save(model.state_dict(), "word2vec_model.pth")

# Save the vocabulary
with open("vocab.txt", "w") as f:
    for word, index in vocab.items():
        f.write(f"{word}\t{index}\n")

## Load the model

In [None]:
# # Load the vocabulary
# vocab = {}
# with open("vocab.txt", "r") as f:
#     for line in f:
#         word, index = line.strip().split("\t")
#         vocab[word] = int(index)
        
# # Load the model
# model.load_state_dict(torch.load("word2vec_model.pth"))
# model.eval()

## Test the embeddings

In [None]:
from scipy.spatial.distance import cosine

# Function to get the word vector
def get_word_vector(word):
    if word in vocab:
        index = vocab[word]
        return model.embeddings(torch.tensor(index, device=device)).detach().cpu().numpy()
    else:
        print(f"Word '{word}' not found in vocabulary.")
        return None

# Function to perform word arithmetic
def word_arithmetic(word1, operator, word2):
    vec1 = get_word_vector(word1)
    vec2 = get_word_vector(word2)

    if vec1 is not None and vec2 is not None:
        if operator == "+":
            result = vec1 + vec2
        elif operator == "-":
            result = vec1 - vec2
        else:
            print(f"Unsupported operator '{operator}'. Use '+' or '-'.")
            return None
        return result
    else:
        return None

# Function to test operations
def test_operations(test):
    results = []
    for i in range(0, len(test) - 2, 3):
        word1, operator, word2 = test[i], test[i + 1], test[i + 2]
        result_vector = word_arithmetic(word1, operator, word2)
        if result_vector is not None:
            results.append((result_vector, {word1, word2}))
        else:
            print(f"Could not perform operation on '{word1} {operator} {word2}'.")
    return results

# Function to find the closest word
def find_closest_word(vector, excluded_words):
    closest_word = None
    closest_distance = float('inf')

    for word, index in vocab.items():
        if word not in excluded_words:
            word_vector = model.embeddings(torch.tensor(index, device=device)).detach().cpu().numpy()
            distance = cosine(vector, word_vector)
            if distance < closest_distance:
                closest_distance = distance
                closest_word = word

    return closest_word, closest_distance

# Test the operations
test = ["cheap", "+", "bad"]
results = test_operations(test)

# Find the closest word for each result
for result_vector, excluded_words in results:
    closest_word, closest_distance = find_closest_word(result_vector, excluded_words)
    print(f"The closest word to the result vector is '{closest_word}' with a cosine distance of {closest_distance:.4f}")

The closest word to the result vector is 'teenage' with a cosine distance of 0.7011
