### Load processed reviews

In [8]:
from pathlib import Path

import gensim.downloader as api
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [9]:
# Location of the preprocessed reviews (adjust the path if your file lives elsewhere)
processed_reviews_path = "../processed/processed_reviews.parquet"
reviews_pandas_df = pd.read_parquet(processed_reviews_path)

# Quick sanity check: how many entries the 'text' column holds
review_text_column = reviews_pandas_df['text']
print(review_text_column.shape)

(31571,)


### 1. Embeddings
Representing text data in numerical form for ML processing

#### i. Gensim Word2Vec (trained on our lemmas)
Each review is passed to Word2Vec as its own sentence, so context windows stay within a single review instead of running across a flat list of all lemmas.

In [10]:
# Train Gensim's Word2Vec on our own corpus so the vectors reflect how words
# are used in these reviews.
# Word2Vec learns from a sliding window around each word: it captures local
# context only; corpus-wide (global) statistics are not preserved.
# NOTE: assumes the 'lemmas' column holds whitespace-separated strings.
reviews_pandas_df['lemma_tokens'] = reviews_pandas_df['lemmas'].apply(
    lambda lemma_text: lemma_text.split()
)

# One token list per review, so each review is treated as its own sentence.
review_sentences = reviews_pandas_df['lemma_tokens'].tolist()
model = Word2Vec(
    sentences=review_sentences,
    vector_size=200,
    window=5,
    min_count=3,
    workers=6,
)

In [11]:
def get_review_word_2_vec_embedding(lemmas, model=model):
    """Average the Word2Vec vectors of a review's lemmas into one review vector.

    Parameters
    ----------
    lemmas : iterable of str
        Tokens of a single review.
    model : Word2Vec-like object
        Must expose ``wv`` (the keyed vectors) and ``vector_size``.
        Defaults to the notebook-level ``model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the mean of the vectors of
        all lemmas found in the model's vocabulary, or an all-zero vector when
        none of the lemmas is in the vocabulary (including an empty review).
    """
    keyed_vectors = model.wv
    vectors = [keyed_vectors[word] for word in lemmas if word in keyed_vectors]
    if not vectors:
        # Use the same dtype as the learned vectors; a default float64 zero
        # vector would silently upcast the whole matrix once reviews are stacked.
        return np.zeros(model.vector_size, dtype=keyed_vectors.vectors.dtype)
    return np.mean(vectors, axis=0)

In [12]:
# Register tqdm's progress_apply on pandas objects, then embed every review
tqdm.pandas(desc="Vectorizing review lemmas", ncols=100)
review_token_lists = reviews_pandas_df['lemma_tokens']
word_2_vec_embedding = review_token_lists.progress_apply(get_review_word_2_vec_embedding)

Vectorizing review lemmas: 100%|███████████████████████████| 31571/31571 [00:01<00:00, 30972.70it/s]


In [13]:
# Stack the per-review vectors row-wise into one 2D matrix,
# shape: (number of reviews, embedding_dim)
word_2_vec_review_vectors = word_2_vec_embedding.to_numpy()
word_2_vec_embedding_array = np.vstack(word_2_vec_review_vectors)

In [None]:
# Save to the 'embedders' folder.
# np.save does not create missing parent directories, so make sure the folder
# exists first; otherwise a fresh checkout fails here after all the computation.
word_2_vec_output_path = Path('embedders/word_2_vec_embeddings.npy')
word_2_vec_output_path.parent.mkdir(parents=True, exist_ok=True)
np.save(word_2_vec_output_path, word_2_vec_embedding_array)

#### ii. Gensim GloVe (pre-trained)
I am interested in using a pre-trained GloVe model and comparing it with the Word2Vec model trained on our processed lemmas.

In [None]:
# Pre-trained GloVe embeddings (Stanford), loaded through gensim's downloader.
# Going by the model name these are the 200-dimensional Wikipedia + Gigaword vectors.
glove_model_name = "glove-wiki-gigaword-200"
glove_model = api.load(glove_model_name)

In [None]:
# Flatten the per-review token lists into one list of every lemma occurrence
all_lemmas = []
for review_tokens in reviews_pandas_df['lemma_tokens']:
    all_lemmas.extend(review_tokens)

# Distinct lemmas in the corpus, and the subset that GloVe has a vector for
unique_lemmas = list(set(all_lemmas))
lemmas_in_glove = list(filter(lambda lemma: lemma in glove_model, unique_lemmas))

# Number of distinct corpus lemmas covered by the pre-trained vocabulary
len(lemmas_in_glove)

In [None]:
def get_review_glove_embedding(lemmas, model=glove_model):
    """Average the pre-trained vectors of a review's lemmas into one review vector.

    Parameters
    ----------
    lemmas : iterable of str
        Tokens of a single review.
    model : KeyedVectors-like object
        Must support ``word in model``, ``model[word]``, ``vector_size`` and
        ``vectors``. Defaults to the notebook-level ``glove_model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the mean of the vectors of
        all lemmas known to ``model``, or an all-zero vector when none of the
        lemmas is known (including an empty review).
    """
    # Look words up in the model that was passed in. Reading the global
    # glove_model here would ignore the argument and could mix vectors from
    # one model with the vector_size of another.
    vectors = [model[word] for word in lemmas if word in model]
    if not vectors:
        # Use the same dtype as the stored vectors; a default float64 zero
        # vector would silently upcast the whole matrix once reviews are stacked.
        return np.zeros(model.vector_size, dtype=model.vectors.dtype)
    return np.mean(vectors, axis=0)

In [None]:
# Register tqdm's progress_apply with a GloVe-specific label, then embed every review
tqdm.pandas(desc="Vectorizing review lemmas using Glove pre-trained model", ncols=100)
glove_input_tokens = reviews_pandas_df['lemma_tokens']
glove_200_embedding = glove_input_tokens.progress_apply(get_review_glove_embedding)

In [None]:
# Stack the per-review GloVe vectors row-wise into one 2D matrix,
# shape: (number of reviews, embedding_dim)
glove_review_vectors = glove_200_embedding.to_numpy()
glove_200_embedding_array = np.vstack(glove_review_vectors)

In [None]:
# Save to the 'embedders' folder.
# np.save does not create missing parent directories, so make sure the folder
# exists first; otherwise a fresh checkout fails here after all the computation.
glove_200_output_path = Path('embedders/glove_200_embeddings.npy')
glove_200_output_path.parent.mkdir(parents=True, exist_ok=True)
np.save(glove_200_output_path, glove_200_embedding_array)