In [21]:
import numpy as np
import pandas as pd

import os

In [9]:
# Load variables from a local .env file into the process environment.
# The `True` shown as this cell's output is load_dotenv()'s return value.
# NOTE(review): presumably this provides the API key needed by
# get_embeddings_batch further down — confirm against that module.
from dotenv import load_dotenv
load_dotenv()

True

In [10]:
# Embeddings imports
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
import gensim.downloader as api

from my_sentence_transformer import MySentenceTransformer
from open_ai_review_embeddings import get_embeddings_batch
from InstructorEmbedding import INSTRUCTOR

In [11]:
# Location of the preprocessed reviews, relative to the notebook's working directory.
filepath = "../processed/processed_reviews.parquet"

reviews_df = pd.read_parquet(filepath)
# Work on a 1000-row random sample; random_state=42 makes the draw repeatable.
# DataFrame.sample raises ValueError if the frame holds fewer than 1000 rows.
sample_df = reviews_df.sample(n=1000, random_state=42).copy()

In [20]:
# Create the output folders up front: `data/` for the sampled reviews and
# `embeddings/` for the .npy files written by the embedding cells below.
# np.save does not create missing directories, so without the second call a
# fresh checkout fails with FileNotFoundError on the first save.
os.makedirs("data", exist_ok=True)
os.makedirs("embeddings", exist_ok=True)

# Save the sampled dataframe (without its index) so the exact rows can be reloaded.
sample_df.to_parquet("data/sample_reviews.parquet", index=False)

In [18]:
# Preview the first three sampled rows as a quick sanity check.
sample_df.head(3)

Unnamed: 0,review_id,business_id,stars,categories,date,text,lemmas,num_lemmas,month,review_length,avg_word_length
26402,G4M9NZ7Xvr4CKY_NPeD8LQ,5DwRX43KmGroXBlltpCGqA,1.0,"Irish, Restaurants",2022-01-03 18:19:40,Have never been there and have lived here for ...,live long time go friend time new year eve cho...,33,1,457,4.0
15837,IbSwiO2OkX6DrBZIQ8ofQg,gyY3NIjsfGF5SUbr5PdmHA,5.0,"Pizza, Bars, Lounges, Restaurants, Nightlife, ...",2022-01-14 15:31:32,This was our first try of Georgio's and we'll ...,try georgio definitely pick order look mediter...,26,1,339,5.0
1043,eAVYs7Lo90AQ-GzbcrsseQ,cx2fwY66_xTNFo0p2EEC4g,5.0,"American (New), Indian, Event Planning & Servi...",2022-01-18 19:43:36,Great Indian food in Media Borough. Right on s...,great indian food media borough right state st...,18,1,164,5.0


#### Word2Vec

In [None]:
# Turn each space-separated lemma string into a list of tokens, which is the
# per-document format Word2Vec consumes.
sample_df['lemma_tokens'] = sample_df['lemmas'].map(lambda lemma_string: lemma_string.split())

# Train 300-dimensional vectors on the sample only; min_count=3 drops words
# seen fewer than three times from the vocabulary.
# NOTE(review): gensim uses a fixed default seed, but with workers > 1 thread
# scheduling still makes results vary slightly between runs; workers=1 plus a
# fixed PYTHONHASHSEED is needed for bit-for-bit reproducibility.
model = Word2Vec(
    sentences=list(sample_df['lemma_tokens']),
    vector_size=300,
    window=5,
    min_count=3,
    workers=6,
)

In [None]:
def get_review_word_2_vec_embedding(lemmas, model=model):
    """Average the Word2Vec vectors of a review's lemmas into one embedding.

    Parameters
    ----------
    lemmas : iterable of str
        Tokens of a single review.
    model : trained Word2Vec-style model
        Must expose ``wv`` (keyed vectors) and ``vector_size``. The default is
        whatever ``model`` refers to at the moment this ``def`` is executed.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of the
        vectors of all in-vocabulary lemmas, or all zeros when none of the
        lemmas is in the vocabulary.
    """
    keyed_vectors = model.wv
    vectors = [keyed_vectors[word] for word in lemmas if word in keyed_vectors]
    if not vectors:
        # Use the same dtype as the real vectors; a float64 fallback would
        # upcast the whole matrix when the per-review vectors are stacked.
        return np.zeros(model.vector_size, dtype=keyed_vectors.vectors.dtype)
    return np.mean(vectors, axis=0)

In [None]:
# Compute one averaged Word2Vec vector per review (a Series of 1-D arrays).
word_2_vec_embedding = sample_df['lemma_tokens'].apply(get_review_word_2_vec_embedding)

In [None]:
# Stack the per-review vectors row-wise into one 2-D array (one row per review).
word_2_vec_embedding_array = np.vstack(word_2_vec_embedding.values)

In [None]:
# Save to the embeddings folder. np.save does not create directories, so
# `embeddings/` must already exist when this cell runs.
np.save('embeddings/word_2_vec_embeddings.npy', word_2_vec_embedding_array)

#### GloVe

In [None]:
# Load the pretrained "glove-wiki-gigaword-300" vectors via gensim's downloader.
# NOTE: the first call downloads the model before loading it, so it can be slow.
glove_model = api.load("glove-wiki-gigaword-300")

In [None]:
# Flatten every review's token list into one long list of lemmas.
all_lemmas = [
    token
    for review_tokens in sample_df["lemma_tokens"]
    for token in review_tokens
]
# De-duplicate; set ordering is arbitrary, so this list has no meaningful order.
unique_lemmas = [*set(all_lemmas)]
# Keep only the lemmas that exist in the GloVe vocabulary.
lemmas_in_glove = [candidate for candidate in unique_lemmas if candidate in glove_model]
# Cell output: number of distinct lemmas covered by GloVe.
len(lemmas_in_glove)

In [None]:
def get_review_glove_embedding(lemmas, model=glove_model):
    """Average the pretrained vectors of a review's lemmas into one embedding.

    Parameters
    ----------
    lemmas : iterable of str
        Tokens of a single review.
    model : keyed-vectors object
        Must support ``word in model``, ``model[word]``, ``vector_size`` and
        ``vectors``. The default is whatever ``glove_model`` refers to at the
        moment this ``def`` is executed.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of the
        vectors of all in-vocabulary lemmas, or all zeros when none of the
        lemmas is in the vocabulary.
    """
    # Look words up in the `model` argument, not the global `glove_model`, so a
    # caller-supplied model is actually honoured.
    vectors = [model[word] for word in lemmas if word in model]
    if not vectors:
        # Use the same dtype as the real vectors; a float64 fallback would
        # upcast the whole matrix when the per-review vectors are stacked.
        return np.zeros(model.vector_size, dtype=model.vectors.dtype)
    return np.mean(vectors, axis=0)

In [None]:
# Compute one averaged GloVe vector per review (a Series of 1-D arrays).
glove_300_embedding = sample_df['lemma_tokens'].apply(get_review_glove_embedding)

In [None]:
# Stack the per-review vectors row-wise into one 2-D array (one row per review).
glove_300_embedding_array = np.vstack(glove_300_embedding.values)

In [None]:
# Save the GloVe review embeddings to the embeddings folder (must already exist).
np.save('embeddings/glove_300_embeddings.npy', glove_300_embedding_array)

#### Contextual Embedders
- all-MiniLM-L6-v2
- paraphrase-MiniLM-L3-v2
- all-MiniLM-L12-v2
- paraphrase-mpnet-base-v2
- all-distilroberta-v1

In [13]:
# Raw review texts (not the lemmas) as a plain Python list; this is the input
# passed to every sentence-level embedder in the cells below.
samples = sample_df['text'].tolist()

#### all-MiniLM-L6-v2

In [None]:
# Project wrapper constructed with its default model.
# NOTE(review): presumably the default is all-MiniLM-L6-v2, per the section
# heading and the output file name — confirm in my_sentence_transformer.
my_transformer = MySentenceTransformer()

In [None]:
# Embed every review text; the trailing `.shape` is displayed as a sanity check.
# presumably (n_reviews, embedding_dim) — TODO confirm against MySentenceTransformer
samples_embeddings = my_transformer.transform_sentences(samples)
samples_embeddings.shape

In [None]:
# Persist the embeddings from the default MySentenceTransformer model.
np.save("embeddings/all_mini_LM_L6_v2_embeddings.npy", samples_embeddings)

#### paraphrase-MiniLM-L3-v2

In [None]:
# Same wrapper, explicitly configured with the paraphrase-MiniLM-L3-v2 model.
my_para_l3_transformer = MySentenceTransformer(model_name='paraphrase-MiniLM-L3-v2')

In [None]:
# Embed the review texts with paraphrase-MiniLM-L3-v2; `.shape` is shown as a check.
samples_para_l3_embeddings = my_para_l3_transformer.transform_sentences(samples)
samples_para_l3_embeddings.shape

In [None]:
# Persist the paraphrase-MiniLM-L3-v2 embeddings.
np.save("embeddings/paraphrase_mini_LM_L3_v2_embeddings.npy", samples_para_l3_embeddings)

#### all-MiniLM-L12-v2

In [None]:
# Same wrapper, explicitly configured with the all-MiniLM-L12-v2 model.
my_mini_l12_v2_transformer = MySentenceTransformer(model_name='all-MiniLM-L12-v2')

In [None]:
# Embed the review texts with all-MiniLM-L12-v2; `.shape` is shown as a check.
samples_l12_embeddings = my_mini_l12_v2_transformer.transform_sentences(samples)
samples_l12_embeddings.shape

In [None]:
# Save the all-MiniLM-L12-v2 embeddings to disk.
# NOTE: saving alone does not free RAM — `samples_l12_embeddings` and the
# transformer stay referenced; they would have to be `del`-ed to release memory.
np.save("embeddings/all_mini_LM_L12_v2_embeddings.npy", samples_l12_embeddings)

#### paraphrase-mpnet-base-v2

In [None]:
# Same wrapper, explicitly configured with the paraphrase-mpnet-base-v2 model.
my_mpnet_transformer = MySentenceTransformer(model_name='paraphrase-mpnet-base-v2')

In [None]:
# Embed the review texts with paraphrase-mpnet-base-v2; `.shape` is shown as a check.
samples_mpnet_embeddings = my_mpnet_transformer.transform_sentences(samples)
samples_mpnet_embeddings.shape

In [None]:
# Persist the paraphrase-mpnet-base-v2 embeddings.
np.save("embeddings/paraphrase_mpnet_base_v2_embeddings.npy", samples_mpnet_embeddings)

#### all-distilroberta-v1

In [None]:
# Same wrapper, explicitly configured with the all-distilroberta-v1 model.
my_roberta_transformer = MySentenceTransformer(model_name='all-distilroberta-v1')

In [None]:
# Embed the review texts with all-distilroberta-v1; `.shape` is shown as a check.
samples_distil_roberta_embeddings = my_roberta_transformer.transform_sentences(samples)
samples_distil_roberta_embeddings.shape

In [None]:
# Persist the all-distilroberta-v1 embeddings.
# NOTE(review): the file name spells "distill" with two l's, unlike the model
# id; left unchanged because other code may load the file by this exact name.
np.save("embeddings/all_distill_roberta_v1_embeddings.npy", samples_distil_roberta_embeddings)

#### OpenAI API - text-embedding-3-small embedding model

In [None]:
# Embed the review texts with the project helper get_embeddings_batch;
# `.shape` is shown as a check.
# NOTE(review): per the section heading this presumably calls the OpenAI API
# (text-embedding-3-small), so it needs network access and a valid API key —
# confirm in open_ai_review_embeddings.
open_ai_embeddings = get_embeddings_batch(samples)
open_ai_embeddings.shape

In [None]:
# Persist the embeddings returned by get_embeddings_batch.
np.save('embeddings/open_ai_text_small_embeddings.npy', open_ai_embeddings)

#### HuggingFace - HKU NLP Instructor

In [15]:
# Use a dedicated name rather than `model`: `model` already holds the Word2Vec
# model trained earlier in this notebook, and rebinding it here would make any
# later re-run of the Word2Vec cells silently pick up the wrong object.
instructor_model = INSTRUCTOR("hkunlp/instructor-base")

# Each input is an [instruction, text] pair; the same instruction is paired
# with every review text.
instruction = "Represent the Yelp review for sentiment analysis:"
inputs = [[instruction, t] for t in samples]
# normalize_embeddings=True asks encode() to return normalised vectors.
instructor_embeddings = instructor_model.encode(inputs, batch_size=16, normalize_embeddings=True)
print("Embeddings shape:", instructor_embeddings.shape)

No sentence-transformers model found with name hkunlp/instructor-base. Creating a new one with mean pooling.
`SentenceTransformer._target_device` has been deprecated, please use `SentenceTransformer.device` instead.


Embeddings shape: (1000, 768)


In [17]:
# Persist the Instructor embeddings (shape (1000, 768) in the recorded run).
np.save('embeddings/hku_nlp_instructor_embeddings.npy', instructor_embeddings)