In [None]:
from my_sentence_transformer import MySentenceTransformer
import pandas as pd
import numpy as np
import gc

In [None]:
# Build the wrapper with its default model (model_name='all-MiniLM-L6-v2').
my_transformer = MySentenceTransformer()

# One bare string and one list of strings, so both input forms can be tried.
sentence = "I love NLP"
sentences = ["Hello World", "I Love NLP "]

# Embed the single sentence and display the shape of the result.
sent_embedding = my_transformer.transform_sentences(sentence)
sent_embedding.shape

In [None]:
# Embed the two-sentence list and display the shape of the result.
sents_embeddings = my_transformer.transform_sentences(sentences)
sents_embeddings.shape

In [None]:
# Shape of the single-sentence embedding after normalize_embeddings().
my_transformer.normalize_embeddings(sent_embedding).shape

In [None]:
# Normalize the two-sentence batch; with no attribute access on the result,
# the cell displays the returned value itself rather than its shape.
my_transformer.normalize_embeddings(sents_embeddings)

In [None]:
# Location of the reviews Parquet file, relative to the notebook's working directory.
parquet_file_path = '../filtered/yelp_reviews_2022.parquet'

# Load the whole file into a DataFrame.
reviews_df = pd.read_parquet(path=parquet_file_path)

# Preview the first five rows.
reviews_df.head(n=5)

In [None]:
# Pull the 'text' column out as a plain Python list.
reviews = reviews_df['text'].tolist()
# Embed every review with the default-model wrapper and display the result's shape.
review_embeddings = my_transformer.transform_sentences(reviews)
review_embeddings.shape

In [None]:
# Persist the default-model embeddings to disk. Saving does not free any RAM
# by itself; the in-memory array is released in a later cell.
np.save(file="mini_LM_6_embeddings.npy", arr=review_embeddings)

### Using 'paraphrase-MiniLM-L3-v2'

In [None]:
# Second wrapper instance, this time with 'paraphrase-MiniLM-L3-v2'.
my_l3_transformer = MySentenceTransformer(model_name='paraphrase-MiniLM-L3-v2')

In [None]:
# Embed every review with the paraphrase-MiniLM-L3-v2 wrapper and display the result's shape.
review_l3_embeddings = my_l3_transformer.transform_sentences(reviews)
review_l3_embeddings.shape

# Observation: processed faster than the default-model run above.

In [None]:
# Persist the paraphrase-MiniLM-L3-v2 embeddings to disk. Saving does not free
# any RAM by itself; the in-memory array is released in a later cell.
np.save(file="paraphrase_LM_3_embeddings.npy", arr=review_l3_embeddings)

### Using 'all-MiniLM-L12-v2'

In [None]:
# Wrapper instance for 'all-MiniLM-L12-v2'.
my_l12_transformer = MySentenceTransformer(model_name='all-MiniLM-L12-v2')

In [None]:
# Embed every review with the all-MiniLM-L12-v2 wrapper and display the result's shape.
review_l12_embeddings = my_l12_transformer.transform_sentences(reviews)
review_l12_embeddings.shape

# Observation: a bit slower than the L3 and L6 models.
# TODO: evaluate performance and embedding quality later.

In [None]:
# Persist the all-MiniLM-L12-v2 embeddings to disk. Saving does not free any
# RAM by itself; the in-memory array is released in a later cell.
np.save(file="mini_LM_12_embeddings.npy", arr=review_l12_embeddings)

### MPNet models (Higher Accuracy but larger/slower)

In [None]:
# Wrapper instance for 'paraphrase-mpnet-base-v2'.
my_mpnet_transformer = MySentenceTransformer(model_name='paraphrase-mpnet-base-v2')

In [None]:
# Embed every review with the paraphrase-mpnet-base-v2 wrapper and display the result's shape.
review_mpnet_embeddings = my_mpnet_transformer.transform_sentences(reviews)
review_mpnet_embeddings.shape

# Observation: much slower -- almost 6 times slower than the MiniLM L6 model.

In [None]:
# Persist the paraphrase-mpnet-base-v2 embeddings to disk. Saving does not free
# any RAM by itself; the in-memory array is released in the next cell.
# embeddings: shape (num_sentences, 768)
np.save(file="paraphrase_mpnet_embeddings.npy", arr=review_mpnet_embeddings)

In [None]:
# Release the embedding arrays produced (and saved to disk) by the cells above.
# Names are popped from globals() rather than removed with `del`, so the cell
# stays re-runnable: `del name` raises NameError when the cell is executed a
# second time or when one of the embedding cells was skipped, whereas
# pop(name, None) simply ignores a name that is not defined.
for stale_name in (
    "review_embeddings",
    "review_l3_embeddings",
    "review_l12_embeddings",
    "review_mpnet_embeddings",
):
    globals().pop(stale_name, None)

# Run a garbage-collection pass now that the references are gone.
gc.collect()

### Lightweight RoBERTa

In [None]:
# Wrapper instance for 'all-distilroberta-v1'.
my_roberta_transformer = MySentenceTransformer(model_name='all-distilroberta-v1')

In [None]:
# Rebuild the review text list from the DataFrame. This repeats the expression
# used in the earlier embedding cell, so it lets this section run without
# re-executing that cell (reviews_df must still be loaded).
reviews = reviews_df['text'].tolist()

In [None]:
# Embed every review with the all-distilroberta-v1 wrapper and display the result's shape.
review_distil_roberta_embeddings = my_roberta_transformer.transform_sentences(reviews)
review_distil_roberta_embeddings.shape

# Observation: faster than MPNet; embeddings are 768-dimensional.

In [None]:
# Persist the all-distilroberta-v1 embeddings to disk; the in-memory array is
# released in the next cell.
np.save(file="all_distill_roberta_embeddings.npy", arr=review_distil_roberta_embeddings)

In [None]:
# Release the distilroberta embeddings now that they are saved to disk.
# pop(name, None) is used instead of `del` so that re-running this cell (or
# running it when the embedding cell was skipped) does not raise NameError.
globals().pop("review_distil_roberta_embeddings", None)

# Run a garbage-collection pass now that the reference is gone.
gc.collect()

### Domain-specific models

In [None]:
# TODO: try a model trained on customer reviews (placeholder -- no code yet)

### Multilingual models

In [None]:
# multilingual