In [33]:
import numpy as np
import spacy
from tqdm import tqdm
from langdetect import detect
from pathos.multiprocessing import ProcessingPool as Pool
import polars as pl
import os

In [2]:
# Folder that holds the processed data files, built from the current working
# directory. The trailing slash is part of the string, so file names can be
# appended to DATA_FOLDER directly.
DATA_FOLDER = os.path.join(os.getcwd(), '../data/processed/')

In [3]:
# Read the two processed ratings tables from parquet files in DATA_FOLDER.
df_ratings_no_text = pl.read_parquet(f'{DATA_FOLDER}ratings_no_text.pq')
df_ratings_text = pl.read_parquet(f'{DATA_FOLDER}ratings_only_text.pq')

In [None]:
def cleanup_polars(column: pl.Expr):
    r"""Build an expression that normalises a text column.

    Two string operations are applied to ``column`` in order:

    1. every character matching the regex ``[^\w ]`` (anything that is neither
       a word character nor a plain space) is replaced by the empty string;
    2. the result is converted to lowercase.

    Because ``\w`` also matches the underscore, underscores are kept. Whitespace
    other than the plain space (tabs, newlines) matches the pattern and is
    therefore deleted, not turned into a space.

    Args:
        column: The polars expression to clean (presumably a string column).

    Returns:
        The expression with both string operations chained onto it.
    """
    without_symbols = column.str.replace_all(r'[^\w ]', '')
    lowercased = without_symbols.str.to_lowercase()
    return lowercased

# Add the cleaned text as a new "text_cleaned" column through the lazy API,
# then materialise the result back into an eager DataFrame.
lazy_df = df_ratings_text.lazy().with_columns(
    cleanup_polars(pl.col("text")).alias("text_cleaned")
)

# `LazyFrame.collect` has no documented `progress_bar` option: depending on the
# installed polars version the keyword is either rejected with a TypeError or
# silently ignored, so it is not passed here.
df_ratings_text = lazy_df.collect(streaming=True)

## Lemmatization and Stopword Removal

In [10]:
# One spaCy pipeline per supported language, keyed by the language code that
# the lemmatization step looks up. The 'ner' and 'parser' components are
# disabled at load time for every pipeline.
spacy_models = {
    lang_code: spacy.load(model_name, disable=['ner', 'parser'])
    for lang_code, model_name in (
        ('en', 'en_core_web_sm'),
        ('fr', 'fr_core_news_sm'),
        ('es', 'es_core_news_sm'),
        ('de', 'de_core_news_sm'),
        ('pl', 'pl_core_news_sm'),
    )
}

def batch_lemmatize_remove_stopwords(texts):
    """Lemmatize a batch of texts and drop stopwords, one text at a time.

    For each text the language is detected with ``langdetect``. When a spaCy
    pipeline for that language exists in ``spacy_models``, the text is replaced
    by the space-joined lemmas of its non-stopword tokens. Texts in any other
    language are passed through unchanged. If anything raises while a text is
    being handled, that text is also passed through unchanged, so the output
    always has exactly one entry per input, in input order.

    Args:
        texts: Iterable of texts to process.

    Returns:
        list: One processed (or untouched) entry per input text.
    """
    # langdetect is randomised unless the factory seed is fixed, so the same
    # text could be assigned different languages from run to run. The seed is
    # set here (not only at notebook level) so that it also takes effect in
    # worker processes that execute this function.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    results = []
    for text in texts:
        try:
            lang = detect(text)
            if lang in spacy_models:
                doc = spacy_models[lang](text)
                lemmas = [token.lemma_ for token in doc if not token.is_stop]
                results.append(' '.join(lemmas))
            else:
                # Unsupported language: keep the text as it is.
                results.append(text)
        except Exception:
            # Deliberate best-effort: a text that cannot be detected or parsed
            # must not abort the whole batch, so it is kept as it is.
            results.append(text)
    return results

def parallel_process_lemmatization(df, column, chunk_size, num_workers):
    """Lemmatize one column of ``df`` in parallel, chunk by chunk.

    The column is cut into consecutive lists of at most ``chunk_size`` values.
    Each list is handed to ``batch_lemmatize_remove_stopwords`` through a
    pathos process pool, and the per-chunk results are concatenated in the
    original row order. A tqdm bar reports progress per chunk.

    Args:
        df: Frame exposing ``height`` and ``df[column][i:j].to_list()``
            (a polars DataFrame in this notebook).
        column: Name of the column to process.
        chunk_size: Maximum number of values per chunk; must be >= 1.
        num_workers: Number of worker processes in the pool.

    Returns:
        list: One processed entry per row of ``df``, in row order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would silently yield no chunks at all.
        raise ValueError("chunk_size must be a positive integer")

    chunks = [
        df[column][start:start + chunk_size].to_list()
        for start in range(0, df.height, chunk_size)
    ]

    # Use pathos for multiprocessing
    pool = Pool(num_workers)
    results = []
    try:
        for batch_result in tqdm(
            pool.imap(batch_lemmatize_remove_stopwords, chunks),
            total=len(chunks),
            desc="Processing Batches",
        ):
            results.extend(batch_result)
    except BaseException:
        # Includes KeyboardInterrupt: stop the workers instead of waiting for
        # the remaining chunks to be processed.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()
        # pathos caches pools; without clear() the next Pool(num_workers)
        # returns this already-closed pool and fails with "Pool not running".
        pool.clear()
    return results

In [8]:
# DON'T RUN THIS CELL - IT TAKES 2 HOURS AT LEAST (M4 Pro with GPU enabled)

# Tunables for the parallel run
chunk_size = 1000  # values per chunk handed to a worker; adjust to available memory
num_workers = 4    # number of worker processes

# Lemmatize the cleaned text column; a progress bar is shown while it runs
lemmatized_texts = parallel_process_lemmatization(
    df_ratings_text, 'text_cleaned', chunk_size, num_workers
)

# Attach the processed texts to the frame as the 'text_cleaned_lemma' column
df_ratings_text = df_ratings_text.with_columns(
    pl.Series('text_cleaned_lemma', lemmatized_texts)
)

In [12]:
# Persist the current df_ratings_text frame as ratings_text_cleaned_lemma.pq
df_ratings_text.write_parquet(f'{DATA_FOLDER}ratings_text_cleaned_lemma.pq')

## Embeddings

In [23]:
# Reload the saved frame from disk (presumably so this section can start here
# without re-running the lemmatization cells).
df_ratings_text = pl.read_parquet(f'{DATA_FOLDER}ratings_text_cleaned_lemma.pq')

In [35]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model used for the rest of this section.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Smoke test on two example sentences. Note that `sentences` and `embeddings`
# are reassigned by later cells with the real review data.
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]
embeddings = model.encode(sentences)
embeddings

array([[ 0.1884568 ,  0.17425641,  0.05447782,  0.2905176 ,  0.16766421,
        -0.04720675,  0.64558   ,  0.15980871,  0.22689249, -0.03089059,
         0.2558835 , -0.0525877 , -0.2261015 , -0.05710628,  0.1304262 ,
         0.12495363,  0.31749603,  0.19444385, -0.58632535, -0.01258583,
         0.6099092 ,  0.16432738,  0.03331115, -0.27383083, -0.28975758,
        -0.2111971 , -0.02261382, -0.17035906,  0.16159007,  0.06082738,
        -0.24162409,  0.18579188,  0.42740935,  0.19295171, -0.07234462,
         0.16611078,  0.10442807,  0.20477235,  0.21116717,  0.19973986,
        -0.09408248, -0.17383674,  0.06427378,  0.28025475, -0.29530594,
         0.06209547,  0.10427693, -0.02364411,  0.12913184, -0.12617461,
        -0.17898999,  0.03700579, -0.61250603,  0.05029806,  0.17730357,
         0.22494122,  0.17386065, -0.03840281, -0.21286814,  0.25849253,
        -0.12101628,  0.30971506, -0.4196637 ,  0.0090766 ,  0.14188908,
        -0.30556944,  0.17621139, -0.07087357, -0.6

In [None]:
# Pull the lemmatized texts out of polars directly as a plain Python list.
# Converting to pandas first would copy the whole column into a pandas
# DataFrame and make this cell depend on pandas for no benefit.
sentences = df_ratings_text["text_cleaned_lemma"].to_list()

In [None]:
## DON'T RUN THIS CELL - IT TAKES 3.5 HOURS AT LEAST (M4 Pro with GPU enabled)
# Encode every entry of `sentences` with `model`, showing a progress bar while
# it runs; the result replaces whatever `embeddings` held before.
embeddings = model.encode(sentences, show_progress_bar=True)

In [37]:
# Guard against persisting the wrong array: `embeddings` is also assigned by
# the two-sentence example earlier in the notebook, so if the full encoding
# cell was skipped the name still refers to that tiny demo result.
if len(embeddings) != df_ratings_text.height:
    raise ValueError(
        f"embeddings has {len(embeddings)} rows but df_ratings_text has "
        f"{df_ratings_text.height}; run the full encoding cell before saving"
    )
np.save(DATA_FOLDER + 'embeddings.npy', embeddings)