In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import pandas as pd
import os
import gc


In [2]:
# Set before any tokenizer is used: turns off the tokenizers library's own
# thread pool (presumably to avoid its fork/parallelism warning when batches
# are produced through a DataLoader — confirm if workers are ever enabled).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# Path (relative to the notebook) of the CSV holding the review texts.
# NOTE(review): despite its name this variable is the *input* of this notebook.
output_file_path = '../data/beer_reviews_text.csv'

# Only the first `num_rows` rows are read, which keeps the inference run short.
num_rows = 6000

df = pd.read_csv(filepath_or_buffer=output_file_path, nrows=num_rows)
df.head()

Unnamed: 0,text
0,"Puszka 0,33l dzięki Christoph . Kolor jasnozło..."
1,Cerveza pale lager gabonesa. MÃ¡s floja que la...
2,"Kolor- złoty, klarowny. Piana - drobna, średni..."
3,"Botella, de GabÃ³n regalo familiar.31/01/2015C..."
4,Many thanks for this beer to Erzengel. Pours l...


In [4]:
class SentimentAnalysisModel:
    """Sequence-classification model whose class probabilities are reduced to one score.

    The score is the dot product of the softmax probabilities with ``weights``.
    With the default weights ``[-1, -0.5, 0, 0.5, 1]`` the result lies in
    ``[-1, 1]``.
    NOTE(review): the default assumes a 5-class head ordered from most negative
    to most positive — confirm against the checkpoint's label mapping.
    """

    def __init__(self, model_path='multilingual-sentiment', weights=None, device=None):
        """Load the model and tokenizer and place model and weights on ``device``.

        Args:
            model_path: Name or directory passed to ``from_pretrained`` for both
                the model and the tokenizer.
            weights: Optional tensor multiplied with the class probabilities.
                When omitted, ``[-1, -0.5, 0, 0.5, 1]`` is used.
            device: Target device (string or ``torch.device``). When ``None``
                the first available of ``"mps"``, ``"cuda"``, ``"cpu"`` is
                chosen, so existing callers on Apple hardware still get MPS.
        """
        if device is None:
            if torch.backends.mps.is_available():
                device = "mps"
            elif torch.cuda.is_available():
                device = "cuda"
            else:
                device = "cpu"
        self.device = torch.device(device)

        if weights is not None:
            self.weights = weights.to(self.device)
        else:
            self.weights = torch.tensor([[-1, -0.5, 0, 0.5, 1]], device=self.device).unsqueeze(0).unsqueeze(-1)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(self.device)
        # Inference only: make sure dropout and similar layers are disabled.
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    def forward(self, **inputs):
        """Score a tokenized batch.

        Args:
            **inputs: Keyword tensors forwarded unchanged to the wrapped model
                (the caller is expected to have moved them to the model device).

        Returns:
            A CPU tensor of sentiment scores. A batch of one yields a
            one-element 1-D tensor rather than a 0-dim scalar, so callers can
            always iterate over the result or call ``tolist()`` to get a list.
        """
        device_type = self.device.type
        with torch.no_grad():
            # Mixed precision is only used on accelerators; on CPU it is disabled.
            with torch.autocast(device_type, enabled=device_type != "cpu"):
                outputs = self.model(**inputs)
            # Autocast may return reduced-precision logits; do the reduction in float32.
            logits = outputs.logits.float()
            probabilities = torch.softmax(logits, dim=-1)
            # Align weights with the probabilities so matmul never sees mixed dtypes/devices.
            weights = self.weights.to(device=probabilities.device, dtype=probabilities.dtype)
            sentiment_score = torch.matmul(probabilities, weights).squeeze()
            if sentiment_score.dim() == 0:
                # squeeze() also removed the batch axis of a single-item batch; restore it.
                sentiment_score = sentiment_score.unsqueeze(0)
        return sentiment_score.cpu()  # Move back to CPU for further processing

class CommentsDataset(Dataset):
    """Map-style dataset that serves raw comment strings by position."""

    def __init__(self, comments):
        """Store the comments.

        Args:
            comments: Any iterable of comments (list, tuple, pandas Series, ...).
                It is materialised into a list so that ``dataset[i]`` is always
                positional; indexing a pandas Series directly would be
                label-based and fail for a non-default index.
        """
        self.comments = list(comments)

    def __len__(self):
        """Return the number of stored comments."""
        return len(self.comments)

    def __getitem__(self, idx):
        """Return the comment at position ``idx``."""
        return self.comments[idx]

In [5]:
def process_batch(batch, sentiment_model):
    """Tokenize one batch of texts and return their sentiment scores.

    Args:
        batch: List of strings to score.
        sentiment_model: Object exposing ``tokenizer``, ``model`` and
            ``forward(**inputs)``, as ``SentimentAnalysisModel`` does.

    Returns:
        A list with one float per input text. This holds for a batch of one
        as well, where the model's squeezed output would otherwise turn into
        a bare float.
    """
    tokenized_batch = sentiment_model.tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )
    # Follow whatever device the model's parameters live on instead of assuming "mps".
    device = next(sentiment_model.model.parameters()).device
    tokenized_batch = {key: val.to(device) for key, val in tokenized_batch.items()}
    with torch.no_grad():
        scores = sentiment_model.forward(**tokenized_batch)
    del tokenized_batch
    if device.type == "mps":
        # Release cached MPS memory between batches; skipped on other backends.
        torch.mps.empty_cache()
    if scores.dim() == 0:
        # A single-item batch comes back as a 0-dim tensor; restore the batch axis
        # so tolist() yields a list the caller can extend() with.
        scores = scores.unsqueeze(0)
    return scores.tolist()

def process_all_batches(data_loader, sentiment_model):
    """Score every batch yielded by ``data_loader`` and return one flat list.

    Args:
        data_loader: Iterable of batches accepted by ``process_batch``.
        sentiment_model: Model object handed through to ``process_batch``.

    Returns:
        All per-text scores, in the order the loader produced them.
    """
    progress = tqdm(data_loader, desc="Processing Batches", unit="batch")
    return [
        score
        for current_batch in progress
        for score in process_batch(current_batch, sentiment_model)
    ]


In [6]:
if __name__ == "__main__":
    sentiment_model = SentimentAnalysisModel(model_path="../models/sentiment_model_1")

    batch_size = 512
    # Pick the review text by column name so an extra leading column in the CSV
    # (e.g. a saved index) cannot be scored by mistake; fall back to the first
    # column when no "text" column exists.
    # NOTE(review): astype(str) turns missing values into the literal "nan",
    # which is then scored like any other text — confirm that is intended.
    text_column = "text" if "text" in df.columns else df.columns[0]
    dataset = CommentsDataset(df[text_column].astype(str).tolist())
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,  # order must match df rows for the column assignment below
        num_workers=0,
        pin_memory=False,  # batches are lists of strings; there are no tensors to pin
        persistent_workers=False
    )

    # One score per row, in row order.
    df["sentiment_bert"] = process_all_batches(data_loader, sentiment_model)


Processing Batches: 100%|██████████| 12/12 [02:10<00:00, 10.90s/batch]


In [7]:
# Collect unreachable Python objects first so any tensors they hold are returned
# to the MPS allocator, then release the allocator's cached memory.
gc.collect()
torch.mps.empty_cache()

0

In [8]:
# Show the frame with the newly added `sentiment_bert` column
# (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,text,sentiment_bert
0,"Puszka 0,33l dzięki Christoph . Kolor jasnozło...",0.264435
1,Cerveza pale lager gabonesa. MÃ¡s floja que la...,-0.433727
2,"Kolor- złoty, klarowny. Piana - drobna, średni...",0.146698
3,"Botella, de GabÃ³n regalo familiar.31/01/2015C...",-0.067261
4,Many thanks for this beer to Erzengel. Pours l...,0.842057
...,...,...
5995,How: Bottle Apperance: Clear yellow. Aroma: fl...,-0.140594
5996,Typical international lager. Weak aroma with s...,-0.067390
5997,Een pilsbier dat heel goudblond eruitziet als ...,-0.259747
5998,"Clear golden, huge frothy white head. Grainy, ...",0.315399
