In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from multiprocessing import Pool
import pandas as pd

In [None]:
# Source CSV holding the review texts (path is relative to this notebook).
output_file_path = '../data/beer_reviews_text.csv'

# Upper bound on the number of rows read from the CSV.
num_rows = 600

df = pd.read_csv(filepath_or_buffer=output_file_path, nrows=num_rows)
df.head()

In [None]:
class SentimentAnalysisModel:
    """Sequence-classification wrapper that reduces class probabilities to one score.

    The softmax over the model's logits is combined with a per-class weight
    vector. With the default weights ``[-1, -0.5, 0, 0.5, 1]`` the result is a
    weighted average of values in ``[-1, 1]`` and therefore lies in that range.
    The default weights assume a 5-class classification head.

    Args:
        model_path: Name or path passed to ``from_pretrained`` for both the
            model and the tokenizer.
        weights: Optional per-class weights (tensor or anything
            ``torch.as_tensor`` accepts). Must hold exactly one value per
            class; its shape is otherwise free because it is flattened
            before use.
        device: Device to run the model on. ``None`` selects "mps" when
            available, then "cuda", then "cpu".
    """

    def __init__(self, model_path='multilingual-sentiment', weights=None, device=None):
        if device is None:
            device = self._default_device()
        self.device = torch.device(device)

        if weights is None:
            weights = torch.tensor([[-1, -0.5, 0, 0.5, 1]]).unsqueeze(0).unsqueeze(-1)
        # Cast to float32 before moving: the weights must live on the same
        # device as the probabilities, and float64 is not supported on MPS.
        self.weights = torch.as_tensor(weights, dtype=torch.float32).to(self.device)

        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    @staticmethod
    def _default_device():
        """Return the preferred available device name: "mps", "cuda" or "cpu"."""
        if torch.backends.mps.is_available():
            return "mps"
        if torch.cuda.is_available():
            return "cuda"
        return "cpu"

    def forward(self, **inputs):
        """
        Calculate sentiment score ranging from -1 to 1.

        Args:
            **inputs: Keyword arguments forwarded to the underlying model
                (e.g. the tokenizer output). Tensor values are moved to the
                model's device first.

        Returns:
            A 1-D CPU tensor with one score per input row. A single-row batch
            yields a tensor of shape ``(1,)`` rather than a 0-dim scalar, so
            ``.tolist()`` always returns a list.
        """
        inputs = {
            key: (val.to(self.device) if hasattr(val, "to") else val)
            for key, val in inputs.items()
        }
        with torch.no_grad():
            outputs = self.model(**inputs)
            probabilities = torch.softmax(outputs.logits, dim=-1)
            # Flatten to a plain (num_classes,) vector on the probabilities'
            # device/dtype so the product is (batch,) with no broadcast axes.
            weights = self.weights.to(
                device=probabilities.device, dtype=probabilities.dtype
            ).reshape(-1)
            sentiment_score = torch.matmul(probabilities, weights)
        # reshape(-1) instead of squeeze(): keeps the batch axis for batch size 1.
        return sentiment_score.reshape(-1).cpu()  # Move back to CPU for further processing

# Module-level model instance; process_batch reads it as a global.
sentiment_model = SentimentAnalysisModel(
    model_path="../models/sentiment_model_1",
)

# Define dataset class for DataLoader
class CommentsDataset(Dataset):
    """Map-style dataset that exposes an indexable collection of comments.

    Items are returned exactly as stored; no tokenization happens here.
    """

    def __init__(self, comments):
        # Stored by reference, not copied.
        self.comments = comments

    def __getitem__(self, idx):
        """Return the comment stored at position ``idx``."""
        comment = self.comments[idx]
        return comment

    def __len__(self):
        """Return how many comments the dataset holds."""
        n_comments = len(self.comments)
        return n_comments

def process_batch(batch, max_length=512):
    """
    Tokenize a batch of comments and compute sentiment scores.

    Args:
        batch: Comments to score, in the form the tokenizer accepts
            (typically a list of strings as produced by the DataLoader).
        max_length: Token limit per comment; longer inputs are truncated.

    Returns:
        A list of floats, one per comment. A single-comment batch still
        returns a one-element list, and an empty list/tuple returns ``[]``.
    """
    # Nothing to tokenize; skip the model call entirely.
    if isinstance(batch, (list, tuple)) and len(batch) == 0:
        return []

    tokenized_batch = sentiment_model.tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    )
    # Follow whatever device the model actually lives on instead of assuming "mps".
    device = sentiment_model.model.device
    tokenized_batch = {key: val.to(device) for key, val in tokenized_batch.items()}
    with torch.no_grad():
        scores = sentiment_model.forward(**tokenized_batch)
    # reshape(-1) guards against a 0-dim result for a one-comment batch, where
    # .tolist() would return a bare float instead of a list.
    return scores.reshape(-1).tolist()

# Batching setup: the first column of df, cast to str, is served in its
# original order (shuffle=False) in chunks of `batch_size`.
# Note: astype(str) turns missing values into the literal string "nan".
batch_size = 50
dataset = CommentsDataset(
    df.iloc[:, 0].astype(str).tolist()
)
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)

# Process batches sequentially, one DataLoader batch at a time
def process_all_batches(data_loader):
    """Score every batch from ``data_loader`` and return one flat list.

    Batches are consumed in iteration order and each is handed to
    ``process_batch``; the per-batch results are concatenated in that order.
    """
    return [
        score
        for comments in data_loader
        for score in process_batch(comments)
    ]

# Adds the scores as a new column on df in place. The returned list is in
# DataLoader iteration order, so it must line up one-to-one with df's rows.
df["sentiment_bert"] = process_all_batches(data_loader)


In [None]:
# Rich preview of the scored frame; a bare last expression renders as an
# HTML table, unlike print(), which emits plain text.
df.head()