# NLP

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch import softmax
import pandas as pd
from tqdm import tqdm

In [86]:
def compute_weighted_rating(df_predictions_batch: pd.DataFrame) -> pd.DataFrame:
    """Turn per-class probabilities into a continuous rating and a confidence.

    For every row the predicted class is the column with the highest value.
    The final rating is the probability-weighted mean of the predicted class
    and its direct neighbours (one below and one above, when they exist); the
    confidence is the sum of the probabilities that entered that mean.

    Args:
        df_predictions_batch: One row per review and five columns holding the
            scores of ratings 1 to 5, in that positional order. The column
            labels must be convertible to ``int`` (e.g. ``'1'`` ... ``'5'``).

    Returns:
        A DataFrame with the columns ``"rating"`` and ``"confidence"`` that
        keeps the input index labels. Rows are grouped by predicted class in
        ascending order, so the row order may differ from the input order.
    """
    if df_predictions_batch.empty:
        return pd.DataFrame(columns=["rating", "confidence"])

    # Label of the most probable class for each row, as an integer 1..5.
    predicted_classes = df_predictions_batch.idxmax(axis=1).astype(int)

    grouped_results = []
    for predicted_rating in range(1, 6):
        group = df_predictions_batch[predicted_classes == predicted_rating]
        if group.empty:
            # Nothing predicted for this class; skipping avoids concatenating
            # empty frames.
            continue

        # Ratings taking part in the weighted mean: the predicted one and its
        # neighbours, clipped to the valid 1..5 range. Handling both borders
        # through the same code path guarantees that a prediction of 5 still
        # counts the 4-star probability with rating 4.
        window = [
            rating
            for rating in (predicted_rating - 1, predicted_rating, predicted_rating + 1)
            if 1 <= rating <= 5
        ]

        weighted_sum = 0
        confidence = 0
        for rating in window:
            weight = group.iloc[:, rating - 1]  # column position = rating - 1
            weighted_sum = weighted_sum + weight * rating
            confidence = confidence + weight

        grouped_results.append(
            pd.DataFrame({
                "rating": weighted_sum / confidence,
                "confidence": confidence,
            })
        )

    if not grouped_results:
        return pd.DataFrame(columns=["rating", "confidence"])
    return pd.concat(grouped_results, axis=0)

def predict_rating(reviews: pd.Series, tokenizer: AutoTokenizer, device: torch.device, model: AutoModelForSequenceClassification) -> pd.DataFrame:
    """Score a batch of review texts with a sequence-classification model.

    Args:
        reviews: Review texts; its index is reused for the returned frame.
        tokenizer: Tokenizer called on the list of texts with padding and
            truncation to at most 512 tokens, returning PyTorch tensors.
        device: Device the encoded tensors are moved to before inference.
        model: Model whose output exposes ``logits``; the five output columns
            assume it produces five classes.

    Returns:
        A DataFrame with one row per review and the columns ``'1'`` to
        ``'5'`` holding the softmax of the logits along the class dimension.
    """
    texts = reviews.tolist()
    tokens = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')

    # Move every tensor produced by the tokenizer onto the target device.
    model_inputs = {}
    for name, tensor in tokens.items():
        model_inputs[name] = tensor.to(device)

    # Inference only: no gradient bookkeeping needed.
    with torch.no_grad():
        logits = model(**model_inputs).logits

    probabilities = softmax(logits, dim=1).cpu().numpy()
    class_labels = ['1', '2', '3', '4', '5']
    return pd.DataFrame(list(probabilities), columns=class_labels, index=reviews.index)

In [None]:
# Load the ratings table; only its 'text' column is used below.
df_ratings = pd.read_csv("../data/beer_advocate/ratings.csv")

HUGGING_FACE_MODEL = "nlptown/bert-base-multilingual-uncased-sentiment"
BATCH_SIZE = 100  # number of reviews sent through the model per forward pass

tokenizer = AutoTokenizer.from_pretrained(HUGGING_FACE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(HUGGING_FACE_MODEL)

# Run on the MPS backend when PyTorch reports it as available, else on CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

# Keep only reviews longer than 377 characters.
# NOTE(review): the 377 threshold is not explained here — confirm its origin.
df_reviews = df_ratings[df_ratings['text'].str.len() > 377]['text']
n_rows = len(df_reviews)

# Collect the per-batch results and concatenate once at the end: calling
# pd.concat inside the loop copies the whole accumulated frame every iteration.
prediction_batches = []

for start in tqdm(range(0, n_rows, BATCH_SIZE)):

    end = min(start + BATCH_SIZE, n_rows)
    batch = df_reviews.iloc[start:end]
    df_predictions_batch = predict_rating(batch, tokenizer, device, model)
    df_final_prediction_batch = compute_weighted_rating(df_predictions_batch)

    prediction_batches.append(df_final_prediction_batch)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# review passed the length filter.
df_final_predictions = (
    pd.concat(prediction_batches, axis=0) if prediction_batches else pd.DataFrame()
)


# Beer Knowledge

\begin{equation}
K_{L,u} = \max\limits_{\mathcal{S}_i}\left(\frac{|\mathcal{S}_{i,u}|}{|\mathcal{S}_i|}\right)
\end{equation}

\begin{equation}
K_{G,u} = \frac{|\mathcal{S}_u|}{|\mathcal{S}|} \cdot \frac{\log(1 + \overline{|\mathcal{S}_{i,u}|})}{\log(1 + \overline{|\mathcal{S}_i|})}
\end{equation}

\begin{equation}
K_u = K_{G,u} \cdot \mathrm{gini}_u + K_{L,u} \cdot (1 - \mathrm{gini}_u)
\end{equation}