In [10]:
import pandas as pd
import numpy as np

In [11]:
# Read the processed review table; change the path below to load a different file.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an
# index that was written out by an earlier to_csv call -- confirm whether it
# should be dropped or read back with index_col=0.
df = pd.read_csv(
    filepath_or_buffer="../../data/processed/02_brad_balanced_bert_labled.csv"
)
df.head(n=5)

Unnamed: 0,rating,review_id,book_id,user_id,review,review_clean,camel_sentiment,camel_score
0,2.0,1665743403,21435637,13637412,قرأتها من فترة طويلة و لا يحضرني فيها الا اعجا...,قراتها من فترة طويلة و لا يحضرني فيها الا اعجا...,0,0.380519
1,2.0,1664872313,20015365,13637412,كان نفسي احب الرواية دي، أغلب اصدقائي اللي قرأ...,كان نفسي احب الرواية دي، اغلب اصدقائي اللي قرا...,-1,0.984207
2,2.0,1659286461,56517018,13637412,عملوا منها مسلسل. اجوف المسلسل ولا اقرى الرواي...,عملوا منها مسلسل. اجوف المسلسل ولا اقرى الرواي...,-1,0.644428
3,2.0,1657686339,22103652,13637412,لطيفه :). كأنك بتتفرج ع مسلسل بس نوعا ما لطيف,لطيفه . كانك بتتفرج ع مسلسل بس نوعا ما لطيف,1,0.959755
4,2.0,1657401919,56445490,13637412,الرواية جميلة تحمل معاني ورسالات كثيرة اراد ال...,الرواية جميلة تحمل معاني ورسالات كثيرة اراد ال...,1,0.824633


In [12]:
# ---------------- 1. Ground-truth sentiment from rating ----------------
def rating_to_gt_sentiment(r):
    """Map a star rating to a ground-truth sentiment label.

    Mapping: rating 1-2 => -1 (negative), rating 3 => 0 (neutral),
    rating 4-5 => 1 (positive). Float ratings such as 2.0 compare equal to
    their integer counterparts and are accepted.

    Parameters
    ----------
    r : int or float
        Star rating, expected to be one of 1, 2, 3, 4, 5.

    Returns
    -------
    int or float
        -1, 0 or 1 for a valid rating; NaN when the rating is missing.

    Raises
    ------
    ValueError
        If ``r`` is present but is not one of the five valid ratings.
    """
    # A missing rating carries no ground truth. NaN is the only value that is
    # not equal to itself, so `r != r` detects it without extra imports.
    if r is None or r != r:
        return float("nan")
    if r in (1, 2):
        return -1
    if r == 3:
        return 0
    if r in (4, 5):
        return 1
    # Previously any unexpected value (0, 2.5, 6, ...) fell through a bare
    # `else` and was silently labelled positive.
    raise ValueError(f"rating must be one of 1, 2, 3, 4, 5; got {r!r}")

In [13]:
# Derive the ground-truth label for every review from its star rating.
df["gt_sentiment"] = df["rating"].map(rating_to_gt_sentiment)
# Show the distinct labels that actually occur, as a quick sanity check.
set(df["gt_sentiment"].tolist())

{-1, 1}

In [14]:
# ---------------- 2. Normalized rating score ----------------
# Linearly rescale the rating from [MIN_RATING, MAX_RATING] onto [0, 1].
MIN_RATING = 1.0
MAX_RATING = 5.0
df["rating_normalized"] = df["rating"].sub(MIN_RATING).div(MAX_RATING - MIN_RATING)

In [15]:
# ---------------- 3. Sentiment score from CAMeLBERT ----------------
def sentiment_score(row):
    """Turn one row's CAMeLBERT prediction into a confidence-weighted score.

    The label in ``row["camel_sentiment"]`` (-1 / 0 / 1) is first placed on a
    0-1 scale (-1 => 0.0, 0 => 0.5, 1 => 1.0) and then multiplied by the
    confidence stored in ``row["camel_score"]``.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys "camel_sentiment" and "camel_score",
        e.g. a DataFrame row.

    Returns
    -------
    float
        The scaled label multiplied by the confidence.

    Raises
    ------
    KeyError
        If the label is not one of -1, 0, 1.
    """
    label_to_anchor = {-1: 0.0, 0: 0.5, 1: 1.0}
    anchor = label_to_anchor[row["camel_sentiment"]]
    confidence = row["camel_score"]
    # NOTE(review): multiplying by confidence pulls uncertain predictions
    # toward 0 (the negative end of the scale) and leaves negative labels at 0
    # whatever their confidence -- confirm this is intended rather than
    # shrinking toward the 0.5 neutral point.
    return anchor * confidence

# Score every review by passing each row of the frame to sentiment_score.
df["sentiment_score"] = df.apply(sentiment_score, axis="columns")

In [16]:
# ---------------- 4. Final score ----------------
# Equal-weight blend of the normalized rating and the sentiment score,
# scaled by MAX_RATING:
#   final_score = (0.5 * rating_normalized + 0.5 * sentiment_score) * MAX_RATING
df["final_score"] = (
    df["rating_normalized"].mul(0.5)
    .add(df["sentiment_score"].mul(0.5))
    .mul(MAX_RATING)
)

In [17]:
# ---------------- 5. Save results ----------------
# Write the enriched table without the row index. The path is kept in one
# variable so the confirmation message always names the file actually written
# (the old message said "brad_final.csv", which is not the file name used).
output_path = "../../data/processed/02_brad_balanced_final.csv"
df.to_csv(output_path, index=False)
print("Saved", output_path)

Saved brad_final.csv


In [18]:
# Sanity check: show the first rows of the input columns next to every
# column derived above.
print(
    df.head()[
        [
            "rating",
            "gt_sentiment",
            "camel_sentiment",
            "camel_score",
            "rating_normalized",
            "sentiment_score",
            "final_score",
        ]
    ]
)

   rating  gt_sentiment  camel_sentiment  camel_score  rating_normalized  \
0     2.0            -1                0     0.380519               0.25   
1     2.0            -1               -1     0.984207               0.25   
2     2.0            -1               -1     0.644428               0.25   
3     2.0            -1                1     0.959755               0.25   
4     2.0            -1                1     0.824633               0.25   

   sentiment_score  final_score  
0         0.190260     1.100649  
1         0.000000     0.625000  
2         0.000000     0.625000  
3         0.959755     3.024388  
4         0.824633     2.686582  
