In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the reviews that already carry CAMeLBERT sentiment predictions
# (adjust filename if needed).
# The CSV contains a serialized index that pandas reads back as a data column
# named "Unnamed: 0"; drop it so it is not carried into the saved results.
# errors="ignore" keeps this cell working if the file is regenerated without it.
df = pd.read_csv(
    "../../data/processed/brad_reviews_with_camel_sentiment.csv"
).drop(columns=["Unnamed: 0"], errors="ignore")
df.head()

Unnamed: 0,rating,review_id,book_id,user_id,review,review_clean,camel_sentiment,camel_score
0,4,1682581870,57098525,13637412,صراع الجذور والانتماء، عقلة ساق الخيزان توائم ...,صراع الجذور والانتماء، عقلة ساق الخيزان توائم ...,1,0.626534
1,5,1682385404,56693085,13637412,كتاب رائع. اعتقد ان الروايه كلها تلخصت بجمله و...,كتاب رائع. اعتقد ان الروايه كلها تلخصت بجمله و...,1,0.520128
2,4,1682039752,30836455,13637412,رواية تلامس الروح بعمقها، فخورة اني اخيرا لقيت...,رواية تلامس الروح بعمقها، فخورة اني اخيرا لقيت...,1,0.995917
3,5,1681553886,6680940,13637412,رواية محكمة بكل اختصار. وكان الجزء المفضل بالن...,رواية محكمة بكل اختصار. وكان الجزء المفضل بالن...,-1,0.987505
4,3,1681248984,19011044,13637412,هذا الكتاب يحزن مرا، ظلم واضطهاد عيسى بلا ذنب ...,هذا الكتاب يحزن مرا، ظلم واضطهاد عيسى بلا ذنب ...,-1,0.996008


In [4]:
# ---------------- 1. Ground-truth sentiment from rating ----------------
# rating 1–2 => -1, rating 3 => 0, rating 4–5 => 1
def rating_to_gt_sentiment(r):
    """Map a 1–5 star rating to a ground-truth sentiment label.

    Parameters
    ----------
    r : int or float
        Star rating; must equal one of 1, 2, 3, 4 or 5.

    Returns
    -------
    int
        -1 for ratings 1–2, 0 for rating 3, 1 for ratings 4–5.

    Raises
    ------
    ValueError
        If ``r`` is not one of the five valid ratings (e.g. NaN, 0, 6, 2.5).
        Previously any such value fell through and was labelled positive.
    """
    if r in (1, 2):
        return -1
    if r == 3:
        return 0
    if r in (4, 5):
        return 1
    # NaN compares unequal to everything, so missing ratings also land here.
    raise ValueError(f"rating must be one of 1, 2, 3, 4, 5; got {r!r}")

In [9]:
# Derive the ground-truth label for every review from its star rating,
# then show the distinct labels that actually occur as a sanity check.
df["gt_sentiment"] = df["rating"].map(rating_to_gt_sentiment)
set(df["gt_sentiment"])

{-1, 0, 1}

In [10]:
# ---------------- 2. Normalized rating score ----------------
# Linear min-max rescaling of the 1–5 rating onto 0–1
MIN_RATING = 1.0
MAX_RATING = 5.0
rating_span = MAX_RATING - MIN_RATING
df["rating_normalized"] = (df["rating"] - MIN_RATING) / rating_span

In [11]:
# ---------------- 3. Sentiment score from CAMeLBERT ----------------
# camel_sentiment is -1 / 0 / 1 and camel_score is confidence (0–1)
# Convert to sentiment_score in [0,1]:
# -1 => 0,  0 => 0.5,  1 => 1, then weight by confidence
def sentiment_score(row):
    """Turn one row's CAMeLBERT prediction into a confidence-weighted score.

    The label in ``row["camel_sentiment"]`` (-1, 0 or 1) is mapped to an
    anchor value (0.0, 0.5 or 1.0) and multiplied by ``row["camel_score"]``.
    A label outside that set raises ``KeyError``.

    NOTE(review): because the negative anchor is 0.0, a negative prediction
    always scores 0 regardless of confidence, and a low-confidence neutral
    prediction lands below 0.5 — confirm this asymmetry is intended.
    """
    anchors = {-1: 0.0, 0: 0.5, 1: 1.0}
    label = row["camel_sentiment"]
    return anchors[label] * row["camel_score"]

# Score every review row-wise, then attach the result as a new column.
row_scores = df.apply(sentiment_score, axis=1)
df["sentiment_score"] = row_scores

In [12]:
# ---------------- 4. Final score ----------------
# Equal-weight blend of the normalized rating and the sentiment score,
# scaled by the maximum rating:
# final_score = (0.5 * rating_normalized + 0.5 * sentiment_score) * max_rating
RATING_WEIGHT = 0.5
SENTIMENT_WEIGHT = 0.5
blended = RATING_WEIGHT * df["rating_normalized"] + SENTIMENT_WEIGHT * df["sentiment_score"]
df["final_score"] = blended * MAX_RATING

In [13]:
# ---------------- 5. Save results ----------------
# Write the enriched frame to disk without the pandas index column.
output_path = "../../data/processed/brad_final.csv"
df.to_csv(output_path, index=False)
print("Saved brad_final.csv")

Saved brad_final.csv


In [14]:
# Quick check: print the first rows of the input and derived score columns
check_columns = [
    "rating",
    "gt_sentiment",
    "camel_sentiment",
    "camel_score",
    "rating_normalized",
    "sentiment_score",
    "final_score",
]
print(df[check_columns].head())

   rating  gt_sentiment  camel_sentiment  camel_score  rating_normalized  \
0       4             1                1     0.626534               0.75   
1       5             1                1     0.520128               1.00   
2       4             1                1     0.995917               0.75   
3       5             1               -1     0.987505               1.00   
4       3             0               -1     0.996008               0.50   

   sentiment_score  final_score  
0         0.626534     3.441334  
1         0.520128     3.800320  
2         0.995917     4.364793  
3         0.000000     2.500000  
4         0.000000     1.250000  
