In [62]:
import numpy as np
import pandas as pd
import torch

In [63]:
# Load the processed feature table that the rest of the notebook works on.
df = pd.read_csv(
    filepath_or_buffer='../../data/processed/02_brad_features.csv',
)

In [64]:
# Plain-text peek at the first row to confirm which columns were loaded.
print(df.iloc[:1])

   Unnamed: 0  rating   review_id   book_id   user_id  \
0           0     2.0  1665743403  21435637  13637412   

                                              review  \
0  قرأتها من فترة طويلة و لا يحضرني فيها الا اعجا...   

                                        review_clean  camel_sentiment  \
0  قراتها من فترة طويلة و لا يحضرني فيها الا اعجا...                0   

   camel_score  gt_sentiment  ...  sentiment_score  final_score  len_chars  \
0     0.380519            -1  ...          0.19026     1.100649        167   

   len_words  num_exclam  num_qmark  len_chars_norm  len_words_norm  \
0       30.0         0.0        0.0        0.008263        0.007937   

   num_exclam_norm  num_qmark_norm  
0              0.0             0.0  

[1 rows x 21 columns]


In [65]:
# Work from two text columns: the cleaned review (for lengths) and the
# original review (for punctuation counts).
clean_text = df["review_clean"]
raw_text = df["review"]

# Length features: characters per review and whitespace-separated tokens per review.
df["len_chars"] = clean_text.str.len()
df["len_words"] = clean_text.str.split().str.len().astype(float)

# Punctuation counts (optional features). str.count takes a regex, so the
# question mark has to be escaped.
# NOTE(review): only the ASCII '?' is counted; if reviews use the Arabic
# question mark those occurrences are not included — confirm that is intended.
df["num_exclam"] = raw_text.str.count("!").astype(float)
df["num_qmark"] = raw_text.str.count("\\?").astype(float)

In [66]:
# Normalize some columns to [0,1]
def minmax(col, frame=None):
    """Min-max scale one column into the [0, 1] range.

    Args:
        col: Name of the column to scale.
        frame: DataFrame to read the column from. When omitted, the
            notebook-level ``df`` is used, so existing ``minmax("name")``
            calls behave as before.

    Returns:
        A float NumPy array with one entry per row of the source frame.
        Missing values stay NaN and are ignored when finding the minimum
        and maximum. A column whose non-missing values are all equal maps
        to 0.0 (there is no range to scale by). A column that is entirely
        NaN comes back entirely NaN.

    Raises:
        KeyError: If ``col`` is not a column of the source frame.
        ValueError: If the column is empty or cannot be cast to float.
    """
    source = df if frame is None else frame
    values = source[col].astype(float).to_numpy()

    # nan-aware extremes: with plain min()/max() a single missing value would
    # make both bounds NaN and wipe out the whole normalized column.
    lo, hi = np.nanmin(values), np.nanmax(values)

    if hi == lo:
        # Constant column: dividing by (hi - lo) would be 0/0, so emit zeros
        # while keeping missing entries marked as missing.
        return np.where(np.isnan(values), np.nan, 0.0)
    return (values - lo) / (hi - lo)

# Normalized column -> raw column it is derived from. Insertion order is the
# order in which the new columns are added to df.
norm_sources = {
    "len_chars_norm": "len_chars",
    "len_words_norm": "len_words",
    "num_exclam_norm": "num_exclam",
    "num_qmark_norm": "num_qmark",
}

for norm_col, raw_col in norm_sources.items():
    df[norm_col] = minmax(raw_col)

In [67]:
# Select scalar features for each review (same order as y)
feature_cols = [
    "len_chars_norm",
    "len_words_norm",
    "camel_score",
    "sentiment_score",
    "final_score",
    "num_exclam_norm",
    "num_qmark_norm",
]

edge_feats_np = df[feature_cols].astype(float).to_numpy()   # shape: [num_edges, F]
edge_feats = torch.from_numpy(edge_feats_np).float()

In [68]:
# List the columns currently on df (loaded columns plus the derived features).
df.columns

Index(['Unnamed: 0', 'rating', 'review_id', 'book_id', 'user_id', 'review',
       'review_clean', 'camel_sentiment', 'camel_score', 'gt_sentiment',
       'rating_normalized', 'sentiment_score', 'final_score', 'len_chars',
       'len_words', 'num_exclam', 'num_qmark', 'len_chars_norm',
       'len_words_norm', 'num_exclam_norm', 'num_qmark_norm'],
      dtype='object')

In [None]:
# Drop the 'Unnamed: 0' column (presumably a row index saved by an earlier
# to_csv call — confirm against the upstream notebook).
# errors='ignore' keeps this cell idempotent: re-running it once the column is
# already gone is a no-op instead of a KeyError.
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [70]:
# Re-inspect the columns after the 'Unnamed: 0' drop in the previous cell.
# NOTE(review): the recorded output still lists 'Unnamed: 0' and the drop cell
# has no execution count, so it looks like that cell was not run before this
# one — re-run the notebook top to bottom to refresh the output.
df.columns

Index(['Unnamed: 0', 'rating', 'review_id', 'book_id', 'user_id', 'review',
       'review_clean', 'camel_sentiment', 'camel_score', 'gt_sentiment',
       'rating_normalized', 'sentiment_score', 'final_score', 'len_chars',
       'len_words', 'num_exclam', 'num_qmark', 'len_chars_norm',
       'len_words_norm', 'num_exclam_norm', 'num_qmark_norm'],
      dtype='object')

In [72]:
# Persist the feature table. index=False stops pandas from writing the row
# index as an extra unnamed first column, which would come back as
# 'Unnamed: 0' the next time this file is loaded with pd.read_csv.
df.to_csv('../../data/processed/02_brad_features_rm.csv', index=False)