In [1]:
from pathlib import Path

import pandas as pd
from transformers import AutoTokenizer

In [2]:
# Read the cleaned review dataset from disk, then report its shape and
# the names of its columns.
file_path = "../data/cleaned/data_cleaned_full.csv"
df_clean = pd.read_csv(filepath_or_buffer=file_path)

print(f"Dataset shape: {df_clean.shape}")
print("Columns:", list(df_clean.columns))

Dataset shape: (87812, 47)
Columns: ['review_id', 'user_id', 'user_name', 'hotel_id', 'hotel_name', 'location', 'country', 'distance_center', 'hotel_rating_label', 'price', 'rating', 'review_text', 'review_title', 'review_date', 'traveler_type', 'stay_duration', 'sentiment_predicted', 'distance_center_km', 'hotel_rating', 'price_dzd', 'rating_normalized', 'review_date_parsed', 'review_year', 'review_month', 'review_day', 'review_day_of_week', 'location_area', 'location_city', 'stay_nights', 'review_text_clean', 'review_title_clean', 'review_length', 'review_word_count', 'traveler_type_encoded', 'sentiment_label', 'user_rating_count', 'user_avg_rating', 'user_rating_std', 'user_review_count', 'hotel_rating_count', 'hotel_avg_rating', 'hotel_rating_std', 'hotel_review_count', 'has_review_text', 'has_rating', 'has_date', 'is_complete']


# 2. Load tokenizers

In [3]:
# Load one tokenizer per model from its Hugging Face checkpoint.
# The AraBERT checkpoint is the one the tokenization step relies on, so this
# cell does not fetch a tokenizer that is replaced before it is ever used.
camelbert_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment")
arabert_tokenizer = AutoTokenizer.from_pretrained("kathaem/aubmindlab-arabertv02-base-sentence-transformer-xnli-ar")
arat5_tokenizer = AutoTokenizer.from_pretrained("Noanihio/arat5v2-darja-sentiment")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

# 3. Function to tokenize and return token IDs

In [4]:
def tokenize_text(tokenizer, text, max_length=128):
    """Encode a single text value into token IDs with padding and truncation.

    Parameters
    ----------
    tokenizer
        Object exposing an ``encode`` method that accepts the keyword
        arguments used below (for example a Hugging Face tokenizer).
    text
        Value to tokenize. Strings are passed through unchanged. Missing
        values (``None``, ``NaN``, ``pd.NA``, ``NaT``) are encoded as the
        empty string instead of the literal words "None"/"nan". Any other
        value is converted with ``str()``.
    max_length : int, default 128
        Forwarded to ``tokenizer.encode`` as ``max_length``; used together
        with ``padding='max_length'`` and ``truncation=True``.

    Returns
    -------
    Whatever ``tokenizer.encode`` returns for the prepared text.
    """
    if isinstance(text, str):
        prepared = text
    elif text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        # A missing review must not be tokenized as the word "nan"/"None".
        prepared = ""
    else:
        prepared = str(text)

    return tokenizer.encode(
        prepared,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=max_length
    )

# 4. Load the tokenizer for each model

In [5]:
# Bind one tokenizer per model, each loaded from its Hugging Face checkpoint.
# The checkpoints are listed in the same order as the names on the left:
#   camelbert_tokenizer <- CAMeL-Lab CAMeLBERT-mix sentiment checkpoint
#   arabert_tokenizer   <- kathaem AraBERT v02 sentence-transformer (XNLI-ar) checkpoint
#   arat5_tokenizer     <- Noanihio AraT5v2 Darja sentiment checkpoint
camelbert_tokenizer, arabert_tokenizer, arat5_tokenizer = (
    AutoTokenizer.from_pretrained(checkpoint_name)
    for checkpoint_name in (
        "CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment",
        "kathaem/aubmindlab-arabertv02-base-sentence-transformer-xnli-ar",
        "Noanihio/arat5v2-darja-sentiment",
    )
)

print("✅ All sentiment tokenizers loaded successfully")

✅ All sentiment tokenizers loaded successfully


# 5. Create token-ID columns and save the dataset

In [6]:
# Add one token-ID column per model. Each pair names the output column and
# the tokenizer that fills it from the cleaned review text.
for token_column, model_tokenizer in (
    ("review_token_ids_arabert", arabert_tokenizer),
    ("review_token_ids_camelbert", camelbert_tokenizer),
    ("review_token_ids_arat5v2", arat5_tokenizer),
):
    df_clean[token_column] = df_clean["review_text_clean"].apply(
        lambda review, tok=model_tokenizer: tokenize_text(tok, review)
    )

In [7]:
# Write the frame, including the token-ID columns, to CSV.
# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (otherwise a fresh checkout fails here).
# NOTE(review): list-valued columns are stored in CSV as their text
# representation; readers have to parse them back into lists.
output_path = Path("../data/tokenized/data_cleaned_full_with_tokens.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(output_path, index=False)
print("✅ Dataset saved")


✅ Dataset saved
