In [28]:
import pandas as pd
import numpy as np
import re
import string
from collections import Counter

# Arabic NLP libraries
import pyarabic.araby as araby
from pyarabic.araby import strip_tashkeel, strip_tatweel, normalize_hamza, normalize_alef
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK "stopwords" package so that stopwords.words('arabic') can be
# read later on; the call is a no-op when the package is already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/nbx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
# Read the raw reviews CSV into a DataFrame and print a quick overview:
# overall shape, the first rows, and the number of reviews per rating value.
df = pd.read_csv('../data/row/labr_row.csv')

print(f"Dataset shape: {df.shape}")

print(f"\nFirst few rows:")
first_rows = df.head()
print(first_rows)

print(f"\nRating distribution:")
rating_counts = df['rating'].value_counts().sort_index()  # ordered by rating value
print(rating_counts)

Dataset shape: (63257, 6)

First few rows:
   Unnamed: 0  rating  review_id   user_id   book_id  \
0           0       4  338670838   7878381  13431841   
1           1       4   39428407   1775679   3554772   
2           2       4   32159373   1304410   3554772   
3           3       1  442326656  11333112   3554772   
4           4       5   46492258    580165   3554772   

                                         review_text  
0   "عزازيل الذي صنعناه ،الكامن في أنفسنا" يذكرني...  
1   من أمتع ما قرأت من روايات بلا شك. وحول الشك ت...  
2   رواية تتخذ من التاريخ ،جوًا لها اختار المؤلف ...  
3   إني أقدّر هذه الرواية كثيرا، لسبب مختلف عن أس...  
4   الكاهن الذي أطلق على نفسه اسم هيبا تيمنا بالع...  

Rating distribution:
rating
1     2939
2     5285
3    12201
4    19054
5    23778
Name: count, dtype: int64


# ARABIC TEXT CLEANING FUNCTIONS

In [30]:
def remove_diacritics(text):
    """Strip Arabic diacritics (tashkeel) from ``text``.

    Missing values (anything ``pd.isna`` reports as null) produce an empty
    string; otherwise the work is delegated to ``strip_tashkeel``.
    """
    return "" if pd.isna(text) else strip_tashkeel(text)

def remove_tatweel(text):
    """Drop the tatweel / kashida elongation character (ـ) from ``text``.

    Null input is mapped to an empty string; everything else is handed to
    ``strip_tatweel``.
    """
    is_missing = pd.isna(text)
    if is_missing:
        return ""
    stripped = strip_tatweel(text)
    return stripped

def normalize_arabic(text):
    """Collapse orthographic variants of Arabic letters into one canonical form.

    The substitutions are applied in this order:
      1. Alef variants (إ أ آ ٱ) become bare alef (ا).
      2. Hamza carried on waw/yeh (ؤ ئ) becomes a standalone hamza (ء).
      3. Ta marbuta (ة) followed by a non-word character or the end of the
         string becomes heh (ه); the following character is kept.
      4. Alef maqsura (ى) becomes yeh (ي).

    Null input (per ``pd.isna``) returns an empty string.
    """
    if pd.isna(text):
        return ""

    # (pattern, replacement) pairs, executed top to bottom.
    substitutions = (
        (r'[إأآٱ]', 'ا'),
        (r'[ؤئ]', 'ء'),
        (r'ة([^\w]|$)', r'ه\1'),
        (r'ى', 'ي'),
    )
    normalized = text
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized

def remove_repeating_chars(text):
    """Shorten any run of three or more identical characters to exactly two.

    Example: ``هههههه`` -> ``هه``. Runs of one or two characters are left
    alone. Null input returns an empty string.
    """
    if pd.isna(text):
        return ""
    # A captured character followed by at least two more copies of itself.
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(r'\1\1', text)

def remove_english_text(text):
    """Delete every run of ASCII Latin letters (a-z, A-Z) from ``text``.

    Digits, whitespace and all non-Latin characters are untouched.
    Null input returns an empty string.
    """
    if pd.isna(text):
        return ""
    latin_run = re.compile(r'[a-zA-Z]+')
    return latin_run.sub('', text)

def remove_urls(text):
    """Remove URLs from ``text``.

    Two kinds of token are deleted, each up to the next whitespace:
      * anything beginning with ``http`` (covers ``http://`` and ``https://``)
      * anything beginning with a literal ``www.``

    The dot after ``www`` is escaped so that only a real ``www.`` prefix
    triggers removal; an unescaped dot would match any character and delete
    ordinary tokens that merely start with "www".

    Parameters
    ----------
    text : str or null
        Input text. Null values (per ``pd.isna``) yield an empty string.

    Returns
    -------
    str
        ``text`` with URL tokens removed (surrounding whitespace is kept).
    """
    if pd.isna(text):
        return ""
    return re.sub(r'http\S+|www\.\S+', '', text)

def remove_emails(text):
    """Delete e-mail-like tokens from ``text``.

    A token is any run of non-whitespace characters containing ``@`` with at
    least one non-whitespace character on each side. Null input returns an
    empty string.
    """
    if pd.isna(text):
        return ""
    email_like = re.compile(r'\S+@\S+')
    return email_like.sub('', text)

def remove_mentions_hashtags(text):
    """Strip ``@mention`` and ``#hashtag`` tokens from ``text``.

    Each token is the marker character followed by one or more word
    characters; mentions are removed first, then hashtags. Null input returns
    an empty string.
    """
    if pd.isna(text):
        return ""
    result = text
    for marker_pattern in (r'@\w+', r'#\w+'):
        result = re.sub(marker_pattern, '', result)
    return result

def remove_punctuation(text):
    """Delete Arabic and ASCII punctuation characters from ``text``.

    The set of deleted characters is the hand-written Arabic/symbol list below
    combined with ``string.punctuation``. Characters are removed outright, not
    replaced by spaces. Null input returns an empty string.
    """
    if pd.isna(text):
        return ""
    arabic_punctuation = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!"…"–ـ'''
    chars_to_drop = arabic_punctuation + string.punctuation
    # Mapping a code point to None tells str.translate to delete it.
    deletion_table = dict.fromkeys(map(ord, chars_to_drop))
    return text.translate(deletion_table)

def remove_numbers(text):
    """Remove digit runs, both Western (0-9) and Arabic-Indic (٠-٩).

    Two passes run in order: ``\\d+`` first, then an explicit Arabic-Indic
    range. On ``str`` patterns Python's ``\\d`` already matches any Unicode
    decimal digit, so the second pass acts as an explicit safeguard.
    Null input returns an empty string.
    """
    if pd.isna(text):
        return ""
    digit_patterns = (
        r'\d+',      # decimal digits
        r'[٠-٩]+',   # Arabic-Indic digits
    )
    without_digits = text
    for digit_pattern in digit_patterns:
        without_digits = re.sub(digit_pattern, '', without_digits)
    return without_digits

def remove_extra_whitespace(text):
    """Collapse every whitespace run to one space and trim both ends.

    Null input returns an empty string.
    """
    if pd.isna(text):
        return ""
    tokens = text.split()  # no-arg split: any whitespace run is a separator
    return ' '.join(tokens)

# Compiled once so that applying the function row by row does not rebuild it.
#
# Each range is deliberately bounded. A single span running from U+24C2 up to
# U+1F251 would also swallow most of the Basic Multilingual Plane above
# U+24C2, including Arabic Presentation Forms (e.g. ﷺ and lam-alef ligatures),
# CJK text and Hangul, silently deleting real words.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F000-\U0001F251"  # mahjong/dominoes/cards, enclosed alphanumerics & ideographs
    "\U000024C2-\U00002BFF"  # BMP symbol blocks: enclosed letters, shapes, misc symbols, dingbats, arrows
    "\U0000FE0F"             # variation selector-16 that trails many emoji
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Remove emoji and pictographic symbols from ``text``.

    Only symbol code points are targeted (see ``_EMOJI_PATTERN``); letters of
    any script, including Arabic presentation-form ligatures, are preserved.

    Parameters
    ----------
    text : str or null
        Input text. Null values (per ``pd.isna``) yield an empty string.

    Returns
    -------
    str
        ``text`` with matching symbols deleted (no replacement character).
    """
    if pd.isna(text):
        return ""
    return _EMOJI_PATTERN.sub('', text)

# Lazily-built stopword set shared by all calls (see _get_arabic_stopwords).
_ARABIC_STOPWORDS_CACHE = None


def _get_arabic_stopwords():
    """Return the Arabic stopword set, building it on first use only.

    The set contains the NLTK Arabic list plus a few extra common words, each
    in its original spelling AND in the spelling produced by
    ``normalize_arabic``. The normalized variants are required because the
    preprocessing pipeline normalizes text before stopword removal, which
    turns e.g. إلى into الي; without them such stopwords would never match.
    """
    global _ARABIC_STOPWORDS_CACHE
    if _ARABIC_STOPWORDS_CACHE is None:
        raw_stopwords = set(stopwords.words('arabic'))

        # Add common Arabic stopwords that might be missed
        raw_stopwords.update({
            'من', 'الى', 'إلى', 'عن', 'على', 'في', 'حتى', 'أو', 'و', 'ف', 'ثم',
            'لكن', 'كان', 'هذا', 'هذه', 'ذلك', 'التي', 'الذي', 'كل', 'بعض', 'هل',
            'لا', 'نعم', 'إن', 'أن', 'كما', 'لقد', 'قد', 'ليس', 'غير', 'بل', 'لم'
        })

        normalized_stopwords = {normalize_arabic(word) for word in raw_stopwords}
        _ARABIC_STOPWORDS_CACHE = frozenset(raw_stopwords | normalized_stopwords)
    return _ARABIC_STOPWORDS_CACHE


def remove_arabic_stopwords(text):
    """Remove Arabic stopwords from whitespace-tokenized ``text``.

    Tokens are compared verbatim against the cached stopword set, which holds
    both raw and normalized spellings, so the function works on text before
    or after ``normalize_arabic`` has been applied.

    Parameters
    ----------
    text : str or null
        Input text. Null values (per ``pd.isna``) yield an empty string.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if pd.isna(text):
        return ""
    stopword_set = _get_arabic_stopwords()
    return ' '.join(word for word in text.split() if word not in stopword_set)


# COMPREHENSIVE PREPROCESSING PIPELINE

In [31]:
def preprocess_arabic_text(text, remove_stopwords=False, keep_punctuation=False):
    """
    Run the full Arabic cleaning pipeline on a single piece of text.

    The cleaning steps are applied in a fixed order: diacritics, tatweel,
    letter normalization, URLs, e-mails, mentions/hashtags, emojis, Latin
    letters, digits, repeated characters, then (optionally) punctuation and
    (optionally) stopwords, and finally whitespace collapsing.

    Parameters:
    -----------
    text : str
        Input Arabic text; null values return an empty string
    remove_stopwords : bool
        When True, stopwords are removed (default: False for BERT models)
    keep_punctuation : bool
        When True, punctuation removal is skipped (default: False)

    Returns:
    --------
    str : Preprocessed text
    """
    if pd.isna(text):
        return ""

    # Unconditional steps, in execution order.
    steps = [
        remove_diacritics,
        remove_tatweel,
        normalize_arabic,
        remove_urls,
        remove_emails,
        remove_mentions_hashtags,
        remove_emojis,
        remove_english_text,
        remove_numbers,
        remove_repeating_chars,
    ]

    # Optional steps, controlled by the keyword flags.
    if not keep_punctuation:
        steps.append(remove_punctuation)
    if remove_stopwords:
        # usually not needed for BERT
        steps.append(remove_arabic_stopwords)

    # Whitespace cleanup always runs last.
    steps.append(remove_extra_whitespace)

    cleaned = text
    for step in steps:
        cleaned = step(cleaned)

    return cleaned.strip()

# APPLY PREPROCESSING

In [32]:
print("\n" + "="*80)
print("PREPROCESSING ARABIC REVIEWS")
print("="*80)

# Work on a copy so the raw frame `df` stays untouched for later comparison.
df_clean = df.copy()

# Clean every review. Stopwords and punctuation are kept (see the flags).
print("\nApplying preprocessing pipeline...")
df_clean['review_text_clean'] = df_clean['review_text'].apply(
    lambda x: preprocess_arabic_text(x, remove_stopwords=False, keep_punctuation=True)
)

# Drop reviews that became empty after cleaning. The explicit .copy() makes
# the result an independent frame, so adding columns to it in later cells
# cannot write to a slice of the previous frame (SettingWithCopyWarning).
df_clean = df_clean[df_clean['review_text_clean'].str.strip() != ''].copy()

print(f"\nOriginal dataset size: {len(df)}")
print(f"Cleaned dataset size: {len(df_clean)}")
print(f"Removed {len(df) - len(df_clean)} empty reviews")


PREPROCESSING ARABIC REVIEWS

Applying preprocessing pipeline...

Original dataset size: 63257
Cleaned dataset size: 63254
Removed 3 empty reviews


# DATA QUALITY CHECKS

In [33]:
print("\n" + "="*80)
print("DATA QUALITY CHECKS")
print("="*80)

# Word count of each cleaned review. `assign` returns a new frame instead of
# writing a column into a frame that may itself be a filtered slice, which
# avoids chained-assignment warnings and ambiguous writes.
df_clean = df_clean.assign(
    text_length=df_clean['review_text_clean'].str.split().str.len()
)

print("\nText length statistics:")
print(df_clean['text_length'].describe())

# Remove very short reviews (less than 3 words)
min_words = 3
df_clean = df_clean[df_clean['text_length'] >= min_words].copy()
print(f"\nRemoved reviews with less than {min_words} words")
print(f"Final dataset size: {len(df_clean)}")


DATA QUALITY CHECKS

Text length statistics:
count    63254.000000
mean        61.875344
std        107.856722
min          1.000000
25%         14.000000
50%         31.000000
75%         67.000000
max       3403.000000
Name: text_length, dtype: float64

Removed reviews with less than 3 words
Final dataset size: 61695


# PREVIEW RESULTS

In [34]:
print("\n" + "="*80)
print("PREPROCESSING EXAMPLES")
print("="*80)

# Maximum number of characters of each text shown in the preview.
preview_chars = 200


def _preview(text, limit=preview_chars):
    """Return the first ``limit`` characters, adding "..." only if text was cut."""
    return text[:limit] + ("..." if len(text) > limit else "")


# Show some examples. Numbering uses enumerate rather than the index label:
# after filtering, labels are no longer guaranteed to be consecutive.
for example_no, idx in enumerate(df_clean.head(5).index, start=1):
    original_text = df_clean.loc[idx, 'review_text']
    cleaned_text = df_clean.loc[idx, 'review_text_clean']

    print(f"\n--- Example {example_no} ---")
    print(f"Rating: {df_clean.loc[idx, 'rating']}")
    print(f"\nOriginal ({len(original_text)} chars):")
    print(_preview(original_text))
    print(f"\nCleaned ({len(cleaned_text)} chars):")
    print(_preview(cleaned_text))


PREPROCESSING EXAMPLES

--- Example 1 ---
Rating: 4

Original (618 chars):
 "عزازيل الذي صنعناه ،الكامن في أنفسنا" يذكرني يوسف زيدان بــ بورخس في استخدامه لحيلته الفنية،وخداع القاريء بأن الرواية ترجمة لمخطوط قديم. الهوامش المخترعة و اختلاق وجود مترجـِم عاد بي إلى بورخس و هوا...

Cleaned (593 chars):
"عزازيل الذي صنعناه ،الكامن في انفسنا" يذكرني يوسف زيدان ب بورخس في استخدامه لحيلته الفنيه،وخداع القاريء بان الروايه ترجمه لمخطوط قديم. الهوامش المخترعه و اختلاق وجود مترجم عاد بي الي بورخس و هوامشه و

--- Example 2 ---
Rating: 4

Original (86 chars):
 من أمتع ما قرأت من روايات بلا شك. وحول الشك تدندن (عزازيل) بلا هوادة. أحمد الديب 2008...

Cleaned (80 chars):
من امتع ما قرات من روايات بلا شك. وحول الشك تدندن (عزازيل) بلا هواده. احمد الديب

--- Example 3 ---
Rating: 4

Original (193 chars):
 رواية تتخذ من التاريخ ،جوًا لها اختار المؤلف فترة تاريخية ندر من يتناولها روائيًا. مكتوبة بدقة وإتقان وجمال.من أروع ما يمكن أن تقرأ من الروايات التاريخية. تركز على الإنسان.صانع المعنى ومدمره ....

Clea

In [35]:
# Persist the cleaned reviews. index=False keeps the row index out of the
# file; writing it would add an unnamed column that reloads as "Unnamed: 0",
# the same artifact already visible in the raw input file.
df_clean.to_csv('../data/processed/labr_cleaned.csv', index=False)