In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random

def scrape_site(url, p_tag='p', min_length=30):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all(p_tag)
    texts = [p.get_text().strip() for p in paragraphs if len(p.get_text().strip()) > min_length]
    return texts

# Scrape Al Jazeera
aljazeera_url = "https://www.aljazeera.net/news"
texts_jazeera = scrape_site(aljazeera_url)

# Scrape Al Arabiya
alarabiya_url = "https://www.alarabiya.net/arab-and-world"
texts_arabiya = scrape_site(alarabiya_url)

# Combine and assign random scores
combined_texts = texts_jazeera[:25] + texts_arabiya[:25]
data = [{'text': text, 'score': round(random.uniform(0, 10), 1)} for text in combined_texts]

# Save to CSV
df = pd.DataFrame(data)
df.to_csv('arabic_combined_dataset.csv', index=False, encoding='utf-8-sig')

print("✅ Scraped and saved dataset from Al Jazeera and Al Arabiya.")
print(df.head())


✅ Scraped and saved dataset from Al Jazeera and Al Arabiya.
                                                text  score
0  كشفت بيانات ملاحية أن سفينة متجهة لإسرائيل رست...    0.3
1  خرجت مظاهرات حاشدة في عدد من العواصم والمدن حو...    1.6
2  شهدت منصات التواصل الاجتماعي في مصر حالة من ال...    9.3
3  طوت الولايات المتحدة الأميركية وإيران صفحة الج...    2.1
4  عادت طائرة بوينغ 737 ماكس كانت مخصصة لشركة طير...    7.0


In [4]:
import pandas as pd
import re
import requests
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar

# ✅ Download Arabic stopwords from GitHub
stopwords_url = 'https://raw.githubusercontent.com/mohataher/arabicstopwords/master/list.txt'
response = requests.get(stopwords_url)
arabic_stopwords = set(response.text.splitlines())

# ✅ Load the dataset
df = pd.read_csv("arabic_combined_dataset.csv")

# ✅ Clean and preprocess
def preprocess(text):
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)  # Keep Arabic characters only
    text = dediac_ar(text)  # Remove diacritics
    tokens = simple_word_tokenize(text)
    tokens = [word for word in tokens if word not in arabic_stopwords]
    return ' '.join(tokens)

df['clean_text'] = df['text'].apply(preprocess)
df.dropna(inplace=True)

print("✅ Preprocessing complete. Sample:")
print(df[['clean_text', 'score']].head())

✅ Preprocessing complete. Sample:
                                          clean_text  score
0  كشفت بيانات ملاحية أن سفينة متجهة لإسرائيل رست...    0.3
1  خرجت مظاهرات حاشدة في عدد من العواصم والمدن حو...    1.6
2  شهدت منصات التواصل الاجتماعي في مصر حالة من ال...    9.3
3  طوت الولايات المتحدة الأميركية وإيران صفحة الج...    2.1
4  عادت طائرة بوينغ ماكس كانت مخصصة لشركة طيران ص...    7.0


In [8]:

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Tokenize and pad
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['clean_text'])
X = tokenizer.texts_to_sequences(df['clean_text'])
X = pad_sequences(X, maxlen=100)
y = df['score'].values

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build model function
def build_model(model_type='RNN'):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=100))

    if model_type == 'RNN':
        model.add(tf.keras.layers.SimpleRNN(64))
    elif model_type == 'BiRNN':
        model.add(tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(64)))
    elif model_type == 'GRU':
        model.add(tf.keras.layers.GRU(64))
    elif model_type == 'LSTM':
        model.add(tf.keras.layers.LSTM(64))

    model.add(tf.keras.layers.Dense(1))  # Regression
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Train all models
models = ['RNN', 'BiRNN', 'GRU', 'LSTM']
trained_models = {}

for name in models:
    print(f"\n🔧 Training {name} model...")
    model = build_model(name)
    model.fit(X_train, y_train, epochs=5, batch_size=16, validation_split=0.2)
    trained_models[name] = model



🔧 Training RNN model...
Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 33.2224 - mae: 4.8422 - val_loss: 5.4661 - val_mae: 2.1873
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step - loss: 28.3096 - mae: 4.3677 - val_loss: 5.0442 - val_mae: 2.1152
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221ms/step - loss: 23.7490 - mae: 3.9456 - val_loss: 3.9964 - val_mae: 1.8758
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 290ms/step - loss: 19.5147 - mae: 3.4957 - val_loss: 2.4516 - val_mae: 1.3917
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step - loss: 16.3631 - mae: 3.2190 - val_loss: 1.3487 - val_mae: 0.8923

🔧 Training BiRNN model...
Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 31.2876 - mae: 4.6800 - val_loss: 3.2054 - val_mae: 1.5612
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate all models
for name, model in trained_models.items():
    y_pred = model.predict(X_test).flatten()
    print(f"\n📊 Evaluation for {name} model:")
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
    print(f"R² Score: {r2_score(y_test, y_pred):.2f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step

📊 Evaluation for RNN model:
MAE: 1.58
MSE: 3.01
R² Score: -0.29
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 339ms/step

📊 Evaluation for BiRNN model:
MAE: 1.58
MSE: 2.52
R² Score: -0.08
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 274ms/step

📊 Evaluation for GRU model:
MAE: 2.33
MSE: 7.68
R² Score: -2.29
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 289ms/step

📊 Evaluation for LSTM model:
MAE: 2.35
MSE: 7.85
R² Score: -2.36
