# Lab

##### Objective : The main purpose behind this lab is to get familiar with NLP language models using Pytorch library.

## Part 1 : Classification Regression

### Step 1 : Data Collection

In [1]:
# Import necessary libraries
import scrapy
from scrapy.crawler import CrawlerProcess
import random
import json

class ArabicProverbsSpider(scrapy.Spider):
    name = "arabic_proverbs"
    start_urls = ['https://arabpoems.com/حكم-وأمثال/']

    def parse(self, response):
        # Extracting the content of each <li> inside a <ul> inside a <div>
        proverbs = response.css('div.entry-content ul li::text, div.entry-content ul li strong::text').getall()

        # Clean and structure the data
        data = []
        for proverb in proverbs:
            proverb_encoded = proverb.strip().encode('utf-8').decode('utf-8')
            data.append({'Text': proverb_encoded, 'Score': round(random.uniform(0, 10), 1)})

        # Save the data to a JSON file
        with open('arabic_proverbs.json', 'w', encoding='utf-8') as jsonfile:
            json.dump(data, jsonfile, ensure_ascii=False, indent=4)

# Run the Scrapy crawler
process = CrawlerProcess()
process.crawl(ArabicProverbsSpider)
process.start()


2024-05-26 13:39:18 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2024-05-26 13:39:18 [scrapy.utils.log] INFO: Versions: lxml 5.1.0.0, libxml2 2.10.3, cssselect 1.2.0, parsel 1.9.0, w3lib 2.1.2, Twisted 24.3.0, Python 3.10.0 (tags/v3.10.0:b494f59, Oct  4 2021, 19:00:18) [MSC v.1929 64 bit (AMD64)], pyOpenSSL 24.0.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.5, Platform Windows-10-10.0.19045-SP0
2024-05-26 13:39:18 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-05-26 13:39:18 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2024-05-26 13:39:18 [scrapy.extensions.telnet] INFO: Telnet Password: 491e2360483149a1
2024-05-26 13:39:19 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.log

In [20]:
# Import necessary libraries
import pandas as pd

# Read the JSON file into a DataFrame
df = pd.read_json('arabic_proverbs.json', encoding='utf-8')

# Display the DataFrame
df.head()

Unnamed: 0,Text,Score
0,أشفق عليك في شبابك حتى تجد من يشفق عليك وأنت ر...,3.3
1,من صاحب العلماء وقر- مثل لبناني.,4.3
2,قال الإمام علي بن أبي طالب: ليس اليتيم من مات ...,8.8
3,النجاح لا يحتاج إلى أقدام بل إلى إقدام.,9.1
4,روى الأصمعي في احد مؤلفاته: أول العلم الصمت وا...,2.3


### Step 2 : Data Preparation

In [21]:
import re
import unicodedata
from nltk.corpus import stopwords
import nltk
import stanza


# Download the Arabic models for the neural pipeline
nlp = stanza.Pipeline('ar', processors='tokenize,lemma')

# Remove Diacritization
def remove_diacritics(text):
    arabic_diacritics = re.compile("""
                             ّ    | # Shadda
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    text = re.sub(arabic_diacritics, '', text)
    return text

# Text Cleaning and Preprocessing
def clean_text(text):
    # Remove extra whitespace and punctuation/special characters
    pattern = r"[^\w\s]"  # Matches characters that are not alphanumeric or whitespace
    text = re.sub(pattern, '', text)
    # Remove stopwords
    arabic_stopwords = stopwords.words("arabic")
    text = ' '.join(word for word in text.split() if word not in arabic_stopwords)
    # Remove Diactitics
    text = remove_diacritics(text)
    # Normalize characters for consistent representation (especially for Arabic)
    text = unicodedata.normalize('NFKD', text)
    return text

# Text Tokenization
def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    return tokens

# Lemmatization using Stanza Library
def lemmatize_text(text):
    # Process the text
    doc = nlp(text)
    # Lemmatize the tokens
    lemmatized_tokens = [word.lemma for sent in doc.sentences for word in sent.words]
    return lemmatized_tokens

2024-05-26 22:08:27 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-26 22:08:33 INFO: Downloaded file to C:\Users\admin\stanza_resources\resources.json
2024-05-26 22:08:33 INFO: Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| lemma     | padt_nocharlm |

2024-05-26 22:08:33 INFO: Using device: cpu
2024-05-26 22:08:33 INFO: Loading: tokenize
2024-05-26 22:08:33 INFO: Loading: mwt
2024-05-26 22:08:33 INFO: Loading: lemma
2024-05-26 22:08:33 INFO: Done loading processors!


In [22]:
# Apply the clean_text function
df['Clean_Text'] = df['Text'].apply(clean_text)

# Apply the tokenize_text function
df['Tokenized_Text'] = df['Clean_Text'].apply(tokenize_text)

# Apply the lemmatize_text function
df['Lemmatized_Text'] = df['Clean_Text'].apply(lemmatize_text)

# Display the DataFrame
df.head()

Unnamed: 0,Text,Score,Clean_Text,Tokenized_Text,Lemmatized_Text
0,أشفق عليك في شبابك حتى تجد من يشفق عليك وأنت ر...,3.3,أشفق شبابك تجد يشفق وأنت رجل كبير,"[أشفق, شبابك, تجد, يشفق, وأنت, رجل, كبير]","[أشفق, شبابك, وَجَد, يشفق, وأنت, رَجُل, كَبِير]"
1,من صاحب العلماء وقر- مثل لبناني.,4.3,صاحب العلماء وقر لبناني,"[صاحب, العلماء, وقر, لبناني]","[صَاحِب, عَالِم, وَقرَة, لُبنَانِيّ]"
2,قال الإمام علي بن أبي طالب: ليس اليتيم من مات ...,8.8,قال الإمام علي بن أبي طالب اليتيم مات والده ...,"[قال, الإمام, علي, بن, أبي, طالب, اليتيم, ما...","[قَال, الإمام, عَلَى, بِن, أبي, طَالَب, اليت..."
3,النجاح لا يحتاج إلى أقدام بل إلى إقدام.,9.1,النجاح يحتاج أقدام إقدام,"[النجاح, يحتاج, أقدام, إقدام]","[نَجَاح, اِحتَاج, أقدام, إقدام]"
4,روى الأصمعي في احد مؤلفاته: أول العلم الصمت وا...,2.3,روى الأصمعي احد مؤلفاته العلم الصمت والثاني ...,"[روى, الأصمعي, احد, مؤلفاته, العلم, الصمت, و...","[روى, الأصمعي, أَحَد, مو<UNK>لفات, هُوَ, عَلَ..."


In [23]:
# Drop unnecessary columns
df.drop(columns=['Text', 'Clean_Text', 'Tokenized_Text'], inplace=True)
df

Unnamed: 0,Score,Lemmatized_Text
0,3.3,"[أشفق, شبابك, وَجَد, يشفق, وأنت, رَجُل, كَبِير]"
1,4.3,"[صَاحِب, عَالِم, وَقرَة, لُبنَانِيّ]"
2,8.8,"[قَال, الإمام, عَلَى, بِن, أبي, طَالَب, اليت..."
3,9.1,"[نَجَاح, اِحتَاج, أقدام, إقدام]"
4,2.3,"[روى, الأصمعي, أَحَد, مو<UNK>لفات, هُوَ, عَلَ..."
...,...,...
282,4.8,"[المتشائم, رَأَى, حَيَاة, ظُلُّه]"
283,2.6,"[إني, لِ, أ<UNK>عجب, يظن, حَيَاة, شِيِّيّ, وَ..."
284,9.8,"[حُرِّيَّة, حَيَاة, حُرِّيَّة, بِ, لَا, فَضِيلَة]"
285,4.4,"[لَحِيايّ, قِيمَة, وَجدَنَا, شِيِّيّ, نناضل, ا..."


### Step 3 : Model Training and Evaluation

In [50]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from nltk.translate.bleu_score import corpus_bleu
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Define your dataset class
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx]), torch.tensor(self.y[idx])

# Define your RNN model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        output = self.fc(output[:, -1, :])
        return output

# Define your hyperparameters
input_size = 10000  # Vocabulary size
hidden_size = 128
epochs = 20
batch_size = 32
learning_rate = 0.001

# Preprocessing
X = df['Lemmatized_Text'].values
y = df['Score'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad the sequences
max_length = max(len(seq) for seq in X_train)
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')

max_length_test = max(len(seq) for seq in X_test)
X_test = pad_sequences(X_test, maxlen=max_length_test, padding='post')

# Create DataLoader for training and testing
train_dataset = MyDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = MyDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize model, loss function, and optimizer
model = RNNModel(input_size, hidden_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training
for epoch in range(epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.flatten(), targets.float())
        loss.backward()
        optimizer.step()

# Evaluation
def evaluate_model(model, test_loader):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs)
            y_true.extend(targets.numpy())
            y_pred.extend(outputs.flatten().numpy())
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    return r2, mse, mae

r2, mse, mae = evaluate_model(model, test_loader)
print("R2 Score:", r2)
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)

R2 Score: 0.012013378801371877
Mean Squared Error (MSE): 8.866586663374566
Mean Absolute Error (MAE): 2.6641339737793497


In [51]:
epochs = 10
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        output, _ = self.lstm(embedded)
        output = self.dropout(output)
        output = self.fc(output[:, -1, :])
        return output

# Initialize model, loss function, and optimizer
model = LSTMModel(input_size, hidden_size, num_layers=2, dropout=0.5)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training
for epoch in range(epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.flatten(), targets.float())
        loss.backward()
        optimizer.step()

# Evaluation
r2, mse, mae = evaluate_model(model, test_loader)
print("R2 Score:", r2)
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)


R2 Score: 0.03337051827001869
Mean Squared Error (MSE): 8.674919161085116
Mean Absolute Error (MAE): 2.640769938764901


In [52]:
# Define your Bidirectional GRU model
class BiGRUModel(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(BiGRUModel, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # Multiply by 2 for bidirection

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.gru(embedded)
        output = self.fc(torch.cat((output[:, -1, :hidden_size], output[:, 0, hidden_size:]), dim=1))  # Concatenate the last hidden state from forward and backward pass
        return output

# Initialize model, loss function, and optimizer
model = BiGRUModel(input_size, hidden_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training
for epoch in range(epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.flatten(), targets.float())
        loss.backward()
        optimizer.step()

# Evaluation
r2, mse, mae = evaluate_model(model, test_loader)
print("R2 Score:", r2)
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)

R2 Score: -0.054461934300437775
Mean Squared Error (MSE): 9.463162681658176
Mean Absolute Error (MAE): 2.732141784141804


### Step 4 : Conclusion

Analysis
- **R2 Score** Measures how well the predictions approximate the actual values. Higher is better. The LSTM model performed slightly better than the RNN model, while the BiGRU model had a negative R2 score, indicating poor performance.
- The **MSE** Measures the average squared difference between predictions and actual values. Lower is better. The LSTM model had the lowest MSE, indicating it was the most accurate in terms of average squared error.
- **MAE** Measures the average absolute difference between predictions and actual values. Lower is better. The LSTM model also had the lowest MAE, making it the best at minimizing absolute error.

Conclusion

- **Best Model** The LSTM model showed the best performance across most metrics (R2 Score, MSE, and MAE).
iction.