<a href="https://colab.research.google.com/github/doaa-sala7/Arabic_question_Answering/blob/main/Arabic_QA_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install  arabic-reshaper
! pip install python-bidi


In [3]:
from huggingface_hub import login
login()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import ISRIStemmer
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import reuters
nltk.download('reuters')



from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer, PunktTrainer, PunktToken

import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import arabic_reshaper
from bidi.algorithm import get_display


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package reuters to /root/nltk_data...


In [5]:
from datasets import load_dataset

dataset = load_dataset("arcd")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.53k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/174k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/192k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/693 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/702 [00:00<?, ? examples/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 693
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 702
    })
})

In [None]:


row_data = [context for context in dataset['train']['context']]
data = "".join(row_data)

In [7]:

stop_words = set(stopwords.words('arabic'))
stemmer = ISRIStemmer()

def preprocess_arabic_text(doc):
    processed_text = []
    for text in doc:
        # Remove non-Arabic characters and symbols
        text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\s]', '', text)

        # Tokenize the text
        words = nltk.word_tokenize(text)

        # Remove stop words
        filtered_words = [word for word in words if word not in stop_words]

        # stem each word
        stemmed_words = [stemmer.stem(word) for word in filtered_words]

        # Join the stemmed words back into a single string
        processed_text.append( ' '.join(stemmed_words))

    return processed_text



In [9]:
# Define your custom Arabic punctuation
custom_arabic_punctuation = ['!', '؛', '؟','،']

class CustomArabicLanguageVars(PunktLanguageVars):
    sent_end_chars = PunktLanguageVars.sent_end_chars + tuple(custom_arabic_punctuation)



class LinkAwarePunktToken(PunktToken):
    def is_non_breaking(self):
        return super().is_non_breaking() or self.type == PunktToken.INTERNAL_PUNCT



# Extend the trainer to use the custom token class
class LinkAwareTrainer(PunktTrainer):
    def get_type(self, tok):
        if '.' in tok:
            return LinkAwarePunktToken.ABBREV

        return super().get_type(tok)

# Train a new sentence tokenizer with the custom trainer and language variables
trainer = LinkAwareTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(" ".join([" ".join(sent) for sent in nltk.corpus.reuters.sents()]))

# Create a PunktSentenceTokenizer with the custom token class
custom_arabic_tokenizer = PunktSentenceTokenizer(trainer.get_params(), lang_vars=CustomArabicLanguageVars())

def custom_arabic_sentence_tokenizer(text):
    sentences = custom_arabic_tokenizer.tokenize(text)
    return sentences





In [None]:

def preprocess_and_tokenizing(data):
    sentences_tokenzing =custom_arabic_sentence_tokenizer(data)
    preprocessed_sentences = preprocess_arabic_text( sentences_tokenzing)
    return sentences_tokenzing, preprocessed_sentences


In [10]:

def preprocess(text):
    """
    Preprocesses Arabic text by reshaping it and applying BiDi algorithm.
    """
    reshaped_text = arabic_reshaper.reshape(text)
    bidi_text = get_display(reshaped_text)
    return bidi_text

def most_similar_sentence(question, context ):
    """
    Finds the most similar sentence from a list of sentences to a given target sentence.
    """
    # Preprocess target sentence

    target_sentence = preprocess(question)
    #print(target_sentence)
    #target_sentence = preprocess_arabic_text(target_sentence)
    row_sentance, preprocessed_sentences = preprocess_and_tokenizing(context)

    # Preprocess list of sentences
    preprocessed_sentences = [preprocess(sentence) for sentence in preprocessed_sentences]

    # Initialize CountVectorizer
    vectorizer = CountVectorizer().fit(preprocessed_sentences)

    # Transform target sentence and list of sentences to vectors
    target_vector = vectorizer.transform([target_sentence])
    sentence_vectors = vectorizer.transform(preprocessed_sentences)

    # Compute cosine similarity between target sentence and list of sentences
    similarities = cosine_similarity(target_vector, sentence_vectors)

    # Find the index of the most similar sentence
    most_similar_index = np.argmax(similarities)

    # Return the most similar sentence
    return row_sentance[most_similar_index]



In [25]:
from sklearn.metrics import accuracy_score, f1_score

# Example function to evaluate the model
def evaluate_model( dataset):
    predictions = []
    ground_truth = []

    for example in dataset:
        context = example['context']
        question = example['question']
        ground_truth_answer = example['answers']["text"]

        # Generate prediction using your model
        predicted_answer = most_similar_sentence(question, context)

        # Append predictions and ground truth
        predictions.append(predicted_answer)
        ground_truth.append(ground_truth_answer)

    #print(ground_truth,predictions )
    # Calculate accuracy
    accuracy = accuracy_score(ground_truth, predictions)

    # Calculate F1 score
    f1 = f1_score(ground_truth, predictions, average='micro')

    # Calculate exact match
    exact_match = sum(1 for p, gt in zip(predictions, ground_truth) if p == gt) / len(ground_truth)

    return accuracy, f1, exact_match, {"ground_truth": ground_truth,"predictions": predictions }




In [26]:
from datasets import Dataset

accuracy, f1, exact_match , d= evaluate_model( Dataset.from_dict( dataset['validation'][:100]) )
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Exact Match:", exact_match)

Accuracy: 0.02
F1 Score: 0.02
Exact Match: 0.0
