In [242]:
import os
import re
import string

import numpy as np
import pandas as pd
import pyarabic.araby as araby
import qalsadi.lemmatizer
import requests
import arabicstopwords.arabicstopwords as stp
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [188]:
def get_surah_names() -> list:
    """Scrape the surah names from surahquran.com.

    Returns:
        A list of ``(number, name)`` tuples, numbered from 1 in the order the
        links appear inside the second ``<table>`` of the page.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: if the page contains fewer than two tables.
    """
    URL = "https://surahquran.com/quran-search/quran.html"
    # Without a timeout a stalled server would hang the notebook indefinitely.
    page = requests.get(URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    # NOTE(review): assumes the second table lists the surahs in canonical order — confirm.
    surah_table = soup.find_all('table')[1]
    return [
        (number, link.text)
        for number, link in enumerate(surah_table.find_all("a"), start=1)
    ]

In [235]:
# Pick the corpus file that matches the configured language.
LANGUAGE = "AR"
if LANGUAGE == "AR":
    file = 'arabic.csv'
else:
    file = 'french.csv'

# The CSV is pipe-delimited and has no header row; name the three columns here.
book = pd.read_csv(f'{os.getcwd()}/data/{file}', sep="|", header=None)
book.columns = ["surah", "ayat", "text"]

# Surah and ayah numbers as integers, verse text without diacritics.
book = book.astype({"surah": int, "ayat": int})
book['text'] = book['text'].apply(araby.strip_diacritics)

# Look up each surah's scraped (number, name) entry and keep only the name.
mapping_dictionary = {entry[0]: entry for entry in get_surah_names()}
book['surah_name'] = book['surah'].map(mapping_dictionary).apply(lambda entry: entry[1])

# Dump the diacritic-free verse text, one verse per line.
book['text'].to_csv(f'{os.getcwd()}/output.txt', index=False, sep="\t", header=None)

<p>
    Testing TheFuzz package
</p>

In [201]:
from thefuzz import fuzz

search_text = "صراط الذين"

# Score every verse with fuzz.partial_ratio against the search phrase.
book = book.assign(
    score=book['text'].apply(lambda verse: fuzz.partial_ratio(verse, search_text))
)

<h2>Text cleaning</h2>

In [236]:
def normalize_chars(txt) -> str:
    """Collapse Arabic spelling variants onto a single letter form.

    The alef variants in the first pattern are replaced by bare alef, alef
    maqsura by ya, and ta marbuta by ha. All other characters are untouched.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
    )
    for pattern, replacement in substitutions:
        txt = re.sub(pattern, replacement, txt)
    return txt

# Merge both stop-word sources and normalise their spelling with normalize_chars.
# Kept as a set (not a list): clean_txt tests every token for membership, and a
# set also collapses entries that become identical after normalisation.
stopwordlist = {
    normalize_chars(word)
    for word in (*stp.stopwords_list(), *stopwords.words('arabic'))
}

In [237]:
lemmer = qalsadi.lemmatizer.Lemmatizer()

# Built once rather than once per token. Covers ASCII punctuation plus the
# Arabic comma (U+060C), semicolon (U+061B) and question mark (U+061F), none of
# which are part of string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '\u060c\u061b\u061f')


def clean_txt(txt: str) -> str:
    """Normalise a piece of Arabic text for TF-IDF indexing or querying.

    Steps: strip diacritics and tatweel, normalise letter variants, remove
    punctuation, drop stop words and empty tokens, then lemmatise each
    remaining token.

    Args:
        txt: Raw text (a verse or a user query).

    Returns:
        The surviving tokens joined by spaces, followed by a space and the
        space-joined lemmas of those same tokens. For input with no surviving
        tokens this is a single space.
    """
    txt = araby.strip_diacritics(txt)
    txt = araby.strip_tatweel(txt)
    txt = normalize_chars(txt)
    # Punctuation is removed *before* the stop-word test so that a stop word
    # immediately followed by punctuation is still recognised and dropped.
    stripped = (token.translate(_PUNCTUATION_TABLE) for token in txt.split())
    # Skip tokens that were pure punctuation so the lemmatizer never sees ''.
    tokens = [token for token in stripped if token and token not in stopwordlist]
    txt = ' '.join(tokens)
    txt_lemmatized = ' '.join(lemmer.lemmatize(token) for token in tokens)
    return f"{txt} {txt_lemmatized}"

In [238]:
# Cleaned (and lemmatised) form of every verse, used as the TF-IDF corpus.
book['clean_txt'] = book['text'].apply(clean_txt)

In [248]:
# TF-IDF model over unigrams and bigrams of the cleaned verses.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
corpus = book["clean_txt"]
corpus_vectorized = vectorizer.fit_transform(corpus)

# Peek at the first 20 learned features.
vectorizer.get_feature_names_out()[:20]

array(['آتى', 'آتى أجزى', 'آتى اتوا', 'آتى اجرا', 'آتى اجرها',
       'آتى اجرهم', 'آتى اجوركم', 'آتى اجورهم', 'آتى احدا', 'آتى الاخره',
       'آتى الحكمه', 'آتى الزكاه', 'آتى الله', 'آتى اوتي', 'آتى اوتيتم',
       'آتى باذن', 'آتى تب', 'آتى حذر', 'آتى خير', 'آتى ردى'],
      dtype=object)

In [282]:
def show_best_results(
    df: pd.DataFrame,
    scores_array: list,
    top_n: int = 2
) -> None:
    """Print the ``top_n`` best-scoring rows of ``df`` that have a positive score.

    Args:
        df: Frame with ``surah``, ``ayat``, ``text`` and ``surah_name`` columns;
            row ``i`` (positional) corresponds to ``scores_array[i]``.
        scores_array: One score per row of ``df``. Any 1-D sequence is
            accepted (list, tuple, NumPy array).
        top_n: Maximum number of rows to consider, highest score first.

    Returns:
        None. For each selected row, a header line and the verse text are printed.
    """
    # Coerce so that plain lists work too; the annotation promises a list but
    # .argsort() only exists on NumPy arrays.
    scores = np.asarray(scores_array)
    sorted_indices = scores.argsort()[::-1]
    for idx in sorted_indices[:top_n]:
        score = scores[idx]
        # Rows with no overlap at all score 0 and are not worth showing.
        if score > 0:
            row = df.iloc[idx]
            surah = row["surah"]
            ayah_num = row["ayat"]
            surah_name = row["surah_name"]
            print(f"Surat nb: {surah}  | Ayat: {ayah_num} | Surah: {surah_name} | Score: {score}")
            print(row["text"])

In [283]:
def run_tfidf(query: str, top_n: int = 2) -> None:
    """Search the indexed verses for ``query`` and print the best matches.

    The query goes through the same ``clean_txt`` pipeline as the corpus, is
    projected into the fitted TF-IDF space, and is scored against every verse
    by dot product.

    Args:
        query: Free-text search string.
        top_n: Maximum number of matches to print (default 2, the previous
            fixed behaviour).

    Returns:
        None. Results are printed by ``show_best_results``.
    """
    query = clean_txt(query)
    query_vectorized = vectorizer.transform([query])
    # (1 x n_features) . (n_features x n_verses) -> one score per verse.
    scores = query_vectorized.dot(corpus_vectorized.transpose())
    scores_array = scores.toarray()[0]
    show_best_results(book, scores_array, top_n=top_n)

In [286]:
# Example search. NOTE(review): the query looks like a noisy transcription of a
# verse rather than an exact quote — presumably to exercise approximate matching; confirm.
q = "أين أول؟ لقد تنبى الله علم النبي والمهاجرين، والأورو صار الذين تبعوه الذين اتبعوه في ساعة العسرة بعد ما كاد يزيغ قلوب فريق منهم، ثم." 
run_tfidf(q)

Surat nb: 9  | Ayat: 117 | Surah: التوبة | Score: 0.6379697081810952
لقد تاب الله على النبي والمهاجرين والأنصار الذين اتبعوه في ساعة العسرة من بعد ما كاد يزيغ قلوب فريق منهم ثم تاب عليهم  إنه بهم رءوف رحيم
Surat nb: 3  | Ayat: 8 | Surah: آل عمران | Score: 0.11008765329491813
ربنا لا تزغ قلوبنا بعد إذ هديتنا وهب لنا من لدنك رحمة  إنك أنت الوهاب
