In [224]:
import os
import re
import string
import requests
import pandas as pd
import qalsadi.lemmatizer
from bs4 import BeautifulSoup
import pyarabic.araby as araby
from nltk.corpus import stopwords
import arabicstopwords.arabicstopwords as stp

In [188]:
def get_surah_names(timeout=30):
    """Scrape the list of surah names from surahquran.com.

    The names are read from the anchor tags of the second ``<table>`` on the
    page and numbered from 1 in the order the anchors appear.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled connection.

    Returns:
        A list of ``(number, name)`` tuples, where ``number`` starts at 1 and
        ``name`` is the text of the corresponding anchor.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page contains fewer than two tables.
    """
    URL = "https://surahquran.com/quran-search/quran.html"
    page = requests.get(URL, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    # The surah index is the second table of the page.
    all_table = soup.find_all('table')[1]
    return [
        (number, anchor.text)
        for number, anchor in enumerate(all_table.find_all("a"), start=1)
    ]

In [189]:
# The language flag decides which file under ./data is loaded.
LANGUAGE = "AR"
if LANGUAGE == "AR":
    file = 'arabic.csv'
else:
    file = 'french.csv'

# The file is pipe-separated and has no header row, so the columns are named by hand.
book = pd.read_csv(f'{os.getcwd()}/data/{file}', sep="|", header=None)
book.columns = ["surah", "ayat", "text"]

# Cast both numbering columns to int and strip diacritics from the text.
book = book.astype({"surah": int, "ayat": int})
book['text'] = book['text'].apply(araby.strip_diacritics)

# Index every scraped (number, name) pair by its number, then keep only the
# name of the matching pair for each row.
mapping_dictionary = {pair[0]: pair for pair in get_surah_names()}
matched_pairs = book['surah'].map(mapping_dictionary)
book['surah_name'] = matched_pairs.apply(lambda pair: pair[1])

# Dump the text column alone, one entry per line, without index or header.
book['text'].to_csv(f'{os.getcwd()}/output.txt', index=False, sep="\t", header=None)

<p>
    Testing the TheFuzz package: each verse is scored against a search phrase using fuzzy partial matching.
</p>

In [201]:
from thefuzz import fuzz

# Phrase that every row of `book` is compared against.
search_text = "صراط الذين"

# Add a `score` column holding the partial-ratio similarity between each
# row's text and the search phrase.
book = book.assign(
    score=book['text'].apply(
        lambda verse: fuzz.partial_ratio(verse, search_text)
    )
)

<h2>Text cleaning</h2>

In [221]:
def normalize_chars(txt):
    """Unify Arabic letter variants so that equivalent spellings compare equal.

    Applies three substitutions in order: the alef variants are replaced by
    bare alef, alef maqsura by yeh, and teh marbuta by heh.

    Args:
        txt: Text to normalize.

    Returns:
        The text with the substitutions applied.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
    )
    for pattern, replacement in substitutions:
        txt = re.sub(pattern, replacement, txt)
    return txt

# Merge both stopword sources and run every entry through normalize_chars,
# the same normalization clean_txt applies to its input before tokenizing.
# Kept as a set: membership tests are O(1), and entries that become identical
# after normalization are stored only once.
stopwordlist = {
    normalize_chars(word)
    for word in set(stp.stopwords_list()).union(stopwords.words('arabic'))
}

In [223]:
def clean_txt(txt):
    """Clean Arabic text and return it together with its lemmatized form.

    Diacritics and tatweel are stripped, letter variants are unified with
    ``normalize_chars``, ASCII punctuation is removed from every token and
    tokens found in ``stopwordlist`` are dropped. The surviving tokens are
    then lemmatized one by one.

    Args:
        txt: Raw text to clean.

    Returns:
        One string made of the cleaned text, a single space, and the
        lemmatized cleaned text.
    """
    txt = araby.strip_diacritics(txt)
    txt = araby.strip_tatweel(txt)
    txt = normalize_chars(txt)

    # Built once per call rather than once per token. string.punctuation is
    # ASCII-only, so Arabic punctuation marks are not removed here.
    punctuation_table = str.maketrans('', '', string.punctuation)

    tokens = []
    # split() with no argument collapses runs of whitespace, so consecutive
    # spaces do not produce empty tokens.
    for raw_token in txt.split():
        token = raw_token.translate(punctuation_table)
        # Test against the stopwords only after punctuation is stripped, so a
        # stopword followed by a comma or period is still recognized. Tokens
        # that consisted purely of punctuation are now empty and are dropped.
        if token and token not in stopwordlist:
            tokens.append(token)

    txt = ' '.join(tokens)
    # NOTE(review): `lemmer` must exist at module level; it is not defined in
    # the cells visible here -- confirm it is created before this is called.
    txt_lemmatized = ' '.join(lemmer.lemmatize(token) for token in tokens)
    return txt+" "+txt_lemmatized

Unnamed: 0,surah,ayat,text,surah_name,score
0,1,1,بسم الله الرحمن الرحيم,الفاتحة,40
1,1,2,الحمد لله رب العالمين,الفاتحة,50
2,1,3,الرحمن الرحيم,الفاتحة,50
3,1,4,مالك يوم الدين,الفاتحة,50
4,1,5,إياك نعبد وإياك نستعين,الفاتحة,40
...,...,...,...,...,...
6231,114,2,ملك الناس,الناس,50
6232,114,3,إله الناس,الناس,50
6233,114,4,من شر الوسواس الخناس,الناس,50
6234,114,5,الذي يوسوس في صدور الناس,الناس,60
