In [227]:
import os
import re
import string
import requests
import pandas as pd
import qalsadi.lemmatizer
from bs4 import BeautifulSoup
import pyarabic.araby as araby
from nltk.corpus import stopwords
import arabicstopwords.arabicstopwords as stp

In [188]:
def get_surah_names(url="https://surahquran.com/quran-search/quran.html", timeout=30):
    """Scrape the ordered list of surah names from an index page.

    The names are taken from the text of every ``<a>`` tag inside the second
    ``<table>`` of the page and numbered from 1 in document order.

    Args:
        url: Page to scrape. Defaults to the surahquran.com index page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        list[tuple[int, str]]: ``(position, link text)`` pairs, with the
        position starting at 1.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status (raised by ``raise_for_status``).
        IndexError: If the page contains fewer than two tables.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    names_table = soup.find_all('table')[1]
    return [
        (position, link.text)
        for position, link in enumerate(names_table.find_all("a"), start=1)
    ]

In [235]:
LANGUAGE = "AR"
file = 'arabic.csv' if LANGUAGE == "AR" else 'french.csv'

# The corpus is a header-less, pipe-separated file under ./data, resolved
# against the current working directory.
book = pd.read_csv(
    os.path.join(os.getcwd(), 'data', file),
    sep="|",
    header=None,
)

book.columns = ["surah", "ayat", "text"]

book['surah'] = book['surah'].astype(int)
book['ayat'] = book['ayat'].astype(int)
book['text'] = book['text'].apply(araby.strip_diacritics)

# surah number -> (number, name) tuple, exactly as returned by get_surah_names().
mapping_dictionary = {t[0]: t for t in get_surah_names()}
# Map straight to the name rather than to the tuple: an unmapped surah number
# then shows up as NaN and is reported by the check below, instead of failing
# with "'float' object is not subscriptable".
_surah_name_by_number = {number: name for number, name in mapping_dictionary.values()}
book['surah_name'] = book['surah'].map(_surah_name_by_number)

_unnamed_surahs = book.loc[book['surah_name'].isna(), 'surah'].unique()
assert len(_unnamed_surahs) == 0, (
    f"no scraped name for surah number(s): {sorted(_unnamed_surahs)}"
)

# Dump the diacritic-free text, one row per line, without index or header.
# header=False is the documented way to suppress the header (bool or list of str).
book['text'].to_csv(
    os.path.join(os.getcwd(), 'output.txt'),
    index=False,
    sep="\t",
    header=False,
)

<p>
    Testing the TheFuzz package: score every verse against a search phrase using fuzzy partial matching.
</p>

In [201]:
from thefuzz import fuzz

# Phrase to look for; every row of `text` is scored against it.
search_text = "صراط الذين"

# fuzz.partial_ratio is evaluated once per row and stored in a new `score`
# column; `book` is rebound to the extended frame.
verse_scores = book['text'].apply(
    lambda verse: fuzz.partial_ratio(verse, search_text)
)
book = book.assign(score=verse_scores)

<h2>Text cleaning</h2>

In [236]:
def normalize_chars(txt):
    """Collapse Arabic letter variants onto a single canonical letter.

    Three substitutions are applied in order:
      * every alef form in the first character class becomes a bare alef,
      * alef maqsura becomes yeh,
      * teh marbuta becomes heh.

    Args:
        txt: Text to normalise.

    Returns:
        str: ``txt`` with the substitutions applied; all other characters
        are left unchanged.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ة", "ه"),
    )
    for pattern, canonical in substitutions:
        txt = re.sub(pattern, canonical, txt)
    return txt

# Union of the arabicstopwords and NLTK Arabic stop-word lists, each word passed
# through normalize_chars so it can be compared with normalised text.
# Kept as a set: words that collapse to the same form after normalisation are
# de-duplicated, and `token in stopwordlist` is a hash lookup, not a list scan.
stopwordlist = {
    normalize_chars(word)
    for word in list(stp.stopwords_list()) + stopwords.words('arabic')
}

In [237]:
lemmer = qalsadi.lemmatizer.Lemmatizer()

# Translation table that deletes ASCII punctuation (string.punctuation);
# built once here instead of once per token.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_txt(txt):
    """Build a search-friendly form of ``txt``: cleaned tokens plus their lemmas.

    Steps:
      1. strip diacritics and tatweel, then normalise letter variants with
         ``normalize_chars``;
      2. split on whitespace and delete ASCII punctuation from every token;
      3. drop empty tokens and tokens found in ``stopwordlist``;
      4. pass each remaining token through ``lemmer.lemmatize``.

    Punctuation is removed *before* the stop-word test so that a stop word
    with attached punctuation is still filtered out.

    Args:
        txt: Raw text of one verse.

    Returns:
        str: The cleaned tokens joined by single spaces, one space, then the
        lemmatised tokens joined by single spaces.
    """
    txt = araby.strip_diacritics(txt)
    txt = araby.strip_tatweel(txt)
    txt = normalize_chars(txt)

    # split() without an argument collapses runs of whitespace, so no empty
    # tokens are produced by double spaces or leading/trailing blanks.
    stripped_tokens = (token.translate(_PUNCTUATION_TABLE) for token in txt.split())
    tokens = [
        token for token in stripped_tokens
        # `token` is falsy when it consisted only of punctuation.
        if token and token not in stopwordlist
    ]

    txt = ' '.join(tokens)
    txt_lemmatized = ' '.join(lemmer.lemmatize(token) for token in tokens)
    return f"{txt} {txt_lemmatized}"

In [238]:
# Store the output of clean_txt for every row of `text` in a new column.
book['clean_txt'] = book['text'].apply(clean_txt)

In [240]:
# Preview three random rows; the fixed seed keeps the displayed sample
# identical across Restart & Run All.
book.sample(n=3, random_state=0)

Unnamed: 0,surah,ayat,text,surah_name,clean_txt
5002,56,24,جزاء بما كانوا يعملون,الواقعة,جزاء كانوا يعملون جزاء كانو عمل
5915,85,7,وهم على ما يفعلون بالمؤمنين شهود,البروج,بالمؤمنين شهود مؤمن شهود
401,3,109,ولله ما في السماوات وما في الأرض وإلى الله تر...,آل عمران,ولله السماوات الارض الله ترجع الامور له سماوا...
