In [101]:
import sqlite3
import pandas as pd

def load_table(db_path: str, table_name: str) -> pd.DataFrame:
    """Read every row of one SQLite table into a DataFrame.

    Args:
        db_path: Path of the SQLite database file to open.
        table_name: Name of the table to select from. It is interpolated
            verbatim into the query (identifiers cannot be bound as SQL
            parameters), so only pass trusted table names.

    Returns:
        A DataFrame holding the result of ``select * from <table_name>``.

    Raises:
        Whatever ``pd.read_sql_query`` raises for a failed query (for example
        when the table does not exist); the connection is closed first.
    """
    connection = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(f'select * from {table_name}', connection)
    finally:
        # Close on every path so a failed query does not leak the connection.
        connection.close()

In [102]:
# Load the 'corpus' table from the app's corpus database.
corpus_df = load_table(
    db_path='../app/databases/corpus.db',
    table_name='corpus',
)
corpus_df.head()

Unnamed: 0,surah,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,pos3,pos4,pos5,count,root_ar,lemma,verb_type,verf_form
0,1,1,1,بِ,سْمِ,,,,P,N,,,,2,سمو,اسْم,,
1,1,1,2,ٱللَّهِ,,,,,PN,,,,,1,اله,اللَّه,,
2,1,1,3,ٱل,رَّحْمَٰنِ,,,,DET,N,,,,2,رحم,رَحْمٰن,,
3,1,1,4,ٱل,رَّحِيمِ,,,,DET,N,,,,2,رحم,رَحِيم,,
4,1,2,1,ٱلْ,حَمْدُ,,,,DET,N,,,,2,حمد,حَمْد,,


In [103]:
# Load the 'allwords' table from the app's words database.
words_df = load_table(
    db_path='../app/databases/words.db',
    table_name='allwords',
)
words_df.head()

Unnamed: 0,sura,ayah,word,bn,in,en
0,1,1,1,নামে,dengan nama,In (the) name
1,1,1,2,আল্লাহ (র),Allah,"(of) Allah,"
2,1,1,3,পরম করুণাময়,Maha Pengasih,"the Most Gracious,"
3,1,1,4,অসীম দয়ালু,Maha Penyayang,the Most Merciful.
4,1,2,1,সকল প্রশংসা,pujian,All praises and thanks


In [104]:
# Print every corpus column next to its 0-based position.
for seq in range(len(corpus_df.columns)):
    column = corpus_df.columns[seq]
    print(f'{seq:^2}:{column}')

0 :surah
1 :ayah
2 :word
3 :ar1
4 :ar2
5 :ar3
6 :ar4
7 :ar5
8 :pos1
9 :pos2
10:pos3
11:pos4
12:pos5
13:count
14:root_ar
15:lemma
16:verb_type
17:verf_form


In [105]:
# Old corpus column name -> name used from here on.
column_mapper = dict(
    surah="sura",
    verf_form="verb_form",
    root_ar="root",
)

# Columns that are not keys of the mapping keep their current names.
corpus_df.rename(columns=column_mapper, inplace=True)
corpus_df.head()

Unnamed: 0,sura,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,pos3,pos4,pos5,count,root,lemma,verb_type,verb_form
0,1,1,1,بِ,سْمِ,,,,P,N,,,,2,سمو,اسْم,,
1,1,1,2,ٱللَّهِ,,,,,PN,,,,,1,اله,اللَّه,,
2,1,1,3,ٱل,رَّحْمَٰنِ,,,,DET,N,,,,2,رحم,رَحْمٰن,,
3,1,1,4,ٱل,رَّحِيمِ,,,,DET,N,,,,2,رحم,رَحِيم,,
4,1,2,1,ٱلْ,حَمْدُ,,,,DET,N,,,,2,حمد,حَمْد,,


In [106]:
# Outer-join the corpus rows with the word translations on the word's position.
word_position_keys = ["sura", "ayah", "word"]
merged_corpus_df = pd.merge(corpus_df, words_df, on=word_position_keys, how="outer")
merged_corpus_df.head()

Unnamed: 0,sura,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,...,pos4,pos5,count,root,lemma,verb_type,verb_form,bn,in,en
0,1,1,1,بِ,سْمِ,,,,P,N,...,,,2,سمو,اسْم,,,নামে,dengan nama,In (the) name
1,1,1,2,ٱللَّهِ,,,,,PN,,...,,,1,اله,اللَّه,,,আল্লাহ (র),Allah,"(of) Allah,"
2,1,1,3,ٱل,رَّحْمَٰنِ,,,,DET,N,...,,,2,رحم,رَحْمٰن,,,পরম করুণাময়,Maha Pengasih,"the Most Gracious,"
3,1,1,4,ٱل,رَّحِيمِ,,,,DET,N,...,,,2,رحم,رَحِيم,,,অসীম দয়ালু,Maha Penyayang,the Most Merciful.
4,1,2,1,ٱلْ,حَمْدُ,,,,DET,N,...,,,2,حمد,حَمْد,,,সকল প্রশংসা,pujian,All praises and thanks


In [107]:
# Remove the 'bn' and 'in' translation columns; 'en' stays.
merged_corpus_df.drop(columns=["bn", "in"], inplace=True)

In [108]:
# integrity check
# The outer merge leaves NaN in 'en' for corpus rows without a translation and
# NaN in 'ar1' for translation rows without a corpus entry; both must be absent.

# Count missing rows directly (``.size`` would report rows x columns instead).
missing_word_translation = int(merged_corpus_df['en'].isna().sum())
missing_corpus = int(merged_corpus_df['ar1'].isna().sum())
assert missing_word_translation == 0, (
    f"{missing_word_translation} merged rows have no 'en' translation"
)
assert missing_corpus == 0, (
    f"{missing_corpus} merged rows have no corpus entry ('ar1' is missing)"
)


In [109]:
# Largest 'word' value within each (sura, ayah), repeated on every row of that ayah.
ayah_keys = ['sura', 'ayah']
word_positions_by_ayah = merged_corpus_df.groupby(ayah_keys)['word']
max_words_in_ayah = word_positions_by_ayah.transform('max')

merged_corpus_df['max_words_in_ayah'] = max_words_in_ayah
merged_corpus_df.head()

Unnamed: 0,sura,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,pos3,pos4,pos5,count,root,lemma,verb_type,verb_form,en,max_words_in_ayah
0,1,1,1,بِ,سْمِ,,,,P,N,,,,2,سمو,اسْم,,,In (the) name,4
1,1,1,2,ٱللَّهِ,,,,,PN,,,,,1,اله,اللَّه,,,"(of) Allah,",4
2,1,1,3,ٱل,رَّحْمَٰنِ,,,,DET,N,,,,2,رحم,رَحْمٰن,,,"the Most Gracious,",4
3,1,1,4,ٱل,رَّحِيمِ,,,,DET,N,,,,2,رحم,رَحِيم,,,the Most Merciful.,4
4,1,2,1,ٱلْ,حَمْدُ,,,,DET,N,,,,2,حمد,حَمْد,,,All praises and thanks,4


In [110]:
from dataclasses import dataclass

@dataclass
class AyahInfo:
    """Reference to a single verse: a sura number plus an ayah number."""
    # Sura number, compared against the DataFrame's 'sura' column.
    sura: int
    # Ayah number within that sura, compared against the 'ayah' column.
    ayah: int

# Reading schedule: one (first_verse, last_verse) pair per taraweeh night.
# Both endpoints are inclusive, and the pairs are listed in ascending,
# non-overlapping order. search_day_index binary-searches this list, so that
# ordering must be preserved when editing it. The 0-based list position is the
# day index; the "taraweeh_night" column stores position + 1.
verses_by_ramadan_days = [
    (AyahInfo(sura=1, ayah=1), AyahInfo(sura=2, ayah=203)),
    (AyahInfo(sura=2, ayah=204), AyahInfo(sura=3, ayah=91)),
    (AyahInfo(sura=3, ayah=92), AyahInfo(sura=4, ayah=87)),
    (AyahInfo(sura=4, ayah=88), AyahInfo(sura=5, ayah=82)),
    (AyahInfo(sura=5, ayah=83), AyahInfo(sura=7, ayah=11)),
    (AyahInfo(sura=7, ayah=12), AyahInfo(sura=8, ayah=40)),
    (AyahInfo(sura=8, ayah=41), AyahInfo(sura=9, ayah=93)),
    (AyahInfo(sura=9, ayah=94), AyahInfo(sura=11, ayah=5)),
    (AyahInfo(sura=11, ayah=6), AyahInfo(sura=12, ayah=52)),
    (AyahInfo(sura=12, ayah=53), AyahInfo(sura=14, ayah=52)),
    (AyahInfo(sura=15, ayah=1), AyahInfo(sura=16, ayah=128)),
    (AyahInfo(sura=17, ayah=1), AyahInfo(sura=18, ayah=74)),
    (AyahInfo(sura=18, ayah=75), AyahInfo(sura=20, ayah=135)),
    (AyahInfo(sura=21, ayah=1), AyahInfo(sura=22, ayah=78)),
    (AyahInfo(sura=23, ayah=1), AyahInfo(sura=25, ayah=20)),
    (AyahInfo(sura=25, ayah=21), AyahInfo(sura=27, ayah=59)),
    (AyahInfo(sura=27, ayah=60), AyahInfo(sura=29, ayah=44)),
    (AyahInfo(sura=29, ayah=45), AyahInfo(sura=33, ayah=30)),
    (AyahInfo(sura=33, ayah=31), AyahInfo(sura=36, ayah=21)),
    (AyahInfo(sura=36, ayah=22), AyahInfo(sura=39, ayah=31)),
    (AyahInfo(sura=39, ayah=32), AyahInfo(sura=41, ayah=46)),
    (AyahInfo(sura=41, ayah=47), AyahInfo(sura=45, ayah=37)),
    (AyahInfo(sura=46, ayah=1), AyahInfo(sura=51, ayah=30)),
    (AyahInfo(sura=51, ayah=31), AyahInfo(sura=57, ayah=29)),
    (AyahInfo(sura=58, ayah=1), AyahInfo(sura=66, ayah=12)),
    (AyahInfo(sura=67, ayah=1), AyahInfo(sura=77, ayah=50)),
    (AyahInfo(sura=78, ayah=1), AyahInfo(sura=114, ayah=6)),
]

In [111]:
"""
set the taraweeh_night column on the df
"""


def _get_comparision(sura, ayah, day_index):
    start_verse, end_verse = verses_by_ramadan_days[day_index]
    if (
        (start_verse.sura == sura and start_verse.ayah <= ayah)
        or (end_verse.sura == sura and end_verse.ayah >= ayah)
        or (start_verse.sura < sura < end_verse.sura)
    ):
        return 0

    if sura > end_verse.sura or (sura == end_verse.sura and ayah > end_verse.ayah):
        return 1
    return -1


def search_day_index(sura, ayah):
    """Binary-search ``verses_by_ramadan_days`` for the night containing a verse.

    Args:
        sura: Sura number of the verse.
        ayah: Ayah number of the verse within that sura.

    Returns:
        The 0-based index of the schedule entry whose range contains the verse.

    Raises:
        ValueError: If no schedule entry contains the verse.
    """
    start = 0
    end = len(verses_by_ramadan_days)

    # Half-open search window [start, end) over the schedule.
    while start < end:
        day_index = (start + end) // 2
        comparision = _get_comparision(sura, ayah, day_index)
        if comparision == 0:
            return day_index

        if comparision > 0:
            # Verse lies after this night's range: continue in the upper half.
            start = day_index + 1
        else:
            end = day_index

    # Raise instead of returning the exception object, so a failed lookup
    # surfaces here rather than as a TypeError in the caller's arithmetic.
    raise ValueError(f"Not found: sura {sura}, ayah {ayah}")


"""
test
"""
# Both endpoints of every night's range must resolve to that night's index.
for expected_index, (start_verse, end_verse) in enumerate(verses_by_ramadan_days):
    for verse in (start_verse, end_verse):
        found_index = search_day_index(verse.sura, verse.ayah)
        assert found_index == expected_index, (
            f"{verse} resolved to day index {found_index!r}, expected {expected_index}"
        )


def _taraweeh_night_for_row(row):
    """Return the 1-based taraweeh night for the row's (sura, ayah)."""
    return search_day_index(row["sura"], row["ayah"]) + 1


merged_corpus_df["taraweeh_night"] = merged_corpus_df.apply(_taraweeh_night_for_row, axis=1)
merged_corpus_df.head()

Unnamed: 0,sura,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,...,pos4,pos5,count,root,lemma,verb_type,verb_form,en,max_words_in_ayah,taraweeh_night
0,1,1,1,بِ,سْمِ,,,,P,N,...,,,2,سمو,اسْم,,,In (the) name,4,1
1,1,1,2,ٱللَّهِ,,,,,PN,,...,,,1,اله,اللَّه,,,"(of) Allah,",4,1
2,1,1,3,ٱل,رَّحْمَٰنِ,,,,DET,N,...,,,2,رحم,رَحْمٰن,,,"the Most Gracious,",4,1
3,1,1,4,ٱل,رَّحِيمِ,,,,DET,N,...,,,2,رحم,رَحِيم,,,the Most Merciful.,4,1
4,1,2,1,ٱلْ,حَمْدُ,,,,DET,N,...,,,2,حمد,حَمْد,,,All praises and thanks,4,1


In [112]:
"""
set frequency column
"""

# NOTE: 'root_ar' was renamed to 'root' in an earlier cell; the night-specific count is the one grouped by 'taraweeh_night'.
# merged_corpus_df['lemma_root_frequency'] = merged_corpus_df.groupby(['root', 'lemma']).transform('size')
# merged_corpus_df['lemma_root_frequency_by_night'] = merged_corpus_df.groupby(['taraweeh_night', 'root', 'lemma']).transform('size')
# merged_corpus_df.head()

'\nset frequency column\n'

In [113]:
"""
Export to db
"""

# Output database file and the table that receives the merged word-level data.
corpus_with_word_meaning_table_name = "corpus_with_word_meaning"
quran_words_db_path = "quran_words.db"

# This connection stays open: later cells write more tables through it.
quran_words_db_connection = sqlite3.connect(quran_words_db_path)
merged_corpus_df.to_sql(
    name=corpus_with_word_meaning_table_name,
    con=quran_words_db_connection,
    index=False,
    if_exists='replace',
)

# Read the table back through a separate connection to inspect what was written.
df = load_table(quran_words_db_path, corpus_with_word_meaning_table_name)
df.head()

Unnamed: 0,sura,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,...,pos4,pos5,count,root,lemma,verb_type,verb_form,en,max_words_in_ayah,taraweeh_night
0,1,1,1,بِ,سْمِ,,,,P,N,...,,,2,سمو,اسْم,,,In (the) name,4,1
1,1,1,2,ٱللَّهِ,,,,,PN,,...,,,1,اله,اللَّه,,,"(of) Allah,",4,1
2,1,1,3,ٱل,رَّحْمَٰنِ,,,,DET,N,...,,,2,رحم,رَحْمٰن,,,"the Most Gracious,",4,1
3,1,1,4,ٱل,رَّحِيمِ,,,,DET,N,...,,,2,رحم,رَحِيم,,,the Most Merciful.,4,1
4,1,2,1,ٱلْ,حَمْدُ,,,,DET,N,...,,,2,حمد,حَمْد,,,All praises and thanks,4,1


In [114]:
"""
add verses arabic to db
"""
verses_arabic_table_name = "verses_arabic"
verses_arabic_df = load_table('../app/databases/quran_arabic.db', 'verses')

# Only the verse key and its text are exported.
verses_arabic_export = verses_arabic_df[["sura", "ayah", "text"]]
verses_arabic_export.to_sql(
    name=verses_arabic_table_name,
    con=quran_words_db_connection,
    index=False,
    if_exists='replace',
)

# Read the table back to inspect what was written.
df = load_table(quran_words_db_path, verses_arabic_table_name)
df.head()

Unnamed: 0,sura,ayah,text
0,1,1,بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
1,1,2,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ
2,1,3,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
3,1,4,مَٰلِكِ يَوْمِ ٱلدِّينِ
4,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ


In [115]:
"""
add verses english to db
"""
verses_english_table_name = "verses_english"
verses_english_df = load_table('../app/databases/quran_english.db', 'verses')

# Only the verse key and its text are exported.
verses_english_export = verses_english_df[["sura", "ayah", "text"]]
verses_english_export.to_sql(
    name=verses_english_table_name,
    con=quran_words_db_connection,
    index=False,
    if_exists='replace',
)

# Read the table back to inspect what was written.
df = load_table(quran_words_db_path, verses_english_table_name)
df.head()

Unnamed: 0,sura,ayah,text
0,1,1,"In the name of Allah, the Entirely Merciful, t..."
1,1,2,"[All] praise is [due] to Allah, Lord of the wo..."
2,1,3,"The Entirely Merciful, the Especially Merciful,"
3,1,4,Sovereign of the Day of Recompense.
4,1,5,It is You we worship and You we ask for help.
