In [3]:
import sqlite3
import pandas as pd

def load_table(db_path: str, table_name: str):
    connection = sqlite3.connect(db_path)
    query = connection.execute(f'select * from {table_name}')

    columns = [col[0] for col in query.description]
    df = pd.DataFrame.from_records(data=query.fetchall(), columns=columns)

    connection.close()
    return df

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
corpus_df = load_table('../../app/databases/corpus.db', 'corpus')
corpus_df.head()

Unnamed: 0,surah,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,pos3,pos4,pos5,count,root_ar,lemma,verb_type,verf_form
0,1,1,1,بِ,سْمِ,,,,P,N,,,,2,سمو,اسْم,,
1,1,1,2,ٱللَّهِ,,,,,PN,,,,,1,اله,اللَّه,,
2,1,1,3,ٱل,رَّحْمَٰنِ,,,,DET,N,,,,2,رحم,رَحْمٰن,,
3,1,1,4,ٱل,رَّحِيمِ,,,,DET,N,,,,2,رحم,رَحِيم,,
4,1,2,1,ٱلْ,حَمْدُ,,,,DET,N,,,,2,حمد,حَمْد,,


In [5]:
words_df = load_table('../../app/databases/words.db', 'allwords')
words_df.head()

Unnamed: 0,sura,ayah,word,bn,in,en
0,1,1,1,নামে,dengan nama,In (the) name
1,1,1,2,আল্লাহ (র),Allah,"(of) Allah,"
2,1,1,3,পরম করুণাময়,Maha Pengasih,"the Most Gracious,"
3,1,1,4,অসীম দয়ালু,Maha Penyayang,the Most Merciful.
4,1,2,1,সকল প্রশংসা,pujian,All praises and thanks


In [6]:
words_80_percent_levels_df = load_table('../../app/databases/words80percent.db', 'levels')
words_80_percent_levels_df.head()

Unnamed: 0,level_num,level_title
0,1,"This, that"
1,2,"This, that (E-A)"
2,3,No and yes
3,4,No and yes (E-A)
4,5,Whose? pronouns endings


In [7]:
words_80_percent_words_df = load_table('../../app/databases/words80percent.db', 'words')
words_80_percent_words_df.head()

Unnamed: 0,level_num,serial_num,arabic,english
0,3,1,لَا إِلهَ,(There is) no god
1,5,1,هُ...,his
2,3,2,إِلَّا الله,except Allah
3,5,2,هَا...,"her, their (for broken plural)"
4,5,3,كَ...,your (male)


In [8]:
for seq, column in enumerate(corpus_df.columns):
    print(f'{seq:^2}:{column}')

0 :surah
1 :ayah
2 :word
3 :ar1
4 :ar2
5 :ar3
6 :ar4
7 :ar5
8 :pos1
9 :pos2
10:pos3
11:pos4
12:pos5
13:count
14:root_ar
15:lemma
16:verb_type
17:verf_form


- tokenize arabic segments with pos
- tokenize roots
- given a range:
    - Rank max appeared words
    - Rank max appeared segments
    - Rank max appeared roots
- Rank simpler sentences





In [9]:
from dataclasses import dataclass

@dataclass
class AyahInfo:
    sura: int
    ayah: int

verses_by_ramadan_days = [
    (AyahInfo(sura=1, ayah=1), AyahInfo(sura=2, ayah=203)),
    (AyahInfo(sura=2, ayah=204), AyahInfo(sura=3, ayah=91)),
    (AyahInfo(sura=3, ayah=92), AyahInfo(sura=4, ayah=87)),
    (AyahInfo(sura=4, ayah=88), AyahInfo(sura=5, ayah=82)),
    (AyahInfo(sura=5, ayah=83), AyahInfo(sura=7, ayah=11)),
    (AyahInfo(sura=7, ayah=12), AyahInfo(sura=8, ayah=40)),
    (AyahInfo(sura=8, ayah=41), AyahInfo(sura=9, ayah=93)),
    (AyahInfo(sura=9, ayah=94), AyahInfo(sura=11, ayah=5)),
    (AyahInfo(sura=11, ayah=6), AyahInfo(sura=12, ayah=52)),
    (AyahInfo(sura=12, ayah=53), AyahInfo(sura=14, ayah=52)),
    (AyahInfo(sura=15, ayah=1), AyahInfo(sura=16, ayah=128)),
    (AyahInfo(sura=17, ayah=1), AyahInfo(sura=18, ayah=74)),
    (AyahInfo(sura=18, ayah=75), AyahInfo(sura=20, ayah=135)),
    (AyahInfo(sura=21, ayah=1), AyahInfo(sura=22, ayah=78)),
    (AyahInfo(sura=23, ayah=1), AyahInfo(sura=25, ayah=20)),
    (AyahInfo(sura=25, ayah=21), AyahInfo(sura=27, ayah=59)),
    (AyahInfo(sura=27, ayah=60), AyahInfo(sura=29, ayah=44)),
    (AyahInfo(sura=29, ayah=45), AyahInfo(sura=33, ayah=30)),
    (AyahInfo(sura=33, ayah=31), AyahInfo(sura=36, ayah=21)),
    (AyahInfo(sura=36, ayah=22), AyahInfo(sura=39, ayah=31)),
    (AyahInfo(sura=39, ayah=32), AyahInfo(sura=41, ayah=46)),
    (AyahInfo(sura=41, ayah=47), AyahInfo(sura=45, ayah=37)),
    (AyahInfo(sura=46, ayah=1), AyahInfo(sura=51, ayah=30)),
    (AyahInfo(sura=51, ayah=31), AyahInfo(sura=57, ayah=29)),
    (AyahInfo(sura=58, ayah=1), AyahInfo(sura=66, ayah=12)),
    (AyahInfo(sura=67, ayah=1), AyahInfo(sura=77, ayah=50)),
    (AyahInfo(sura=78, ayah=1), AyahInfo(sura=114, ayah=6)),
]

In [10]:
def get_words_for_day(day: int, df: pd.DataFrame):
    day_start_verse, day_end_verse = verses_by_ramadan_days[day]
    filter_condition = (
        ((df["surah"] == day_start_verse.sura) & (df["ayah"] >= day_start_verse.ayah))
        | ((df["surah"] == day_end_verse.sura) & (df["ayah"] <= day_end_verse.ayah))
        | ((df["surah"] > day_start_verse.sura) & (df["surah"] < day_end_verse.sura))
    )

    return df[filter_condition]

In [11]:
words_for_day_df = get_words_for_day(0, corpus_df)
words_for_day_df.head()

Unnamed: 0,surah,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,pos3,pos4,pos5,count,root_ar,lemma,verb_type,verf_form
0,1,1,1,بِ,سْمِ,,,,P,N,,,,2,سمو,اسْم,,
1,1,1,2,ٱللَّهِ,,,,,PN,,,,,1,اله,اللَّه,,
2,1,1,3,ٱل,رَّحْمَٰنِ,,,,DET,N,,,,2,رحم,رَحْمٰن,,
3,1,1,4,ٱل,رَّحِيمِ,,,,DET,N,,,,2,رحم,رَحِيم,,
4,1,2,1,ٱلْ,حَمْدُ,,,,DET,N,,,,2,حمد,حَمْد,,


In [27]:
metadata = {
    "total_words": words_for_day_df.shape[0],
    "total_ayah": words_for_day_df.groupby(["surah", "ayah"]).size().count(),
    "unique_words": words_for_day_df.groupby(["ar1", "ar2", "ar3", "ar4", "ar5"], dropna=False)
    .size()
    .count(),
    "uinque_roots": words_for_day_df.groupby(["root_ar"]).size().count(),
    "unique_lemma": words_for_day_df.groupby(["lemma"]).size().count(),
}

print(metadata)

{'total_words': 3837, 'total_ayah': 210, 'unique_words': 1884, 'uinque_roots': 460, 'unique_lemma': 817}


In [13]:
root_value_counts = words_for_day_df['root_ar'].value_counts()
for name, frequency in root_value_counts.items():
    print(name, frequency)

 1413
اله 151
قول 101
كون 64
علم 61
امن 51
ربب 34
كفر 34
كتب 31
اتي 30
هدي 29
انس 27
بين 24
وقي 24
نزل 23
نفس 20
ايي 20
سمو 20
ظلم 19
شيا 19
قبل 19
بعد 19
رحم 19
قوم 18
عذب 18
قتل 17
ارض 17
يوم 17
تبع 16
اخذ 16
عدو 16
كلل 15
ولي 15
اخر 15
حقق 15
عمل 15
عند 15
خير 14
ذكر 13
فرق 13
خرج 13
موت 13
بني 12
نصر 12
مثل 12
جعل 12
عبد 12
نور 11
عهد 11
حجج 11
توب 11
حرم 11
رسل 11
سمع 11
صدق 11
سجد 11
حيي 11
ملك 10
حسن 10
اكل 10
بعض 10
وجه 10
حيث 9
دعو 9
شري 9
قلب 9
صلو 9
سال 9
رزق 9
سلم 9
نعم 8
راي 8
امر 8
اثم 8
جيا 8
كسب 7
دنو 7
كتم 7
شهد 7
عدد 7
ضلل 7
تلو 7
نبا 7
زكو 7
صوم 7
عقل 7
هود 7
شدد 7
صلح 7
صبر 7
غفر 7
قلل 6
امم 6
حكم 6
بشر 6
فعل 6
خلف 6
بيت 6
باس 6
عرف 6
قرب 6
لعن 6
شكر 6
يدي 6
بصر 6
حبب 6
خوف 6
فضل 6
ضرر 5
جمع 5
ادم 5
بقر 5
وثق 5
سبل 5
احد 5
كبر 5
ليس 5
عجل 5
سكن 5
عمر 5
غفل 5
نظر 5
كلم 5
شهر 5
شطر 5
برر 5
برا 5
بدل 5
فسد 5
شعر 5
غير 5
خلق 5
شطن 5
مرض 5
رجع 5
لقي 5
خلد 5
شرب 4
دخل 4
ليل 4
عفو 4
جزي 4
صحب 4
قدر 4
خلو 4
فسق 4
عون 4
سفه 4
هبط 4
الم 4
مني 4
مول 4
عظم 4
سوي 4
بغي 4
ابو 4
ا

In [14]:
max_words_in_ayah = words_for_day_df.groupby(['surah', 'ayah'])['word'].transform('max')
words_for_day_df['max_words'] = max_words_in_ayah
words_for_day_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words_for_day_df['max_words'] = max_words_in_ayah


Unnamed: 0,surah,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,pos3,pos4,pos5,count,root_ar,lemma,verb_type,verf_form,max_words
0,1,1,1,بِ,سْمِ,,,,P,N,,,,2,سمو,اسْم,,,4
1,1,1,2,ٱللَّهِ,,,,,PN,,,,,1,اله,اللَّه,,,4
2,1,1,3,ٱل,رَّحْمَٰنِ,,,,DET,N,,,,2,رحم,رَحْمٰن,,,4
3,1,1,4,ٱل,رَّحِيمِ,,,,DET,N,,,,2,رحم,رَحِيم,,,4
4,1,2,1,ٱلْ,حَمْدُ,,,,DET,N,,,,2,حمد,حَمْد,,,4


In [15]:
root_frequency = words_for_day_df['root_ar'].value_counts()
words_for_day_df['root_frequency'] = words_for_day_df['root_ar'].map(root_frequency)
words_for_day_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  words_for_day_df['root_frequency'] = words_for_day_df['root_ar'].map(root_frequency)


Unnamed: 0,surah,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,pos3,pos4,pos5,count,root_ar,lemma,verb_type,verf_form,max_words,root_frequency
0,1,1,1,بِ,سْمِ,,,,P,N,,,,2,سمو,اسْم,,,4,20
1,1,1,2,ٱللَّهِ,,,,,PN,,,,,1,اله,اللَّه,,,4,151
2,1,1,3,ٱل,رَّحْمَٰنِ,,,,DET,N,,,,2,رحم,رَحْمٰن,,,4,19
3,1,1,4,ٱل,رَّحِيمِ,,,,DET,N,,,,2,رحم,رَحِيم,,,4,19
4,1,2,1,ٱلْ,حَمْدُ,,,,DET,N,,,,2,حمد,حَمْد,,,4,2


In [16]:
words_for_day_df.loc[words_for_day_df.groupby(['root_ar', 'lemma'])['max_words'].idxmin()].sort_values(['root_frequency', 'root_ar'], ascending=False)

Unnamed: 0,surah,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,pos3,pos4,pos5,count,root_ar,lemma,verb_type,verf_form,max_words,root_frequency
132,2,12,1,أَلَآ,,,,,ATT,,,,,1,,أَلا,,,7,1413
71,2,6,7,أَمْ,,,,,CONJ,,,,,1,,أَم,,,11,1413
371,2,26,12,فَ,أَمَّا,,,,REM,EXL,,,,2,,أَمّا,,,39,1413
3034,2,169,5,وَ,أَن,,,,CONJ,SUB,,,,2,,أَن,,,11,1413
682,2,46,3,أَنَّ,هُم,,,,ACC,PRON,,,,2,,أَنّ,,,8,1413
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3649,2,196,24,أَذًى,,,,,N,,,,,1,اذي,أَذًى,,,73,1
3247,2,178,23,وَ,أَدَآءٌ,,,,CONJ,N,,,,2,ادي,أَداء,,1,37,1
3243,2,178,19,أَخِي,هِ,,,,N,PRON,,,,2,اخو,أَخ,,,37,1
536,2,34,9,أَبَىٰ,,,,,V,,,,,1,ابي,أَبَى,b8,1,13,1


In [17]:
corpus_df['lemma'].dropna().count() == corpus_df['lemma'].count()

True

In [18]:
corpus_df.head(30)

Unnamed: 0,surah,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,pos3,pos4,pos5,count,root_ar,lemma,verb_type,verf_form
0,1,1,1,بِ,سْمِ,,,,P,N,,,,2,سمو,اسْم,,
1,1,1,2,ٱللَّهِ,,,,,PN,,,,,1,اله,اللَّه,,
2,1,1,3,ٱل,رَّحْمَٰنِ,,,,DET,N,,,,2,رحم,رَحْمٰن,,
3,1,1,4,ٱل,رَّحِيمِ,,,,DET,N,,,,2,رحم,رَحِيم,,
4,1,2,1,ٱلْ,حَمْدُ,,,,DET,N,,,,2,حمد,حَمْد,,
5,1,2,2,لِ,لَّهِ,,,,P,PN,,,,2,اله,اللَّه,,
6,1,2,3,رَبِّ,,,,,N,,,,,1,ربب,رَبّ,,
7,1,2,4,ٱلْ,عَٰلَمِينَ,,,,DET,N,,,,2,علم,عالَم,,
8,1,3,1,ٱل,رَّحْمَٰنِ,,,,DET,N,,,,2,رحم,رَحْمٰن,,
9,1,3,2,ٱل,رَّحِيمِ,,,,DET,N,,,,2,رحم,رَحِيم,,


In [19]:
corpus_df.at[29, 'lemma'] == None

True

In [20]:
corpus_df[pd.isna(corpus_df['lemma'])]

Unnamed: 0,surah,ayah,word,ar1,ar2,ar3,ar4,ar5,pos1,pos2,pos3,pos4,pos5,count,root_ar,lemma,verb_type,verf_form
29,2,1,1,الٓمٓ,,,,,INL,,,,,1,,,,
55,2,4,11,هُمْ,,,,,PRON,,,,,1,,,,
63,2,5,7,هُمُ,,,,,PRON,,,,,1,,,,
85,2,7,10,وَ,لَ,هُمْ,,,REM,P,PRON,,,3,,,,
97,2,8,10,هُم,,,,,PRON,,,,,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77321,109,5,2,أَنتُمْ,,,,,PRON,,,,,1,,,,
77325,109,6,1,لَ,كُمْ,,,,P,PRON,,,,2,,,,
77327,109,6,3,وَ,لِ,ىَ,,,CONJ,P,PRON,,,3,,,,
77372,112,1,2,هُوَ,,,,,PRON,,,,,1,,,,


In [35]:
# generate word parts df
# word_part, sura, ayah, word_index, part_index, pos, word_root, word_lemma, word_part_without_harakat

import pyarabic.araby as araby

word_parts_data = []

for index, row in corpus_df.iterrows():
    word_part_count = row['count']
    for word_part_index in range(word_part_count):
        word_part = row[f"ar{word_part_index+1}"]
        word_parts_data.append({
            "word_part": word_part,
            "sura": row["surah"],
            "ayah": row["ayah"],
            "word_index": row["word"],
            "part_index": word_part_index,
            "pos": row[f"pos{word_part_index+1}"],
            "word_root": row["root_ar"],
            "word_lemma": row["lemma"],
            # "verb_type": row["verb_type"],
            # "verb_form": row["verf_form"],
            "word_part_without_harakat": araby.strip_harakat(word_part)
        })

word_parts_df = pd.DataFrame(word_parts_data)
word_parts_df.head()

Unnamed: 0,word_part,sura,ayah,word_index,part_index,pos,word_root,word_lemma,word_part_without_harakat
0,بِ,1,1,1,0,P,سمو,اسْم,ب
1,سْمِ,1,1,1,1,N,سمو,اسْم,سم
2,ٱللَّهِ,1,1,2,0,PN,اله,اللَّه,ٱللّه
3,ٱل,1,1,3,0,DET,رحم,رَحْمٰن,ٱل
4,رَّحْمَٰنِ,1,1,3,1,N,رحم,رَحْمٰن,رّحمٰن


In [36]:
{
    "total_word_parts": word_parts_df.shape[0],
    "unique_word_parts": word_parts_df.groupby(["word_part"]).size().count(),
    "unique_word_parts_with_grammar_context": word_parts_df.groupby(["word_part", "pos"]).size().count(),
    "unique_word_part_without_harakat": word_parts_df.groupby(["word_part_without_harakat"]).size().count(),
    "unique_word_part_without_harakat_with_grammar_context": word_parts_df.groupby(["word_part_without_harakat", "pos"]).size().count(),
    "unique_word_root": word_parts_df.groupby(["word_root"]).size().count(),
    "unique_word_lemma": word_parts_df.groupby(["word_lemma"]).size().count(),
    "unique_lemma_with_root_context": word_parts_df.groupby(["word_lemma", "word_root"]).size().count(),
}

{'total_word_parts': 130035,
 'unique_word_parts': 12186,
 'unique_word_parts_with_grammar_context': 12391,
 'unique_word_part_without_harakat': 8703,
 'unique_word_part_without_harakat_with_grammar_context': 9078,
 'unique_word_root': 1652,
 'unique_word_lemma': 4726,
 'unique_lemma_with_root_context': 4751}