In [1]:
import json
import pandas as pd
from tabulate import tabulate

# Uthmani and IndoPak: initial load and preview

In [None]:
# Locations of the two source JSON files, relative to the working directory.
uthmani_path, indopak_path = "data/uthmani.json", "data/indopak.json"

# Read each file into its own DataFrame, exactly as stored on disk.
uthmani_df = pd.read_json(
    uthmani_path,
    encoding="utf-8",
)
indopak_df = pd.read_json(
    indopak_path,
    encoding="utf-8",
)

In [19]:
print(f"source indopak shape: {indopak_df.shape}")
print(f"source uthmani shape: {uthmani_df.shape}")

# Flip both frames so that rows and columns swap places.

indopak_transpose = indopak_df.T
print(f"indopak transpose shape: {indopak_transpose.shape}")
uthmani_transpose = uthmani_df.T
print(f"uthmani transpose shape: {uthmani_transpose.shape}")

# Show the first rows of each transposed frame as a psql-style table.
print("Uthmani DataFrame:", tabulate(uthmani_transpose.head(), headers="keys", tablefmt="psql"), sep="\n")

print("\nIndopak DataFrame:", tabulate(indopak_transpose.head(), headers="keys", tablefmt="psql"), sep="\n")

source indopak shape: (5, 6236)
source uthmani shape: (6, 83668)
indopak transpose shape: (6236, 5)
uthmani transpose shape: (83668, 6)
Uthmani DataFrame:
+-------+------+---------+--------+--------+------------+---------+
|       |   id |   surah |   ayah |   word | location   | text    |
|-------+------+---------+--------+--------+------------+---------|
| 1:1:1 |    1 |       1 |      1 |      1 | 1:1:1      | بِسْمِ     |
| 1:1:2 |    2 |       1 |      1 |      2 | 1:1:2      | ٱللَّهِ    |
| 1:1:3 |    3 |       1 |      1 |      3 | 1:1:3      | ٱلرَّحْمَـٰنِ |
| 1:1:4 |    4 |       1 |      1 |      4 | 1:1:4      | ٱلرَّحِيمِ  |
| 1:1:5 |    5 |       1 |      1 |      5 | 1:1:5      | ١       |
+-------+------+---------+--------+--------+------------+---------+

Indopak DataFrame:
+-----+------+-------------+---------+--------+------------------------+
|     |   id | verse_key   |   surah |   ayah | text                   |
|-----+------+-------------+---------+--------+----

In [12]:
# Load the joined CSV (utf-8-sig also accepts a leading BOM) and preview it.
df = pd.read_csv(
    "data/quran-md-indopak-joined.csv",
    encoding="utf-8-sig",
)

print("arabic text comparison", df.head(), sep="\n")

arabic text comparison
   surah_id  ayah_id surah_name_ar surah_name_en surah_name_tr  ayah_count  \
0         1        1       الفاتحة   The Opening    Al-Faatiha           7   
1         1        2       الفاتحة   The Opening    Al-Faatiha           7   
2         1        3       الفاتحة   The Opening    Al-Faatiha           7   
3         1        4       الفاتحة   The Opening    Al-Faatiha           7   
4         1        5       الفاتحة   The Opening    Al-Faatiha           7   

                                             ayah_en  \
0  In the name of Allah, the Entirely Merciful, t...   
1  [All] praise is [due] to Allah, Lord of the wo...   
2    The Entirely Merciful, the Especially Merciful,   
3                Sovereign of the Day of Recompense.   
4      It is You we worship and You we ask for help.   

                              ayah_tr  \
0    bis'mi l-lahi l-raḥmāni l-raḥīmi   
1   al-ḥamdu lillahi rabbi l-ʿālamīna   
2                 al-raḥmāni l-raḥīmi   
3      

# Checking the cleaned word and letter count CSV output

In [16]:
# Load the cleaned word/letter count table and show its shape and first rows.
df_wl_count = pd.read_csv("data/out/quran-word-and-letter-count-cleaned.csv")
print(df_wl_count.shape, df_wl_count.head(), sep="\n")

# The cleaned table must contain exactly 6236 rows.
print("Checking row count...")
row_total = len(df_wl_count)
assert row_total == 6236, f"Expected 6236 rows after filtering, but got {row_total}"
print("Row count as expected")

# Check specific word counts for known surah and ayah combinations to ensure correctness.
def check_surah_ayah(df, surah, ayah, true_word_count, true_letter_count):
    """Assert that a single (surah, ayah) row of ``df`` carries the expected counts.

    Args:
        df: DataFrame with ``surah_id``, ``ayah_id``, ``word_count`` and
            ``letter_count`` columns.
        surah: Value matched against the ``surah_id`` column.
        ayah: Value matched against the ``ayah_id`` column.
        true_word_count: Expected ``word_count`` of the matching row.
        true_letter_count: Expected ``letter_count`` of the matching row.

    Raises:
        AssertionError: If the (surah, ayah) pair does not match exactly one
            row, or if either count differs from its expected value.
    """
    # Build the selection once and reuse it, rather than re-filtering the
    # whole frame for every assertion (and again inside the failure message).
    matches = df[(df['surah_id'] == surah) & (df['ayah_id'] == ayah)]
    assert len(matches) == 1, f"Expected exactly one match for Sura {surah} Ayah {ayah}, but found {len(matches)}"

    # Read each count from its own column so the value keeps that column's dtype.
    assert matches['word_count'].iloc[0] == true_word_count, f"Word count mismatch for Sura {surah} Ayah {ayah}"
    assert matches['letter_count'].iloc[0] == true_letter_count, f"Letter count mismatch for Sura {surah} Ayah {ayah}"

print("Checking some known surah and ayah counts...")
# Each entry: (surah, ayah, expected word count, expected letter count).
known_counts = (
    (1, 1, 4, 19),
    (1, 2, 4, 17),
    (2, 255, 50, 185),
    (56, 10, 2, 15),
    (112, 2, 2, 9),
    (74, 8, 4, 16),
)
for surah, ayah, expected_words, expected_letters in known_counts:
    check_surah_ayah(df_wl_count, surah, ayah, expected_words, expected_letters)
print("All checks passed!")


(6236, 5)
   verse_sura_id  surah_id  ayah_id  word_count  letter_count
0            1.0         1        1           4            19
1            2.0         1        2           4            17
2            3.0         1        3           2            12
3            4.0         1        4           3            11
4            5.0         1        5           4            19
Checking row count...
Row count as expected
Checking some known surah and ayah counts...
All checks passed!


# Final join check

In [40]:
# Load the final joined CSV, then report its columns and shape.
df = pd.read_csv(
    "data/out/quran-md-indopak-wl-count.csv",
    encoding="utf-8-sig",
)
print(df.columns, df.shape, sep="\n")
# Tabulate the first rows without the ayah_uthmani / ayah_indopak columns.
preview = df.drop(columns=["ayah_uthmani", "ayah_indopak"]).head()
print(tabulate(preview, headers="keys", tablefmt="psql"))
# print(tabulate(df[['surah_id', 'ayah_id', 'ayah_uthmani', 'ayah_indopak']].head(), headers='keys', tablefmt='psql'))
# print(tabulate(df.head(), headers='keys', tablefmt='psql'))

def get_row(df, row):
    """Return the row of ``df`` at integer position ``row`` (looked up with ``iloc``)."""
    selected = df.iloc[row]
    return selected

# Print three fields of the first row, one per line.
first_row = get_row(df, 0)
for column in ("ayah_uthmani", "ayah_indopak", "word_count"):
    print(first_row[column])

Index(['surah_id', 'ayah_id', 'surah_name_ar', 'surah_name_en',
       'surah_name_tr', 'ayah_count', 'ayah_en', 'ayah_tr', 'ayah_uthmani',
       'ayah_indopak', 'word_count', 'letter_count'],
      dtype='str')
(6236, 12)
+----+------------+-----------+-----------------+-----------------+-----------------+--------------+-----------------------------------------------------------------------+------------------------------------+--------------+----------------+
|    |   surah_id |   ayah_id | surah_name_ar   | surah_name_en   | surah_name_tr   |   ayah_count | ayah_en                                                               | ayah_tr                            |   word_count |   letter_count |
|----+------------+-----------+-----------------+-----------------+-----------------+--------------+-----------------------------------------------------------------------+------------------------------------+--------------+----------------|
|  0 |          1 |         1 | الفاتحة         | 

## Simulate the final join

In [25]:
import pandas as pd
from tabulate import tabulate

"""
This file joins the indopak.json with the quran-md-ayahs-no-audio.csv
on surah_id and ayah_id, to create a combined dataset with both the metadata
and the indopak text.
"""

# Step 1: read indopak.json, transpose it, and show the shape plus the first rows.

indopak_path = "data/source/indopak.json"
indopak_df = pd.read_json(indopak_path, encoding="utf-8-sig")
print(f"source indopak shape: {indopak_df.shape}")

# Swap rows and columns of the loaded frame.
indopak_transpose = indopak_df.T
print(f"indopak transpose shape: {indopak_transpose.shape}")

print("\nIndopak DataFrame:", tabulate(indopak_transpose.head(), headers="keys", tablefmt="psql"), sep="\n")

source indopak shape: (5, 6236)
indopak transpose shape: (6236, 5)

Indopak DataFrame:
+-----+------+-------------+---------+--------+------------------------+
|     |   id | verse_key   |   surah |   ayah | text                   |
|-----+------+-------------+---------+--------+------------------------|
| 1:1 |    1 | 1:1         |       1 |      1 | بِسۡمِ اللهِ الرَّحۡمٰنِ الرَّحِيۡمِ |
| 1:2 |    2 | 1:2         |       1 |      2 | اَلۡحَمۡدُ لِلّٰهِ رَبِّ الۡعٰلَمِيۡنَۙ‏   |
| 1:3 |    3 | 1:3         |       1 |      3 | الرَّحۡمٰنِ الرَّحِيۡمِۙ‏          |
| 1:4 |    4 | 1:4         |       1 |      4 | مٰلِكِ يَوۡمِ الدِّيۡنِؕ‏          |
| 1:5 |    5 | 1:5         |       1 |      5 | اِيَّاكَ نَعۡبُدُ وَاِيَّاكَ نَسۡتَعِيۡنُؕ‏ |
+-----+------+-------------+---------+--------+------------------------+


In [38]:
# Step 2: read quran-md-ayahs-no-audio.csv, add a "surah:ayah" verse_key column,
# then report the shape and the first rows.

quran_md_path = "data/out/quran-md-ayahs-no-audio.csv"
quran_md_df = pd.read_csv(quran_md_path, encoding="utf-8").assign(
    verse_key=lambda frame: frame["surah_id"].astype(str) + ":" + frame["ayah_id"].astype(str)
)
print(f"source quran-md shape: {quran_md_df.shape}")
print(quran_md_df.head())

# Write the augmented frame back out without the index column.
quran_md_df.to_csv(
    "data/out/dummy1.csv",
    index=False,
    encoding="utf-8",
)

source quran-md shape: (6236, 10)
   surah_id  ayah_id surah_name_ar surah_name_en surah_name_tr  ayah_count  \
0         1        1       الفاتحة   The Opening    Al-Faatiha           7   
1         1        2       الفاتحة   The Opening    Al-Faatiha           7   
2         1        3       الفاتحة   The Opening    Al-Faatiha           7   
3         1        4       الفاتحة   The Opening    Al-Faatiha           7   
4         1        5       الفاتحة   The Opening    Al-Faatiha           7   

                                             ayah_en  \
0  In the name of Allah, the Entirely Merciful, t...   
1  [All] praise is [due] to Allah, Lord of the wo...   
2    The Entirely Merciful, the Especially Merciful,   
3                Sovereign of the Day of Recompense.   
4      It is You we worship and You we ask for help.   

                              ayah_tr  \
0    bis'mi l-lahi l-raḥmāni l-raḥīmi   
1   al-ḥamdu lillahi rabbi l-ʿālamīna   
2                 al-raḥmāni l-raḥīmi

In [30]:
# Join indopak and quran-md on surah_id and ayah_id.
# Transposing a mixed-type frame leaves every column as `object`, so the key
# columns are cast back to int64 before merging. Without the cast the merged
# surah_id / ayah_id columns come out as `object` instead of integers.
# NOTE(review): assumes quran_md_df's keys were read from the CSV as integers.
indopak_transpose_renamed = (
    indopak_transpose
    .rename(columns={'surah': 'surah_id', 'ayah': 'ayah_id'})
    .astype({'surah_id': 'int64', 'ayah_id': 'int64'})
)
joined_df = pd.merge(quran_md_df, indopak_transpose_renamed, on=["surah_id", "ayah_id"], how='inner')
# print(tabulate(joined_df.head(), headers='keys', tablefmt='psql'))
print(joined_df.head())

  surah_id ayah_id surah_name_ar surah_name_en surah_name_tr  ayah_count  \
0        1       1       الفاتحة   The Opening    Al-Faatiha           7   
1        1       2       الفاتحة   The Opening    Al-Faatiha           7   
2        1       3       الفاتحة   The Opening    Al-Faatiha           7   
3        1       4       الفاتحة   The Opening    Al-Faatiha           7   
4        1       5       الفاتحة   The Opening    Al-Faatiha           7   

                                             ayah_en  \
0  In the name of Allah, the Entirely Merciful, t...   
1  [All] praise is [due] to Allah, Lord of the wo...   
2    The Entirely Merciful, the Especially Merciful,   
3                Sovereign of the Day of Recompense.   
4      It is You we worship and You we ask for help.   

                              ayah_tr  \
0    bis'mi l-lahi l-raḥmāni l-raḥīmi   
1   al-ḥamdu lillahi rabbi l-ʿālamīna   
2                 al-raḥmāni l-raḥīmi   
3                 māliki yawmi l-dīni   
4

In [None]:
# Rename columns to be more clear, drop redundant columns, print shape.
joined_df = joined_df.rename(columns={'text': 'ayah_indopak', 'ayah_ar': 'ayah_uthmani'})
# The indopak 'surah'/'ayah' columns were already renamed to the join keys, and
# both merge inputs carried a 'verse_key', so the merge emits 'verse_key_x' and
# 'verse_key_y' rather than 'verse_key'. A strict drop of the old names raises
# KeyError, so drop whichever of these helper columns are actually present.
redundant_columns = ['surah', 'ayah', 'verse_key', 'verse_key_x', 'verse_key_y', 'id']
joined_df = joined_df.drop(columns=redundant_columns, errors='ignore')
print("joined shape:", joined_df.shape)

# 3. Join with word and letter count dataset on surah_id and ayah_id, print shape.
wl_count_path = "data/out/quran-word-and-letter-count-cleaned.csv"
wl_count_df = pd.read_csv(wl_count_path)

joined_df = pd.merge(joined_df, wl_count_df, on=['surah_id', 'ayah_id'], how='inner')
# verse_sura_id comes from the count table and is removed from the joined result.
joined_df = joined_df.drop(columns=['verse_sura_id'])
print("joined shape after adding word and letter count:", joined_df.shape)