In [6]:
import json
import pandas as pd
from tabulate import tabulate

# Uthmani and Indopak: initial load and preview

In [None]:
# Load the two Quran text sources from UTF-8 JSON files under data/.
# Each path is kept in its own variable next to the read that uses it.
uthmani_path = "data/uthmani.json"
uthmani_df = pd.read_json(path_or_buf=uthmani_path, encoding="utf-8")

indopak_path = "data/indopak.json"
indopak_df = pd.read_json(path_or_buf=indopak_path, encoding="utf-8")

In [19]:
# Shapes of the frames exactly as pd.read_json produced them.
print("source indopak shape:", indopak_df.shape)
print("source uthmani shape:", uthmani_df.shape)

# The recorded output shows far more columns than rows for both frames
# (records ended up as columns), so swap the axes to get one row per record.
indopak_transpose = indopak_df.T
print("indopak transpose shape:", indopak_transpose.shape)
uthmani_transpose = uthmani_df.T
print("uthmani transpose shape:", uthmani_transpose.shape)

# Preview the first five rows of each transposed frame as a psql-style table.
print("Uthmani DataFrame:")
print(
    tabulate(uthmani_transpose.head(5), headers="keys", tablefmt="psql")
)

print("\nIndopak DataFrame:")
print(
    tabulate(indopak_transpose.head(5), headers="keys", tablefmt="psql")
)

source indopak shape: (5, 6236)
source uthmani shape: (6, 83668)
indopak transpose shape: (6236, 5)
uthmani transpose shape: (83668, 6)
Uthmani DataFrame:
+-------+------+---------+--------+--------+------------+---------+
|       |   id |   surah |   ayah |   word | location   | text    |
|-------+------+---------+--------+--------+------------+---------|
| 1:1:1 |    1 |       1 |      1 |      1 | 1:1:1      | بِسْمِ     |
| 1:1:2 |    2 |       1 |      1 |      2 | 1:1:2      | ٱللَّهِ    |
| 1:1:3 |    3 |       1 |      1 |      3 | 1:1:3      | ٱلرَّحْمَـٰنِ |
| 1:1:4 |    4 |       1 |      1 |      4 | 1:1:4      | ٱلرَّحِيمِ  |
| 1:1:5 |    5 |       1 |      1 |      5 | 1:1:5      | ١       |
+-------+------+---------+--------+--------+------------+---------+

Indopak DataFrame:
+-----+------+-------------+---------+--------+------------------------+
|     |   id | verse_key   |   surah |   ayah | text                   |
|-----+------+-------------+---------+--------+----

# Loading transliteration from Hugging Face

The problem is that the dataset is really big, and I did not want to download the whole thing. I tried streaming it with `Dataset`, but that was no better. Fetching each parquet file through its URL did not work well either.

So the best option is to download everything.

In [9]:
# Step 1: location of one locally downloaded shard (file 00000 of 00071).
parquet_file_path = "data/burqaan-parquet/train-00000-of-00071.parquet"

# Step 2: the columns worth loading -- surah/ayah identifiers, surah names,
# the ayah count, and the English and transliterated ayah text.
cols = [
    "surah_id",
    "ayah_id",
    "surah_name_ar",
    "surah_name_en",
    "surah_name_tr",
    "ayah_count",
    "ayah_en",
    "ayah_tr",
]

# Step 3: request only those columns from the parquet file; every other
# column in the shard is left unread.
df = pd.read_parquet(path=parquet_file_path, columns=cols)

# Step 4: keep the first row for each (surah_id, ayah_id) pair.
# NOTE(review): the original comment says each verse is repeated once per
# reciter (about 30 rows per verse) -- confirm against the dataset card.
df_unique = df.drop_duplicates(subset=["surah_id", "ayah_id"], keep="first")

# Step 5: pretty-print the first five unique verses as a psql-style table.
# (No transpose happens here; head() defaults to five rows.)
print(
    tabulate(df_unique.head(5), headers="keys", tablefmt="psql")
)

+-----+------------+-----------+-----------------+-----------------+-----------------+--------------+-----------------------------------------------------------------------+------------------------------------+
|     |   surah_id |   ayah_id | surah_name_ar   | surah_name_en   | surah_name_tr   |   ayah_count | ayah_en                                                               | ayah_tr                            |
|-----+------------+-----------+-----------------+-----------------+-----------------+--------------+-----------------------------------------------------------------------+------------------------------------|
|   0 |          1 |         1 | الفاتحة         | The Opening     | Al-Faatiha      |            7 | In the name of Allah, the Entirely Merciful, the Especially Merciful. | bis'mi l-lahi l-raḥmāni l-raḥīmi   |
|  30 |          1 |         2 | الفاتحة         | The Opening     | Al-Faatiha      |            7 | [All] praise is [due] to Allah, Lord of the worlds -  

In [10]:
# Report (rows, columns) of the deduplicated frame built from the shard.
n_rows, n_cols = df_unique.shape
print((n_rows, n_cols))

(88, 8)
