In [1]:
import pandas as pd
import psycopg2
from datasets import load_dataset
from typing import Dict, List
from collections import defaultdict

In [2]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file; the database settings read
# later with os.getenv (the SUPABASE_POSTGRES_* names) are expected to come from it.
load_dotenv()

True

In [2]:
def create_db_connection(db_host: str, db_name: str, db_user: str, db_password: str, db_port: str = "5432"):
    """Open a database connection and return it together with a fresh cursor.

    Args:
        db_host: Host passed to ``psycopg2.connect`` as ``host``.
        db_name: Database name passed as ``database``.
        db_user: User name passed as ``user``.
        db_password: Password passed as ``password``.
        db_port: Port passed as ``port`` (defaults to "5432").

    Returns:
        A ``(connection, cursor)`` tuple; the caller is responsible for
        closing both.
    """
    connect_kwargs = {
        "host": db_host,
        "database": db_name,
        "user": db_user,
        "password": db_password,
        "port": db_port,
    }
    connection = psycopg2.connect(**connect_kwargs)
    cursor = connection.cursor()
    return connection, cursor

In [3]:
def insert_surahs(cursor, df: pd.DataFrame) -> Dict[int, int]:
    """Insert each distinct surah and map surah numbers to the returned IDs.

    The surah metadata columns are selected from ``df`` and de-duplicated
    across all of those columns; one INSERT is issued per remaining row.

    Args:
        cursor: Cursor object providing ``execute`` and ``fetchone``.
        df: Frame containing ``surah`` and the ``surah_*`` metadata columns.

    Returns:
        Dict keyed by the ``surah`` value, holding the ``surah_id`` value
        read back from the ``RETURNING`` clause.
    """
    # Column order matches the placeholder order of the INSERT below.
    metadata_columns = [
        'surah', 'surah_name', 'surah_name_transliteration', 'surah_name_en',
        'surah_total_ayas', 'surah_type', 'surah_order_revealed', 'surah_rukus',
    ]
    distinct_surahs = df[metadata_columns].drop_duplicates()

    ids_by_surah = {}
    for _, record in distinct_surahs.iterrows():
        values = tuple(record[column] for column in metadata_columns)
        cursor.execute("""
            INSERT INTO surahs (
                surah_id, name, name_transliteration, name_en, 
                total_ayas, type, order_revealed, rukus
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            RETURNING surah_id;
        """, values)
        ids_by_surah[record['surah']] = cursor.fetchone()[0]

    return ids_by_surah

In [4]:
def insert_languages_and_translators(cursor, df: pd.DataFrame) -> Dict[str, int]:
    """
    Register languages and translators found in the translation column names.

    Column names are expected to look like ``translation_<lang>_<translator>``.
    Columns with fewer than three ``_``-separated parts are ignored silently;
    columns whose language part is not exactly 2 characters long are reported
    and skipped.

    Returns a dict mapping translator name to the ``translator_id`` read back
    from the database.

    NOTE(review): the mapping is keyed by translator name alone, so the same
    translator name under two language codes would collapse into one entry
    (the later column wins) - confirm this cannot happen in the input.
    """
    languages = set()
    language_by_translator = {}

    for col in df.columns:
        if not col.startswith('translation_'):
            continue

        # At most three pieces: 'translation', language code, translator name
        # (the translator name may itself contain underscores).
        pieces = col.split('_', 2)
        if len(pieces) < 3:
            continue

        lang_code, translator_name = pieces[1], pieces[2]
        if len(lang_code) != 2:
            print(
                f"Skipping invalid language code: {lang_code} from column: {col}")
            continue

        languages.add(lang_code)
        language_by_translator[translator_name] = lang_code

    # Languages: the upper-cased code doubles as the language name.
    for code in languages:
        cursor.execute("""
            INSERT INTO languages (language_code, language_name)
            VALUES (%s, %s)
            ON CONFLICT (language_code) DO NOTHING;
        """, (code, code.upper()))

    # Translators: collect the generated IDs as we go.
    translator_mapping = {}
    for translator_name, code in language_by_translator.items():
        cursor.execute("""
            INSERT INTO translators (name, language_code)
            VALUES (%s, %s)
            RETURNING translator_id;
        """, (translator_name, code))
        translator_mapping[translator_name] = cursor.fetchone()[0]

    # Summary of what was processed
    print(f"Processed {len(languages)} valid languages")
    print(f"Processed {len(translator_mapping)} translators")

    return translator_mapping

In [5]:
def insert_ayahs(cursor, df: pd.DataFrame, surah_mapping: Dict[int, int]) -> Dict[tuple, int]:
    """Insert one row per ayah and return a mapping of (surah, ayah) to ayah_id.

    Args:
        cursor: Cursor object providing ``execute`` and ``fetchone``.
        df: Frame with ``surah``, ``ayah`` and the ``arabic_text_*`` columns.
        surah_mapping: Surah number -> ``surah_id`` as returned by ``insert_surahs``.

    Returns:
        Dict keyed by ``(surah, ayah)`` holding the ``ayah_id`` read back
        from the ``RETURNING`` clause.

    Raises:
        KeyError: If a row's ``surah`` value is not present in ``surah_mapping``.
    """
    # Order matches the text placeholders of the INSERT below.
    text_columns = (
        'arabic_text_simple', 'arabic_text_simple_min',
        'arabic_text_simple_plain', 'arabic_text_simple_clean',
        'arabic_text_uthmani', 'arabic_text_original',
    )
    ayah_mapping = {}

    for _, row in df.iterrows():
        # pandas marks missing text as NaN/None; pass None so the driver sends
        # SQL NULL instead of a float NaN into a text column.
        texts = tuple(
            None if pd.isna(row[col]) else row[col] for col in text_columns)

        cursor.execute("""
            INSERT INTO ayahs (
                surah_id, ayah_number,
                arabic_text_simple, arabic_text_simple_min,
                arabic_text_simple_plain, arabic_text_simple_clean,
                arabic_text_uthmani, arabic_text_original
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            RETURNING ayah_id;
        """, (surah_mapping[row['surah']], row['ayah'], *texts))
        ayah_mapping[(row['surah'], row['ayah'])] = cursor.fetchone()[0]

    return ayah_mapping

In [6]:
def insert_translations(cursor, df: pd.DataFrame, ayah_mapping: Dict[tuple, int], translator_mapping: Dict[str, int]):
    """Insert (or update) one translation row per ayah and known translator.

    Translation columns are expected to be named
    ``translation_<lang>_<translator>``. Columns without a translator part,
    or whose translator is not in ``translator_mapping`` (for example columns
    skipped earlier because of an invalid language code), are reported and
    skipped instead of raising. Missing translation values are not inserted.

    Args:
        cursor: Cursor object providing ``execute``.
        df: Frame with ``surah``, ``ayah`` and the ``translation_*`` columns.
        ayah_mapping: ``(surah, ayah)`` -> ``ayah_id``.
        translator_mapping: Translator name -> ``translator_id``.

    Raises:
        KeyError: If a row's ``(surah, ayah)`` pair is missing from ``ayah_mapping``.
    """
    # Resolve each column's translator once; it does not depend on the row.
    column_targets = []
    for col in df.columns:
        if not col.startswith('translation_'):
            continue
        pieces = col.split('_', 2)
        translator = pieces[2] if len(pieces) == 3 else None
        if translator not in translator_mapping:
            print(f"Skipping translation column without a known translator: {col}")
            continue
        column_targets.append((col, translator_mapping[translator]))

    for _, row in df.iterrows():
        ayah_id = ayah_mapping[(row['surah'], row['ayah'])]

        for col, translator_id in column_targets:
            if pd.notna(row[col]):  # Only insert if translation exists
                cursor.execute("""
                    INSERT INTO translations (ayah_id, translator_id, translation_text)
                    VALUES (%s, %s, %s)
                    ON CONFLICT (ayah_id, translator_id) DO UPDATE 
                    SET translation_text = EXCLUDED.translation_text;
                """, (ayah_id, translator_id, row[col]))

In [7]:
# Load the dataset and convert its "train" split to a pandas DataFrame
dataset = load_dataset("nazimali/quran")
df = dataset["train"].to_pandas()
# Replace hyphens in column names with underscores; the insert helpers split
# translation column names on '_' to find the language and translator parts.
df.columns = df.columns.str.replace('-', '_')

Resolving data files:   0%|          | 0/172 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/172 [00:00<?, ?it/s]

In [8]:
# Preview the first rows of the loaded frame (all translation columns still present)
df.head()

Unnamed: 0,surah,ayah,surah_name,surah_total_ayas,surah_name_transliteration,surah_name_en,surah_type,surah_order_revealed,surah_rukus,arabic_text_simple,...,translation_ur_jalandhry,translation_ur_jawadi,translation_ur_junagarhi,translation_ur_kanzuliman,translation_ur_maududi,translation_ur_najafi,translation_ur_qadri,translation_uz_sodik,translation_zh_jian,translation_zh_majian
0,1,1,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,بِسْمِ اللَّهِ الرَّحْمَـٰنِ الرَّحِيمِ,...,شروع الله کا نام لے کر جو بڑا مہربان نہایت رحم...,عظیم اور دائمی رحمتوں والے خدا کے نام سے,شروع کرتا ہوں اللہ تعالیٰ کے نام سے جو بڑا مہر...,اللہ کے نام سے شروع جو بہت مہربان رحمت والا,اللہ کے نام سے جو رحمان و رحیم ہے,(شروع کرتا ہوں) اللہ کے نام سے جو بڑا مہربان ن...,اللہ کے نام سےشروع جو نہایت مہربان ہمیشہ رحم ف...,Меҳрибон ва раҳмли Аллоҳнинг номи билан бошлай...,奉至仁至慈的真主之名,奉至仁至慈的真主之名
1,1,2,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,...,سب طرح کی تعریف خدا ہی کو (سزاوار) ہے جو تمام ...,ساری تعریف اللہ کے لئے ہے جو عالمین کا پالنے و...,سب تعریف اللہ تعالیٰ کے لئے ہے جو تمام جہانوں ...,سب خوبیاں اللہ کو جو مالک سارے جہان والوں کا،,تعریف اللہ ہی کے لیے ہے جو تمام کائنات کا رب ہے,ہر قسم کی تعریف اس اللہ کے لیے جو سب جہانوں کا...,سب تعریفیں اللہ ہی کے لئے ہیں جو تمام جہانوں ک...,"Барча мақтов, шукрлар оламларнинг тарбиячиси А...",一切赞颂全归真主，众世界 的主，,一切贊頌，全歸真主，全世界的主，
2,1,3,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,الرَّحْمَـٰنِ الرَّحِيمِ,...,بڑا مہربان نہایت رحم والا,وہ عظیم اوردائمی رحمتوں والا ہے,بڑا مہربان نہایت رحم کرنے واﻻ,بہت مہربان رحمت والا،,رحمان اور رحیم ہے,جو (سب پر) بڑا مہربان (اور خاص بندوں پر) نہایت...,نہایت مہربان بہت رحم فرمانے والا ہے,"У Роҳман ва Раҳийм... (Барчага-кофирга ҳам, мў...",至仁至慈的主，,至仁至慈的主，
3,1,4,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,مَالِكِ يَوْمِ الدِّينِ,...,انصاف کے دن کا حاکم,روزِقیامت کا مالک و مختار ہے,بدلے کے دن (یعنی قیامت) کا مالک ہے,روز جزا کا مالک،,روز جزا کا مالک ہے,جزا و سزا کے دن کا مالک (و مختار) ہے۔,روزِ جزا کا مالک ہے,"У қиёмат кунининг подшоҳи, эгаси. (""Дин"" сўзи ...",报应日的主，,報應日的主。
4,1,5,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,...,(اے پروردگار) ہم تیری ہی عبادت کرتے ہیں اور تج...,پروردگار! ہم تیری ہی عبادت کرتے ہیں ا ور تجھی ...,ہم صرف تیری ہی عبادت کرتے ہیں اور صرف تجھ ہی س...,ہم تجھی کو پوجیں اور تجھی سے مدد چاہیں،,ہم تیری ہی عبادت کرتے ہیں اور تجھی سے مدد مانگ...,(اے اللہ!) ہم تیری ہی عبادت کرتے ہیں اور تجھ ہ...,(اے اللہ!) ہم تیری ہی عبادت کرتے ہیں اور ہم تج...,Фақат сенгагина ибодат қиламиз ва фақат сендан...,我们只崇拜你，只求你祐助,我們只崇拜你，只求你祐助，


In [9]:
# Keep only the English translations: collect every 'translation_*' column
# whose language segment is not 'en'. The full 'translation_en_' prefix is
# checked (rather than the substring 'en_') so a non-English column whose
# translator name happens to contain 'en_' is not kept by accident.
cols_to_drop = [
    col for col in df.columns
    if col.startswith('translation_') and not col.startswith('translation_en_')
]

# Drop the identified columns
df = df.drop(columns=cols_to_drop)

In [10]:
# Preview the frame again after dropping the non-English translation columns
df.head()

Unnamed: 0,surah,ayah,surah_name,surah_total_ayas,surah_name_transliteration,surah_name_en,surah_type,surah_order_revealed,surah_rukus,arabic_text_simple,...,translation_en_mubarakpuri,translation_en_pickthall,translation_en_qarai,translation_en_qaribullah,translation_en_sahih,translation_en_sarwar,translation_en_shakir,translation_en_transliteration,translation_en_wahiduddin,translation_en_yusufali
0,1,1,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,بِسْمِ اللَّهِ الرَّحْمَـٰنِ الرَّحِيمِ,...,"In the Name of Allah, the Most Gracious, the M...","In the name of Allah, the Beneficent, the Merc...","In the Name of Allah, the All-beneficent, the ...","In the Name of Allah, the Merciful, the Most M...","In the name of Allah, the Entirely Merciful, t...","In the Name of Allah, the Beneficent, the Merc...","In the name of Allah, the Beneficent, the Merc...",Bismi All<u>a</U>hi a<b>l</B>rra<u>h</U>m<u>a<...,"﻿In the name of God, the Most Gracious, the Mo...","In the name of Allah, Most Gracious, Most Merc..."
1,1,2,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,...,"Al-Hamd be to Allah, the Lord of all that exists.","Praise be to Allah, Lord of the Worlds,","All praise belongs to Allah, Lord of all the w...","Praise be to Allah, Lord of the Worlds,","[All] praise is [due] to Allah, Lord of the wo...","All praise belongs to God, Lord of the Universe,","All praise is due to Allah, the Lord of the Wo...",Al<u>h</U>amdu lill<u>a</U>hi rabbi alAA<u>a</...,"All praise is due to God, the Lord of the Univ...","Praise be to Allah, the Cherisher and Sustaine..."
2,1,3,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,الرَّحْمَـٰنِ الرَّحِيمِ,...,"Ar-Rahman (the Most Gracious), Ar-Rahim (the M...","The Beneficent, the Merciful.","the All-beneficent, the All-merciful,","the Merciful, the Most Merciful,","The Entirely Merciful, the Especially Merciful,","the Beneficent, the Merciful","The Beneficent, the Merciful.",A<b>l</B>rra<u>h</U>m<u>a</U>ni a<b>l</B>rra<u...,"the Beneficent, the Merciful;","Most Gracious, Most Merciful;"
3,1,4,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,مَالِكِ يَوْمِ الدِّينِ,...,The Owner of the Day of Recompense.,"Master of the Day of Judgment,",Master of the Day of Retribution.,Owner of the Day of Recompense.,Sovereign of the Day of Recompense.,and Master of the Day of Judgment,Master of the Day of Judgment.,M<u>a</U>liki yawmi a<b>l</B>ddeen<b>i</b>,Lord of the Day of Judgement.,Master of the Day of Judgment.
4,1,5,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,...,"You we worship, and You we ask for help.",Thee (alone) we worship; Thee (alone) we ask f...,"You [alone] do we worship, and to You [alone] ...",You (alone) we worship; and You (alone) we rel...,It is You we worship and You we ask for help.,"(Lord), You alone We do worship and from You a...",Thee do we serve and Thee do we beseech for help.,Iyy<u>a</U>ka naAAbudu waiyy<u>a</U>ka nastaAA...,"You alone we worship, and to You alone we turn...","Thee do we worship, and Thine aid we seek."


In [None]:
# Add an 'arabic_text_original' column taken from the 'ayah' column of quran.csv.
# NOTE(review): assigning a Series aligns on index labels, so this presumes
# quran.csv has one row per ayah in the same order as df; rows without a
# matching index label would become NaN - TODO confirm against the CSV.
df3 = pd.read_csv('quran.csv')
df['arabic_text_original'] = df3['ayah']

In [12]:
# Write the combined frame to full_quran.csv (index column omitted),
# then preview the first 10 rows, which now include arabic_text_original.
df.to_csv('full_quran.csv', index=False)
df.head(10)

Unnamed: 0,surah,ayah,surah_name,surah_total_ayas,surah_name_transliteration,surah_name_en,surah_type,surah_order_revealed,surah_rukus,arabic_text_simple,...,translation_en_pickthall,translation_en_qarai,translation_en_qaribullah,translation_en_sahih,translation_en_sarwar,translation_en_shakir,translation_en_transliteration,translation_en_wahiduddin,translation_en_yusufali,arabic_text_original
0,1,1,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,بِسْمِ اللَّهِ الرَّحْمَـٰنِ الرَّحِيمِ,...,"In the name of Allah, the Beneficent, the Merc...","In the Name of Allah, the All-beneficent, the ...","In the Name of Allah, the Merciful, the Most M...","In the name of Allah, the Entirely Merciful, t...","In the Name of Allah, the Beneficent, the Merc...","In the name of Allah, the Beneficent, the Merc...",Bismi All<u>a</U>hi a<b>l</B>rra<u>h</U>m<u>a<...,"﻿In the name of God, the Most Gracious, the Mo...","In the name of Allah, Most Gracious, Most Merc...",بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
1,1,2,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,...,"Praise be to Allah, Lord of the Worlds,","All praise belongs to Allah, Lord of all the w...","Praise be to Allah, Lord of the Worlds,","[All] praise is [due] to Allah, Lord of the wo...","All praise belongs to God, Lord of the Universe,","All praise is due to Allah, the Lord of the Wo...",Al<u>h</U>amdu lill<u>a</U>hi rabbi alAA<u>a</...,"All praise is due to God, the Lord of the Univ...","Praise be to Allah, the Cherisher and Sustaine...",ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ
2,1,3,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,الرَّحْمَـٰنِ الرَّحِيمِ,...,"The Beneficent, the Merciful.","the All-beneficent, the All-merciful,","the Merciful, the Most Merciful,","The Entirely Merciful, the Especially Merciful,","the Beneficent, the Merciful","The Beneficent, the Merciful.",A<b>l</B>rra<u>h</U>m<u>a</U>ni a<b>l</B>rra<u...,"the Beneficent, the Merciful;","Most Gracious, Most Merciful;",ٱلرَّحْمَٰنِ ٱلرَّحِيمِ
3,1,4,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,مَالِكِ يَوْمِ الدِّينِ,...,"Master of the Day of Judgment,",Master of the Day of Retribution.,Owner of the Day of Recompense.,Sovereign of the Day of Recompense.,and Master of the Day of Judgment,Master of the Day of Judgment.,M<u>a</U>liki yawmi a<b>l</B>ddeen<b>i</b>,Lord of the Day of Judgement.,Master of the Day of Judgment.,مَٰلِكِ يَوْمِ ٱلدِّينِ
4,1,5,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,...,Thee (alone) we worship; Thee (alone) we ask f...,"You [alone] do we worship, and to You [alone] ...",You (alone) we worship; and You (alone) we rel...,It is You we worship and You we ask for help.,"(Lord), You alone We do worship and from You a...",Thee do we serve and Thee do we beseech for help.,Iyy<u>a</U>ka naAAbudu waiyy<u>a</U>ka nastaAA...,"You alone we worship, and to You alone we turn...","Thee do we worship, and Thine aid we seek.",إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ
5,1,6,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,اهْدِنَا الصِّرَاطَ الْمُسْتَقِيمَ,...,"Show us the straight path,","Guide us on the straight path,","Guide us to the Straight Path,",Guide us to the straight path -,"(Lord), guide us to the right path,",Keep us on the right path.,Ihdin<u>a</U> a<b>l</B><u>ss</U>ir<u>at</U>a a...,Guide us to the straight path:,"Show us the straight way,",ٱهْدِنَا ٱلصِّرَٰطَ ٱلْمُسْتَقِيمَ
6,1,7,الفاتحة,7,Al-Faatiha,The Opening,Meccan,5,1,صِرَاطَ الَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ...,...,The path of those whom Thou hast favoured; Not...,the path of those whom You have blessed — such...,"the Path of those upon whom You have favored, ...",The path of those upon whom You have bestowed ...,the path of those to whom You have granted ble...,The path of those upon whom Thou hast bestowed...,<u>S</U>ir<u>at</U>a alla<u>th</U>eena anAAamt...,the path of those You have blessed; not of tho...,The way of those on whom Thou hast bestowed Th...,صِرَٰطَ ٱلَّذِينَ أَنْعَمْتَ عَلَيْهِمْ غَيْرِ...
7,2,1,البقرة,286,Al-Baqara,The Cow,Medinan,87,40,بِسْمِ اللَّهِ الرَّحْمَـٰنِ الرَّحِيمِ الم,...,Alif. Lam. Mim.,"Alif, Lam, Mim.",AlifLaamMeem.,"Alif, Lam, Meem.",Alif. Lam. Mim.,Alif Lam Mim.,Alifl<u>a</u>mmeem,Alif Lam Mim.,A. L. M.,الٓمٓ
8,2,2,البقرة,286,Al-Baqara,The Cow,Medinan,87,40,ذَٰلِكَ الْكِتَابُ لَا رَيْبَ ۛ فِيهِ ۛ هُدًى ...,...,This is the Scripture whereof there is no doub...,"This is the Book, there is no doubt in it, a g...","That is the (Holy) Book, where there is no dou...",This is the Book about which there is no doubt...,There is no doubt that this book is a guide fo...,"This Book, there is no doubt in it, is a guide...",<u>Tha</u>lika alkit<u>a</u>bu l<u>a</u> rayba...,This is the Book; there is no doubt in it. It ...,"This is the Book; in it is guidance sure, with...",ذَٰلِكَ ٱلْكِتَٰبُ لَا رَيْبَ ۛ فِيهِ ۛ هُدًۭى...
9,2,3,البقرة,286,Al-Baqara,The Cow,Medinan,87,40,الَّذِينَ يُؤْمِنُونَ بِالْغَيْبِ وَيُقِيمُونَ...,...,"Who believe in the Unseen, and establish worsh...","who believe in the Unseen, maintain the prayer...",Who believe in the unseen and establish the (d...,"Who believe in the unseen, establish prayer, a...","the pious who believe in the unseen, attend to...",Those who believe in the unseen and keep up pr...,Alla<u>th</u>eena yuminoona bi<b>a</b>lghaybi ...,"who believe in the unseen, and are steadfast i...","Who believe in the Unseen, are steadfast in pr...",ٱلَّذِينَ يُؤْمِنُونَ بِٱلْغَيْبِ وَيُقِيمُونَ...


In [16]:
# Database connection details, read from the environment in one pass
# (os.getenv yields None for any variable that is not set)
db_host, db_name, db_user, db_password = map(os.getenv, (
    "SUPABASE_POSTGRES_HOST",
    "SUPABASE_POSTGRES_DATABASE",
    "SUPABASE_POSTGRES_USER",
    "SUPABASE_POSTGRES_PASSWORD",
))

In [14]:
# Create database connection
conn, cur = create_db_connection(db_host, db_name, db_user, db_password)

try:
    # Run the whole import as ONE transaction. With autocommit enabled every
    # statement would be committed immediately, so the rollback in the except
    # branch could not undo a partially completed import.
    conn.autocommit = False

    # Insert data into normalized tables. Order matters: ayahs need the surah
    # IDs, translations need both the ayah IDs and the translator IDs.
    print("Inserting surahs...")
    surah_mapping = insert_surahs(cur, df)

    print("Inserting languages and translators...")
    translator_mapping = insert_languages_and_translators(cur, df)

    print("Inserting ayahs...")
    ayah_mapping = insert_ayahs(cur, df, surah_mapping)

    print("Inserting translations...")
    insert_translations(cur, df, ayah_mapping, translator_mapping)

    # Commit the transaction: nothing is persisted until this point
    conn.commit()
    print("Data successfully inserted into the normalized database!")

except Exception as e:
    # Discard everything inserted so far, report, and re-raise so the failure
    # stays visible in the notebook.
    conn.rollback()
    print(f"An error occurred: {str(e)}")
    raise

finally:
    # Always release the cursor and the connection
    cur.close()
    conn.close()

Inserting surahs...
Inserting languages and translators...
Processed 1 valid languages
Processed 16 translators
Inserting ayahs...
Inserting translations...
Data successfully inserted into the normalized database!
