In [1]:
# Quran CSV to SQLite Consolidation

# Cell 1: Imports
import os
import pandas as pd
import sqlite3

# Cell 2: Set up paths
archive_folder = './csv_data'       # folder scanned for translation CSV files
db_path = 'quran_translations1.db'  # SQLite file the merged table is written to

# Cell 3: List CSV files
# os.listdir() returns entries in arbitrary, platform-dependent order. Sorting
# makes the merge order (and so the column order of the output table) the same
# on every run and on every machine.
csv_files = sorted(f for f in os.listdir(archive_folder) if f.endswith('.csv'))
print("CSV files found:", csv_files)

def read_translation_csv(filepath, lang_col):
    """Parse a pipe-delimited ``chapter|verse|translation`` file into a DataFrame.

    The file is read line by line, first as ``utf-8-sig`` and, if that raises
    ``UnicodeDecodeError``, again from the start as ``latin1``. Lines with fewer
    than three ``|``-separated fields, or whose chapter/verse fields are not
    integers, are skipped silently.

    Parameters
    ----------
    filepath : str
        Path of the file to read.
    lang_col : str
        Name of the column that receives the translation text.

    Returns
    -------
    pandas.DataFrame
        Columns ``chapter_number`` (int), ``verse_number`` (int) and
        ``lang_col``. When no row could be parsed, the frame is empty but still
        has these three columns, so it can be merged on the two key columns.
    """
    tried_encodings = ['utf-8-sig', 'latin1']
    rows = []
    for enc in tried_encodings:
        try:
            with open(filepath, encoding=enc) as f:
                for line in f:
                    # maxsplit=2: any further '|' stays inside the translation text.
                    parts = line.strip().split('|', 2)
                    if len(parts) < 3:
                        continue
                    chapter, verse, translation = parts
                    try:
                        chapter = int(chapter)
                        verse = int(verse)
                    except ValueError:
                        # Header or otherwise malformed line: ignore it.
                        continue
                    rows.append({
                        'chapter_number': chapter,
                        'verse_number': verse,
                        lang_col: translation
                    })
            break  # file decoded completely with this encoding
        except UnicodeDecodeError:
            # Decoding can fail part-way through the file; discard the partial
            # result before retrying with the next encoding.
            rows = []
    if not rows:
        # latin1 can decode any byte sequence, so in practice this is reached
        # when the file contains no valid "chapter|verse|text" line.
        print(f"Could not decode {filepath} with tried encodings.")
        # pd.DataFrame([]) would have no columns at all, which makes a later
        # merge on the key columns fail with KeyError; return a typed empty frame.
        return pd.DataFrame(
            columns=['chapter_number', 'verse_number', lang_col]
        ).astype({'chapter_number': 'int64', 'verse_number': 'int64'})
    return pd.DataFrame(rows)

# Cell 4: Read and process CSVs
from functools import reduce

dfs = []
for csv_file in csv_files:
    # Column name is derived from the file name: lower-cased stem, spaces
    # replaced by underscores, plus a '_translation' suffix.
    lang = os.path.splitext(csv_file)[0].lower().replace(' ', '_') + '_translation'
    df = read_translation_csv(os.path.join(archive_folder, csv_file), lang)
    if df.empty:
        # An empty frame contributes no verses (and may lack the key columns
        # entirely), so leave it out of the merge instead of failing there.
        print(f"Skipping {csv_file}: no verse rows parsed.")
        continue
    dfs.append(df)

if not dfs:
    # reduce() on an empty list raises an opaque TypeError; fail with a clear message.
    raise ValueError(f"No translation rows found in {archive_folder!r}; nothing to merge.")

# Cell 5: Merge DataFrames on chapter_number and verse_number
# Outer join keeps every (chapter, verse) pair seen in any file; a translation
# missing from one file shows up as NaN in that file's column.
df_merged = reduce(
    lambda left, right: pd.merge(left, right, on=['chapter_number', 'verse_number'], how='outer'),
    dfs
)
df_merged = df_merged.sort_values(['chapter_number', 'verse_number']).reset_index(drop=True)

# Cell 6: Save to SQLite
# Used as a context manager, a sqlite3 connection only commits or rolls back;
# it does not close itself, so close it explicitly to release the file handle.
conn = sqlite3.connect(db_path)
try:
    with conn:
        df_merged.to_sql('quran_translations', conn, if_exists='replace', index=False)
finally:
    conn.close()
print(f"Database saved to {db_path}")

CSV files found: ['Arabic-Original.csv', 'Bangla.csv', 'Chinese.csv', 'German.csv', 'Italian-Piccardo.csv', 'Japanese.csv', 'Malayalam.csv', 'Norwegian.csv', 'Persian.csv', 'Portuguese.csv', 'Russian.csv', 'Turkish.csv', 'Urdu.csv', 'Uzbek.csv']
Database saved to quran_translations1.db


In [5]:
import os
import pandas as pd
import sqlite3
# Cell: Add quran_metadata table from The Quran Dataset.csv

# NOTE(review): the consolidation cell above writes to 'quran_translations1.db',
# while this cell targets 'quran_translations.db'. Confirm that the two tables
# are really meant to live in different database files.
db_path = 'quran_translations.db'
metadata_csv = os.path.join('.','The Quran Dataset.csv')

# Columns loaded from the CSV. pandas raises ValueError if any of them is
# missing from the file header.
metadata_columns = [
    'surah_no',
    'surah_name_en',
    'surah_name_ar',
    'surah_name_roman',
    'ayah_no_surah',
    'ayah_no_quran',
    'ayah_ar',
    'ayah_en',
    'ruko_no',
    'juz_no',
    'manzil_no',
    'hizb_quarter',
    'total_ayah_surah',
    'total_ayah_quran',
    'place_of_revelation',
    'sajah_ayah',
    'sajdah_no',
    'no_of_word_ayah',
    'list_of_words'
]

# Try utf-8-sig, fallback to latin1
try:
    metadata_df = pd.read_csv(metadata_csv, encoding='utf-8-sig', usecols=metadata_columns)
except UnicodeDecodeError:
    metadata_df = pd.read_csv(metadata_csv, encoding='latin1', usecols=metadata_columns)

# Used as a context manager, a sqlite3 connection only commits or rolls back;
# it does not close itself, so close it explicitly to release the file handle.
conn = sqlite3.connect(db_path)
try:
    with conn:
        metadata_df.to_sql('quran_metadata', conn, if_exists='replace', index=False)
finally:
    conn.close()

print("quran_metadata table added to database.")

quran_metadata table added to database.
