In [1]:
# Import the libraries needed to parse the XML files and build the DataFrames
import pandas as pd
import xml.etree.ElementTree as ET 

In [2]:
# Load the Arabic text file and keep a handle on its top-level element
tree_data = ET.parse(source='quran-simple.xml')
root = tree_data.getroot()
root

<Element 'quran' at 0x0000020D491517C0>

In [3]:
# One accumulator list per output column; all start empty and independent
sura_index, sura_name = [], []
aya_index, aya_text, aya_bismillah = [], [], []

In [4]:
# Flatten the sura -> aya hierarchy into one entry per verse.
# The accumulator lists are re-created here so that re-running this cell
# starts from scratch instead of appending every verse a second time.
sura_index, sura_name, aya_index, aya_text, aya_bismillah = [], [], [], [], []
for sura in root.findall('sura'):
    for aya in sura.findall('aya'):
        # Sura-level attributes are repeated on every verse row
        sura_index.append(sura.get('index'))
        sura_name.append(sura.get('name'))
        aya_index.append(aya.get('index'))
        aya_text.append(aya.get('text'))
        # 'bismillah' is optional on an aya; fall back to an empty string
        aya_bismillah.append(aya.get('bismillah', ''))

In [5]:
# Assemble the verse-level frame: pair each column label with its value list
quran_df = pd.DataFrame(
    dict(
        zip(
            ['Sura Index', 'Sura Name', 'Aya Index', 'Aya Text', 'Bismillah'],
            [sura_index, sura_name, aya_index, aya_text, aya_bismillah],
        )
    )
)

In [6]:
# Display the verse-level frame (one row per aya collected from the XML)
quran_df

Unnamed: 0,Sura Index,Sura Name,Aya Index,Aya Text,Bismillah
0,1,الفاتحة,1,بِسْمِ اللَّهِ الرَّحْمَـٰنِ الرَّحِيمِ,
1,1,الفاتحة,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,
2,1,الفاتحة,3,الرَّحْمَـٰنِ الرَّحِيمِ,
3,1,الفاتحة,4,مَالِكِ يَوْمِ الدِّينِ,
4,1,الفاتحة,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,
...,...,...,...,...,...
6231,114,الناس,2,مَلِكِ النَّاسِ,
6232,114,الناس,3,إِلَـٰهِ النَّاسِ,
6233,114,الناس,4,مِن شَرِّ الْوَسْوَاسِ الْخَنَّاسِ,
6234,114,الناس,5,الَّذِي يُوَسْوِسُ فِي صُدُورِ النَّاسِ,


In [7]:
# Load the metadata file; its root element is queried below for <sura> and <juz> entries
tree_data = ET.parse(source='quran-meta-data.xml')
meta_data = tree_data.getroot()
meta_data

<Element 'quran' at 0x0000020D519A4400>

In [8]:
# Fresh, independent accumulators for the per-sura metadata columns
sura_index, sura_tname = [], []
sura_ename, sura_type = [], []

In [9]:
# Collect the metadata attributes of every <sura> element, at any depth.
# The lists are re-created here so that re-running this cell does not
# append the same suras a second time.
sura_index, sura_tname, sura_ename, sura_type = [], [], [], []
for sura in meta_data.findall('.//sura'):
    sura_index.append(sura.get('index'))
    sura_tname.append(sura.get('tname'))
    sura_ename.append(sura.get('ename'))
    sura_type.append(sura.get('type'))

In [10]:
# Inspect the 'tname' values collected from the metadata file
sura_tname

['Al-Faatiha',
 'Al-Baqara',
 'Aal-i-Imraan',
 'An-Nisaa',
 'Al-Maaida',
 "Al-An'aam",
 "Al-A'raaf",
 'Al-Anfaal',
 'At-Tawba',
 'Yunus',
 'Hud',
 'Yusuf',
 "Ar-Ra'd",
 'Ibrahim',
 'Al-Hijr',
 'An-Nahl',
 'Al-Israa',
 'Al-Kahf',
 'Maryam',
 'Taa-Haa',
 'Al-Anbiyaa',
 'Al-Hajj',
 'Al-Muminoon',
 'An-Noor',
 'Al-Furqaan',
 "Ash-Shu'araa",
 'An-Naml',
 'Al-Qasas',
 'Al-Ankaboot',
 'Ar-Room',
 'Luqman',
 'As-Sajda',
 'Al-Ahzaab',
 'Saba',
 'Faatir',
 'Yaseen',
 'As-Saaffaat',
 'Saad',
 'Az-Zumar',
 'Al-Ghaafir',
 'Fussilat',
 'Ash-Shura',
 'Az-Zukhruf',
 'Ad-Dukhaan',
 'Al-Jaathiya',
 'Al-Ahqaf',
 'Muhammad',
 'Al-Fath',
 'Al-Hujuraat',
 'Qaaf',
 'Adh-Dhaariyat',
 'At-Tur',
 'An-Najm',
 'Al-Qamar',
 'Ar-Rahmaan',
 'Al-Waaqia',
 'Al-Hadid',
 'Al-Mujaadila',
 'Al-Hashr',
 'Al-Mumtahana',
 'As-Saff',
 "Al-Jumu'a",
 'Al-Munaafiqoon',
 'At-Taghaabun',
 'At-Talaaq',
 'At-Tahrim',
 'Al-Mulk',
 'Al-Qalam',
 'Al-Haaqqa',
 "Al-Ma'aarij",
 'Nooh',
 'Al-Jinn',
 'Al-Muzzammil',
 'Al-Muddaththir',
 'Al-

In [11]:
# Build the per-sura metadata frame from the collected attribute lists
suras_df = pd.DataFrame(
    data=dict(
        index=sura_index,
        tname=sura_tname,
        ename=sura_ename,
        type=sura_type,
    )
)

In [12]:
# Display the per-sura metadata frame
suras_df

Unnamed: 0,index,tname,ename,type
0,1,Al-Faatiha,The Opening,Meccan
1,2,Al-Baqara,The Cow,Medinan
2,3,Aal-i-Imraan,The Family of Imraan,Medinan
3,4,An-Nisaa,The Women,Medinan
4,5,Al-Maaida,The Table,Medinan
...,...,...,...,...
109,110,An-Nasr,Divine Support,Medinan
110,111,Al-Masad,The Palm Fibre,Meccan
111,112,Al-Ikhlaas,Sincerity,Meccan
112,113,Al-Falaq,The Dawn,Meccan


In [13]:
# Independent accumulators for the juz attributes read from the metadata
juz_index, juz_sura, juz_aya = [], [], []

In [14]:
# Collect the attributes of every <juz> element, at any depth.
# The lists are re-created here so that re-running this cell does not
# append the same juz entries a second time.
juz_index, juz_sura, juz_aya = [], [], []
for juz in meta_data.findall('.//juz'):
    juz_index.append(juz.get('index'))
    juz_sura.append(juz.get('sura'))
    juz_aya.append(juz.get('aya'))

In [15]:
# Build the juz (para) frame from the collected attribute lists
juzs_df = pd.DataFrame(
    data=dict(
        juz_index=juz_index,
        juz_sura=juz_sura,
        juz_aya=juz_aya,
    )
)

In [16]:
# Display the juz table: the sura/aya pair recorded for each juz
# (used further down as the verse where that juz begins)
juzs_df

Unnamed: 0,juz_index,juz_sura,juz_aya
0,1,1,1
1,2,2,142
2,3,2,253
3,4,3,93
4,5,4,24
5,6,4,148
6,7,5,82
7,8,6,111
8,9,7,88
9,10,8,41


In [17]:
# Check the size and column names of the verse frame before merging
print(quran_df.shape)
quran_df.columns

(6236, 5)


Index(['Sura Index', 'Sura Name', 'Aya Index', 'Aya Text', 'Bismillah'], dtype='object')

In [18]:
# Check the size and column names of the juz frame before merging
print(juzs_df.shape)
juzs_df.columns

(30, 3)


Index(['juz_index', 'juz_sura', 'juz_aya'], dtype='object')

In [19]:
# Look at the juz rows once more; their sura/aya columns are the merge keys used next
juzs_df

Unnamed: 0,juz_index,juz_sura,juz_aya
0,1,1,1
1,2,2,142
2,3,2,253
3,4,3,93
4,5,4,24
5,6,4,148
6,7,5,82
7,8,6,111
8,9,7,88
9,10,8,41


In [20]:
# Attach the juz number to the verse matching each juz's (sura, aya) pair.
# A left join keeps every verse exactly once and in its original order. An
# outer join sorts the (string) keys lexicographically on recent pandas
# (2.2+) -- '1', '10', '100', ... -- which would scramble the verse order and
# break both the forward-fill of 'juz_index' and the position-based merges
# with the translation files further down.
quran_df = pd.merge(
    quran_df,
    juzs_df,
    how='left',
    left_on=['Sura Index', 'Aya Index'],
    right_on=['juz_sura', 'juz_aya'],
)

In [21]:
# Confirm the merge kept one row per verse and added the three juz columns
quran_df.shape

(6236, 8)

In [22]:
# Display the merged frame; the juz columns are only filled on rows that matched a juz entry
quran_df

Unnamed: 0,Sura Index,Sura Name,Aya Index,Aya Text,Bismillah,juz_index,juz_sura,juz_aya
0,1,الفاتحة,1,بِسْمِ اللَّهِ الرَّحْمَـٰنِ الرَّحِيمِ,,1,1,1
1,1,الفاتحة,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,,,,
2,1,الفاتحة,3,الرَّحْمَـٰنِ الرَّحِيمِ,,,,
3,1,الفاتحة,4,مَالِكِ يَوْمِ الدِّينِ,,,,
4,1,الفاتحة,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,,,,
...,...,...,...,...,...,...,...,...
6231,114,الناس,2,مَلِكِ النَّاسِ,,,,
6232,114,الناس,3,إِلَـٰهِ النَّاسِ,,,,
6233,114,الناس,4,مِن شَرِّ الْوَسْوَاسِ الْخَنَّاسِ,,,,
6234,114,الناس,5,الَّذِي يُوَسْوِسُ فِي صُدُورِ النَّاسِ,,,,


In [23]:
# The right-hand merge keys repeat 'Sura Index' / 'Aya Index', so remove them
quran_df.drop(columns=['juz_sura', 'juz_aya'], inplace=True)

In [24]:
# Only the rows that matched a juz entry carry a juz number; propagate it
# downwards so every following verse inherits it.
# The result is assigned back to the column: fillna(method=...) is deprecated,
# and an in-place call on a column selection does not update the frame under
# pandas copy-on-write.
quran_df['juz_index'] = quran_df['juz_index'].ffill()

In [25]:
# Display the frame again to check that 'juz_index' is now filled on every row
quran_df

Unnamed: 0,Sura Index,Sura Name,Aya Index,Aya Text,Bismillah,juz_index
0,1,الفاتحة,1,بِسْمِ اللَّهِ الرَّحْمَـٰنِ الرَّحِيمِ,,1
1,1,الفاتحة,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ,,1
2,1,الفاتحة,3,الرَّحْمَـٰنِ الرَّحِيمِ,,1
3,1,الفاتحة,4,مَالِكِ يَوْمِ الدِّينِ,,1
4,1,الفاتحة,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ,,1
...,...,...,...,...,...,...
6231,114,الناس,2,مَلِكِ النَّاسِ,,30
6232,114,الناس,3,إِلَـٰهِ النَّاسِ,,30
6233,114,الناس,4,مِن شَرِّ الْوَسْوَاسِ الْخَنَّاسِ,,30
6234,114,الناس,5,الَّذِي يُوَسْوِسُ فِي صُدُورِ النَّاسِ,,30


In [26]:
# Row count should be unchanged; only the two helper key columns were removed
quran_df.shape

(6236, 6)

In [27]:
# List the sura metadata columns; 'index' is the key used for the next merge
suras_df.columns

Index(['index', 'tname', 'ename', 'type'], dtype='object')

In [28]:
# Add the per-sura metadata (tname, ename, type) to every verse.
# A left join preserves the verse order. An outer join sorts the string key
# lexicographically on recent pandas (2.2+) -- '1', '10', '100', ... -- and
# the translation files below are aligned purely by row position, so any
# reordering would pair verses with the wrong translations.
quran_df = pd.merge(
    quran_df,
    suras_df,
    how='left',
    left_on='Sura Index',
    right_on='index',
)

In [None]:
# Drop the join key brought in from suras_df; it duplicates 'Sura Index'.
# errors='ignore' keeps the cell re-runnable once the column is already gone,
# and assigning the result avoids mutating the frame in place.
quran_df = quran_df.drop(columns='index', errors='ignore')
quran_df

In [30]:
# Load the English translation, one verse per line, into a single column.
# The separator is a regular expression matching two consecutive spaces; it is
# written as a raw string because '\ ' is an invalid escape sequence in a
# normal string literal. Regex separators require the python engine, so it is
# requested explicitly instead of relying on pandas' fallback warning.
# NOTE(review): assumes no verse contains a double space -- confirm.
eng_df = pd.read_csv(
    'en.maududi.txt',
    sep=r'\  ',
    engine='python',
    header=None,
    names=['eng_maududi'],
)

In [31]:
# Display the English translation frame (a single text column)
eng_df

Unnamed: 0,eng_maududi
0,"In the name of Allah, the Merciful, the Compas..."
1,"Praise be to Allah, the Lord of the entire uni..."
2,"The Merciful, the Compassionate"
3,The Master of the Day of Recompense.
4,"You alone do we worship, and You alone do we t..."
...,...
6231,"the King of mankind,"
6232,"the True God of mankind,"
6233,"from the mischief of the whispering, elusive p..."
6234,who whispers in the hearts of people;


In [32]:
# Merge the English translation into the Quran frame, matching rows on the index
quran_df = quran_df.merge(
    eng_df,
    how='outer',
    left_index=True,
    right_index=True,
)

In [33]:
# Check dtypes and non-null counts after adding the English translation
quran_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6236 entries, 0 to 6235
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Sura Index   6236 non-null   object
 1   Sura Name    6236 non-null   object
 2   Aya Index    6236 non-null   object
 3   Aya Text     6236 non-null   object
 4   Bismillah    6236 non-null   object
 5   juz_index    6236 non-null   object
 6   index        6236 non-null   object
 7   tname        6236 non-null   object
 8   ename        6236 non-null   object
 9   type         6236 non-null   object
 10  eng_maududi  6236 non-null   object
dtypes: object(11)
memory usage: 713.7+ KB


In [35]:
# Load the Urdu translation, one verse per line, into a single column.
# The separator is a regular expression matching two consecutive spaces; it is
# written as a raw string because '\ ' is an invalid escape sequence in a
# normal string literal. Regex separators require the python engine, so it is
# requested explicitly instead of relying on pandas' fallback warning.
# NOTE(review): assumes no verse contains a double space -- confirm.
urdu_df = pd.read_csv(
    'ur.maududi.txt',
    sep=r'\  ',
    engine='python',
    header=None,
    names=['urdu_maududi'],
)

In [36]:
# Display the Urdu translation frame (a single text column)
urdu_df

Unnamed: 0,urdu_maududi
0,اللہ کے نام سے جو رحمان و رحیم ہے
1,تعریف اللہ ہی کے لیے ہے جو تمام کائنات کا رب ہے
2,رحمان اور رحیم ہے
3,روز جزا کا مالک ہے
4,ہم تیری ہی عبادت کرتے ہیں اور تجھی سے مدد مانگ...
...,...
6231,انسانوں کے بادشاہ
6232,انسانوں کے حقیقی معبود کی
6233,اُس وسوسہ ڈالنے والے کے شر سے جو بار بار پلٹ ک...
6234,جو لوگوں کے دلوں میں وسوسے ڈالتا ہے


In [38]:
# Load the Pashto translation, one verse per line, into a single column.
# The separator is a regular expression matching two consecutive spaces; it is
# written as a raw string because '\ ' is an invalid escape sequence in a
# normal string literal. Regex separators require the python engine, so it is
# requested explicitly instead of relying on pandas' fallback warning.
# NOTE(review): assumes no verse contains a double space -- confirm.
ps_df = pd.read_csv(
    'ps.abdulwali.txt',
    sep=r'\  ',
    engine='python',
    header=None,
    names=['pashto_abdulwali'],
)

In [39]:
# The row count should equal the number of verses in quran_df for the index-based merge
ps_df.shape

(6236, 1)

In [40]:
# Add the Urdu translation column, matching rows on the index
quran_df = quran_df.merge(
    urdu_df,
    how='outer',
    left_index=True,
    right_index=True,
)

In [41]:
# Add the Pashto translation column, matching rows on the index
quran_df = quran_df.merge(
    ps_df,
    how='outer',
    left_index=True,
    right_index=True,
)

In [42]:
# List the current column names before renaming them
quran_df.columns

Index(['Sura Index', 'Sura Name', 'Aya Index', 'Aya Text', 'Bismillah',
       'juz_index', 'index', 'tname', 'ename', 'type', 'eng_maududi',
       'urdu_maududi', 'pashto_abdulwali'],
      dtype='object')

In [43]:
# Switch the verse-level columns to snake_case English names
quran_df = quran_df.rename(
    columns={
        'Sura Index': 'chapter_number',
        'Sura Name': 'chapter_name',
        'Aya Index': 'verse_number',
        'Aya Text': 'verse',
        'Bismillah': 'bismillah',
        'juz_index': 'juz_number',
        'type': 'chapter_type',
    }
)

In [44]:
# Verify the new column names
quran_df.columns

Index(['chapter_number', 'chapter_name', 'verse_number', 'verse', 'bismillah',
       'juz_number', 'index', 'tname', 'ename', 'chapter_type', 'eng_maududi',
       'urdu_maududi', 'pashto_abdulwali'],
      dtype='object')

In [45]:
# Put the columns in reading order: juz, chapter info, then verse and translations.
# Columns not listed here (such as a leftover 'index' join key) are dropped.
quran_df = quran_df.reindex(
    [
        'juz_number',
        'chapter_number',
        'chapter_name',
        'tname',
        'ename',
        'chapter_type',
        'bismillah',
        'verse_number',
        'verse',
        'eng_maududi',
        'urdu_maududi',
        'pashto_abdulwali',
    ],
    axis='columns',
)

In [46]:
# Final check of the column set, dtypes and non-null counts
quran_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6236 entries, 0 to 6235
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   juz_number        6236 non-null   object
 1   chapter_number    6236 non-null   object
 2   chapter_name      6236 non-null   object
 3   tname             6236 non-null   object
 4   ename             6236 non-null   object
 5   chapter_type      6236 non-null   object
 6   bismillah         6236 non-null   object
 7   verse_number      6236 non-null   object
 8   verse             6236 non-null   object
 9   eng_maududi       6236 non-null   object
 10  urdu_maududi      6236 non-null   object
 11  pashto_abdulwali  6236 non-null   object
dtypes: object(12)
memory usage: 762.4+ KB


In [47]:
# Spot-check a single row by position to see that the verse, its metadata and the translations line up
quran_df.iloc[7]

juz_number                                                1
chapter_number                                            2
chapter_name                                         البقرة
tname                                             Al-Baqara
ename                                               The Cow
chapter_type                                        Medinan
bismillah           بِسْمِ اللَّهِ الرَّحْمَـٰنِ الرَّحِيمِ
verse_number                                              1
verse                                                   الم
eng_maududi                                 Alif. Lam. Mim.
urdu_maududi                                    الف لام میم
pashto_abdulwali                              الف، لام، ميم
Name: 7, dtype: object

In [48]:
# Save the updated dataframe as a CSV file
# quran_df.to_csv('quran.csv', index=False)