In [1]:
import os
import pandas as pd
# import zipfile
import re

In [2]:
text_path = r'..\Datasets\QuranFullText'

In [3]:
quran_text = {
    "simple": "quran-simple.txt",
    "enhanced": "quran-simple-enhanced.txt",
    "min": "quran-simple-min.txt",
    "simple_cln": "quran-simple-clean.txt",
    "uthmani": "quran-uthmani.txt",
    "uthmani_min": "quran-uthmani-min.txt"
}

In [5]:
filename = os.path.join(text_path, quran_text["simple"])
filename

'..\\Datasets\\QuranFullText\\quran-simple.txt'

In [12]:
ds= pd.read_csv(filename, 
                nrows=6236, 
                delimiter="|",
                names=["SuraNum", "AyaNum", "AyaText"],
                )
display(ds.head(), ds.tail())

Unnamed: 0,SuraNum,AyaNum,AyaText
0,1,1,بِسْمِ اللَّهِ الرَّحْمَنِ الرَّحِيمِ
1,1,2,الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ
2,1,3,الرَّحْمَنِ الرَّحِيمِ
3,1,4,مَالِكِ يَوْمِ الدِّينِ
4,1,5,إِيَّاكَ نَعْبُدُ وَإِيَّاكَ نَسْتَعِينُ


Unnamed: 0,SuraNum,AyaNum,AyaText
6231,114,2,مَلِكِ النَّاسِ
6232,114,3,إِلَهِ النَّاسِ
6233,114,4,مِنْ شَرِّ الْوَسْوَاسِ الْخَنَّاسِ
6234,114,5,الَّذِي يُوَسْوِسُ فِي صُدُورِ النَّاسِ
6235,114,6,مِنَ الْجِنَّةِ وَالنَّاسِ


In [37]:
# Testing
aya = ds[(ds.SuraNum==1) & (ds.AyaNum==1)]["AyaText"]
aya

0    بِسْمِ اللَّهِ الرَّحْمَنِ الرَّحِيمِ
Name: AyaText, dtype: object

In [26]:
aya.iloc[0]

'بِسْمِ اللَّهِ الرَّحْمَنِ الرَّحِيمِ'

In [38]:
sura_num = 1
aya_num = 1
aya = ds[(ds.SuraNum == sura_num) & (ds.AyaNum == aya_num)]["AyaText"].iloc[0]
aya

'بِسْمِ اللَّهِ الرَّحْمَنِ الرَّحِيمِ'

In [29]:
words = aya.split(' ')
words

['بِسْمِ', 'اللَّهِ', 'الرَّحْمَنِ', 'الرَّحِيمِ']

In [30]:
words[0]

'بِسْمِ'

In [36]:
words_len = len(words)
words_count = list(range(words_len + 1))[1:]
words_count

[1, 2, 3, 4]

In [39]:
sura_list= [sura_num]*words_len
aya_num_list= [aya_num]*words_len
sura_list, aya_num_list

([1, 1, 1, 1], [1, 1, 1, 1])

In [44]:
def get_aya_lists(sura_num, aya_num):
    aya = ds[(ds.SuraNum == sura_num)
             & (ds.AyaNum == aya_num)]["AyaText"].iloc[0]
    words = aya.split(' ')
    words_len = len(words)
    words_count = list(range(words_len + 1))[1:]
    sura_list = [sura_num] * words_len
    aya_num_list = [aya_num] * words_len
    return [sura_list, aya_num_list, words_count, words]

In [45]:
get_aya_lists(1, 2)

[[1, 1, 1, 1],
 [2, 2, 2, 2],
 [1, 2, 3, 4],
 ['الْحَمْدُ', 'لِلَّهِ', 'رَبِّ', 'الْعَالَمِينَ']]

In [48]:
aya_df = pd.DataFrame(get_aya_lists(1, 2),
                      index=["sura_num", "aya_num", "word_num", "word_text"]).T
aya_df

Unnamed: 0,sura_num,aya_num,word_num,word_text
0,1,2,1,الْحَمْدُ
1,1,2,2,لِلَّهِ
2,1,2,3,رَبِّ
3,1,2,4,الْعَالَمِينَ


In [49]:
def get_aya_df(sura_num, aya_num):
    return pd.DataFrame(get_aya_lists(sura_num, aya_num),
                      index=["sura_num", "aya_num", "word_num", "word_text"]).T

In [50]:
get_aya_df(1, 5)

Unnamed: 0,sura_num,aya_num,word_num,word_text
0,1,5,1,إِيَّاكَ
1,1,5,2,نَعْبُدُ
2,1,5,3,وَإِيَّاكَ
3,1,5,4,نَسْتَعِينُ


In [51]:
pd.concat([get_aya_df(1, 2), get_aya_df(1, 3)])

Unnamed: 0,sura_num,aya_num,word_num,word_text
0,1,2,1,الْحَمْدُ
1,1,2,2,لِلَّهِ
2,1,2,3,رَبِّ
3,1,2,4,الْعَالَمِينَ
0,1,3,1,الرَّحْمَنِ
1,1,3,2,الرَّحِيمِ


In [56]:
# getting number of ayat of a sura
sura_num = 2
ds[(ds.SuraNum == sura_num)].head()
sura_ayas = ds[(ds.SuraNum == sura_num)]
sura_ayas.AyaNum.max()

286

In [58]:
# Creating the final data set

for sura_num in range(1, 115):
    sura_ayas = ds[(ds.SuraNum == sura_num)]
    num_ayas = sura_ayas.AyaNum.max()
    print(sura_num, num_ayas, end=" * ")

1 7 * 2 286 * 3 200 * 4 176 * 5 120 * 6 165 * 7 206 * 8 75 * 9 129 * 10 109 * 11 123 * 12 111 * 13 43 * 14 52 * 15 99 * 16 128 * 17 111 * 18 110 * 19 98 * 20 135 * 21 112 * 22 78 * 23 118 * 24 64 * 25 77 * 26 227 * 27 93 * 28 88 * 29 69 * 30 60 * 31 34 * 32 30 * 33 73 * 34 54 * 35 45 * 36 83 * 37 182 * 38 88 * 39 75 * 40 85 * 41 54 * 42 53 * 43 89 * 44 59 * 45 37 * 46 35 * 47 38 * 48 29 * 49 18 * 50 45 * 51 60 * 52 49 * 53 62 * 54 55 * 55 78 * 56 96 * 57 29 * 58 22 * 59 24 * 60 13 * 61 14 * 62 11 * 63 11 * 64 18 * 65 12 * 66 12 * 67 30 * 68 52 * 69 52 * 70 44 * 71 28 * 72 28 * 73 20 * 74 56 * 75 40 * 76 31 * 77 50 * 78 40 * 79 46 * 80 42 * 81 29 * 82 19 * 83 36 * 84 25 * 85 22 * 86 17 * 87 19 * 88 26 * 89 30 * 90 20 * 91 15 * 92 21 * 93 11 * 94 8 * 95 8 * 96 19 * 97 5 * 98 8 * 99 8 * 100 11 * 101 11 * 102 8 * 103 3 * 104 9 * 105 5 * 106 4 * 107 7 * 108 3 * 109 6 * 110 3 * 111 5 * 112 4 * 113 5 * 114 6 * 

In [78]:
# Creating the final data set
def get_full_dataset():
    words_gross_list = []
    for sura_num in range(1, 115):
        sura_ayas = ds[(ds.SuraNum == sura_num)]
        num_ayas = sura_ayas.AyaNum.max()
        print(sura_num, end=">")
        for aya_num in range(1, num_ayas + 1):
            words_gross_list.append(get_aya_df(sura_num, aya_num))
            pass
        pass
    print()
    words_df = pd.concat(words_gross_list)
    print("*Done*")
    # to free the memory
    words_gross_list=[]
    return words_df

In [79]:
words_df = get_full_dataset()

1>2>3>4>5>6>7>8>9>10>11>12>13>14>15>16>17>18>19>20>21>22>23>24>25>26>27>28>29>30>31>32>33>34>35>36>37>38>39>40>41>42>43>44>45>46>47>48>49>50>51>52>53>54>55>56>57>58>59>60>61>62>63>64>65>66>67>68>69>70>71>72>73>74>75>76>77>78>79>80>81>82>83>84>85>86>87>88>89>90>91>92>93>94>95>96>97>98>99>100>101>102>103>104>105>106>107>108>109>110>111>112>113>114>
*Done*


In [80]:
words_df.describe()

Unnamed: 0,sura_num,aya_num,word_num,word_text
count,77793,77793,77793,77793
unique,114,286,129,17574
top,2,7,1,مِنْ
freq,6140,1161,6236,1673


In [81]:
words_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77793 entries, 0 to 2
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sura_num   77793 non-null  object
 1   aya_num    77793 non-null  object
 2   word_num   77793 non-null  object
 3   word_text  77793 non-null  object
dtypes: object(4)
memory usage: 3.0+ MB


In [82]:
words_df.head()

Unnamed: 0,sura_num,aya_num,word_num,word_text
0,1,1,1,بِسْمِ
1,1,1,2,اللَّهِ
2,1,1,3,الرَّحْمَنِ
3,1,1,4,الرَّحِيمِ
0,1,2,1,الْحَمْدُ


In [83]:
words_df.reset_index()

Unnamed: 0,index,sura_num,aya_num,word_num,word_text
0,0,1,1,1,بِسْمِ
1,1,1,1,2,اللَّهِ
2,2,1,1,3,الرَّحْمَنِ
3,3,1,1,4,الرَّحِيمِ
4,0,1,2,1,الْحَمْدُ
...,...,...,...,...,...
77788,3,114,5,4,صُدُورِ
77789,4,114,5,5,النَّاسِ
77790,0,114,6,1,مِنَ
77791,1,114,6,2,الْجِنَّةِ


In [66]:
words_df.head(40).reset_index()

Unnamed: 0,index,sura_num,aya_num,word_num,word_text
0,0,1,1,1,بِسْمِ
1,1,1,1,2,اللَّهِ
2,2,1,1,3,الرَّحْمَنِ
3,3,1,1,4,الرَّحِيمِ
4,0,1,2,1,الْحَمْدُ
5,1,1,2,2,لِلَّهِ
6,2,1,2,3,رَبِّ
7,3,1,2,4,الْعَالَمِينَ
8,0,1,3,1,الرَّحْمَنِ
9,1,1,3,2,الرَّحِيمِ


In [67]:
# The computer added the basmala to all suras!
# we should modify that to make it added to sura#1 only, 
# We should check if surat altawba (#9) contains basmala or not
#  as it should not contain it.

get_aya_df(9, 1)

Unnamed: 0,sura_num,aya_num,word_num,word_text
0,9,1,1,بَرَاءَةٌ
1,9,1,2,مِنَ
2,9,1,3,اللَّهِ
3,9,1,4,وَرَسُولِهِ
4,9,1,5,إِلَى
5,9,1,6,الَّذِينَ
6,9,1,7,عَاهَدْتُمْ
7,9,1,8,مِنَ
8,9,1,9,الْمُشْرِكِينَ


In [68]:
# It is OK,

# But let's check the last aya
get_aya_df(9, 129)

Unnamed: 0,sura_num,aya_num,word_num,word_text
0,9,129,1,فَإِنْ
1,9,129,2,تَوَلَّوْا
2,9,129,3,فَقُلْ
3,9,129,4,حَسْبِيَ
4,9,129,5,اللَّهُ
5,9,129,6,لَا
6,9,129,7,إِلَهَ
7,9,129,8,إِلَّا
8,9,129,9,هُوَ
9,9,129,10,عَلَيْهِ


In [69]:
# OK too,

# what about surat alnas (#114)
get_aya_df(114, 1)

Unnamed: 0,sura_num,aya_num,word_num,word_text
0,114,1,1,بِسْمِ
1,114,1,2,اللَّهِ
2,114,1,3,الرَّحْمَنِ
3,114,1,4,الرَّحِيمِ
4,114,1,5,قُلْ
5,114,1,6,أَعُوذُ
6,114,1,7,بِرَبِّ
7,114,1,8,النَّاسِ


In [84]:
# We found the problem, that the dataset consideres the basmala as a
# part of each first aya, which is wrong!

# We should fix this byy starting to count from 4 if the aya is #1
# Except for sura alfatiha #1


def get_aya_lists(sura_num, aya_num):
    aya = ds[(ds.SuraNum == sura_num)
             & (ds.AyaNum == aya_num)]["AyaText"].iloc[0]
    words = aya.split(' ')
    # Any first aya should be truncated except Alfatiha and Altawba 1 & 9
    if (aya_num == 1) & (sura_num != 1) & (sura_num != 9):
        words = words[4:]
    words_len = len(words)
    words_count = list(range(words_len + 1))[1:]
    sura_list = [sura_num] * words_len
    aya_num_list = [aya_num] * words_len
    return [sura_list, aya_num_list, words_count, words]

In [71]:
get_aya_df(114, 1)

Unnamed: 0,sura_num,aya_num,word_num,word_text
0,114,1,1,قُلْ
1,114,1,2,أَعُوذُ
2,114,1,3,بِرَبِّ
3,114,1,4,النَّاسِ


In [124]:
def get_dataset(key="simple"):
    filename = os.path.join(text_path, quran_text[key])
    ds= pd.read_csv(filename, 
                nrows=6236, 
                delimiter="|",
                names=["SuraNum", "AyaNum", "AyaText"],
                )
    return ds
ds = get_dataset(key="simple")

In [125]:
# Creating the final data set
def get_aya_lists(sura_num, aya_num, num_ayas):
    aya = ds[(ds.SuraNum == sura_num)
             & (ds.AyaNum == aya_num)]["AyaText"].iloc[0]
    words = aya.split(' ')
    # Any first aya should be truncated except Alfatiha and Altawba 1 & 9
    if (aya_num == 1) & (sura_num != 1) & (sura_num != 9):
        words = words[4:]
    words_len = len(words)
    words_count = list(range(words_len + 1))[1:]
    words_c_list = [words_len] * words_len
    sura_list = [sura_num] * words_len
    aya_num_list = [aya_num] * words_len
    aya_c_list = [num_ayas] * words_len
    return [
        sura_list, aya_c_list, aya_num_list, words_c_list, words_count, words
    ]


def get_aya_df(sura_num, aya_num, num_ayas):
    return pd.DataFrame(get_aya_lists(sura_num, aya_num, num_ayas),
                        index=[
                            "sura_num", "aya_in_sura", "aya_num",
                            "words_in_aya", "word_num", "word_text"
                        ]).T


def get_full_dataset(ds):
    words_gross_list = []
    for sura_num in range(1, 115):
        sura_ayas = ds[(ds.SuraNum == sura_num)]
        num_ayas = sura_ayas.AyaNum.max()
        print(sura_num, end=">")
        for aya_num in range(1, num_ayas + 1):
            words_gross_list.append(get_aya_df(sura_num, aya_num, num_ayas))
            pass
        pass
    print()
    words_df = pd.concat(words_gross_list)
    print("*Done*")
    # to free the memory
    words_gross_list = []
    words_df.reset_index(inplace=True)
    words_df.drop('index', axis=1, inplace=True)
    return words_df

In [104]:
words_df = get_full_dataset(ds)

1>2>3>4>5>6>7>8>9>10>11>12>13>14>15>16>17>18>19>20>21>22>23>24>25>26>27>28>29>30>31>32>33>34>35>36>37>38>39>40>41>42>43>44>45>46>47>48>49>50>51>52>53>54>55>56>57>58>59>60>61>62>63>64>65>66>67>68>69>70>71>72>73>74>75>76>77>78>79>80>81>82>83>84>85>86>87>88>89>90>91>92>93>94>95>96>97>98>99>100>101>102>103>104>105>106>107>108>109>110>111>112>113>114>
*Done*


In [105]:
words_df.describe()

Unnamed: 0,sura_num,aya_in_sura,aya_num,words_in_aya,word_num,word_text
count,77797,77797,77797,77797,77797,77797
unique,114,77,286,73,129,17574
top,2,286,7,11,1,مِنْ
freq,6140,6140,1161,3696,6236,1673


In [106]:
words_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77797 entries, 0 to 77796
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   sura_num      77797 non-null  object
 1   aya_in_sura   77797 non-null  object
 2   aya_num       77797 non-null  object
 3   words_in_aya  77797 non-null  object
 4   word_num      77797 non-null  object
 5   word_text     77797 non-null  object
dtypes: object(6)
memory usage: 3.6+ MB


In [107]:
words_df

Unnamed: 0,sura_num,aya_in_sura,aya_num,words_in_aya,word_num,word_text
0,1,7,1,4,1,بِسْمِ
1,1,7,1,4,2,اللَّهِ
2,1,7,1,4,3,الرَّحْمَنِ
3,1,7,1,4,4,الرَّحِيمِ
4,1,7,2,4,1,الْحَمْدُ
...,...,...,...,...,...,...
77792,114,6,5,5,4,صُدُورِ
77793,114,6,5,5,5,النَّاسِ
77794,114,6,6,3,1,مِنَ
77795,114,6,6,3,2,الْجِنَّةِ


In [110]:
words_df.rename(columns={"word_text":"simple"})

Unnamed: 0,sura_num,aya_in_sura,aya_num,words_in_aya,word_num,simple
0,1,7,1,4,1,بِسْمِ
1,1,7,1,4,2,اللَّهِ
2,1,7,1,4,3,الرَّحْمَنِ
3,1,7,1,4,4,الرَّحِيمِ
4,1,7,2,4,1,الْحَمْدُ
...,...,...,...,...,...,...
77792,114,6,5,5,4,صُدُورِ
77793,114,6,5,5,5,النَّاسِ
77794,114,6,6,3,1,مِنَ
77795,114,6,6,3,2,الْجِنَّةِ


In [120]:
pd.concat([words_df, words_df["word_text"]], axis=1).head()

Unnamed: 0,sura_num,aya_in_sura,aya_num,words_in_aya,word_num,word_text,word_text.1
0,1,7,1,4,1,بِسْمِ,بِسْمِ
1,1,7,1,4,2,اللَّهِ,اللَّهِ
2,1,7,1,4,3,الرَّحْمَنِ,الرَّحْمَنِ
3,1,7,1,4,4,الرَّحِيمِ,الرَّحِيمِ
4,1,7,2,4,1,الْحَمْدُ,الْحَمْدُ


In [126]:
is_basic = True
basic_ds = []
for file_name in quran_text.keys():
    print(file_name)
    ds = get_dataset(file_name)
    words_df = get_full_dataset(ds)
    words_df.rename(columns={"word_text": file_name}, inplace=True)
    if is_basic:
        basic_ds.append(words_df)
        is_basic = False
    else:
        basic_ds.append(words_df[file_name])

simple
1>2>3>4>5>6>7>8>9>10>11>12>13>14>15>16>17>18>19>20>21>22>23>24>25>26>27>28>29>30>31>32>33>34>35>36>37>38>39>40>41>42>43>44>45>46>47>48>49>50>51>52>53>54>55>56>57>58>59>60>61>62>63>64>65>66>67>68>69>70>71>72>73>74>75>76>77>78>79>80>81>82>83>84>85>86>87>88>89>90>91>92>93>94>95>96>97>98>99>100>101>102>103>104>105>106>107>108>109>110>111>112>113>114>
*Done*
enhanced
1>2>3>4>5>6>7>8>9>10>11>12>13>14>15>16>17>18>19>20>21>22>23>24>25>26>27>28>29>30>31>32>33>34>35>36>37>38>39>40>41>42>43>44>45>46>47>48>49>50>51>52>53>54>55>56>57>58>59>60>61>62>63>64>65>66>67>68>69>70>71>72>73>74>75>76>77>78>79>80>81>82>83>84>85>86>87>88>89>90>91>92>93>94>95>96>97>98>99>100>101>102>103>104>105>106>107>108>109>110>111>112>113>114>
*Done*
min
1>2>3>4>5>6>7>8>9>10>11>12>13>14>15>16>17>18>19>20>21>22>23>24>25>26>27>28>29>30>31>32>33>34>35>36>37>38>39>40>41>42>43>44>45>46>47>48>49>50>51>52>53>54>55>56>57>58>59>60>61>62>63>64>65>66>67>68>69>70>71>72>73>74>75>76>77>78>79>80>81>82>83>84>85>86>87>88>89>90>91>92>9

In [127]:
final_dataset = pd.concat(basic_ds, axis=1)

In [129]:
final_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77797 entries, 0 to 77796
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   sura_num      77797 non-null  object
 1   aya_in_sura   77797 non-null  object
 2   aya_num       77797 non-null  object
 3   words_in_aya  77797 non-null  object
 4   word_num      77797 non-null  object
 5   simple        77797 non-null  object
 6   enhanced      77797 non-null  object
 7   min           77797 non-null  object
 8   simple_cln    77797 non-null  object
 9   uthmani       77430 non-null  object
 10  uthmani_min   77430 non-null  object
dtypes: object(11)
memory usage: 6.5+ MB


In [130]:
final_dataset.head(10)

Unnamed: 0,sura_num,aya_in_sura,aya_num,words_in_aya,word_num,simple,enhanced,min,simple_cln,uthmani,uthmani_min
0,1,7,1,4,1,بِسْمِ,بِسْمِ,بِسمِ,بسم,بِسْمِ,بِسمِ
1,1,7,1,4,2,اللَّهِ,اللَّهِ,اللَّهِ,الله,ٱللَّهِ,اللَّهِ
2,1,7,1,4,3,الرَّحْمَنِ,الرَّحْمَنِ,الرَّحمنِ,الرحمن,ٱلرَّحْمَٰنِ,الرَّحمٰنِ
3,1,7,1,4,4,الرَّحِيمِ,الرَّحِيمِ,الرَّحيمِ,الرحيم,ٱلرَّحِيمِ,الرَّحيمِ
4,1,7,2,4,1,الْحَمْدُ,الْحَمْدُ,الحَمدُ,الحمد,ٱلْحَمْدُ,الحَمدُ
5,1,7,2,4,2,لِلَّهِ,لِلَّهِ,لِلَّهِ,لله,لِلَّهِ,لِلَّهِ
6,1,7,2,4,3,رَبِّ,رَبِّ,رَبِّ,رب,رَبِّ,رَبِّ
7,1,7,2,4,4,الْعَالَمِينَ,الْعَالَمِينَ,العالَمينَ,العالمين,ٱلْعَٰلَمِينَ,العٰلَمينَ
8,1,7,3,2,1,الرَّحْمَنِ,الرَّحْمَنِ,الرَّحمنِ,الرحمن,ٱلرَّحْمَٰنِ,الرَّحمٰنِ
9,1,7,3,2,2,الرَّحِيمِ,الرَّحِيمِ,الرَّحيمِ,الرحيم,ٱلرَّحِيمِ,الرَّحيمِ


In [131]:
# All files are 77797 words, while Uthmani files are only 77430 words each!

# Why? Don't know

# Let's separate the uthmani files

In [132]:
final_dataset.drop(['uthmani','uthmani_min'], axis=1).head()

Unnamed: 0,sura_num,aya_in_sura,aya_num,words_in_aya,word_num,simple,enhanced,min,simple_cln
0,1,7,1,4,1,بِسْمِ,بِسْمِ,بِسمِ,بسم
1,1,7,1,4,2,اللَّهِ,اللَّهِ,اللَّهِ,الله
2,1,7,1,4,3,الرَّحْمَنِ,الرَّحْمَنِ,الرَّحمنِ,الرحمن
3,1,7,1,4,4,الرَّحِيمِ,الرَّحِيمِ,الرَّحيمِ,الرحيم
4,1,7,2,4,1,الْحَمْدُ,الْحَمْدُ,الحَمدُ,الحمد


In [133]:
final_dataset.drop(['uthmani','uthmani_min'], axis=1, inplace=True)

In [134]:
is_basic = True
basic_ds = []
for file_name in ['uthmani','uthmani_min']:
    print(file_name)
    ds = get_dataset(file_name)
    words_df = get_full_dataset(ds)
    words_df.rename(columns={"word_text": file_name}, inplace=True)
    if is_basic:
        basic_ds.append(words_df)
        is_basic = False
    else:
        basic_ds.append(words_df[file_name])
        
uthmani_dataset = pd.concat(basic_ds, axis=1)
basic_ds = []
uthmani_dataset.head(10)

uthmani
1>2>3>4>5>6>7>8>9>10>11>12>13>14>15>16>17>18>19>20>21>22>23>24>25>26>27>28>29>30>31>32>33>34>35>36>37>38>39>40>41>42>43>44>45>46>47>48>49>50>51>52>53>54>55>56>57>58>59>60>61>62>63>64>65>66>67>68>69>70>71>72>73>74>75>76>77>78>79>80>81>82>83>84>85>86>87>88>89>90>91>92>93>94>95>96>97>98>99>100>101>102>103>104>105>106>107>108>109>110>111>112>113>114>
*Done*
uthmani_min
1>2>3>4>5>6>7>8>9>10>11>12>13>14>15>16>17>18>19>20>21>22>23>24>25>26>27>28>29>30>31>32>33>34>35>36>37>38>39>40>41>42>43>44>45>46>47>48>49>50>51>52>53>54>55>56>57>58>59>60>61>62>63>64>65>66>67>68>69>70>71>72>73>74>75>76>77>78>79>80>81>82>83>84>85>86>87>88>89>90>91>92>93>94>95>96>97>98>99>100>101>102>103>104>105>106>107>108>109>110>111>112>113>114>
*Done*


Unnamed: 0,sura_num,aya_in_sura,aya_num,words_in_aya,word_num,uthmani,uthmani_min
0,1,7,1,4,1,بِسْمِ,بِسمِ
1,1,7,1,4,2,ٱللَّهِ,اللَّهِ
2,1,7,1,4,3,ٱلرَّحْمَٰنِ,الرَّحمٰنِ
3,1,7,1,4,4,ٱلرَّحِيمِ,الرَّحيمِ
4,1,7,2,4,1,ٱلْحَمْدُ,الحَمدُ
5,1,7,2,4,2,لِلَّهِ,لِلَّهِ
6,1,7,2,4,3,رَبِّ,رَبِّ
7,1,7,2,4,4,ٱلْعَٰلَمِينَ,العٰلَمينَ
8,1,7,3,2,1,ٱلرَّحْمَٰنِ,الرَّحمٰنِ
9,1,7,3,2,2,ٱلرَّحِيمِ,الرَّحيمِ


In [135]:
uthmani_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77430 entries, 0 to 77429
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   sura_num      77430 non-null  object
 1   aya_in_sura   77430 non-null  object
 2   aya_num       77430 non-null  object
 3   words_in_aya  77430 non-null  object
 4   word_num      77430 non-null  object
 5   uthmani       77430 non-null  object
 6   uthmani_min   77430 non-null  object
dtypes: object(7)
memory usage: 4.1+ MB


In [136]:
uthmani_dataset.describe()

Unnamed: 0,sura_num,aya_in_sura,aya_num,words_in_aya,word_num,uthmani,uthmani_min
count,77430,77430,77430,77430,77430,77430,77430
unique,114,77,286,74,128,18994,17943
top,2,286,7,11,1,فِى,مِن
freq,6116,6116,1158,3795,6236,1098,1673


In [139]:
final_dataset.to_csv(os.path.join(text_path, 'Output',
                                  'simple_text.CSV'), encoding='utf-8-sig')
print('SimpleTexts exported successfully')
uthmani_dataset.to_csv(os.path.join(text_path, 'Output',
                                  'uthmani_text.CSV'), encoding='utf-8-sig')
print('UthmaniTexts exported successfully')

SimpleTexts exported successfully
UthmaniTexts exported successfully
