In [50]:
from nltk.tokenize import punkt
import pandas as pd
import numpy
import sklearn
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dagbo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [51]:
bible_df = pd.read_csv(r"bible_data_set.csv")
print(bible_df)

               citation        book  chapter  verse  \
0           Genesis 1:1     Genesis        1      1   
1           Genesis 1:2     Genesis        1      2   
2           Genesis 1:3     Genesis        1      3   
3           Genesis 1:4     Genesis        1      4   
4           Genesis 1:5     Genesis        1      5   
...                 ...         ...      ...    ...   
31097  Revelation 22:17  Revelation       22     17   
31098  Revelation 22:18  Revelation       22     18   
31099  Revelation 22:19  Revelation       22     19   
31100  Revelation 22:20  Revelation       22     20   
31101  Revelation 22:21  Revelation       22     21   

                                                    text  
0      In the beginning God created the heaven and th...  
1      And the earth was without form, and void; and ...  
2      And God said, Let there be light: and there wa...  
3      And God saw the light, that it was good: and G...  
4      And God called the light Day, and the

In [52]:
bible_df.head(5)
bible_df.dtypes

citation    object
book        object
chapter      int64
verse        int64
text        object
dtype: object

In [53]:
#check if the dataset has a null data

missing_data = bible_df[bible_df.isna().any(axis=1)]
print(missing_data)
bible_df.info()
print(bible_df.shape)

Empty DataFrame
Columns: [citation, book, chapter, verse, text]
Index: []
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31102 entries, 0 to 31101
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   citation  31102 non-null  object
 1   book      31102 non-null  object
 2   chapter   31102 non-null  int64 
 3   verse     31102 non-null  int64 
 4   text      31102 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.2+ MB
(31102, 5)


In [54]:
# drop rows with missing data
bible_df.dropna(inplace=True)

# check for and remove duplicates
bible_df.drop_duplicates(inplace=True)

Normalizing the text

In [55]:
# convert all text to lowercase
bible_df["text"] = bible_df["text"].str.lower()

# remove special characters and numbers
bible_df["text"] = bible_df["text"].str.replace('[^a-zA-Z]', ' ')

  bible_df["text"] = bible_df["text"].str.replace('[^a-zA-Z]', ' ')


tokenize the text

In [58]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# tokenize the text
bible_df["text"] = bible_df["text"].apply(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dagbo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [59]:
bible_df

Unnamed: 0,citation,book,chapter,verse,text
0,Genesis 1:1,Genesis,1,1,"[in, the, beginning, god, created, the, heaven..."
1,Genesis 1:2,Genesis,1,2,"[and, the, earth, was, without, form, and, voi..."
2,Genesis 1:3,Genesis,1,3,"[and, god, said, let, there, be, light, and, t..."
3,Genesis 1:4,Genesis,1,4,"[and, god, saw, the, light, that, it, was, goo..."
4,Genesis 1:5,Genesis,1,5,"[and, god, called, the, light, day, and, the, ..."
...,...,...,...,...,...
31097,Revelation 22:17,Revelation,22,17,"[and, the, spirit, and, the, bride, say, come,..."
31098,Revelation 22:18,Revelation,22,18,"[for, i, testify, unto, every, man, that, hear..."
31099,Revelation 22:19,Revelation,22,19,"[and, if, any, man, shall, take, away, from, t..."
31100,Revelation 22:20,Revelation,22,20,"[he, which, testifieth, these, things, saith, ..."


Stemming and lemmatizing the dataset

In [62]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
bible_df["text"] = bible_df["text"].apply(lambda x: [stemmer.stem(word) for word in x])
bible_df["text"] = bible_df["text"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dagbo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dagbo\AppData\Roaming\nltk_data...


In [63]:
bible_df

Unnamed: 0,citation,book,chapter,verse,text
0,Genesis 1:1,Genesis,1,1,"[in, the, begin, god, creat, the, heaven, and,..."
1,Genesis 1:2,Genesis,1,2,"[and, the, earth, wa, without, form, and, void..."
2,Genesis 1:3,Genesis,1,3,"[and, god, said, let, there, be, light, and, t..."
3,Genesis 1:4,Genesis,1,4,"[and, god, saw, the, light, that, it, wa, good..."
4,Genesis 1:5,Genesis,1,5,"[and, god, call, the, light, day, and, the, da..."
...,...,...,...,...,...
31097,Revelation 22:17,Revelation,22,17,"[and, the, spirit, and, the, bride, say, come,..."
31098,Revelation 22:18,Revelation,22,18,"[for, i, testifi, unto, everi, man, that, hear..."
31099,Revelation 22:19,Revelation,22,19,"[and, if, ani, man, shall, take, away, from, t..."
31100,Revelation 22:20,Revelation,22,20,"[he, which, testifieth, these, thing, saith, s..."


Encoding categorical variables

In [64]:
from sklearn.preprocessing import LabelEncoder
# encode categorical variable 'book'
le = LabelEncoder()
bible_df["book"] = le.fit_transform(bible_df["book"])

In [65]:
bible_df

Unnamed: 0,citation,book,chapter,verse,text
0,Genesis 1:1,29,1,1,"[in, the, begin, god, creat, the, heaven, and,..."
1,Genesis 1:2,29,1,2,"[and, the, earth, wa, without, form, and, void..."
2,Genesis 1:3,29,1,3,"[and, god, said, let, there, be, light, and, t..."
3,Genesis 1:4,29,1,4,"[and, god, saw, the, light, that, it, wa, good..."
4,Genesis 1:5,29,1,5,"[and, god, call, the, light, day, and, the, da..."
...,...,...,...,...,...
31097,Revelation 22:17,59,22,17,"[and, the, spirit, and, the, bride, say, come,..."
31098,Revelation 22:18,59,22,18,"[for, i, testifi, unto, everi, man, that, hear..."
31099,Revelation 22:19,59,22,19,"[and, if, ani, man, shall, take, away, from, t..."
31100,Revelation 22:20,59,22,20,"[he, which, testifieth, these, thing, saith, s..."
