In [33]:
from nltk.tokenize import punkt
import pandas as pd
import numpy
import sklearn
import re
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dagbo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
bible_df = pd.read_csv(r"bible_data_set.csv")
print(bible_df)

               citation        book  chapter  verse  \
0           Genesis 1:1     Genesis        1      1   
1           Genesis 1:2     Genesis        1      2   
2           Genesis 1:3     Genesis        1      3   
3           Genesis 1:4     Genesis        1      4   
4           Genesis 1:5     Genesis        1      5   
...                 ...         ...      ...    ...   
31097  Revelation 22:17  Revelation       22     17   
31098  Revelation 22:18  Revelation       22     18   
31099  Revelation 22:19  Revelation       22     19   
31100  Revelation 22:20  Revelation       22     20   
31101  Revelation 22:21  Revelation       22     21   

                                                    text  
0      In the beginning God created the heaven and th...  
1      And the earth was without form, and void; and ...  
2      And God said, Let there be light: and there wa...  
3      And God saw the light, that it was good: and G...  
4      And God called the light Day, and the

In [35]:
bible_df.head(5)
bible_df.dtypes

citation    object
book        object
chapter      int64
verse        int64
text        object
dtype: object

In [36]:
#check if the dataset has a null data

missing_data = bible_df[bible_df.isna().any(axis=1)]
print(missing_data)
bible_df.info()
print(bible_df.shape)

Empty DataFrame
Columns: [citation, book, chapter, verse, text]
Index: []
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31102 entries, 0 to 31101
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   citation  31102 non-null  object
 1   book      31102 non-null  object
 2   chapter   31102 non-null  int64 
 3   verse     31102 non-null  int64 
 4   text      31102 non-null  object
dtypes: int64(2), object(3)
memory usage: 1.2+ MB
(31102, 5)


In [37]:
# drop rows with missing data
bible_df.dropna(inplace=True)

# check for and remove duplicates
bible_df.drop_duplicates(inplace=True)

Normalizing the text

In [38]:
# convert all text to lowercase
bible_df["text"] = bible_df["text"].str.lower()

# Remove special characters and numbers
bible_df["text"] = bible_df["text"].apply(lambda x: re.sub(r'[^a-zA-Z]', ' ', x))


tokenize the text

In [39]:
from nltk.tokenize import word_tokenize
# tokenize the text
bible_df["text"] = bible_df["text"].apply(nltk.word_tokenize)

In [40]:
bible_df

Unnamed: 0,citation,book,chapter,verse,text
0,Genesis 1:1,Genesis,1,1,"[in, the, beginning, god, created, the, heaven..."
1,Genesis 1:2,Genesis,1,2,"[and, the, earth, was, without, form, and, voi..."
2,Genesis 1:3,Genesis,1,3,"[and, god, said, let, there, be, light, and, t..."
3,Genesis 1:4,Genesis,1,4,"[and, god, saw, the, light, that, it, was, goo..."
4,Genesis 1:5,Genesis,1,5,"[and, god, called, the, light, day, and, the, ..."
...,...,...,...,...,...
31097,Revelation 22:17,Revelation,22,17,"[and, the, spirit, and, the, bride, say, come,..."
31098,Revelation 22:18,Revelation,22,18,"[for, i, testify, unto, every, man, that, hear..."
31099,Revelation 22:19,Revelation,22,19,"[and, if, any, man, shall, take, away, from, t..."
31100,Revelation 22:20,Revelation,22,20,"[he, which, testifieth, these, things, saith, ..."


Stemming and lemmatizing the dataset

In [41]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
bible_df["text"] = bible_df["text"].apply(lambda x: [stemmer.stem(word) for word in x])
bible_df["text"] = bible_df["text"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [42]:
bible_df

Unnamed: 0,citation,book,chapter,verse,text
0,Genesis 1:1,Genesis,1,1,"[in, the, begin, god, creat, the, heaven, and,..."
1,Genesis 1:2,Genesis,1,2,"[and, the, earth, wa, without, form, and, void..."
2,Genesis 1:3,Genesis,1,3,"[and, god, said, let, there, be, light, and, t..."
3,Genesis 1:4,Genesis,1,4,"[and, god, saw, the, light, that, it, wa, good..."
4,Genesis 1:5,Genesis,1,5,"[and, god, call, the, light, day, and, the, da..."
...,...,...,...,...,...
31097,Revelation 22:17,Revelation,22,17,"[and, the, spirit, and, the, bride, say, come,..."
31098,Revelation 22:18,Revelation,22,18,"[for, i, testifi, unto, everi, man, that, hear..."
31099,Revelation 22:19,Revelation,22,19,"[and, if, ani, man, shall, take, away, from, t..."
31100,Revelation 22:20,Revelation,22,20,"[he, which, testifieth, these, thing, saith, s..."


Encoding categorical variables

In [43]:

# encode categorical variable 'book'
le = LabelEncoder()
bible_df["book"] = le.fit_transform(bible_df["book"])

In [44]:
print(le.classes_)

['1 Chronicles' '1 Corinthians' '1 John' '1 Kings' '1 Peter' '1 Samuel'
 '1 Thessalonians' '1 Timothy' '2 Chronicles' '2 Corinthians' '2 John'
 '2 Kings' '2 Peter' '2 Samuel' '2 Thessalonians' '2 Timothy' '3 John'
 'Acts' 'Amos' 'Colossians' 'Daniel' 'Deuteronomy' 'Ecclesiastes'
 'Ephesians' 'Esther' 'Exodus' 'Ezekiel' 'Ezra' 'Galatians' 'Genesis'
 'Habakkuk' 'Haggai' 'Hebrews' 'Hosea' 'Isaiah' 'James' 'Jeremiah' 'Job'
 'Joel' 'John' 'Jonah' 'Joshua' 'Jude' 'Judges' 'Lamentations' 'Leviticus'
 'Luke' 'Malachi' 'Mark' 'Matthew' 'Micah' 'Nahum' 'Nehemiah' 'Numbers'
 'Obadiah' 'Philemon' 'Philippians' 'Proverbs' 'Psalms' 'Revelation'
 'Romans' 'Ruth' 'Song of Solomon' 'Titus' 'Zechariah' 'Zephaniah']


train the model

In [45]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [46]:
bible_df

Unnamed: 0,citation,book,chapter,verse,text
0,Genesis 1:1,29,1,1,"[in, the, begin, god, creat, the, heaven, and,..."
1,Genesis 1:2,29,1,2,"[and, the, earth, wa, without, form, and, void..."
2,Genesis 1:3,29,1,3,"[and, god, said, let, there, be, light, and, t..."
3,Genesis 1:4,29,1,4,"[and, god, saw, the, light, that, it, wa, good..."
4,Genesis 1:5,29,1,5,"[and, god, call, the, light, day, and, the, da..."
...,...,...,...,...,...
31097,Revelation 22:17,59,22,17,"[and, the, spirit, and, the, bride, say, come,..."
31098,Revelation 22:18,59,22,18,"[for, i, testifi, unto, everi, man, that, hear..."
31099,Revelation 22:19,59,22,19,"[and, if, ani, man, shall, take, away, from, t..."
31100,Revelation 22:20,59,22,20,"[he, which, testifieth, these, thing, saith, s..."


In [47]:
# Load the dataset

train_df, val_df = train_test_split(bible_df, test_size=0.2)

In [48]:
# Load the pre-trained weights of the Bert model and create the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [51]:
# Convert the data to a list of dictionaries
train_data = []
for i in range(train_df.shape[0]):
    input_ids = tokenizer.encode(train_df.iloc[i]["text"], return_tensors="pt").squeeze()
    attention_mask = (input_ids > 0).long()
    token_type_ids = torch.zeros_like(input_ids)
    label = train_df.iloc[i]["text"]
    train_data.append(
        {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids, "label": label}
    )

In [54]:
val_data = []
for i in range(val_df.shape[0]):
    input_ids = tokenizer.encode(val_df.iloc[i]["text"], return_tensors="pt").squeeze()
    attention_mask = (input_ids > 0).long()
    token_type_ids = torch.zeros_like(input_ids)
    label = val_df.iloc[i]["book"]
    val_data.append(
        {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": token_type_ids, "label": label}
    )

In [60]:
# Create a DataLoader for the train and validation sets
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=32)
val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=32)

In [63]:
from transformers import DataCollatorForLanguageModeling
# Convert the data to the correct format
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, mlm_probability=0.15
)
train_dataset = data_collator.convert_to_dataset(train_df)
val_dataset = data_collator.convert_to_dataset(val_df)

AttributeError: 'DataCollatorForLanguageModeling' object has no attribute 'convert_to_dataset'

In [62]:
from transformers import Trainer, TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy = "steps",
    eval_steps = 100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Create the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data
)

# Start training
trainer.train()


***** Running training *****
  Num examples = 24881
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2334
  Number of trainable parameters = 109483778


ValueError: too many dimensions 'str'