In [12]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset
file_path = "C:\\Users\\USER\\Downloads\\bible_passage_finder\\BIble passage data.csv"
data = pd.read_csv(file_path)

# Extract relevant columns
data = data[['BIBLE TEXT', 'VERSE']] # Keep only non-null rows in both columns
texts = data['BIBLE TEXT'].astype(str).values
labels = data['VERSE'].astype(str).values


# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
max_seq_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split( padded_sequences, encoded_labels, test_size=0.2, random_state=42)

# Define the RNN model
model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=128, input_length=max_seq_length),
    SimpleRNN(128, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')  # Output layer
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

def predict_verse(input_text, model, tokenizer, label_encoder, max_seq_length):
    # Preprocess the input text
    preprocessed_text = preprocess_text(input_text, tokenizer, max_seq_length)
    
    # Debug: Check if preprocessed text is empty
    if preprocessed_text.sum() == 0:
        return "Error: Input contains only out-of-vocabulary words!"

    # Predict the label
    prediction = model.predict(preprocessed_text)
    predicted_class = prediction.argmax(axis=1)[0]

    # Debug: Check if predicted class is within range
    if predicted_class >= len(label_encoder.classes_):
        return "Error: Predicted class is out of range!"
    
    # Decode the predicted label to get the verse reference
    predicted_verse = label_encoder.inverse_transform([predicted_class])[0]
    return predicted_verse
input_text = ''

# Predict the verse reference
predicted_verse = predict_verse(input_text, model, tokenizer, label_encoder, max_seq_length)
print(f"Predicted Verse Reference: {predicted_verse}")



Epoch 1/10




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 214ms/step - accuracy: 0.0309 - loss: 3.3291 - val_accuracy: 0.1600 - val_loss: 3.1305
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - accuracy: 0.2535 - loss: 2.9882 - val_accuracy: 0.8400 - val_loss: 2.4652
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.6107 - loss: 2.4199 - val_accuracy: 0.8400 - val_loss: 1.4479
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.7622 - loss: 1.6368 - val_accuracy: 0.8400 - val_loss: 1.0404
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.7924 - loss: 1.2023 - val_accuracy: 0.8400 - val_loss: 1.1080
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 0.8205 - loss: 1.1891 - val_accuracy: 0.8400 - val_loss: 1.1516
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [9]:
# Define a function for preprocessing
def preprocess_text(input_text, tokenizer, max_seq_length):
    # Tokenize and pad the input text
    sequence = tokenizer.texts_to_sequences([input_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_seq_length)
    return padded_sequence

# Define a function for prediction
def predict_verse(input_text, model, tokenizer, label_encoder, max_seq_length):
    # Preprocess the input text
    preprocessed_text = preprocess_text(input_text, tokenizer, max_seq_length)
    
    # Predict the label
    prediction = model.predict(preprocessed_text)
    predicted_class = prediction.argmax(axis=1)[0]
    
    # Decode the predicted label to get the verse reference
    predicted_verse = label_encoder.inverse_transform([predicted_class])[0]
    return predicted_verse

# Example input text
input_text = 'habakkuk chapter 2 from verse 1 to 3. [Music] i will stand upon my watch and set me upon the tower and we watch to see what he will say unto me and what i shall answer when i am reproved and the lord answered me i say right division and make it plain upon tables that he may wrong that readeth it for division is yet for an appointed time but at the end it shall speak and not lie do italy wait for it because it will surely come it will not tarry [Applause] all the promises of god for you will come to pass tonight'

# Predict the verse reference
predicted_verse = predict_verse(input_text, model, tokenizer, label_encoder, max_seq_length)
print(f"Predicted Verse Reference: {predicted_verse}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 225ms/step
Predicted Verse Reference: nan
