In [1]:
!pip install sentence-transformers
!pip install faiss-cpu

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [2]:
import json

def combine_json_files(file1, file2, output_file):
    """
    Combine the verse lists of two JSON files into one flat JSON array.

    Each input file must contain a JSON object whose "verses" key holds a
    list of objects. Only the "reference" and "text" fields of each verse are
    kept (a missing field is written as null). Entries from file1 come first,
    followed by those from file2, each in their original order.

    Args:
        file1 (str): Path to the first JSON file.
        file2 (str): Path to the second JSON file.
        output_file (str): Path to the output JSON file.

    Raises:
        ValueError: If an input file does not contain a "verses" list.
    """
    all_entries = []

    for path in (file1, file2):
        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's default text encoding.
        with open(path, 'r', encoding='utf-8') as f:
            data_raw = json.load(f)

        verses = data_raw.get("verses") if isinstance(data_raw, dict) else None
        if not isinstance(verses, list):
            # Fail with a message naming the file instead of an opaque
            # "'NoneType' object is not iterable".
            raise ValueError(f'{path!r} does not contain a "verses" list')

        all_entries.extend(
            {"reference": entry.get("reference"), "text": entry.get("text")}
            for entry in verses
        )

    # The output is only opened after both inputs have been read successfully.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_entries, f, indent=4)

# Merge the two source files (ot.json, nt.json) into bible_verses.json,
# then confirm completion.
combine_json_files(file1='ot.json', file2='nt.json', output_file='bible_verses.json')
print("Files combined successfully!")

Files combined successfully!


In [3]:
from sentence_transformers import SentenceTransformer
import json
import numpy as np

# 1. Load a pre-trained model
# 'paraphrase-MiniLM-L3-v2' is the model actually used here. Queries must be
# embedded with the same model later, otherwise the vectors are not comparable.
model = SentenceTransformer('paraphrase-MiniLM-L3-v2')

# 2. Load your Bible data
# Read explicitly as UTF-8 so loading does not depend on the platform default.
with open('bible_verses.json', 'r', encoding='utf-8') as f:
    bible_data = json.load(f)

# Extract just the text for encoding; the list order defines the row order of
# the embedding matrix, so row i corresponds to bible_data[i].
verse_texts = [item['text'] for item in bible_data]

print(f"Loaded {len(verse_texts)} verses.")

# 3. Generate embeddings
# This is the expensive step. It runs on a GPU if one is available.
# It will take some time (minutes, not hours/days) but you only do it once.
print("Generating embeddings... This may take a few minutes.")
verse_embeddings = model.encode(verse_texts, show_progress_bar=True)

print("Embeddings generated successfully.")
print("Shape of embeddings:", verse_embeddings.shape) # E.g., (31102, 384)

# 4. Save the embeddings to a file for later use
np.save('bible_embeddings.npy', verse_embeddings)

print("Embeddings saved to bible_embeddings.npy")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loaded 31102 verses.
Generating embeddings... This may take a few minutes.


Batches:   0%|          | 0/972 [00:00<?, ?it/s]

Embeddings generated successfully.
Shape of embeddings: (31102, 384)
Embeddings saved to bible_embeddings.npy


In [10]:
# build_index.py
import numpy as np
import faiss

print("Loading embeddings from file...")
verse_embeddings = np.load('bible_embeddings.npy').astype('float32') # FAISS requires float32

# Get the dimension of the vectors (e.g., 384 for MiniLM)
d = verse_embeddings.shape[1]

# Build an approximate, compressed FAISS index (inverted file + product
# quantization). The flat L2 index below is only the coarse quantizer that
# assigns each vector to one of `nlist` inverted lists; searches go through
# the IndexIVFPQ, so results and distances are approximate, not exact.
quantizer = faiss.IndexFlatL2(d)

nlist = 128  # number of inverted lists (coarse clusters)
m = 48       # number of product-quantizer sub-vectors; must divide d evenly
bits = 8     # bits used to encode each sub-vector

if d % m != 0:
    # Fail early with a readable message instead of an error from inside FAISS.
    raise ValueError(f"Embedding dimension {d} is not divisible by m={m}")

index = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits)

# An IVF/PQ index has to learn its clusters and codebooks before vectors can be added.
print("Training the index...")
index.train(verse_embeddings)

print("Adding vectors to the index...")
index.add(verse_embeddings)

print("Quantized index built successfully.")
print(f"Number of vectors in the index: {index.ntotal}")

faiss.write_index(index, "bible_verse_index.faiss")
print("Quantized index saved.")

Loading embeddings from file...
Training the index...
Adding vectors to the index...
Quantized index built successfully.
Number of vectors in the index: 31102
Quantized index saved.


In [39]:
# Restore everything a search needs: the embedding model, the saved FAISS
# index, and the verse records that index positions are looked up in.
model = SentenceTransformer('paraphrase-MiniLM-L3-v2')

index = faiss.read_index("bible_verse_index.faiss")
index.nprobe = 10  # how many inverted lists are scanned per query

with open('bible_verses.json', 'r') as verses_file:
    bible_data = json.load(verses_file)

print("Data loaded successfully.")

Data loaded successfully.


In [40]:
# Embed the free-text query; encode() is given a one-element list so the
# result can be passed to the index as a batch of one.
query_text = "God is love"
query_vector = model.encode([query_text])

In [41]:
# Ask the index for the nearest stored vectors to the query.
# Both returned arrays have one row per query vector.
top_k = 11
distances, indices = index.search(query_vector, top_k)

In [42]:
# Pair every returned row id with its distance and look up the verse record.
# All hits are kept: the query is free text rather than a verse taken from the
# index, so there is no self-match to skip.
results = []
for verse_index, distance in zip(indices[0], distances[0]):
    if verse_index < 0:
        # FAISS pads with -1 when fewer than k neighbours were found; using it
        # as a list index would silently return the last verse.
        continue
    result_verse = bible_data[int(verse_index)]
    results.append({
        "reference": result_verse['reference'],
        "text": result_verse['text'],
        "distance": float(distance) # squared L2 as reported by FAISS (approximate for a quantized index)
    })

In [43]:
# Show each hit as a three-line record followed by a blank separator line.
for hit in results:
    print(
        f"Verse: {hit['reference']}\n"
        f"Text: {hit['text']}\n"
        f"Distance: {hit['distance']}\n"
    )

Verse: 1 John 4:7
Text: Beloved, let us love one another: for love is of God; and every one that loveth is born of God, and knoweth God.
Distance: 17.25106430053711

Verse: 1 John 4:16
Text: And we have known and believed the love that God hath to us. God is love; and he that dwelleth in love dwelleth in God, and God in him.
Distance: 17.91141700744629

Verse: 1 John 4:12
Text: No man hath seen God at any time. If we love one another, God dwelleth in us, and his love is perfected in us.
Distance: 18.72039031982422

Verse: 1 John 4:11
Text: Beloved, if God so loved us, we ought also to love one another.
Distance: 19.435041427612305

Verse: 1 John 5:2
Text: By this we know that we love the children of God, when we love God, and keep his commandments.
Distance: 19.47077178955078

Verse: Ephesians 2:4
Text: But God, who is rich in mercy, for his great love wherewith he loved us,
Distance: 19.562076568603516

Verse: Titus 3:4
Text: But after that the kindness and love of God our Saviour tow