In [47]:
from dataclasses import dataclass
import pandas as pd

@dataclass
class Verse:
    """One scripture verse as handed back by ``ScriptureStorage``.

    Instances are plain value objects: two verses compare equal when all
    three fields match.
    """

    text: str      # verse body, read from the 'scripture_text' column
    citation: str  # human-readable reference, read from the 'verse_title' column
    index: int     # positional row number the verse was fetched from

class ScriptureStorage:
    """Read-only access to scripture verses held in a CSV file.

    The file is loaded once into a pandas DataFrame. Only two columns are
    read by this class: ``scripture_text`` and ``verse_title``.
    """

    def __init__(self, file):
        """Load the scripture table.

        Args:
            file: Anything ``pandas.read_csv`` accepts (path, URL or
                file-like object).
        """
        self.df = pd.read_csv(file)

    def get_verse(self, idx: int) -> Verse:
        """Return the verse stored at positional row ``idx``.

        Args:
            idx: Row position as understood by ``DataFrame.iloc``. NumPy
                integers (e.g. values taken from a search-result array)
                are accepted. Negative values count from the end of the
                table, as with any ``iloc`` lookup.

        Returns:
            A ``Verse`` whose ``index`` is always a built-in ``int``.

        Raises:
            IndexError: If ``idx`` is outside the table.
        """
        # NOTE(review): faiss pads missing neighbours with -1, which iloc
        # resolves to the LAST row — callers should filter those out first;
        # confirm against how search results are fed in.
        row = self.df.iloc[idx]

        # iloc has already validated idx, so int() only normalises integer-like
        # objects such as numpy.int64 to the plain int the dataclass declares.
        return Verse(text=row['scripture_text'],
                     citation=row['verse_title'],
                     index=int(idx))

    def get_verses(self, indices: list[int]) -> list[Verse]:
        """Return the verses at each position in ``indices``, in order.

        Args:
            indices: Iterable of row positions; see ``get_verse``.

        Returns:
            One ``Verse`` per entry of ``indices`` (empty list for empty input).
        """
        return [self.get_verse(idx) for idx in indices]

    def get_all_texts(self) -> list[str]:
        """Return the ``scripture_text`` column as a plain Python list."""
        return self.df['scripture_text'].to_list()

In [None]:
from sentence_transformers import SentenceTransformer

class ScriptureEmbedder:
    """Thin wrapper around a SentenceTransformer model for embedding text.

    Attributes:
        model: The loaded ``SentenceTransformer`` instance.
        model_name: Name the model was loaded with.
        embeddings: Result of the most recent ``embed_texts`` call, or
            ``None`` if it has not been called yet.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-embedding model.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
        """
        print('loading model....')
        self.model = SentenceTransformer(model_name)
        self.model_name = model_name
        self.embeddings = None
        print(f"Model: {self.model_name}\nhas been loaded successfully!")

    def embed_single_text(self, text: str):
        """Embed one string.

        The text is wrapped in a one-element list before encoding, so the
        result is a batch of one (presumably shape ``(1, dim)``) rather
        than a bare vector. Does not touch ``self.embeddings``.

        Args:
            text: The string to embed.

        Returns:
            The encoder output as a NumPy array.
        """
        return self.model.encode([text], convert_to_numpy=True)

    def embed_texts(self, texts: list[str]):
        """Embed a batch of strings and remember the result.

        Args:
            texts: Strings to embed. A bare ``str`` is treated as a batch of
                one so that the result is still two-dimensional.

        Returns:
            The encoder output as a NumPy array; also stored on
            ``self.embeddings``.
        """
        # A bare string would make the encoder return a single 1-D vector;
        # normalise it to a one-element batch, matching embed_single_text.
        if isinstance(texts, str):
            texts = [texts]

        print("embedding texts....")
        self.embeddings = self.model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
        print("embeddings complete!")

        # An empty batch can come back as a 1-D empty array, so the second
        # axis may not exist; report 0 dimensions instead of raising.
        shape = self.embeddings.shape
        n_vectors = shape[0]
        n_dims = shape[1] if len(shape) > 1 else 0
        print(f"Embedding Vectors: {n_vectors}")
        print(f"Dimensions: {n_dims}")

        return self.embeddings


In [94]:
import numpy as np
from faiss import IndexFlatL2

class ScriptureIndex:
    """Exact L2 nearest-neighbour index over a matrix of embeddings.

    Attributes:
        embeddings: The float32 matrix that was added to the index.
        embedding_dimensions: Number of columns of ``embeddings``.
        index: The underlying ``IndexFlatL2`` instance.
    """

    def __init__(self, embeddings: np.ndarray):
        """Build the index and add every row of ``embeddings`` to it.

        Args:
            embeddings: 2-D array-like with one vector per row. It is
                converted to a C-contiguous float32 array (no copy is made
                when it already is one).

        Raises:
            IndexError: If ``embeddings`` has no second axis.
        """
        # faiss works on contiguous float32 data; normalise once up front so
        # float64 or non-contiguous input does not fail inside faiss.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

        self.embeddings = embeddings
        self.embedding_dimensions = embeddings.shape[1]
        print("creating index....")
        self.index = IndexFlatL2(self.embedding_dimensions)
        self.index.add(embeddings)
        print("index created!")
        print(f"Vectors: {self.index.ntotal}\nDimensions: {self.index.d}")

    def search(self, vector, k=2):
        """Find the ``k`` nearest stored vectors for each query vector.

        Args:
            vector: A single query vector (1-D) or a batch of them (2-D);
                lists and non-float32 arrays are accepted.
            k: Number of neighbours to return per query.

        Returns:
            ``(distances, indices)`` exactly as returned by the underlying
            index's ``search``.
        """
        # NOTE(review): faiss is documented to pad with index -1 when fewer
        # than k neighbours exist — callers should be ready for that; confirm.
        vector = np.ascontiguousarray(vector, dtype=np.float32)
        if vector.ndim == 1:
            # Promote a single vector to a batch of one.
            vector = vector.reshape(1, -1)
        distances, indices = self.index.search(vector, k)

        return distances, indices

In [46]:
# Relative path to the scripture CSV; handed to ScriptureStorage below.
FILE_PATH = "data/lds-scriptures.csv"

In [48]:
# Load the scripture CSV and pull every verse text into a plain list.
loader = ScriptureStorage(FILE_PATH)
texts = loader.get_all_texts()

In [58]:
# Load the default model and embed three short sample sentences.
# NOTE(review): the `texts` list loaded from the CSV is not embedded here —
# presumably this is a smoke test while prototyping; confirm before relying on it.
embedder = ScriptureEmbedder()
embeddings = embedder.embed_texts(['wow thats crazy', 'i think youre great', 'wow man calm down'])

loading model....
Model: all-MiniLM-L6-v2
has been loaded successfully!
embedding texts....


Batches: 100%|██████████| 1/1 [00:00<00:00, 40.86it/s]

embeddings complete!
Embedding Vectors: 3
Dimensions: 384





In [100]:
# Build an L2 index over the sample embeddings, then query it with the first
# embedding itself using the default k=2.
index = ScriptureIndex(embeddings)
distances, indices = index.search(embeddings[0])

creating index....
index created!
Vectors: 3
Dimensions: 384
