# Logging Results

In [18]:
def log_top_matches(query, model_name, top_indices, similarities, full_psalms, filename="query_results.txt"):
    """
    Write the top psalm matches to a results file (and print them).
    Includes the original query and model information as a heading.
    """
    output_lines = []
    output_lines.append("=" * 80)
    output_lines.append(f"Query: {query}")
    output_lines.append(f"Model: {model_name}")
    output_lines.append("Top 5 matching psalms:")

    line_length = 125

    for rank, idx in enumerate(top_indices, start=1):
        text_type = "Bible" if idx <= 151 else "Psalter"
        num = idx if idx <= 151 else idx - 151
        psalm = full_psalms.iloc[idx]['verse']
        formatted_verse = ""
        # split into lines of length `line_length`
        for i in range(0, len(text), line_length):
            formatted_verse += psalm[i:i+line_length] + "\n"

        line = (
            f"{rank}. {text_type} Psalm {num + 1} - Similarity: {similarities[idx]}%\n"
            f"{formatted_verse}\n"
        )
        output_lines.append(line)

    # Combine into one result block
    result_block = "\n".join(output_lines)

    # Print to console
    print(result_block)

    # Append to a file
    with open(filename, "a", encoding="utf-8") as f:
        f.write(result_block + "\n\n")

# TF-IDF

# GLoVe X TFIDF

In [19]:
import pandas as pd
import numpy as np

psalms = pd.read_csv("data/psalms_with_vectors.txt")

In [20]:
psalms


Unnamed: 0.1,Unnamed: 0,tradition,text,psalm_num,verse,cleaned_verse,glove_tfidf_vec,glove
0,0,Orthodox,Bible,1,Blessed is the man Who walks not in the counse...,blessed man walk counsel ungodly stand way sin...,[ 0.04116335 -0.16281123 0.01578085 0.102900...,[-0.01810831 -0.2523851 0.00547219 0.195684...
1,1,Orthodox,Bible,2,Why do the nations rage And the people meditat...,nation rage people meditate vain thing king ea...,[-0.09802769 -0.09030678 0.08969123 0.121389...,[-0.13752478 -0.16025928 0.08164851 0.207276...
2,2,Orthodox,Bible,3,A psalm by David when he fled from the face of...,psalm david fled face son absalom olord afflic...,[-0.02370331 -0.02989303 -0.06902936 0.134278...,[-0.06051026 -0.02464193 -0.06952135 0.176681...
3,3,Orthodox,Bible,4,For the End in psalms an ode by David You hear...,end psalm ode david heard icalled god righteou...,[-0.04905575 -0.02581999 -0.03994774 0.128105...,[-0.08449795 -0.0195749 0.00881046 0.162734...
4,4,Orthodox,Bible,5,For the End concerning the inheritance a psalm...,end concerning inheritance psalm david give ea...,[ 0.00102828 -0.09200042 -0.07703259 0.243362...,[-0.0342767 -0.13337569 -0.09014621 0.313558...
...,...,...,...,...,...,...,...,...
296,296,Orthodox,Psalter,146,The Lord doth build up Jerusalem; He shall gat...,lord doth build jerusalem ; shall gather toget...,[-8.69649883e-02 2.96077896e-02 2.16656047e-...,[-0.00202334 -0.09045612 0.07749773 0.364389...
297,297,Orthodox,Psalter,147,"Praise the Lord, O Jerusalem; praise thy God, ...","praise lord , jerusalem ; praise thy god , zio...",[-0.07291189 -0.04181115 0.07198007 0.143565...,[-0.00506784 -0.17236896 0.10899492 0.362852...
298,298,Orthodox,Psalter,148,Praise ye the Lord from the heavens; praise Hi...,praise ye lord heaven ; praise highest . prais...,[-0.07403488 -0.20329297 0.0287572 0.182462...,[ 0.06032175 -0.2944392 0.16987029 0.447312...
299,299,Orthodox,Psalter,149,"Sing unto the Lord a new song, His praise is i...","sing unto lord new song , praise congregation ...",[-0.01585722 -0.07310608 -0.04004482 0.236639...,[ 0.0506989 -0.22018342 -0.01928143 0.459914...


In [21]:
glove_vectors = {}

with open("word_embeddings/vectors.txt", "r", encoding="utf-8") as file:
    for line in file:
        parts = line.strip().split()        # split by spaces
        word = parts[0]                     # first part is the word
        vector = [float(x) for x in parts[1:]]  # rest are floats
        glove_vectors[word] = vector


# glove_vectors

In [22]:
embedding_dim = len(next(iter(glove_vectors.values())))  # get embedding dimension

In [23]:
import pickle 
import os
print(os.getcwd())

# getting the pickle file ready to be used
with open("../../data/models/psalms_tfidf_matrix.pickle", "rb") as file:
    tfidf_matrix = pickle.load(file)

/Users/caden/st_david-s-beacon/website/scripts/fall 2025


In [24]:
# Number of documents
N = tfidf_matrix.shape[0]

# Count how many docs contain each word
df = np.sum(tfidf_matrix > 0, axis=0)  # document frequency

# Compute IDF
idf_weights = np.log((N + 1) / (df + 1)) + 1  # standard smoothed IDF

# Map to dict
idf_weights = dict(zip(tfidf_matrix.columns, idf_weights))

In [25]:
def compute_query_embedding(query):
    tokens = query.lower().split()  # lowercase, remove punctuation, tokenize
    
    numerator = np.zeros(embedding_dim)
    denominator = 0

    for word in tokens:
        if word in glove_vectors and word in idf_weights:
            weight = idf_weights[word]  # or full TF-IDF if TF available
            numerator += glove_vectors[word] * weight
            denominator += weight

    if denominator == 0:
        return None

    return numerator / denominator


In [31]:
import numpy as np

def ensure_array(x):
    if isinstance(x, str):
        # string like "[0.04116335 -0.16281123 ...]"
        x = x.strip("[]")  # remove brackets
        numbers = [float(n) for n in x.split()]
        return np.array(numbers, dtype=float)
    elif isinstance(x, list):
        # list -> array
        return np.array(x, dtype=float)
    elif isinstance(x, np.ndarray):
        # already array, do nothing
        return x
    else:
        # fallback to zero vector if something else
        return np.zeros(300)  # replace 300 with your embedding_dim

psalms["glove_tfidf_vec"] = psalms["glove_tfidf_vec"].apply(ensure_array)

# Check
print(type(psalms["glove_tfidf_vec"].iloc[0]))  # <class 'numpy.ndarray'>
print(psalms["glove_tfidf_vec"].iloc[0].shape)


<class 'numpy.ndarray'>
(100,)


In [32]:

# Check
print(type(psalms["glove_tfidf_vec"].iloc[0]))  # should be <class 'numpy.ndarray'>
print(psalms["glove_tfidf_vec"].iloc[0].shape)  # should show (embedding_dim,)



<class 'numpy.ndarray'>
(100,)


In [28]:
print(type(psalms["glove_tfidf_vec"].iloc[0]))  # <class 'numpy.ndarray'>
print(psalms["glove_tfidf_vec"].iloc[0].shape)  # e.g., (300,)


<class 'numpy.ndarray'>
(100,)


In [34]:
import numpy as np

for word in glove_vectors:
    glove_vectors[word] = np.array(glove_vectors[word], dtype=float)


In [91]:
def query_tfidf_glove(query, top_k=6):
    # Compute query embedding
    q_vec = compute_query_embedding(query)
    if q_vec is None:
        return []  # return empty list if no valid words in query

    # Precompute query norm once
    q_norm = np.linalg.norm(q_vec)
    if q_norm == 0:
        return []

    sims = []

    # Iterate over each row in the DataFrame
    for idx, row in psalms.iterrows():
        doc_vec = row['glove_tfidf_vec']
        doc_norm = np.linalg.norm(doc_vec)
        if doc_norm == 0:
            continue  # skip empty embeddings

        # Cosine similarity
        sim = np.dot(q_vec, doc_vec) / (q_norm * doc_norm)
        sims.append((idx, sim))  # store index and similarity

    # Sort by similarity in descending order
    sims.sort(key=lambda x: x[1], reverse=True)

    # Return the top_k indices
    top_indices = [idx for idx, _ in sims[:top_k]]
    
    print(top_indices)

# Example usage:
query_tfidf_glove("For the Peace of the World")


[91, 271, 120, 130, 95, 236]


## Testing 


In [75]:
query = "For the Peace of the World"

query_tfidf_glove(query)

[91, 271, 120, 130, 95, 236]

# BERT & SBERT

## BERT

In [45]:
print(os.getcwd())
output_dir = "data/bert"
psalm_embeddings = []

# Load all saved embeddings
for filename in sorted(os.listdir(output_dir)):
    if filename.endswith(".npy") and "psalm_" in filename:
        emb = np.load(os.path.join(output_dir, filename))
        psalm_embeddings.append(emb)

psalm_embeddings = np.stack(psalm_embeddings)  # shape: (num_psalms, 768)
print("Loaded psalm embeddings:", psalm_embeddings.shape)


/Users/caden/st_david-s-beacon/website/scripts/fall 2025
Loaded psalm embeddings: (301, 768)


In [40]:
# --- Clean Psalm Encoder using BERT ---
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# 1️⃣ Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 2️⃣ Load tokenizer and model (fresh instances)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
bert_model = AutoModel.from_pretrained("bert-base-uncased").to(device)
bert_model.eval()  # evaluation mode

Using device: cpu


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [41]:
def encode_text_bert(text: str) -> np.ndarray:
    """
    Encode a single text string into a 1D numpy array (hidden_size,)
    Uses attention-mask weighted mean to ignore padding.
    """
    # Tokenize
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )
    
    # Move inputs to device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = bert_model(**inputs)  # last_hidden_state: (1, seq_len, hidden)
        hidden = outputs.last_hidden_state
        mask = inputs.get("attention_mask")
        
        if mask is None:
            pooled = hidden.mean(dim=1)
        else:
            mask = mask.unsqueeze(-1)  # (1, seq_len, 1)
            masked_hidden = hidden * mask
            summed = masked_hidden.sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1e-9)
            pooled = summed / counts
    
    return pooled.squeeze(0).cpu().numpy()

In [42]:
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Compute cosine similarity between two 1D numpy arrays."""
    a_norm = a / np.linalg.norm(a)
    b_norm = b / np.linalg.norm(b)
    return float(np.dot(a_norm, b_norm))

In [None]:
def query_bert(query):

    print("Query: ", query)

    if not query:
        print("Empty query. Exiting.")
        return
    
    # 1. Encode query with BERT
    query_emb = encode_text_bert(query)

    # 2. Compute similarities
    similarities = []
    for i, psalm_emb in enumerate(psalm_embeddings):
        sim = cosine_similarity(query_emb, psalm_emb)
        similarities.append((i, round(sim * 100, 2)))  # store index + sim %

    # 3. Sort by similarity descending
    top_results = sorted(similarities, key=lambda x: x[1], reverse=True)[:5]


    print(top_results)

    # 4. Log matches (and write to file)
    ''' log_top_matches(query,
        model_name="BERT",
        top_indices=top_indices,
        similarities=similarities,
        full_psalms=full_psalms,
    )'''

IndentationError: expected an indented block after 'for' statement on line 14 (1268997979.py, line 15)

## Testing

In [74]:
query_bert(query)

Query:  For the Peace of the World
[184  17 199 203 196]


## SBERT

In [53]:
from sentence_transformers import SentenceTransformer

# Use a pretrained SBERT model
sbert_model = SentenceTransformer('all-mpnet-base-v2')  # or any SBERT variant

In [55]:
utput_dir = "data/sbert"
psalm_SBERT_embeddings = []

# Load all saved embeddings
for filename in sorted(os.listdir(output_dir)):
    if filename.endswith(".npy") and "psalm_" in filename:
        emb = np.load(os.path.join(output_dir, filename))
        psalm_SBERT_embeddings.append(emb)

psalm_SBERT_embeddings = np.stack(psalm_SBERT_embeddings)  # shape: (num_psalms, 768)
print("Loaded psalm embeddings:", psalm_SBERT_embeddings.shape)

Loaded psalm embeddings: (301, 768)


In [58]:
def encode_text_SBERT(text):
    return sbert_model.encode(text, convert_to_numpy=True)

In [60]:
def query_sbert(query):
    print("Query: ", query)

    if not query:
        print("Empty query. Exiting.")
        return
    
    query_emb = encode_text_SBERT(query)

    similarities = []

    for psalm_emb in psalm_SBERT_embeddings:
        sim = cosine_similarity(query_emb, psalm_emb)
        similarities.append(round(sim*100, 2))

    
    top_indices = np.argsort(similarities)[-5:][::-1]

    # Checking the Output
    print(top_indices)

## Testing

In [73]:
query_sbert(query)

Query:  For the Peace of the World
[114 191  70  17 193]


# Testing All Algorithms

In [76]:
query = "For the Peace of the World"

query_tfidf_glove(query)

[91, 271, 120, 130, 95, 236]

In [93]:
def query_all(q):
    # GLoVe scaled by TF-IDF

    # GLoVe scaled by TF_IDF 
    query_tfidf_glove(q)

    # BERT
    query_bert(q)
    # SBERT
    query_sbert(q)

query_all("For the Abundance of the Fruits of the Earth")

query = "For the Peace of the World"

print("GLove X TFIDF")

query_tfidf_glove(query)

[117, 110, 94, 270, 95, 98]
Query:  For the Abundance of the Fruits of the Earth
[196 193 184 202 166]
Query:  For the Abundance of the Fruits of the Earth
[280 285 248  10 151]
GLove X TFIDF
[91, 271, 120, 130, 95, 236]


In [None]:
def query_all(q):
    return {
        "glove_tfidf": query_tfidf_glove(q),
        "bert": query_bert(q),
        "sbert": query_sbert(q)
    }

results = query_all("For the Peace of the World")
print(results["glove_tfidf"])


[91, 271, 120, 130, 95, 236]
Query:  For the Peace of the World
[184  17 199 203 196]
Query:  For the Peace of the World
[114 191  70  17 193]
None


In [87]:
query = "For the Peace of the World"

print("GLove X TFIDF")

query_tfidf_glove(query)



GLove X TFIDF


[91, 271, 120, 130, 95, 236]