# Logging Results

In [10]:
def log_top_matches(query, model_name, top_indices, similarities, full_psalms, filename="query_results.txt"):
    """
    Print the top psalm matches and append them to a results file.

    Parameters
    ----------
    query : str
        The original query, echoed in the heading.
    model_name : str
        Label of the model that produced the ranking, echoed in the heading.
    top_indices : iterable of int
        Positional row indices into `full_psalms`, in the order to report.
    similarities : indexable
        Scores looked up as `similarities[idx]` for every reported index.
    full_psalms : pandas.DataFrame
        Rows are read with `.iloc[idx]`; a 'verse' column is required.
    filename : str
        File the formatted block is appended to (UTF-8).
    """
    top_indices = list(top_indices)  # allow any iterable and make len() available
    line_length = 125  # wrap width for the verse text

    output_lines = [
        "=" * 80,
        f"Query: {query}",
        f"Model: {model_name}",
        # Report the real number of matches instead of a hard-coded 5.
        f"Top {len(top_indices)} matching psalms:",
    ]

    for rank, idx in enumerate(top_indices, start=1):
        # NOTE(review): assumes rows 0..151 are the Bible psalms and later rows
        # the Psalter -- confirm the boundary (<= 151 vs < 151) against the data.
        text_type = "Bible" if idx <= 151 else "Psalter"
        num = idx if idx <= 151 else idx - 151
        psalm = full_psalms.iloc[idx]['verse']

        # Split the verse into chunks of `line_length` characters. The original
        # iterated over len(text), an undefined name, and raised NameError.
        formatted_verse = "".join(
            psalm[i:i + line_length] + "\n"
            for i in range(0, len(psalm), line_length)
        )

        line = (
            f"{rank}. {text_type} Psalm {num + 1} - Similarity: {similarities[idx]}%\n"
            f"{formatted_verse}\n"
        )
        output_lines.append(line)

    # Combine into one result block
    result_block = "\n".join(output_lines)

    # Print to console
    print(result_block)

    # Append to a file
    with open(filename, "a", encoding="utf-8") as f:
        f.write(result_block + "\n\n")

# GLoVe X TFIDF

In [11]:
import pandas as pd
import numpy as np

# Load the psalm table that later cells index through its "glove_tfidf_vec" column.
# NOTE(review): the path is relative to the kernel's working directory; the recorded
# output of this cell is a FileNotFoundError, so confirm where the file actually lives.
psalms = pd.read_csv("data/psalms_with_vectors.txt")

FileNotFoundError: [Errno 2] No such file or directory: 'data/psalms_with_vectors.txt'

In [None]:
# Display the loaded frame (notebook rich repr) as a quick sanity check.
psalms


In [None]:
import numpy as np  # kept local to the cell so it runs on its own

# Map each word to its embedding, read from a whitespace-separated text file whose
# lines look like "<word> <float> <float> ...".
glove_vectors = {}

with open("word_embeddings/vectors.txt", "r", encoding="utf-8") as file:
    for line in file:
        parts = line.split()                # split on any whitespace
        if not parts:
            continue                        # skip blank lines instead of failing on parts[0]
        word = parts[0]                     # first part is the word
        # Store a float array (not a list) so the vector can be scaled directly.
        glove_vectors[word] = np.array([float(x) for x in parts[1:]], dtype=float)


# glove_vectors

In [None]:
# Length of the first stored vector; used as the size of the zero vector
# that compute_query_embedding() accumulates into.
embedding_dim = len(next(iter(glove_vectors.values())))  # get embedding dimension

In [None]:
import pickle 
import os
# Show the working directory, since the path below is relative to it.
print(os.getcwd())

# getting the pickle file ready to be used
# SECURITY: pickle.load can execute arbitrary code; only load files you produced yourself.
# NOTE(review): a later cell loads a file of the same name from "../data" instead of
# "../../data/models" -- confirm which location is correct.
with open("../../data/models/psalms_tfidf_matrix.pickle", "rb") as file:
    tfidf_matrix = pickle.load(file)

In [None]:
# Number of documents
# Smoothed inverse document frequency for every column of `tfidf_matrix`.
# Presumably rows are documents and columns are vocabulary terms (the code reads
# `.shape[0]` as the document count and `.columns` as the terms) -- TODO confirm.
N = tfidf_matrix.shape[0]

# Document frequency: in how many rows each column has a positive weight.
df = (tfidf_matrix > 0).sum(axis=0)

# idf = ln((N + 1) / (df + 1)) + 1, stored as a {column: weight} dict.
idf_weights = dict(
    zip(tfidf_matrix.columns, np.log((N + 1) / (df + 1)) + 1)
)

In [None]:
def compute_query_embedding(query):
    """
    Build an IDF-weighted average of the GloVe vectors of the query's words.

    The query is lower-cased, split on whitespace, and each token has leading and
    trailing punctuation stripped, so that "mercy," or "uncertainty?" still match
    their vocabulary entries. Only tokens present in both the module-level
    `glove_vectors` and `idf_weights` contribute.

    Returns
    -------
    numpy.ndarray of length `embedding_dim`, or None when no token contributes.
    """
    import string  # local import: the notebook has no single import cell

    # ASCII punctuation plus the curly quotes that appear in the quoted hymns.
    strip_chars = string.punctuation + "“”‘’"

    numerator = np.zeros(embedding_dim)
    denominator = 0.0

    for raw_token in query.lower().split():
        word = raw_token.strip(strip_chars)
        if word in glove_vectors and word in idf_weights:
            weight = idf_weights[word]  # or full TF-IDF if TF available
            # asarray makes scaling work even if the vector is stored as a list.
            numerator += np.asarray(glove_vectors[word], dtype=float) * weight
            denominator += weight

    if denominator == 0:
        return None

    return numerator / denominator


In [None]:
import numpy as np

def ensure_array(x, fallback_dim=300):
    """
    Coerce a stored embedding cell into a 1-D float NumPy array.

    Parameters
    ----------
    x : str | list | tuple | numpy.ndarray | other
        - ndarray: returned unchanged.
        - str: a bracketed list of numbers such as "[0.04 -0.16 ...]" or
          "[0.04, -0.16, ...]" (space- and/or comma-separated).
        - list / tuple: converted to a float array.
        - anything else (e.g. a missing value): replaced by a zero vector.
    fallback_dim : int
        Length of the zero vector used for unsupported values. Defaults to 300
        for backward compatibility; pass your embedding dimension if it differs.
    """
    if isinstance(x, np.ndarray):
        # already array, do nothing
        return x
    if isinstance(x, str):
        # Tolerate surrounding whitespace and comma separators as well as spaces.
        cleaned = x.strip().strip("[]").replace(",", " ")
        return np.array([float(n) for n in cleaned.split()], dtype=float)
    if isinstance(x, (list, tuple)):
        return np.array(x, dtype=float)
    # fallback to zero vector if something else
    return np.zeros(fallback_dim)

# Replace the column in place with parsed arrays. Re-running is harmless because
# ensure_array() returns existing ndarrays unchanged.
psalms["glove_tfidf_vec"] = psalms["glove_tfidf_vec"].apply(ensure_array)

# Check: type and shape of the first parsed vector
print(type(psalms["glove_tfidf_vec"].iloc[0]))  # <class 'numpy.ndarray'>
print(psalms["glove_tfidf_vec"].iloc[0].shape)


In [None]:

# Check (NOTE(review): repeats the check at the end of the previous cell)
print(type(psalms["glove_tfidf_vec"].iloc[0]))  # should be <class 'numpy.ndarray'>
print(psalms["glove_tfidf_vec"].iloc[0].shape)  # should show (embedding_dim,)



In [None]:
# NOTE(review): third copy of the same type/shape check; candidates for removal.
print(type(psalms["glove_tfidf_vec"].iloc[0]))  # <class 'numpy.ndarray'>
print(psalms["glove_tfidf_vec"].iloc[0].shape)  # e.g., (300,)


In [None]:
import numpy as np

# Re-store every GloVe vector as a float ndarray. Only existing keys are
# reassigned, so the dict does not change size while it is being iterated.
for word, vector in glove_vectors.items():
    glove_vectors[word] = np.array(vector, dtype=float)


In [None]:
def query_tfidf_glove(query, top_k=6):
    """
    Rank the rows of the module-level `psalms` frame against `query`.

    The query is embedded with compute_query_embedding() and compared by cosine
    similarity with each row's 'glove_tfidf_vec'. Rows whose vector has zero norm
    are skipped.

    Returns
    -------
    list of ("TFIDF_GLoVe", row_label, similarity_percent) tuples, best first,
    at most `top_k` long; an empty list when the query has no usable embedding.
    """
    query_vec = compute_query_embedding(query)
    if query_vec is None:
        return []  # no query word was found in the vocabularies

    # The query norm is constant across rows, so compute it once.
    query_norm = np.linalg.norm(query_vec)
    if query_norm == 0:
        return []

    scored = []
    for row_label, psalm_vec in zip(psalms.index, psalms['glove_tfidf_vec']):
        psalm_norm = np.linalg.norm(psalm_vec)
        if psalm_norm == 0:
            continue  # empty embedding: cosine is undefined
        cosine = np.dot(query_vec, psalm_vec) / (query_norm * psalm_norm)
        scored.append((row_label, round(cosine * 100, 2)))

    # Highest percentage first; ties keep their original row order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    return [("TFIDF_GLoVe", row_label, score) for row_label, score in ranked[:top_k]]


## Testing 


In [None]:
# Module-level test query; later cells (the BERT/SBERT tests and a plot title) reuse it.
query = "For the Peace of the World"

(query_tfidf_glove(query))

# BERT & SBERT

## BERT

In [None]:
# Load the saved per-psalm BERT embeddings into one stacked array.
print(os.getcwd())
output_dir = "data/bert"
psalm_embeddings = []

# Load all saved embeddings
# NOTE(review): sorted() is lexicographic, so unpadded names would order
# "psalm_10" before "psalm_2". Later cells map row i of this array to
# psalms.iloc[i] -- confirm the file names sort into the same order as the frame.
for filename in sorted(os.listdir(output_dir)):
    if filename.endswith(".npy") and "psalm_" in filename:
        emb = np.load(os.path.join(output_dir, filename))
        psalm_embeddings.append(emb)

psalm_embeddings = np.stack(psalm_embeddings)  # shape: (num_psalms, 768)
print("Loaded psalm embeddings:", psalm_embeddings.shape)


In [None]:
# --- Clean Psalm Encoder using BERT ---
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# 1️⃣ Set device
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 2. Load tokenizer and model (fresh instances) and move the model to `device`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
bert_model = AutoModel.from_pretrained("bert-base-uncased").to(device)
bert_model.eval()  # evaluation mode

In [None]:
def encode_text_bert(text: str) -> np.ndarray:
    """
    Embed one string with the module-level `tokenizer` and `bert_model`.

    The token states of the last hidden layer are averaged, weighting each
    position by the attention mask so masked positions do not contribute.
    Input is truncated to 512 tokens.

    Returns a NumPy array with the batch dimension squeezed out.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )

    # Put every input tensor on the same device as the model.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        token_states = bert_model(**encoded).last_hidden_state
        attention = encoded.get("attention_mask")

        if attention is not None:
            weights = attention.unsqueeze(-1)  # add a trailing axis to broadcast over hidden size
            token_total = (token_states * weights).sum(dim=1)
            token_count = weights.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
            sentence_vec = token_total / token_count
        else:
            # No mask available: plain mean over the sequence axis.
            sentence_vec = token_states.mean(dim=1)

    return sentence_vec.squeeze(0).cpu().numpy()

In [None]:
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """
    Compute cosine similarity between two 1D numpy arrays.

    Returns 0.0 when either vector has zero norm: the cosine is undefined there,
    and returning NaN would corrupt any ranking that sorts on the result.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return float(np.dot(a, b) / denom)

In [None]:
def query_bert(query, top_k=5):
    """
    Rank the rows of the module-level `psalm_embeddings` against `query` using BERT.

    The query is embedded with encode_text_bert() and compared to every stored
    embedding with cosine_similarity().

    Returns
    -------
    list of ("BERT", row_index, similarity_percent) tuples, best first, at most
    `top_k` long. An empty query yields an empty list (not None), so callers
    can always iterate over the result.
    """
    print("BERT Query: ", query)

    if not query:
        print("Empty query. Exiting.")
        return []

    # 1. Encode query with BERT
    query_emb = encode_text_bert(query)

    # 2. Compute similarities as (row index, percentage rounded to 2 decimals)
    similarities = [
        (i, round(cosine_similarity(query_emb, psalm_emb) * 100, 2))
        for i, psalm_emb in enumerate(psalm_embeddings)
    ]

    # 3. Sort by similarity descending and keep the best top_k
    sims = sorted(similarities, key=lambda x: x[1], reverse=True)[:top_k]

    return [("BERT", idx, sim) for idx, sim in sims]

## Testing

In [None]:
# Run the BERT ranking on the module-level test query and display the result.
(query_bert(query))

## SBERT

In [None]:
from sentence_transformers import SentenceTransformer

# Use a pretrained SBERT model. The stored psalm embeddings must have been
# produced by this same model for query/psalm similarities to be meaningful.
sbert_model = SentenceTransformer('all-mpnet-base-v2')  # or any SBERT variant

In [None]:
# Load the saved per-psalm SBERT embeddings into one stacked array.
# The original assigned to `utput_dir` (typo), so `output_dir` still pointed at
# "data/bert" and the BERT embeddings were loaded here by mistake.
output_dir = "data/sbert"
psalm_SBERT_embeddings = []

# Load all saved embeddings
# NOTE(review): sorted() is lexicographic; confirm the file names sort into the
# same order as the rows of the psalms frame.
for filename in sorted(os.listdir(output_dir)):
    if filename.endswith(".npy") and "psalm_" in filename:
        emb = np.load(os.path.join(output_dir, filename))
        psalm_SBERT_embeddings.append(emb)

psalm_SBERT_embeddings = np.stack(psalm_SBERT_embeddings)  # shape: (num_psalms, 768)
print("Loaded psalm embeddings:", psalm_SBERT_embeddings.shape)

In [None]:
def encode_text_SBERT(text):
    """Encode `text` with the module-level `sbert_model`, requesting NumPy output."""
    embedding = sbert_model.encode(text, convert_to_numpy=True)
    return embedding

In [None]:
def query_sbert(query, top_k=5):
    """
    Rank the rows of the module-level `psalm_SBERT_embeddings` against `query`.

    Returns
    -------
    list of ("SBERT", row_index, similarity_percent) tuples, best first, at most
    `top_k` long; an empty list for an empty query.
    """
    print("Query: ", query)

    if not query:
        print("Empty query. Exiting.")
        return []

    query_emb = encode_text_SBERT(query)

    similarities = [
        (i, round(cosine_similarity(query_emb, psalm_emb) * 100, 2))
        for i, psalm_emb in enumerate(psalm_SBERT_embeddings)
    ]

    # Sort on the similarity (element 1). Sorting the bare tuples orders them by
    # row index, which returned the last rows instead of the best matches.
    sims = sorted(similarities, key=lambda pair: pair[1], reverse=True)[:top_k]

    return [("SBERT", idx, sim) for idx, sim in sims]

## Testing

In [None]:
# Run the SBERT ranking on the module-level test query and print the result.
print(query_sbert(query))

#### Comparing Embedding sizes

I am getting very low similarity percentages for `SBERT` results compared to `BERT` results. There are a few things that may be contributing to this. One of them may have to do with the different dimensions of the embeddings between `BERT` & `SBERT`. Let's look at that first. 

In [None]:
# Compare the shapes of the two stacked embedding arrays.
# NOTE(review): identical shapes do not show the arrays came from different
# models -- confirm each was loaded from its own directory.
print(f"BERT Embeddings: {psalm_embeddings.shape}")
print(f"SBERT Embeddings: {psalm_SBERT_embeddings.shape}")

The dimensions for both are the same, so the problem seems to lie within the `SBERT` algorithm itself. Let's look at the `SBERT` embeddings more closely and see what is happening. We are going to look at the embeddings right before the cosine similarity is calculated and see if there are any negative values within them. 

In [None]:
def query_sbert(query, top_k=5):
    """
    Debug variant of the SBERT ranking: prints the query embedding next to every
    psalm embedding before scoring, then prints the top results.

    Prints only; returns None.
    """
    print("Query: ", query)

    if not query:
        print("Empty query. Exiting.")
        return

    query_emb = encode_text_SBERT(query)

    similarities = []

    for i, psalm_emb in enumerate(psalm_SBERT_embeddings):
        print(f"Query: {query_emb} \n Emedding: {psalm_emb}")
        sim = cosine_similarity(query_emb, psalm_emb)
        similarities.append((i, round(sim*100, 2)))

    # Sort on the similarity (element 1). Sorting the bare tuples orders them by
    # row index, which reported the last rows instead of the best matches.
    sims = sorted(similarities, key=lambda pair: pair[1], reverse=True)[:top_k]

    top_indices = [("SBERT", idx, sim) for idx, sim in sims]

    # Checking the Output
    print(top_indices)

In [None]:
# query_sbert(query)

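It can be seen that there are negative values within the embeddings, and the hypothesis here is that they are interfering with the score and making the similarities seem lower than they actually are. We can write a simple function to shift all of the embeddings to be strictly **non-negative** and see whether that changes the balance of the similarity scores. (Caveat: cosine similarity is well defined for vectors with negative components, so shifting changes the metric itself rather than correcting it. The shifted scores should be interpreted with caution.) 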

In [None]:
def shift_embedding(embedding):
    """
    Subtract the array's minimum from every element so the smallest value is 0.

    The minimum is taken over the whole array, whatever its dimensionality, so
    a 2-D stack of vectors is shifted by one shared constant.
    """
    return embedding - embedding.min()

We can now apply this to the embeddings themselves and then look at the similarities again and see if this affected the similarities. 

In [None]:
# Overwrites the stacked array with its shifted version; the unshifted
# embeddings are no longer available after this cell runs.
# NOTE(review): the whole stack is shifted by its global minimum, whereas a
# query vector is shifted by its own minimum, so the two offsets differ.
psalm_SBERT_embeddings = shift_embedding(psalm_SBERT_embeddings)

In [None]:
def query_sbert(query, top_k=5):
    """
    Debug variant of the SBERT ranking that shifts the query embedding with
    shift_embedding() before scoring and prints every compared pair.

    Prints only; returns None.
    """
    print("Query: ", query)

    if not query:
        print("Empty query. Exiting.")
        return

    query_emb = encode_text_SBERT(query)

    similarities = []

    # Shifting the query embedding (the psalm embeddings are shifted outside this function)
    query_emb = shift_embedding(query_emb)

    for i, psalm_emb in enumerate(psalm_SBERT_embeddings):
        print(f"Query: {query_emb} \n Emedding: {psalm_emb}")
        sim = cosine_similarity(query_emb, psalm_emb)
        similarities.append((i, round(sim*100, 2)))

    # Sort on the similarity (element 1). Sorting the bare tuples orders them by
    # row index, which reported the last rows instead of the best matches.
    sims = sorted(similarities, key=lambda pair: pair[1], reverse=True)[:top_k]

    top_indices = [("SBERT", idx, sim) for idx, sim in sims]

    # Checking the Output
    print(top_indices)

Let's run the same query again and see if there are different results given. 

In [None]:
#query_sbert(query)

All of our embeddings are now strictly non-negative. Let's run the query now. 

In [None]:
def query_sbert(query, top_k=5):
    """
    SBERT ranking with the query embedding shifted by shift_embedding().

    Returns
    -------
    list of ("SBERT", row_index, similarity_percent) tuples, best first, at most
    `top_k` long; an empty list for an empty query.
    """
    print("Query: ", query)

    if not query:
        print("Empty query. Exiting.")
        return []

    query_emb = encode_text_SBERT(query)

    # Shifting the query embedding (the psalm embeddings are shifted outside this function)
    query_emb = shift_embedding(query_emb)

    similarities = [
        (i, round(cosine_similarity(query_emb, psalm_emb) * 100, 2))
        for i, psalm_emb in enumerate(psalm_SBERT_embeddings)
    ]

    # Sort on the similarity (element 1). Sorting the bare tuples orders them by
    # row index, which returned the last rows instead of the best matches.
    sims = sorted(similarities, key=lambda pair: pair[1], reverse=True)[:top_k]

    return [("SBERT", idx, sim) for idx, sim in sims]

# Run the shifted SBERT ranking on the module-level test query.
query_sbert(query)

With the embeddings shifted, we are now getting really high similarities, which could be a good thing. Let's now update the `cosine_similarity()` function to handle the shifting, since the algorithms use it to compare and gather results. We are going to take the shifting out of the SBERT algorithm first. 

In [None]:
def query_sbert(query, top_k=5):
    """
    Rank the rows of the module-level `psalm_SBERT_embeddings` against `query`.

    Raw cosine similarities are sorted first; only the reported values are
    converted to percentages rounded to two decimals.

    Returns a list of ("SBERT", row_index, similarity_percent) tuples, best
    first, at most `top_k` long.
    """
    encoded_query = encode_text_SBERT(query)

    # (row index, raw similarity) for every stored embedding
    raw_scores = [
        (position, cosine_similarity(encoded_query, psalm_vec))
        for position, psalm_vec in enumerate(psalm_SBERT_embeddings)
    ]

    # Highest similarity first; ties keep their original row order.
    raw_scores.sort(key=lambda pair: pair[1], reverse=True)
    best = raw_scores[:top_k]

    return [("SBERT", position, round(score * 100, 2)) for position, score in best]


In [None]:
# Try the unshifted ranking on one of the longer liturgical quotes.
query_sbert("Have mercy on me, O God, have mercy on me. For my soul trusts in Thee, and in the shadow of Thy wings will I hope, until iniquity pass away.")

Then update the `cosine_similarity()` function. 

In [None]:
def cosine_similarity(a: np.ndarray, b: np.ndarray, shift_negative: bool = True) -> float:
    """
    Compute cosine similarity between two 1D numpy arrays.

    Parameters
    ----------
    a, b : array-like
        The two vectors to compare.
    shift_negative : bool
        When True (the default, matching the previous behaviour) any vector that
        contains a negative value has its own minimum subtracted first, so all of
        its components become non-negative. Pass False for the standard cosine.

    Returns
    -------
    float
        The similarity, or 0.0 when either (possibly shifted) vector has zero
        norm, instead of NaN.

    NOTE(review): shifting changes the angle between the vectors, so the shifted
    value is not the true cosine similarity. It can never be negative and is
    pushed toward 1 for most pairs. Consider shift_negative=False for analysis.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    if shift_negative:
        # Same operation as shift_embedding(): subtract the vector's own minimum.
        if a.min() < 0:
            a = a - a.min()
        if b.min() < 0:
            b = b - b.min()

    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return float(np.dot(a, b) / denom)

Now let's examine the results of all of the algorithms together. 

# Testing All Algorithms

In [None]:
def query_all(q):
    """
    Run the GloVe x TF-IDF, BERT and SBERT rankings for `q`.

    Returns
    -------
    list
        The hits of the three methods concatenated in that order. The original
        discarded every result and returned None. A method that returns None or
        an empty list contributes nothing.
    """
    combined = []
    for run_query in (query_tfidf_glove, query_bert, query_sbert):
        hits = run_query(q)
        if hits:
            combined.extend(hits)
    return combined

# Print whatever query_all() returns for a sample query.
print(query_all("For the Peace of the world"))



I want to be able to store and keep track of the data generated by each of the three algorithms for each query. I want to store it all in a DataFrame for future analysis.

## Storing Algorithm


In [38]:
# Empty accumulator that record_results() appends one row per hit to.
# Re-running this cell discards everything recorded so far.
full_results = pd.DataFrame(columns= ["Query", "Method", "Similarity Score (%)", "Text", "Psalm Num", "Verse"] )

In [30]:
def record_results(query, results):
    """
    Append one row per hit to the module-level `full_results` frame.

    Parameters
    ----------
    query : str
        The query text stored in the "Query" column.
    results : iterable of (method, index, similarity) tuples, or None
        `index` is a positional row of the module-level `psalms` frame, whose
        "text", "psalm_num" and "verse" values are copied into the row.
        None is treated as "no hits".
    """
    if results is None:
        return

    for method, index, sim in results:
        target_psalm = psalms.iloc[index]
        # Append by assigning to the next integer label.
        full_results.loc[len(full_results)] = [
            query,
            method,
            sim,
            target_psalm["text"],
            target_psalm["psalm_num"],
            target_psalm["verse"],
        ]



In [31]:
def query_all(q):
    """Run the three rankings for `q`, recording each method's hits in turn."""
    # GloVe scaled by TF_IDF
    glove_hits = query_tfidf_glove(q)
    record_results(q, glove_hits)

    # BERT
    bert_hits = query_bert(q)
    record_results(q, bert_hits)

    # SBERT
    sbert_hits = query_sbert(q)
    record_results(q, sbert_hits)

# NOTE(review): the recorded output is a NameError for query_tfidf_glove, i.e. this
# cell was run before the cell defining it -- run the notebook top to bottom.
query_all("For the Peace of the world")

NameError: name 'query_tfidf_glove' is not defined

In [None]:
# Display everything recorded so far.
full_results

# Graphing Results

In [None]:
import matplotlib.pyplot as plt


In [None]:
# Quick look: similarity score against psalm number for every recorded hit.
plt.scatter(full_results['Psalm Num'], full_results["Similarity Score (%)"])

plt.show()



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter of every recorded hit: colour encodes the "Text" column, marker size the "Method".
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=full_results,
    x="Psalm Num",
    y="Similarity Score (%)",
    hue="Text",       # color by text
    size="Method",    # scale by method
    palette="viridis",
    sizes=(40, 200),
    alpha=0.8,
    edgecolor="k"
)

# NOTE(review): the title uses the module-level `query`, not the "Query" column,
# so it can disagree with the rows plotted once several queries are recorded.
plt.title(f"Psalm Similarity by Text and Method \n Based on the Query: {query}")
#plt.suptitle(f"Based on the Query: {query}")

# Put the legend outside the axes so it does not cover points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import textwrap

In [None]:
# Defining a function for the graping
def graph_results(results):
    """
    Scatter-plot recorded hits: psalm number against similarity score.

    Parameters
    ----------
    results : pandas.DataFrame
        Needs the columns "Psalm Num", "Similarity Score (%)", "Text", "Method"
        and "Query". Colour encodes "Text" and marker size encodes "Method".
        The title uses the "Query" value of the first row, wrapped at 60 characters.

    An empty frame is reported and skipped, since there is no first row to take
    the title from.
    """
    if len(results) == 0:
        print("No results to plot.")
        return

    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        data=results,
        x="Psalm Num",
        y="Similarity Score (%)",
        hue="Text",       # color by text
        size="Method",    # scale by method
        palette="viridis",
        sizes=(40, 200),
        alpha=0.8,
        edgecolor="k"
    )

    # (A stray bare `line` expression stood here and raised NameError on every call.)

    # Wrap long queries so the title stays within the figure.
    query = textwrap.fill(results.iloc[0]["Query"], width=60)

    plt.title(f"Psalm Similarity by Text and Method \n Based on the Query: {query}")
    #plt.suptitle(f"Based on the Query: {query}")

    # Put the legend outside the axes so it does not cover points.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()


In [None]:
# Plot everything recorded so far; the title comes from the first row's query.
graph_results(full_results)

With everything needed to carry out some investigations in place, let's run a bunch of different queries to look for any trends that may emerge. After a little thought and consulting ChatGPT, I am going to use the following queries to test all three of the algorithms. 
### Psalms Search Test Queries

#### 1. Simple Keyword Queries
- **Query 1:** mercy

#### 2. Phrase/Exact Match Queries
- **Query 2:** The Lord is my shepherd

#### 3. Thematic/Semantic Queries
- **Query 3:** protection from enemies
- **Query 4:** praise in times of suffering

#### 4. Long/Complex Queries
- **Query 5:** How does the psalmist express trust in God while surrounded by fear and uncertainty?
- **Query 6:** Verses where the psalmist remembers past deliverance and uses it to find hope in present trials.

#### 5. Orthodox Service Quotes
##### From *Vespers*
- **Query 7:** 
    >"Rejoice, O ye heavens, sound the trumpets, ye foundation of the earth, thunder forth gladness, O ye mountains: for behold, Emmanuel to the Cross our sins, and the Giver of Life hath slain death, raising up Adam; for He loveth mankind."

- **Query 8:**
    >"Have mercy on me, O God, have mercy on me. For my soul trusts in Thee, and in the shadow of Thy wings will I hope, until iniquity pass away."

We can now store all of these in a list of dictionaries to keep them organized.



In [None]:
# psalm_queries.py (you can save this to import later)
# Test queries as a list of dicts with keys "id", "category" and "text".
# Only "text" is read by the cells below that loop over this list.
queries = [
    # 1. Simple Keyword Queries
    {"id": 1, "category": "Simple Keyword", "text": "mercy"},

    # 2. Phrase/Exact Match Queries
    {"id": 2, "category": "Phrase/Exact Match", "text": "The Lord is my shepherd"},

    # 3. Thematic/Semantic Queries
    {"id": 3, "category": "Thematic/Semantic", "text": "protection from enemies"},
    {"id": 4, "category": "Thematic/Semantic", "text": "praise in times of suffering"},

    # 4. Long/Complex Queries
    {"id": 5, "category": "Long/Complex", 
     "text": "How does the psalmist express trust in God while surrounded by fear and uncertainty?"},
    {"id": 6, "category": "Long/Complex", 
     "text": "Verses where the psalmist remembers past deliverance and uses it to find hope in present trials."},

    # 5. Orthodox Service Quotes
    {"id": 7, "category": "Orthodox Service (Vespers)", 
     "text": "Rejoice, O ye heavens, sound the trumpets, ye foundation of the earth, thunder forth gladness, O ye mountains: for behold, Emmanuel to the Cross our sins, and the Giver of Life hath slain death, raising up Adam; for He loveth mankind."},

    {"id": 8, "category": "Orthodox Service: Great Canon of St. Andrew of Crete ", 
     "text": "Have mercy on me, O God, have mercy on me. For my soul trusts in Thee, and in the shadow of Thy wings will I hope, until iniquity pass away."}
]


# Display the list as a sanity check.
queries

In [None]:
# Run every test query through query_all(). Re-running this cell records the
# same hits again, because results are appended rather than replaced.
for q in queries:
    query_all(q['text'])

In [None]:
import pandas as pd

# Configure Pandas display options (these are global and affect every later cell)
pd.set_option('display.max_rows', 50)       # show at most 50 rows before truncating
pd.set_option('display.max_columns', None)    # show all columns
pd.set_option('display.max_colwidth', 180)   # truncate cell text beyond 180 characters
pd.set_option('display.expand_frame_repr', False)  # keep wide frames on one line

# Then simply display the DataFrame
full_results


In [None]:
# Group the recorded hits by query text so each query can be plotted separately.
grouped_queries = full_results.groupby("Query")

# Displays only the GroupBy object's repr, not the grouped rows.
grouped_queries

In [None]:
# One figure per query: each group contains the rows sharing one "Query" value.
for q, group in grouped_queries:
    graph_results(group)

In [None]:
# All queries in one figure. The title shows only the first row's query.
graph_results(full_results)

In [None]:
# full_results.to_csv("full_results.csv", index=False)

# TFIDF
After an initial round of scoring and seeing some results, it was decided that it might be useful to see plain `TF-IDF` results as well. 


In [12]:
import os

# Working directory of the kernel. NOTE(review): not referenced by any cell shown here.
BASE_DIR = os.getcwd()

In [26]:
import os
import pickle
import pandas as pd

# Resolve the data directory as "<working directory>/../data".
cwd = os.getcwd()

MODEL_PATH = os.path.abspath(
    os.path.join(
        cwd,
        "..",
        "data"
    )
)


# SECURITY: pickle.load can execute arbitrary code; only load files you produced yourself.
# This rebinds `tfidf_matrix`, which an earlier cell loaded from a different path.
with open(MODEL_PATH +"/psalms_tfidf_matrix.pickle", "rb") as f:
    tfidf_matrix = pickle.load(f)


with open(MODEL_PATH +"/psalms_tfidf_vectorizer.pickle", "rb") as f:
    tfidf_vectorizer = pickle.load(f)
# This rebinds `psalms`, replacing the frame loaded at the top of the notebook
# (the one whose "glove_tfidf_vec" column the GloVe ranking reads).
with open(MODEL_PATH +"/grouped_psalm.csv", "rb") as f:
    psalms = pd.read_csv(f)
    
    
    
    
    
    

In [27]:
# Display the reloaded psalm frame.
psalms

Unnamed: 0.1,Unnamed: 0,tradition,text,psalm_num,verse,cleaned_verse
0,0,Orthodox,Bible,1,Blessed is the man Who walks not in the counse...,blessed man walk counsel ungodly stand way sin...
1,1,Orthodox,Bible,2,Why do the nations rage And the people meditat...,nation rage people meditate vain thing king ea...
2,2,Orthodox,Bible,3,A psalm by David when he fled from the face of...,psalm david fled face son absalom olord afflic...
3,3,Orthodox,Bible,4,For the End in psalms an ode by David You hear...,end psalm ode david heard icalled god righteou...
4,4,Orthodox,Bible,5,For the End concerning the inheritance a psalm...,end concerning inheritance psalm david give ea...
...,...,...,...,...,...,...
296,296,Orthodox,Psalter,146,The Lord doth build up Jerusalem; He shall gat...,lord doth build jerusalem ; shall gather toget...
297,297,Orthodox,Psalter,147,"Praise the Lord, O Jerusalem; praise thy God, ...","praise lord , jerusalem ; praise thy god , zio..."
298,298,Orthodox,Psalter,148,Praise ye the Lord from the heavens; praise Hi...,praise ye lord heaven ; praise highest . prais...
299,299,Orthodox,Psalter,149,"Sing unto the Lord a new song, His praise is i...","sing unto lord new song , praise congregation ..."


In [15]:
# Display the unpickled vectorizer.
tfidf_vectorizer

In [16]:
# Display the unpickled TF-IDF matrix (one column per vocabulary term in the recorded output).
tfidf_matrix

Unnamed: 0,aaron,abandon,abasement,abated,abhor,abhorred,abhors,abide,abides,abideth,...,zacharias,zalmon,zalmunna,zeal,zebah,zebulun,zeeb,zion,ziphites,zoan
"(Bible, 1)",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
"(Bible, 2)",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073937,0.0,0.0
"(Bible, 3)",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
"(Bible, 4)",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
"(Bible, 5)",0.0,0.0,0.0,0.0,0.0,0.0,0.158926,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(Psalter, 146)",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
"(Psalter, 147)",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072535,0.0,0.0
"(Psalter, 148)",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
"(Psalter, 149)",0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.077526,0.0,0.0


In [17]:
def record_results(query, results_df):
    """
    Append query results to the module-level `full_results` frame.

    Parameters
    ----------
    query : str
        The query text stored in the "Query" column.
    results_df : DataFrame-like, iterable of tuples, or None
        - An object with `.iterrows()` whose rows carry 'Method',
          'Similarity Score (%)', 'Text', 'Psalm Num' and 'Verse'.
        - Or an iterable of (method, index, similarity) tuples, as returned by
          the query_* functions; `index` is then a positional row of the
          module-level `psalms` frame. The original accepted only a DataFrame
          and failed on these lists even though the cells below pass them.
        - None is treated as "no hits".
    """
    if results_df is None:
        return

    if hasattr(results_df, "iterrows"):
        for _, row in results_df.iterrows():
            # Append to the global full_results DataFrame
            full_results.loc[len(full_results)] = [
                query,
                row['Method'],
                row['Similarity Score (%)'],
                row['Text'],
                row['Psalm Num'],
                row['Verse'],
            ]
        return

    for method, index, sim in results_df:
        target_psalm = psalms.iloc[index]
        full_results.loc[len(full_results)] = [
            query,
            method,
            sim,
            target_psalm["text"],
            target_psalm["psalm_num"],
            target_psalm["verse"],
        ]



In [35]:
def query_tfidf(query, top_k=6):
    """
    Rank the rows of the module-level `tfidf_matrix` against `query` by cosine
    similarity of their TF-IDF vectors.

    The similarity is computed here with NumPy instead of calling
    `cosine_similarity`. In this notebook that name is bound to a helper for two
    1-D vectors that returns a float, which cannot score a query row against a
    whole matrix.

    Assumes the columns of `tfidf_matrix` are in the same order as the features
    produced by `tfidf_vectorizer` -- TODO confirm.

    Returns
    -------
    list of ("TF-IDF", row_index, similarity_percent) tuples, best first, at most
    `top_k` long. Rows (or a query) with zero norm score 0.
    """
    def _dense(matrix):
        # Sparse matrices expose toarray(); DataFrames and arrays go through asarray.
        if hasattr(matrix, "toarray"):
            matrix = matrix.toarray()
        return np.asarray(matrix, dtype=float)

    query_vec = _dense(tfidf_vectorizer.transform([query])).ravel()
    docs = _dense(tfidf_matrix)

    dots = docs @ query_vec
    denom = np.linalg.norm(docs, axis=1) * np.linalg.norm(query_vec)
    # Leave the similarity at 0 wherever the denominator is 0 (no NaN / warnings).
    sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom != 0)

    top_indices = sims.argsort()[::-1][:top_k]
    top_scores = sims[top_indices] * 100  # percent

    results = [("TF-IDF", idx, score) for idx, score in zip(top_indices, top_scores)]
    return results


In [36]:
# Display the TF-IDF hits for a sample query as (method, row index, percent) tuples.
query_tfidf("for the peace of the world")

[('TF-IDF', 271, 18.675395800994878),
 ('TF-IDF', 120, 18.612013010937538),
 ('TF-IDF', 247, 15.302091583685066),
 ('TF-IDF', 96, 14.183477308107623),
 ('TF-IDF', 91, 12.570602777670539),
 ('TF-IDF', 245, 11.329991914781507)]

In [39]:
# Record the sample query's TF-IDF hits. `results` is a list of tuples, so the
# record_results() in effect must accept that form.
results = query_tfidf("for the peace of the world")
record_results("for the peace of the world", results)

In [40]:
# Display the rows recorded so far.
full_results

Unnamed: 0,Query,Method,Similarity Score (%),Text,Psalm Num,Verse
0,for the peace of the world,TF-IDF,18.675396,Psalter,121,"I was glad because of them that said to me, Le..."
1,for the peace of the world,TF-IDF,18.612013,Bible,121,1An ode of ascents Iwas glad when they said to...
2,for the peace of the world,TF-IDF,15.302092,Psalter,97,"O sing unto the Lord a new song, for the Lord ..."
3,for the peace of the world,TF-IDF,14.183477,Bible,97,A psalm by David Sing a new song to the Lord F...
4,for the peace of the world,TF-IDF,12.570603,Bible,92,1For the day before the Sabbath when the earth...
5,for the peace of the world,TF-IDF,11.329992,Psalter,95,O sing unto the Lord a new song; sing unto the...


In [None]:

# Record the TF-IDF hits of every test query. Rows are appended, so re-running
# this cell duplicates them.
for q in queries:
    record_results(q['text'], query_tfidf(q['text']))
    
full_results

In [42]:
# Adding the rest of the results to the undscored csv file
import os

output_file = "results.csv"

# Append to the CSV, writing the header row only when the file does not exist yet.
# NOTE(review): every run appends all of full_results again, so re-running this
# cell duplicates rows in the file.
full_results.to_csv(
    output_file,
    mode="a",
    index=False,
    header=not os.path.exists(output_file)
)
