In [None]:
import sys
import os
import pickle
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from data_pipeline import TextPreprocessingPipeline

    
def retrieve_psalm(index, psalms, max_chars=200):
    """Print a short preview of a single Psalm.

    Selects every row of *psalms* whose ``text`` and ``psalm_num`` columns
    match *index*, joins the ``verse`` values into one paragraph, and prints
    at most *max_chars* characters of it, cut back to the last complete word
    when the paragraph is longer than that. The preview is always followed
    by ``...``, whether or not anything was cut.

    Args:
        index: ``(document_name, psalm_number)`` pair identifying the Psalm.
        psalms: DataFrame with ``text``, ``psalm_num`` and ``verse`` columns.
        max_chars: Maximum number of verse characters to show; expected to
            be non-negative.

    Returns:
        None. All output is written to stdout.
    """
    doc, psalm_num = index

    print(f"    Text: {doc}")
    print(f"    Psalm Number: {psalm_num}\n")

    is_match = (psalms['text'] == doc) & (psalms['psalm_num'] == psalm_num)
    matching_verses = psalms.loc[is_match, 'verse']

    if matching_verses.empty:
        print("    No matching Psalm found.")
        return

    # Missing verses are NaN (a float), which str.join rejects, so drop them
    # and coerce whatever remains to text before joining.
    verse_text = " ".join(str(verse) for verse in matching_verses.dropna()).strip()

    if len(verse_text) > max_chars:
        preview = verse_text[:max_chars]
        last_space = preview.rfind(' ')
        # rfind returns -1 when the preview holds no space at all; slicing
        # with -1 would silently drop the final character, so only trim when
        # a word boundary really exists.
        if last_space > 0:
            preview = preview[:last_space]
        verse_text = preview.rstrip()

    print("   " + verse_text + "...\n")


def search_psalms(query, pipeline, vectorizer, model, psalms, num_results=6):
    """Print the Psalms most similar to a free-text query.

    The query is cleaned with ``pipeline.pipeline``, vectorised with
    ``vectorizer.transform`` and compared against every row of *model* by
    cosine similarity. The best matches are printed with ``retrieve_psalm``
    in descending order of similarity. Rows whose similarity is zero are
    not reported, so a query that shares nothing with the corpus yields no
    results instead of arbitrary ones.

    Args:
        query: Raw search text.
        pipeline: Object exposing ``pipeline(text)`` that returns the cleaned
            text.
        vectorizer: Fitted vectorizer exposing ``transform``.
        model: TF-IDF matrix accepted by ``cosine_similarity``; it must also
            provide an ``index`` whose entries unpack to
            ``(document_name, psalm_number)``.
        psalms: DataFrame of verses handed on to ``retrieve_psalm``.
        num_results: Maximum number of Psalms to print. Values of zero or
            less print none.

    Returns:
        None. All output is written to stdout.
    """
    print(f"\033[1mSearching for:\033[0m {query}.\n")

    # Clean and vectorise the query the same way the corpus was prepared
    clean_query = pipeline.pipeline(query)
    query_vec = vectorizer.transform([clean_query])

    similarities = cosine_similarity(query_vec, model).flatten()

    # Clamp explicitly: a slice of [-0:] would return every row, not none
    limit = max(int(num_results), 0)
    ranked = similarities.argsort()[::-1][:limit]

    # A similarity of zero means the Psalm shares no terms with the query
    hits = [position for position in ranked if similarities[position] > 0]

    # Report how many results are actually shown, not how many were asked for
    print(f"Top {len(hits)} results.")

    if not hits:
        print("No matching Psalms found.")
        return

    for position in hits:
        doc, psalm_num = model.index[position]
        retrieve_psalm((doc, psalm_num), psalms)


def main():
    """Load the saved search artefacts and run one interactive Psalm search.

    Reads the pickled text pipeline, TF-IDF vectorizer and TF-IDF matrix from
    ``../pickles`` and the cleaned verses from
    ``../Data/clean_psalm_verses.csv`` (all relative to the current working
    directory), prompts for a query on stdin and prints the top matches.
    A blank query is rejected without searching.
    """
    # Make ../cleaning importable; presumably the pickled pipeline needs its
    # defining module on sys.path while it is being unpickled.
    cleaning_dir = os.path.abspath(os.path.join(os.getcwd(), "../cleaning"))
    if cleaning_dir not in sys.path:
        sys.path.append(cleaning_dir)

    # Directory where the pickled files are stored
    load_dir = "../pickles"

    def load_pickle(filename):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load pickles produced by this project.
        with open(os.path.join(load_dir, filename), "rb") as handle:
            return pickle.load(handle)

    pipeline = load_pickle("pipeline.pickle")
    psalm_vectorizer = load_pickle("psalms_tfidf_vectorizer.pickle")
    tf_idf_psalms = load_pickle("psalms_tfidf_matrix.pickle")

    # Load the cleaned Psalms data from CSV
    psalms = pd.read_csv("../Data/clean_psalm_verses.csv")

    # Prompt user for search input; a blank query has nothing to match
    query = input("Enter text to search the Psalms: ").strip()
    if not query:
        print("No search text entered.")
        return

    search_psalms(query, pipeline, psalm_vectorizer, tf_idf_psalms, psalms, num_results=6)


# Entry point: call main() only when this file is executed directly, so that
# importing it as a module does not load the pickles or prompt for input.
if __name__ == "__main__":
    main()
