In [9]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import json

# Initialize embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Function 1: Create and save vector store
def build_vector_store_from_titles(titles: list, store_path: str = "title_index") -> None:
    vector_store = FAISS.from_texts(titles, embeddings)
    vector_store.save_local(store_path)


# Function 2: Search top titles and return essays
def get_top_essays_by_title(query_title: str, top_k: int = 5, store_path: str = "title_index") -> dict:
    # Load vector store
    vector_store = FAISS.load_local(store_path, embeddings, allow_dangerous_deserialization=True)

    # Embed query
    query_embedding = embeddings.embed_query(query_title)

    # Reconstruct stored embeddings
    stored_embeddings = vector_store.index.reconstruct_n(0, len(titles))

    # Compute similarity
    similarities = cosine_similarity([query_embedding], stored_embeddings)[0]
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    top_titles = [titles[i] for i in top_indices]

    # Extract corresponding essays
    essays = df[df['title'].isin(top_titles)]['essay'].tolist()
    return {"top_titles": [{ "title": title, "essay": essay } for title, essay in zip(top_titles, essays)]}

In [None]:
# Load data
df = pd.read_csv("scraped_essays.csv")
titles = df['title'].tolist()
build_vector_store_from_titles(titles)

# Example usage:
result = get_top_essays_by_title("Do schools kill creativity?")
print(json.dumps(result, indent=2))

{
  "top_titles": [
    {
      "title": "\u2018Education is a solution to social inequality.\u2019 Do you agree?",
      "essay": "Since time immemorial, humanity has been plagued by violence, from the barbaric, ruthless slavery imposed by colonial masters, to the unstable geopolitical scene before us today with war and terror never too far away. Violence exists on a spectrum and can be physical, emotional, or mental, taking many forms such as war, abuse, and bullying. In their formative years, as children are trying to make sense of their identity and the world around them, growing up exposed to violence is particularly pernicious. The world will eventually come to pay the price as children become pawns for extremist groups, significantly more predisposed to violence or apathetic to violence. Children who grow up exposed to violence tend to be more easily swayed by extremist ideologies and often become pawns for terrorist organisations. On one hand, children growing up in war-torn ar