In [4]:
import os

In [5]:
# Make the OpenAI API key available to the client libraries via the environment.
# The key must never be written into the notebook itself: anything saved in a
# cell is readable by everyone who can see the file or its history, so a key
# that has ever been stored here has to be treated as leaked and revoked.
# `os` is imported in the cell above.
import getpass  # stdlib; imported here because this cell is its only user

# Prefer a key that is already exported in the environment; otherwise ask for
# it interactively. getpass does not echo the value and nothing is persisted.
if not os.environ.get("OPENAI_API_KEY"):
    entered_key = getpass.getpass("OpenAI API key: ").strip()
    if not entered_key:
        # Fail here rather than later with an opaque authentication error.
        raise ValueError("No OpenAI API key provided.")
    os.environ["OPENAI_API_KEY"] = entered_key

# Optional confirmation (does not print the secret)
print("OPENAI_API_KEY set in os.environ")

OPENAI_API_KEY set in os.environ


In [None]:
import os
import glob
from typing import List

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document


def load_character_documents(folder_path: str) -> List[Document]:
    """
    Load every non-blank line of each ``*.txt`` file in a folder as a Document.

    Assume each .txt file is all the lines for ONE character; the character
    name is the filename without its extension.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.txt`` files.

    Returns:
        One Document per non-blank line. ``page_content`` is the line with
        surrounding whitespace stripped; ``metadata`` holds ``"character"``
        (the name derived from the filename) and ``"source_file"`` (the path).
        Files are visited in sorted path order so the result is reproducible.

    Raises:
        ValueError: If no ``.txt`` file is found or every file is blank.
    """
    docs: List[Document] = []

    # glob yields paths in arbitrary, filesystem-dependent order; sorting keeps
    # the document order (and anything built from it) stable between runs.
    for path in sorted(glob.glob(os.path.join(folder_path, "*.txt"))):
        # character name = filename without extension
        char_name = os.path.splitext(os.path.basename(path))[0]

        # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM,
        # which would otherwise stay attached to the first line's text.
        with open(path, "r", encoding="utf-8-sig") as f:
            stripped_lines = (raw_line.strip() for raw_line in f)
            # The generator is fully consumed here, while the file is still open.
            docs.extend(
                Document(
                    page_content=line,
                    metadata={"character": char_name, "source_file": path},
                )
                for line in stripped_lines
                if line  # skip blank / whitespace-only lines
            )

    if not docs:
        raise ValueError(f"No .txt files found in {folder_path!r} or files were empty.")

    return docs


def build_vectorstore(docs: List[Document]) -> FAISS:
    """Embed ``docs`` with an OpenAI embedding model and return the FAISS store built from them."""
    return FAISS.from_documents(
        docs,
        # Small, cheap embedding model
        OpenAIEmbeddings(model="text-embedding-3-small"),
    )


def make_rag_answer_fn(vectordb: FAISS, k: int = 5, model: str = "gpt-4o-mini"):
    """
    Returns a simple function answer(query: str) -> str
    that runs retrieval + LLM to guess the speaker.

    Args:
        vectordb: Vector store whose documents carry a ``"character"``
            metadata entry; it is queried through ``as_retriever``.
        k: Number of example lines retrieved per query (default 5).
        model: Chat model name passed to ``ChatOpenAI`` (default "gpt-4o-mini").

    Returns:
        A closure ``answer(query)`` that retrieves the ``k`` most relevant
        lines, shows them to the chat model together with ``query`` and
        returns the model's reply content.
    """
    retriever = vectordb.as_retriever(search_kwargs={"k": k})

    # temperature=0 keeps the attribution as repeatable as the API allows.
    llm = ChatOpenAI(
        model=model,
        temperature=0,
    )

    def answer(query: str) -> str:
        # Get the most relevant lines (and their characters).
        # `invoke` replaces `get_relevant_documents`, which is deprecated and
        # emits a deprecation warning on every call.
        retrieved_docs = retriever.invoke(query)

        # One labelled example per retrieved line; documents without a
        # "character" entry are labelled UNKNOWN.
        context_block = "\n".join(
            f"Character: {d.metadata.get('character', 'UNKNOWN')} | Line: {d.page_content}"
            for d in retrieved_docs
        )

        messages = [
            (
                "system",
                (
                    "You are a dialogue attribution assistant for a script. "
                    "You get example lines with their characters and a question "
                    "about who says a given line. "
                    "Use the examples to decide the MOST LIKELY speaker. "
                    "Answer with just the character name plus a very short explanation."
                ),
            ),
            (
                "human",
                f"""Here are example lines from the script (each line has a character label):

{context_block}

Question: Based on these examples, who most likely says this line or quote?

{query}

Respond in the format:
Character: <name>
Reason: <one short sentence>""",
            ),
        ]

        resp = llm.invoke(messages)
        return resp.content

    return answer


if __name__ == "__main__":
    # Interactive driver: load the character lines, build the vector store and
    # answer "who says ...?" questions typed at the prompt until the user quits.

    # 1) Point this to your folder of character .txt files.
    #    Set the SPEAKER_TEXTS_DIR environment variable to use another folder
    #    without editing this cell; the path below is only the default.
    FOLDER = os.environ.get(
        "SPEAKER_TEXTS_DIR",
        "/home/ewu/Desktop/Fall 2025/ENGS108/project/data_new/speaker_texts/train",
    )  # e.g. "./characters"

    print(f"Loading character documents from: {FOLDER}")
    docs = load_character_documents(FOLDER)

    print(f"Loaded {len(docs)} lines. Building vector store...")
    vectordb = build_vectorstore(docs)

    answer = make_rag_answer_fn(vectordb)

    print("\nReady! Ask things like:")
    print('  Who says "You know nothing, Jon Snow"?')
    print("Type 'quit' to exit.\n")

    while True:
        try:
            q = input("Query> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the session like "quit"
            # instead of surfacing a traceback.
            print()
            break

        if q.lower() in {"quit", "exit"}:
            break
        if not q:
            # Nothing was typed: do not spend an embedding + LLM call on it.
            continue

        result = answer(q)
        print("\n" + result + "\n")


Loading character documents from: /home/ewu/Desktop/Fall 2025/ENGS108/project/data_new/speaker_texts/train
Loaded 1849 lines. Building vector store...

Ready! Ask things like:
  Who says "You know nothing, Jon Snow"?
Type 'quit' to exit.



  retrieved_docs = retriever.get_relevant_documents(query)
