In [4]:
import os

In [34]:
# Make the OpenAI API key available to the client libraries through the environment.
# `os` is imported in an earlier cell, so it is not re-imported here.
import getpass

# Never paste the key into the notebook source. Reuse a key that is already
# exported in the environment; otherwise prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Optional confirmation (does not print the secret)
print("OPENAI_API_KEY set in os.environ")

OPENAI_API_KEY set in os.environ


In [15]:
from langchain_community.document_loaders import DirectoryLoader

# Loader over the training directory; the glob matches .txt files at any depth.
loader = DirectoryLoader(
    path="../data_new/speaker_texts/train",
    glob="**/*.txt",
)

In [13]:
# Echo the loader's repr to confirm the object exists; this cell only evaluates the name.
loader

<langchain_community.document_loaders.directory.DirectoryLoader at 0x70fc91245700>

In [17]:
!pip install unstructured

Collecting unstructured
  Downloading unstructured-0.18.18-py3-none-any.whl.metadata (25 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2025.11.11-py3-none-any.whl.metadata (14 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m12.9 MB/s[0m  [33m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting unstructured-client (from unstructured)
  Downloading unstructured_client-0.42.4-py3-none-any.whl.metadata (23 kB)
Collecting python-oxmsg (from unstructured)
  Downloading python_oxmsg-0.0.2-

In [18]:
# Run the loader; the resulting collection is counted and iterated in the cells that follow.
docs = loader.load()

In [19]:
# Sanity check: how many documents were loaded.
len(docs)

33

In [20]:
from pathlib import Path

# Label every document with its speaker, taken from the source file name
# without its extension (".../Odysseus.txt" becomes "Odysseus").
for doc in docs:
    doc.metadata["character"] = Path(doc.metadata["source"]).stem

In [21]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding client; it picks up its credentials from OPENAI_API_KEY.
embeddings = OpenAIEmbeddings()

# Embed the loaded documents and index them in a FAISS vector store.
vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)


In [29]:
from typing import Tuple, List
from langchain_core.documents import Document

def predict_speaker(
    line: str,
    k: int = 33
) -> Tuple[str, List[Document]]:
    """
    Predict who speaks `line` from the best-matching document in `vectorstore`.

    Args:
      - line: the query line to attribute to a character
      - k: how many matches to retrieve; only the first one decides the
        prediction, the rest are returned for inspection

    Returns:
      - predicted_character: character name from best-matching document
      - results: top-k matching documents for inspection

    ("UNKNOWN", []) is returned when `line` is blank or nothing is retrieved.
    """
    # A blank query carries no signal, so skip the embedding request entirely.
    if not line or not line.strip():
        return "UNKNOWN", []

    # Query the store directly, the same call predict_speaker_llm makes. This
    # avoids building a retriever on every call and the deprecated
    # `get_relevant_documents` retriever method.
    results: List[Document] = vectorstore.similarity_search(line, k=k)

    if not results:
        return "UNKNOWN", []

    # Fall back to "UNKNOWN" if the top document was never tagged with a character.
    predicted_character = results[0].metadata.get("character", "UNKNOWN")
    return predicted_character, results


In [32]:
# Line to attribute; it is reused by the LLM-based prediction further down.
query_line = "Ruthlessness is mercy upon ourselves 'Cause you fight to save lives"

predicted_character, matches = predict_speaker(query_line)

print("Predicted speaker:", predicted_character)

# predict_speaker returns an empty list when nothing is retrieved, so only
# index into `matches` when there is at least one hit.
if matches:
    top_match = matches[0]
    print("--- top hit meta ---")
    print(top_match.metadata)
    print("--- snippet of that doc ---")
    print(top_match.page_content[:500])  # just to sanity check
else:
    print("No matching documents were retrieved.")


Predicted speaker: Laestrygonians
--- top hit meta ---
{'source': '../data_new/speaker_texts/train/Laestrygonians.txt', 'character': 'Laestrygonians'}
--- snippet of that doc ---
You are far too nice

Unless, of course, you apologize

Ruthlessness is (Captain) mercy upon our- Captain

Poseidon, Poseidon, Poseidon, Poseidon, Poseidon, Poseidon, Poseidon

Ruthlessness is (Captain) mercy upon our- Captain

Had you just killed my son

Ruthl?ssness is mercy upon ourselv?s

I've gotta make you bleed

You reveal your name

Unlike you I've got no mercy left to give 'cause

You are the worst kind of good

That's what I hate

Ruthlessness is mercy-

But before you go, I need to ma


In [33]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Chat model that makes the final speaker decision.
llm = ChatOpenAI(model="gpt-4.1-mini")

# Prompt text with two placeholders: {context} receives the labelled example
# blocks and {query} receives the line to attribute.
speaker_prompt_text = """
You are given some example lines spoken by different characters from a script.
Each block is labeled with the character name.

Use the writing STYLE and content to decide who most likely says the QUERY line.
Respond with ONLY the character name.

CONTEXT:
{context}

QUERY LINE:
"{query}"
"""

prompt = ChatPromptTemplate.from_template(speaker_prompt_text)

def predict_speaker_llm(line: str, k: int = 3, max_chars: int = 800) -> str:
    """
    Ask the LLM which character most likely says `line`.

    The `k` most similar documents are fetched from `vectorstore`, each is
    rendered as a "[character]" block, and the blocks are passed to the
    prompt as context together with the query line.

    Args:
      - line: the query line to attribute to a character
      - k: number of documents to retrieve as labelled examples
      - max_chars: how many leading characters of each document to include

    Returns:
      The model's answer with surrounding whitespace stripped, or "UNKNOWN"
      when `line` is blank or no documents are retrieved.
    """
    # A blank query carries no signal, so skip the embedding and LLM calls.
    if not line or not line.strip():
        return "UNKNOWN"

    results = vectorstore.similarity_search(line, k=k)

    # Without examples the model has nothing to choose from; use the same
    # "UNKNOWN" sentinel that predict_speaker returns for an empty result.
    if not results:
        return "UNKNOWN"

    # Show only the first `max_chars` characters of each file as style examples.
    context = "\n".join(
        f"[{doc.metadata.get('character', 'UNKNOWN')}]\n{doc.page_content[:max_chars]}\n"
        for doc in results
    )

    chain = prompt | llm
    resp = chain.invoke({"context": context, "query": line})
    return resp.content.strip()

# Attribute `query_line` with the LLM-backed predictor and report its answer.
speaker = predict_speaker_llm(query_line)
print(f"LLM-predicted speaker: {speaker}")


LLM-predicted speaker: Poseidon
