## Prerequisites

In [2]:
from dotenv import load_dotenv
from pathlib import Path
import os

# Point python-dotenv at the .env file in the current working directory.
# The call is left as the cell's last expression so its result is displayed.
env_path = Path(".", ".env")
load_dotenv(dotenv_path=env_path)

True

In [3]:
import os
# Take the OpenAI API key from the environment instead of hardcoding it,
# and stop early with a clear error when it is missing or empty.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if OPENAI_API_KEY:
    print('OPENAI_API_KEY found in environment, proceeding...')
else:
    raise EnvironmentError('Please set the OPENAI_API_KEY environment variable before running this notebook')

OPENAI_API_KEY found in environment, proceeding...


## Load

In [18]:
from langchain_community.document_loaders import TextLoader

# Load the character-lines CSV from the project's data/process directory
# (path is relative to the notebook's working directory).
# NOTE(review): the Embed step reads ../data/clean/train.csv instead, and
# `documents` is not referenced by the cells visible here; confirm which
# file is the intended input.
loader = TextLoader(file_path="../data/process/character_lines.csv")
documents = loader.load()

## Embed

In [20]:
import pandas as pd
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import os

# Build one Document per non-empty CSV row, embed the rows with OpenAI, and
# store them in a Chroma collection persisted under ./chroma_lines.
csv_path = "../data/clean/train.csv"
df = pd.read_csv(csv_path)

# Preferred header names; the first candidate present in the CSV wins.
possible_char_cols = ["character"]
possible_text_cols = ["lines"]

char_col = next((c for c in possible_char_cols if c in df.columns), None)
text_col = next((c for c in possible_text_cols if c in df.columns), None)

# Fallback when an expected header is missing.
if char_col is None or text_col is None:
    cols = list(df.columns)
    if len(cols) >= 2:
        # Fill the missing role(s) from the columns not already claimed, in
        # file order, so the same column is never used as both character and
        # text. With neither header found this is still first=character,
        # second=text.
        unclaimed = [c for c in cols if c not in (char_col, text_col)]
        if char_col is None:
            char_col = unclaimed.pop(0)
        if text_col is None:
            text_col = unclaimed.pop(0)
    else:
        # Single-column CSV: each row may be "CHARACTER: line".
        char_col = None
        text_col = cols[0]

source_name = os.path.basename(csv_path)
docs = []
for idx, row in df.iterrows():
    cell = row[text_col]
    # Missing cells come back as NaN; str(NaN) is the non-empty string "nan",
    # so they must be filtered before the blank check.
    if pd.isna(cell):
        continue
    raw_text = str(cell)
    if raw_text.strip() == "":
        continue

    metadata = {"source": source_name, "row": int(idx)}
    if char_col is not None:
        # Leave "character" unset rather than storing the string "nan".
        if not pd.isna(row[char_col]):
            metadata["character"] = str(row[char_col])
    elif ":" in raw_text:
        # No character column: try to parse a "CHARACTER: text" pattern.
        maybe_char, maybe_line = raw_text.split(":", 1)
        metadata["character"] = maybe_char.strip()
        raw_text = maybe_line.strip()
        if not raw_text:
            # Nothing left to embed after removing the speaker prefix.
            continue

    docs.append(Document(page_content=raw_text, metadata=metadata))

if not docs:
    raise ValueError(f"No non-empty text rows found in {csv_path}")

# Deterministic ids (source file + row number) make re-running this cell
# overwrite the existing entries in the persisted collection. Without them,
# every run appended the same rows again, which is why earlier similarity
# searches returned each hit twice. Entries written by earlier runs keep
# their random ids, so delete ./chroma_lines once to clear old duplicates.
doc_ids = [f"{d.metadata['source']}-{d.metadata['row']}" for d in docs]

# Create embeddings and vector store
embeddings = OpenAIEmbeddings()  # needs OPENAI_API_KEY env var (not printed)
vectordb = Chroma.from_documents(
    docs,
    embeddings,
    ids=doc_ids,
    persist_directory="./chroma_lines",
)
vectordb.persist()

# Example query
results = vectordb.similarity_search("Find lines where Odysseus expresses longing", k=5)
for r in results:
    print(r.page_content, r.metadata)

Odysseus {'row': 24, 'character': 'Suitors', 'source': 'train.csv'}
Odysseus {'character': 'Odysseus', 'row': 25, 'source': 'train.csv'}
Odysseus {'source': 'train.csv', 'character': 'Suitors', 'row': 24}
Odysseus {'source': 'train.csv', 'character': 'Odysseus', 'row': 25}
Just keep your eyes open

Just keep your eyes open

Just keep your eyes open

Just keep your eyes open

Wake up!

Wake up, Odysseus, they're opening the bag

Wake up! {'character': 'Penelope', 'row': 31, 'source': 'train.csv'}


In [11]:
# Wrap the Chroma store in a retriever (no search options passed, so the
# library defaults apply); the RetrievalQA chain below takes it as its
# `retriever` argument. Requires `vectordb` from the Embed cell to exist.
retriever = vectordb.as_retriever()

In [15]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# Chat model that writes the answer from the retrieved lines.
llm = ChatOpenAI(model_name="gpt-4.1")

# "stuff" chain: the retrieved documents are passed to the model together
# with the question in a single prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

# Ask which character a held-out lyric belongs to. The double spaces inside
# the quoted lyric are part of the original query and are preserved.
query = (
    "which character in epic would say: "
    "Wait  Stop this, please  NO  "
    "Aren't you tired, Poseidon?  "
    "It's been 8 years, how long will this go?  "
    "We're both hurting from losses  "
    "So why not leave this here and just go home?"
)
response = qa_chain.invoke({"query": query})
print(response["result"])

The character in the epic most likely to say these words is **Odysseus**.

**Reasoning:**  
- The lines express frustration, exhaustion, and a plea to **Poseidon**, the god who relentlessly persecutes Odysseus after he blinds the Cyclops Polyphemus (Poseidon's son).
- Odysseus is famously kept from returning home for many years (ten years after the Trojan War), largely due to Poseidon’s wrath.
- The tone ("It's been 8 years, how long will this go?") fits Odysseus' situation, as he is desperate to return to Ithaca and wants Poseidon’s punishment to end.
- The sense of mutual suffering and loss also fits Odysseus’ perspective, as he has lost crew and time, and Poseidon has lost his son’s eye.

**Conclusion:**  
**Odysseus** would be the character in the epic most likely to plead with Poseidon in this manner.
