# Evaluate with LangChain + OpenAI
This notebook uses LangChain + OpenAI (Chat API) to predict characters from single lines in `data/clean/test.csv`.

It loads your API key from `project/analysis/.env`, runs the model on lines from the test set, parses the model's predicted character, computes accuracy and per-class metrics, and saves the per-example results to `project/analysis/results_langchain.csv` and a one-row run summary to `project/analysis/results_summary.csv`.

Notes:
- Edit `MODEL_NAME` (in the setup cell) and `N_EXAMPLES` (in the evaluation cell) before running.
- Be mindful of API usage and rate limits.

In [2]:
# Install required packages
# (Run this cell once in the notebook environment. If you prefer, install in your environment outside the notebook.)
!pip install -q langchain openai python-dotenv pandas scikit-learn tenacity tqdm

# Note: your project may already include pinned versions in `project/data/requirements.txt`. If so, prefer installing from that file.

In [3]:
# Imports, environment and API setup
from dotenv import load_dotenv
import os
import logging
import time
import re
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from tqdm import tqdm
import pandas as pd

# Read environment variables from the .env file in project/analysis.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
DOTENV_PATH = '/home/ewu/Desktop/Fall 2025/ENGS108/project/analysis/.env'
load_dotenv(DOTENV_PATH)

# Fail fast, with an actionable message, when the API key is missing or empty.
if not os.environ.get('OPENAI_API_KEY'):
    raise RuntimeError(f"OPENAI_API_KEY not found in {DOTENV_PATH}. Please add it (OPENAI_API_KEY=sk-...) or point DOTENV_PATH to the correct .env file.")

# LangChain/OpenAI imports (import after API key check so errors are clear)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Basic logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configurable variables
MODEL_NAME = 'gpt-3.5-turbo'  # change to gpt-4 or other model if available
TEMPERATURE = 0.0  # 0.0 keeps sampling as deterministic as the API allows
MAX_TOKENS = 200  # upper bound on the length of each completion

# Initialize model wrapper.
# MAX_TOKENS is passed through so the configured cap is actually applied
# (previously the constant was defined but never used).
llm = ChatOpenAI(
    model_name=MODEL_NAME,
    temperature=TEMPERATURE,
    max_tokens=MAX_TOKENS,
    openai_api_key=os.getenv('OPENAI_API_KEY'),
)

# Lazy %-style formatting, matching the other logger calls in this notebook.
logger.info("Initialized LangChain ChatOpenAI with model=%s", MODEL_NAME)

RuntimeError: OPENAI_API_KEY not found in /home/ewu/Desktop/Fall 2025/ENGS108/project/analysis/.env. Please add it (OPENAI_API_KEY=sk-...) or point DOTENV_PATH to the correct .env file.

In [10]:
# Read the held-out test split into a DataFrame.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
CSV_PATH = '/home/ewu/Desktop/Fall 2025/ENGS108/project/data/clean/test.csv'

df = pd.read_csv(CSV_PATH)
print('Loaded:', CSV_PATH)
print('Rows:', df.shape[0])

# Preview the first five rows; the bare name on the last line makes the
# notebook render the frame.
df_head = df.head()
df_head

Loaded: /home/ewu/Desktop/Fall 2025/ENGS108/project/data/clean/test.csv
Rows: 19


Unnamed: 0,saga,song,character,lines
0,The Circe Saga,Done For,Circe,"Who, me?\n\nAll I did was reveal their true fo..."
1,The Cyclops Saga,My Goodbye,Athena,"You were reckless, sentimental at best\n\nThat..."
2,The Cyclops Saga,My Goodbye,Odysseus,"You were reckless, sentimental at best\n\nThat..."
3,The Cyclops Saga,Polyphemus,Polyphemus,Who are you?\n\nYou killed my sheep\n\nMy favo...
4,The Ithaca Saga,Hold Them Down,Suitors,Hold us down while the throne gets colder\n\nH...


In [11]:
# Auto-detect text and label columns (common conventions).
# A column is selected when its lower-cased name contains one of the keywords.
possible_text_keys = ['line','text','utterance','segment','lyric','dialog']
possible_label_keys = ['character','speaker','label','author','char']

text_col = None
label_col = None

for c in df.columns:
    low = str(c).lower()  # str() guards against non-string column labels
    if text_col is None and any(k in low for k in possible_text_keys):
        text_col = c
        # A column claimed as the text must not also become the label.
        continue
    if label_col is None and any(k in low for k in possible_label_keys):
        label_col = c

# Fallback heuristics (each skips the column already chosen for the other role)
if text_col is None:
    # choose the first string-like column that is not the label
    for c in df.columns:
        if c != label_col and pd.api.types.is_string_dtype(df[c]):
            text_col = c
            break

if label_col is None:
    # look for a string-like column with small cardinality that is not the text
    for c in df.columns:
        if c != text_col and pd.api.types.is_string_dtype(df[c]) and df[c].nunique() < (len(df) / 2):
            label_col = c
            break

if text_col is None or label_col is None:
    raise RuntimeError(f"Couldn't auto-detect text/label columns. Found text_col={text_col}, label_col={label_col}. Check {CSV_PATH}")

print('Detected text column:', text_col)
print('Detected label column:', label_col)

# Minimal preprocessing: keep only the two columns, drop rows with missing
# values, and give the columns the fixed names the rest of the notebook uses.
# NOTE: this rebinds `df`; the raw frame is no longer available after this cell.
df = df[[text_col, label_col]].dropna().rename(columns={text_col: 'line', label_col: 'gold_character'})
# strip whitespace
df['line'] = df['line'].astype(str).str.strip()
df['gold_character'] = df['gold_character'].astype(str).str.strip()

# quick preview
df.head(5)

Detected text column: lines
Detected label column: character


Unnamed: 0,line,gold_character
0,"Who, me?\n\nAll I did was reveal their true fo...",Circe
1,"You were reckless, sentimental at best\n\nThat...",Athena
2,"You were reckless, sentimental at best\n\nThat...",Odysseus
3,Who are you?\n\nYou killed my sheep\n\nMy favo...,Polyphemus
4,Hold us down while the throne gets colder\n\nH...,Suitors


In [12]:
# Prompt construction.
# The model is instructed to reply with exactly one line: `Character: <NAME>`.
FEW_SHOT_EXAMPLES = [
    {"line": "I will not leave you to drown.", "character": "Odysseus"},
    {"line": "They will not expect us at dawn.", "character": "Polyphemus"},
]

PROMPT_TEMPLATE = '''You are given a single line spoken by a character from Epic: The Musical. Your job is to guess the character name.

Return your answer in exactly this format (single line, no extra commentary):
Character: <NAME>

Examples:
{examples}

Now guess the character for the following line.
Line: "{line}"
'''

# Render each few-shot example as a Line/Character pair, one pair after another.
_example_blocks = []
for _shot in FEW_SHOT_EXAMPLES:
    _example_blocks.append(f'Line: "{_shot["line"]}"\nCharacter: {_shot["character"]}')
example_text = '\n'.join(_example_blocks)

# Fill in the examples now, but re-emit the literal `{line}` placeholder so it
# can be substituted per example later.
prompt_template = PROMPT_TEMPLATE.format(line='{line}', examples=example_text)

# Parser to extract predicted name
def parse_prediction(text):
    """Extract the predicted character name from a raw model response.

    Returns the text following the first ``Character:`` tag (case-insensitive)
    on its line. If no tag is present, falls back to the first non-blank line,
    taking the part after the first colon when that line contains one.
    Returns ``None`` for non-string input or when there is no non-blank line.
    """
    if not isinstance(text, str):
        return None

    tagged = re.search(r'Character\s*:\s*(.+)', text, flags=re.IGNORECASE)
    if tagged is not None:
        # `.` never matches a newline, so the captured group is a single line.
        return tagged.group(1).strip()

    # No tag found: use the first line that still has content after stripping.
    first_line = next((s for s in (raw.strip() for raw in text.splitlines()) if s), None)
    if first_line is None:
        return None

    _, colon, remainder = first_line.partition(':')
    return remainder.strip() if colon else first_line

# Normalization for comparison
import string

def normalize_name(s):
    """Canonicalise a name for comparison.

    Lower-cases the input, removes every character that is not ``a-z``,
    ``0-9`` or whitespace, collapses whitespace runs to a single space and
    trims the ends. ``None`` becomes the empty string; other non-string
    values are converted with ``str()`` first.
    """
    if s is None:
        return ''
    lowered = str(s).lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", "", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()

# Quick sanity checks for the parser and the normaliser
for _fn, _arg, _want in (
    (parse_prediction, 'Character: Odysseus', 'Odysseus'),
    (normalize_name, 'Odysseus!', 'odysseus'),
):
    assert _fn(_arg) == _want


In [13]:
# Build the chain: the pre-formatted template (single `line` variable) feeds the chat model
prompt = PromptTemplate(template=prompt_template, input_variables=['line'])
chain = LLMChain(prompt=prompt, llm=llm)

# Tenacity retry wrapper for transient OpenAI errors.
# Up to 4 attempts with exponential back-off (waits bounded to 2-20 seconds).
# reraise=True makes the final failure surface as the original exception
# instead of tenacity's RetryError wrapper, so callers log the real cause.
@retry(
    retry=retry_if_exception_type(Exception),
    stop=stop_after_attempt(4),
    wait=wait_exponential(multiplier=1, min=2, max=20),
    reraise=True,
)
def call_model(line):
    """Run the chain on a single input line and return the raw response.

    Args:
        line: the text to classify; substituted for the `line` prompt variable.

    Returns:
        Whatever `chain.run` returns (expected to be the model's reply string).

    Raises:
        The exception raised by the last attempt, once all retries are used up.
    """
    return chain.run({'line': line})

# Quick smoke test (disabled by default)
# print(call_model('I will not leave you to drown.'))


In [14]:
# Evaluation loop (batched)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Configure evaluation
N_EXAMPLES = 200  # set to None to run entire dataset
SLEEP_BETWEEN = 0.3  # seconds between calls to be gentle on rate limits

# Each row is a (line, gold_character) tuple.
rows = list(df.itertuples(index=False, name=None))
if N_EXAMPLES is None:
    N = len(rows)
else:
    # Clamp to [0, len(rows)] so a negative value cannot turn into a
    # "drop the last k rows" slice below.
    N = max(0, min(N_EXAMPLES, len(rows)))

results = []

for i, (line, gold) in enumerate(tqdm(rows[:N], desc='Evaluating')):
    try:
        raw = call_model(line)
    except Exception as e:
        logger.exception('Model call failed at index %d', i)
        raw = f'ERROR: {e}'
        # A failed call has no prediction. Parsing the error text would turn
        # the message after "ERROR:" into a bogus predicted character.
        pred = None
    else:
        pred = parse_prediction(raw)
    results.append({'line': line, 'gold_character': gold, 'raw_response': raw, 'predicted_character': pred})
    time.sleep(SLEEP_BETWEEN)

# One row per evaluated example
res_df = pd.DataFrame(results)

# Normalised prediction; an empty result is labelled 'unknown'
normalized_preds = res_df['predicted_character'].apply(normalize_name)
res_df['pred_norm'] = normalized_preds.fillna('').replace('', 'unknown')

# Normalised gold label
res_df['gold_norm'] = res_df['gold_character'].apply(normalize_name)

# Simple correctness check
def is_correct(pred_norm, gold_norm):
    """Decide whether a normalised prediction counts as a hit.

    Args:
        pred_norm: normalised predicted name, or 'unknown' when nothing was parsed.
        gold_norm: normalised gold name.

    Returns:
        True on an exact match, or when one name is a substring of the other
        (e.g. 'queen circe' vs 'circe'). False for 'unknown' and whenever
        either side is empty.
    """
    # An empty string is a substring of every string, so without this guard an
    # empty gold label (or prediction) would be scored as correct.
    if not pred_norm or not gold_norm or pred_norm == 'unknown':
        return False
    if pred_norm == gold_norm:
        return True
    # Lenient partial match: one name contained in the other.
    return gold_norm in pred_norm or pred_norm in gold_norm

# Score every row by comparing its normalised prediction with the gold label
res_df['correct'] = [
    is_correct(pred, gold)
    for pred, gold in zip(res_df['pred_norm'], res_df['gold_norm'])
]

# Accuracy is the fraction of rows flagged correct
accuracy = res_df['correct'].mean()
print(f'Accuracy on {len(res_df)} examples: {accuracy:.4f}')

# Preview of the rows the model got wrong
mismatches = res_df.loc[~res_df['correct']]
print('\nSample mismatches (up to 20):')
if mismatches.empty:
    print('No mismatches!')
else:
    display(mismatches.head(20))

# Per-class precision/recall/F1 over the normalised names (may be noisy with few examples).
# Every row is included here, including any whose prediction is 'unknown'.
# This report compares labels by exact equality, whereas the accuracy above also
# accepts partial matches via is_correct, so the two figures can disagree.
try:
    y_true = res_df['gold_norm']
    y_pred = res_df['pred_norm']
    print('\nClassification report (normalized names):')
    # zero_division=0 is passed so classes with no support / no predictions get 0.0
    print(classification_report(y_true, y_pred, zero_division=0))
except Exception as e:
    # Best-effort: the report is informational, so a failure is logged rather than raised.
    logger.warning('Could not compute classification report: %s', e)


Evaluating:   0%|          | 0/19 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Evaluating:   5%|▌         | 1/19 [00:00<00:17,  1.01it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Evaluating:  11%|█         | 2/19 [00:02<00:19,  1.12s/it]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Evaluating:  16%|█▌        | 3/19 [00:02<00:15,  1.05it/s]INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Evaluating:  21%|██        | 4/19 [00:03<00:13, 

Accuracy on 19 examples: 0.4737

Sample mismatches (up to 20):





Unnamed: 0,line,gold_character,raw_response,predicted_character,pred_norm,gold_norm,correct
1,"You were reckless, sentimental at best\n\nThat...",Athena,Character: M.K.,M.K.,mk,athena,False
2,"You were reckless, sentimental at best\n\nThat...",Odysseus,Character: Hera,Hera,hera,odysseus,False
3,Who are you?\n\nYou killed my sheep\n\nMy favo...,Polyphemus,Character: Cyclops,Cyclops,cyclops,polyphemus,False
4,Hold us down while the throne gets colder\n\nH...,Suitors,Character: Queen Tara,Queen Tara,queen tara,suitors,False
6,Odysseus of Ithaca\n\nDo you know who I am?,Poseidon,Character: Circe,Circe,circe,poseidon,False
8,"Captain, we will capsize with thes? waves, our...",Eurylochus,Character: Captain Nemo,Captain Nemo,captain nemo,eurylochus,False
10,"More than you know\n\nPenelope, I’ve told you ...",Odysseus,Character: Telemachus,Telemachus,telemachus,odysseus,False
12,"Six hundred men, six hundred men under my comm...",Polites,Character: Odysseus,Odysseus,odysseus,polites,False
13,"Alright, my brothers, listen closely\n\nTonigh...",Soldiers,Character: Achilles,Achilles,achilles,soldiers,False
14,This is the son of none other than Troy's very...,Ensemble,Character: Odysseus,Odysseus,odysseus,ensemble,False



Classification report (normalized names):
              precision    recall  f1-score   support

    achilles       0.00      0.00      0.00         0
      athena       0.00      0.00      0.00         1
captain nemo       0.00      0.00      0.00         0
       circe       0.50      1.00      0.67         1
     cyclops       0.00      0.00      0.00         0
    ensemble       0.00      0.00      0.00         1
  eurylochus       0.00      0.00      0.00         1
        hera       0.00      0.00      0.00         0
          mk       0.00      0.00      0.00         0
    odysseus       0.78      0.78      0.78         9
     polites       0.00      0.00      0.00         1
  polyphemus       0.00      0.00      0.00         1
    poseidon       1.00      0.50      0.67         2
  queen tara       0.00      0.00      0.00         0
    soldiers       0.00      0.00      0.00         1
     suitors       0.00      0.00      0.00         1
  telemachus       0.00      0.00     

In [15]:
# Save results and logs
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
OUT_PATH = '/home/ewu/Desktop/Fall 2025/ENGS108/project/analysis/results_langchain.csv'
# The summary is written next to the results file; deriving its path avoids
# repeating the directory literal.
OUT_DIR = os.path.dirname(OUT_PATH)
SUMMARY_PATH = os.path.join(OUT_DIR, 'results_summary.csv')

# Make sure the output directory exists before writing.
os.makedirs(OUT_DIR, exist_ok=True)

res_df.to_csv(OUT_PATH, index=False)
print('Saved results to', OUT_PATH)

# Also save a small one-row summary of the run
summary = {
    'model': MODEL_NAME,
    'n_examples': len(res_df),
    'accuracy': float(accuracy),
    'timestamp': pd.Timestamp.now().isoformat()
}
summary_df = pd.DataFrame([summary])
summary_df.to_csv(SUMMARY_PATH, index=False)
# Report the path actually written, mirroring the message for the results file.
print('Saved summary to', SUMMARY_PATH)

Saved results to /home/ewu/Desktop/Fall 2025/ENGS108/project/analysis/results_langchain.csv
Saved summary to project/analysis/results_summary.csv


In [17]:
# Rows the model classified correctly
matches = res_df[res_df['correct']]

# Leave the frame as the cell's last expression so the notebook renders it as a
# table (as the mismatches are) instead of the wrapped text that print() produces.
matches

                                                 line gold_character  \
0   Who, me?\n\nAll I did was reveal their true fo...          Circe   
5   Penelope\n\n(opens the door of the bedroom)\n\...       Odysseus   
7   These waves and tides\n\nHave grown in strengt...       Odysseus   
9   Let's cut this charade, you are no wife of min...       Odysseus   
11  Six hundred men, six hundred men under my comm...       Odysseus   
15  How has everything been turned against us?\n\n...       Odysseus   
16  Hermes?\n\nSo you're the one who talked to Cal...       Odysseus   
17  There you are, coward\n\nI've been waiting for...       Poseidon   
18  Wait\n\nStop this, please\n\nNO\n\nAren't you ...       Odysseus   

           raw_response predicted_character pred_norm gold_norm  correct  
0      Character: Circe               Circe     circe     circe     True  
5   Character: Odysseus            Odysseus  odysseus  odysseus     True  
7   Character: Odysseus            Odysseus  odysseus 