# DSPy optimisation pipeline for UNFCCC obstruction detector
*Generated 2025-05-20*

This notebook loads your gold‐standard labels, splits them into train/dev/test, uses **DSPy** to choose the best subset of few‑shot demonstrations and prompt wording, evaluates accuracy/F1, and saves the resulting prompt as `best_prompt.json`.

> **Tip**  Run the cells top‑to‑bottom after installing the requirements.  
> Adjust file paths if your CSVs live elsewhere.

In [None]:
# ✱ 1  Install packages (skip if already installed)
!pip install --quiet dspy-ai pandas scikit-learn openai tiktoken

In [None]:
# ✱ 2  Imports & basic config
# Standard library and data stack (os/json/random, pandas, numpy).
import os, json, random, pandas as pd, numpy as np
# Splitting and evaluation helpers.
# NOTE(review): precision_recall_fscore_support is not used in the cells visible here.
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, precision_recall_fscore_support
import dspy
# NOTE(review): `Argument`, `Variable` and `dspy.optimizers.ExhaustiveFewShot` do not
# match the DSPy API I know (Signature / InputField / OutputField, optimizers under
# `dspy.teleprompt`). Confirm these names exist in the installed dspy-ai version;
# if they do not, this cell fails with ImportError before anything else runs.
from dspy import Argument, Variable, Predict
from dspy.optimizers import ExhaustiveFewShot

# OpenAI credentials: keep whatever is already exported in the environment and
# fall back to a placeholder string only when the variable is missing.
# NOTE(review): prefer exporting OPENAI_API_KEY outside the notebook over pasting
# a real key into this cell.
os.environ.setdefault('OPENAI_API_KEY', 'paste-your-key-here')

# Fix both global RNGs (NumPy and the stdlib) so sampling is repeatable.
np.random.seed(42)
random.seed(42)

In [None]:
# ✱ 3  Load labelled data and paragraph texts
LABEL_PATH   = 'manual_annotations.csv'  # path to file you downloaded earlier
PARAGRAPH_CSV = 'ENB_UNFCCC 1995-2024 - final_data_UNFCCC.csv'  # full corpus

df_labels = pd.read_csv(LABEL_PATH)
df_texts  = pd.read_csv(PARAGRAPH_CSV, usecols=['index', 'paragraph'])

# Attach the paragraph text to every labelled row.
# validate='many_to_one' makes pandas raise MergeError if the corpus contains the
# same `index` more than once; without it a left merge would silently duplicate
# labelled rows, and the duplicates could end up on both sides of the split.
df = df_labels.merge(df_texts, on='index', how='left', validate='many_to_one')
assert df.paragraph.notna().all(), "Some indices not found in paragraph CSV"

# Later cells treat every label other than 'y' as a negative, so fail early on
# missing or unexpected label values instead of mislabelling them silently.
assert df.obstruction.isin(['y', 'n']).all(), "Unexpected values in 'obstruction' (expected 'y' or 'n')"

print(f"{len(df)} total labelled rows  |  "
      f"{(df.obstruction=='y').sum()} positives, "
      f"{(df.obstruction=='n').sum()} negatives")

In [None]:
# ✱ 4  Train / dev / test split (80 / 10 / 10 stratified on obstruction)
# Step 1: hold out 10 % of all rows as the test set.
train_val, test = train_test_split(
    df,
    test_size=0.1,
    stratify=df.obstruction,
    random_state=42,
)
# Step 2: take 11.11 % of the remaining 90 % (about 10 % overall) as the dev set.
train, dev = train_test_split(
    train_val,
    test_size=0.1111,
    stratify=train_val.obstruction,
    random_state=42,
)

# Report the size and number of positives of each partition.
for name, part in (('train', train), ('dev', dev), ('test', test)):
    print(f"{name:>5}: {len(part):4} rows  |  {(part.obstruction=='y').sum()} positives")

# Helper: (paragraph text, gold label) tuples for DSPy, one list per partition
train_set = list(zip(train.paragraph, train.obstruction))
dev_set   = list(zip(dev.paragraph, dev.obstruction))
test_set  = list(zip(test.paragraph, test.obstruction))

In [None]:
# ✱ 5  Define DSPy program (prompt with variables for rules + demos)
# Candidate system prompts the optimiser chooses between.
# Every option must request the same output format: the evaluation cell compares
# the stripped, lower-cased reply with 'yes' / 'no', so a variant asking for JSON
# could never be scored as correct.
SYSTEM_RULES_OPTIONS = [
    """You are an expert analyst of negotiation tactics in UN climate talks. 
Answer strictly with 'yes' or 'no': does the paragraph show obstruction?""",

    """Classify whether the excerpt contains obstruction to UNFCCC goals. 
Answer strictly with 'yes' or 'no'.""",

    """Determine if this paragraph shows any obstruction tactic as per UNFCCC definitions 
and reply with 'yes' or 'no'."""
]

def make_demos(df_subset):
    """Turn labelled rows into few-shot demonstration records.

    Args:
        df_subset: DataFrame with a ``paragraph`` column and an ``obstruction``
            column.

    Returns:
        A list with one dict per row, in row order, of the form
        ``{"paragraph": <text>, "label": "yes" | "no"}``. The label is "yes"
        only when ``obstruction`` equals 'y'; every other value maps to "no".
    """
    return [
        {
            "paragraph": record.paragraph,
            "label": "no" if record.obstruction != 'y' else "yes",
        }
        for _, record in df_subset.iterrows()
    ]

# Precompute 40 candidate demo subsets of up to 20 demos each (balanced roughly)
N_DEMO_SUBSETS  = 40   # number of candidate subsets offered to the optimiser
DEMOS_PER_CLASS = 10   # target number of positives and of negatives per subset

positive_rows = train[train.obstruction=='y']
negative_rows = train[train.obstruction=='n']
demo_subsets = []
for i in range(N_DEMO_SUBSETS):
    # Cap both classes at the rows actually available: sampling without
    # replacement raises ValueError when asked for more rows than exist.
    # The per-iteration seeds (100+i / 200+i) give each subset a different but
    # reproducible draw.
    demos = pd.concat([
        positive_rows.sample(min(len(positive_rows), DEMOS_PER_CLASS), replace=False, random_state=100+i),
        negative_rows.sample(min(len(negative_rows), DEMOS_PER_CLASS), replace=False, random_state=200+i)
    ])
    demo_subsets.append(make_demos(demos))

# Classifier program: takes one paragraph and asks the chat model whether it
# shows obstruction. `rules` (system prompt) and `demos` (few-shot examples) are
# declared as Variables; the optimisation cell passes candidate values for both
# through its `argspace` argument.
# NOTE(review): `Argument`, `Variable` and `dspy.chat_completion` do not match the
# DSPy API I know (Signature / InputField / OutputField, `dspy.LM`) — confirm they
# exist in the installed dspy-ai version.
class ObstructionClassifier(Predict):
    # Input supplied per call.
    paragraph = Argument()
    # Slots filled in by the optimiser.
    rules     = Variable()
    demos     = Variable()

    # Sends `self.rules` as the system message, the paragraph wrapped in triple
    # quotes as the user message and `self.demos` as few-shot examples, with
    # temperature 0. Returns whatever `dspy.chat_completion` returns; the
    # evaluation cell reads a `.prediction` attribute from the program's result.
    # NOTE(review): 'gpt-4o-preview' does not look like a published OpenAI model
    # id (compare 'gpt-4o') — verify before running.
    def forward(self, paragraph):
        return dspy.chat_completion(
            model='gpt-4o-preview',
            system=self.rules,
            user=f"Paragraph:\n\"\"\"\n{paragraph}\n\"\"\"",
            few_shot=self.demos,
            temperature=0
        )


In [None]:
# ✱ 6  Optimise with ExhaustiveFewShot
_ANSWER_ALIASES = {'y': 'yes', 'n': 'no'}


def _to_answer(value):
    """Map a gold label or a model reply onto the shared 'yes'/'no' label space.

    Accepts a plain string or an object exposing a `.prediction` string (the
    attribute the evaluation cell reads from the program's result).
    """
    raw = getattr(value, 'prediction', value)
    text = str(raw).strip().strip('"\'.').lower()
    return _ANSWER_ALIASES.get(text, text)


def obstruction_f1(y_hat, y):
    """F1 of the positive ('yes') class.

    Args:
        y_hat: sequence of model outputs (strings or objects with `.prediction`).
        y: sequence of gold labels ('y' / 'n').

    Returns:
        F1 score for the 'yes' class as a float.

    The gold data uses 'y'/'n' while the prompts ask the model for 'yes'/'no';
    comparing them directly with pos_label='y' can never yield a true positive,
    so both sides are normalised first.
    """
    gold = [_to_answer(label) for label in y]
    # Anything that is not an explicit 'yes' is scored as 'no', so a malformed
    # reply cannot turn the target into a multiclass problem (f1_score raises
    # for multiclass input under its default average='binary').
    pred = ['yes' if _to_answer(p) == 'yes' else 'no' for p in y_hat]
    return f1_score(gold, pred, pos_label='yes')


prog = ObstructionClassifier()
opt  = ExhaustiveFewShot(max_demos=20)   # searches subsets up to 20

# NOTE(review): the metric keeps the original call contract (predictions first,
# gold second, both sequences) — confirm what the optimiser actually passes.
best = opt(
    program  = prog,
    argspace = {
        'rules': SYSTEM_RULES_OPTIONS,
        'demos': demo_subsets
    },
    trainset = train_set,
    devset   = dev_set,
    metric   = obstruction_f1
)

print("✓ optimisation finished")


In [None]:
# ✱ 7  Evaluate on held‑out test set
best_predictions = [best(paragraph=p).prediction.strip().lower() for p, _ in test_set]

# Gold labels are stored as 'y'/'n' but the model answers 'yes'/'no'. Map gold
# into the model's label space (same rule as make_demos); otherwise the report
# sees four distinct classes and no prediction can match its gold label.
gold = ['yes' if label == 'y' else 'no' for _, label in test_set]

# `labels` pins the class order so that it lines up with `target_names`, and
# keeps the report well-defined if the model emits something unexpected.
print(classification_report(gold, best_predictions, labels=['no', 'yes'], target_names=['no','yes']))

In [None]:
# ✱ 8  Save best prompt & demo list
prompt_bundle = {
    'system_rules': best['rules'],
    'demos': best['demos']
}
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened as UTF-8 explicitly; the platform default encoding (e.g. cp1252 on
# Windows) can raise UnicodeEncodeError on such text.
with open('best_prompt.json', 'w', encoding='utf-8') as fp:
    json.dump(prompt_bundle, fp, indent=2, ensure_ascii=False)

print("✓ saved best_prompt.json")

In [None]:
# ✱ 9  How to use the optimised prompt
# Show the first 1000 characters of the serialised bundle as a quick preview.
preview = json.dumps(prompt_bundle, indent=2)
print(f"{preview[:1000]} ...")