In [1]:
# @title [1] Setup: Install Libraries

!pip install openai pandas scikit-learn -q

In [None]:
# @title [2] Configuration: API Key, Model, and File Paths
# In this cell, we configure all the essential parameters for our experiment.
# You must replace the "API-KEY-HERE" placeholder below with your actual OpenRouter API key.

import json
import os
import random
import time

import openai
import pandas as pd
from google.colab import files
from sklearn.metrics import accuracy_score

# --- REQUIRED: SET YOUR API KEY ---
# You can get a key from https://openrouter.ai/keys
# Preferred: expose the key through the OPENROUTER_API_KEY environment variable
# so it is never saved inside the notebook. If that variable is unset or empty,
# the placeholder below is used and you must replace it with your actual key.
API_KEY_PLACEHOLDER = "API-KEY-HERE"
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY") or API_KEY_PLACEHOLDER

if OPENROUTER_API_KEY == API_KEY_PLACEHOLDER:
    # Make the misconfiguration visible here instead of only when requests are sent.
    print("WARNING: No OpenRouter API key configured. "
          "Set the OPENROUTER_API_KEY environment variable or replace the placeholder.")

# --- MODEL CONFIGURATION ---
# You can change this to any model available on OpenRouter.
# Find model names here: https://openrouter.ai/models
# Example: "anthropic/claude-3.5-sonnet", "google/gemini-pro-1.5", "mistralai/mistral-large"
MODEL_TO_TEST = "qwen/qwen3-235b-a22b-07-25:free"

# --- DATASET CONFIGURATION ---
# Path of the CSV file, relative to the notebook's working directory
# (in Colab: upload 'PREMOVE.csv' to the session's root directory).
DATASET_PATH = "PREMOVE.csv"

# --- EXPERIMENT CONFIGURATION ---
# Prompting strategies to test: strategy name -> number of few-shot examples.
SHOT_STRATEGIES = {
    "zero_shot": 0,
    "one_shot": 1,
    "two_shot": 2,
    "five_shot": 5,
    "ten_shot": 10,
}
# To run a quick test, you can limit the number of rows processed from the dataset.
# Set to None to process all rows.
MAX_ROWS_TO_PROCESS = 10  # For demonstration. Set to None for the full run.


# --- Initialize the OpenAI Client for OpenRouter ---
# The OpenAI client is pointed at OpenRouter's API endpoint via base_url.
client = openai.OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)

print("Configuration loaded successfully.")

Configuration loaded successfully.


In [None]:
# @title [3] Data Loading and Preparation
# This cell loads the PREMOVE.csv file into a pandas DataFrame,
# selects the necessary columns, and cleans it up for use.

try:
    # Columns needed for this task, as they are named in the CSV file.
    required_columns = ['VERB TOKEN', 'SENTENCE', 'PREVERB', 'PREVERB SEMANTICS']

    # Load, select, rename, and clean in a single chain so that no intermediate
    # frame is mutated in place and re-running the cell always starts from the file.
    df = (
        pd.read_csv(DATASET_PATH)[required_columns]
        # Rename columns to be more Python-friendly (no spaces)
        .rename(columns={
            'VERB TOKEN': 'verb_token',
            'SENTENCE': 'sentence',
            'PREVERB': 'preverb',
            'PREVERB SEMANTICS': 'ground_truth_semantics'
        })
        # Drop any rows where essential data is missing
        .dropna()
        # Re-number rows 0..n-1 after dropping
        .reset_index(drop=True)
    )

    # Limit the number of rows if specified for testing
    if MAX_ROWS_TO_PROCESS is not None:
        df = df.head(MAX_ROWS_TO_PROCESS)
        print(f"--- Using a limited dataset of {MAX_ROWS_TO_PROCESS} rows for this run. ---")

    print("Dataset loaded and prepared successfully.")
    print(f"Total rows to process: {len(df)}")
    print("\nFirst 5 rows of the prepared data:")
    display(df.head())

except FileNotFoundError:
    print(f"ERROR: The file '{DATASET_PATH}' was not found.")
    print("Please make sure you have uploaded the CSV file to your Colab environment.")
    # Re-raise so execution stops here; otherwise later cells would run with
    # `df` undefined or left over from an earlier run.
    raise
except Exception as e:
    print(f"An error occurred while loading the data: {e}")
    # Same reasoning as above: do not let later cells run on missing/stale data.
    raise

In [None]:
# @title [4] Core Functions: Prompting and API Interaction
# This cell contains the functions that form the backbone of our experiment.

def create_prompt(current_row, all_data, num_shots, random_state=None):
    '''
    Builds the chat messages for one dataset row, with optional few-shot
    examples embedded in the system message.

    Args:
        current_row (pd.Series): The row containing the question to ask. Its
            index label (`current_row.name`) is removed from the example pool.
        all_data (pd.DataFrame): The entire dataset to sample from for few-shot examples.
        num_shots (int): The number of examples to include (0 for zero-shot).
        random_state (optional): Forwarded to `DataFrame.sample` to make the
            choice of examples reproducible. The default (None) keeps the
            sampling unseeded, as before.

    Returns:
        list: A list of two message dictionaries (system, user) for the API call.

    Raises:
        ValueError: If `num_shots` exceeds the number of rows left in the pool
            after excluding the current row.
    '''
    # --- System Prompt Construction ---
    system_prompt_content = (
        "You are an expert in Classical Philology. Your task is to identify the semantic meaning of a given preverb within a sentence from Latin or Ancient Greek. Analyze the verb, sentence, and preverb provided. Respond with ONLY the English meaning and nothing else, using an English adverb or preposition or a multi-word expression."
    )

    if num_shots > 0:
        # Exclude the current row from the sampling pool so its own answer
        # can never appear among the examples.
        examples_pool = all_data.drop(current_row.name)
        if num_shots > len(examples_pool):
            raise ValueError(
                f"Cannot draw {num_shots} few-shot examples from a pool of "
                f"{len(examples_pool)} rows."
            )
        examples = examples_pool.sample(n=num_shots, random_state=random_state)

        example_texts = [
            f"Verb: {ex['verb_token']}\n"
            f"Sentence: \"{ex['sentence']}\"\n"
            f"Preverb: {ex['preverb']}\n"
            f"Meaning: {ex['ground_truth_semantics']}"
            for _, ex in examples.iterrows()
        ]

        system_prompt_content += "\n\nHere are some examples of the task:\n\n---\n"
        system_prompt_content += "\n---\n".join(example_texts)
        system_prompt_content += "\n---\n\nNow, perform the same analysis for the following case."

    # --- User Prompt Construction ---
    # The user message contains only the case to be analysed; it ends with an
    # open "Meaning:" field for the model to complete.
    user_prompt_content = (
        f"Verb: {current_row['verb_token']}\n"
        f"Sentence: \"{current_row['sentence']}\"\n"
        f"Preverb: {current_row['preverb']}\n"
        f"Meaning:"
    )

    # --- Final Message Assembly ---
    return [
        {"role": "system", "content": system_prompt_content},
        {"role": "user", "content": user_prompt_content},
    ]


def get_llm_response(messages, model, temperature=0.3, max_tokens=50):
    """
    Sends a chat-completion request through the module-level `client` and
    returns the model's cleaned-up answer.

    Args:
        messages (list): The list of message dictionaries for the prompt.
        model (str): The name of the model to query.
        temperature (float): Sampling temperature sent with the request.
            Defaults to 0.3; pass 0 to reduce sampling randomness.
        max_tokens (int): Upper bound on generated tokens. Defaults to 50
            because the expected answer is short.

    Returns:
        str: The response text with surrounding whitespace and all single and
        double quote characters removed, or the sentinel string "API_ERROR" if
        the request failed or the response carried no text.
    """
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        choices = completion.choices
        response_text = choices[0].message.content if choices else None
        if response_text is None:
            # Report this case explicitly instead of letting `.strip()` fail
            # with an unrelated-looking AttributeError message.
            print("  !! API Error: the response contained no text content.")
            return "API_ERROR"
        # Clean the output: remove extra whitespace and quotes.
        # NOTE(review): this also strips apostrophes inside the answer
        # (e.g. "one's" -> "ones") — confirm that is acceptable for matching.
        return response_text.strip().replace('"', '').replace("'", "")
    except Exception as e:
        # Best-effort by design: one failed request must not abort the whole run.
        print(f"  !! API Error: {e}")
        return "API_ERROR"

# Confirmation in the cell output that the function definitions above were executed.
print("Core functions updated and defined.")

In [None]:
# @title [5] Evaluation Engine: Run Experiments
# This is the main part of the notebook. It iterates through each prompting strategy,
# queries the model for every row, and evaluates the results.
# The results now include the 'verb_token' for more detailed analysis.

# Maps strategy name -> DataFrame of per-row results for that strategy.
all_results_dfs = {}

for strategy_name, num_shots in SHOT_STRATEGIES.items():
    print(f"\n{'='*50}")
    print(f"🚀 Starting evaluation for: {strategy_name.upper()} ({num_shots}-shot)")
    print(f"{'='*50}")

    # The few-shot examples are sampled from the other rows of `df`, so a
    # k-shot prompt needs more than k rows. This does not depend on the row,
    # so check it once per strategy instead of warning for every row.
    if num_shots > 0 and len(df) <= num_shots:
        print(f"  !! Warning: Not enough data ({len(df)} rows) to create a {num_shots}-shot prompt. Skipping.")
        print("\nNo results were generated for this strategy.")
        continue

    results_data = []

    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        print(f"  -> Processing row {index + 1}/{len(df)}...")

        # 1. Create the prompt using our unified function
        prompt_messages = create_prompt(row, df, num_shots)

        # 2. Get the LLM's prediction
        prediction = get_llm_response(prompt_messages, MODEL_TO_TEST)

        # Add a small delay to respect rate limits, if any
        time.sleep(1)

        # 3. Normalize ground truth and prediction for accurate comparison
        ground_truth = str(row['ground_truth_semantics']).lower().strip()
        prediction_normalized = prediction.lower().strip()

        # 4. Check if the prediction is correct (exact match after normalization)
        is_correct = (ground_truth == prediction_normalized)

        # 5. Store the results (including the verb_token)
        results_data.append({
            'verb_token': row['verb_token'],
            'sentence': row['sentence'],
            'preverb': row['preverb'],
            'ground_truth': row['ground_truth_semantics'],
            'llm_prediction': prediction,
            'is_correct': is_correct
        })

    # --- Analysis and Reporting for the current strategy ---
    if not results_data:
        print("\nNo results were generated for this strategy.")
        continue

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results_data)
    all_results_dfs[strategy_name] = results_df

    # Calculate accuracy (fraction of rows with an exact match)
    accuracy = results_df['is_correct'].mean()

    # Failed requests come back as the "API_ERROR" sentinel and are scored as
    # incorrect; count them so a low accuracy can be told apart from outages.
    api_error_count = int((results_df['llm_prediction'] == "API_ERROR").sum())

    print(f"\n--- 📊 Results for {strategy_name.upper()} ---")
    print(f"Model Tested: {MODEL_TO_TEST}")
    print(f"Total Items Evaluated: {len(results_df)}")
    print(f"Correct Predictions: {results_df['is_correct'].sum()}")
    print(f"Accuracy: {accuracy:.2%}")
    if api_error_count:
        print(f"API Errors (counted as incorrect): {api_error_count}")
    print("------------------------------------------\n")

    # Save results to a CSV file
    output_filename = f"results_{MODEL_TO_TEST.replace('/', '_')}_{strategy_name}.csv"
    results_df.to_csv(output_filename, index=False)
    print(f"✅ Detailed results saved to '{output_filename}'")

print("\n🎉 All evaluations complete!")

In [None]:
# @title [6] Download Result Files
# This cell provides the code to download the generated CSV files to your local machine.

if not all_results_dfs:
    # Nothing was evaluated, so there are no CSV files to fetch.
    print("No result files were generated to download.")
else:
    print("Preparing result files for download...")
    # The file names are rebuilt with the same pattern used when saving:
    # results_<model with '/' replaced by '_'>_<strategy>.csv
    model_slug = MODEL_TO_TEST.replace('/', '_')
    for strategy_name in all_results_dfs:
        filename = "results_" + model_slug + "_" + strategy_name + ".csv"
        print(f"  - Downloading {filename}...")
        files.download(filename)