# **FIQA Data Preprocessing**

**Part 1. Mount Google Drive**

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive.
# Any exception raised by the mount is caught and only printed, so execution
# continues past this cell even when the mount did not succeed.
try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Google Drive mount failed: {e}")

**Part 2. Load FIQA Dataset and Extract Targets**

In [None]:
import time
import re
import pandas as pd
import yfinance as yf

# Root folder on Google Drive that holds every input and output file
base_path = "/content/drive/My Drive/P2/"

# Location of the raw FIQA CSV
fiqa_path = f"{base_path}fiqa_1.csv"

# Read the raw FIQA records
df_fiqa = pd.read_csv(fiqa_path)

# Distinct targets (tickers or entity names), compared as whitespace-trimmed strings
target_column = df_fiqa["target"].astype(str)
tickers = target_column.str.strip().unique()

print(f"Total number of unique targets found: {len(tickers)}")
print(tickers.tolist())

**Part 3. Automatically Retrieve Sector Information Using Yahoo Finance**

In [None]:
# Store ticker-sector mapping: one {"ticker", "sector"} record per unique target
sector_data = []

print("\nFetching sector information automatically...\n")

for ticker in tickers:
    ticker_clean = ticker.upper()

    try:
        # Accessing .info performs the lookup; tolerate a missing/empty payload
        info = yf.Ticker(ticker_clean).info or {}
        sector = info.get("sector")
    except Exception as exc:
        # Best-effort lookup: keep going, but report why it failed so that
        # different failure causes can be told apart in the log
        print(f"{ticker_clean}: fetch failed ({type(exc).__name__}: {exc}), set to General")
        sector = "General"
    else:
        if not sector:
            # Absent, None or empty sector -> "General" placeholder, which the
            # manual-correction step treats as "still needs a sector"
            print(f"{ticker_clean}: sector not found, set to General")
            sector = "General"
        else:
            print(f"{ticker_clean}: {sector}")

    sector_data.append({"ticker": ticker_clean, "sector": sector})

    # Pause between consecutive lookups
    time.sleep(0.5)

**Part 4. Save Initial Ticker–Sector Mapping**

In [None]:
# Persist the automatically fetched ticker -> sector records as CSV
output_path = f"{base_path}ticker_sector.csv"
df_sector = pd.DataFrame(sector_data)
df_sector.to_csv(output_path, index=False)

print("\nInitial ticker_sector.csv generated:", output_path, sep="\n")

**Part 5. Load Mapping File and Define Manual Sector Corrections**

In [None]:
import numpy as np

# Reload the ticker-sector CSV from the base folder
df_old = pd.read_csv(f"{base_path}ticker_sector.csv")

print("\nPreview of original mapping data:", df_old.head(), sep="\n")

# Manual sector mapping dictionary.
# Keys are upper-cased targets exactly as they occur in the data (including
# misspellings and one mis-encoded variant of "DEUTSCHE BÖRSE"); values are the
# sector labels assigned when the automatic lookup left a row without a sector.
# Fix: "XLB" (Materials Select Sector SPDR) was mapped to "Energy"; it is now
# "Basic Materials".
manual_map = {
    #original
    "BBRY":"Technology","HK":"Energy","YHOO":"Technology","P":"Communication Services",
    "BERKSHIRE HATHAWAY INC.":"Financial Services","SAMARCO":"Basic Materials","SPY":"Financial Services",
    "PCLN":"Consumer Cyclical","ASOS PLC":"Consumer Cyclical","USO":"Energy","GLD":"Basic Materials",
    "SANOFI":"Healthcare","WYN":"Consumer Cyclical","INTEGRATED SILICON SOLUTION":"Technology",
    "SAB MILLER":"Consumer Defensive","KINGFISHER":"Consumer Cyclical","QQQ":"Financial Services", "STANCHART":"Financial Services",
    "ASTRAZENECA":"Healthcare","FB":"Communication Services","RBS":"Financial Services","GPS":"Consumer Cyclical",
    "ATVI":"Communication Services","INTERCONTINENTAL":"Consumer Cyclical","BIOC":"Healthcare","BUNZL":"Industrials",
    "ZSL":"Financial Services","EXXONMOBIL":"Energy","BHP BILLITON":"Basic Materials","CHINA MERCHANTS GROUP":"Basic Materials",
    "GERMANWINGS":"Industrials","ROYAL DUTCH SHELL":"Energy","SAUDI ARAMCO":"Energy","VMW":"Technology",
    "SAINSBURY":"Consumer Defensive","TESCO":"Consumer Defensive","YNDX":"Communication Services",
    "CNDO":"Healthcare","VVUS":"Healthcare","SABMILLER":"Consumer Defensive","ROYAL MAIL":"Industrials",
    "EASYJET":"Consumer Cyclical","M&S":"Consumer Cyclical","L&G":"Financial Services","ENDP":"Healthcare",
    "ASHTEAD":"Industrials","XLE":"Energy","STANDARD CHARTERED":"Financial Services",
    "QCOR":"Healthcare","SHIRE":"Healthcare","QIHU":"Technology","HIKMA":"Healthcare",
    "SENSEX":"Financial Services","NOVARTIS":"Healthcare","LLOYDS":"Financial Services","LUFTHANSA":"Industrials",
    "BONE":"Healthcare","MORRISSONS":"Consumer Defensive","JNUG":"Financial Services","GLAXOSMITHKLINE":"Healthcare",
    "TSPT":"Healthcare","STARWOOD":"Consumer Cyclical","TZA":"Financial Services","RIO TINTO":"Basic Materials",
    "HSC":"Industrials","SLW":"Basic Materials","SHELL":"Energy","LONDON STOCK EXCHANGE":"Financial Services",
    "FIO":"Technology","GLENCORE":"Basic Materials","ZAGG":"Technology","BARCLAYS":"Financial Services",
    "SPORTS DIRECT":"Consumer Cyclical","HCP":"Healthcare","ALTR":"Technology","INSIGHT":"Technology",
    "CAFN":"Technology","JNPR":"Technology","AER LINGUS":"Industrials","ABERDEEN AM":"Financial Services",
    "TALKTALK":"Communication Services","CNPC":"Energy","FRIENDS LIFE":"Financial Services","PRUDENTIAL":"Financial Services",
    "BWLD":"Consumer Cyclical","LONMIN":"Basic Materials","EFUT":"Technology","TWTR":"Communication Services",
    "XLF":"Financial Services","RDC":"Energy","SLV":"Basic Materials","LEHMAN":"Financial Services","PEARSON":"Communication Services",
    "HOME RETAIL GROUP":"Consumer Cyclical","STATOIL":"Energy","CHRM":"Healthcare","PERSSIMON":"Consumer Cyclical",
    "CAIXABANK":"Financial Services","RANGOLD":"Basic Materials","DIXONS CARPHONE":"Consumer Cyclical",
    "DEBENHAMS":"Consumer Cyclical","SODA":"Consumer Defensive","SKS":"Consumer Cyclical",
    "SOX":"Financial Services","SKH":"Healthcare","BURBERRY":"Consumer Cyclical","ZNGA":"Technology","CENTRICA":"Utilities",
    "LAZADA":"Consumer Cyclical","X":"Basic Materials","HOME RETAIL":"Consumer Cyclical","MEGGITT":"Industrials",
    "BOBE":"Consumer Cyclical","THETRAINLINE.COM":"Communication Services","BERKSHIRE":"Financial Services",
    "JOHNSON MATTHEY":"Basic Materials","GLAXO":"Healthcare","ENTERTAINMENT ONE":"Communication Services",
    "ZURICH INSURANCE":"Financial Services","SDS":"Financial Services","TOWERGATE":"Financial Services","GMCR":"Consumer Defensive",
    "AGU":"Basic Materials","ARIA":"Healthcare","TESCO PLC":"Consumer Defensive","DWA":"Financial Services","AVIVA":"Financial Services",
    "DIAGEO":"Consumer Defensive","ZS PHARMA":"Healthcare","WEIR":"Industrials","VALEANT":"Healthcare",
    "KRAFT":"Consumer Defensive","GOL":"Consumer Cyclical","STANDARD BANK":"Financial Services","AXDX":"Healthcare",
    "KINDER MORGAN":"Energy","DAIICHI SANKYO":"Healthcare","LNKD":"Technology","UBNT":"Technology",
    "BALFOUR BEATTY PLC":"Industrials","G4S":"Industrials","CTRP":"Consumer Cyclical","ARM HOLDINGS":"Technology",
    "AEGN":"Industrials","IMRS":"Healthcare","UUP":"Financial Services","MEDIACITYUK":"Communication Services","MILLERCOORS":"Consumer Defensive",
    "DIALOG":"Technology","PWC":"Financial Services","DET NORSKE":"Energy","SEVERN TRENT":"Utilities",
    "PERSIMMON":"Consumer Cyclical","SPX":"Industrials","ACTELION":"Healthcare","EXXON":"Energy","NQ":"Technology",
    "REED ELSEVIER":"Communication Services","OLD MUTUAL":"Financial Services","ASDA":"Consumer Defensive",
    "OCN":"Financial Services","RENN":"Technology","PETROFAC":"Energy","ITV":"Communication Services",
    "SOPHOS":"Technology","UNILEVER":"Consumer Defensive","CITI":"Financial Services","BXS":"Financial Services",
    "BLINKBOX":"Communication Services","IWM":"Financial Services","PERRIGO":"Healthcare","BBBY":"Consumer Defensive",
    "SHOR":"Consumer Cyclical","AMCN":"Communication Services","MFLX":"Technology","HZNP":"Healthcare",
    "STANDARD LIFE":"Financial Services","BG GROUP":"Energy","CELG":"Healthcare","OMNIS PHARMACEUTICALS":"Healthcare",
    "PRGN":"Healthcare","SKX":"Consumer Cyclical","INTERTEK":"Industrials","IMPERIAL TOBACCO":"Consumer Defensive",
    "CENTRICA PLC":"Utilities","RYANAIR":"Industrials","SINA":"Communication Services","RXII":"Healthcare",
    "MCP":"Basic Materials","TAKEDA":"Healthcare","ACOM":"Technology","ONEMAIN":"Financial Services",
    "SMH":"Financial Services","LAND SECURITIES":"Real Estate","FAZ":"Financial Services","MR BRICOLAGE":"Consumer Cyclical",
    "SCHRODERS":"Financial Services","FXE":"Financial Services","GTAT":"Technology","INOVIO":"Healthcare","SONC":"Consumer Cyclical",
    "TYC":"Industrials","HARGREAVES LANSDOWN":"Financial Services","DEERE":"Industrials",
    "BRITISH AMERICAN TOBACCO":"Consumer Defensive","LEMANN":"Financial Services","UVXY":"Financial Services",
    "MWW":"Industrials","VERIZON":"Communication Services","SWY":"Consumer Defensive","SAB":"Consumer Defensive",
    "CERN":"Healthcare","SAVE":"Consumer Cyclical","CTXS":"Technology","CREDIT SUISSE":"Financial Services",
    "WOLSELEY":"Industrials","KIOR":"Basic Materials","WX":"Healthcare","COUTTS":"Financial Services",
    "GREENE KING":"Consumer Defensive","BAT":"Consumer Defensive","JNK":"Financial Services","DEUTSCHE BÖRSE":"Financial Services",
    "CREE":"Technology","ATHN":"Technology","M&G":"Financial Services","INTERTEK GROUP":"Industrials",
    "YANG":"Financial Services","GKN":"Industrials","COH":"Consumer Cyclical","NSM":"Financial Services","SPPI":"Healthcare",
    "HDSI":"Technology","WILSHIRE BANCORP":"Financial Services","NIKKEI":"Financial Services","RSOL":"Energy",
    "WELLS FARGO":"Financial Services","PKT":"Technology","RNN":"Healthcare","DISH":"Communication Services",
    "RAD":"Consumer Defensive","MXWL":"Technology","UGAZ":"Financial Services","NIHD":"Communication Services","WAC":"Financial Services",
    "HORIZONTE":"Basic Materials","GAZPROM":"Energy","NUGT":"Financial Services","SSRI":"Basic Materials",
    "JV PARTNER":"Industrials","XLB":"Basic Materials","YOKU":"Communication Services","KIPA":"Consumer Defensive",
    "BRCM":"Technology","AUTO TRADER":"Consumer Cyclical","ASTX":"Healthcare","FMCN":"Communication Services",
    "WHITBREAD":"Consumer Cyclical","AXA":"Financial Services","KITD":"Communication Services",
    "LEGAL & GENERAL":"Financial Services","BERKSHIRE HATHAWAY":"Financial Services",
    "DBO":"Financial Services","ELN":"Healthcare","DARA":"Healthcare","OCWEN":"Financial Services",

    #update
    "MORRISONS": "Consumer Defensive",
    "SYMC": "Technology",
    "DEUTSCHE BÃ\x83Â¶RSE": "Financial Services",
    "IBB": "Healthcare",
    "TULLOW OIL": "Energy"
}

**Part 6. Apply Smart Sector Filling Logic**

In [None]:
# Smart sector filling logic
def smart_fill(row):
    """Return the sector for one mapping row, consulting ``manual_map`` when needed.

    A sector already present on the row is returned untouched unless it is
    missing or equals "general" (case-insensitive). Otherwise the stripped
    ticker is looked up in ``manual_map`` as written and then upper-cased.
    "General" is returned when neither lookup hits.
    """
    existing = row['sector']
    symbol = str(row['ticker']).strip()

    # Only rows without a real sector are candidates for manual correction
    needs_fill = pd.isna(existing) or str(existing).lower() == "general"
    if not needs_fill:
        return existing

    # Exact spelling first, upper-cased spelling second
    for candidate in (symbol, symbol.upper()):
        if candidate in manual_map:
            return manual_map[candidate]

    return "General"

# Announce the step, then recompute the sector column row by row with smart_fill
print("Applying manual sector corrections...")
df_old["sector"] = df_old.apply(smart_fill, axis=1)

**Part 7. Final Fallback Strategy for Unmatched Sectors**

In [None]:
# Flag every row whose sector is still the "General" placeholder
unknown_mask = df_old["sector"] == "General"

# Distinct tickers behind those rows
unknown_tickers = df_old.loc[unknown_mask, "ticker"].unique()
unknown_count = len(unknown_tickers)

if unknown_count == 0:
    print("All tickers have been assigned a sector.")
else:
    print(f"Warning: {unknown_count} tickers still unmatched. Assigning default sector.")

    print("Unmatched tickers:")
    for name in unknown_tickers:
        print(" -", name)

    # Last-resort default for every row that is still unmatched
    df_old.loc[unknown_mask, "sector"] = "Financial Services"

**Part 8. Save Final Sector Mapping**

In [None]:
# Save final sector mapping. The path is built from base_path (instead of a second
# hard-coded absolute path) so it stays in sync with the later cell that reloads
# base_path + "final_ticker_sector3.csv" for the merge.
save_path = base_path + "final_ticker_sector3.csv"
df_old.to_csv(save_path, index=False)

print(f"Final sector mapping saved to: {save_path}")

**Part 9. Merge Sector Information Back to FIQA Dataset**

In [None]:
# File locations used by the merge step
base_path = "/content/drive/My Drive/P2/"
fiqa_path = f"{base_path}fiqa_1.csv"
sector_path = f"{base_path}final_ticker_sector3.csv"
output_path = f"{base_path}fiqa_standardized.csv"

# Read both inputs from disk (FIQA first, then the sector mapping)
print("Loading datasets...")
df_fiqa, df_sector = (pd.read_csv(path) for path in (fiqa_path, sector_path))

print(f"FIQA dataset shape: {df_fiqa.shape}")
print(f"Sector mapping shape: {df_sector.shape}")

**Part 10. Standardize Keys and Perform Merge**

In [None]:
# Standardize keys for merge: both sides become trimmed, upper-cased strings
df_fiqa["target"] = df_fiqa["target"].astype(str).str.strip().str.upper()
df_sector["ticker"] = df_sector["ticker"].astype(str).str.strip().str.upper()

# Upper-casing can collapse targets that differed only by case into the same key,
# leaving several mapping rows per ticker. Keep one row per ticker so the left
# join below cannot multiply FIQA rows.
sector_lookup = df_sector[['ticker', 'sector']].drop_duplicates(subset='ticker', keep='first')

# Left join sector information to FIQA data; validate= makes pandas raise if the
# lookup side is ever not unique per key
df_merged = df_fiqa.merge(
    sector_lookup,
    left_on="target",
    right_on="ticker",
    how="left",
    validate="many_to_one"
)

# The join must preserve the FIQA row count exactly
assert len(df_merged) == len(df_fiqa), "merge changed the number of FIQA rows"

# Remove redundant ticker column
if 'ticker' in df_merged.columns:
    df_merged = df_merged.drop(columns=['ticker'])

**Part 11. Final Validation and Save Standardized Dataset**

In [None]:
# Rows whose target found no sector in the mapping
missing_rows = df_merged[df_merged['sector'].isna()]
missing_count = len(missing_rows)

if missing_count == 0:
    print("All records successfully matched with sector information.")
else:
    print(f"Warning: {missing_count} rows still missing sector information.")

    missing_tickers = missing_rows['target'].unique().tolist()
    print("Tickers with missing sector:")
    for name in missing_tickers:
        print(" -", name)

# Guarantee a sequential id column in first position
has_id = 'id' in df_merged.columns
if not has_id:
    df_merged.insert(0, 'id', range(len(df_merged)))

# Expose the text as `headline` for prompt usage when only `sentence` exists
if 'sentence' in df_merged.columns and 'headline' not in df_merged.columns:
    df_merged = df_merged.rename(columns={'sentence': 'headline'})

# Write the standardized FIQA dataset
df_merged.to_csv(output_path, index=False)

print("\nFinal standardized dataset saved to:")
print(output_path)

print("\nPreview of first 5 rows:")
print(df_merged[['id', 'target', 'sector', 'label']].head())

# **FIQA Experiment**

**Part 1. Environment Setup and Dependencies**

In [None]:
!pip install -qU google-genai tqdm pandas

# Colab-ready pipeline: sample run followed by batch run for FIQA with 9 prompts
# Notes:
# 1) The API key will be entered manually in Colab and is not stored in the code
# 2) A small sample is executed first before running the full dataset
# 3) Results are written incrementally to CSV to avoid data loss from interruptions
# 4) Concurrency, batch size, and retry behavior are configurable
# NOTE(review): -U may upgrade pandas after it has already been imported earlier in
# this notebook; a runtime restart may be needed for the new version to take
# effect. Confirm, or pin explicit versions.


**Part 2. Imports and Global Configuration**

In [None]:
import os
import time
import math
import json
import csv
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
from tqdm import tqdm
import pandas as pd

# Model identifier handed to the generate_content call
MODEL_NAME = "models/gemini-2.5-flash"

# Drive folder holding input and output files
BASE_PATH = "/content/drive/My Drive/P2/"

# Standardized FIQA data with the sector column attached
FIQA_PATH = f"{BASE_PATH}fiqa_standardized.csv"

# Result files: full run first, sample run second
OUTPUT_CSV, SAMPLE_OUTPUT_CSV = (
    f"{BASE_PATH}fiqa_results.csv",
    f"{BASE_PATH}fiqa_results_sample.csv",
)

**Part 3. Runtime Control Parameters**

In [None]:
# Runtime control parameters (tune these before running the experiment)
SAMPLE_SIZE = 50           # Number of news rows drawn for the initial sample run
BATCH_SIZE = 100           # News rows per batch in the full run (each row expands to one task per prompt)
MAX_WORKERS = 6            # Thread-pool size for concurrent model calls
MAX_RETRIES = 4            # Maximum number of attempts per API call
INITIAL_BACKOFF = 1.0      # Seconds slept after the first failed attempt; doubled after each further failure

# Reference cost for paid tier (used for estimation only)
# NOTE(review): not referenced by any other cell visible here; confirm it is still needed
PAID_INPUT_PRICE_PER_M = 0.50  # USD per 1M input tokens

**Part 4. Prompt Definitions (9 Prompt Strategies)**

In [None]:
# Prompt templates used in the experiment, keyed by prompt id ("<strategy>-<n>").
# Templates are filled with str.format using the placeholders {headline}, {target}
# and {sector}; a template only needs the placeholders it actually uses.
PROMPTS = {
    # Zero-Shot prompts
    # Note: ZS-1 contains no {headline} placeholder; safe_format_prompt appends a
    # 'Headline: "..."' line to templates that lack one.
    "ZS-1": "Evaluate the sentiment conveyed by the headline with respect to {target} from an investment perspective. Employ a three-tier scale: (-1) for Negative, (0) for Neutral, and (+1) for Positive. Assign Neutral if the headline is vague regarding {target}.\nConstraint: Answer with exactly one word: Positive, Negative, or Neutral.",
    "ZS-2": "Given the news related to the {sector} industry, classify the sentiment as Positive, Negative, or Neutral, based on the headline: \"{headline}\".\nConstraint: Answer with exactly one word: Positive, Negative, or Neutral.",
    "ZS-3": "Act as a sentiment analysis model trained on financial news headlines. Classify the sentiment of the headline: \"{headline}\".\nConstraint: Answer with exactly one word: Positive, Negative, or Neutral.",

    # Role-Play prompts
    "RP-1": "Act as an expert at stock trading holding {target}. Based only on the headline \"{headline}\", will you buy, sell or hold {target} in the short term?\nConstraint: Answer with exactly one word. Return 'Positive' for buy, 'Negative' for sell, or 'Neutral' for hold.",
    "RP-2": "Consider like an institutional investor, focusing on long-term, fundamental effects. Classify the sentiment of the following message regarding {target}: \"{headline}\".\nConstraint: Answer with exactly one word: Positive, Negative, or Neutral.",
    "RP-3": "Act as a financial expert. Classify the sentiment for {target} based only on the headline \"{headline}\".\nConstraint: Answer with exactly one word: Positive, Negative, or Neutral.",

    # Chain-of-Thought prompts (free-text reasoning followed by a "Sentiment: ..." line)
    "CoT-1": "I will offer you a news headline regarding {target}: \"{headline}\".\nPlease think step by step:\n1. Analyze the rationale and potential impact on the stock price.\n2. Identify the sentiment.\n\nConstraint: Return your response in this specific format:\nRationale: [Your reasoning]\nSentiment: [Positive, Negative, or Neutral]",
    "CoT-2": "Consider this message: \"{headline}\".\nStep 1: Identify any irrealis mood (uncertainty) or rhetorics (sarcasm, negative assertion) used in the text.\nStep 2: Based on Step 1, determine the sentiment regarding {target}.\n\nConstraint: Return your response in this specific format:\nAnalysis: [Your analysis]\nSentiment: [Positive, Negative, or Neutral]",
    "CoT-3": "Classify the sentiment of this news headline regarding {target}: \"{headline}\".\nLet's think step by step to determine if it is Positive, Negative, or Neutral.\n\nConstraint: At the end of your response, output the label in this format:\nSentiment: [Label]"
}

**Part 5. API Key Input and Client Initialization**

In [None]:
from google import genai

from getpass import getpass

# Request the API key without echoing it. input() would leave the typed key visible
# in the cell output (and therefore in the saved notebook); getpass() does not.
api_key = getpass("Enter your Google GENAI API Key: ").strip()
client = genai.Client(api_key=api_key)

**Part 6. Data Loading and Input Standardization**

In [None]:
df = pd.read_csv(FIQA_PATH)

# Normalise the target / ticker column: string, trimmed, upper case
df["target"] = df["target"].astype(str).str.strip().str.upper()

# Fall back to the `sentence` column when no `headline` column exists
if "sentence" in df.columns and "headline" not in df.columns:
    df = df.rename(columns={"sentence": "headline"})

# Headlines as trimmed strings
df["headline"] = df["headline"].astype(str).str.strip()

print(f"Data loading completed: {len(df)} rows")

# Sample Run

**Part 1. Response Parsing Function**

In [None]:
# Helper function: parse the model's raw response into pos / neg / neu
def clean_response_text(raw_text):
    """Map a raw model response to "pos", "neg", "neu" or "unknown".

    Resolution order:
    1. An explicit ``Sentiment: <label>`` marker, as requested by the
       chain-of-thought prompts. The LAST such marker wins, so a label word that
       merely appears inside the reasoning (or an echoed format line) cannot
       override the final answer.
    2. Otherwise a keyword search over the whole text: "positive", "negative",
       "neutral", then trading/score synonyms (bull/buy/+1, bear/sell/-1).

    Args:
        raw_text: Model output; any object is converted with ``str``. ``None``
            yields "unknown".

    Returns:
        One of "pos", "neg", "neu", "unknown".
    """
    import re  # local import so the function does not depend on another cell's imports

    if raw_text is None:
        return "unknown"
    txt = str(raw_text).strip().lower()

    # Allow a few punctuation/markup characters between the marker and the label,
    # e.g. "Sentiment: [Negative]" or "**Sentiment:** Negative"
    labelled = re.findall(r"sentiment\W{0,8}(positive|negative|neutral)\b", txt)
    if labelled:
        return {"positive": "pos", "negative": "neg", "neutral": "neu"}[labelled[-1]]

    if "positive" in txt:
        return "pos"
    if "negative" in txt:
        return "neg"
    if "neutral" in txt:
        return "neu"
    if "bull" in txt or "buy" in txt or "+1" in txt:
        return "pos"
    if "bear" in txt or "sell" in txt or "-1" in txt:
        return "neg"
    return "unknown"

**Part 2. Model Call with Retry (No Token Tracking)**

In [None]:
# Call the model with retry logic (no token tracking)
def _extract_response_text(response):
    """Pull the generated text out of a response object, trying several attribute layouts."""
    raw = None
    if hasattr(response, "output") and response.output:
        try:
            raw = ""
            for item in response.output:
                if isinstance(item, dict) and "content" in item:
                    for c in item["content"]:
                        if isinstance(c, dict) and "text" in c:
                            raw += c["text"]
                        elif isinstance(c, str):
                            raw += c
                elif isinstance(item, str):
                    raw += item
        except Exception:
            # Unexpected structure: fall back to the object's string form
            raw = str(response)

    elif hasattr(response, "output_text"):
        raw = response.output_text

    elif hasattr(response, "text"):
        raw = response.text

    else:
        raw = str(response)

    return raw


def call_model(prompt_text, model_name=MODEL_NAME, max_retries=MAX_RETRIES):
    """Send one prompt to the model, retrying with exponential backoff.

    Args:
        prompt_text: Fully formatted prompt passed as ``contents``.
        model_name: Model identifier passed as ``model``.
        max_retries: Maximum number of attempts.

    Returns:
        dict with "raw_text" (extracted response text, or None when every attempt
        failed) and "success" (True once a call and its text extraction completed
        without raising).
    """
    result = {"raw_text": None, "success": False}

    backoff = INITIAL_BACKOFF
    for attempt in range(1, max_retries + 1):
        try:
            response = client.models.generate_content(
                model=model_name,
                contents=prompt_text
            )

            # Extraction stays inside the try so a failure here is retried as well
            result["raw_text"] = _extract_response_text(response)
            result["success"] = True
            return result

        except Exception as e:
            if attempt == max_retries:
                # Last attempt: report it, but do not sleep for a retry that never happens
                print(f"[Attempt {attempt}] Call failed: {str(e)}. No retries left.")
                traceback.print_exc()
                break

            print(f"[Attempt {attempt}] Call failed: {str(e)}. Retrying after {backoff}s...")
            traceback.print_exc()
            time.sleep(backoff)
            backoff *= 2

    return result

**Part 3. Safe Prompt Formatting**

In [None]:
# Safely format prompt templates to prevent runtime failures
def safe_format_prompt(template, headline, target, sector):
    """Fill a prompt template with headline, target and sector.

    Templates lacking a ``{headline}`` placeholder first get a trailing
    ``Headline: "{headline}"`` line (after stripping trailing whitespace). If
    ``str.format`` raises, the (possibly extended) template is returned
    unformatted, followed by an unquoted ``Headline: ...`` line; no error is
    raised or logged.
    """
    has_placeholder = "{headline}" in template
    if not has_placeholder:
        template = template.rstrip() + '\nHeadline: "{headline}"'

    values = {"headline": headline, "target": target, "sector": sector}
    try:
        return template.format(**values)
    except Exception:
        # Minimal fallback: template as-is plus the headline
        return f"{template}\nHeadline: {headline}"

**Part 4. Single-Task Processing (One Result Row)**

In [None]:
# Process one inference task and return one result row for CSV writing
def process_one(item):
    """Run one inference task and return one result row for CSV writing.

    Args:
        item: dict with keys id, headline, target, sector, prompt_id,
            prompt_template.

    Returns:
        dict with keys id, prompt_id, headline, target, sector, prompt_text,
        raw_response, parsed_label, success.
    """
    # dict.get's default only applies when the key is absent, but task items always
    # carry a "sector" key (possibly empty or NaN). Apply the "Finance" default to
    # those empty values too, so prompts never read "the nan industry".
    sector = item.get("sector")
    is_nan = isinstance(sector, float) and sector != sector
    if sector is None or is_nan or not str(sector).strip():
        sector = "Finance"

    # Safely format the prompt
    prompt_text = safe_format_prompt(
        item["prompt_template"],
        headline=item["headline"],
        target=item["target"],
        sector=sector
    )

    # Call the model
    call_res = call_model(prompt_text)
    raw = call_res["raw_text"]

    # Parse the response
    label = clean_response_text(raw)

    return {
        "id": item.get("id"),
        "prompt_id": item.get("prompt_id"),
        "headline": item["headline"],
        "target": item["target"],
        # The stored sector is the original value, not the prompt-time default
        "sector": item.get("sector", ""),
        "prompt_text": prompt_text,
        "raw_response": raw,
        "parsed_label": label,
        "success": call_res["success"]
    }

**Part 5. Parallel Execution and Incremental CSV Writing**

In [None]:
# Run tasks in parallel and append results to CSV (no token tracking)
def run_tasks(df_tasks, out_csv, max_workers=MAX_WORKERS, show_progress=True):
    """Run every task in a thread pool and append each result row to ``out_csv``.

    The header row is written only when ``out_csv`` does not exist yet; result
    rows are appended one at a time as tasks finish, so completed work is already
    on disk if the run is interrupted.

    Args:
        df_tasks: DataFrame with columns id, headline, target, prompt_id,
            prompt_template and optionally sector.
        out_csv: Path of the CSV file to create or append to.
        max_workers: Thread-pool size.
        show_progress: When False the progress bar is suppressed.
    """
    header = [
        "id","prompt_id","headline","target","sector",
        "prompt_text","raw_response","parsed_label","success"
    ]

    if not os.path.exists(out_csv):
        with open(out_csv, "w", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(header)

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = []
        for _, row in df_tasks.iterrows():
            item = {
                "id": int(row["id"]),
                "headline": row["headline"],
                "target": row["target"],
                "sector": row.get("sector", ""),
                "prompt_id": row["prompt_id"],
                "prompt_template": row["prompt_template"]
            }
            futures.append(ex.submit(process_one, item))

        # The context manager closes the bar even if a task raises
        with tqdm(total=len(futures), disable=not show_progress) as pbar:
            for fut in as_completed(futures):
                res = fut.result()
                # Re-open per row so every finished result is persisted immediately
                with open(out_csv, "a", newline="", encoding="utf-8") as f:
                    csv.writer(f).writerow([res.get(c,"") for c in header])
                pbar.update(1)

**Part 6. Task Construction (News × Prompts)**

In [None]:
# Build the task list by expanding each news item across all prompt templates
def build_task_df(sub_df):
    """Cross every news row of ``sub_df`` with every template in ``PROMPTS``.

    Returns a DataFrame with one row per (news item, prompt) pair and the
    columns id, headline, target, sector, prompt_id, prompt_template. Rows are
    ordered by news item first, then by prompt in ``PROMPTS`` order.
    """
    task_rows = [
        {
            "id": int(news.get("id", "")),
            "headline": news["headline"],
            "target": news["target"],
            "sector": news.get("sector", ""),
            "prompt_id": prompt_id,
            "prompt_template": prompt_template,
        }
        for _, news in sub_df.iterrows()
        for prompt_id, prompt_template in PROMPTS.items()
    ]
    return pd.DataFrame(task_rows)


**Part 7. Sample Run Execution**

In [None]:
# Draw a reproducible sample (fixed random_state). The size is capped at the number
# of available rows because DataFrame.sample raises when asked for more rows than
# exist (without replacement).
sample_df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=42)
sample_tasks = build_task_df(sample_df)

print("Starting sample run...")
run_tasks(sample_tasks, SAMPLE_OUTPUT_CSV)
print("Sample run completed. Please review:", SAMPLE_OUTPUT_CSV)

# **Full Run**

**Full-Scale Inference with Batch Execution**

In [None]:
#Full run (batch execution)
def full_run(confirm=False):
    """Run all prompts over the whole dataset in batches of ``BATCH_SIZE`` news rows.

    Args:
        confirm: Safety switch. When False (default) only a reminder is printed
            and nothing is executed; pass True to start the run.

    Every batch appends its results to ``OUTPUT_CSV``.
    """
    if not confirm:
        print(
            "Please confirm the sample results first. "
            "Call full_run(confirm=True) to start the full-scale run."
        )
        return

    # Ensure an id column exists in the full dataset
    full_df = df.copy().reset_index(drop=True)
    if "id" not in full_df.columns:
        full_df.insert(0, "id", range(len(full_df)))

    # One task per (news, prompt) pair; computed arithmetically so the complete task
    # frame is not materialised just to report its length
    total_tasks = len(full_df) * len(PROMPTS)
    print(
        f"Total tasks: {total_tasks} "
        f"(news {len(full_df)} × prompts {len(PROMPTS)})"
    )

    # Split execution into batches based on news count (not task count)
    news_indices = list(range(0, len(full_df), BATCH_SIZE))
    batch_count = len(news_indices)
    print(
        f"Execution will be split into {batch_count} batches, "
        f"each containing up to {BATCH_SIZE} news items "
        f"(tasks per batch = news_in_batch × {len(PROMPTS)})"
    )

    for i, start in enumerate(news_indices):
        end = min(start + BATCH_SIZE, len(full_df))
        sub_news = full_df.iloc[start:end]
        sub_tasks = build_task_df(sub_news)

        # Append all batch results to the same output CSV
        out_csv_batch = OUTPUT_CSV

        print(
            f"\nStarting batch {i + 1}/{batch_count}: "
            f"processing news {start} to {end - 1} "
            f"(tasks {len(sub_tasks)})"
        )

        run_tasks(sub_tasks, out_csv_batch, max_workers=MAX_WORKERS)

        print(
            f"Batch {i + 1} completed. "
            f"Results appended to {out_csv_batch}"
        )

        # Short pause between batches to reduce rate-limit risk; pointless after the last one
        if i + 1 < batch_count:
            time.sleep(4)

    print("Full-scale run completed. Results saved to:", OUTPUT_CSV)

***Full Run Trigger***

In [None]:
# Starts the full-scale run as soon as this cell executes: confirm=True bypasses the
# "review the sample results first" guard inside full_run.
full_run(confirm=True)

# FIQA Evaluation

**Part 1. Imports**

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os


**Part 2. File Path Configuration**

In [None]:
base_path = "/content/drive/My Drive/P2/"
result_file = f"{base_path}fiqa_results.csv"              # Inference results produced by the experiment
ground_truth_file = f"{base_path}fiqa_standardized.csv"   # Dataset carrying the gold labels
output_file = f"{base_path}fiqa_metrics_summary.csv"      # Destination of the metrics table

**Part 3. Load Data and Merge with Ground Truth Labels**

In [None]:
if not (os.path.exists(result_file) and os.path.exists(ground_truth_file)):
    print("Error: File not found. Please check your paths.")
else:
    df_res, df_gold = pd.read_csv(result_file), pd.read_csv(ground_truth_file)

    print(f"Result file rows: {len(df_res)}")
    print(f"Ground truth rows: {len(df_gold)}")

    # Both id columns must share one integer dtype before joining
    for frame in (df_res, df_gold):
        frame['id'] = frame['id'].astype(int)

    # Attach the gold label (source column `label`) to every result row with a matching id
    df_final = pd.merge(
        df_res, df_gold[['id', 'label']], on='id', how='inner'
    ).rename(columns={'label': 'gold_label'})

**Part 4. Label Cleaning and Alignment**

In [None]:
    # Standard label mapping (to support multiple formats)
    label_map = {
        'pos': 'Positive', 'positive': 'Positive',
        'neg': 'Negative', 'negative': 'Negative',
        'neu': 'Neutral',  'neutral': 'Neutral'
    }

    # Normalize predictions and ground truth labels
    df_final['pred_clean'] = df_final['parsed_label'].astype(str).str.lower().map(label_map)
    df_final['gold_clean'] = df_final['gold_label'].astype(str).str.lower().map(label_map)

    # Only rows without a usable gold label are impossible to score
    df_clean = df_final.dropna(subset=['gold_clean']).copy()

    # A prediction that could not be parsed (e.g. "unknown") is a wrong answer, not a
    # missing sample. Dropping such rows would inflate the scores and evaluate each
    # prompt on a different subset, so they are kept under a placeholder class that
    # never matches a gold label.
    unparsed_mask = df_clean['pred_clean'].isna()
    df_clean.loc[unparsed_mask, 'pred_clean'] = 'Unknown'

    print(
        f"Valid evaluation samples after cleaning: {len(df_clean)} "
        f"(dropped {len(df_final) - len(df_clean)} invalid rows)"
    )
    print(f"Unparseable predictions counted as errors: {int(unparsed_mask.sum())}")

**Part 5. Compute Metrics per Prompt (Accuracy, Precision, Recall, F1)**

In [None]:
    metrics_data = []

    rule = "=" * 80
    print("\n" + rule)
    print(f"{'Prompt ID':<12} | {'Strategy':<10} | {'Accuracy':<10} | {'Precision':<10} | {'Recall':<10} | {'F1-Score':<10}")
    print(rule)

    # One metrics record per prompt_id
    for pid, rows in df_clean.groupby('prompt_id'):
        gold, predicted = rows['gold_clean'], rows['pred_clean']

        # Prefix before the first dash names the strategy (ZS, RP, CoT)
        strategy = pid.split('-')[0]

        acc = accuracy_score(gold, predicted)

        # Support-weighted averages, since the classes may be imbalanced;
        # the fourth return value (support) is not needed
        prec, rec, f1 = precision_recall_fscore_support(
            gold, predicted, average='weighted', zero_division=0
        )[:3]

        record = {
            "Prompt ID": pid,
            "Strategy": strategy,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1-Score": f1
        }
        metrics_data.append(record)

        print(f"{pid:<12} | {strategy:<10} | {acc:.4f}     | {prec:.4f}     | {rec:.4f}     | {f1:.4f}")

    print(rule)

**Part 6. Save Results and Select the Best Prompt**

In [None]:
    df_metrics = pd.DataFrame(metrics_data)

    if df_metrics.empty:
        # With no metric rows the frame has no "F1-Score" column, so sorting and
        # picking a best prompt would raise; report the situation instead.
        print("\nNo metrics were computed; nothing to save or rank.")
    else:
        # Rank prompts by F1-score
        df_metrics = df_metrics.sort_values(by="F1-Score", ascending=False)
        df_metrics.to_csv(output_file, index=False)

        print(f"\nMetrics summary saved to: {output_file}")

        # Select the best prompt (first row after the descending sort)
        best_prompt = df_metrics.iloc[0]
        print("\nBest prompt selected")
        print(f"ID: {best_prompt['Prompt ID']} ({best_prompt['Strategy']})")
        print(f"F1-Score: {best_prompt['F1-Score']:.4f}")
        print(f"Accuracy: {best_prompt['Accuracy']:.4f}")