In [1]:

import os
import json
import numpy as np
import random
import time
import re
import tiktoken
import google.generativeai as genai
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from collections import defaultdict
from openai import OpenAI

# --- Configuration ---
# Tunable experiment constants. The feature-vector layout built further down
# (TOTAL_FEATURE_DIM) is derived from the two embedding sizes below.
BASE_FEATURE_DIM = 384  # Dimension for question embeddings; the question vector is used as-is, so it must equal the encoder's output size
ANSWER_EMBED_DIM = 384  # Dimension for answer embeddings (answer vectors are truncated/zero-padded to this size)
CASCADE_LENGTH = 5      # Number of attempts in the cascade (K); also the divisor used to normalize the step feature
UPDATE_FREQUENCY = 1    # Update JSON records after every question
USE_EMBEDDINGS = True   # Use embeddings instead of TF-IDF
ALPHA = 0.675           # Exploration parameter for LinUCB (multiplies the confidence term)
LAMBDA_REG = 0.45       # Regularization parameter for LinUCB matrix initialization (A starts as LAMBDA_REG * I)
TRAIN_RATIO = 0.2       # Reported as the train percentage in the summary; presumably the fraction of questions run in the "train" phase -- TODO confirm against the driver loop
BATCH_SIZE = 20         # Number of consecutive records grouped into one batch when per-batch metrics are computed

  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


In [None]:
# Read the credential from the environment so it is never pasted into (and saved
# with) the notebook. Falls back to an empty string, the previous default, when
# the variable is not set.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
# Base URL of the OpenRouter OpenAI-compatible endpoint.
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"

In [3]:
# Candidate LLMs (the bandit arms), keyed by OpenRouter model id.
# "input_cost" / "output_cost" are prices per single token: each value is a
# per-million-token price divided by 1e6, and the cost calculation multiplies
# them directly by token counts.
MODELS_CONFIG = {
    "arcee-ai/trinity-large-preview:free": {"input_cost": 0, "output_cost": 0},
    "mistralai/mistral-small-3.1-24b-instruct": {"input_cost": 0.05 / 1e6, "output_cost": 0.15 / 1e6},
    "microsoft/phi-4": {"input_cost": 0.07 / 1e6, "output_cost": 0.14 / 1e6},
    "meta-llama/llama-4-maverick": {"input_cost": 0.17 / 1e6, "output_cost": 0.16 / 1e6},
    "google/gemini-2.0-flash-001": {"input_cost": 0.1 / 1e6, "output_cost": 0.4 / 1e6},
    "deepseek/deepseek-chat": {"input_cost": 0.38 / 1e6, "output_cost": 0.89 / 1e6},
}



# Original models (kept for reference only; this set has a different number of
# arms, so it implies a different LLM_ID_DIM / TOTAL_FEATURE_DIM than the active
# configuration above).
# MODELS_CONFIG = {
#     "microsoft/phi-3.5-mini-128k-instruct": {"input_cost": 0.03 / 1e6, "output_cost": 0.09 / 1e6},
#     "mistralai/mistral-small-3.1-24b-instruct": {"input_cost": 0.05 / 1e6, "output_cost": 0.15 / 1e6},
#     "microsoft/phi-4": {"input_cost": 0.07 / 1e6, "output_cost": 0.14 / 1e6},
#     "meta-llama/llama-4-maverick": {"input_cost": 0.17 / 1e6, "output_cost": 0.16 / 1e6},
#     "google/gemini-2.0-flash-001": {"input_cost": 0.1 / 1e6, "output_cost": 0.4 / 1e6},
#     "openai/gpt-4.1-nano": {"input_cost": 0.1 / 1e6, "output_cost": 0.4 / 1e6},
#     "deepseek/deepseek-chat": {"input_cost": 0.38 / 1e6, "output_cost": 0.89 / 1e6},
# }

# NOTE(review): not referenced by the classes defined in this cell (grading there
# is exact-match / token-F1); confirm whether later cells still use it.
GRADER_MODEL_NAME = "google/gemini-2.0-flash-lite-001"

# Arm order defines the index of each model in the one-hot "last failed LLM" feature.
AVAILABLE_LLMS = list(MODELS_CONFIG.keys())
LLM_ID_DIM = len(AVAILABLE_LLMS)

# Feature dimensions are recalculated based on the number of available LLMs
# Context = [last failed answer embedding | one-hot of last failed LLM | mean of all failed answer embeddings]
CONTEXT_FEATURE_DIM = ANSWER_EMBED_DIM + LLM_ID_DIM + ANSWER_EMBED_DIM
# Full vector = [question features | normalized step (1 value) | context]
TOTAL_FEATURE_DIM = BASE_FEATURE_DIM + 1 + CONTEXT_FEATURE_DIM

# Initialize OpenRouter client
# (OpenRouter is reached through the OpenAI SDK by overriding base_url.)
openrouter_client = OpenAI(
    base_url=OPENROUTER_BASE_URL,
    api_key=OPENROUTER_API_KEY,
    # You might need to add default headers if required by your OpenRouter setup/account
    # default_headers={"HTTP-Referer": "YOUR_SITE_URL", "X-Title": "YOUR_APP_NAME"}
)

# File paths
INPUT_JSON = os.path.join("..", "Data", "HotpotQA.json")  # dataset location, relative to the notebook
RECORDS_PATH = "HP.json"        # per-question cascade records (JSON list)
LINUCB_MODEL_PATH = "HP.npz"    # saved LinUCB A/b matrices
SUMMARY_STATS_PATH = "HP.txt"   # presumably the text summary written by the analysis step -- TODO confirm
BACKUP_SUFFIX = ".bak"          # appended to a file name when the previous version is kept as a backup


class FeatureExtractor:
    """Builds the context vectors that the LinUCB model scores.

    Questions are embedded with a sentence-transformer model, or with TF-IDF
    followed by truncated SVD when embeddings are disabled or the embedding
    model cannot be loaded. ``construct_feature_vector`` appends the cascade
    history (step number, previous failed answers, last failed model).
    """

    def __init__(self, feature_dim=BASE_FEATURE_DIM, use_embeddings=USE_EMBEDDINGS):
        """Create the extractor.

        Args:
            feature_dim: Target size of the question feature vector. Only the
                TF-IDF path enforces it (by zero-padding); the embedding path
                returns whatever the encoder produces.
            use_embeddings: Try to load the sentence-transformer encoder. On any
                loading error the extractor silently switches to TF-IDF.
        """
        self.feature_dim = feature_dim
        self.use_embeddings = use_embeddings
        if use_embeddings:
            try:
                self.embedding_model = SentenceTransformer('BAAI/bge-small-en-v1.5')
                print("Initialized sentence transformer embedding model.")
            except Exception as e:
                print(f"Error initializing sentence transformer: {e}\nFalling back to TF-IDF.")
                self.use_embeddings = False
        if not self.use_embeddings:
            self.vectorizer = TfidfVectorizer()
            self.svd = None  # fitted in initialize()
        self.initialized = False

    def initialize(self, questions):
        """Fit the TF-IDF/SVD pipeline on ``questions`` (no-op fit for embeddings).

        Args:
            questions: Iterable of dicts with a ``"question"`` string.

        Must be called before ``extract_features`` in either mode.
        """
        if not self.use_embeddings:
            from sklearn.decomposition import TruncatedSVD
            all_text = [q["question"] for q in questions]
            dtm = self.vectorizer.fit_transform(all_text)
            # SVD cannot produce more components than there are vocabulary terms.
            n_components = min(self.feature_dim, dtm.shape[1])
            self.svd = TruncatedSVD(n_components=n_components)
            self.svd.fit(dtm)
            print(f"Using TF-IDF with SVD dimensionality reduction to {n_components} features.")
        self.initialized = True

    def extract_features(self, question):
        """Return the base feature vector for one question dict.

        Raises:
            ValueError: If ``initialize`` has not been called.
        """
        if not self.initialized:
            raise ValueError("Feature extractor not initialized. Call initialize() first.")
        text = question["question"]
        if self.use_embeddings:
            return self.embedding_model.encode([text])[0]

        tfidf_vector = self.vectorizer.transform([text])
        features = self.svd.transform(tfidf_vector)[0]
        # A small vocabulary yields fewer SVD components than feature_dim; pad with zeros.
        if len(features) < self.feature_dim:
            padding = np.zeros(self.feature_dim - len(features))
            features = np.concatenate([features, padding])
        return features

    def extract_answer_features(self, answer_text):
        """Embed an answer string into a vector of length ``ANSWER_EMBED_DIM``.

        Returns a zero vector for empty answers, when embeddings are disabled,
        or when the encoder raises.
        """
        if not answer_text:
            return np.zeros(ANSWER_EMBED_DIM)
        if not self.use_embeddings:
            # For simplicity, use zero vector if not using embeddings for answers
            return np.zeros(ANSWER_EMBED_DIM)
        try:
            features = self.embedding_model.encode([answer_text])[0]
            # Handle cases where embedding dimension might not match
            if len(features) > ANSWER_EMBED_DIM:
                features = features[:ANSWER_EMBED_DIM]
            elif len(features) < ANSWER_EMBED_DIM:
                padding = np.zeros(ANSWER_EMBED_DIM - len(features))
                features = np.concatenate([features, padding])
            return features
        except Exception as e:
            print(f"Error embedding answer: {e}")
            return np.zeros(ANSWER_EMBED_DIM)

    def construct_feature_vector(self, base_features, step_i, failed_answers, failed_llm_ids, model_name_to_index):
        """Construct the augmented feature vector for LinUCB.

        Layout: base question features, normalized step (``step_i / CASCADE_LENGTH``),
        embedding of the last failed answer, one-hot encoding of the last failed
        LLM, and the mean embedding of all previous failed answers. The history
        parts stay all-zero on the first step or when there is no history.

        Args:
            base_features: Question vector from ``extract_features``.
            step_i: 1-based cascade step.
            failed_answers: Answer strings of the previous failed attempts.
            failed_llm_ids: Model names of the previous failed attempts.
            model_name_to_index: Mapping from model name to one-hot position;
                unknown names leave the one-hot part all-zero.

        Raises:
            ValueError: If the assembled vector is not ``TOTAL_FEATURE_DIM`` long.
        """
        normalized_step = np.array([step_i / CASCADE_LENGTH])

        last_answer_features = np.zeros(ANSWER_EMBED_DIM)
        avg_answer_features = np.zeros(ANSWER_EMBED_DIM)
        if step_i > 1 and failed_answers:
            # Embed every failed answer exactly once and reuse the last one,
            # instead of running the encoder a second time for it.
            answer_features = [self.extract_answer_features(ans) for ans in failed_answers]
            last_answer_features = answer_features[-1]
            avg_answer_features = np.mean(answer_features, axis=0)

        last_llm_onehot = np.zeros(LLM_ID_DIM)
        if step_i > 1 and failed_llm_ids:
            last_llm_name = failed_llm_ids[-1]
            if last_llm_name in model_name_to_index:
                last_llm_onehot[model_name_to_index[last_llm_name]] = 1.0

        context_features = np.concatenate([last_answer_features, last_llm_onehot, avg_answer_features])
        augmented_features = np.concatenate([base_features, normalized_step, context_features])

        if augmented_features.shape[0] != TOTAL_FEATURE_DIM:
            raise ValueError(f"Constructed feature vector dimension {augmented_features.shape[0]} != expected {TOTAL_FEATURE_DIM}")
        return augmented_features


class LinUCBModel:
    """Disjoint LinUCB bandit with one (A, b) pair per candidate LLM."""

    def __init__(self, model_names, feature_dim=TOTAL_FEATURE_DIM, alpha=ALPHA, lambda_reg=LAMBDA_REG):
        """Create one arm per model name.

        Args:
            model_names: Arm identifiers; their order defines ``model_name_to_index``.
            feature_dim: Length of the context vectors passed to the model.
            alpha: Multiplier of the confidence (exploration) term.
            lambda_reg: Each arm's ``A`` starts as ``lambda_reg * I``.
        """
        self.model_names = model_names
        self.feature_dim = feature_dim
        self.alpha = alpha
        self.lambda_reg = lambda_reg
        self.model_name_to_index = {name: i for i, name in enumerate(model_names)}
        self.models = {
            model_name: {
                'A': np.identity(feature_dim) * lambda_reg,
                'b': np.zeros(feature_dim),
                'last_call_time': 0,
            } for model_name in model_names
        }

    def update_reward_only(self, model_name, feature_vector, reward):
        """Update one arm in place: ``A += x xᵀ`` and ``b += reward * x``."""
        model = self.models[model_name]
        model['A'] += np.outer(feature_vector, feature_vector)
        model['b'] += feature_vector * reward

    def calculate_ucb_scores(self, feature_vector):
        """Score every arm for the given context.

        Returns:
            dict mapping model name to ``{"p_ia", "e_ia", "ucb_score"}`` where
            ``p_ia`` is the estimated reward ``xᵀθ``, ``e_ia`` is
            ``alpha * sqrt(xᵀ A⁻¹ x)`` and ``ucb_score`` is their sum. An arm
            whose matrix cannot be factorised or inverted scores all zeros.
        """
        scores = {}
        for model_name in self.model_names:
            model = self.models[model_name]
            try:
                # With A = L Lᵀ and z = L⁻¹ x, ||z||² equals xᵀ A⁻¹ x, so the
                # confidence width is obtained without forming A⁻¹.
                L = np.linalg.cholesky(model['A'])
                theta = np.linalg.solve(model['A'], model['b'])
                z = np.linalg.solve(L, feature_vector)
                ucb_term = self.alpha * np.sqrt(np.sum(z**2))
                expected_reward = feature_vector.dot(theta)
                scores[model_name] = {
                    "p_ia": float(expected_reward),
                    "e_ia": float(ucb_term),
                    "ucb_score": float(expected_reward + ucb_term)
                }
            except np.linalg.LinAlgError:
                # Fallback to standard matrix inversion if Cholesky fails
                try:
                    A_inv = np.linalg.inv(model['A'])
                    theta = A_inv.dot(model['b'])
                    variance = float(feature_vector.dot(A_inv).dot(feature_vector))
                    # Round-off can push the quadratic form slightly below zero;
                    # clamp so the square root never yields NaN.
                    ucb_term = self.alpha * np.sqrt(max(variance, 0.0))
                    expected_reward = feature_vector.dot(theta)
                    scores[model_name] = {
                        "p_ia": float(expected_reward),
                        "e_ia": float(ucb_term),
                        "ucb_score": float(expected_reward + ucb_term)
                    }
                except np.linalg.LinAlgError as e:
                    # Only numerical failures are absorbed here; anything else
                    # (including KeyboardInterrupt) must propagate.
                    print(f"Could not score {model_name}: {e}")
                    scores[model_name] = {"p_ia": 0.0, "e_ia": 0.0, "ucb_score": 0.0}
        return scores

    def select_model_ucb(self, feature_vector):
        """Return ``(best_model_name, scores)``, or ``(None, {})`` when there are no arms.

        Ties are resolved in favour of the arm listed first in ``model_names``.
        """
        scores = self.calculate_ucb_scores(feature_vector)
        if not scores:
            return None, {}
        chosen_model = max(scores.items(), key=lambda x: x[1]["ucb_score"])[0]
        return chosen_model, scores

    def register_model_call(self, model_name):
        """Record the current wall-clock time as the arm's last call time."""
        self.models[model_name]['last_call_time'] = time.time()

    def respect_rate_limit(self, model_name):
        """Wait if necessary to respect a model's RPM if defined in config."""
        # For the specified OpenRouter models, 'rpm' is not defined, so this block will be skipped.
        model_cfg = MODELS_CONFIG.get(model_name)
        if model_cfg and "rpm" in model_cfg:
            model_state = self.models[model_name]
            min_seconds_between_calls = 60.0 / model_cfg["rpm"]
            time_since_last_call = time.time() - model_state['last_call_time']
            if time_since_last_call < min_seconds_between_calls:
                time.sleep(min_seconds_between_calls - time_since_last_call)

    def save_model_state(self, file_path):
        """Save every arm's ``A`` and ``b`` to a compressed ``.npz`` file.

        Arrays are stored under the keys ``A_<model name>`` and ``b_<model name>``.
        """
        save_dict = {}
        for model_name, model in self.models.items():
            save_dict[f'A_{model_name}'] = model['A']
            save_dict[f'b_{model_name}'] = model['b']
        np.savez_compressed(file_path, **save_dict)

    def load_model_state(self, file_path):
        """Load arm parameters saved by ``save_model_state``.

        Arms missing from the file keep their current state. Arms whose stored
        arrays do not match ``feature_dim`` are skipped with a warning, because
        the feature size depends on the number of configured models and a
        state file written under a different model list would otherwise break
        every later update and score.

        Returns:
            True if the file could be read, False on any error.
        """
        expected_a = (self.feature_dim, self.feature_dim)
        expected_b = (self.feature_dim,)
        try:
            # NpzFile keeps the archive open; the context manager closes it.
            with np.load(file_path) as loaded:
                for model_name, model in self.models.items():
                    a_key, b_key = f'A_{model_name}', f'b_{model_name}'
                    if a_key not in loaded or b_key not in loaded:
                        continue
                    A, b = loaded[a_key], loaded[b_key]
                    if A.shape != expected_a or b.shape != expected_b:
                        print(f"Skipping saved state for {model_name}: shapes {A.shape}/{b.shape} "
                              f"do not match feature_dim={self.feature_dim}")
                        continue
                    model['A'] = A
                    model['b'] = b
            print(f"Loaded LinUCB model state from {file_path}")
            return True
        except Exception as e:
            print(f"Error loading model state: {e}")
            return False


class BatchBudgetCascade:
    """Runs the LinUCB-driven cascade: pick a model, query it, grade, update, repeat."""

    def __init__(self, feature_extractor, linucb_model, cascade_length=CASCADE_LENGTH):
        """Store the collaborators.

        Args:
            feature_extractor: Object providing ``extract_features`` and
                ``construct_feature_vector``.
            linucb_model: Bandit providing selection, update and rate-limit hooks.
            cascade_length: Maximum number of attempts per question.
        """
        self.feature_extractor = feature_extractor
        self.linucb_model = linucb_model
        self.cascade_length = cascade_length

    def format_prompt(self, question, failed_answers=None, failed_llm_ids=None):
        """Generate a prompt for HotpotQA-style short-answer QA.

        When both history lists are non-empty, a note listing the previous
        incorrect attempts is appended.
        """
        prompt = "You are a question answering assistant.\n\n"
        prompt += "Answer the question with ONLY the final short answer.\n"
        prompt += "Do NOT provide reasoning, steps, or extra text.\n"
        prompt += "If the answer is yes/no, respond with exactly 'yes' or 'no'.\n"
        prompt += f"Question: {question['question']} \n"
        prompt += "Answer:"

        # NOTE(review): the history note is placed after the "Answer:" cue, so the
        # prompt no longer ends with the cue on retries. Left unchanged to keep
        # prompts identical to those behind existing records.
        if failed_answers and failed_llm_ids:
            prompt += "\n\nNote: The following previous attempts were incorrect. Please provide a different solution:\n"
            for i, answer_info in enumerate(failed_answers):
                prompt += f"- Attempt {i+1} (by {failed_llm_ids[i]}) led to: {answer_info}\n"
        return prompt

    def parse_llm_answer(self, answer_text):
        """Extract a short answer string from the model output.

        Returns a plain string (NOT a tuple) so downstream embeddings/history work.
        ``None`` or blank input yields ``""``.
        """
        if answer_text is None:
            return ""
        s = str(answer_text).strip()
        if not s:
            return ""

        # Prefer content after an 'Answer:' marker if present
        m = re.search(r"(?i)\banswer\s*:\s*(.*)", s)
        if m:
            s = m.group(1).strip()

        # Some models respond with JSON-ish snippets
        m_json = re.search(r'(?i)"answer"\s*:\s*"([^"]+)"', s)
        if m_json:
            s = m_json.group(1).strip()

        # Keep first non-empty line
        lines = [ln.strip() for ln in s.splitlines() if ln.strip()]
        s = lines[0] if lines else ""

        # Strip common wrappers
        s = s.strip(" \t\r\n\"'`“”‘’")
        if s.startswith('- '):
            s = s[2:].strip()
        return s

    def _coerce_to_text(self, ans):
        """Coerce different answer representations into a plain string.

        For tuples/lists the first non-blank string element wins; otherwise the
        first element is stringified (``""`` for an empty sequence).
        """
        if ans is None:
            return ""
        # Backward compatibility: older versions returned (parsed, raw)
        if isinstance(ans, (tuple, list)):
            for part in ans:
                if part is None:
                    continue
                if isinstance(part, str) and part.strip():
                    return part
            return str(ans[0]) if len(ans) > 0 else ""
        if isinstance(ans, str):
            return ans
        return str(ans)

    def normalize_hotpotqa_answer(self, s):
        """Normalize answers for HotpotQA-style EM matching (SQuAD-like).

        Lower-cases, keeps the first line, drops an ``answer:`` / ``final answer:``
        prefix, replaces punctuation with spaces, removes the articles a/an/the
        and collapses whitespace. Returns ``""`` when nothing is left.
        """
        if s is None:
            return ""
        s = self._coerce_to_text(s)
        s = s.strip()
        if not s:
            return ""
        # Drop common prefixes
        s = re.sub(r"^(answer|final answer)\s*:\s*", "", s, flags=re.IGNORECASE).strip()
        # An input that was only the prefix (e.g. "Answer:") is empty now;
        # "".splitlines() is [] and indexing it would raise IndexError.
        if not s:
            return ""
        # Keep first line to reduce extra chatter
        s = s.splitlines()[0].strip()
        # Strip surrounding quotes/backticks
        s = s.strip(" \t\r\n\"'`“”‘’")
        s = s.lower()
        # Normalize yes/no variants early
        if s in {"yes.", "yes!", "yes,"}:
            s = "yes"
        if s in {"no.", "no!", "no,"}:
            s = "no"
        # Remove punctuation (unicode-aware) -> space
        s = re.sub(r"[^\w\s]", " ", s, flags=re.UNICODE)
        # Remove articles
        s = re.sub(r"\b(a|an|the)\b", " ", s)
        # Collapse whitespace
        s = " ".join(s.split())
        return s

    def hotpotqa_f1(self, pred, gold):
        """Token-level F1 (SQuAD-style) over normalized answers.

        Returns 0.0 when either side normalizes to nothing or no token overlaps.
        """
        pred_norm = self.normalize_hotpotqa_answer(pred)
        gold_norm = self.normalize_hotpotqa_answer(gold)
        pred_tokens = pred_norm.split() if pred_norm else []
        gold_tokens = gold_norm.split() if gold_norm else []
        if not pred_tokens or not gold_tokens:
            return 0.0

        pred_counts = {}
        for t in pred_tokens:
            pred_counts[t] = pred_counts.get(t, 0) + 1
        gold_counts = {}
        for t in gold_tokens:
            gold_counts[t] = gold_counts.get(t, 0) + 1

        # Multiset intersection: a token counts as often as it appears on both sides.
        num_same = sum(min(pc, gold_counts.get(t, 0)) for t, pc in pred_counts.items())

        if num_same == 0:
            return 0.0
        precision = num_same / len(pred_tokens)
        recall = num_same / len(gold_tokens)
        return (2 * precision * recall) / (precision + recall)

    def grade_with_gemma12b(self, llm_answer, ground_truth_answer):
        """HotpotQA grading: normalized Exact Match (EM) OR token-level F1 >= 0.8.

        NOTE: Kept the old method name for minimal downstream changes; no model
        is called here. Returns False when either side normalizes to ``""``.
        """
        pred = self.normalize_hotpotqa_answer(llm_answer)
        gold = self.normalize_hotpotqa_answer(ground_truth_answer)
        if not pred or not gold:
            return False
        if pred == gold:
            return True
        return self.hotpotqa_f1(pred, gold) >= 0.8

    def calculate_token_cost(self, model_name, prompt, response_text, usage_info=None):
        """Calculate the cost of one call.

        Uses the token counts in ``usage_info`` when given; otherwise estimates
        them as ``len(text) // 4`` and attaches a warning.

        Returns:
            dict with ``"total_cost"`` and, when something was off, ``"error"``.
            Models missing from ``MODELS_CONFIG`` cost 0.0.
        """
        total_cost = 0.0
        error_message = None
        if model_name in MODELS_CONFIG:
            model_cfg = MODELS_CONFIG[model_name]
            if usage_info:
                # A provider may report a count as null; treat that as zero
                # instead of failing on None * float.
                input_tokens = usage_info.get("prompt_tokens", 0) or 0
                output_tokens = usage_info.get("completion_tokens", 0) or 0
                total_cost = (input_tokens * model_cfg["input_cost"]) + (output_tokens * model_cfg["output_cost"])
            else:
                error_message = f"Usage info not available for {model_name}. Cost is a rough estimate."
                input_tokens = len(prompt) // 4
                output_tokens = len(response_text) // 4 if response_text else 0
                total_cost = (input_tokens * model_cfg["input_cost"]) + (output_tokens * model_cfg["output_cost"])
        else:
            error_message = f"Model {model_name} not found in config for cost calculation."
        result = {"total_cost": total_cost}
        if error_message:
            result["error"] = error_message
            print(f"Cost calculation warning for {model_name}: {error_message}")
        return result

    def _register_call(self, model_name):
        """Record the call time for arms the bandit knows; ignore unknown names."""
        if model_name in self.linucb_model.models:
            self.linucb_model.register_model_call(model_name)

    def query_llm(self, model_name, prompt):
        """Query the specified LLM and return its response and cost.

        Returns:
            ``(raw_text, parsed_answer, cost_data)``. On any error the result is
            ``("", None, estimated_cost_data)`` and the error is printed.
        """
        answer_text, parsed_answer, cost_data = "", None, {}
        try:
            self.linucb_model.respect_rate_limit(model_name)
            if model_name not in MODELS_CONFIG:
                raise ValueError(f"Model {model_name} is not configured in MODELS_CONFIG.")
            api_response = openrouter_client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
            )
            # The message content may be None (e.g. an empty completion). Treat
            # it as an empty answer so the usage reported for the call is still
            # used for costing instead of being replaced by an estimate.
            content = api_response.choices[0].message.content
            answer_text = (content or "").strip()
            usage_info = api_response.usage.model_dump() if api_response.usage else None
            cost_data = self.calculate_token_cost(model_name, prompt, answer_text, usage_info=usage_info)
            self._register_call(model_name)
            parsed_answer = self.parse_llm_answer(answer_text)
            return answer_text, parsed_answer, cost_data
        except Exception as e:
            print(f"Error querying LLM {model_name}: {e}")
            # Guarded registration: an unknown model name must not raise a
            # second exception from inside the error handler.
            self._register_call(model_name)
            cost_data = self.calculate_token_cost(model_name, prompt, "", usage_info=None)
            return "", None, cost_data

    def run_cascade_single_question(self, question):
        """Run the standard LinUCB cascade for a single question.

        For up to ``cascade_length`` steps: build the context vector, let LinUCB
        choose a model, query it, grade the answer, update the chosen arm with a
        0/1 reward and stop at the first correct answer.

        Args:
            question: dict with ``"question"`` and ``"ground_truth_answer"``;
                ``unique_id`` / ``subject`` / ``level`` are copied if present.

        Returns:
            Record dict with the final status, total cost and per-attempt log.
        """
        base_features = self.feature_extractor.extract_features(question)
        failed_answers, failed_llm_ids, current_attempts_log = [], [], []
        question_total_cost = 0.0
        final_status = "Failure"

        for i in range(1, self.cascade_length + 1):
            print(f"Step {i}")
            # 1. Construct feature vector with history
            x_i = self.feature_extractor.construct_feature_vector(base_features, i, failed_answers, failed_llm_ids, self.linucb_model.model_name_to_index)
            # 2. Select model based on UCB score
            chosen_model, scores = self.linucb_model.select_model_ucb(x_i)
            if not chosen_model:
                print("Error: LinUCB failed to select a model.")
                break
            print(f"Selected: {chosen_model}, UCB: {scores[chosen_model]['ucb_score']:.4f}")
            # 3. Query the chosen model
            prompt = self.format_prompt(question, failed_answers, failed_llm_ids)
            raw_response, model_answer, cost_data = self.query_llm(chosen_model, prompt)
            actual_cost = cost_data.get("total_cost", 0.0)
            question_total_cost += actual_cost
            # 4. Grade the answer
            is_correct = self.grade_with_gemma12b(model_answer, question['ground_truth_answer'])
            reward = 1 if is_correct else 0
            print(f"Answer: {model_answer}, Correct: {is_correct}, Cost: ${actual_cost:.8f}")
            # 5. Update LinUCB model (a failed API call is also recorded as reward 0)
            self.linucb_model.update_reward_only(chosen_model, x_i, reward)
            # 6. Log the attempt
            current_attempts_log.append({
                "step": i, "chosen_model": chosen_model, "chosen_model_cost": cost_data,
                "llm_answer": model_answer, "is_correct": is_correct, "reward_ri": reward,
                "raw_response": raw_response, "scores_per_arm": scores
            })
            # 7. Check for success
            if is_correct:
                final_status = "Success"
                print(f"Success in step {i}!")
                break
            else:
                failed_answers.append(model_answer if model_answer else "ParsingFailed/NoAnswer")
                failed_llm_ids.append(chosen_model)

        return {
            "question": question["question"],
            "ground_truth_answer": question["ground_truth_answer"],
            "unique_id": question.get("unique_id"),
            "subject": question.get("subject"),
            "level": question.get("level"),
            "final_status": final_status,
            "total_cost": question_total_cost,
            "steps_taken": len(current_attempts_log),
            "attempts": current_attempts_log
        }

       

In [4]:

def load_math500_dataset(json_path, split_key="test"):
    """
    Load and process the HotpotQA dataset (kept function name for compatibility).

    Expected JSON format:
      {"test": [{"problem": "Question: ...", "answer": "...", ...}, ...]}

    A top-level list of items is accepted as well. If ``split_key`` is missing,
    the first of "test", "dev", "validation", "train" that exists is used, and
    the log line names the split that was actually loaded.

    Per item, the question text is read from "problem" (falling back to
    "question" when "problem" is absent) and a leading "Question:" prefix is
    removed. Non-string question/answer values are converted with ``str`` so a
    numeric answer neither aborts the load nor collapses to an empty string.

    Returns a list of dicts with keys:
      - question
      - options (always an empty list)
      - ground_truth_answer
      - subject / level / unique_id ("Unknown" when absent)

    Any error is printed and an empty list is returned.
    """
    def _as_text(value):
        # None means "missing"; everything else is rendered as stripped text.
        return "" if value is None else str(value).strip()

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        used_split = split_key
        if isinstance(raw_data, dict):
            items = raw_data.get(split_key)
            if items is None:
                # Fallback: if split_key doesn't exist, try common alternatives
                for candidate in ("test", "dev", "validation", "train"):
                    if candidate in raw_data:
                        items = raw_data[candidate]
                        used_split = candidate
                        break
            if items is None:
                raise KeyError(f"No split found in dataset JSON. Tried '{split_key}' and common keys.")
        elif isinstance(raw_data, list):
            # Rare case: dataset is directly a list
            items = raw_data
            used_split = None
        else:
            raise TypeError(f"Unexpected dataset JSON type: {type(raw_data)}")

        processed_dataset = []
        for item in items:
            raw_problem = item.get("problem")
            if raw_problem is None:
                raw_problem = item.get("question")
            problem = _as_text(raw_problem)
            # HotpotQA-style export often prefixes the question with "Question:"
            if problem.lower().startswith("question:"):
                problem = problem.split(":", 1)[1].strip()

            processed_dataset.append({
                "question": problem,
                "options": [],
                "ground_truth_answer": _as_text(item.get("answer")),
                "subject": item.get("subject", "Unknown"),
                "level": item.get("level", "Unknown"),
                "unique_id": item.get("unique_id", "Unknown"),
            })

        print(f"Loaded {len(processed_dataset)} questions from {json_path} (split={used_split})")
        return processed_dataset

    except Exception as e:
        print(f"Error loading HotpotQA dataset: {e}")
        return []

def save_records_with_backup(records, json_path):
    """Save records to a JSON file, keeping the previous file as a backup.

    The data is first serialised to ``json_path + ".tmp"``. Only after that
    succeeds is the existing file moved to ``json_path + BACKUP_SUFFIX`` and
    the temporary file moved into place. A failed serialisation (for example a
    non-JSON-serialisable value inside ``records``) therefore leaves both the
    current file and the existing backup untouched.

    Args:
        records: JSON-serialisable object to write.
        json_path: Destination path.

    Returns:
        True when the new file is in place, False otherwise.
    """
    backup_path = json_path + BACKUP_SUFFIX
    tmp_path = json_path + ".tmp"

    # 1. Serialise to a temporary file so a failure cannot damage existing data.
    try:
        with open(tmp_path, 'w') as f:
            json.dump(records, f, indent=4)
    except Exception as e:
        print(f"Error saving records: {e}")
        try:
            os.remove(tmp_path)
        except OSError:
            # Best-effort cleanup: the temp file may never have been created.
            pass
        return False

    # 2. Keep the previous version as the backup.
    moved_to_backup = False
    if os.path.exists(json_path):
        try:
            os.replace(json_path, backup_path)
            moved_to_backup = True
        except Exception as e:
            print(f"Failed to create backup: {e}")

    # 3. Move the new file into place.
    try:
        os.replace(tmp_path, json_path)
        return True
    except OSError as e:
        print(f"Error saving records: {e}")
        # Try to restore from backup if the swap failed
        if moved_to_backup and not os.path.exists(json_path):
            try:
                os.replace(backup_path, json_path)
            except OSError as restore_error:
                print(f"Failed to restore backup: {restore_error}")
        return False

def initialize_json_files():
    """Make sure RECORDS_PATH exists and contains a JSON list.

    - Missing file: created with an empty list.
    - Existing file holding a JSON list: left untouched.
    - Existing file that cannot be decoded or holds a non-list value: moved to
      ``RECORDS_PATH + BACKUP_SUFFIX`` (overwriting an older backup) and replaced
      by an empty list.
    """
    def _write_empty_records():
        with open(RECORDS_PATH, 'w') as f:
            json.dump([], f)  # Empty array

    if not os.path.exists(RECORDS_PATH):
        _write_empty_records()
        return

    # Validate existing file
    try:
        with open(RECORDS_PATH, 'r') as f:
            data = json.load(f)
        is_valid = isinstance(data, list)
    except (json.JSONDecodeError, UnicodeDecodeError):
        is_valid = False

    if not is_valid:
        # os.replace overwrites an existing backup on every platform, whereas
        # os.rename raises on Windows when the destination already exists.
        os.replace(RECORDS_PATH, RECORDS_PATH + BACKUP_SUFFIX)
        print(f"Invalid records file moved to {RECORDS_PATH + BACKUP_SUFFIX}; starting with an empty list.")
        _write_empty_records()

def calculate_metrics(records):
    """Helper function to calculate metrics for a subset of records.

    Args:
        records: List of cascade record dicts with "final_status",
            "steps_taken", "total_cost" and "attempts" (each attempt carrying
            "step", "chosen_model", "is_correct" and "chosen_model_cost").

    Returns:
        dict with overall counts, rates and costs, success counts per cascade
        position (0-indexed list of length CASCADE_LENGTH), and per-model
        statistics including a per-step breakdown under "by_position".

    Models that appear in the records but are not in AVAILABLE_LLMS (for
    example records written under an earlier model list) get their own entry
    instead of raising KeyError.
    """
    if not records:
        return {
            "total_questions": 0,
            "successful_questions": 0,
            "success_rate": 0,
            "total_steps": 0,
            "avg_steps": 0,
            "total_cost": 0,
            "avg_cost": 0,
            "avg_cost_success": 0,
            "successes_by_position": [0] * CASCADE_LENGTH,
            "per_position_success": [0] * CASCADE_LENGTH,
            "model_metrics": {}
        }

    def _new_counters():
        return {"calls": 0, "successes": 0, "total_cost": 0.0}

    def _new_model_entry():
        entry = _new_counters()
        # Positions are created lazily the first time a step is seen.
        entry["by_position"] = defaultdict(_new_counters)
        return entry

    def _finalise(counters):
        # Derive rate/average in place; zero calls yields zeros.
        calls = counters["calls"]
        counters["success_rate"] = counters["successes"] / calls if calls > 0 else 0
        counters["avg_cost"] = counters["total_cost"] / calls if calls > 0 else 0

    total_questions = len(records)  # > 0 here because of the early return above
    successful_records = [r for r in records if r["final_status"] == "Success"]
    successful_questions = len(successful_records)
    success_rate = successful_questions / total_questions

    # Calculate success by position
    successes_by_position = [0] * CASCADE_LENGTH
    for record in successful_records:
        position = len(record["attempts"]) - 1  # 0-indexed
        # The lower bound keeps a record without attempts (position -1) from
        # being counted in the last slot through negative indexing.
        if 0 <= position < CASCADE_LENGTH:
            successes_by_position[position] += 1

    per_position_success = [count / total_questions for count in successes_by_position]

    # Calculate average steps and costs
    total_steps = sum(r["steps_taken"] for r in records)
    total_cost = sum(r["total_cost"] for r in records)
    avg_steps = total_steps / total_questions
    avg_cost = total_cost / total_questions

    # Calculate average cost for successful questions only
    if successful_questions > 0:
        success_cost = sum(r["total_cost"] for r in successful_records)
        avg_cost_success = success_cost / successful_questions
    else:
        avg_cost_success = 0

    # Analyze model performance
    model_metrics = {model: _new_model_entry() for model in AVAILABLE_LLMS}

    for record in records:
        for attempt in record["attempts"]:
            position = attempt["step"]
            model = attempt["chosen_model"]
            is_correct = attempt["is_correct"]
            cost = (attempt.get("chosen_model_cost") or {}).get("total_cost", 0.0)

            if model not in model_metrics:
                model_metrics[model] = _new_model_entry()
            stats = model_metrics[model]

            # Update the overall and the position-specific counters together.
            for counters in (stats, stats["by_position"][position]):
                counters["calls"] += 1
                counters["total_cost"] += cost
                if is_correct:
                    counters["successes"] += 1

    # Calculate success rates and average costs for models
    for data in model_metrics.values():
        _finalise(data)
        for pos_data in data["by_position"].values():
            _finalise(pos_data)

    return {
        "total_questions": total_questions,
        "successful_questions": successful_questions,
        "success_rate": success_rate,
        "total_steps": total_steps,
        "avg_steps": avg_steps,
        "total_cost": total_cost,
        "avg_cost": avg_cost,
        "avg_cost_success": avg_cost_success,
        "successes_by_position": successes_by_position,
        "per_position_success": per_position_success,
        "model_metrics": model_metrics
    }

def _format_metrics_section(header, question_label, model_heading, metrics):
    """Render one report section (train, test or overall) as text.

    Args:
        header: Text placed between the ``===`` markers of the section title.
        question_label: Word inserted into the "Total ... Questions" line.
        model_heading: Heading printed above the per-model breakdown.
        metrics: Dict returned by ``calculate_metrics`` for this subset.

    Returns:
        The section as a single string ending with a newline.
    """
    lines = [
        f"=== {header} ===\n",
        f"Total {question_label} Questions: {metrics['total_questions']}\n",
        f"Success Rate: {metrics['success_rate']:.4f}\n",
        f"Average Steps: {metrics['avg_steps']:.4f}\n",
        f"Average Cost per Question: ${metrics['avg_cost']:.8f}\n",
        f"Average Cost per Successful Question: ${metrics['avg_cost_success']:.8f}\n",
        "Success Rate by Position:\n",
    ]
    for i, rate in enumerate(metrics['per_position_success']):
        lines.append(f"  Position {i+1}: {rate:.4f}\n")

    lines.append(f"\n{model_heading}:\n")
    for model, model_stats in metrics['model_metrics'].items():
        # Models that were never called in this subset are omitted entirely.
        if model_stats["calls"] > 0:
            lines.append(f"\n{model}:\n")
            lines.append(
                f"  Overall: {model_stats['successes']}/{model_stats['calls']} = {model_stats['success_rate']:.4f}\n"
            )
            lines.append(f"  Average Cost: ${model_stats['avg_cost']:.8f}\n")
            lines.append("  By Position:\n")
            for position, pos_data in sorted(model_stats["by_position"].items()):
                if pos_data["calls"] > 0:
                    lines.append(
                        f"    Pos {position}: {pos_data['successes']}/{pos_data['calls']} = {pos_data['success_rate']:.4f}, Avg Cost: ${pos_data['avg_cost']:.8f}\n"
                    )
    return "".join(lines)


def _summarize_batches(records, batch_size):
    """Compute success and cost statistics for consecutive slices of ``records``."""
    batch_metrics = []
    for start in range(0, len(records), batch_size):
        batch = records[start:start + batch_size]
        batch_success = sum(1 for r in batch if r["final_status"] == "Success")
        batch_cost = sum(r["total_cost"] for r in batch)
        batch_metrics.append({
            "batch_idx": start // batch_size,
            "batch_size": len(batch),
            "train_count": sum(1 for r in batch if r.get("phase") == "train"),
            "test_count": sum(1 for r in batch if r.get("phase") == "test"),
            "success_count": batch_success,
            "success_rate": batch_success / len(batch) if batch else 0,
            "total_cost": batch_cost,
            "avg_cost": batch_cost / len(batch) if batch else 0
        })
    return batch_metrics


def analyze_results(records):
    """Analyze and print results from the records, separating train and test sets.

    Records are split by their ``"phase"`` value ("train" / "test"); metrics are
    computed for each subset and for all records together via
    ``calculate_metrics``, followed by per-batch statistics over slices of
    ``BATCH_SIZE`` records. The resulting text report is printed and written to
    ``SUMMARY_STATS_PATH``.

    Args:
        records: List of per-question record dicts. Each record is read for
            ``"final_status"`` and ``"total_cost"``, and optionally ``"phase"``.

    Returns:
        None. When ``records`` is empty a notice is printed and nothing is written.
    """
    if not records:
        print("No records to analyze")
        return

    # Separate records into train and test sets
    train_records = [r for r in records if r.get("phase") == "train"]
    test_records = [r for r in records if r.get("phase") == "test"]
    overall_records = records

    print(f"Train Records: {len(train_records)}")
    print(f"Test Records: {len(test_records)}")
    print(f"Total Records: {len(overall_records)}")

    # Calculate metrics for each set
    train_metrics = calculate_metrics(train_records)
    test_metrics = calculate_metrics(test_records)
    overall_metrics = calculate_metrics(overall_records)

    # Analyze batch performance
    batch_metrics = _summarize_batches(records, BATCH_SIZE)

    # Generate summary text
    summary = "=== LinUCB CASCADE WITH HOTPOTQA DATASET ===\n\n"
    summary += f"Train Ratio: {TRAIN_RATIO*100}%\n"
    summary += "LinUCB updated on BOTH Train and Test sets.\n"
    summary += f"Batch Size: {BATCH_SIZE}\n\n"

    # The three sections share one layout; consecutive sections are separated
    # by two blank lines.
    summary += "\n\n".join([
        _format_metrics_section(
            "TRAIN SET RESULTS", "Train", "Train Set Model Performance", train_metrics
        ),
        _format_metrics_section(
            "TEST SET RESULTS", "Test", "Test Set Model Performance", test_metrics
        ),
        _format_metrics_section(
            "OVERALL RESULTS (TRAIN + TEST)", "Overall", "Overall Model Performance", overall_metrics
        ),
    ])

    summary += "\n=== BATCH PERFORMANCE ===\n"
    for batch in batch_metrics:
        summary += (
            f"Batch {batch['batch_idx']}: {batch['success_count']}/{batch['batch_size']} = {batch['success_rate']:.4f}, "
            f"Train/Test: {batch['train_count']}/{batch['test_count']}, "
            f"Total Cost: ${batch['total_cost']:.8f}, Avg Cost: ${batch['avg_cost']:.8f}\n"
        )

    summary += f"\nTotal Overall Cost (All Questions): ${overall_metrics['total_cost']:.8f}\n"
    summary += f"Total Train Cost: ${train_metrics['total_cost']:.8f}\n"
    summary += f"Total Test Cost: ${test_metrics['total_cost']:.8f}\n"

    # Add interpretation note about test set updates
    summary += "\n=== IMPORTANT NOTES ON INTERPRETATION ===\n"
    summary += "Since the LinUCB model is updated using data from both the train and test sets,\n"
    summary += "the test set performance metrics do not represent the performance of a fixed,\n"
    summary += "pre-trained model on unseen data. Instead, they reflect the model's performance\n"
    summary += "while it is still learning and adapting to the test data (online evaluation).\n"
    summary += "This setup allows us to observe how the bandit algorithm performs and adapts\n"
    summary += "over time across the entire dataset, comparing performance between an initial\n"
    summary += "phase (train) and a later phase (test).\n"

    # Print and save summary
    print(summary)

    # Explicit encoding so the report does not depend on the platform default.
    with open(SUMMARY_STATS_PATH, 'w', encoding='utf-8') as f:
        f.write(summary)


In [5]:

def main():
    """Run the LinUCB cascade over the dataset, saving progress as it goes.

    Steps: load the dataset, ask how many questions to process, shuffle with a
    fixed seed, split into train/test by ``TRAIN_RATIO``, resume from any
    existing records and model state on disk, run the cascade per question, and
    finally save everything and print the analysis.

    The train/test phase of a question is derived from its position in the
    seeded shuffle, not from the loop counter, so a resumed run labels the
    remaining questions the same way an uninterrupted run would (provided the
    same number of questions is requested).
    """
    print("Starting LinUCB Cascade with HotpotQA Dataset")
    initialize_json_files()
    # NOTE(review): the loader is named for MATH500 but is used here for the
    # HotpotQA input file — confirm the name is just a leftover.
    dataset_full = load_math500_dataset(INPUT_JSON)
    if not dataset_full:
        print("Dataset is empty or could not be loaded. Exiting.")
        return

    try:
        num_to_process = int(input(f"Enter the number of questions to process (max {len(dataset_full)}): "))
        if num_to_process < 0:
            # A negative count would silently slice from the end of the list.
            raise ValueError("number of questions must not be negative")
        num_to_process = min(num_to_process, len(dataset_full))
    except ValueError:
        num_to_process = len(dataset_full)
        print(f"Invalid input. Using all {num_to_process} questions.")

    random.seed(42) # for reproducible shuffle
    random.shuffle(dataset_full)
    dataset = dataset_full[:num_to_process]
    train_size = int(TRAIN_RATIO * len(dataset))
    print(f"Using {len(dataset)} questions: {train_size} train, {len(dataset) - train_size} test.")

    feature_extractor = FeatureExtractor()
    feature_extractor.initialize(dataset)
    linucb_model = LinUCBModel(model_names=AVAILABLE_LLMS)
    cascade = BatchBudgetCascade(feature_extractor, linucb_model)

    def question_key(item):
        """Identify a question/record by its unique_id, falling back to the question text."""
        return item.get("unique_id") or item.get("question")

    # Pair every question with its position in the shuffled order so the phase
    # survives the filtering of already-processed questions below.
    pending = list(enumerate(dataset))

    all_records = []
    if os.path.exists(RECORDS_PATH):
        try:
            with open(RECORDS_PATH, 'r') as f:
                all_records = json.load(f)
            processed_keys = {question_key(r) for r in all_records}
            pending = [(pos, q) for pos, q in pending if question_key(q) not in processed_keys]
            print(f"Loaded {len(all_records)} existing records. {len(pending)} new questions to process.")
        except Exception as e:
            print(f"Could not load existing records: {e}")
            all_records = []

    if os.path.exists(LINUCB_MODEL_PATH):
        linucb_model.load_model_state(LINUCB_MODEL_PATH)

    try:
        for i, (shuffled_pos, question) in enumerate(pending):
            phase = "test" if shuffled_pos >= train_size else "train"
            print(f"\n--- Q{i+1}/{len(pending)} ({phase}) ---")
            print(f"Question: {question['question'][:100]}...")
            question_record = cascade.run_cascade_single_question(question)
            question_record["phase"] = phase
            all_records.append(question_record)

            if (i + 1) % UPDATE_FREQUENCY == 0:
                save_records_with_backup(all_records, RECORDS_PATH)
                linucb_model.save_model_state(LINUCB_MODEL_PATH)

    except KeyboardInterrupt:
        print("\nProcessing interrupted. Saving progress...")
    finally:
        # Runs on normal completion, on Ctrl-C and on unexpected errors, so
        # whatever was collected is saved and analyzed.
        save_records_with_backup(all_records, RECORDS_PATH)
        linucb_model.save_model_state(LINUCB_MODEL_PATH)
        analyze_results(all_records)
        print("\nLinUCB Cascade with HotpotQA Dataset completed!")

# Start the experiment when this cell/script is executed directly; importing
# the code as a module does not trigger a run.
if __name__ == "__main__":
    main()

Starting LinUCB Cascade with HotpotQA Dataset
Loaded 500 questions from ..\Data\HotpotQA.json (split=test)
Using 100 questions: 20 train, 80 test.
Initialized sentence transformer embedding model.
Loaded 0 existing records. 100 new questions to process.

--- Q1/100 (train) ---
Question: Who did the Star and Dagger bass player marry?...
Step 1
Selected: arcee-ai/trinity-large-preview:free, UCB: 1.0262
Answer: Rachel Barton Pine, Correct: False, Cost: $0.00000000
Step 2
Selected: mistralai/mistral-small-3.1-24b-instruct, UCB: 2.0523
Answer: Suzanne Ciani, Correct: False, Cost: $0.00000615
Step 3
Selected: microsoft/phi-4, UCB: 2.0455
Answer: Heather Koniges., Correct: False, Cost: $0.00001029
Step 4
Selected: meta-llama/llama-4-maverick, UCB: 2.0938
Answer: Heather Koniges, Correct: False, Cost: $0.00002766
Step 5
Selected: google/gemini-2.0-flash-001, UCB: 2.1816
Answer: Heather Koniges, Correct: False, Cost: $0.00002020

--- Q2/100 (train) ---
Question: What national historic district 