In [1]:
pip install transformers accelerate bitsandbytes sentencepiece pandas datasets huggingface_hub tqdm

Note: you may need to restart the kernel to use updated packages.


In [3]:
  import ipywidgets
  print(f"ipywidgets version: {ipywidgets.__version__}")
  print(f"ipywidgets location: {ipywidgets.__file__}")

  import tqdm
  print(f"tqdm version: {tqdm.__version__}")
  print(f"tqdm location: {tqdm.__file__}")

ipywidgets version: 8.1.5
ipywidgets location: /raid/infolab/gaurav/Llama_Spider_A100_Project/miniconda3/envs/llama_spider_env/lib/python3.10/site-packages/ipywidgets/__init__.py
tqdm version: 4.67.1
tqdm location: /raid/infolab/gaurav/Llama_Spider_A100_Project/miniconda3/envs/llama_spider_env/lib/python3.10/site-packages/tqdm/__init__.py


In [5]:
from tqdm.auto import tqdm
import time

print("tqdm imported successfully from .auto")
my_list = list(range(3))
for i in tqdm(my_list, desc="Minimal Auto Test"):
    time.sleep(0.2)
print("Simple tqdm .auto loop completed")

tqdm imported successfully from .auto


Minimal Auto Test:   0%|          | 0/3 [00:00<?, ?it/s]

Simple tqdm .auto loop completed


In [7]:
# --- Standard Library Imports ---
# --- Third-party Library Imports ---
# --- Third-party Library Imports ---
import torch
from tqdm.auto import tqdm
import time
from huggingface_hub import login
import transformers # <--- ADD THIS LINE
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# --- Third-party Library Imports ---
import torch
from tqdm.auto import tqdm # For progress bars
from huggingface_hub import login # For Hugging Face Hub authentication
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

print("--- Cell 1: Imports and Initial Configuration Complete ---")
print(f"PyTorch Version: {torch.__version__}")
print(f"Transformers Version: {transformers.__version__}")

--- Cell 1: Imports and Initial Configuration Complete ---
PyTorch Version: 2.2.0
Transformers Version: 4.52.4


In [8]:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version PyTorch compiled with: {torch.version.cuda}")
    print(f"Number of GPUs available to PyTorch: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("ERROR: PyTorch cannot see the GPUs! Check installation and CUDA compatibility.")

PyTorch version: 2.2.0
CUDA available: True
CUDA version PyTorch compiled with: 11.8
Number of GPUs available to PyTorch: 8
  GPU 0: NVIDIA A100-SXM4-80GB
  GPU 1: NVIDIA A100-SXM4-80GB
  GPU 2: NVIDIA A100-SXM4-80GB
  GPU 3: NVIDIA A100-SXM4-80GB
  GPU 4: NVIDIA A100-SXM4-80GB
  GPU 5: NVIDIA A100-SXM4-80GB
  GPU 6: NVIDIA A100-SXM4-80GB
  GPU 7: NVIDIA A100-SXM4-80GB


In [9]:
# --- Standard Library Imports ---
# --- Third-party Library Imports ---
# --- Third-party Library Imports ---
import torch
from tqdm.auto import tqdm
import time
from huggingface_hub import login
import transformers # <--- ADD THIS LINE
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# --- Third-party Library Imports ---
import torch
from tqdm.auto import tqdm # For progress bars
from huggingface_hub import login # For Hugging Face Hub authentication
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

print("--- Cell 1: Imports and Initial Configuration Complete ---")
print(f"PyTorch Version: {torch.__version__}")
print(f"Transformers Version: {transformers.__version__}")

--- Cell 1: Imports and Initial Configuration Complete ---
PyTorch Version: 2.2.0
Transformers Version: 4.52.4


In [10]:
# --- Hugging Face Hub Authentication ---
# You MUST have requested access to Llama 2 models via Meta's form on Hugging Face
# AND have your request approved.

# Option 1: If you've stored your token as an environment variable on the server
# HF_TOKEN = os.environ.get("HF_TOKEN")
# if HF_TOKEN:
#     print("Logging into Hugging Face Hub using token from environment variable...")
#     login(token=HF_TOKEN)
# else:
#     print("HF_TOKEN environment variable not set. Attempting widget login if in interactive environment, or manual CLI login might be needed.")
#     login() # Will prompt if in an environment that supports it

# Option 2: Paste token directly (less secure, use with caution)
# HF_TOKEN = "YOUR_HF_READ_TOKEN_HERE"
# login(token=HF_TOKEN)

# Option 3: Use huggingface-cli login in a server terminal beforehand (Recommended)
# If already logged in via CLI, this cell might not be strictly necessary,
# but running login() can confirm status or refresh credentials.
try:
    login() # Will use cached token or prompt if needed
    print("Hugging Face login successful or already authenticated.")
except Exception as e:
    print(f"Hugging Face login failed: {e}. Ensure you are authenticated to download Llama 2.")

print("\n--- Cell 2: Hugging Face Login Attempt Complete ---")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Hugging Face login successful or already authenticated.

--- Cell 2: Hugging Face Login Attempt Complete ---


In [11]:
# --- Model and Tokenizer Configuration ---
import os

# 3.1. Specify the Llama 2 70B Chat Model
MODEL_NAME = "meta-llama/Llama-2-70b-chat-hf"
print(f"Target Model: {MODEL_NAME}")

# 3.2. Configure 4-bit Quantization (essential for 70B, even on A100s for single/few GPU use)
# A100s support bfloat16, which is excellent for mixed-precision.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",        # nf4 is a good default
    bnb_4bit_compute_dtype=torch.bfloat16, # Use bfloat16 for computation on A100s
    bnb_4bit_use_double_quant=True,   # Can save a bit more memory
)
print(f"BitsAndBytesConfig: load_in_4bit={bnb_config.load_in_4bit}, compute_dtype={bnb_config.bnb_4bit_compute_dtype}")

# 3.3. Define Prompt Templates
SYSTEM_PROMPT = (
    "You are an expert data analyst. Your task is to determine if a given natural language query "
    "can be answered *solely* based on the provided database schema. "
    "Do not attempt to answer the query itself. Your entire response must be only the word 'Yes' or the word 'No'."
)

USER_PROMPT_TEMPLATE = """Database Schema:
---
{schema_string}
---
Natural Language Query: "{nl_query}"
---
Can the query be answered using *only* the provided schema and its potential contents? Answer with either "Yes" or "No".
"""
print("System and User prompt templates defined.")

# 3.4. Define Cache Directory for Hugging Face downloads (optional, but good for managing large models)
# Create it within your project directory on the A100 server.
HF_MODEL_CACHE_DIR = os.path.join(os.getcwd(), ".hf_model_cache_70b") # Assumes current dir is project root
os.makedirs(HF_MODEL_CACHE_DIR, exist_ok=True)
print(f"Hugging Face model cache directory set to: {HF_MODEL_CACHE_DIR}")

print("\n--- Cell 3: Model and Prompt Configuration Complete ---")

Target Model: meta-llama/Llama-2-70b-chat-hf
BitsAndBytesConfig: load_in_4bit=True, compute_dtype=torch.bfloat16
System and User prompt templates defined.
Hugging Face model cache directory set to: /raid/infolab/gaurav/Llama_Spider_A100_Project/experiments_70b_llama/.hf_model_cache_70b

--- Cell 3: Model and Prompt Configuration Complete ---


In [12]:
# --- Load Tokenizer and Define Yes/No Token Logic ---

# 4.1. Load Tokenizer
print(f"Loading tokenizer for {MODEL_NAME}...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=HF_MODEL_CACHE_DIR)
    # Set pad token if not already set (Llama tokenizers often don't have one)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"Set tokenizer.pad_token to tokenizer.eos_token ('{tokenizer.eos_token}')")
    print("Tokenizer loaded successfully.")
except Exception as e:
    raise RuntimeError(f"Failed to load tokenizer for {MODEL_NAME}: {e}")


# 4.2. Define Helper Function to get Yes/No Token IDs
def get_yes_no_token_ids(tokenizer_arg):
    """Determines token IDs for 'Yes'/'No', preferring those with a leading space."""
    # Try with leading space first for chat models
    yes_variants = [" Yes", "Yes"]
    no_variants = [" No", "No"]
    
    final_yes_id = None
    final_no_id = None

    for variant in yes_variants:
        token_ids = tokenizer_arg.encode(variant, add_special_tokens=False)
        if len(token_ids) == 1:
            final_yes_id = token_ids[0]
            print(f"Found single token for '{variant}': ID {final_yes_id}")
            break
            
    for variant in no_variants:
        token_ids = tokenizer_arg.encode(variant, add_special_tokens=False)
        if len(token_ids) == 1:
            final_no_id = token_ids[0]
            print(f"Found single token for '{variant}': ID {final_no_id}")
            break

    if final_yes_id is None or final_no_id is None:
        print(f"ERROR: Could not determine reliable single token IDs for 'Yes'/'No' or variants.")
        # You might want to print detailed tokenization attempts here if this error occurs
        raise ValueError("Unstable tokenization for 'Yes'/'No'. Cannot proceed.")
    
    return final_yes_id, final_no_id

# 4.3. Define Global YES_TOKEN_ID and NO_TOKEN_ID
try:
    YES_TOKEN_ID, NO_TOKEN_ID = get_yes_no_token_ids(tokenizer)
    print(f"GLOBAL YES_TOKEN_ID: {YES_TOKEN_ID} ('{tokenizer.decode([YES_TOKEN_ID]).strip()}')")
    print(f"GLOBAL NO_TOKEN_ID: {NO_TOKEN_ID} ('{tokenizer.decode([NO_TOKEN_ID]).strip()}')")
except ValueError as e:
    raise RuntimeError(f"Failed to set YES/NO token IDs: {e}")

print("\n--- Cell 4: Tokenizer Loading and Yes/No Token ID Setup Complete ---")

Loading tokenizer for meta-llama/Llama-2-70b-chat-hf...
Set tokenizer.pad_token to tokenizer.eos_token ('</s>')
Tokenizer loaded successfully.
Found single token for 'Yes': ID 3869
Found single token for 'No': ID 1939
GLOBAL YES_TOKEN_ID: 3869 ('Yes')
GLOBAL NO_TOKEN_ID: 1939 ('No')

--- Cell 4: Tokenizer Loading and Yes/No Token ID Setup Complete ---


In [13]:
# --- Load the Llama 2 70B Model ---
# This is a memory-intensive step. `device_map="auto"` will attempt to distribute
# the model across available GPUs if one is insufficient.
# Ensure CUDA_VISIBLE_DEVICES is set in your shell if you want to restrict which GPUs are used.
import gc
print(f"Loading model: {MODEL_NAME} with 4-bit quantization. This will take significant time and memory...")
model_load_start_time = time.time()
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,    # Apply 4-bit quantization
        torch_dtype=torch.bfloat16,        # Use bfloat16 on A100s
        device_map="auto",                 # Distribute model across available GPUs automatically
        trust_remote_code=True,            # Often needed for newer models
        cache_dir=HF_MODEL_CACHE_DIR
    )
    model_load_end_time = time.time()
    print("\nModel loaded successfully!")
    print(f"Time taken to load model: {model_load_end_time - model_load_start_time:.2f} seconds.")
    print(f"Model device map: {model.hf_device_map}") # Shows how layers are distributed
    # For a 70B model, this should show parts on different GPUs if more than one is used.
    
    # Perform a quick memory cleanup after loading large model
    torch.cuda.empty_cache()
    gc.collect()
    print("Performed memory cleanup (torch.cuda.empty_cache(), gc.collect())")

except Exception as e:
    import traceback
    traceback.print_exc()
    raise RuntimeError(f"Failed to load model {MODEL_NAME}: {e}. Check VRAM, CUDA setup, and Hugging Face authentication.")

print("\n--- Cell 5: Llama 2 70B Model Loading Complete ---")

Loading model: meta-llama/Llama-2-70b-chat-hf with 4-bit quantization. This will take significant time and memory...


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]


Model loaded successfully!
Time taken to load model: 44.49 seconds.
Model device map: {'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 1, 'model.layers.9': 1, 'model.layers.10': 1, 'model.layers.11': 1, 'model.layers.12': 1, 'model.layers.13': 1, 'model.layers.14': 1, 'model.layers.15': 1, 'model.layers.16': 1, 'model.layers.17': 1, 'model.layers.18': 2, 'model.layers.19': 2, 'model.layers.20': 2, 'model.layers.21': 2, 'model.layers.22': 2, 'model.layers.23': 2, 'model.layers.24': 2, 'model.layers.25': 2, 'model.layers.26': 2, 'model.layers.27': 2, 'model.layers.28': 3, 'model.layers.29': 3, 'model.layers.30': 3, 'model.layers.31': 3, 'model.layers.32': 3, 'model.layers.33': 3, 'model.layers.34': 3, 'model.layers.35': 3, 'model.layers.36': 3, 'model.layers.37': 3, 'model.layers.38': 4, 'model.layers.39': 4, 'model.layers.40'

In [17]:
# --- MODIFIED PART ---
# Increase max_new_tokens to allow for longer output
# You can adjust this value. The model will also stop if it generates an EOS token.
new_max_tokens = 256  # Let's try allowing up to 256 new tokens

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=new_max_tokens, # MODIFIED
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id # Important for some models when batching or using attention_mask
    )

response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

assistant_response_start = -1
# Try to find the common end-of-instruction marker for chat models
# For Llama-2 style:
if " [/INST] " in response_text:
    assistant_response_start = response_text.rfind(" [/INST] ") + len(" [/INST] ")
# For other potential chatML-like structures:
elif "<|assistant|>\n" in response_text: # Anthropic/Claude style or similar
    assistant_response_start = response_text.rfind("<|assistant|>\n") + len("<|assistant|>\n")
elif "<|im_start|>assistant\n" in response_text: # Newer ChatML
    assistant_response_start = response_text.rfind("<|im_start|>assistant\n") + len("<|im_start|>assistant\n")


if assistant_response_start != -1:
    clean_response = response_text[assistant_response_start:].strip()
else:
    # Fallback: if the prompt was simple and no clear marker,
    # try to remove the original prompt from the start of the response text.
    # This is less robust.
    formatted_prompt_without_generation_cue = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    if response_text.startswith(formatted_prompt_without_generation_cue):
        clean_response = response_text[len(formatted_prompt_without_generation_cue):].strip()
    else:
        # Last resort: very basic split if we can't find a good marker.
        # This assumes the model *only* added new text.
        prompt_tokens_count = inputs.input_ids.shape[1]
        generated_tokens = outputs[0][prompt_tokens_count:]
        clean_response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
        if not clean_response: # If the above still fails, show the raw output
             clean_response = "Could not reliably clean the prompt. Raw output (minus special tokens):\n" + response_text


print(f"\nModel Response (cleaned, potentially longer): {clean_response}")

NameError: name 'inputs' is not defined

In [18]:
import zipfile
import os

SERVER_ZIP_FILE_PATH = '/raid/infolab/gaurav/Llama_Spider_A100_Project/spider_subset_data.zip'
EXTRACTION_DESTINATION_DIR_ON_SERVER = '/raid/infolab/gaurav/Llama_Spider_A100_Project/'

DEV_JSON_PATH = None
TABLES_JSON_PATH = None

def unzip_data(zip_filepath, dest_dir):
    """
    Unzips a zip file to a specified destination directory.
    """
    print(f"Attempting to unzip {zip_filepath} to {dest_dir}...")
    try:
        
        with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
            zip_ref.extractall(dest_dir)
        print(f"Successfully unzipped files to {dest_dir}")

        print(f"Contents of {dest_dir}:")
        for item in os.listdir(dest_dir):
            print(f"  - {item}")
        return True
    except zipfile.BadZipFile:
        print(f"Error: {zip_filepath} is not a valid zip file or is corrupted.")
        return False
    except FileNotFoundError:
        print(f"Error: Zip file not found at {zip_filepath}. Please ensure the path is correct.")
        return False
    except PermissionError:
        print(f"Error: Permission denied to write to {dest_dir} or read {zip_filepath}.")
        return False
    except Exception as e:
        print(f"An unexpected error occurred during unzipping: {e}")
        return False

print(f"Script started. Looking for zip file at: {SERVER_ZIP_FILE_PATH}")

if os.path.exists(SERVER_ZIP_FILE_PATH):
    print(f"Zip file found at {SERVER_ZIP_FILE_PATH}.")
    if unzip_data(SERVER_ZIP_FILE_PATH, EXTRACTION_DESTINATION_DIR_ON_SERVER):
        
        EXPECTED_EXTRACTED_FOLDER_NAME = 'spider_subset_data' # This is the folder INSIDE the zip

        DEV_JSON_PATH = os.path.join(EXTRACTION_DESTINATION_DIR_ON_SERVER, EXPECTED_EXTRACTED_FOLDER_NAME, 'dev.json')
        TABLES_JSON_PATH = os.path.join(EXTRACTION_DESTINATION_DIR_ON_SERVER, EXPECTED_EXTRACTED_FOLDER_NAME, 'tables.json')

        print("\nVerifying extracted file paths...")
        if os.path.exists(DEV_JSON_PATH):
            print(f"SUCCESS: dev.json path is valid: {DEV_JSON_PATH}")
        else:
            print(f"ERROR: dev.json NOT FOUND at expected path: {DEV_JSON_PATH}")
            print(f"Please check the contents of {os.path.join(EXTRACTION_DESTINATION_DIR_ON_SERVER, EXPECTED_EXTRACTED_FOLDER_NAME)}")


        if os.path.exists(TABLES_JSON_PATH):
            print(f"SUCCESS: tables.json path is valid: {TABLES_JSON_PATH}")
        else:
            print(f"ERROR: tables.json NOT FOUND at expected path: {TABLES_JSON_PATH}")
            print(f"Please check the contents of {os.path.join(EXTRACTION_DESTINATION_DIR_ON_SERVER, EXPECTED_EXTRACTED_FOLDER_NAME)}")

    else:
        print("Unzipping failed on the server. Cannot define data paths.")
else:
    print(f"ERROR: Zip file NOT FOUND at {SERVER_ZIP_FILE_PATH} on the server.")
    print("Please ensure the 'scp' command was successful and the path is correct.")


if DEV_JSON_PATH and TABLES_JSON_PATH and os.path.exists(DEV_JSON_PATH) and os.path.exists(TABLES_JSON_PATH):
    print("\n--- Ready to load data ---")
    print(f"Path to dev.json: {DEV_JSON_PATH}")
    print(f"Path to tables.json: {TABLES_JSON_PATH}")
    
else:
    print("\n--- Data paths are not correctly set up. Cannot proceed with data loading. ---")

Script started. Looking for zip file at: /raid/infolab/gaurav/Llama_Spider_A100_Project/spider_subset_data.zip
Zip file found at /raid/infolab/gaurav/Llama_Spider_A100_Project/spider_subset_data.zip.
Attempting to unzip /raid/infolab/gaurav/Llama_Spider_A100_Project/spider_subset_data.zip to /raid/infolab/gaurav/Llama_Spider_A100_Project/...
Successfully unzipped files to /raid/infolab/gaurav/Llama_Spider_A100_Project/
Contents of /raid/infolab/gaurav/Llama_Spider_A100_Project/:
  - experiments_70b_llama
  - .gitignore
  - backup_to_github.sh
  - Miniconda3-latest-Linux-x86_64.sh
  - spider_subset_data.zip
  - randomQ_allDBs_run1
  - .ipynb_checkpoints
  - .git
  - miniconda3
  - spider_subset_data
  - __MACOSX

Verifying extracted file paths...
SUCCESS: dev.json path is valid: /raid/infolab/gaurav/Llama_Spider_A100_Project/spider_subset_data/dev.json
SUCCESS: tables.json path is valid: /raid/infolab/gaurav/Llama_Spider_A100_Project/spider_subset_data/tables.json

--- Ready to load dat

In [19]:
import json

def load_json_data(file_path):
    if os.path.exists(file_path):
        with open(file_path, 'r') as f:
            return json.load(f)
    else:
        print(f"ERROR: File not found at {file_path}")
        return None

dev_data = load_json_data(DEV_JSON_PATH)
tables_data = load_json_data(TABLES_JSON_PATH)

if dev_data and tables_data:
    print(f"Loaded {len(dev_data)} queries from dev.json")
    print(f"Loaded {len(tables_data)} database schemas from tables.json")
else:
    print("Failed to load Spider data. Please check paths and upload.")

Loaded 1034 queries from dev.json
Loaded 166 database schemas from tables.json


In [39]:
import json
# import os # Not strictly needed for this dictionary creation unless used in paths
# import traceback # Only needed if you keep the full traceback print in except

# --- Helper Functions (These are the same as you provided) ---
def load_schemas(tables_json_path):
    """Loads schemas from tables.json into a dictionary keyed by db_id."""
    with open(tables_json_path, 'r') as f:
        schemas_list = json.load(f)
    schemas_dict = {db_info['db_id']: db_info for db_info in schemas_list}
    return schemas_dict

def map_spider_type_to_sql_type(spider_type, is_pk_or_fk=False):
    """Maps Spider's generic types to SQLite data types."""
    spider_type = spider_type.lower()
    if spider_type == "text":
        return "TEXT"
    elif spider_type == "number":
        return "INTEGER" if is_pk_or_fk else "REAL"
    elif spider_type == "time":
        return "DATETIME"
    elif spider_type == "boolean":
        return "BOOLEAN"
    elif spider_type == "others":
        return "BLOB"
    else:
        return "TEXT"

def escape_sql_identifier(name):
    """Escapes SQL identifiers (table/column names) if they contain spaces or are keywords."""
    if " " in name or name.lower() in {"select", "from", "where", "table", "primary", "key", "foreign", "index", "order", "group"}:
        return f'"{name}"'
    return name

def generate_create_table_sql_for_db(db_id, all_schemas_data): # Parameter name changed for consistency
    """
    Generates SQL CREATE TABLE statements for a given db_id from the Spider schema.
    'all_schemas_data' is the dictionary produced by load_schemas.
    """
    if db_id not in all_schemas_data:
        return f"-- Database ID '{db_id}' not found in schemas."

    db_schema = all_schemas_data[db_id] # Get the specific schema info for this db_id
    sql_statements = []
    column_info_by_index = {}
    for i, (table_idx, col_name_original) in enumerate(db_schema['column_names_original']):
        if col_name_original == "*":
            continue
        column_info_by_index[i] = {
            "original_name": col_name_original,
            "table_index": table_idx,
            "original_table_name": db_schema['table_names_original'][table_idx],
            "type": db_schema['column_types'][i]
        }
    for table_idx, table_name_original in enumerate(db_schema['table_names_original']):
        escaped_table_name = escape_sql_identifier(table_name_original)
        column_definitions = []
        table_constraints = []
        current_table_columns = []
        for col_global_idx, (tbl_idx_for_col, col_name_orig) in enumerate(db_schema['column_names_original']):
            if col_name_orig == "*":
                continue
            if tbl_idx_for_col == table_idx:
                current_table_columns.append({
                    "global_idx": col_global_idx,
                    "name": col_name_orig,
                    "type": db_schema['column_types'][col_global_idx]
                })
        pk_column_indices_for_table = [
            pk_idx for pk_idx in db_schema['primary_keys']
            if column_info_by_index.get(pk_idx) and column_info_by_index[pk_idx]['table_index'] == table_idx
        ]
        pk_column_names_for_table = [column_info_by_index[idx]['original_name'] for idx in pk_column_indices_for_table]
        for col_data in current_table_columns:
            col_name_original = col_data['name']
            spider_type = col_data['type']
            col_global_idx = col_data['global_idx']
            is_pk_col = col_global_idx in pk_column_indices_for_table
            is_fk_col = any(fk_pair[0] == col_global_idx for fk_pair in db_schema['foreign_keys'])
            sql_type = map_spider_type_to_sql_type(spider_type, is_pk_or_fk=(is_pk_col or is_fk_col))
            escaped_col_name = escape_sql_identifier(col_name_original)
            col_def_str = f"{escaped_col_name} {sql_type}"
            if is_pk_col and len(pk_column_names_for_table) == 1:
                col_def_str += " PRIMARY KEY"
            column_definitions.append(col_def_str)
        if len(pk_column_names_for_table) > 1:
            escaped_pk_cols = [escape_sql_identifier(name) for name in pk_column_names_for_table]
            table_constraints.append(f"PRIMARY KEY ({', '.join(escaped_pk_cols)})")
        for fk_col_idx, referenced_col_idx in db_schema['foreign_keys']:
            if column_info_by_index.get(fk_col_idx) and \
               column_info_by_index.get(referenced_col_idx) and \
               column_info_by_index[fk_col_idx]['table_index'] == table_idx:
                fk_column_name = column_info_by_index[fk_col_idx]['original_name']
                referenced_table_name = column_info_by_index[referenced_col_idx]['original_table_name']
                referenced_column_name = column_info_by_index[referenced_col_idx]['original_name']
                escaped_fk_col = escape_sql_identifier(fk_column_name)
                escaped_ref_table = escape_sql_identifier(referenced_table_name)
                escaped_ref_col = escape_sql_identifier(referenced_column_name)
                table_constraints.append(
                    f"FOREIGN KEY ({escaped_fk_col}) REFERENCES {escaped_ref_table} ({escaped_ref_col})"
                )
        all_parts = column_definitions + table_constraints
        create_table_statement = f"CREATE TABLE {escaped_table_name} (\n  "
        create_table_statement += ",\n  ".join(all_parts)
        create_table_statement += "\n);"
        sql_statements.append(create_table_statement)
    return "\n\n".join(sql_statements)
# --- End of Helper Functions ---


# --- MODIFIED "Main Execution" for "Cell 1" to produce the dictionary ---
# This code will be run when you execute the Jupyter cell.
# The output variable needed by your experiment is `all_db_schemas_sql_strings`.

all_db_schemas_sql_strings = {} # This is the dictionary your experiment needs

# Define the path to your tables.json
spider_tables_json_path = '/raid/infolab/gaurav/Llama_Spider_A100_Project/spider_subset_data/tables.json'

print("--- Cell 1: Preparing Database Schema SQL Strings (Dictionary Output) ---")
try:
    # 1. Load all schema structures from tables.json
    # `all_db_schemas_data_loaded` will be a dictionary: {db_id: schema_info_dict, ...}
    all_db_schemas_data_loaded = load_schemas(spider_tables_json_path) # Renamed to avoid confusion with function parameter
    print(f"Loaded schema data for {len(all_db_schemas_data_loaded)} databases from '{spider_tables_json_path}'.")

    # 2. Iterate through each loaded schema and generate its SQL string, storing it in the dictionary
    if all_db_schemas_data_loaded:
        for db_id in all_db_schemas_data_loaded: # Iterate through keys (db_ids)
            # Call generate_create_table_sql_for_db, passing the full loaded data
            # and the current db_id.
            sql_string_for_db = generate_create_table_sql_for_db(db_id, all_db_schemas_data_loaded)

            # Store the raw SQL string in the dictionary.
            # We only store it if it's a successful generation (doesn't start with the error message)
            if sql_string_for_db and not sql_string_for_db.startswith("-- Database ID"):
                all_db_schemas_sql_strings[db_id] = sql_string_for_db
            elif sql_string_for_db.startswith("-- Database ID"):
                print(f"Warning: Schema for {db_id} reported as not found by generate_create_table_sql_for_db.")
            else:
                print(f"Warning: SQL generation returned empty or unexpected for {db_id} (Result: '{sql_string_for_db[:50]}...')")

        print(f"Successfully populated `all_db_schemas_sql_strings` dictionary with {len(all_db_schemas_sql_strings)} entries.")
    else:
        print("No schema data loaded from tables.json, so `all_db_schemas_sql_strings` will be empty.")

except FileNotFoundError:
    print(f"FATAL ERROR: The file '{spider_tables_json_path}' was not found.")
    all_db_schemas_sql_strings = {} # Ensure it's defined as empty on error
except json.JSONDecodeError:
    print(f"FATAL ERROR: Could not decode JSON from '{spider_tables_json_path}'. Check if it's a valid JSON file.")
    all_db_schemas_sql_strings = {}
except Exception as e:
    print(f"FATAL ERROR during schema preparation: {e}")
    # import traceback # Uncomment if you need the full traceback here
    # traceback.print_exc()
    all_db_schemas_sql_strings = {}

# --- Verification (you can add this to your cell to check after it runs) ---
print(f"\n--- Verification of all_db_schemas_sql_strings ---")
print(f"Type: {type(all_db_schemas_sql_strings)}")
print(f"Number of schemas processed: {len(all_db_schemas_sql_strings)}")
if all_db_schemas_sql_strings:
    # Print a sample to verify content
    sample_db_id = list(all_db_schemas_sql_strings.keys())[0]
    print(f"Sample - DB ID: {sample_db_id}")
    # print(f"Sample - SQL String (first 300 chars):\n{all_db_schemas_sql_strings[sample_db_id][:300]}...")
else:
    print("`all_db_schemas_sql_strings` is empty. Review errors above.")
# --- End of Cell 1 Logic ---

--- Cell 1: Preparing Database Schema SQL Strings (Dictionary Output) ---
Loaded schema data for 166 databases from '/raid/infolab/gaurav/Llama_Spider_A100_Project/spider_subset_data/tables.json'.
Successfully populated `all_db_schemas_sql_strings` dictionary with 166 entries.

--- Verification of all_db_schemas_sql_strings ---
Type: <class 'dict'>
Number of schemas processed: 166
Sample - DB ID: perpetrator


In [49]:
# This cell defines parameters for running the experiment.
# It will now randomly select queries and always use ALL database schemas as candidates.

import random # Ensure random is imported at the top of your notebook or this cell
# import os # Ensure os is imported (likely already done for path joining)
# import json # Ensure json is imported (likely already done for loading)

# --- 2.1. Experiment Parameters ---
# Number of NL queries to RANDOMLY select from dev.json to process.
# For initial testing in Colab, use a small subset. For a more thorough run, increase this.
NUM_RANDOM_QUERIES_TO_TEST = 100 # For example, test 5 random queries

# This will now effectively always be True based on your requirement.
# The logic will be set up to use all schemas from all_db_schemas_sql_strings.
# We can keep the variable for clarity or remove it if it's always all DBs.
# For this implementation, let's explicitly aim for all DBs.
print("INFO: This experiment configuration will test each randomly selected query against ALL available Spider database schemas.")


# --- 2.2. Randomly Select NL Queries for the Experiment ---
# We will randomly sample NUM_RANDOM_QUERIES_TO_TEST queries from the loaded dev_data.
if not dev_data: # dev_data should have been loaded in Cell 1
    raise ValueError("dev_data is not loaded (from dev.json). Cannot select queries. Please run Cell 1 first.")

if len(dev_data) == 0:
    raise ValueError("dev_data is empty. No queries to select.")

actual_num_queries_to_select = min(NUM_RANDOM_QUERIES_TO_TEST, len(dev_data))
# Using min ensures we don't try to sample more queries than available.

if actual_num_queries_to_select < NUM_RANDOM_QUERIES_TO_TEST:
    print(f"Warning: Requested {NUM_RANDOM_QUERIES_TO_TEST} random queries, but only {len(dev_data)} are available. Using all {len(dev_data)} queries.")

# Randomly sample without replacement
selected_nl_queries = random.sample(dev_data, actual_num_queries_to_select)

print(f"\nRandomly selected {len(selected_nl_queries)} NL queries for the experiment:")
for i, q_info in enumerate(selected_nl_queries):
    print(f"  Test Query {i+1}: '{q_info['question']}' (True DB: {q_info['db_id']})")


# --- 2.3. Determine Candidate Database Schemas for Each Query ---
# For this experiment design, we ALWAYS use ALL available database schemas.
# all_db_schemas_sql_strings should have been populated in Cell 1.
if not all_sql_output: # Populated in Cell 1
    raise ValueError("all_sql_output is empty. Schemas were not converted in Cell 1. Cannot proceed.")

candidate_schemas_for_evaluation = all_db_schemas_sql_strings # Use all converted schemas
print(f"\nEach of the {len(selected_nl_queries)} selected queries will be evaluated against all {len(candidate_schemas_for_evaluation)} available Spider database schemas.")

if not candidate_schemas_for_evaluation: # Should not happen if all_db_schemas_sql_strings was populated
    raise ValueError("No candidate schemas available for evaluation. This indicates an issue with schema loading or conversion in Cell 1.")

INFO: This experiment configuration will test each randomly selected query against ALL available Spider database schemas.

Randomly selected 100 NL queries for the experiment:
  Test Query 1: 'What are the names and release years for all the songs of the youngest singer?' (True DB: concert_singer)
  Test Query 2: 'What are names of countries with the top 3 largest population?' (True DB: world_1)
  Test Query 3: 'What are the names and birth dates of people, ordered by their names in alphabetical order?' (True DB: poker_player)
  Test Query 4: 'How many different store locations are there?' (True DB: employee_hire_evaluation)
  Test Query 5: 'How many different nationalities do conductors have?' (True DB: orchestra)
  Test Query 6: 'How many states are there?' (True DB: voter_1)
  Test Query 7: 'What are the codes of template types that have fewer than 3 templates?' (True DB: cre_Doc_Template_Mgt)
  Test Query 8: 'How many dogs have not gone through any treatment?' (True DB: dog_kennels

In [41]:
import os
import json 
LOCAL_EXPERIMENT_BASE_DIR = "/raid/infolab/gaurav/Llama_Spider_A100_Project/"


EXPERIMENT_RUN_NAME = "randomQ_allDBs_run1" 
EXPERIMENT_PROJECT_DIR = os.path.join(LOCAL_EXPERIMENT_BASE_DIR, EXPERIMENT_RUN_NAME)

try:
    os.makedirs(EXPERIMENT_PROJECT_DIR, exist_ok=True)
    print(f"Ensured experiment project directory exists: '{EXPERIMENT_PROJECT_DIR}'")
except OSError as e:
    print(f"Error creating directory {EXPERIMENT_PROJECT_DIR}: {e}")
    EXPERIMENT_PROJECT_DIR = "." 


RESULTS_FILENAME = "spider_random_query_all_db_scores.json"
EXPERIMENT_RESULTS_FILE = os.path.join(EXPERIMENT_PROJECT_DIR, RESULTS_FILENAME)

print(f"Experiment results will be saved to: {EXPERIMENT_RESULTS_FILE}")

Ensured experiment project directory exists: '/raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1'
Experiment results will be saved to: /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json


In [42]:
# Cell defining get_yes_no_token_ids (CORRECTED)
def get_yes_no_token_ids(tokenizer_arg):
    """
    Determines the token IDs for 'Yes' and 'No', accounting for potential leading spaces.
    Llama-2-chat tends to produce " Yes" or " No" as single tokens after the prompt.
    """
    # Try with leading space first, as it's common for chat models
    yes_token_id_with_space = tokenizer_arg.encode(" Yes", add_special_tokens=False)
    no_token_id_with_space = tokenizer_arg.encode(" No", add_special_tokens=False)

    if len(yes_token_id_with_space) == 1 and len(no_token_id_with_space) == 1:
        print("Using ' Yes' and ' No' (with leading space) for Yes/No token IDs.")
        return yes_token_id_with_space[0], no_token_id_with_space[0] # Explicit return
    else:
        # Fallback to "Yes" and "No" without leading space
        yes_token_id_no_space = tokenizer_arg.encode("Yes", add_special_tokens=False)
        no_token_id_no_space = tokenizer_arg.encode("No", add_special_tokens=False)
        if len(yes_token_id_no_space) == 1 and len(no_token_id_no_space) == 1:
            print("Warning: Using 'Yes' and 'No' (no leading space) for Yes/No token IDs. This might be suboptimal for chat models.")
            return yes_token_id_no_space[0], no_token_id_no_space[0] # Explicit return
        else:
            # This case is problematic.
            print(f"ERROR: Could not determine reliable single token IDs for 'Yes'/'No' or ' Yes'/' No'.")
            print(f"Tokenization of ' Yes': {yes_token_id_with_space} (decoded: {[tokenizer_arg.decode(t) for t in yes_token_id_with_space]})")
            print(f"Tokenization of ' No': {no_token_id_with_space} (decoded: {[tokenizer_arg.decode(t) for t in no_token_id_with_space]})")
            print(f"Tokenization of 'Yes': {yes_token_id_no_space} (decoded: {[tokenizer_arg.decode(t) for t in yes_token_id_no_space]})")
            print(f"Tokenization of 'No': {no_token_id_no_space} (decoded: {[tokenizer_arg.decode(t) for t in no_token_id_no_space]})")
            # It's better to raise an error here so the problem is immediately obvious
            # rather than returning None and causing a TypeError later.
            raise ValueError("Unstable tokenization for 'Yes'/'No'. Review tokenization outputs above. Cannot proceed without reliable Yes/No token IDs.")

print("Helper function 'get_yes_no_token_ids' defined (with actual logic).")

Helper function 'get_yes_no_token_ids' defined (with actual logic).


In [43]:
if 'tokenizer' in globals() and tokenizer is not None:
    try:
        YES_TOKEN_ID, NO_TOKEN_ID = get_yes_no_token_ids(tokenizer)
        print(f"YES_TOKEN_ID: {YES_TOKEN_ID} ('{tokenizer.decode([YES_TOKEN_ID])}')")
        print(f"NO_TOKEN_ID: {NO_TOKEN_ID} ('{tokenizer.decode([NO_TOKEN_ID])}')")
    except ValueError as e:
        print(f"Error defining YES/NO token IDs: {e}")
else:
    print("ERROR: 'tokenizer' is not defined. Cannot define YES_TOKEN_ID and NO_TOKEN_ID.")

YES_TOKEN_ID: 3869 ('Yes')
NO_TOKEN_ID: 1939 ('No')


In [44]:
import torch # Ensure torch is imported
# After loading your tokenizer:
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", ...) # Or your specific model

# Llama 2 Chat Template (common structure)
# Make sure this matches the exact format expected by YOUR specific Llama 2 variant.
# Check the model card on Hugging Face for the precise template.
chat_template_llama2 = (
    "{% if messages[0]['role'] == 'system' %}"
    "<s>[INST] <<SYS>>\n{{ messages[0]['content'] }}\n<</SYS>>\n\n"
    "{% else %}"
    "<s>[INST] "
    "{% endif %}"
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}"
    "{{ message['content'] }} [/INST]"
    "{% elif message['role'] == 'assistant' %}"
    " {{ message['content'] }} </s><s>[INST]"
    "{% elif message['role'] == 'system' and loop.index0 > 0 %}" # Handle system message if not first
    " <<SYS>>\n{{ message['content'] }}\n<</SYS>>\n\n"
    "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}" # Add generation prompt if last message is not assistant
    " " # This space is important before the assistant starts generating
    "{% endif %}"
)
# A simpler version if you always have system then user:
# chat_template_llama2_simple = "<s>[INST] <<SYS>>\n{{ messages[0]['content'] }}\n<</SYS>>\n\n{{ messages[1]['content'] }} [/INST] "


# --- CHOOSE THE CORRECT TEMPLATE FOR YOUR MODEL ---
# For many Llama-2-chat models, the tokenizer might already have a default template
# if loaded correctly, but if not, you can set it.
# A common one if your `messages` list is always [system_message, user_message]:
if tokenizer.chat_template is None:
    if "llama-2" in tokenizer.name_or_path.lower() and "chat" in tokenizer.name_or_path.lower() : # Be more specific if needed
        # This is a common structure for Llama-2-chat for a system prompt followed by a user prompt.
        # The assistant's response will follow " [/INST] "
        tokenizer.chat_template = (
            "{% if messages[0]['role'] == 'system' %}"
            "<s>[INST] <<SYS>>\n{{ messages[0]['content'] }}\n<</SYS>>\n\n"
            "{% endif %}"
            "{{ messages[1]['content'] }} [/INST]" # Assumes second message is user
            # Add a space for the model to start generation if add_generation_prompt=True
            "{% if add_generation_prompt %} {% endif %}"
        )
        print("Manually set Llama 2 chat template on tokenizer.")
    else:
        print("Warning: tokenizer.chat_template is None and no specific template was set for the model type.")
        # You might need to define a different template or handle formatting manually.

# --- Core function to get P(Yes) ---
def get_yes_probability(model_arg, tokenizer_arg, system_prompt_arg, user_prompt_content_arg, yes_token_id_arg, no_token_id_arg, max_length=2048):
    """
    Gets the probability of the model answering "Yes" to the given query and schema.
    """
    messages = [
        {"role": "system", "content": system_prompt_arg},
        {"role": "user", "content": user_prompt_content_arg}
    ]

    prompt_for_model = tokenizer_arg.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer_arg(
        prompt_for_model,
        return_tensors="pt",
        truncation=True,
        max_length=max_length - 10
    )
    inputs = {k: v.to(model_arg.device) for k, v in inputs.items()}

    if inputs['input_ids'].shape[1] >= max_length - 10:
         print(f"Warning: Prompt for query was truncated. Length: {inputs['input_ids'].shape[1]}")

    with torch.no_grad():
        outputs = model_arg(**inputs)
        logits = outputs.logits
        next_token_logits = logits[:, -1, :]
        logit_yes = next_token_logits[:, yes_token_id_arg].item()
        logit_no = next_token_logits[:, no_token_id_arg].item()

    max_logit = max(logit_yes, logit_no)
    exp_yes = torch.exp(torch.tensor(logit_yes - max_logit, device=model_arg.device))
    exp_no = torch.exp(torch.tensor(logit_no - max_logit, device=model_arg.device))

    prob_yes = exp_yes / (exp_yes + exp_no)
    return prob_yes.item()

print("Core function 'get_yes_probability' defined.") # Add a print statement to confirm execution

Core function 'get_yes_probability' defined.


In [45]:
# --- Prompt Configuration ---
# This SYSTEM_PROMPT will be passed to your get_yes_probability function,
# which then should use it when constructing the messages for the chat model.
SYSTEM_PROMPT = (
    "You are an expert data analyst. Your task is to determine if a given natural language query "
    "can be answered *solely* based on the provided database schema. "
    "Do not attempt to answer the query itself. Your entire response must be only the word 'Yes' or the word 'No'."
)

# This USER_PROMPT_TEMPLATE is used directly in your experiment loop
# to format the schema and query for the user message.
USER_PROMPT_TEMPLATE = """Database Schema:
---
{schema_string}
---
Natural Language Query: "{nl_query}"
---
Can the query be answered using *only* the provided schema and its potential contents? Answer with either "Yes" or "No".
"""

print("SYSTEM_PROMPT and USER_PROMPT_TEMPLATE defined.")

SYSTEM_PROMPT and USER_PROMPT_TEMPLATE defined.


In [46]:
print("Testing get_yes_probability directly...")
try:
    # Construct a very simple schema and query for testing
    test_schema = "CREATE TABLE TestTable (id INT, name TEXT);"
    test_nl_query = "What is the name for id 1?"
    sample_user_prompt_content = USER_PROMPT_TEMPLATE.format(
        schema_string=test_schema,
        nl_query=test_nl_query
    )
    print(f"Test User Prompt: {sample_user_prompt_content}")

    # Make sure all these variables are defined and loaded:
    # model, tokenizer, SYSTEM_PROMPT, YES_TOKEN_ID, NO_TOKEN_ID
    prob = get_yes_probability(
        model,
        tokenizer,
        SYSTEM_PROMPT,
        sample_user_prompt_content,
        YES_TOKEN_ID,
        NO_TOKEN_ID
    )
    print(f"get_yes_probability returned: {prob}")
except Exception as e:
    import traceback
    print("Error during direct call to get_yes_probability:")
    traceback.print_exc()

Testing get_yes_probability directly...
Test User Prompt: Database Schema:
---
CREATE TABLE TestTable (id INT, name TEXT);
---
Natural Language Query: "What is the name for id 1?"
---
Can the query be answered using *only* the provided schema and its potential contents? Answer with either "Yes" or "No".

get_yes_probability returned: 0.8991213440895081


In [50]:
# --- Ensure these imports are at the top of your script/notebook ---
import json
import os
import traceback
from tqdm.auto import tqdm # Use .auto or .notebook for Jupyter

# --- Prerequisites (must be defined and populated from Cell 1 and Cell 2): ---
# model, tokenizer, SYSTEM_PROMPT, USER_PROMPT_TEMPLATE,
# YES_TOKEN_ID, NO_TOKEN_ID, get_yes_probability,
# selected_nl_queries, candidate_schemas_for_evaluation, EXPERIMENT_RESULTS_FILE
# --- (Assume these are correctly defined above this cell) ---

# --- 3.1. Initialize Results Storage ---
experiment_all_query_results = []

# --- 3.2. Start the Loop ---
# This initial print is fine as it's before any tqdm loops start for this cell's main logic
print(f"\n--- Starting Experiment: {len(selected_nl_queries)} Random Queries vs. {len(candidate_schemas_for_evaluation)} Total DB Schemas ---")

# Outer loop: Iterate through each randomly selected NL query
for query_idx, nl_query_info in enumerate(tqdm(selected_nl_queries, desc="Processing NL Queries")):
    current_nl_query_text = nl_query_info['question']
    true_db_id_for_query = nl_query_info['db_id']
    experiment_query_id = f"spider_dev_q{query_idx}_{nl_query_info.get('query_id', 'idx'+str(query_idx))}"

    # Use tqdm.write for status updates related to the outer loop's progress
    # The '\n' at the beginning helps separate entries for each query visually.
    tqdm.write(f"\nProcessing Query {query_idx + 1}/{len(selected_nl_queries)} (ID: {experiment_query_id}): '{current_nl_query_text}' (True DB: {true_db_id_for_query})")

    scores_for_current_query = []

    # --- Optional: For debugging, print scores for the VERY FIRST query only ---
    # print_debug_scores_for_first_query_only = True
    # if print_debug_scores_for_first_query_only and query_idx == 0:
    #     tqdm.write(f"  --- Incremental Scores for First Query: '{current_nl_query_text}' ---")
    # --- End Optional Debug Print Setup ---

    # Inner loop: Iterate through each candidate database schema
    for candidate_db_id, candidate_schema_sql in tqdm(
        candidate_schemas_for_evaluation.items(),
        desc=f"  DBs for Q:{experiment_query_id[:20]}", # Description for the inner bar
        leave=False  # Inner bar will be removed upon completion of its loop
    ):
        user_prompt_content = USER_PROMPT_TEMPLATE.format(
            schema_string=candidate_schema_sql,
            nl_query=current_nl_query_text
        )
        p_yes_score = -1.0

        try:
            p_yes_score = get_yes_probability(
                model, tokenizer, SYSTEM_PROMPT, user_prompt_content, YES_TOKEN_ID, NO_TOKEN_ID
            )
        except Exception as e:
            # Use tqdm.write for error messages occurring inside the inner loop
            tqdm.write(f"    ERROR: Exception in get_yes_probability for Query ID '{experiment_query_id}' with DB '{candidate_db_id}'.")
            tqdm.write(f"    Exception type: {type(e).__name__}, Message: {e}")
            # if you need full traceback for debugging, tqdm.write(traceback.format_exc()) might work,
            # but it can be very verbose. Printing to a log file is better for extensive tracebacks.
            # traceback.print_exc() # This will print to stderr and might still mess with tqdm display

        scores_for_current_query.append({
            'candidate_db_id': candidate_db_id,
            'p_yes_score': p_yes_score
        })

        # --- Optional: For debugging, print scores for the VERY FIRST query only ---
        # if print_debug_scores_for_first_query_only and query_idx == 0:
        #     tqdm.write(f"    DB: {candidate_db_id}, Score: {p_yes_score:.4f}") # Incremental print with tqdm.write
        # --- End Optional Debug Print ---

    ranked_databases_for_query = sorted(scores_for_current_query, key=lambda x: x['p_yes_score'], reverse=True)

    # --- Optional: For debugging, print sorted scores for the VERY FIRST query only ---
    # if print_debug_scores_for_first_query_only and query_idx == 0:
    #     tqdm.write(f"  --- Sorted Ranked Databases for First Query: '{current_nl_query_text}' (Top 10) ---")
    #     for rank_info in ranked_databases_for_query[:10]:
    #         tqdm.write(f"    Ranked DB: {rank_info['candidate_db_id']}, Score: {rank_info['p_yes_score']:.4f}")
    # --- End Optional Debug Print ---

    experiment_all_query_results.append({
        'experiment_query_id': experiment_query_id,
        'nl_query_text': current_nl_query_text,
        'true_db_id': true_db_id_for_query,
        'ranked_databases_with_scores': ranked_databases_for_query
    })

    # --- 3.3. Periodic Saving of Results ---
    if (query_idx + 1) % 1 == 0 or (query_idx + 1) == len(selected_nl_queries):
        try:
            with open(EXPERIMENT_RESULTS_FILE, 'w') as f_out:
                json.dump(experiment_all_query_results, f_out, indent=2)
            # Use tqdm.write for save messages that occur between outer loop iterations
            tqdm.write(f"  Successfully saved intermediate results for {len(experiment_all_query_results)} queries to {EXPERIMENT_RESULTS_FILE}")
        except Exception as e:
            tqdm.write(f"  ERROR: Could not save intermediate results: {e}")

# --- 3.4. Experiment Loop Completion ---
# These final prints are after all tqdm loops are done, so standard print is fine.
print("\n--- Experiment Loop Finished ---")
if experiment_all_query_results:
    print(f"Processed {len(experiment_all_query_results)} queries in total.")
    try:
        with open(EXPERIMENT_RESULTS_FILE, 'w') as f_out:
            json.dump(experiment_all_query_results, f_out, indent=2)
        print(f"Final results comprehensively saved to {EXPERIMENT_RESULTS_FILE}")
    except Exception as e:
        print(f"ERROR: Could not save final results: {e}")
else:
    print("No results were generated from the experiment. Check logs for errors.")


--- Starting Experiment: 100 Random Queries vs. 166 Total DB Schemas ---


Processing NL Queries:   0%|          | 0/100 [00:00<?, ?it/s]


Processing Query 1/100 (ID: spider_dev_q0_idx0): 'What are the names and release years for all the songs of the youngest singer?' (True DB: concert_singer)


  DBs for Q:spider_dev_q0_idx0:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 1 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 2/100 (ID: spider_dev_q1_idx1): 'What are names of countries with the top 3 largest population?' (True DB: world_1)


  DBs for Q:spider_dev_q1_idx1:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 2 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 3/100 (ID: spider_dev_q2_idx2): 'What are the names and birth dates of people, ordered by their names in alphabetical order?' (True DB: poker_player)


  DBs for Q:spider_dev_q2_idx2:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 3 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 4/100 (ID: spider_dev_q3_idx3): 'How many different store locations are there?' (True DB: employee_hire_evaluation)


  DBs for Q:spider_dev_q3_idx3:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 4 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 5/100 (ID: spider_dev_q4_idx4): 'How many different nationalities do conductors have?' (True DB: orchestra)


  DBs for Q:spider_dev_q4_idx4:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 5 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 6/100 (ID: spider_dev_q5_idx5): 'How many states are there?' (True DB: voter_1)


  DBs for Q:spider_dev_q5_idx5:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 6 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 7/100 (ID: spider_dev_q6_idx6): 'What are the codes of template types that have fewer than 3 templates?' (True DB: cre_Doc_Template_Mgt)


  DBs for Q:spider_dev_q6_idx6:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 7 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 8/100 (ID: spider_dev_q7_idx7): 'How many dogs have not gone through any treatment?' (True DB: dog_kennels)


  DBs for Q:spider_dev_q7_idx7:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 8 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 9/100 (ID: spider_dev_q8_idx8): 'What are the template ids of any templates used in more than a single document?' (True DB: cre_Doc_Template_Mgt)


  DBs for Q:spider_dev_q8_idx8:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 9 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 10/100 (ID: spider_dev_q9_idx9): 'Show name, country, age for all singers ordered by age from the oldest to the youngest.' (True DB: concert_singer)


  DBs for Q:spider_dev_q9_idx9:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 10 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 11/100 (ID: spider_dev_q10_idx10): 'Show the student IDs and numbers of friends corresponding to each.' (True DB: network_1)


  DBs for Q:spider_dev_q10_idx10:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 11 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 12/100 (ID: spider_dev_q11_idx11): 'What are flight numbers of flights arriving at City "Aberdeen"?' (True DB: flight_2)


  DBs for Q:spider_dev_q11_idx11:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 12 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 13/100 (ID: spider_dev_q12_idx12): 'How many countries speak both English and Dutch?' (True DB: world_1)


  DBs for Q:spider_dev_q12_idx12:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 13 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 14/100 (ID: spider_dev_q13_idx13): 'What are the notes of the death events which has substring 'East'?' (True DB: battle_death)


  DBs for Q:spider_dev_q13_idx13:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 14 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 15/100 (ID: spider_dev_q14_idx14): 'What are the names of conductors as well as the corresonding orchestras that they have conducted?' (True DB: orchestra)


  DBs for Q:spider_dev_q14_idx14:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 15 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 16/100 (ID: spider_dev_q15_idx15): 'List the earnings of poker players in descending order.' (True DB: poker_player)


  DBs for Q:spider_dev_q15_idx15:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 16 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 17/100 (ID: spider_dev_q16_idx16): 'Which owner has paid the largest amount of money in total for their dogs? Show the owner id and zip code.' (True DB: dog_kennels)


  DBs for Q:spider_dev_q16_idx16:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 17 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 18/100 (ID: spider_dev_q17_idx17): 'Find the number of flights landing in the city of Aberdeen or Abilene.' (True DB: flight_2)


  DBs for Q:spider_dev_q17_idx17:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 18 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 19/100 (ID: spider_dev_q18_idx18): 'How many pets have a greater weight than 10?' (True DB: pets_1)


  DBs for Q:spider_dev_q18_idx18:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 19 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 20/100 (ID: spider_dev_q19_idx19): 'Show different citizenships and the maximum net worth of singers of each citizenship.' (True DB: singer)


  DBs for Q:spider_dev_q19_idx19:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 20 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 21/100 (ID: spider_dev_q20_idx20): 'What are the population, name and leader of the country with the largest area?' (True DB: world_1)


  DBs for Q:spider_dev_q20_idx20:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 21 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 22/100 (ID: spider_dev_q21_idx21): 'For each semester, what is the name and id of the one with the most students registered?' (True DB: student_transcripts_tracking)


  DBs for Q:spider_dev_q21_idx21:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 22 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 23/100 (ID: spider_dev_q22_idx22): 'Show paragraph details for paragraph with text 'Korea ' .' (True DB: cre_Doc_Template_Mgt)


  DBs for Q:spider_dev_q22_idx22:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 23 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 24/100 (ID: spider_dev_q23_idx23): 'Return the number of  airports.' (True DB: flight_2)


  DBs for Q:spider_dev_q23_idx23:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 24 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 25/100 (ID: spider_dev_q24_idx24): 'Find the average age of losers and winners of all matches.' (True DB: wta_1)


  DBs for Q:spider_dev_q24_idx24:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 25 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 26/100 (ID: spider_dev_q25_idx25): 'What is the most commmon hometowns for teachers?' (True DB: course_teach)


  DBs for Q:spider_dev_q25_idx25:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 26 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 27/100 (ID: spider_dev_q26_idx26): 'List the title of all cartoons in alphabetical order.' (True DB: tvshow)


  DBs for Q:spider_dev_q26_idx26:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 27 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 28/100 (ID: spider_dev_q27_idx27): 'What is the maximum miles per gallon of the car with 8 cylinders or produced before 1980 ?' (True DB: car_1)


  DBs for Q:spider_dev_q27_idx27:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 28 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 29/100 (ID: spider_dev_q28_idx28): 'What is the average transcript date?' (True DB: student_transcripts_tracking)


  DBs for Q:spider_dev_q28_idx28:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 29 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 30/100 (ID: spider_dev_q29_idx29): 'What is the total population of Gelderland district?' (True DB: world_1)


  DBs for Q:spider_dev_q29_idx29:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 30 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 31/100 (ID: spider_dev_q30_idx30): 'Return the money rank of the player with the greatest earnings.' (True DB: poker_player)


  DBs for Q:spider_dev_q30_idx30:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 31 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 32/100 (ID: spider_dev_q31_idx31): 'What are the names of the sections in reverse alphabetical order?' (True DB: student_transcripts_tracking)


  DBs for Q:spider_dev_q31_idx31:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 32 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 33/100 (ID: spider_dev_q32_idx32): 'What languages are only used by a single country with a republic government?' (True DB: world_1)


  DBs for Q:spider_dev_q32_idx32:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 33 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 34/100 (ID: spider_dev_q33_idx33): 'Return the id of the document with the fewest paragraphs.' (True DB: cre_Doc_Template_Mgt)


  DBs for Q:spider_dev_q33_idx33:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 34 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 35/100 (ID: spider_dev_q34_idx34): 'What are the descriptions for all the math courses?' (True DB: student_transcripts_tracking)


  DBs for Q:spider_dev_q34_idx34:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 35 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 36/100 (ID: spider_dev_q35_idx35): 'Find the first name of students who have both cat and dog pets .' (True DB: pets_1)


  DBs for Q:spider_dev_q35_idx35:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 36 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 37/100 (ID: spider_dev_q36_idx36): 'Find the type and weight of the youngest pet.' (True DB: pets_1)


  DBs for Q:spider_dev_q36_idx36:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 37 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 38/100 (ID: spider_dev_q37_idx37): 'Find the role, street, city and state of the professionals living in a city that contains the substring 'West'.' (True DB: dog_kennels)


  DBs for Q:spider_dev_q37_idx37:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 38 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 39/100 (ID: spider_dev_q38_idx38): 'What is the code of airport that has the highest number of flights?' (True DB: flight_2)


  DBs for Q:spider_dev_q38_idx38:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 39 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 40/100 (ID: spider_dev_q39_idx39): 'What is the Package Option of TV Channel with serial name "Sky Radio"?' (True DB: tvshow)


  DBs for Q:spider_dev_q39_idx39:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 40 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 41/100 (ID: spider_dev_q40_idx40): 'Tell me the age of the oldest dog.' (True DB: dog_kennels)


  DBs for Q:spider_dev_q40_idx40:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 41 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 42/100 (ID: spider_dev_q41_idx41): 'What are the countries that have greater surface area than any country in Europe?' (True DB: world_1)


  DBs for Q:spider_dev_q41_idx41:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 42 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 43/100 (ID: spider_dev_q42_idx42): 'Find the name and location of the stadiums which some concerts happened in the years of both 2014 and 2015.' (True DB: concert_singer)


  DBs for Q:spider_dev_q42_idx42:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 43 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 44/100 (ID: spider_dev_q43_idx43): 'List the names and birth dates of people in ascending alphabetical order of name.' (True DB: poker_player)


  DBs for Q:spider_dev_q43_idx43:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 44 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 45/100 (ID: spider_dev_q44_idx44): 'How many cities in each district have a population that is above the average population across all cities?' (True DB: world_1)


  DBs for Q:spider_dev_q44_idx44:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 45 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 46/100 (ID: spider_dev_q45_idx45): 'Return the nationalities for which there are two or more people.' (True DB: poker_player)


  DBs for Q:spider_dev_q45_idx45:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 46 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 47/100 (ID: spider_dev_q46_idx46): 'How many documents do we have?' (True DB: cre_Doc_Template_Mgt)


  DBs for Q:spider_dev_q46_idx46:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 47 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 48/100 (ID: spider_dev_q47_idx47): 'Tell me the owner id and last name of the owner who spent the most on treatments of his or her dogs.' (True DB: dog_kennels)


  DBs for Q:spider_dev_q47_idx47:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 48 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 49/100 (ID: spider_dev_q48_idx48): 'What are the names and descriptions for all the sections?' (True DB: student_transcripts_tracking)


  DBs for Q:spider_dev_q48_idx48:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 49 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 50/100 (ID: spider_dev_q49_idx49): 'How many available features are there in total?' (True DB: real_estate_properties)


  DBs for Q:spider_dev_q49_idx49:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 50 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 51/100 (ID: spider_dev_q50_idx50): 'What are the birth year and citizenship of singers?' (True DB: singer)


  DBs for Q:spider_dev_q50_idx50:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 51 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 52/100 (ID: spider_dev_q51_idx51): 'How many matches were played in each year?' (True DB: wta_1)


  DBs for Q:spider_dev_q51_idx51:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 52 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 53/100 (ID: spider_dev_q52_idx52): 'Which airlines have at least 10 flights?' (True DB: flight_2)


  DBs for Q:spider_dev_q52_idx52:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 53 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 54/100 (ID: spider_dev_q53_idx53): 'What is the name and id of the department with the most number of degrees ?' (True DB: student_transcripts_tracking)


  DBs for Q:spider_dev_q53_idx53:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 54 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 55/100 (ID: spider_dev_q54_idx54): 'What are the ids for templates that are not used in any documents?' (True DB: cre_Doc_Template_Mgt)


  DBs for Q:spider_dev_q54_idx54:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 55 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 56/100 (ID: spider_dev_q55_idx55): 'What is the last transcript release date?' (True DB: student_transcripts_tracking)


  DBs for Q:spider_dev_q55_idx55:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 56 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 57/100 (ID: spider_dev_q56_idx56): 'What are the names of the teachers ordered by ascending age?' (True DB: course_teach)


  DBs for Q:spider_dev_q56_idx56:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 57 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 58/100 (ID: spider_dev_q57_idx57): 'Show the date and id of the transcript with at least 2 course results.' (True DB: student_transcripts_tracking)


  DBs for Q:spider_dev_q57_idx57:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 58 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 59/100 (ID: spider_dev_q58_idx58): 'What are the names of conductors whose nationalities are not "USA"?' (True DB: orchestra)


  DBs for Q:spider_dev_q58_idx58:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 59 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 60/100 (ID: spider_dev_q59_idx59): 'What are the names of students who have no friends?' (True DB: network_1)


  DBs for Q:spider_dev_q59_idx59:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 60 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 61/100 (ID: spider_dev_q60_idx60): 'What is the count of the car models produced in the United States?' (True DB: car_1)


  DBs for Q:spider_dev_q60_idx60:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 61 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 62/100 (ID: spider_dev_q61_idx61): 'What are the names of countries that speak more than 2 languages, as well as how many languages they speak?' (True DB: world_1)


  DBs for Q:spider_dev_q61_idx61:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 62 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 63/100 (ID: spider_dev_q62_idx62): 'Give the name, population, and head of state for the country that has the largest area.' (True DB: world_1)


  DBs for Q:spider_dev_q62_idx62:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 63 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 64/100 (ID: spider_dev_q63_idx63): 'How many different results are there for the battles?' (True DB: battle_death)


  DBs for Q:spider_dev_q63_idx63:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 64 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 65/100 (ID: spider_dev_q64_idx64): 'What are the names of the countries with no car makers?' (True DB: car_1)


  DBs for Q:spider_dev_q64_idx64:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 65 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 66/100 (ID: spider_dev_q65_idx65): 'What is the language that is used by the largest number of Asian nations?' (True DB: world_1)


  DBs for Q:spider_dev_q65_idx65:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 66 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 67/100 (ID: spider_dev_q66_idx66): 'Return the record companies of orchestras, sorted descending by the years in which they were founded.' (True DB: orchestra)


  DBs for Q:spider_dev_q66_idx66:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 67 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 68/100 (ID: spider_dev_q67_idx67): 'What are the names of properties that are either houses or apartments with more than 1 room?' (True DB: real_estate_properties)


  DBs for Q:spider_dev_q67_idx67:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 68 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 69/100 (ID: spider_dev_q68_idx68): 'find the package option of the tv channel that do not have any cartoon directed by Ben Jones.' (True DB: tvshow)


  DBs for Q:spider_dev_q68_idx68:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 69 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 70/100 (ID: spider_dev_q69_idx69): 'List the last name of the owner owning the youngest dog.' (True DB: dog_kennels)


  DBs for Q:spider_dev_q69_idx69:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 70 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 71/100 (ID: spider_dev_q70_idx70): 'What is the name and capacity of the stadium with the most concerts after 2013 ?' (True DB: concert_singer)


  DBs for Q:spider_dev_q70_idx70:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 71 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 72/100 (ID: spider_dev_q71_idx71): 'What are the names of conductors who have conducted orchestras founded after the year 2008?' (True DB: orchestra)


  DBs for Q:spider_dev_q71_idx71:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 72 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 73/100 (ID: spider_dev_q72_idx72): 'How many cars has over 6 cylinders?' (True DB: car_1)


  DBs for Q:spider_dev_q72_idx72:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 73 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 74/100 (ID: spider_dev_q73_idx73): 'What is the number of car models that are produced by each maker and what is the id and full name of each maker?' (True DB: car_1)


  DBs for Q:spider_dev_q73_idx73:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 74 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 75/100 (ID: spider_dev_q74_idx74): 'What are the names of conductors, sorted descending by the number of years they have worked?' (True DB: orchestra)


  DBs for Q:spider_dev_q74_idx74:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 75 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 76/100 (ID: spider_dev_q75_idx75): 'What are the distinct template type descriptions for the templates ever used by any document?' (True DB: cre_Doc_Template_Mgt)


  DBs for Q:spider_dev_q75_idx75:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 76 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 77/100 (ID: spider_dev_q76_idx76): 'What are the names of poker players, ordered ascending by the number of final tables they have made?' (True DB: poker_player)


  DBs for Q:spider_dev_q76_idx76:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 77 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 78/100 (ID: spider_dev_q77_idx77): 'What is the maximum mpg of the cars that had 8 cylinders or that were produced before 1980 ?' (True DB: car_1)


  DBs for Q:spider_dev_q77_idx77:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 78 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 79/100 (ID: spider_dev_q78_idx78): 'What is the TV Channel of TV series with Episode "A Love of a Lifetime"? List the TV Channel's series name.' (True DB: tvshow)


  DBs for Q:spider_dev_q78_idx78:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 79 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 80/100 (ID: spider_dev_q79_idx79): 'What is the lowest grade of students who do not have any friends?' (True DB: network_1)


  DBs for Q:spider_dev_q79_idx79:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 80 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 81/100 (ID: spider_dev_q80_idx80): 'Show the name and theme for all concerts and the number of singers in each concert.' (True DB: concert_singer)


  DBs for Q:spider_dev_q80_idx80:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 81 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 82/100 (ID: spider_dev_q81_idx81): 'What is the total ticket expense of the visitors whose membership level is 1?' (True DB: museum_visit)


  DBs for Q:spider_dev_q81_idx81:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 82 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 83/100 (ID: spider_dev_q82_idx82): 'Which professionals have done at least two types of treatments? List the professional id and cell phone.' (True DB: dog_kennels)


  DBs for Q:spider_dev_q82_idx82:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 83 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 84/100 (ID: spider_dev_q83_idx83): 'How many countries does each continent have? List the continent id, continent name and the number of countries.' (True DB: car_1)


  DBs for Q:spider_dev_q83_idx83:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 84 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 85/100 (ID: spider_dev_q84_idx84): 'What is the average, minimum, and maximum age for all French singers?' (True DB: concert_singer)


  DBs for Q:spider_dev_q84_idx84:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 85 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 86/100 (ID: spider_dev_q85_idx85): 'What is the car model with the highest mpg ?' (True DB: car_1)


  DBs for Q:spider_dev_q85_idx85:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 86 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 87/100 (ID: spider_dev_q86_idx86): 'How many flights do we have?' (True DB: flight_2)


  DBs for Q:spider_dev_q86_idx86:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 87 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 88/100 (ID: spider_dev_q87_idx87): 'What is the series name of the TV Channel that shows the cartoon "The Rise of the Blue Beetle"?' (True DB: tvshow)


  DBs for Q:spider_dev_q87_idx87:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 88 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 89/100 (ID: spider_dev_q88_idx88): 'What is the name of the conductor who has conducted the most orchestras?' (True DB: orchestra)


  DBs for Q:spider_dev_q88_idx88:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 89 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 90/100 (ID: spider_dev_q89_idx89): 'How many battles did not lose any ship with tonnage '225'?' (True DB: battle_death)


  DBs for Q:spider_dev_q89_idx89:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 90 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 91/100 (ID: spider_dev_q90_idx90): 'Of all the contestants who got voted, what is the contestant number and name of the contestant who got least votes?' (True DB: voter_1)


  DBs for Q:spider_dev_q90_idx90:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 91 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 92/100 (ID: spider_dev_q91_idx91): 'What are the regions that use English or Dutch?' (True DB: world_1)


  DBs for Q:spider_dev_q91_idx91:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 92 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 93/100 (ID: spider_dev_q92_idx92): 'Find the average number of staff working for the museums that were open before 2009.' (True DB: museum_visit)


  DBs for Q:spider_dev_q92_idx92:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 93 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 94/100 (ID: spider_dev_q93_idx93): 'Find the total number of players.' (True DB: wta_1)


  DBs for Q:spider_dev_q93_idx93:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 94 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 95/100 (ID: spider_dev_q94_idx94): 'Sort all the shops by number products in descending order, and return the name, location and district of each shop.' (True DB: employee_hire_evaluation)


  DBs for Q:spider_dev_q94_idx94:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 95 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 96/100 (ID: spider_dev_q95_idx95): 'Describe the section h.' (True DB: student_transcripts_tracking)


  DBs for Q:spider_dev_q95_idx95:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 96 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 97/100 (ID: spider_dev_q96_idx96): 'Count the number of paragraphs.' (True DB: cre_Doc_Template_Mgt)


  DBs for Q:spider_dev_q96_idx96:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 97 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 98/100 (ID: spider_dev_q97_idx97): 'Find the number of pets whose weight is heavier than 10.' (True DB: pets_1)


  DBs for Q:spider_dev_q97_idx97:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 98 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 99/100 (ID: spider_dev_q98_idx98): 'What are the record companies that are used by both orchestras founded before 2003 and those founded after 2003?' (True DB: orchestra)


  DBs for Q:spider_dev_q98_idx98:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 99 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

Processing Query 100/100 (ID: spider_dev_q99_idx99): 'What are the ids and makers of all car makers that produce at least 2 models and make more than 3 cars?' (True DB: car_1)


  DBs for Q:spider_dev_q99_idx99:   0%|          | 0/166 [00:00<?, ?it/s]

  Successfully saved intermediate results for 100 queries to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json

--- Experiment Loop Finished ---
Processed 100 queries in total.
Final results comprehensively saved to /raid/infolab/gaurav/Llama_Spider_A100_Project/randomQ_allDBs_run1/spider_random_query_all_db_scores.json


In [None]:
import os
import json

# Path where the evaluation summary (Recall@K results) will be saved
EVAL_RESULTS_SAVE_PATH = "recall_k_results.json"

# --- 4.1. Define Recall@K Calculation Function ---
def calculate_recall_at_k_metric(all_query_results_list, k_values_list):
    """
    Calculates Recall@K for a list of K values.
    Each item in all_query_results_list should be a dictionary with:
        'true_db_id': The ground truth database ID for the query.
        'ranked_databases_with_scores': A list of {'candidate_db_id': id, 'p_yes_score': score},
                                         sorted by score in descending order.
    """
    recall_counts = {k: 0 for k in k_values_list}  # Stores how many times true_db was in top K
    total_valid_queries = 0  # Queries for which we have a true_db_id

    if not all_query_results_list:
        return {k: 0.0 for k in k_values_list}, 0

    for query_result in all_query_results_list:
        true_db = query_result.get('true_db_id')
        ranked_dbs_info = query_result.get('ranked_databases_with_scores')

        if true_db is None or ranked_dbs_info is None:
            print(f"Warning: Skipping query result due to missing 'true_db_id' or 'ranked_databases_with_scores': "
                  f"{query_result.get('experiment_query_id', 'Unknown Query')}")
            continue  # Skip if essential information is missing

        total_valid_queries += 1
        # Extract just the DB IDs from the ranked list
        ranked_db_ids_only = [item['candidate_db_id'] for item in ranked_dbs_info]

        for k in k_values_list:
            # Get the top K predicted database IDs
            top_k_predicted_dbs = ranked_db_ids_only[:k]
            if true_db in top_k_predicted_dbs:
                recall_counts[k] += 1

    # Calculate final recall percentages
    recall_percentages = {}
    if total_valid_queries > 0:
        for k in k_values_list:
            recall_percentages[k] = (recall_counts[k] / total_valid_queries) * 100.0  # As percentage
    else:
        recall_percentages = {k: 0.0 for k in k_values_list}

    return recall_percentages, total_valid_queries


# --- 4.2. Perform Evaluation ---
# Load results if this cell is run in a new session and experiment_all_query_results isn't in memory
# (assuming results were saved to EXPERIMENT_RESULTS_FILE)
loaded_results_for_eval = None
if 'experiment_all_query_results' in globals() and experiment_all_query_results:
    print("Using in-memory experiment_all_query_results for evaluation.")
    loaded_results_for_eval = experiment_all_query_results
elif os.path.exists(EXPERIMENT_RESULTS_FILE):
    print(f"Loading results from {EXPERIMENT_RESULTS_FILE} for evaluation...")
    try:
        with open(EXPERIMENT_RESULTS_FILE, 'r') as f_in:
            loaded_results_for_eval = json.load(f_in)
        print(f"Successfully loaded {len(loaded_results_for_eval)} results from file.")
    except Exception as e:
        print(f"Error loading results from file for evaluation: {e}")
else:
    print("No results available in memory or in the specified results file for evaluation.")

if loaded_results_for_eval:
    K_VALUES_TO_EVALUATE = [1, 3, 5, 10]  # Define the K values you care about
    recall_scores_map, num_queries_evaluated = calculate_recall_at_k_metric(
        loaded_results_for_eval, K_VALUES_TO_EVALUATE
    )

    print("\n--- Evaluation: Recall@K ---")
    print(f"Evaluated on {num_queries_evaluated} queries.")
    for k_val, recall_val in recall_scores_map.items():
        print(f"Recall@{k_val}: {recall_val:.2f}%")

    # --- 4.2.1. Save evaluation results to a JSON file ---
    try:
        eval_summary = {
            "num_queries_evaluated": num_queries_evaluated,
            "recall_scores": recall_scores_map
        }
        with open(EVAL_RESULTS_SAVE_PATH, 'w') as fout:
            json.dump(eval_summary, fout, indent=2)
        print(f"Saved evaluation results to '{EVAL_RESULTS_SAVE_PATH}'")
    except Exception as save_err:
        print(f"Error saving evaluation results: {save_err}")

    # --- 4.3. Optional: Print Detailed Results for a Few Queries ---
    print("\n--- Sample Detailed Query Results (Top 5 Queries) ---")
    for i, res in enumerate(loaded_results_for_eval[:5]):  # Show for first 5 queries
        print(f"\nQuery {i+1}: '{res.get('nl_query_text', '<no text>')}' (True DB: {res.get('true_db_id')})")
        print("  Top Ranked Databases (with P(Yes) scores):")
        for rank, db_info in enumerate(res.get('ranked_databases_with_scores', [])[:5]):  # Show top 5 ranked DBs
            is_true_db_char = "*" if db_info['candidate_db_id'] == res['true_db_id'] else " "
            print(f"    {rank+1}. {db_info['candidate_db_id']}{is_true_db_char} "
                  f"(Score: {db_info['p_yes_score']:.4f})")
else:
    print("Cannot perform evaluation as no results were loaded or generated.")
