# Experiment Runner: HLE Analysis with DeepSeek

このノートブックは設定ファイルに基づいて実験プロセス全体をオーケストレーションします

## 1. Setup and Configuration

In [15]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pandas as pd
from tqdm.auto import tqdm
from dotenv import load_dotenv

# .envファイルから環境変数を読み込む
load_dotenv('../.env')

hf_token_check = os.getenv('HUGGINGFACE_API_KEY')
print(f"Hugging Face Token Loaded: {'Yes' if hf_token_check else 'No'}")
if hf_token_check:
    # トークン全体は表示せず、最初の数文字だけ表示して確認
    print(f"Token starts with: {hf_token_check[:5]}...") 

# srcディレクトリをパスに追加してモジュールをインポート
sys.path.append('../src')

from utils import load_config, load_prompt_template, ensure_dir
from data_loader import load_hle_dataset
from model_handler import ModelHandler

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Hugging Face Token Loaded: Yes
Token starts with: hf_ag...


In [21]:
CONFIG_FILE = '../configs/deepseek_base.yml'

print(f"Loading configuration from: {CONFIG_FILE}")
config = load_config(CONFIG_FILE)

# Load prompts
mc_prompt_template = load_prompt_template('mc_system_prompt')
em_prompt_template = load_prompt_template('em_system_prompt')

print("Configuration and prompts loaded successfully.")

Loading configuration from: ../configs/deepseek_base.yml
Configuration and prompts loaded successfully.


## 2. Load Dataset

In [17]:
dataset = load_hle_dataset(config['DATASET_NAME'])

# For development, select a subset of the data
num_samples = config.get('NUM_SAMPLES_TO_RUN', len(dataset))
subset_dataset = dataset.select(range(num_samples))

print(f"Dataset loaded. Running on {len(subset_dataset)} samples.")
display(subset_dataset)

Loading dataset: cais/hle (split: test)
Original dataset size: 2500
Filtered to text-only dataset size: 2158
Dataset loaded. Running on 5 samples.


Dataset({
    features: ['id', 'question', 'image', 'image_preview', 'answer', 'answer_type', 'author_name', 'rationale', 'rationale_image', 'raw_subject', 'category', 'canary'],
    num_rows: 5
})

## 3. Load Model and Tokenizer

In [18]:
model_handler = ModelHandler(
    model_name=config['MODEL_NAME'],
    hf_token=os.getenv('HUGGINGFACE_API_KEY')
)

Initializing ModelHandler on device: cpu
Loading tokenizer: deepseek-ai/DeepSeek-R1-0528-Qwen3-8B


Unrecognized keys in `rope_scaling` for 'rope_type'='yarn': {'attn_factor'}


Loading model: deepseek-ai/DeepSeek-R1-0528-Qwen3-8B


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


Model and tokenizer loaded successfully.


## 4. Run Inference Loop

In [20]:
results = []

for example in tqdm(subset_dataset):
    if example['answer_type'] == 'multiple_choice': # Multiple Choice
        prompt_template = mc_prompt_template
    else: # Exact Match
        prompt_template = em_prompt_template
    final_prompt = prompt_template.format(question=example['question'])
    
    # Generate response from the model
    raw_output = model_handler.generate(
        prompt=final_prompt, 
        max_length=config['MAX_LENGTH'], 
        temperature=config['TEMPERATURE']
    )
    
    # Parse the output to extract structured data
    parsed_data = model_handler.parse_output(raw_output)
    
    # Store everything for analysis
    record = {
        "id": example['id'],
        "question": example['question'],
        "ground_truth": example['answer'],
        "model_answer": parsed_data.get('answer'),
        "confidence": parsed_data.get('confidence'),
        "explanation": parsed_data.get('explanation'),
        "think_process": parsed_data.get('think_process'),
        "parse_error": parsed_data.get('error'),
        "raw_output": raw_output
    }
    results.append(record)


  0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

## 5. Save Results

In [None]:
results_df = pd.DataFrame(results)

# Construct a dynamic output path based on the config file name
output_dir = f"../{config['output_path']}/results"
config_name = CONFIG_FILE.split('/')[-1].replace('.yml', '')
output_csv_path = f"{output_dir}/{config_name}_results.csv"

# Ensure the directory exists before saving
ensure_dir(output_csv_path)

# Save to CSV
results_df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')

print(f"Results successfully saved to: {output_csv_path}")
display(results_df.head())