# Setup libraries

In [None]:
# Configure environment paths, then clone the latest GitHub repo version
import os
def configure_environment_paths():
    """Choose data/output directories for the runtime this code executes in.

    Detection order:
      1. Google Colab - the string form of the IPython shell mentions
         ``google.colab``.
      2. Kaggle - the ``KAGGLE_KERNEL_RUN_TYPE`` environment variable is set.
      3. Anything else - local relative paths.

    If ``get_ipython`` is not defined at all, the local layout is used too.
    The output directory is created when it does not exist yet, and both
    paths are printed.

    Returns:
        tuple: ``(base_data_path, base_output_path, environment_name)`` where
        ``environment_name`` is ``"colab"``, ``"kaggle"`` or ``"local"``.
    """
    local_layout = ("./data/", "./output/", "local")

    try:
        # Looking up `get_ipython` raises NameError when the name is undefined.
        shell_description = str(get_ipython())
    except NameError:
        print("‚ö†Ô∏è Non-interactive session. Using local paths.")
        data_dir, output_dir, env_label = local_layout
    else:
        if "google.colab" in shell_description:
            print("‚úÖ Environment: Google Colab")
            data_dir, output_dir, env_label = "/content/", "/content/output/", "colab"
        elif os.environ.get("KAGGLE_KERNEL_RUN_TYPE"):
            print("‚úÖ Environment: Kaggle")
            data_dir, output_dir, env_label = "/kaggle/input/", "/kaggle/working/", "kaggle"
        else:
            print("‚ö†Ô∏è Environment: Local/Unknown")
            data_dir, output_dir, env_label = local_layout

    os.makedirs(output_dir, exist_ok=True)
    print(f"üìÇ Data Path: {data_dir}")
    print(f"üì¶ Output Path: {output_dir}")

    return data_dir, output_dir, env_label

# Resolve the data/output locations once, up front; later cells read
# OUTPUT_PATH when saving and zipping results.
# NOTE(review): in the local case OUTPUT_PATH is relative ("./output/"), so it
# is created here in the starting directory but resolves inside the OuroTrace
# checkout after the %cd below - confirm that is intended.
INPUT_PATH, OUTPUT_PATH, ENV_NAME = configure_environment_paths()

# Remove any previous checkout so the clone below starts from a clean tree.
!rm -r -f OuroTrace
# Fetch the `claude` branch of the project repository.
!git clone --branch claude https://github.com/dzungphieuluuky/OuroTrace.git
# Work from the repository root; later cells import `src.*` and load `configs/...` relative to it.
%cd OuroTrace

In [None]:
# Upgrade pip itself before installing anything else.
!pip install --upgrade pip
# Remove any preinstalled transformers/tokenizers/accelerate; presumably so the
# pinned transformers version below is installed cleanly.
!pip uninstall -y transformers tokenizers accelerate -q
# Pin transformers and protobuf to fixed versions.
!pip install "transformers==4.56.0" "protobuf==5.29.3" -q
!pip install torch datasets -q
!pip install pandas matplotlib seaborn tqdm wandb pyyaml
# accelerate was uninstalled above; it is installed again here together with bitsandbytes.
!pip install bitsandbytes accelerate
# !pip install -r requirements.txt
# Reinstall numpy constrained to <2.0 as the last step, after all other installs.
!pip install --force-reinstall --no-cache-dir "numpy<2.0"

# Config input/output path and clone latest repo

In [None]:
# Suppress warnings for clean output
import warnings
import os
# Hide Python-level UserWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)
# Environment flag for the transformers library; presumably turns off its advisory warnings.
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
# NOTE(review): this cell installs nothing itself; the message refers to the pip cell before it.
print("‚úÖ Packages installed successfully!")

In [None]:
"Built-in libraries"
import re
import sys
import gc
import time
import json
import hashlib
import glob
import zipfile
from io import StringIO
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
import yaml
import logging
import random

"Deep learning and NLP libraries"
import torch
import torch.nn.functional as F
from transformers import (
    AutoConfig, 
    AutoTokenizer, 
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GenerationConfig,
    logging as hf_logging
)

"Data processing libraries"
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import wandb
from tqdm.auto import tqdm
from IPython import get_ipython

# Configure logging
# Only ERROR and above from the "ContinuousBatchingLogger" logger, and error-level
# verbosity for the transformers library's own logging.
logging.getLogger("ContinuousBatchingLogger").setLevel(logging.ERROR)
hf_logging.set_verbosity_error()


# Report the interpreter, PyTorch and CUDA setup this run is using.
print(f"Python Version: {sys.version}")
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
# Shell escape: show GPU status via the NVIDIA command-line tool.
!nvidia-smi

In [None]:
# WANDB CONFIG
# Resolve the Weights & Biases API key.
# On Kaggle the key is read from the notebook's user secrets. The notebook also
# supports Colab and local runs (see configure_environment_paths), where the
# `kaggle_secrets` module is not importable, so fall back to the
# WANDB_API_KEY environment variable instead of crashing on the import.
import os

try:
    from kaggle_secrets import UserSecretsClient
except ImportError:
    user_secrets = None
    secret_value_0 = os.environ.get("WANDB_API_KEY")
else:
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("WANDB_API_KEY")

# If no key was found, `key=None` is passed; presumably wandb then uses its own
# credential lookup or prompts - confirm against the wandb.login documentation.
wandb.login(key=secret_value_0)

In [None]:
import pandas as pd
from src.config_loader import load_config_from_json, post_process_config

# this is the fused version when single and batch use the same predict function
from src.new_runner import run_batch_experiment

# this is the original version when single and batch use different functions
# from src.runner import run_batch_experiment

from src.evaluation import analyze_experiment_results


# 1. Load Configuration from JSON
config = load_config_from_json('configs/batch_ouro_1.4b_thinking.json')

# 2. Post-process (Convert 'torch.float16' string to object, generate timestamps)
config = post_process_config(config)

# 3. Per-run overrides applied on top of the loaded config
# Only a single inference-step setting (4) is evaluated in this run.
config["INFERENCE_STEPS"] = [4]
# config['MODEL']['dtype'] = torch.bfloat16
# Limit the "reasoning_primitives" section to 5 samples.
config["reasoning_primitives"]["num_samples"] = 5

# 4. Execute
# This timestamp names the run folder and result files written by the save cell.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
print(f"üïí Timestamp: {timestamp}")
print("üöÄ Starting Experiment...")
# Returns three result collections: accuracy, perplexity and "hol" results
# (each is turned into a DataFrame by the save cell).
acc_results, ppl_results, hol_results = run_batch_experiment(config)

In [None]:
# Save Results
# Turn each result collection into a DataFrame.
df_acc = pd.DataFrame(acc_results)
df_ppl = pd.DataFrame(ppl_results)
df_hol = pd.DataFrame(hol_results)

# Every artefact of this run lives in one timestamped folder under OUTPUT_PATH.
RUN_RESULTS_NAME = f"run_{timestamp}"
_run_dir = os.path.join(OUTPUT_PATH, RUN_RESULTS_NAME)
os.makedirs(_run_dir, exist_ok=True)

acc_path = os.path.join(_run_dir, f"result_stats_{timestamp}.csv")
ppl_path = os.path.join(_run_dir, f"result_ppl_{timestamp}.csv")
hol_path = os.path.join(_run_dir, f"result_hol_{timestamp}.csv")
cfg_path = os.path.join(_run_dir, f"result_config_{timestamp}.yaml")

# The accuracy table is always written; the other two only when they hold rows.
df_acc.to_csv(acc_path, index=False)
for _frame, _target in ((df_ppl, ppl_path), (df_hol, hol_path)):
    if _frame.empty:
        continue
    _frame.to_csv(_target, index=False)
# Save Config
def sanitize_config(cfg):
    """Return a YAML-safe copy of a (possibly nested) config mapping.

    Any value whose type's string form contains ``torch.`` (for example a
    torch dtype) is replaced by ``str(value)``. Nested dicts are processed
    recursively, and so are lists, so torch objects stored inside a list
    (or inside dicts within a list) are converted as well. All other values
    are passed through unchanged.

    Args:
        cfg: Mapping of config keys to values.

    Returns:
        dict: A new dict; the input mapping and its nested containers are
        not modified.
    """
    def _clean(value):
        # Order matters: dicts first, then torch objects, then plain lists,
        # so a torch-provided list subclass is still stringified as a whole.
        if isinstance(value, dict):
            return sanitize_config(value)
        if 'torch.' in str(type(value)):
            return str(value)
        if isinstance(value, list):
            return [_clean(item) for item in value]
        return value

    return {key: _clean(value) for key, value in cfg.items()}

# Write the sanitized config next to the result CSVs as YAML.
with open(cfg_path, 'w') as f:
    yaml.dump(sanitize_config(config), f)

print(f"\nüíæ Results saved to {OUTPUT_PATH}")

# Visualization & Reporting
# Everything below is skipped when the accuracy table has no rows.
if not df_acc.empty:
    print("\n" + "="*50 + "\nüìä VISUALIZATION\n" + "="*50)
    
    # Summary Tables
    # NOTE: the raw 'acc_results' returned by run_batch_experiment (not the
    # DataFrame 'df_acc') is passed here; presumably that is the form
    # 'analyze_experiment_results' expects - confirm against src.evaluation.
    summary = analyze_experiment_results(acc_results)
    print("\n--- Summary Statistics ---")
    print(summary)
    
    # Plotting
    # Best-effort: any plotting error is reported below and the cell carries on.
    try:
        # One row of three panels: accuracy, inference time, token counts.
        fig, axes = plt.subplots(1, 3, figsize=(18, 5))
        
        # Plot 1: Accuracy
        # Mean of 'is_correct' per (task_type, ut_steps) pair.
        acc_summary = df_acc.groupby(['task_type', 'ut_steps'])['is_correct'].mean().reset_index()
        sns.barplot(data=acc_summary, x='ut_steps', y='is_correct', hue='task_type', ax=axes[0])
        axes[0].set_title('Accuracy by UT Steps')
        axes[0].set_ylabel('Accuracy')
        # Render the y-axis ticks as whole percentages.
        axes[0].yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: '{:.0%}'.format(y)))
        
        # Plot 2: Time
        # Mean of 'generation_time' per (task_type, ut_steps) pair.
        time_summary = df_acc.groupby(['task_type', 'ut_steps'])['generation_time'].mean().reset_index()
        sns.barplot(data=time_summary, x='ut_steps', y='generation_time', hue='task_type', ax=axes[1])
        axes[1].set_title('Inference Time (s) by UT Steps')
        
        # Plot 3: Token Count
        # Box plot over the unaggregated rows, so the spread per step is visible.
        sns.boxplot(data=df_acc, x='ut_steps', y='generated_tokens', hue='task_type', ax=axes[2])
        axes[2].set_title('Generated Tokens Distribution')
        
        plt.tight_layout()
        plt.show()
        
    except Exception as e:
        print(f"‚ö†Ô∏è Visualization error: {e}")
else:
    print("‚ö†Ô∏è No results to visualize.")

print("\nüèÅ Experiment Complete.\n")

In [None]:
import os
import glob
import zipfile

def zip_all_run_folders(output_base_path: str):
    """Create one ZIP archive per ``run_*`` directory in ``output_base_path``.

    The base directory is created if missing. Every directory directly inside
    it whose name matches ``run_*`` is compressed (deflate) into
    ``<output_base_path>/<folder_name>.zip``. Archive entries are stored
    relative to the folder's parent, so each archive contains a top-level
    ``<folder_name>/`` prefix. Only files are written, so empty
    sub-directories do not appear in the archive.

    A failure while zipping one folder is printed and does not stop the
    remaining folders from being processed.

    Args:
        output_base_path: Directory that holds the ``run_*`` result folders.

    Returns:
        None. Progress and a final summary are printed.
    """
    os.makedirs(output_base_path, exist_ok=True)

    candidates = glob.glob(os.path.join(output_base_path, "run_*"))
    # The pattern also matches plain files (e.g. earlier .zip outputs); keep directories only.
    run_dirs = list(filter(os.path.isdir, candidates))

    if len(run_dirs) == 0:
        print(f"‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y th∆∞ m·ª•c n√†o b·∫Øt ƒë·∫ßu b·∫±ng 'run_' trong '{output_base_path}'.")
        return

    print(f"üîç T√¨m th·∫•y {len(run_dirs)} th∆∞ m·ª•c k·∫øt qu·∫£ ƒë·ªÉ n√©n.")

    zipped_count = 0

    for run_dir in run_dirs:
        run_name = os.path.basename(run_dir)
        archive_path = os.path.join(output_base_path, f"{run_name}.zip")
        # Entry names are computed relative to this directory.
        archive_root = os.path.dirname(run_dir)

        try:
            print(f"\n   -> ƒêang n√©n th∆∞ m·ª•c: {run_name}...")

            with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as archive:
                for current_dir, _, filenames in os.walk(run_dir):
                    for filename in filenames:
                        source = os.path.join(current_dir, filename)
                        archive.write(source, os.path.relpath(source, archive_root))

            print(f"   ‚úÖ ƒê√£ t·∫°o file ZIP: {os.path.basename(archive_path)}")
        except Exception as e:
            print(f"   ‚ùå L·ªói khi n√©n th∆∞ m·ª•c {run_name}: {e}")
        else:
            zipped_count += 1

    print(f"\n‚úÖ HO√ÄN T·∫§T! ƒê√£ n√©n th√†nh c√¥ng {zipped_count} tr√™n {len(run_dirs)} th∆∞ m·ª•c k·∫øt qu·∫£.")


# Zip every run folder under OUTPUT_PATH, if that variable was set by the
# path-configuration cell. Any failure is reported rather than raised.
try:
    # Explicit existence check, so a missing OUTPUT_PATH gives a clear message.
    if 'OUTPUT_PATH' in globals():
        zip_all_run_folders(OUTPUT_PATH)
    else:
        print("OUTPUT_PATH not defined.")
        
# NOTE(review): because of the globals() check above, this handler looks
# unreachable for OUTPUT_PATH itself; it would only catch a NameError raised
# from inside zip_all_run_folders - confirm whether it is still needed.
except NameError:
    print("OUTPUT_PATH not defined.")
except Exception as e:
    print(f"ƒê√£ x·∫£y ra l·ªói trong qu√° tr√¨nh n√©n: {e}")

In [None]:
# Manual inspection of the results produced by the save cell
# (requires df_acc and df_ppl to exist).
print("Final Inspection:\n")
print("Top 20 Accuracy Report:\n")
# First 20 rows of the accuracy table.
print(df_acc.head(20))
print(f"Full Response:\n")
# The 'full_response' column on its own.
print(df_acc['full_response'])
print("Perplexity Report:\n")
# First 20 rows of the perplexity table (may be empty; its CSV is only written when non-empty).
print(df_ppl.head(20))

In [None]:
# Show each response alongside its 'generated_tokens' value.
print(df_acc[['full_response', 'generated_tokens']])