# Setup libraries

In [1]:
# Dependency setup, run through the `uv pip` front end as shell commands.
# Upgrade pip itself first.
!uv pip install --upgrade pip
# Remove any preinstalled transformers/tokenizers/accelerate
# (presumably so the pinned transformers below installs cleanly — confirm).
!uv pip uninstall transformers tokenizers accelerate -q

# Exact pins for transformers and protobuf; the remaining packages are unpinned.
!uv pip install "transformers==4.56.0" "protobuf==5.29.3" -q
!uv pip install torch datasets -q
!uv pip install pandas matplotlib seaborn tqdm wandb pyyaml
!uv pip install bitsandbytes accelerate optimum lm_eval[hf]
# Alternative (disabled): install everything from the repo's requirements file.
# !uv pip install -r requirements.txt
# Reinstall numpy last, bypassing the cache, to force a release below 2.0.
!uv pip install --force-reinstall --no-cache-dir "numpy<2.0"

[2mUsing Python 3.11.13 environment at: /usr[0m
[2K[2mResolved [1m1 package[0m [2min 282ms[0m[0m                                          [0m
[2K[2mPrepared [1m1 package[0m [2min 138ms[0m[0m                                              
[2mUninstalled [1m1 package[0m [2min 365ms[0m[0m
[2K[2mInstalled [1m1 package[0m [2min 19ms[0m[0m                                 [0m
 [31m-[39m [1mpip[0m[2m==24.1.2[0m
 [32m+[39m [1mpip[0m[2m==25.3[0m
[2mUsing Python 3.11.13 environment at: /usr[0m
[2mAudited [1m6 packages[0m [2min 131ms[0m[0m
[2mUsing Python 3.11.13 environment at: /usr[0m
[2K[2mResolved [1m110 packages[0m [2min 4.66s[0m[0m                                       [0m
[2K[2mPrepared [1m18 packages[0m [2min 2.55s[0m[0m                                            
[2K[2mInstalled [1m18 packages[0m [2min 302ms[0m[0m                              [0m
 [32m+[39m [1maccelerate[0m[2m==1.12.0[0m
 [32m+[39m [1mbits

# Suppress warnings

In [2]:
# Suppress warnings for clean output
import warnings
import os

# Hide every UserWarning raised from here on in this kernel.
warnings.filterwarnings("ignore", category=UserWarning)
# Environment flag named after transformers' advisory warnings
# (presumably read by the transformers library to silence them — confirm).
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
# NOTE(review): printed unconditionally — this cell does not check that the
# installs in the previous cell actually succeeded.
print("✅ Packages installed successfully!")

✅ Packages installed successfully!


# Import Libraries

In [3]:
"Built-in libraries"
import re
import sys
import gc
import time
import json
import hashlib
import glob
import zipfile
from io import StringIO
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
import yaml
import logging
import random

"Deep learning and NLP libraries"
import torch
import torch.nn.functional as F
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GenerationConfig,
    logging as hf_logging,
)

"Data processing libraries"
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import wandb
from tqdm.auto import tqdm
from IPython import get_ipython

# Configure logging: only ERROR and above from the "ContinuousBatchingLogger"
# logger, and error-only verbosity for the transformers logging helper.
logging.getLogger("ContinuousBatchingLogger").setLevel(logging.ERROR)
hf_logging.set_verbosity_error()


# Report interpreter / framework versions so the run can be reproduced.
print(f"Python Version: {sys.version}")
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
# The CUDA build version is only printed when a CUDA device is usable.
if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
# Runs unconditionally (it is not part of the `if` above).
!nvidia-smi

Python Version: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
PyTorch Version: 2.6.0+cu124
CUDA Available: True
CUDA Version: 12.4
Wed Dec 24 10:00:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8             11W /   70W |       3MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+---

In [4]:
import os


def configure_environment_paths():
    """Detect the runtime environment and choose data/output directories.

    Detection order:
      1. Google Colab - the string form of the IPython shell contains
         "google.colab".
      2. Kaggle - the ``KAGGLE_KERNEL_RUN_TYPE`` environment variable is set.
      3. Anything else - local relative paths.

    A missing ``get_ipython`` name (plain Python process) only rules out
    Colab; the Kaggle check still runs, so a non-interactive Kaggle job is
    no longer misreported as "local".

    Side effects: prints the detected environment and the chosen paths, and
    creates the output directory if it does not exist.

    Returns:
        tuple: ``(base_data_path, base_output_path, environment_name)`` where
        ``environment_name`` is one of ``"colab"``, ``"kaggle"``, ``"local"``.
    """
    # get_ipython is only resolvable when IPython provides or imports it; a
    # NameError here means "no IPython", not "detection failed".
    try:
        shell = get_ipython()
        has_ipython = True
    except NameError:
        shell = None
        has_ipython = False

    if "google.colab" in str(shell):
        print("✅ Environment: Google Colab")
        base_data_path = "/content/"
        base_output_path = "/content/"
        environment_name = "colab"
    elif os.environ.get("KAGGLE_KERNEL_RUN_TYPE"):
        print("✅ Environment: Kaggle")
        base_data_path = "/kaggle/input/"
        base_output_path = "/kaggle/working/"
        environment_name = "kaggle"
    else:
        # Single local fallback; only the message depends on why we got here.
        if has_ipython:
            print("⚠️ Environment: Local/Unknown")
        else:
            print("⚠️ Non-interactive session. Using local paths.")
        base_data_path = "./data/"
        base_output_path = "./output/"
        environment_name = "local"

    os.makedirs(base_output_path, exist_ok=True)
    print(f"📂 Data Path: {base_data_path}")
    print(f"📦 Output Path: {base_output_path}")

    return base_data_path, base_output_path, environment_name


# Resolve the paths once; OUTPUT_PATH and ENV_NAME are read by later cells.
INPUT_PATH, OUTPUT_PATH, ENV_NAME = configure_environment_paths()

✅ Environment: Kaggle
📂 Data Path: /kaggle/input/
📦 Output Path: /kaggle/working/


# Setup WANDB

In [5]:
import os
import wandb

# Log in to Weights & Biases with the API key held in the secret store of the
# detected platform. Both branches are best-effort and handled the same way:
# any failure (missing platform module, missing secret, failed login) is
# reported and the notebook continues.

# 1. Google Colab
if "colab" in ENV_NAME:
    try:
        # Imported inside the try so a missing module is reported like any
        # other failure instead of aborting the cell.
        from google.colab import userdata

        # 'WANDB_API_KEY' must be the exact name in Colab Secrets (the key icon)
        wandb_key = userdata.get("WANDB_API_KEY")
        wandb.login(key=wandb_key)
    except Exception as e:
        print(f"Could not retrieve W&B API key from Colab Secrets: {e}")

# 2. Kaggle
elif "kaggle" in ENV_NAME:
    try:
        from kaggle_secrets import UserSecretsClient

        user_secrets = UserSecretsClient()
        wandb_key = user_secrets.get_secret("WANDB_API_KEY")
        wandb.login(key=wandb_key)
    except Exception as e:
        print(f"Could not retrieve W&B API key from Kaggle Secrets: {e}")

# 3. Any other environment: no login is attempted here
# (presumably an existing W&B login is relied on — confirm).

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdungngocpham171[0m ([33mdungngocpham171-university-of-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Switch to the output path and clone the latest repo

In [6]:
# Prepare for a fresh clone: move to the output root, release cached GPU
# memory, and delete any previous OuroTrace checkout (the clone itself
# happens in the next cell).
%cd {OUTPUT_PATH}
torch.cuda.empty_cache()
!rm -rf OuroTrace

/kaggle/working


In [7]:
# Fetch the 'claude' branch and make the checkout the working directory
# (presumably so the `src.*` imports and the relative `configs/...` path used
# by the benchmark cell resolve — confirm).
!git clone --branch claude https://github.com/dzungphieuluuky/OuroTrace.git
%cd OuroTrace

Cloning into 'OuroTrace'...
remote: Enumerating objects: 2067, done.[K
remote: Counting objects: 100% (284/284), done.[K
remote: Compressing objects: 100% (203/203), done.[K
remote: Total 2067 (delta 144), reused 205 (delta 81), pack-reused 1783 (from 1)[K
Receiving objects: 100% (2067/2067), 3.62 MiB | 16.26 MiB/s, done.
Resolving deltas: 100% (1318/1318), done.
/kaggle/working/OuroTrace


# Run Benchmark

In [12]:
import pandas as pd
from src.config_loader import load_config_from_json, post_process_config

# this is the fused version when single and batch use the same predict function
from src.new_runner import run_batch_experiment

# this is the original version when single and batch use different functions
# from src.runner import run_batch_experiment

from src.evaluation_metrics import analyze_experiment_results


# 1. Load configuration from JSON
config = load_config_from_json("configs/ouro_1.4b_thinking.json")

# 2. Post-process (convert 'torch.float16' string to object, generate timestamps)
config = post_process_config(config)

# 3. Override settings for this run: a single UT-step setting, no batching,
#    no perplexity, and only the p_hop task (every other task gets 0 samples).
config["INFERENCE_STEPS"] = [12]
config["OPTIMIZATION"]["enable_batch"] = False
config["EVAL_SETTINGS"]["calculate_perplexity"] = False
config["DATA"]["n_ary"]["num_samples_per_level"] = 0
config["DATA"]["p_hop"]["hop_levels"] = [24, 32]
config["DATA"]["p_hop"]["num_samples_per_level"] = 100
config["DATA"]["igsm"]["num_samples"] = 0
config["DATA"]["reasoning_primitives"]["num_samples"] = 0

# 4. Execute
print("🚀 Starting Experiment...")

# Drop a model/tokenizer left over from an earlier run, if any. Using pop()
# instead of `del` means a fresh kernel (names undefined) no longer raises,
# so the memory cleanup below always runs instead of being skipped.
globals().pop("model", None)
globals().pop("tokenizer", None)
gc.collect()  # collect first so the freed tensors can actually be released
torch.cuda.empty_cache()

# Start from empty results so later cells see "no results" rather than a
# NameError or stale data from a previous run when the experiment fails.
# (Assumes the results are list-like records, as later cells pass them to
# pd.DataFrame — confirm against run_batch_experiment.)
acc_results, ppl_results, hol_results = [], [], []
try:
    acc_results, ppl_results, hol_results = run_batch_experiment(config)
except Exception as e:
    print(f"An unexpected error occurred: {e}")

🚀 Starting Experiment...
🔗 Initializing W&B (timeout: 30s)...


✅ W&B initialized

🔧 EXPERIMENT CONFIGURATION
Model Path: ByteDance/Ouro-1.4B-Thinking
UT Steps to Test: [12]
Data Type: torch.bfloat16
4-bit Quantization: False
Torch Compile: False
Max Batch Size: 8
Max New Tokens: 16
Batching: True
Calculate Perplexity: False
Early Exit: 1.0

[+] Quality monitor initialized:
    → Garbage threshold: 30%
    → Example similarity threshold: 85%
    → Min samples before check: 10
🎲 Random seed set to 42

📦 LOADING TEST DATASETS
⚙️ Generating new test datasets...
✅ Generated test datasets

Dataset Summary:
   n_ary       :    0 samples
   p_hop       :  200 samples
   igsm        :    0 samples


📋 PAPER COMPLIANCE CHECK
Task Alignment: {'has_n_ary': True, 'has_p_hop': True, 'has_igsm': True, 'all_paper_tasks': True}
UT Steps Coverage: {'min_ut': 12, 'max_ut': 12, 'covers_baseline': False, 'covers_paper_range': False, 'recommended_range': [1, 2, 4, 8]}

✅ Configuration saved to ../results_20251224_112545_UT_12/config.json
✅ Task templates saved to ../re

   p_hop:   0%|          | 0/50 [00:00<?, ?it/s]

                                          test_input     full_response  \
0  Sequence: A D D B D B A D D B D C A C C B D A ...  [FINAL]  A [END]   

   generated_tokens  
0                 4  
                                          test_input     full_response  \
0  Sequence: A C C B B A D B C C D B C C B D C A ...  [FINAL]  A [END]   

   generated_tokens  
0                 4  
                                          test_input     full_response  \
0  Sequence: A A C C D A C D B B D C A B A A C B ...  [FINAL]  A [END]   

   generated_tokens  
0                 4  
                                          test_input     full_response  \
0  Sequence: B A A D A C B A D D A B B D A D A C ...  [FINAL]  A [END]   

   generated_tokens  
0                 4  
❌ Aborting due to repeated outputs...

⚠️ Batch 2 failed: Experiment failed: 5 repeated outputs
❌ Aborting due to repeated outputs...
⚠️ Item failed: Experiment failed: 5 repeated outputs
                                        

✅ W&B session closed
🧹 Cleaning up GPU memory...
✅ GPU memory freed

✅ Periodic save: simple reasoning results to ../results_20251224_112545_UT_12/simple_reasoning.csv
✅ Configuration saved to ../results_20251224_112545_UT_12/config.json
✅ Task templates saved to ../results_20251224_112545_UT_12/task_templates.json


In [13]:
import os
import glob
import zipfile
from typing import List


def find_result_folders(base_path: str) -> List[str]:
    """
    Return the directories directly under `base_path` whose names start
    with 'results_', sorted by path.

    Each returned path is `base_path` joined with the folder name, so it is
    absolute only when `base_path` itself is absolute. Regular files that
    match the prefix are ignored. A missing `base_path` yields an empty list.
    """
    # Escape base_path so characters such as '[' or '*' in the directory name
    # are matched literally; only the trailing 'results_*' is a wildcard.
    pattern = os.path.join(glob.escape(base_path), "results_*")
    # glob returns files and directories in arbitrary order; keep directories
    # only and sort so callers process them deterministically.
    return sorted(p for p in glob.glob(pattern) if os.path.isdir(p))


def zip_folder(folder_path: str, output_base_path: str) -> bool:
    """
    Zip the contents of `folder_path` into a file named
    `<folder_name>.zip` inside `output_base_path`.

    Entries are stored relative to the folder's parent, so every archive
    member is prefixed with `<folder_name>/`. Only files are written;
    empty sub-directories are not recorded.

    Args:
        folder_path: Directory to archive. A trailing path separator is
            tolerated.
        output_base_path: Existing directory that receives the ZIP file.

    Returns True on success, False otherwise (including when `folder_path`
    is not an existing directory). Errors are printed, never raised.
    """
    # Strip a trailing separator: otherwise basename() returns "" (producing
    # ".zip") and dirname() returns the folder itself (dropping the prefix).
    folder_path = os.path.normpath(folder_path)
    folder_name = os.path.basename(folder_path)
    zip_path = os.path.join(output_base_path, f"{folder_name}.zip")

    # os.walk() silently yields nothing for a missing path, which would
    # otherwise create an empty archive and report success.
    if not os.path.isdir(folder_path):
        print(f"   ❌ Failed to zip {folder_name}: not a directory")
        return False

    parent_dir = os.path.dirname(folder_path)
    zip_abs_path = os.path.abspath(zip_path)

    try:
        print(f"   -> Zipping folder: {folder_name}...")
        with zipfile.ZipFile(
            zip_path, mode="w", compression=zipfile.ZIP_DEFLATED
        ) as zipf:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    full_path = os.path.join(root, file)
                    # Never add the archive to itself (possible when the
                    # output directory lies inside the folder being zipped).
                    if os.path.abspath(full_path) == zip_abs_path:
                        continue
                    # Preserve relative path inside the zip
                    arcname = os.path.relpath(full_path, parent_dir)
                    zipf.write(full_path, arcname)
        print(f"   ✅ Created ZIP: {os.path.basename(zip_path)}")
        return True
    except Exception as exc:
        print(f"   ❌ Failed to zip {folder_name}: {exc}")
        return False


def zip_stats_results_folders(output_base_path: str) -> None:
    """
    Archive every 'results_*' directory found in `output_base_path`.

    The directory is created first if it is missing. Each matching folder is
    zipped into the same directory, and a summary of how many archives were
    produced is printed at the end. Nothing is returned.
    """
    # Make sure there is a directory to search (and to write the ZIPs into)
    os.makedirs(output_base_path, exist_ok=True)

    folders = find_result_folders(output_base_path)

    # Nothing to do: report and stop early
    if not folders:
        print(f"⚠️ No folders starting with 'results_' found in '{output_base_path}'.")
        return

    total = len(folders)
    print(f"🔍 Found {total} result folder(s) to zip.")

    # zip_folder returns True per archive written; count the successes
    zipped = sum(1 for path in folders if zip_folder(path, output_base_path))

    print(
        f"\n✅ DONE! Successfully zipped {zipped} out of {total} folder(s)."
    )


# Driver: zip every results folder found under OUTPUT_PATH. Any failure is
# printed rather than raised.
if __name__ == "__main__":
    try:
        # Prefer an environment variable; fall back to the OUTPUT_PATH global
        # if one is defined in this session
        output_root = os.getenv("OUTPUT_PATH") or globals().get("OUTPUT_PATH")
        if not output_root:
            raise ValueError("OUTPUT_PATH not defined")

        # Joining with "" only appends a trailing separator, so the search
        # runs directly in OUTPUT_PATH itself; no sub-folder is added.
        target_path = os.path.join(output_root, "")
        zip_stats_results_folders(target_path)

    except Exception as e:
        print(f"❌ An error occurred: {e}")

🔍 Found 2 result folder(s) to zip.
   -> Zipping folder: results_20251224_100041_UT_12...
   ✅ Created ZIP: results_20251224_100041_UT_12.zip
   -> Zipping folder: results_20251224_112545_UT_12...
   ✅ Created ZIP: results_20251224_112545_UT_12.zip

✅ DONE! Successfully zipped 2 out of 2 folder(s).


In [14]:
# 3. Save Results
df_acc = pd.DataFrame(acc_results)
df_ppl = pd.DataFrame(ppl_results)
df_hol = pd.DataFrame(hol_results)
# 4. Visualization & Reporting
if not df_acc.empty:
    print("\n" + "=" * 50 + "\n📊 VISUALIZATION\n" + "=" * 50)

    # Summary Tables
    # NOTE: The variable 'results_acc' is used here, assuming it holds the raw data
    # (list of dicts) required by 'analyze_experiment_results'.
    summary = analyze_experiment_results(acc_results)
    print("\n--- Summary Statistics ---")
    print(summary)

    # Plotting
    try:
        fig, axes = plt.subplots(1, 3, figsize=(18, 5))

        # Plot 1: Accuracy
        acc_summary = (
            df_acc.groupby(["task_type", "ut_steps"])["is_correct"].mean().reset_index()
        )
        sns.barplot(
            data=acc_summary, x="ut_steps", y="is_correct", hue="task_type", ax=axes[0]
        )
        axes[0].set_title("Accuracy by UT Steps")
        axes[0].set_ylabel("Accuracy")
        axes[0].yaxis.set_major_formatter(
            plt.FuncFormatter(lambda y, _: "{:.0%}".format(y))
        )

        # Plot 2: Time
        time_summary = (
            df_acc.groupby(["task_type", "ut_steps"])["generation_time"]
            .mean()
            .reset_index()
        )
        sns.barplot(
            data=time_summary,
            x="ut_steps",
            y="generation_time",
            hue="task_type",
            ax=axes[1],
        )
        axes[1].set_title("Inference Time (s) by UT Steps")

        # Plot 3: Token Count
        sns.boxplot(
            data=df_acc, x="ut_steps", y="generated_tokens", hue="task_type", ax=axes[2]
        )
        axes[2].set_title("Generated Tokens Distribution")

        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"⚠️ Visualization error: {e}")
else:
    print("⚠️ No results to visualize.")

print("\n🏁 Experiment Complete.\n")


📊 VISUALIZATION

📊 COMPREHENSIVE METRICS ANALYSIS



TypeError: expected str, bytes or os.PathLike object, not list

In [None]:
# Plain-text dump of the run: first 20 accuracy rows, every raw response,
# then the first 20 perplexity rows. Each heading is followed by a blank line.
print("Final Inspection:", end="\n\n")
print("Top 20 Accuracy Report:", end="\n\n")
print(df_acc.head(20))
print("Full Response:", end="\n\n")
print(df_acc["full_response"])
print("Perplexity Report:", end="\n\n")
print(df_ppl.head(20))

In [None]:
# Show each response next to the value in its generated_tokens column.
print(df_acc[["full_response", "generated_tokens"]])