# Setup libraries

In [None]:
# Upgrade pip itself before installing anything else.
!pip install --upgrade pip
# Remove any existing transformers / tokenizers / accelerate builds
# (presumably so the pinned transformers version below installs cleanly — confirm).
!pip uninstall -y transformers tokenizers accelerate -q
# Pin transformers and protobuf to exact versions.
!pip install "transformers==4.56.0" "protobuf==5.29.3" -q
# Unpinned: whatever torch / datasets versions pip resolves at run time.
!pip install torch datasets -q
!pip install pandas matplotlib seaborn tqdm wandb pyyaml
# accelerate was uninstalled above and is only reinstalled here (unpinned),
# so pip may report dependency conflicts in the steps in between.
!pip install bitsandbytes accelerate
# !pip install -r requirements.txt
# Runs last and force-reinstalls NumPy below 2.0, overriding whatever the
# earlier steps pulled in.
# NOTE(review): presumably one of the packages above needs NumPy 1.x — confirm which.
!pip install --force-reinstall --no-cache-dir "numpy<2.0"

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
peft 0.18.0 requires accelerate>=0.21.0, which is not installed.[0m[31m
Collecting accelerate
  Using cached accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Using cached accelerate-1.12.0-py3-none-any.whl (380 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.12.0
Collecting numpy<2.0
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m18.0/18.0 MB[0m [31m129.0 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installati

In [None]:
# Suppress warnings for clean output
import warnings
import os
warnings.filterwarnings("ignore", category=UserWarning)
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
print("‚úÖ Packages installed successfully!")

✅ Packages installed successfully!


In [None]:
"Built-in libraries"
import re
import sys
import gc
import time
import json
import hashlib
import glob
import zipfile
from io import StringIO
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
import yaml
import logging
import random

"Deep learning and NLP libraries"
import torch
import torch.nn.functional as F
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GenerationConfig,
    logging as hf_logging
)

"Data processing libraries"
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import wandb
from tqdm.auto import tqdm
from IPython import get_ipython

# Configure logging
# Raise the threshold of the logger named "ContinuousBatchingLogger" to ERROR
# so its lower-severity records are discarded.
logging.getLogger("ContinuousBatchingLogger").setLevel(logging.ERROR)
# Restrict the transformers library's own logging to errors as well.
hf_logging.set_verbosity_error()


# Report interpreter / framework versions and GPU visibility for this session.
print(f"Python Version: {sys.version}")
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Only printed when PyTorch can see a CUDA device.
    print(f"CUDA Version: {torch.version.cuda}")
# Shell escape; not indented under the `if`, so it runs even when no CUDA
# device was detected.
!nvidia-smi

Python Version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
PyTorch Version: 2.9.0+cu126
CUDA Available: True
CUDA Version: 12.6
Fri Dec 19 16:21:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   59C    P8             10W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+---

In [None]:
import os
import wandb

# Log in to Weights & Biases using the API key stored in the hosting
# platform's secret manager. On any other platform nothing happens here.

# 1. Check if running in Google Colab
try:
    import google.colab  # noqa: F401 -- imported only to probe for Colab
    IN_COLAB = True
except ImportError:
    # Only a missing module means "not Colab". A bare `except:` would also
    # swallow KeyboardInterrupt / SystemExit and hide unrelated failures.
    IN_COLAB = False

if IN_COLAB:
    from google.colab import userdata
    try:
        # Ensure 'WANDB_API_KEY' is the exact name in your Colab Secrets (the key icon)
        wandb_key = userdata.get('WANDB_API_KEY')
        wandb.login(key=wandb_key)
    except Exception as e:
        # Best effort: report the failure and continue without logging in.
        print(f"Could not retrieve W&B API key from Colab Secrets: {e}")

elif os.path.exists('/kaggle/input'):
    # 2. Check if running in Kaggle (detected by the presence of /kaggle/input)
    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        wandb_key = user_secrets.get_secret("WANDB_API_KEY")
        wandb.login(key=wandb_key)
    except Exception as e:
        # Best effort: report the failure and continue without logging in.
        print(f"Could not retrieve W&B API key from Kaggle Secrets: {e}")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdungngocpham171[0m ([33mdungngocpham171-university-of-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Configure input/output paths and clone the latest repo

In [None]:
# Clone the latest github repo version
import os
def configure_environment_paths():
    """Detect the runtime environment and return its data/output locations.

    Detection order:
      1. Google Colab  - ``"google.colab"`` appears in ``str(get_ipython())``.
      2. Kaggle        - the ``KAGGLE_KERNEL_RUN_TYPE`` environment variable is set.
      3. Local/unknown - anything else, including sessions in which the name
         ``get_ipython`` is not defined (``NameError``).

    The chosen output directory is created if it does not already exist, and
    the selected paths are printed.

    Returns:
        tuple[str, str, str]: ``(base_data_path, base_output_path, environment_name)``
        where ``environment_name`` is ``"colab"``, ``"kaggle"`` or ``"local"``.
    """
    # The local layout serves both "unknown environment" and "no IPython
    # shell", so it is defined once instead of being repeated per branch.
    local_data_path = "./data/"
    local_output_path = "./output/"

    try:
        if "google.colab" in str(get_ipython()):
            print("✅ Environment: Google Colab")
            base_data_path = "/content/"
            base_output_path = "/content/"
            environment_name = "colab"
        elif os.environ.get("KAGGLE_KERNEL_RUN_TYPE"):
            print("✅ Environment: Kaggle")
            base_data_path = "/kaggle/input/"
            base_output_path = "/kaggle/working/"
            environment_name = "kaggle"
        else:
            print("⚠️ Environment: Local/Unknown")
            base_data_path = local_data_path
            base_output_path = local_output_path
            environment_name = "local"
    except NameError:
        # `get_ipython` is not a defined name in this session.
        print("⚠️ Non-interactive session. Using local paths.")
        base_data_path = local_data_path
        base_output_path = local_output_path
        environment_name = "local"

    os.makedirs(base_output_path, exist_ok=True)
    # Status glyphs in this function were stored mis-encoded (UTF-8 bytes
    # read with the wrong codec); they are restored to the intended emoji.
    print(f"📂 Data Path: {base_data_path}")
    print(f"📦 Output Path: {base_output_path}")

    return base_data_path, base_output_path, environment_name

INPUT_PATH, OUTPUT_PATH, ENV_NAME = configure_environment_paths()
%cd /content/
torch.cuda.empty_cache()
!rm -r -f OuroTrace
!git clone --branch claude https://github.com/dzungphieuluuky/OuroTrace.git
%cd OuroTrace

✅ Environment: Google Colab
📂 Data Path: /content/
📦 Output Path: /content/
/content
Cloning into 'OuroTrace'...
remote: Enumerating objects: 1028, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 1028 (delta 21), reused 21 (delta 9), pack-reused 990 (from 2)[K
Receiving objects: 100% (1028/1028), 1.51 MiB | 22.14 MiB/s, done.
Resolving deltas: 100% (686/686), done.
/content/OuroTrace


In [None]:
import pandas as pd
from src.config_loader import load_config_from_json, post_process_config

# this is the fused version when single and batch use the same predict function
from src.new_runner import run_batch_experiment

# this is the original version when single and batch use different functions
# from src.runner import run_batch_experiment

from src.evaluation import analyze_experiment_results


# 1. Load Configuration from JSON
config = load_config_from_json('configs/batch_ouro_1.4b_thinking.json')

# 2. Post-process (Convert 'torch.float16' string to object, generate timestamps)
config = post_process_config(config)

# 3. Per-run overrides applied on top of the JSON configuration
config["INFERENCE_STEPS"] = [2]
# NOTE(review): the recorded run still lists 500 n_ary samples, so confirm this
# override has the intended effect.
config["DATA"]["n_ary"]["num_samples_per_level"] = 0

# 4. Execute
# Bind the result names up front. If the run is interrupted or raises, later
# cells then find empty results instead of failing with NameError.
acc_results, ppl_results, hol_results = [], [], []

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# The leading glyphs of these two messages were stored mis-encoded; restored.
print(f"🕒 Timestamp: {timestamp}")
print("🚀 Starting Experiment...")
try:
    acc_results, ppl_results, hol_results = run_batch_experiment(config)
except Exception as e:
    # Best effort: report the failure and keep the (empty) defaults bound above.
    print(f"An unexpected error occurred: {e}")

🕒 Timestamp: 20251219_162147
🚀 Starting Experiment...
üîó Initializing W&B (timeout: 30s)...
‚ö†Ô∏è W&B initialization failed: 1 validation error for Settings
start_timeout
  Extra inputs are not permitted [type=extra_forbidden, input_value=30, input_type=int]
    For further information visit https://errors.pydantic.dev/2.12/v/extra_forbidden. Continuing offline.

üîß EXPERIMENT CONFIGURATION
Model Path: ByteDance/Ouro-1.4B-Thinking
UT Steps to Test: [2]
Data Type: torch.bfloat16
4-bit Quantization: False
Torch Compile: False
Max Batch Size: 12
Max New Tokens: 512
Batching: True
Calculate Perplexity: True
Early Exit: 1.0

[+] Quality monitor initialized:
    ‚Üí Garbage threshold: 30%
    ‚Üí Example similarity threshold: 85%
    ‚Üí Min samples before check: 10
üé≤ Random seed set to 42

üì¶ LOADING TEST DATASETS
‚öôÔ∏è Generating new test datasets...
‚úÖ Generated test datasets

Dataset Summary:
   n_ary       :  500 samples
   p_hop       :  300 samples
   igsm        :  

Calculating PPL (UT=2):   0%|          | 0/8 [00:00<?, ?it/s]


‚úÖ Perplexity Results:
   Perplexity: 0.6314
   Avg Loss:   1.8802

üéØ ACCURACY EVALUATION


‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìù Task: N_ARY
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Total Samples: 500
Batch Size: 8
Strategy: Batched Processing

Running 63 batches...


   n_ary:   0%|          | 0/63 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
You are a calculator. Given an addition problem with several numbers (e.g., '{number_1} + {number_2} + {number_3} + ... ='), show your work step by step. For each number, add it to the running total and show the calculation. After all steps, output only the final sum on a new line as [FINAL] [sum].
Example:
Input: {number_i} + {number_i+1} + {number_i+2} + ... =
Output:
Step {i}: 0 + {number_i} = {sum_i}
Step {i+1}: {sum_i} + {number_i+1} = {sum_i+1}
Step {i+2}: {sum_i+1} + {number_i+2} = {sum_i+2}
...[FINAL] {final_sum}<|im_end|>
<|im_start|>user
131 + 235 + 019 + 524 + 578 + 427 + 107 + 015 =<|im_end|>
<|im_start|>assistant
Step 1:

   ‚úì Chat format verified for input.
DEBUG: Full prompt for 'n_ary':
<|im_start|>system
You are a calculator. Given an addition problem with several numbers (e.g., '{number_1} + {number_2} + {number_3} + ... ='), show your work step by step. For each number, add it to the running total and

   p_hop:   0%|          | 0/75 [00:00<?, ?it/s]

DEBUG: Full prompt for 'p_hop':
<|im_start|>system
You are a sequence tracer.trace the sequence step by step. At each hop, follow strictly and exactly the format below. Output each line as 'Hop {X}: At {token} ‚Üí Next is {token}'. After all hops, output the result as [FINAL] {token}.
Example:
Input: Sequence: {token_1} {token_2} {token_3} .... Start: {token_1}. Hop {N} times.
Output:
Hop {i}: At {token_i} ‚Üí Next is {token_{i+1}}
Hop {i+1}: At {token_{i+1}} ‚Üí Next is {token_{i+2}}
...Hop {N}: At {token_N} ‚Üí Next is {token_final}
[FINAL] {token_final}<|im_end|>
<|im_start|>user
Sequence: A B C B B D C C A C D C D B A A C B D C D A A D B C C C C C B D C B D A A C A B B B D D D C B C B B A D B A D D B B A C B C A A A A A B B B A C A D B B D B A D A D C C B B D B D C A A A B B A D B C B D B B D C D A B A B B A D B D D A D A A B C C B D D A D C A C B C A B D C B D A D D C D C C D A A C D C C D D A C D C A A B B C D B A B C D B C C D A A B C B D A A A A D A B C D D C B C A C A B B A A 

KeyboardInterrupt: 

In [None]:
import os
import glob
import zipfile

def zip_all_results_folders(output_base_path: str):
    """Zip every ``results_*`` directory found directly under ``output_base_path``.

    Each matching directory ``results_X`` is archived to
    ``<output_base_path>/results_X.zip``. Entries are stored relative to the
    directory's parent, so they unpack as ``results_X/...``. Only files are
    written; directories that contain no files produce no entries.

    ``output_base_path`` is created first if it does not exist. A failure on
    one directory is reported and does not stop the remaining ones; the
    half-written archive of the failed directory is removed.

    Args:
        output_base_path: Directory to scan for ``results_*`` sub-directories.

    Returns:
        None. Progress and errors are printed (messages are in Vietnamese).
    """
    os.makedirs(output_base_path, exist_ok=True)

    search_pattern = os.path.join(output_base_path, "results_*")
    # Keep directories only (earlier ``results_*.zip`` files match the pattern
    # too) and sort them, since glob does not promise any particular order.
    results_directories = sorted(
        d for d in glob.glob(search_pattern) if os.path.isdir(d)
    )

    # User-facing messages below were stored mis-encoded (UTF-8 bytes decoded
    # with the wrong codec); they are restored to the intended Vietnamese text.
    if not results_directories:
        print(f"⚠️ Không tìm thấy thư mục nào bắt đầu bằng 'results_' trong '{output_base_path}'.")
        return

    print(f"🔍 Tìm thấy {len(results_directories)} thư mục kết quả để nén.")

    successful_zips = 0

    for folder_path in results_directories:
        folder_name = os.path.basename(folder_path)
        zip_filename = os.path.join(output_base_path, f"{folder_name}.zip")
        # Set once this run has opened (and therefore truncated) the archive.
        archive_opened = False

        try:
            print(f"\n   -> Đang nén thư mục: {folder_name}...")

            with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
                archive_opened = True
                for root, _, files in os.walk(folder_path):
                    for file in files:
                        file_path = os.path.join(root, file)
                        # Relative to the parent so the archive keeps the
                        # top-level ``results_X/`` folder name.
                        arcname = os.path.relpath(file_path, os.path.dirname(folder_path))
                        zipf.write(file_path, arcname)

            print(f"   ✅ Đã tạo file ZIP: {os.path.basename(zip_filename)}")
            successful_zips += 1

        except Exception as e:
            print(f"   ❌ Lỗi khi nén thư mục {folder_name}: {e}")
            # Do not leave a truncated archive behind that looks like a valid
            # result. Only remove it if this run actually started writing it.
            if archive_opened and os.path.exists(zip_filename):
                try:
                    os.remove(zip_filename)
                except OSError:
                    # Cleanup is best effort; the failure was already reported.
                    pass

    print(f"\n✅ HOÀN TẤT! Đã nén thành công {successful_zips} trên {len(results_directories)} thư mục kết quả.")


# Archive the results folders inside the cloned repository.
# The globals() check already covers a missing OUTPUT_PATH, so there is no
# separate NameError handler: a NameError from anywhere else must not be
# reported as "OUTPUT_PATH not defined.".
if 'OUTPUT_PATH' in globals():
    try:
        zip_all_results_folders(os.path.join(OUTPUT_PATH, "OuroTrace"))
    except Exception as e:
        # Message text was stored mis-encoded; restored to the intended Vietnamese.
        print(f"Đã xảy ra lỗi trong quá trình nén: {e}")
else:
    print("OUTPUT_PATH not defined.")

🔍 Tìm thấy 1 thư mục kết quả để nén.

   -> Đang nén thư mục: results_20251219_162147...
   ✅ Đã tạo file ZIP: results_20251219_162147.zip

✅ HOÀN TẤT! Đã nén thành công 1 trên 1 thư mục kết quả.


In [None]:
# 3. Save Results
# Look the experiment outputs up by name. If the experiment cell was
# interrupted or never run, these names do not exist and a direct reference
# would raise NameError.
missing_outputs = [
    name for name in ("acc_results", "ppl_results", "hol_results")
    if name not in globals()
]
if missing_outputs:
    print(f"⚠️ Experiment outputs not found: {', '.join(missing_outputs)}. "
          "Run the experiment cell to completion first.")

# pd.DataFrame(None) is an empty frame, so missing outputs fall through to the
# "No results to visualize" branch below.
df_acc = pd.DataFrame(globals().get("acc_results"))
df_ppl = pd.DataFrame(globals().get("ppl_results"))
df_hol = pd.DataFrame(globals().get("hol_results"))
# 4. Visualization & Reporting
# Status glyphs in this cell were stored mis-encoded; restored to the intended emoji.
if not df_acc.empty:
    print("\n" + "="*50 + "\n📊 VISUALIZATION\n" + "="*50)

    # Summary Tables
    # `analyze_experiment_results` is given the raw `acc_results` object, not
    # the DataFrame built from it. A non-empty df_acc implies the name exists.
    summary = analyze_experiment_results(acc_results)
    print("\n--- Summary Statistics ---")
    print(summary)

    # Plotting
    try:
        fig, axes = plt.subplots(1, 3, figsize=(18, 5))

        # Plot 1: Accuracy (mean of is_correct per task type and UT step count)
        acc_summary = df_acc.groupby(['task_type', 'ut_steps'])['is_correct'].mean().reset_index()
        sns.barplot(data=acc_summary, x='ut_steps', y='is_correct', hue='task_type', ax=axes[0])
        axes[0].set_title('Accuracy by UT Steps')
        axes[0].set_ylabel('Accuracy')
        # Show the 0-1 accuracy fraction as a whole-number percentage.
        axes[0].yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

        # Plot 2: Time (mean generation_time per task type and UT step count)
        time_summary = df_acc.groupby(['task_type', 'ut_steps'])['generation_time'].mean().reset_index()
        sns.barplot(data=time_summary, x='ut_steps', y='generation_time', hue='task_type', ax=axes[1])
        axes[1].set_title('Inference Time (s) by UT Steps')

        # Plot 3: Token Count (distribution of generated_tokens)
        sns.boxplot(data=df_acc, x='ut_steps', y='generated_tokens', hue='task_type', ax=axes[2])
        axes[2].set_title('Generated Tokens Distribution')

        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Plotting is best effort: a failure (e.g. a missing column) is
        # reported and must not lose the summary printed above.
        print(f"⚠️ Visualization error: {e}")
else:
    print("⚠️ No results to visualize.")

print("\n🏁 Experiment Complete.\n")

NameError: name 'acc_results' is not defined

In [None]:
# Final inspection of the collected accuracy and perplexity results.
print("Final Inspection:\n")
print("Top 20 Accuracy Report:\n")
print(df_acc.head(20))
print("Full Response:\n")
# An empty accuracy frame has no columns. Guard the lookup so a run without
# results does not raise KeyError before the perplexity report is shown.
if "full_response" in df_acc.columns:
    print(df_acc['full_response'])
else:
    print("(no 'full_response' column in df_acc)")
print("Perplexity Report:\n")
print(df_ppl.head(20))

In [None]:
# Show each full model response next to its generated-token count.
response_columns = ['full_response', 'generated_tokens']
print(df_acc[response_columns])