# Setup libraries

In [1]:
!uv pip install --upgrade pip
!uv pip uninstall transformers tokenizers accelerate -q

!uv pip install "transformers==4.56.0" "protobuf==5.29.3" -q
!uv pip install torch datasets -q
!uv pip install pandas matplotlib seaborn tqdm wandb pyyaml
!uv pip install bitsandbytes accelerate optimum lm_eval
# !uv pip install -r requirements.txt
!uv pip install --force-reinstall --no-cache-dir "numpy<2.0"

[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m1 package[0m [2min 116ms[0m[0m
[2K[2mPrepared [1m1 package[0m [2min 272ms[0m[0m
[2mUninstalled [1m1 package[0m [2min 224ms[0m[0m
[2K[2mInstalled [1m1 package[0m [2min 32ms[0m[0m
 [31m-[39m [1mpip[0m[2m==24.1.2[0m
 [32m+[39m [1mpip[0m[2m==25.3[0m
[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m6 packages[0m [2min 325ms[0m[0m
[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m95 packages[0m [2min 2.71s[0m[0m
[2K[2mPrepared [1m21 packages[0m [2min 5.60s[0m[0m
[2K[2mInstalled [1m21 packages[0m [2min 385ms[0m[0m
 [32m+[39m [1maccelerate[0m[2m==1.12.0[0m
 [32m+[39m [1mbitsandbytes[0m[2m==0.49.0[0m
 [32m+[39m [1mcolorama[0m[2m==0.4.6[0m
 [32m+[39m [1mdataproperty[0m[2m==1.1.0[0m
 [32m+[39m [1mevaluate[0m[2m==0.4.6[0m
 [32m+[39m [1mjsonlines[0m[2m==4.0.0[0m
 [32m+[39m [1mlm-eval[0m[2m==0.4

# Suppress warnings

In [1]:
# Suppress warnings for clean output
import warnings
import os

warnings.filterwarnings("ignore", category=UserWarning)
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
print("‚úÖ Packages installed successfully!")

‚úÖ Packages installed successfully!


# Install Libraries

In [2]:
"Built-in libraries"
import re
import sys
import gc
import time
import json
import hashlib
import glob
import zipfile
from io import StringIO
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
import yaml
import logging
import random

"Deep learning and NLP libraries"
import torch
import torch.nn.functional as F
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GenerationConfig,
    logging as hf_logging,
)

"Data processing libraries"
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import wandb
from tqdm.auto import tqdm
from IPython import get_ipython

# Configure logging
logging.getLogger("ContinuousBatchingLogger").setLevel(logging.ERROR)
hf_logging.set_verbosity_error()


print(f"Python Version: {sys.version}")
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Version: {torch.version.cuda}")
!nvidia-smi

Python Version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
PyTorch Version: 2.9.0+cu126
CUDA Available: True
CUDA Version: 12.6
Thu Dec 25 13:26:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   44C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+---

In [3]:
import os


def configure_environment_paths():
    """Detect environment and configure paths"""
    try:
        if "google.colab" in str(get_ipython()):
            print("‚úÖ Environment: Google Colab")
            base_data_path = "/content/"
            base_output_path = "/content/"
            environment_name = "colab"
        elif os.environ.get("KAGGLE_KERNEL_RUN_TYPE"):
            print("‚úÖ Environment: Kaggle")
            base_data_path = "/kaggle/input/"
            base_output_path = "/kaggle/working/"
            environment_name = "kaggle"
        else:
            print("‚ö†Ô∏è Environment: Local/Unknown")
            base_data_path = "./data/"
            base_output_path = "./output/"
            environment_name = "local"
    except NameError:
        print("‚ö†Ô∏è Non-interactive session. Using local paths.")
        base_data_path = "./data/"
        base_output_path = "./output/"
        environment_name = "local"

    os.makedirs(base_output_path, exist_ok=True)
    print(f"üìÇ Data Path: {base_data_path}")
    print(f"üì¶ Output Path: {base_output_path}")

    return base_data_path, base_output_path, environment_name


INPUT_PATH, OUTPUT_PATH, ENV_NAME = configure_environment_paths()

‚úÖ Environment: Google Colab
üìÇ Data Path: /content/
üì¶ Output Path: /content/


# Setup WANDB

In [4]:
import os
import wandb

if "colab" in ENV_NAME:
    from google.colab import userdata

    try:
        # Ensure 'WANDB_API_KEY' is the exact name in your Colab Secrets (the key icon)
        wandb_key = userdata.get("WANDB_API_KEY")
        wandb.login(key=wandb_key)
    except Exception as e:
        print(f"Could not retrieve W&B API key from Colab Secrets: {e}")

# 2. Check if running in Kaggle
elif "kaggle" in ENV_NAME:
    try:
        from kaggle_secrets import UserSecretsClient

        user_secrets = UserSecretsClient()
        wandb_key = user_secrets.get_secret("WANDB_API_KEY")
        wandb.login(key=wandb_key)
    except Exception as e:
        print(f"Could not retrieve W&B API key from Kaggle Secrets: {e}")

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdungngocpham171[0m ([33mdungngocpham171-university-of-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Config input/output path and clone latest repo

In [5]:
# Clone the latest github repo version
%cd {OUTPUT_PATH}
torch.cuda.empty_cache()
!rm -rf OuroTrace

/content


In [6]:
!git clone --branch claude https://github.com/dzungphieuluuky/OuroTrace.git
%cd OuroTrace

Cloning into 'OuroTrace'...
remote: Enumerating objects: 2131, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 2131 (delta 6), reused 9 (delta 3), pack-reused 2118 (from 2)[K
Receiving objects: 100% (2131/2131), 3.66 MiB | 8.63 MiB/s, done.
Resolving deltas: 100% (1363/1363), done.
/content/OuroTrace


# Run Benchmark

In [7]:
from src.config_loader import load_config_from_json, post_process_config
from src.new_runner import run_batch_experiment
from src.evaluation_metrics import analyze_experiment_results

def set_all_seeds(seed):
    random.seed(seed)                          # Python random
    os.environ['PYTHONHASHSEED'] = str(seed)  # Python hash seed
    np.random.seed(seed)                      # NumPy
    torch.manual_seed(seed)                   # PyTorch CPU & GPU

    # Additional GPU-specific settings
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)      # For multi-GPU

set_all_seeds(1415)
# 1. Load Configuration from JSON
config = load_config_from_json("configs/ouro_1.4b_thinking.json")

# 2. Post-process (Convert 'torch.float16' string to object, generate timestamps)
config = post_process_config(config)

config["INFERENCE_STEPS"] = [3]
config["OPTIMIZATION"]["enable_batch"] = False
# config["EVAL_SETTINGS"]["calculate_perplexity"] = False
# config["DATA"]["n_ary"]["num_samples_per_level"] = 0
# config["DATA"]["p_hop"]["num_samples_per_level"] = 0
# config["DATA"]["igsm"]["num_samples"] = 0
# config["DATA"]["reasoning_primitives"]["num_samples"] = 0
# config["ENABLE_HEAVY_BENCHMARKS"] = True
# 4. Execute
print("üöÄ Starting Experiment...")
try:
    del model, tokenizer
    torch.cuda.empty_cache()
    gc.collect()
except:
    pass

try:
    simple_reasoning_results, ppl_results, primitives_results, benchmark_results = run_batch_experiment(config)
except Exception as e:
    print(f"An unexpected error occurred: {e}")

üöÄ Starting Experiment...
üîó Initializing W&B (timeout: 30s)...


‚úÖ W&B initialized

üîß EXPERIMENT CONFIGURATION
Model Path: ByteDance/Ouro-1.4B-Thinking
UT Steps to Test: [3]
Data Type: torch.bfloat16
4-bit Quantization: False
Torch Compile: False
Max Batch Size: 8
Max New Tokens: 16
Batching: False
Calculate Perplexity: True
Early Exit: 1.0

[+] Quality monitor initialized:
    ‚Üí Garbage threshold: 30%
    ‚Üí Example similarity threshold: 85%
    ‚Üí Min samples before check: 10
üé≤ Random seed set to 42

üì¶ LOADING TEST DATASETS
‚öôÔ∏è Generating new test datasets...
‚úÖ Generated test datasets

Dataset Summary:
   n_ary       :  500 samples
   p_hop       :  300 samples
   igsm        :  100 samples


üìã PAPER COMPLIANCE CHECK
Task Alignment: {'has_n_ary': True, 'has_p_hop': True, 'has_igsm': True, 'all_paper_tasks': True}
UT Steps Coverage: {'min_ut': 3, 'max_ut': 3, 'covers_baseline': False, 'covers_paper_range': False, 'recommended_range': [1, 2, 4, 8]}

üìö Preparing perplexity evaluation data...
‚úÖ Prepared 50 samples for PPL



config.json: 0.00B [00:00, ?B/s]

configuration_ouro.py: 0.00B [00:00, ?B/s]


‚Üí Base config loaded
   Original UT steps: 4
   Original early exit: 1.0

‚Üí Modified config:
   New UT steps: 3
   Early exit threshold: 1.0 (from default)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/965 [00:00<?, ?B/s]


‚Üí Tokenizer loaded
   Vocab size: 49152
   PAD token: <|im_end|>
   EOS token: <|im_end|>

‚Üí Loading model weights...


modeling_ouro.py: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.87G [00:00<?, ?B/s]


‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üöÄ APPLYING SAFE OPTIMIZATIONS
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
   ‚úì Flash Attention / SDPA enabled
   ‚úì TF32 enabled for matmul
   ‚úì cuDNN auto-tuning enabled
   ‚úì Memory pool optimized
   ‚Üí Running 3 warmup passes...
   ‚úì Warmup complete
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

‚úÖ MODEL LOADED SUCCESSFULLY
Device: cuda:0
Model dtype: torch.bfloat16
VERIFIED UT steps: 3
VERIFIED early exit: 1.0

üîß Building task templates...
[+] Task templates with pre-tokenized components computed.
    System prompt N_ary

Calculating PPL (UT=3):   0%|          | 0/8 [00:00<?, ?it/s]


‚úÖ Perplexity Results:
   Perplexity: 0.2411
   Avg Loss:   1.2726

üéØ ACCURACY EVALUATION


‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìù Task: N_ARY
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Total Samples: 500
Batch Size: 1 (Sequential)
Strategy: Sequential Processing

Batch size < 1 or not enough items, processing sequentially.
Processing 500 items sequentially...


   n_ary:   0%|          | 0/500 [00:00<?, ?it/s]

    test_input        full_response  generated_tokens
0  625 + 449 =  [FINAL]  1074 [END]                 8
    test_input        full_response  generated_tokens
0  976 + 756 =  [FINAL]  1732 [END]                 8
    test_input        full_response  generated_tokens
0  722 + 324 =  [FINAL]  1046 [END]                 8
    test_input        full_response  generated_tokens
0  490 + 643 =  [FINAL]  1133 [END]                 8
    test_input       full_response  generated_tokens
0  118 + 710 =  [FINAL]  828 [END]                 7
    test_input       full_response  generated_tokens
0  075 + 884 =  [FINAL]  959 [END]                 7
    test_input       full_response  generated_tokens
0  133 + 213 =  [FINAL]  346 [END]                 7
    test_input       full_response  generated_tokens
0  092 + 549 =  [FINAL]  641 [END]                 7
    test_input        full_response  generated_tokens
0  825 + 702 =  [FINAL]  1527 [END]                 8
    test_input        full_response 

   p_hop:   0%|          | 0/300 [00:00<?, ?it/s]

                                          test_input     full_response  \
0  Sequence: B D C A B D B C D A C A B A B A C D ...  [FINAL]  C [END]   

   generated_tokens  
0                 4  
                                          test_input     full_response  \
0  Sequence: A C A B A D B C A D A B D B A D A A ...  [FINAL]  A [END]   

   generated_tokens  
0                 4  
                                          test_input     full_response  \
0  Sequence: C C D D A D C C D A A B C A A A B C ...  [FINAL]  A [END]   

   generated_tokens  
0                 4  
                                          test_input     full_response  \
0  Sequence: C A B A C C A A D C C B D D B D A D ...  [FINAL]  B [END]   

   generated_tokens  
0                 4  
                                          test_input     full_response  \
0  Sequence: B A B D C D D D D C C D D C B A C D ...  [FINAL]  A [END]   

   generated_tokens  
0                 4  
                                   

   igsm:   0%|          | 0/100 [00:00<?, ?it/s]

                                          test_input     full_response  \
0  Question. J#F := I#L. F#D := E#K - E#K. H#K :=...  [FINAL]  2 [END]   

   generated_tokens  
0                 5  
                                          test_input     full_response  \
0  Question. G#G := E#C. C#F := J#K. J#K := O#L. ...  [FINAL]  2 [END]   

   generated_tokens  
0                 5  
                                          test_input     full_response  \
0  Question. P#N := A#O. N#H := H#C * L#A. O#O :=...  [FINAL]  2 [END]   

   generated_tokens  
0                 5  
                                          test_input     full_response  \
0  Question. G#F := J#O. I#P := G#A. E#L := H#P. ...  [FINAL]  2 [END]   

   generated_tokens  
0                 5  
‚ùå Aborting due to repeated outputs...
‚ö†Ô∏è Item failed: Experiment failed: 5 repeated outputs
                                          test_input  \
0  Question. C#M := 1. K#P := 2. K#N := D#F * P#H...   

                 

  var_assign_depth_0_code:   0%|          | 0/100 [00:00<?, ?it/s]

    ‚úÖ Accuracy: 97.00% (97/100)

üìã Task: var_assign_depth_0_math (100 samples)


  var_assign_depth_0_math:   0%|          | 0/100 [00:00<?, ?it/s]

    ‚úÖ Accuracy: 100.00% (100/100)

üìã Task: var_assign_depth_0_equation (100 samples)


  var_assign_depth_0_equation:   0%|          | 0/100 [00:00<?, ?it/s]

    ‚úÖ Accuracy: 100.00% (100/100)

üìã Task: var_assign_depth_1_code (100 samples)


  var_assign_depth_1_code:   0%|          | 0/100 [00:00<?, ?it/s]

    ‚úÖ Accuracy: 89.00% (89/100)

üìã Task: var_assign_depth_1_math (100 samples)


  var_assign_depth_1_math:   0%|          | 0/100 [00:00<?, ?it/s]

    ‚úÖ Accuracy: 93.00% (93/100)

üìã Task: var_assign_depth_1_equation (100 samples)


  var_assign_depth_1_equation:   0%|          | 0/100 [00:00<?, ?it/s]

    ‚úÖ Accuracy: 88.00% (88/100)
‚úÖ Reasoning primitives evaluation completed

‚úÖ Periodic save: simple reasoning results to ../results_20251225_132657_UT_3/simple_reasoning.csv
‚úÖ Periodic save: perplexity results to ../results_20251225_132657_UT_3/perplexity.csv
‚úÖ Periodic save: reasoning primitives results to ../results_20251225_132657_UT_3/reasoning_primitives.csv
üßπ Cleaning up GPU memory...
‚úÖ GPU memory freed


üìä FINAL EXPERIMENT SUMMARY

üìà Overall Accuracy by Task Type:
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
          Accuracy    N
task_type              
igsm        12.00%  100
n_ary       29.40%  500
p_hop       28.00%  300

üìà Accuracy by UT Steps:
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î

0,1
igsm/accuracy,‚ñÅ
igsm/avg_generation_time,‚ñÅ
igsm/avg_tokens,‚ñÅ
igsm/num_degenerate,‚ñÅ
igsm/num_samples,‚ñÅ
igsm/throughput,‚ñÅ
n_ary/accuracy,‚ñÅ
n_ary/avg_generation_time,‚ñÅ
n_ary/avg_tokens,‚ñÅ
n_ary/num_degenerate,‚ñÅ

0,1
igsm/accuracy,0.12
igsm/avg_generation_time,6.75733
igsm/avg_tokens,4.65
igsm/num_degenerate,0
igsm/num_samples,100
igsm/throughput,0.13746
n_ary/accuracy,0.294
n_ary/avg_generation_time,5.92126
n_ary/avg_tokens,8.12
n_ary/num_degenerate,0


‚úÖ W&B session closed

‚úÖ Periodic save: simple reasoning results to ../results_20251225_132657_UT_3/simple_reasoning.csv
‚úÖ Periodic save: perplexity results to ../results_20251225_132657_UT_3/perplexity.csv
‚úÖ Periodic save: reasoning primitives results to ../results_20251225_132657_UT_3/reasoning_primitives.csv
‚úÖ Configuration saved to ../results_20251225_132657_UT_3/config.json
‚úÖ Task templates saved to ../results_20251225_132657_UT_3/task_templates.json
An unexpected error occurred: too many values to unpack (expected 3)


In [8]:
import os
import glob
import zipfile
from typing import List
def find_result_folders(base_path: str) -> List[str]:
    """
    Return a list of absolute paths to all directories under `base_path`
    whose names start with 'results_'.
    """
    pattern = os.path.join(base_path, "results_*")
    # glob returns both files and directories; filter to directories only
    return [p for p in glob.glob(pattern) if os.path.isdir(p)]
def zip_folder(folder_path: str, output_base_path: str) -> bool:
    """
    Zip the contents of `folder_path` into a file named
    <folder_name>.zip` inside `output_base_path`.

    Returns True on success, False otherwise.
    """
    folder_name = os.path.basename(folder_path)
    zip_path = os.path.join(output_base_path, f"{folder_name}.zip")
    try:
        print(f"   -> Zipping folder: {folder_name}...")
        with zipfile.ZipFile(
            zip_path, mode="w", compression=zipfile.ZIP_DEFLATED
        ) as zipf:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    full_path = os.path.join(root, file)
                    # Preserve relative path inside the zip
                    arcname = os.path.relpath(full_path, os.path.dirname(folder_path))
                    zipf.write(full_path, arcname)
        print(f"   ‚úÖ Created ZIP: {os.path.basename(zip_path)}")
        return True
    except Exception as exc:
        print(f"   ‚ùå Failed to zip {folder_name}: {exc}")
        return False
def zip_stats_results_folders(output_base_path: str) -> None:
    """
    Main driver: locate all result folders and zip each one.
    """
    # Ensure the output directory exists
    os.makedirs(output_base_path, exist_ok=True)
    result_folders = find_result_folders(output_base_path)
    if not result_folders:
        print(f"‚ö†Ô∏è No folders starting with 'results_' found in '{output_base_path}'.")
        return
    print(f"üîç Found {len(result_folders)} result folder(s) to zip.")
    successful = 0
    for folder in result_folders:
        if zip_folder(folder, output_base_path):
            successful += 1
    print(
        f"\n‚úÖ DONE! Successfully zipped {successful} out of {len(result_folders)} folder(s)."
    )
if __name__ == "__main__":
    try:
        # Prefer an environment variable; fall back to a global if defined
        output_root = os.getenv("OUTPUT_PATH") or globals().get("OUTPUT_PATH")
        if not output_root:
            raise ValueError("OUTPUT_PATH not defined")
        # The script expects a sub‚Äëfolder named 'OuroTrace' under OUTPUT_PATH
        target_path = os.path.join(output_root, "")
        zip_stats_results_folders(target_path)
    except Exception as e:
        print(f"‚ùå An error occurred: {e}")

üîç Found 1 result folder(s) to zip.
   -> Zipping folder: results_20251225_132657_UT_3...
   ‚úÖ Created ZIP: results_20251225_132657_UT_3.zip

‚úÖ DONE! Successfully zipped 1 out of 1 folder(s).


In [9]:
# 3. Save Results
df_simple = pd.DataFrame(simple_reasoning_results)
df_ppl = pd.DataFrame(ppl_results)
df_primitives = pd.DataFrame(primitives_results)
df_benchmark = pd.DataFrame(benchmark_results)
# 4. Visualization & Reporting
if not df_simple.empty:
    print("\n" + "=" * 50 + "\nüìä VISUALIZATION\n" + "=" * 50)

    # Summary Tables
    # NOTE: The variable 'results_acc' is used here, assuming it holds the raw data
    # (list of dicts) required by 'analyze_experiment_results'.
    summary = analyze_experiment_results(simple_reasoning_results)
    print("\n--- Summary Statistics ---")
    print(summary)

    # Plotting
    try:
        fig, axes = plt.subplots(1, 3, figsize=(18, 5))

        # Plot 1: Accuracy
        acc_summary = (
            df_simple.groupby(["task_type", "ut_steps"])["is_correct"].mean().reset_index()
        )
        sns.barplot(
            data=acc_summary, x="ut_steps", y="is_correct", hue="task_type", ax=axes[0]
        )
        axes[0].set_title("Accuracy by UT Steps")
        axes[0].set_ylabel("Accuracy")
        axes[0].yaxis.set_major_formatter(
            plt.FuncFormatter(lambda y, _: "{:.0%}".format(y))
        )

        # Plot 2: Time
        time_summary = (
            df_simple.groupby(["task_type", "ut_steps"])["generation_time"]
            .mean()
            .reset_index()
        )
        sns.barplot(
            data=time_summary,
            x="ut_steps",
            y="generation_time",
            hue="task_type",
            ax=axes[1],
        )
        axes[1].set_title("Inference Time (s) by UT Steps")

        # Plot 3: Token Count
        sns.boxplot(
            data=df_simple, x="ut_steps", y="generated_tokens", hue="task_type", ax=axes[2]
        )
        axes[2].set_title("Generated Tokens Distribution")

        plt.tight_layout()
        plt.show()

    except Exception as e:
        print(f"‚ö†Ô∏è Visualization error: {e}")
else:
    print("‚ö†Ô∏è No results to visualize.")

print("\nüèÅ Experiment Complete.\n")

NameError: name 'acc_results' is not defined

In [None]:
print("Final Inspection:\n")
print("Top 20 Accuracy Report:\n")
print(df_simple.head(20))
print(f"Full Response:\n")
print(df_simple["full_response"])
print("Perplexity Report:\n")
print(df_ppl.head(20))

In [None]:
print(df_simple[["full_response", "generated_tokens"]])