In [1]:
# Ensure the project root is on sys.path so imports like `data_util` resolve.
import sys

PROJECT_ROOT = "/root/competitive-coding-ai"

# Drop any earlier copy first so re-running this cell keeps exactly one entry
# (still at the front) instead of stacking a duplicate on every execution.
sys.path[:] = [entry for entry in sys.path if entry != PROJECT_ROOT]
sys.path.insert(0, PROJECT_ROOT)


In [2]:
# Shared data utilities: map legacy modules to data_util and expose helpers
import sys
import data_util.codeforces as cf
import data_util.piston_eval as pe
import data_util.programming_pretty as pp

# Register the data_util modules under their legacy names in sys.modules,
# so code that still refers to the old module names gets the new modules.
sys.modules.update({
    'pretty_print_r1_codeforces': cf,
    'pretty_print_piston_eval': pe,
})

# Convenience imports used in this notebook
from data_util import (
    pretty_print_programming_record,
    pretty_print_programming_record_veri,
    pretty_print_codeforces_problem,
    pretty_print_codeforces_problem_dark,
    get_record_by_problem_id,
)


In [3]:
from datasets import load_dataset

MIXTURE_OF_THOUGHTS = "open-r1/Mixture-of-Thoughts"
CODEFORCES_COTS = "open-r1/codeforces-cots"


def _load_train(repo_id, config_name):
    """Load the train split of one configuration of a Hub dataset."""
    return load_dataset(repo_id, config_name, split="train")


# Full mixture first, then the individual domains / Codeforces subsets.
dataset = _load_train(MIXTURE_OF_THOUGHTS, "all")

ds_math = _load_train(MIXTURE_OF_THOUGHTS, "math")
ds_code = _load_train(MIXTURE_OF_THOUGHTS, "code")
ds_code_py = _load_train(CODEFORCES_COTS, "solutions_py")
ds_code_ed_py = _load_train(CODEFORCES_COTS, "solutions_w_editorials_py")
ds_science = _load_train(MIXTURE_OF_THOUGHTS, "science")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Row counts noted by the author: all = 349317, math = 93733, science = 172514.

# Display knobs for the sample row. TRUNCATE_LONG_VALUES defaults to False,
# which reproduces the previous behaviour (the truncation branch used to be
# hard-disabled with `if False`); flip it to True to clip long string fields.
TRUNCATE_LONG_VALUES = False
MAX_VALUE_CHARS = 600
SAMPLE_INDEX = 688

# Print dataset info
print("=== TRAIN DATASET ===")
print(dataset)
print(ds_code)
print(ds_code_ed_py)
print(ds_code_py)

# Print features with types
print("\n=== FEATURES ===")
for feature_name, feature_type in ds_code_py.features.items():
    print(f"{feature_name}: {feature_type}")

# Check a sample row
print("\n=== SAMPLE ROW ===")
sample = ds_code_py[SAMPLE_INDEX]
for key, value in sample.items():
    if TRUNCATE_LONG_VALUES and isinstance(value, str) and len(value) > MAX_VALUE_CHARS:
        print(f"{key}: {value[:MAX_VALUE_CHARS]}...")
    else:
        print(f"{key}: {value}")


=== TRAIN DATASET ===
Dataset({
    features: ['messages', 'num_tokens', 'source'],
    num_rows: 349317
})
Dataset({
    features: ['messages', 'num_tokens', 'source'],
    num_rows: 83070
})
Dataset({
    features: ['id', 'aliases', 'contest_id', 'contest_name', 'contest_type', 'contest_start', 'contest_start_year', 'index', 'time_limit', 'memory_limit', 'title', 'description', 'input_format', 'output_format', 'interaction_format', 'note', 'examples', 'editorial', 'prompt', 'generation', 'finish_reason', 'api_metadata', 'messages'],
    num_rows: 11672
})
Dataset({
    features: ['id', 'aliases', 'contest_id', 'contest_name', 'contest_type', 'contest_start', 'contest_start_year', 'index', 'time_limit', 'memory_limit', 'title', 'description', 'input_format', 'output_format', 'interaction_format', 'note', 'examples', 'editorial', 'prompt', 'generation', 'finish_reason', 'api_metadata', 'messages'],
    num_rows: 9556
})

=== FEATURES ===
id: Value('string')
aliases: List(Value('string'

In [5]:
from datasets import get_dataset_infos, load_dataset

def analyze_codeforces_cots_dataset():
    """
    Print a summary of every subset of the 'open-r1/codeforces-cots' dataset.

    For each subset the function prints the train-split sample count taken
    from the dataset metadata, the column names, and the first record with
    each field clipped to 700 characters. Nothing is returned; exceptions are
    caught and reported on stdout rather than raised.
    """
    repo_id = "open-r1/codeforces-cots"
    char_limit = 700

    try:
        # Metadata lookup: one info object per subset, including split sizes.
        print(f"Fetching dataset information for '{repo_id}'...")
        subset_infos = get_dataset_infos(repo_id)
        print("Done.\n")

        for subset_name, subset_info in subset_infos.items():
            print(f"================== SUBSET: {subset_name} ==================")
            try:
                train_count = subset_info.splits['train'].num_examples
                print(f"Total Samples: {train_count:,}")

                # Streaming mode: pull only the first record instead of the whole subset.
                record_stream = load_dataset(repo_id, subset_name, split='train', streaming=True)
                first_record = next(iter(record_stream))

                print("Columns:", list(first_record.keys()))
                print("\n--- Sample Record ---")

                for field_name, field_value in first_record.items():
                    print(f"\n>>> {field_name}:")
                    rendered = str(field_value)
                    # Clip long values so the cell output stays readable.
                    if len(rendered) > char_limit:
                        rendered = rendered[:char_limit] + '...'
                    print(rendered)

                print("\n" + "="*80 + "\n")

            except Exception as subset_error:
                print(f"Could not process subset '{subset_name}'. Error: {subset_error}\n")

    except Exception as fetch_error:
        print(f"Failed to get dataset info. Please check your internet connection and that the 'datasets' library is installed correctly. Error: {fetch_error}")

# Execute the analysis function; it prints a per-subset summary to stdout
# and returns nothing.
analyze_codeforces_cots_dataset()


Fetching dataset information for 'open-r1/codeforces-cots'...
Done.

Total Samples: 35,718
Columns: ['id', 'aliases', 'contest_id', 'contest_name', 'contest_type', 'contest_start', 'contest_start_year', 'index', 'time_limit', 'memory_limit', 'title', 'description', 'input_format', 'output_format', 'interaction_format', 'note', 'examples', 'editorial', 'prompt', 'generation', 'finish_reason', 'api_metadata', 'messages']

--- Sample Record ---

>>> id:
176/E

>>> aliases:
None

>>> contest_id:
176

>>> contest_name:
Croc Champ 2012 - Round 2

>>> contest_type:
CF

>>> contest_start:
1334934300

>>> contest_start_year:
2012

>>> index:
E

>>> time_limit:
1.0

>>> memory_limit:
256.0

>>> title:
Archaeology

>>> description:
This time you should help a team of researchers on an island in the Pacific Ocean. They research the culture of the ancient tribes that used to inhabit the island many years ago.

Overall they've dug out n villages. Some pairs of villages were connected by roads. Peopl

In [6]:
import html
from datetime import datetime
from IPython.display import display, HTML

try:
    from pygments import highlight
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters import HtmlFormatter
    PYGMENTS_AVAILABLE = True
except Exception:
    PYGMENTS_AVAILABLE = False


def pretty_print_programming_record(record: dict, record_type: str = "competitive_programming"):
    """Render a chat-style programming record as a styled HTML card in the notebook.

    Note: this cell-level definition rebinds the name that an earlier cell
    imports from ``data_util``.

    Args:
        record: Mapping read via ``.get`` for the keys ``"id"`` (card title),
            ``"timestamp"`` (optional, passed to ``datetime.fromtimestamp``)
            and ``"messages"`` (list of dicts with ``"role"`` / ``"content"``).
        record_type: Label shown in the header badge; underscores become spaces.

    Returns:
        None. The card is emitted through ``display(HTML(...))``. If ``record``
        is not a dict, an error line is printed and nothing is rendered.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    if not isinstance(record, dict):
        print("❌ Error: The provided record is not a valid dictionary.")
        return

    styles = (
        "<style>"
        ".programming-record-container{font-family:-apple-system,BlinkMacSystemFont,\"Segoe UI\",\"Noto Sans\",sans-serif;border:1px solid #d0d7de;"
        "border-radius:12px;margin-bottom:24px;box-shadow:0 8px 24px rgba(0,0,0,0.12);overflow:hidden;background:linear-gradient(135deg,#fafbfc 0%,#f6f8fa 100%);}"
        ".header{display:flex;justify-content:space-between;align-items:center;padding:16px 24px;background:linear-gradient(135deg,#0366d6 0%,#0256c7 100%);color:white;border-bottom:1px solid rgba(255,255,255,0.2);}"
        ".problem-title{font-size:22px;font-weight:700;text-shadow:0 1px 2px rgba(0,0,0,0.2);}"
        ".record-type{font-family:'SF Mono','Monaco','Inconsolata',monospace;font-size:11px;padding:6px 12px;background-color:rgba(255,255,255,0.2);border:1px solid rgba(255,255,255,0.3);border-radius:20px;backdrop-filter:blur(10px);text-transform:uppercase;font-weight:600;letter-spacing:.5px;}"
        ".metadata-section{padding:16px 24px;background-color:#f8f9fa;border-bottom:1px solid #e1e8ed;display:grid;grid-template-columns:repeat(auto-fit,minmax(200px,1fr));gap:16px;font-size:14px;}"
        ".metadata-item{display:flex;align-items:center;gap:8px;}"
        ".metadata-label{font-weight:600;color:#24292e;min-width:80px;}"
        ".metadata-value{color:#57606a;font-family:'SF Mono',monospace;background:#e1e8ed;padding:2px 6px;border-radius:4px;font-size:12px;}"
        ".messages-section{max-height:600px;overflow-y:auto;}"
        ".message{padding:20px 24px;border-bottom:1px solid #e1e8ed;}"
        ".message:last-child{border-bottom:none;}"
        ".message.user{background-color:#f0f8ff;border-left:4px solid #0366d6;}"
        ".message.assistant{background-color:#f8fff8;border-left:4px solid #28a745;}"
        ".message-header{display:flex;justify-content:space-between;align-items:center;margin-bottom:12px;}"
        ".role{font-weight:600;font-size:14px;text-transform:uppercase;letter-spacing:.5px;}"
        ".role.user{color:#0366d6;} .role.assistant{color:#28a745;}"
        ".message-content{line-height:1.6;color:#24292e;}"
        ".code-block{background-color:#272822;border-radius:8px;margin:16px 0;overflow:hidden;}"
        ".code-header{background-color:#3c3c3c;padding:8px 16px;font-size:12px;color:#f8f8f2;font-family:'SF Mono',monospace;border-bottom:1px solid #4a4a4a;}"
        ".code-block pre{margin:0;padding:20px;white-space:pre-wrap;word-wrap:break-word;background-color:#272822;color:#f8f8f2;font-family:'SF Mono','Monaco','Inconsolata',monospace;font-size:13px;line-height:1.5;}"
        ".highlight{background-color:#272822 !important;}"
        ".stats-footer{padding:12px 24px;font-size:12px;color:#8c959d;background:linear-gradient(135deg,#f6f8fa 0%,#f1f3f4 100%);border-top:1px solid #e1e8ed;display:flex;justify-content:space-between;align-items:center;}"
        "</style>"
    )

    record_id = record.get("id", "Unknown")
    timestamp = record.get("timestamp")
    # `or []` also covers a record whose "messages" key is present but None.
    messages = record.get("messages") or []

    readable_time = ""
    if timestamp:
        try:
            # Convert explicitly in UTC: the label says "UTC", whereas a bare
            # fromtimestamp() would format the machine's local time.
            readable_time = datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
        except Exception:
            # Not a usable numeric timestamp; show whatever was stored.
            readable_time = str(timestamp)

    total_messages = len(messages)
    user_messages = sum(1 for m in messages if m.get("role") == "user")
    assistant_messages = sum(1 for m in messages if m.get("role") == "assistant")

    # Build messages
    message_blocks = []
    for message in messages:
        role = message.get("role", "unknown")
        content = message.get("content", "")
        content_html = format_message_content(content)
        # Only two visual styles exist; every non-user role is drawn like the assistant.
        message_class = "user" if role == "user" else "assistant"
        # The role comes from the data and lands in an attribute and in text, so escape it.
        safe_role = html.escape(str(role))
        block = (
            "<div class=\"message {}\">".format(message_class)
            + "<div class=\"message-header\"><span class=\"role {}\">{}</span></div>".format(safe_role, safe_role)
            + "<div class=\"message-content\">{}</div>".format(content_html)
            + "</div>"
        )
        message_blocks.append(block)
    messages_html = "".join(message_blocks)

    timestamp_html = ""
    if readable_time:
        timestamp_html = (
            '<div class="metadata-item"><span class="metadata-label">Timestamp:</span><span class="metadata-value">{}</span></div>'
        ).format(html.escape(readable_time))

    metadata_html = (
        '<div class="metadata-section">'
        '<div class="metadata-item"><span class="metadata-label">Messages:</span><span class="metadata-value">{} total</span></div>'
        '<div class="metadata-item"><span class="metadata-label">User:</span><span class="metadata-value">{} messages</span></div>'
        '<div class="metadata-item"><span class="metadata-label">Assistant:</span><span class="metadata-value">{} messages</span></div>'
        '{}'
        '</div>'
    ).format(total_messages, user_messages, assistant_messages, timestamp_html)

    badge_text = html.escape(str(record_type).replace('_', ' '))

    full_html = (
        styles
        + '<div class="programming-record-container">'
        + '<div class="header">'
        + '<span class="problem-title">{}</span>'.format(html.escape(str(record_id)))
        + '<span class="record-type">{}</span>'.format(badge_text)
        + '</div>'
        + metadata_html
        + '<div class="messages-section">{}</div>'.format(messages_html)
        + '<div class="stats-footer"><span>Programming conversation record</span><span>{} messages • {} characters</span></div>'.format(total_messages, len(str(record)))
        + '</div>'
        # Ask MathJax (if the front end loaded it) to typeset the newly inserted markup.
        + '<script>if(window.MathJax){try{window.MathJax.typeset&&window.MathJax.typeset();}catch(e){}try{window.MathJax.typesetPromise&&window.MathJax.typesetPromise();}catch(e){}}</script>'
    )

    display(HTML(full_html))


def format_message_content(content: str) -> str:
    """Convert a message body (Markdown with fenced code) into an HTML fragment.

    The preferred path runs the ``markdown`` package with the fenced-code,
    codehilite (monokai, inline styles) and tables extensions. If that import
    or the conversion fails for any reason, a regex-based fallback escapes the
    prose and wraps each fenced block in a ``code-block`` div, highlighted with
    Pygments when ``PYGMENTS_AVAILABLE`` is true. LaTeX is not processed here.

    Args:
        content: Message text. Falsy values yield ``""``; any other non-string
            value is converted with ``str()`` before rendering.

    Returns:
        The rendered HTML as a string.
    """
    if not content:
        return ""
    if not isinstance(content, str):
        # Neither renderer accepts non-strings (the regex fallback would raise
        # TypeError), so show the value's text form instead of crashing.
        content = str(content)

    try:
        import markdown
        from markdown.extensions.fenced_code import FencedCodeExtension
        from markdown.extensions.codehilite import CodeHiliteExtension
        from markdown.extensions.tables import TableExtension
        return markdown.markdown(
            content,
            extensions=[
                FencedCodeExtension(),
                CodeHiliteExtension(linenums=False, guess_lang=True, noclasses=True, pygments_style='monokai'),
                TableExtension(),
            ],
            output_format='html5',
        )
    except Exception:
        # Deliberate best-effort: a missing package and a conversion error both
        # drop through to the dependency-light renderer below.
        pass

    import re

    # Fence opener: optional padding, a language tag that may contain + # . -
    # (e.g. "c++", "c#", "objective-c"), optional trailing padding, LF or CRLF.
    fence_pattern = re.compile(r"```[ \t]*([\w+#.\-]*)[ \t]*\r?\n([\s\S]*?)```")

    def _as_text_div(text):
        # Prose between code blocks: escape it and keep the line breaks visible.
        return "<div>{}</div>".format(html.escape(text).replace('\n', '<br>'))

    parts = []
    last = 0
    for m in fence_pattern.finditer(content):
        before = content[last:m.start()]
        if before:
            parts.append(_as_text_div(before))
        lang = m.group(1) or 'text'
        code = m.group(2)
        highlighted = None
        if PYGMENTS_AVAILABLE:
            try:
                lexer = get_lexer_by_name(lang)
                formatter = HtmlFormatter(style='monokai', noclasses=True)
                highlighted = highlight(code, lexer, formatter)
            except Exception:
                # Unknown language tag or lexer failure: fall back to plain <pre>.
                highlighted = None
        if highlighted is None:
            highlighted = "<pre><code>{}</code></pre>".format(html.escape(code))
        parts.append(
            '<div class="code-block"><div class="code-header">{}</div>{}</div>'.format(
                html.escape(lang.upper()), highlighted
            )
        )
        last = m.end()
    tail = content[last:]
    if tail:
        parts.append(_as_text_div(tail))
    return "".join(parts)

In [6]:
# Load the decontaminated editorial subset (non-streaming) and render one record.
ds_code_ed_py_decont = load_dataset(
    "open-r1/codeforces-cots",
    "solutions_w_editorials_py_decontaminated",
    split='train',
    streaming=False,
)
decont_sample = ds_code_ed_py_decont[1288]
pretty_print_programming_record(decont_sample, "competitive_programming")

In [8]:
from datasets import load_from_disk

# --- Load the Datasets ---

# Root directory holding both outputs saved by the parallel script
filtered_root = "/mnt/data2/filtered_datasets_flexible_match"
successful_dataset_path = filtered_root + "/successful_solutions"
failed_dataset_path = filtered_root + "/failed_solutions"

print(f"Loading successful solutions from: {successful_dataset_path}")
successful_ds = load_from_disk(successful_dataset_path)

print(f"Loading failed solutions from: {failed_dataset_path}")
failed_ds = load_from_disk(failed_dataset_path)


# --- Inspect the Loaded Datasets ---

print("\n--- Inspection ---")
print("\n✅ Successfully Verified Dataset:", successful_ds, sep="\n")
print("\n❌ Failed Verification Dataset:", failed_ds, sep="\n")

# Peek at raw rows to see the structure, including the 'verification_result' column
print("\nExample of a successful row:")
first_successful_row = successful_ds[0]
print(first_successful_row)

# Rows 1-4 of the failed set. Author's note on row 4: "independent set has no order".
print("\nExample of a failed row:")
failed_rows_slice = failed_ds[1:5]
print(failed_rows_slice)

pretty_print_programming_record(failed_ds[4], "Failed")

Loading successful solutions from: /mnt/data2/filtered_datasets_flexible_match/successful_solutions
Loading failed solutions from: /mnt/data2/filtered_datasets_flexible_match/failed_solutions

--- Inspection ---

✅ Successfully Verified Dataset:
Dataset({
    features: ['id', 'aliases', 'contest_id', 'contest_name', 'contest_type', 'contest_start', 'contest_start_year', 'index', 'time_limit', 'memory_limit', 'title', 'description', 'input_format', 'output_format', 'interaction_format', 'note', 'examples', 'editorial', 'prompt', 'generation', 'finish_reason', 'api_metadata', 'messages', 'verification_result'],
    num_rows: 5760
})

❌ Failed Verification Dataset:
Dataset({
    features: ['id', 'aliases', 'contest_id', 'contest_name', 'contest_type', 'contest_start', 'contest_start_year', 'index', 'time_limit', 'memory_limit', 'title', 'description', 'input_format', 'output_format', 'interaction_format', 'note', 'examples', 'editorial', 'prompt', 'generation', 'finish_reason', 'api_metad

In [9]:
from datasets import load_from_disk
from collections import Counter
import pandas as pd

# --- 1. Load the Dataset ---
# Ensure this path points to where your failed dataset is saved.
try:
    print(f"Loading failed solutions from: {failed_dataset_path}")
    failed_ds = load_from_disk(failed_dataset_path)
    print(f"Loaded {len(failed_ds)} failed examples successfully.")
except FileNotFoundError:
    print(f"Error: The directory '{failed_dataset_path}' was not found.")
    print("Please make sure you have run the parallel filtering script first.")
    # Re-raise rather than calling exit(): inside a notebook exit() asks the
    # kernel to shut down, while re-raising only stops this cell and keeps the
    # traceback. The statements below must not run without `failed_ds`.
    raise

# --- 2. Generalize and Count the Reasons ---

def generalize_reason(reason_string):
    """
    Collapse a detailed failure message into a coarse category label.

    Non-string input maps to "Unknown Error". Otherwise the first marker in
    the table below that occurs as a substring decides the category; when no
    marker is present the result is "Other Execution Error".
    """
    if not isinstance(reason_string, str):
        return "Unknown Error"

    # (substring marker, category) pairs; order matters when several markers occur.
    marker_to_category = (
        ("Wrong Answer", "Wrong Answer"),
        ("Time Limit Exceeded", "Time Limit Exceeded"),
        ("Runtime Error", "Runtime Error"),
        ("Could not extract Python code", "Code Extraction Failed"),
        ("No test cases found", "No Test Cases Found"),
    )
    return next(
        (category for marker, category in marker_to_category if marker in reason_string),
        "Other Execution Error",
    )

# Tally one coarse category per failed example.
reason_counter = Counter(
    generalize_reason(example['verification_result']['reason'])
    for example in failed_ds
)

# --- 3. Display the Summary ---

banner = "=" * 40
print("\n" + banner)
print("   Summary of Failed Execution Reasons")
print(banner)

if reason_counter:
    total_failed = len(failed_ds)

    # One record per category, most frequent first.
    summary_data = [
        {
            "Failure Reason": reason,
            "Count": count,
            "Percentage": f"{(count / total_failed) * 100:.2f}%",
        }
        for reason, count in reason_counter.most_common()
    ]

    # Closing row with the overall total.
    total_record = {
        "Failure Reason": "--- TOTAL ---",
        "Count": total_failed,
        "Percentage": "100.00%",
    }

    df = pd.DataFrame(summary_data + [total_record])

    # Print the table without the index column
    print(df.to_string(index=False))
else:
    print("No failed examples to summarize.")

print(banner)

Loading failed solutions from: /mnt/data2/filtered_datasets_flexible_match/failed_solutions
Loaded 3796 failed examples successfully.

   Summary of Failed Execution Reasons
        Failure Reason  Count Percentage
          Wrong Answer   2866     75.50%
Code Extraction Failed    398     10.48%
         Runtime Error    327      8.61%
   No Test Cases Found    130      3.42%
   Time Limit Exceeded     75      1.98%
         --- TOTAL ---   3796    100.00%


In [10]:
def _is_wrong_answer(example):
    """Return True when the example's failure reason mentions "Wrong Answer"."""
    reason = example['verification_result']['reason']
    # Same guard as generalize_reason: a non-string reason is never a match
    # (a bare `in` test would raise TypeError on None).
    return isinstance(reason, str) and "Wrong Answer" in reason


wrong_answer_examples = failed_ds.filter(_is_wrong_answer)

# Preview at most 10 matches. Clamp the count so a filter result with fewer
# than 10 rows does not make select() index out of range.
preview_count = min(10, len(wrong_answer_examples))
for ex in wrong_answer_examples.select(range(preview_count)):
    # Pretty print it to see the detailed failure analysis
    pretty_print_programming_record_veri(ex)

Filter: 100%|██████████| 3796/3796 [00:00<00:00, 8309.81 examples/s]
