In [1]:
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [67]:
CHECKPOINT_FOLDER = "/home/morg/students/gottesman3/knowledge-analysis-suite/performance_by_step/"
OUTPUT_FOLDER = "/home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/subject_chunks"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [74]:
SHARED_CHECKPOINT_FOLDER = "/home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/outputs"
OUTPUT_FOLDER = "/home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks"

In [59]:
def load_checkpoints(folder):
    checkpoints = {}
    for filename in os.listdir(folder):
        if filename.endswith(".json"):
            with open(os.path.join(folder, filename), "r") as f:
                data = json.load(f)
                checkpoint_name = os.path.splitext(filename)[0]
                checkpoints[checkpoint_name] = data
    return checkpoints

In [60]:
def extract_entity_data(checkpoint_data):
    records = []
    for entity_id, stats in checkpoint_data.items():
        questions = stats.get("questions", 0)
        correct = stats.get("correct", 0)
        occurences = stats.get("occurences", None)
        last_occurence = stats.get("last_occurence", None)

        if questions > 0:
            accuracy = correct / questions
            records.append({
                "entity_id": entity_id,
                "accuracy": accuracy,
                "questions": questions,      # ‚Üê include
                "correct": correct,          # ‚Üê include
                "occurences": occurences,
                "last_occurence": last_occurence
            })
    return pd.DataFrame(records)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

def plot_accuracy_by_field(df, field, checkpoint_name):
    filtered_df = df[df[field].notnull()].copy()

    # Ensure accuracy is in [0, 1]
    filtered_df = filtered_df[
        (filtered_df["accuracy"] >= 0) & (filtered_df["accuracy"] <= 1)
    ]

    if filtered_df.empty:
        print(f"‚ö†Ô∏è No valid data for '{field}' in {checkpoint_name}, skipping plot.")
        return

    # Binning strategy
    try:
        max_val = filtered_df[field].max()
        if max_val == 0:
            print(f"‚ö†Ô∏è Max value of '{field}' is 0 in {checkpoint_name}, skipping.")
            return
        bin_edges = np.linspace(0, max_val, num=11)  # 10 bins from 0 to max
        filtered_df["bin"] = pd.cut(
            filtered_df[field], bins=bin_edges, include_lowest=True
        )
    except ValueError:
        print(f"‚ö†Ô∏è Could not bin '{field}' in {checkpoint_name}. Skipping.")
        return

    # Group by bin - keep empty bins
    grouped = (
        filtered_df
        .groupby("bin", observed=False)  # Keep empty categories
        .agg(
            total_questions=("questions", "sum"),
            total_correct=("correct", "sum"),
            entity_count=("entity_id", "count")
        )
        .reset_index()
    )
    grouped["accuracy"] = grouped.apply(
        lambda row: row["total_correct"] / row["total_questions"] if row["total_questions"] > 0 else 0, 
        axis=1
    )    
    grouped["bin_label"] = grouped["bin"].astype(str)

    # Plot
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x="bin_label", y="accuracy", data=grouped)
    plt.xticks(rotation=45, ha='right')
    plt.title(f"{checkpoint_name}: Accuracy by {field} (n={len(filtered_df)})")
    plt.ylabel("Accuracy (Total Correct / Total Questions)")
    plt.xlabel(f"{field.replace('_', ' ').title()}")
    plt.ylim(0, 1.11)
    plt.grid(True, linestyle="--", alpha=0.5)

    # Annotate entity counts
    for i, row in grouped.iterrows():
        ax.text(
            i,
            row["accuracy"] + 0.03,
            f'n={row["entity_count"]}',
            ha='center',
            va='bottom',
            fontsize=9
        )

    plt.tight_layout()
    plot_path = os.path.join(OUTPUT_FOLDER, f"{checkpoint_name}_binned_accuracy_by_{field}.png")
    plt.savefig(plot_path)
    plt.close()
    print(f"‚úÖ Saved plot: {plot_path}")


In [90]:
def plot_occurences_of_last_seen_entities(df, checkpoint_name):
    # Filter entities with valid last_occurence and occurences
    filtered_df = df[df["last_occurence"].notnull() & df["occurences"].notnull()].copy()

    if filtered_df.empty:
        print(f"‚ö†Ô∏è No data for occurences + last_occurence in {checkpoint_name}, skipping cross-bin plot.")
        return

    # Binning strategy for last_occurence
    try:
        max_val = filtered_df["last_occurence"].max()
        if max_val == 0:
            print(f"‚ö†Ô∏è Max value of 'last_occurence' is 0 in {checkpoint_name}, skipping.")
            return
        bin_edges = np.linspace(0, max_val, num=11)  # 10 bins from 0 to max
        filtered_df["last_bin"] = pd.cut(
            filtered_df["last_occurence"], bins=bin_edges, include_lowest=True
        )
    except ValueError:
        print(f"‚ö†Ô∏è Could not bin last_occurence in {checkpoint_name}, skipping.")
        return

    # Identify the last bin
    last_bin = filtered_df["last_bin"].cat.categories[-1]
    print(last_bin)
    last_seen_df = filtered_df[filtered_df["last_bin"] == last_bin]

    if last_seen_df.empty:
        print(f"‚ö†Ô∏è No entities in last bin of last_occurence in {checkpoint_name}, skipping.")
        return

    # Fixed bins for occurences: 0-10, 10-100, 100-1000, 1000+
    max_val = last_seen_df["occurences"].max()
    if max_val == 0:
        print(f"‚ö†Ô∏è All occurences are 0 for last-seen entities in {checkpoint_name}. Skipping.")
        return

    # Define fixed bin edges
    bin_edges = [0, 10, 100, 1000, float('inf')]
    bin_labels = ['0-10', '10-100', '100-1000', '1000+']
    
    last_seen_df["occur_bin"] = pd.cut(
        last_seen_df["occurences"], 
        bins=bin_edges, 
        labels=bin_labels,
        include_lowest=True,
        right=False  # Use [0, 10), [10, 100), [100, 1000), [1000, inf)
    )

    # Count how many entities fall into each occurence bin
    grouped = (
        last_seen_df
        .groupby("occur_bin", observed=True)
        .agg(entity_count=("entity_id", "count"))
        .reset_index()
    )
    grouped["bin_label"] = grouped["occur_bin"].astype(str)

    # Plot
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(x="bin_label", y="entity_count", data=grouped)
    plt.xticks(rotation=45, ha='right')
    plt.title(f"{checkpoint_name}: Occurence Bins of Last-Seen Entities")
    plt.ylabel("Entity Count")
    plt.xlabel("Occurences Bin")
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.tight_layout()

    plot_path = os.path.join(OUTPUT_FOLDER, f"{checkpoint_name}_last_seen_entities_by_occurences.png")
    plt.savefig(plot_path)
    plt.close()
    print(f"‚úÖ Saved last-seen occurence bin plot: {plot_path}")

In [91]:
checkpoints = load_checkpoints(SHARED_CHECKPOINT_FOLDER)
for checkpoint_name, checkpoint_data in checkpoints.items():
    print(f"üìä Processing checkpoint: {checkpoint_name}")
    df = extract_entity_data(checkpoint_data)
    plot_accuracy_by_field(df, "occurences", checkpoint_name)
    plot_accuracy_by_field(df, "last_occurence", checkpoint_name)
    plot_occurences_of_last_seen_entities(df, checkpoint_name)

üìä Processing checkpoint: checkpoint_1
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_1_binned_accuracy_by_occurences.png
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_1_binned_accuracy_by_last_occurence.png
(8999.1, 9999.0]
‚úÖ Saved last-seen occurence bin plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_1_last_seen_entities_by_occurences.png
üìä Processing checkpoint: checkpoint_2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_seen_df["occur_bin"] = pd.cut(


‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_2_binned_accuracy_by_occurences.png
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_2_binned_accuracy_by_last_occurence.png
(17999.1, 19999.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_seen_df["occur_bin"] = pd.cut(


‚úÖ Saved last-seen occurence bin plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_2_last_seen_entities_by_occurences.png
üìä Processing checkpoint: checkpoint_3
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_3_binned_accuracy_by_occurences.png
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_3_binned_accuracy_by_last_occurence.png
(26999.1, 29999.0]
‚úÖ Saved last-seen occurence bin plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_3_last_seen_entities_by_occurences.png
üìä Processing checkpoint: checkpoint_4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_seen_df["occur_bin"] = pd.cut(


‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_4_binned_accuracy_by_occurences.png
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_4_binned_accuracy_by_last_occurence.png
(35999.1, 39999.0]
‚úÖ Saved last-seen occurence bin plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_4_last_seen_entities_by_occurences.png
üìä Processing checkpoint: checkpoint_5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_seen_df["occur_bin"] = pd.cut(


‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_5_binned_accuracy_by_occurences.png
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_5_binned_accuracy_by_last_occurence.png
(44999.1, 49999.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_seen_df["occur_bin"] = pd.cut(


‚úÖ Saved last-seen occurence bin plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_5_last_seen_entities_by_occurences.png
üìä Processing checkpoint: checkpoint_6
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_6_binned_accuracy_by_occurences.png
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_6_binned_accuracy_by_last_occurence.png
(53999.1, 59999.0]
‚úÖ Saved last-seen occurence bin plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_6_last_seen_entities_by_occurences.png
üìä Processing checkpoint: checkpoint_7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_seen_df["occur_bin"] = pd.cut(


‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_7_binned_accuracy_by_occurences.png
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_7_binned_accuracy_by_last_occurence.png
(62999.1, 69999.0]
‚úÖ Saved last-seen occurence bin plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_7_last_seen_entities_by_occurences.png
üìä Processing checkpoint: checkpoint_8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_seen_df["occur_bin"] = pd.cut(


‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_8_binned_accuracy_by_occurences.png
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_8_binned_accuracy_by_last_occurence.png
(71999.1, 79999.0]
‚úÖ Saved last-seen occurence bin plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_8_last_seen_entities_by_occurences.png
üìä Processing checkpoint: checkpoint_9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_seen_df["occur_bin"] = pd.cut(


‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_9_binned_accuracy_by_occurences.png
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_9_binned_accuracy_by_last_occurence.png
(80999.1, 89999.0]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_seen_df["occur_bin"] = pd.cut(


‚úÖ Saved last-seen occurence bin plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_9_last_seen_entities_by_occurences.png
üìä Processing checkpoint: checkpoint_10
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_10_binned_accuracy_by_occurences.png
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_10_binned_accuracy_by_last_occurence.png
(89999.1, 99999.0]
‚úÖ Saved last-seen occurence bin plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_10_last_seen_entities_by_occurences.png
üìä Processing checkpoint: checkpoint_final


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_seen_df["occur_bin"] = pd.cut(


‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_final_binned_accuracy_by_occurences.png
‚úÖ Saved plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_final_binned_accuracy_by_last_occurence.png
(98703.9, 109671.0]
‚úÖ Saved last-seen occurence bin plot: /home/joberant/NLP_2425b/shirab6/knowledge-analysis-suite/OLMo-core/performance_by_step_plots/shared_chunks/checkpoint_final_last_seen_entities_by_occurences.png


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  last_seen_df["occur_bin"] = pd.cut(
