In [None]:
import json
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd

# Load the pretraining metrics log (JSON Lines: one JSON object per line).
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default, and blank lines are skipped so a stray empty
# line in the log does not make json.loads raise.
with open("../../logs/train_metrics_20251220_1602.jsonl", "r", encoding="utf-8") as f:
    train_metrics = [json.loads(line) for line in f if line.strip()]

# One row per logged record; keys missing from a record become NaN.
df_metrics = pd.DataFrame(train_metrics)
df_metrics

In [None]:
# Derive the time- and token-based x-axes used by the plots below.
#
# The raw timestamps are preserved in "timestamp_raw" the first time this cell
# runs, and "timestamp" is always rebuilt from that copy. Previously the shift
# below was applied in place, so re-running the cell moved the early rows a
# second time and silently corrupted "elapsed_hours".
if "timestamp_raw" not in df_metrics.columns:
    df_metrics["timestamp_raw"] = df_metrics["timestamp"]

timestamps = pd.to_datetime(df_metrics["timestamp_raw"], format="mixed")

# Adjust timestamps after a slight hiccup in training: every row logged before
# the cutoff is moved forward by a fixed 13h37m offset.
cutoff = pd.Timestamp("2025-12-24T09:46:05")
mask = timestamps < cutoff
df_metrics["timestamp"] = timestamps.mask(
    mask, timestamps + pd.Timedelta(hours=13, minutes=37)
)

# Hours elapsed since the first logged row (after the adjustment above).
df_metrics["elapsed_hours"] = (
    df_metrics["timestamp"] - df_metrics["timestamp"].iloc[0]
).dt.total_seconds() / 3600

# Tokens consumed per step, as used by the step -> tokens conversion.
# NOTE(review): presumably 84 sequences x 1024 tokens per step; confirm
# against the training configuration.
tokens_per_step = 84 * 1024
df_metrics["tokens_seen"] = df_metrics["step"] * tokens_per_step
df_metrics["tokens_seen_b"] = df_metrics["tokens_seen"] / 1e9  # in billions

df_metrics

In [None]:
# Training Loss Figure: train and validation cross-entropy against step.

# Rows without a validation loss are dropped so the validation curve is drawn
# as one connected line instead of being broken up by NaN gaps.
has_val = df_metrics["val_loss"].notna()

fig1, ax1 = plt.subplots(figsize=(10, 5), facecolor="white")
ax1.plot(
    df_metrics["step"],
    df_metrics["train_loss"],
    color="tab:blue",
    alpha=0.7,
    label="Train Loss",
)
ax1.plot(
    df_metrics.loc[has_val, "step"],
    df_metrics.loc[has_val, "val_loss"],
    color="tab:orange",
    label="Validation Loss",
    zorder=4,  # keep the validation curve above the train curve
    linewidth=3,
)

# Add horizontal lines for previous and target val loss
ax1.axhline(2.8, color="tab:purple", linestyle="--", linewidth=2, label="Prev val loss")
ax1.axhline(2.7, color="tab:green", linestyle=":", linewidth=2, label="Target val loss")

ax1.legend()

# Grid: dashed horizontal lines, dotted vertical lines, on major and minor ticks.
ax1.yaxis.grid(True, which="both", linestyle="--", linewidth=0.5)
ax1.xaxis.set_major_locator(mticker.AutoLocator())
ax1.xaxis.set_minor_locator(mticker.AutoMinorLocator())
ax1.xaxis.grid(True, which="both", linestyle=":", linewidth=0.5)
# ax1.set_yscale("log")

# The x-axis was previously unlabeled; name it so the figure stands alone.
ax1.set_xlabel("Step")
ax1.set_ylabel("Loss")
ax1.set_title("Cross Entropy Loss by Step (nats)")
ax1.grid(True)
ax1.set_ylim(bottom=1.5, top=4.5)
ax1.set_xlim(left=0, right=350000)

plt.tight_layout()
plt.show()

# Learning Rate Figure: the Muon learning-rate schedule against step.
fig2, ax2 = plt.subplots(figsize=(10, 5))

# Only the Muon schedule is drawn. If an "adamw_lr" column is logged, it can
# be overlaid with:
# ax2.plot(df_metrics["step"], df_metrics["adamw_lr"], color="tab:green")
lr_schedule = df_metrics[["step", "muon_lr"]]
ax2.plot(lr_schedule["step"], lr_schedule["muon_lr"], color="tab:blue")

ax2.set(xlabel="Step", ylabel="Learning Rate", title="Learning Rate by Step")
ax2.set_xlim(0, 350000)
ax2.grid(True)

fig2.tight_layout()
plt.show()

# Hellaswag Eval Accuracy Figure: accuracy against tokens seen.

# Keep only the rows that carry a Hellaswag accuracy value.
has_hellaswag = df_metrics["hellaswag_acc"].notna()

fig3, ax3 = plt.subplots(figsize=(10, 5))
ax3.plot(
    df_metrics.loc[has_hellaswag, "tokens_seen"],
    df_metrics.loc[has_hellaswag, "hellaswag_acc"],
    color="tab:blue",
    marker="o",
)
# This figure is indexed by tokens, not by step like the ones above, so the
# x-axis is labeled explicitly (it was previously unlabeled).
ax3.set_xlabel("Tokens seen")
ax3.set_ylabel("Accuracy")
ax3.set_title("Hellaswag Eval Accuracy")
ax3.grid(True)
ax3.set_ylim(bottom=0.23, top=0.5)
# ax3.set_xlim(left=0, right=350000)
plt.tight_layout()
plt.show()

In [None]:
# Norm Figure: the logged "norm" column against step.
fig_norm, ax_norm = plt.subplots(figsize=(10, 5))
ax_norm.plot(df_metrics["step"], df_metrics["norm"], color="tab:purple")

# Labels, title and fixed view limits in one call.
ax_norm.set(
    xlabel="Step",
    ylabel="Norm",
    title="Gradient Norm by Step",
    xlim=(0, 350000),
    ylim=(0, 3),
)
ax_norm.grid(True)

fig_norm.tight_layout()
plt.show()

In [None]:
# Training Loss Figure by hours: train and validation cross-entropy against
# elapsed training time.

# Rows without a validation loss are dropped so the validation curve is drawn
# as one connected line instead of being broken up by NaN gaps.
has_val = df_metrics["val_loss"].notna()

fig1, ax1 = plt.subplots(figsize=(10, 5), facecolor="white")
ax1.plot(
    df_metrics["elapsed_hours"],
    df_metrics["train_loss"],
    color="tab:blue",
    alpha=0.7,
    label="Train Loss",
)
ax1.plot(
    df_metrics.loc[has_val, "elapsed_hours"],
    df_metrics.loc[has_val, "val_loss"],
    color="tab:orange",
    label="Validation Loss",
    zorder=4,  # keep the validation curve above the train curve
    linewidth=3,
)

# Add horizontal lines for previous and target val loss
ax1.axhline(2.8, color="tab:purple", linestyle="--", linewidth=2, label="Prev val loss")
ax1.axhline(2.7, color="tab:green", linestyle=":", linewidth=2, label="Target val loss")
# ax1.plot([20000, 125000], [3.63, 3.31], color="tab:red", linestyle="--", linewidth=2, label="Prev decline", zorder=99)

ax1.legend()

# Grid: dashed horizontal lines, dotted vertical lines, on major and minor ticks.
ax1.yaxis.grid(True, which="both", linestyle="--", linewidth=0.5)
ax1.xaxis.set_major_locator(mticker.AutoLocator())
ax1.xaxis.set_minor_locator(mticker.AutoMinorLocator())
ax1.xaxis.grid(True, which="both", linestyle=":", linewidth=0.5)
# ax1.set_yscale("log")

# The x-axis was previously unlabeled; name it so the figure stands alone.
ax1.set_xlabel("Elapsed time [hours]")
ax1.set_ylabel("Loss")
ax1.set_title("Cross Entropy Loss by Hours trained (nats)")
ax1.grid(True)
ax1.set_ylim(bottom=1.5, top=4.5)
ax1.set_xlim(left=0, right=70)

plt.tight_layout()
plt.show()

In [None]:
from matplotlib.gridspec import GridSpec

# Pretraining overview: loss, learning rate and norm stacked vertically on a
# shared "tokens seen (billions)" x-axis.

tokens_b = df_metrics["tokens_seen_b"]
# Rows without a validation loss are dropped so the validation curve is drawn
# as one connected line instead of being broken up by NaN gaps.
has_val = df_metrics["val_loss"].notna()

# Create a figure with GridSpec to control height ratios
fig = plt.figure(figsize=(10, 7), facecolor="white")
gs = GridSpec(3, 1, height_ratios=[2.5, 1, 1], hspace=0.18)  # loss taller, lr/norm shorter

# --- Loss Plot ---
ax1 = fig.add_subplot(gs[0])
ax1.plot(
    tokens_b,
    df_metrics["train_loss"],
    color="tab:blue",
    alpha=0.7,
    label="Train Loss",
)
ax1.plot(
    tokens_b[has_val],
    df_metrics.loc[has_val, "val_loss"],
    color="tab:orange",
    label="Validation Loss",
    zorder=4,
    linewidth=3,
)
ax1.legend()
ax1.yaxis.grid(True, which="both", linestyle="--", linewidth=0.5)
ax1.xaxis.set_major_locator(mticker.AutoLocator())
ax1.xaxis.set_minor_locator(mticker.AutoMinorLocator())
ax1.xaxis.grid(True, which="both", linestyle=":", linewidth=0.5)
ax1.set_ylabel("Loss")
ax1.set_ylim(bottom=2, top=4.5)
ax1.grid(True)
# Only the bottom panel carries x tick labels and an x-axis label.
ax1.set_xlabel("")
ax1.tick_params(labelbottom=False)

# --- Learning Rate Plot ---
ax2 = fig.add_subplot(gs[1], sharex=ax1)
ax2.plot(tokens_b, df_metrics["muon_lr"], color="tab:green")
ax2.set_ylabel("Learning Rate")
ax2.grid(True)
ax2.set_xlabel("")
ax2.tick_params(labelbottom=False)

# --- Norm Plot ---
ax3 = fig.add_subplot(gs[2], sharex=ax1)
ax3.plot(tokens_b, df_metrics["norm"], color="tab:purple")
ax3.set_xlabel("Tokens [Billion]")
ax3.set_ylabel("Norm")
ax3.grid(True)
ax3.set_ylim(bottom=0, top=2.5)

# Norm clipping level drawn as a single step-shaped polyline: y=1 from x=0 to
# x=22, a vertical drop at x=22, then y=0.5 out to x=30. Repeating the x value
# gives the vertical segment directly, so no near-duplicate x is needed.
ax3.plot(
    [0, 22, 22, 30],
    [1, 1, 0.5, 0.5],
    color="red",
    linestyle="--",
    label="Norm clipping",
    linewidth=2,
    zorder=99,
)
ax3.legend()

fig.suptitle("Pretraining Progress Overview", fontsize=12, y=0.92)
plt.tight_layout()
plt.show()

In [None]:
# Load the instruct-tuning metrics log (JSON Lines: one JSON object per line).
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default, and blank lines are skipped so a stray empty
# line in the log does not make json.loads raise.
with open("../../logs/instruct_training_metrics.jsonl", "r", encoding="utf-8") as f:
    train_instruct = [json.loads(line) for line in f if line.strip()]

df_instruct = pd.DataFrame(train_instruct)

# Drop rows without a validation loss before plotting that curve, matching
# the pretraining plots. Matplotlib does not connect points across NaN, so an
# unfiltered column with gaps would render as broken or invisible segments.
# When every row has a value this filter is a no-op.
has_val = df_instruct["val_loss"].notna()

fig, ax = plt.subplots(figsize=(10, 5), facecolor="white")
ax.plot(df_instruct["step"], df_instruct["train_loss"], label="Train Loss", color="tab:blue", alpha=0.7)
ax.plot(
    df_instruct.loc[has_val, "step"],
    df_instruct.loc[has_val, "val_loss"],
    label="Validation Loss",
    color="tab:orange",
    linewidth=2,
)
ax.set_xlabel("Step")
ax.set_ylabel("Loss")
ax.set_title("Train and Validation Loss (Instruct Tuning)")
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()