In [None]:
import json
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd

# Load the training metrics log: one JSON object per line (JSONL).
# Whitespace-only lines are skipped so that a stray blank line in the log
# does not abort the load with a JSONDecodeError, and the file is read as
# UTF-8 regardless of the platform's default encoding.
with open("../../logs/train_metrics_20251220_1343.jsonl", "r", encoding="utf-8") as f:
    train_metrics = [json.loads(line) for line in f if line.strip()]

# One row per logged record; keys missing from a record become NaN.
df_metrics = pd.DataFrame(train_metrics)
df_metrics

In [None]:
# Parse timestamps, compensate for a slight hiccup in training, and derive
# the elapsed training time in hours.
#
# The adjusted column is always rebuilt from the raw records in
# `train_metrics` rather than from the current `timestamp` column, so this
# cell is idempotent: re-running it cannot apply the shift a second time.
HICCUP_CUTOFF = pd.Timestamp("2025-12-20T15:50:00")
HICCUP_SHIFT = pd.Timedelta(minutes=24)

raw_timestamps = pd.to_datetime(
    pd.Series(
        [record.get("timestamp") for record in train_metrics],
        index=df_metrics.index,
    )
)

# Rows logged before the cutoff are moved forward by the shift; rows at or
# after the cutoff (and missing timestamps) are left unchanged.
adjusted_timestamps = raw_timestamps.copy()
adjusted_timestamps.loc[raw_timestamps < HICCUP_CUTOFF] += HICCUP_SHIFT
df_metrics["timestamp"] = adjusted_timestamps

# Hours since the first logged record (after adjustment).
df_metrics["elapsed_hours"] = (
    adjusted_timestamps - adjusted_timestamps.iloc[0]
).dt.total_seconds() / 3600

df_metrics

In [None]:
# Training Loss Figure (x-axis: training step)

# Rows that actually carry a validation loss; the rest are NaN and are skipped.
val_rows = df_metrics[df_metrics["val_loss"].notna()]

fig1, ax1 = plt.subplots(figsize=(10, 5), facecolor="white")
ax1.plot(
    df_metrics["step"],
    df_metrics["train_loss"],
    color="tab:blue",
    alpha=0.7,
    label="Train Loss",
)
ax1.plot(
    val_rows["step"],
    val_rows["val_loss"],
    color="tab:orange",
    label="Validation Loss",
    zorder=4,  # keep the validation curve above the train-loss curve
    linewidth=3,
)

# Reference lines: previous and target validation loss, plus the previous
# run's decline drawn as a straight segment between two (step, loss) points.
ax1.axhline(2.8, color="tab:purple", linestyle="--", linewidth=2, label="Prev val loss")
ax1.axhline(2.6, color="tab:green", linestyle=":", linewidth=2, label="Target val loss")
ax1.plot([20000, 125000], [3.63, 3.31], color="tab:red", linestyle="--", linewidth=2, label="Prev decline", zorder=99)

ax1.legend()

# Grid: dashed horizontal lines, dotted vertical lines on major and minor x ticks.
ax1.xaxis.set_major_locator(mticker.AutoLocator())
ax1.xaxis.set_minor_locator(mticker.AutoMinorLocator())
ax1.yaxis.grid(True, which="both", linestyle="--", linewidth=0.5)
ax1.xaxis.grid(True, which="both", linestyle=":", linewidth=0.5)
# ax1.set_yscale("log")

# Label both axes so the figure stands on its own.
ax1.set_xlabel("Step")
ax1.set_ylabel("Loss")
ax1.set_title("Cross Entropy Loss by Step (nats)")
ax1.set_ylim(bottom=1.5, top=4.5)
ax1.set_xlim(left=0, right=310000)

plt.tight_layout()
plt.show()

# Learning Rate Figure: the Muon learning rate plotted against the step.
fig2, ax2 = plt.subplots(figsize=(10, 5))
ax2.plot(df_metrics["step"], df_metrics["muon_lr"], color="tab:blue")
# Disabled overlay of the AdamW learning rate:
# ax2.plot(df_metrics["step"], df_metrics["adamw_lr"], color="tab:green")
ax2.set(
    xlabel="Step",
    ylabel="Learning Rate",
    title="Learning Rate by Step",
)
ax2.set_xlim(0, 310000)  # same step range as the loss-by-step figure
ax2.grid(True)
fig2.tight_layout()
plt.show()

# Hellaswag Eval Accuracy Figure

# Only rows with a recorded Hellaswag accuracy are plotted; NaN rows are skipped.
hellaswag_rows = df_metrics[df_metrics["hellaswag_acc"].notna()]

fig3, ax3 = plt.subplots(figsize=(10, 5))
ax3.plot(
    hellaswag_rows["step"],
    hellaswag_rows["hellaswag_acc"],
    color="tab:blue",
    marker="o",
)
# Label both axes so the figure stands on its own.
ax3.set_xlabel("Step")
ax3.set_ylabel("Hellaswag Accuracy")
ax3.set_title("Hellaswag Eval Accuracy by Step (Base Model)")
ax3.grid(True)
ax3.set_ylim(bottom=0.23, top=0.4)
plt.tight_layout()
plt.show()

In [None]:
# Training Loss Figure (x-axis: elapsed hours)

# Rows that actually carry a validation loss; the rest are NaN and are skipped.
val_rows = df_metrics[df_metrics["val_loss"].notna()]

fig1, ax1 = plt.subplots(figsize=(10, 5), facecolor="white")
ax1.plot(
    df_metrics["elapsed_hours"],
    df_metrics["train_loss"],
    color="tab:blue",
    alpha=0.7,
    label="Train Loss",
)
ax1.plot(
    val_rows["elapsed_hours"],
    val_rows["val_loss"],
    color="tab:orange",
    label="Validation Loss",
    zorder=4,  # keep the validation curve above the train-loss curve
    linewidth=3,
)

# Reference lines for the previous and target validation loss.
ax1.axhline(2.8, color="tab:purple", linestyle="--", linewidth=2, label="Prev val loss")
ax1.axhline(2.6, color="tab:green", linestyle=":", linewidth=2, label="Target val loss")

ax1.legend()

# Grid: dashed horizontal lines, dotted vertical lines on major and minor x ticks.
ax1.xaxis.set_major_locator(mticker.AutoLocator())
ax1.xaxis.set_minor_locator(mticker.AutoMinorLocator())
ax1.yaxis.grid(True, which="both", linestyle="--", linewidth=0.5)
ax1.xaxis.grid(True, which="both", linestyle=":", linewidth=0.5)
# ax1.set_yscale("log")

# The x-axis is elapsed time, so the title and axis label say so
# (the title previously read "by Step").
ax1.set_xlabel("Elapsed Time (hours)")
ax1.set_ylabel("Loss")
ax1.set_title("Cross Entropy Loss by Elapsed Hours (nats)")
ax1.set_ylim(bottom=1.5, top=4.5)
ax1.set_xlim(left=0, right=10)

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib.gridspec import GridSpec

# Stacked figure: loss curves in a tall top panel, the learning rate in a
# short bottom panel, both drawn against the same step axis.
fig = plt.figure(figsize=(10, 7), facecolor="white")
gs = GridSpec(nrows=2, ncols=1, height_ratios=[3, 1], hspace=0.15)  # 3:1 loss-to-LR height

# --- Top panel: train and validation loss ---
ax1 = fig.add_subplot(gs[0])
ax1.plot(df_metrics["step"], df_metrics["train_loss"], label="Train Loss", color="tab:blue", alpha=0.7)
ax1.plot(
    # Only rows with a recorded validation loss are drawn.
    df_metrics.loc[df_metrics["val_loss"].notna(), "step"],
    df_metrics.loc[df_metrics["val_loss"].notna(), "val_loss"],
    label="Validation Loss",
    color="tab:orange",
    linewidth=3,
    zorder=4,
)
ax1.legend()

# Grid and scale calls are kept in their original order.
ax1.yaxis.grid(True, which="both", linestyle="--", linewidth=0.5)
ax1.xaxis.set_major_locator(mticker.AutoLocator())
ax1.xaxis.set_minor_locator(mticker.AutoMinorLocator())
ax1.xaxis.grid(True, which="both", linestyle=":", linewidth=0.5)
ax1.set_yscale("log")
ax1.set_ylabel("Loss")
ax1.set_ylim(top=8.0)
ax1.set_title("Cross Entropy Loss by Step (nats)")
ax1.grid(True)

# The top panel carries no x label or tick labels.
ax1.set_xlabel("")
ax1.tick_params(axis="both", labelbottom=False)

# --- Bottom panel: learning rate, sharing the top panel's x-axis ---
ax2 = fig.add_subplot(gs[1], sharex=ax1)
ax2.plot(df_metrics["step"], df_metrics["muon_lr"], color="tab:green")
ax2.set(xlabel="Step", ylabel="Learning Rate")
ax2.grid(True)

# Final layout pass, then render.
fig.tight_layout()
plt.show()


In [None]:
# Load the metrics log at instruct_training_metrics.jsonl: one JSON object
# per line (JSONL). Whitespace-only lines are skipped so that a stray blank
# line does not abort the load with a JSONDecodeError, and the file is read
# as UTF-8 regardless of the platform's default encoding.
with open("../../logs/instruct_training_metrics.jsonl", "r", encoding="utf-8") as f:
    train_instruct = [json.loads(line) for line in f if line.strip()]

# One row per logged record; keys missing from a record become NaN.
df_instruct = pd.DataFrame(train_instruct)
df_instruct