In [None]:
# taken and modified from https://github.com/karpathy/build-nanogpt/blob/master/play.ipynb
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import wandb

# Apply seaborn's "darkgrid" style to the figures drawn below.
sns.set_style("darkgrid")

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Model size to plot; it selects the matching reference numbers below and
# appears in the plot labels.
sz = "124M"
# Training log to read: one "<step> <stream> <value>" record per line.
log_file = "log/log.txt"

# The reference tables are queried with .get(), so a size that has no entry
# yields None instead of raising KeyError here. The plotting cells already
# skip a reference line whose value is None.

loss_baseline = { # validation loss of the OpenAI GPT-2 checkpoint
    "124M": 3.2924,
}.get(sz)

hella2_baseline = { # HellaSwag for GPT-2
    "124M": 0.294463,
    "350M": 0.375224,
    "774M": 0.431986,
    "1558M": 0.488946,
}.get(sz)
hella3_baseline = { # HellaSwag for GPT-3
    "124M": 0.337,
    "350M": 0.436,
    "774M": 0.510,
    "1558M": 0.547,
}.get(sz)


In [None]:
# load the log file
with open(log_file, "r") as f:
    lines = f.readlines()

# Parse the individual "<step> <stream> <value>" records and group them by
# stream name, giving {stream: {step: value}}.
streams = {}
for lineno, line in enumerate(lines, start=1):
    fields = line.split()
    if not fields:
        # blank or whitespace-only line: nothing to parse
        continue
    if len(fields) != 3:
        # report the offending line instead of an anonymous unpacking error
        raise ValueError(
            f"{log_file}:{lineno}: expected '<step> <stream> <value>', got {line!r}"
        )
    step, stream, val = fields
    streams.setdefault(stream, {})[int(step)] = float(val)

# Convert each stream from {step: val} to [steps, vals], ordered by step,
# and drop a trailing ':' from the stream name while doing so.
streams_xy = {
    name.rstrip(':'): list(zip(*sorted(points.items())))
    for name, points in streams.items()
}
print(streams_xy.keys())


In [None]:
# One figure, two side-by-side panels: losses (left) and HellaSwag accuracy (right).
fig, (loss_ax, hella_ax) = plt.subplots(1, 2, figsize=(16, 6))

# ---- left panel: training and validation loss ----
train_steps, train_loss = (np.array(seq) for seq in streams_xy["train_loss"])
loss_ax.plot(
    train_steps / 1000,
    train_loss,
    label=f'gpt-2 implement ({sz}) train loss',
    color='royalblue',
    linewidth=2,
)
print("Min Train Loss:", min(train_loss))

val_steps, val_loss = streams_xy["val_loss"]
loss_ax.plot(
    np.array(val_steps) / 1000,
    val_loss,
    label=f'gpt-2 implement ({sz}) val loss',
    color='darkorange',
    linewidth=2,
)
print("Min Validation Loss:", min(val_loss))

# dashed reference line at the OpenAI GPT-2 validation loss, when one is configured
if loss_baseline is not None:
    loss_ax.axhline(y=loss_baseline, color='r', linestyle='--', label=f"OpenAI GPT-2 ({sz}) val loss")

loss_ax.set_xlabel("Steps (K)", fontsize=14)
loss_ax.set_ylabel("Loss", fontsize=14)
loss_ax.set_yscale('log')
loss_ax.set_ylim(top=4.0)
loss_ax.legend(fontsize=12)
loss_ax.set_title("Loss", fontsize=16)
loss_ax.grid(True, which="both", ls="--")

# ---- right panel: HellaSwag evaluation ----
hella_steps, hella_acc = (np.array(seq) for seq in streams_xy["hella_norm"])
hella_ax.plot(
    hella_steps / 1000,
    hella_acc,
    label=f"gpt-2 implement ({sz})",
    color='royalblue',
    linewidth=2,
)
print("Max Hellaswag eval:", max(hella_acc))

# dashed reference lines for the published GPT-2 / GPT-3 accuracies, when configured
if hella2_baseline:
    hella_ax.axhline(y=hella2_baseline, color='r', linestyle='--', label=f"OpenAI GPT-2 ({sz}) baseline")
if hella3_baseline:
    hella_ax.axhline(y=hella3_baseline, color='g', linestyle='--', label=f"OpenAI GPT-3 ({sz}) baseline")

hella_ax.set_xlabel("Steps (K)", fontsize=14)
hella_ax.set_ylabel("HellaSwag Accuracy", fontsize=14)
hella_ax.legend(fontsize=12)
hella_ax.set_title("HellaSwag Eval", fontsize=16)
hella_ax.grid(True, which="both", ls="--")

# fit both panels into the lower 96% of the figure height
fig.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

# save the plot (disabled)
# plt.savefig(f"images/gpt-2-implement-baseline.png", dpi=300)


In [None]:
# Weights & Biases project to read from, the runs to compare, and the
# history columns the comparison plot needs.
wandb_entity = "garg-aayush"
wandb_project = "pre-training"
run_names = ["gpt2-baseline", "gpt2-rope"]
cols_to_keep = ['step', 'val/loss', 'train/loss', 'val/hella_norm']

# Initialize API
api = wandb.Api()
runs_path = f"{wandb_entity}/{wandb_project}"
print(f"Fetching from: {runs_path}")

runs = api.runs(runs_path)

# Filter runs if specified
if run_names:
    runs = [run for run in runs if run.name in run_names]

# run name -> history DataFrame
all_histories = {}

for run in runs:
    print(f"\nProcessing: {run.name}")
    print(f"  Run ID: {run.id}")
    print(f"  State: {run.state}")

    # Fetch the history.
    # NOTE(review): run.history() is a sampled view limited to `samples` rows;
    # confirm 100000 exceeds the number of logged rows, or switch to
    # run.scan_history() for the unsampled history.
    print("  Fetching history...", end="", flush=True)
    history = run.history(samples=100000)
    print(f" Got {len(history)} datapoints")

    # Add metadata columns
    history['run_name'] = run.name
    history['run_id'] = run.id

    # Histories are keyed by run name, so a second run with the same name
    # replaces the first; say so instead of dropping it silently.
    if run.name in all_histories:
        print(f"  Warning: another run named {run.name!r} was already fetched; keeping this one")
    all_histories[run.name] = history

# Fail with a readable message when one of the two compared runs is absent,
# rather than with a bare KeyError from the lookups below.
missing_runs = [name for name in ("gpt2-baseline", "gpt2-rope") if name not in all_histories]
if missing_runs:
    raise KeyError(
        f"Runs {missing_runs} not found in {runs_path}; fetched runs: {sorted(all_histories)}"
    )

baseline_df = all_histories['gpt2-baseline'][cols_to_keep]
rope_df = all_histories['gpt2-rope'][cols_to_keep]


In [None]:
def _metric_series(df, col):
    """Return ``(steps / 1000, values)`` for one logged metric.

    Rows in which ``col`` is NaN are dropped first, so the result contains
    only the steps at which the metric has a value.

    Args:
        df: run history frame with a "step" column and the column ``col``.
        col: name of the metric column to extract.

    Returns:
        Tuple of two equally long NumPy arrays: the steps in thousands and
        the metric values.
    """
    logged = df[df[col].notna()]
    return logged["step"].to_numpy() / 1000, logged[col].to_numpy()


# (run name used in legends, word used in printed summaries, history frame)
compared_runs = (
    ("gpt2-baseline", "Baseline", baseline_df),
    ("gpt2-rope", "Rope", rope_df),
)
figure_path = Path("images/gpt-2-implement-compare.png")

# One figure, two side-by-side panels: losses (left) and HellaSwag accuracy (right).
fig, (loss_ax, hella_ax) = plt.subplots(1, 2, figsize=(16, 6))

# ---- Panel 1: losses, both train and val ----
# Train loss gets the same NaN filtering as the other metrics: a NaN in the
# column would break the plotted line and make the reported minimum unreliable.
for (run_name, run_title, run_df), color in zip(compared_runs, ('royalblue', 'lightblue')):
    steps_k, train_loss = _metric_series(run_df, "train/loss")
    loss_ax.plot(steps_k, train_loss, label=f'{run_name} train loss ({sz})', color=color, linewidth=2)
    print(f"Min {run_title} Train Loss:", train_loss.min())

for (run_name, run_title, run_df), color in zip(compared_runs, ('darkorange', 'red')):
    steps_k, val_loss = _metric_series(run_df, "val/loss")
    loss_ax.plot(steps_k, val_loss, label=f'{run_name} val loss ({sz})', color=color, linewidth=2)
    # each summary line names its run so the two printed values can be told apart
    print(f"Min {run_title} Validation Loss:", val_loss.min())

# dashed reference line at the OpenAI GPT-2 validation loss, when one is configured
if loss_baseline is not None:
    loss_ax.axhline(y=loss_baseline, color='r', linestyle='--', label=f"OpenAI GPT-2 ({sz}) val loss")

loss_ax.set_xlabel("Steps (K)", fontsize=14)
loss_ax.set_ylabel("Loss", fontsize=14)
loss_ax.set_yscale('log')
loss_ax.set_ylim(top=4.0)
loss_ax.legend(fontsize=12)
loss_ax.set_title("Loss", fontsize=16)
loss_ax.grid(True, which="both", ls="--")

# ---- Panel 2: HellaSwag eval ----
for (run_name, run_title, run_df), color in zip(compared_runs, ('darkgreen', 'darkblue')):
    steps_k, hella_acc = _metric_series(run_df, "val/hella_norm")
    hella_ax.plot(steps_k, hella_acc, label=f"{run_name} ({sz})", color=color, linewidth=2)
    print(f"Max {run_title} Hellaswag eval:", hella_acc.max())

# dashed reference lines for the published GPT-2 / GPT-3 accuracies, when configured
if hella2_baseline:
    hella_ax.axhline(y=hella2_baseline, color='r', linestyle='--', label=f"OpenAI GPT-2 ({sz}) baseline")
if hella3_baseline:
    hella_ax.axhline(y=hella3_baseline, color='g', linestyle='--', label=f"OpenAI GPT-3 ({sz}) baseline")

hella_ax.set_xlabel("Steps (K)", fontsize=14)
hella_ax.set_ylabel("HellaSwag Accuracy", fontsize=14)
hella_ax.legend(fontsize=12)
hella_ax.set_title("HellaSwag Eval", fontsize=16)
hella_ax.grid(True, which="both", ls="--")

# fit both panels into the lower 96% of the figure height
fig.tight_layout(rect=[0, 0, 1, 0.96])

# Save the plot before showing it; create the output directory first so
# savefig does not fail when it is missing.
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path, dpi=300)
plt.show()

