In [1]:
import os
import pandas as pd
import wandb

## Setup

### Read Results CSV Files

This assumes that you already have at least one result CSV file for each step. We recommend generating the result CSV files with `run_checkpoints.sh` and `run_checkpoints_cot.sh`, which automatically save the results using the following naming format:
```
{run_name}/c{checkpoint_number}_api_{benchmark}_{if cot}.csv
```

Update the `result_folder` variable to point to the folder that contains the CSV files. Every CSV file found in that folder will be loaded.


In [2]:
# Step 1. Specify the folder where the result csv files are stored
result_folder = (
    "results/sqlcoder_8b_fullft_ds_013_llama3_mgn1_b1_0900_b2_0990_steps_1000"
)
# Keep only the entries of that folder whose name ends in ".csv",
# in whatever order os.listdir returns them.
csv_files = [entry for entry in os.listdir(result_folder) if entry.endswith(".csv")]
print(f"Found {len(csv_files)} csv files in {result_folder}")

Found 32 csv files in results/sqlcoder_8b_fullft_ds_013_llama3_mgn1_b1_0900_b2_0990_steps_1000


In [3]:
# Step 2. Specify the wandb run id
# The id is hard-coded rather than looked up by run name through the wandb API,
# because different runs may have the same run name.
# It is passed to wandb.init() further down to resume that specific run.
wandb_run_id = "qcbad5rx"

In [4]:
# Read each result csv (lines starting with "#" are treated as comments) and tag
# its rows with a "model" column holding the file name minus the ".csv" suffix.
results_dfs = [
    pd.read_csv(os.path.join(result_folder, csv_file_name), comment="#").assign(
        model=csv_file_name.rsplit(".csv", 1)[0]
    )
    for csv_file_name in csv_files
]
# Stack all per-file frames into one frame with a fresh 0..n-1 index
results_df = pd.concat(results_dfs, ignore_index=True)
print(f"Loaded {results_df.shape[0]} results from {len(csv_files)} csv files")

Loaded 3272 results from 32 csv files


In [5]:
# Mean of the "correct" column per result file (the "model" column holds the
# csv file name without its extension).
accuracy_by_file = results_df.groupby("model")["correct"].mean()
file_names = accuracy_by_file.index.to_series()

# One row per result file; the file name is parsed into its components:
#   benchmark  - first of advanced/basic/v1/idk that follows an underscore
#   checkpoint - digits between a "c" and an underscore, cast to int
#   cot        - "cot" when "_cot" appears in the name, otherwise "no_cot"
s = pd.DataFrame(
    {
        "correct": accuracy_by_file,
        "file_name": file_names,
        "benchmark": file_names.str.extract(
            r"_(advanced|basic|v1|idk)", expand=False
        ),
        "checkpoint": file_names.str.extract(r"c(\d+)_", expand=False).astype(int),
        "cot": file_names.str.extract(r"_(cot)", expand=False).fillna("no_cot"),
    }
).reset_index(drop=True)

In [6]:
# Distinct checkpoint numbers in ascending order: sorting first means unique(),
# which keeps order of first appearance, yields an already-sorted array.
checkpoints = s["checkpoint"].sort_values().unique()
print(f"Found {len(checkpoints)} checkpoints: {checkpoints}")

Found 4 checkpoints: [ 400  600  800 1000]


In [7]:
# Continue existing run, specifying the project and the run ID
# NOTE(review): resume="must" is expected to make wandb.init() fail when no run
# with this id exists, rather than silently starting a new run — confirm against
# the wandb.init documentation.
run = wandb.init(project="huggingface", id=wandb_run_id, resume="must")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mwongjingping[0m ([33mdefog[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011166706489812996, max=1.0…

In [8]:
# Get the resumed run's current step, so that we can log incrementally after it.
# This is needed because wandb doesn't allow logging back to previous steps:
# the checkpoint metrics logged further down are placed at steps offset from
# this value.
current_step = run.step
print(f"Current step: {current_step}")

Current step: 1001


In [9]:
# For every checkpoint, gather one accuracy value per (benchmark, cot) combination
# and log them to wandb as "vllm/<benchmark>" or "vllm/<benchmark>_cot".
for checkpoint in checkpoints:
    rows_at_checkpoint = s[s["checkpoint"] == checkpoint]
    checkpoint_metrics = {}
    for benchmark in ("advanced", "basic", "v1", "idk"):
        for cot in ("cot", "no_cot"):
            matches = rows_at_checkpoint[
                (rows_at_checkpoint["benchmark"] == benchmark)
                & (rows_at_checkpoint["cot"] == cot)
            ]
            # Only combinations backed by exactly one result file are logged;
            # missing or ambiguous combinations are skipped.
            if len(matches) != 1:
                continue
            metric_name = f"vllm/{benchmark}" + ("_cot" if cot == "cot" else "")
            checkpoint_metrics[metric_name] = matches["correct"].iloc[0]
    print(f"Logging checkpoint {checkpoint} metrics:")
    for name, value in checkpoint_metrics.items():
        print(f"\t{name}: {value}")
    # The metrics are logged at current step + checkpoint, i.e. past the run's
    # existing step counter.
    wandb.log(checkpoint_metrics, step=current_step + checkpoint)

Logging checkpoint 400 metrics:
	vllm/advanced_cot: 0.75
	vllm/advanced: 0.78125
	vllm/basic_cot: 0.9
	vllm/basic: 0.825
	vllm/v1_cot: 0.875
	vllm/v1: 0.865
	vllm/idk_cot: 0.9238095238095239
	vllm/idk: 0.8476190476190476
Logging checkpoint 600 metrics:
	vllm/advanced_cot: 0.703125
	vllm/advanced: 0.765625
	vllm/basic_cot: 0.9
	vllm/basic: 0.85
	vllm/v1_cot: 0.85
	vllm/v1: 0.84
	vllm/idk_cot: 0.9523809523809523
	vllm/idk: 0.8952380952380953
Logging checkpoint 800 metrics:
	vllm/advanced_cot: 0.765625
	vllm/advanced: 0.765625
	vllm/basic_cot: 0.925
	vllm/basic: 0.9
	vllm/v1_cot: 0.86
	vllm/v1: 0.845
	vllm/idk_cot: 0.9523809523809523
	vllm/idk: 0.8761904761904762
Logging checkpoint 1000 metrics:
	vllm/advanced_cot: 0.78125
	vllm/advanced: 0.78125
	vllm/basic_cot: 0.925
	vllm/basic: 0.9
	vllm/v1_cot: 0.865
	vllm/v1: 0.845
	vllm/idk_cot: 0.9523809523809523
	vllm/idk: 0.8761904761904762


In [10]:
# Finish the run now that the metrics for every checkpoint have been logged
run.finish()

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
vllm/advanced,█▁▁█
vllm/advanced_cot,▅▁▇█
vllm/basic,▁▃██
vllm/basic_cot,▁▁██
vllm/idk,▁█▅▅
vllm/idk_cot,▁███
vllm/v1,█▁▂▂
vllm/v1_cot,█▁▄▅

0,1
advanced,0.45312
basic,0.95
basic_group_order_limit,1.0
basic_join_date_group_order_limit,0.875
basic_join_distinct,1.0
basic_join_group_order_limit,0.875
basic_left_join,1.0
cat_a,0.0
cat_b,0.0
cat_c,0.0
