# Evaluation Notebook

In diesem Notebook ist die Codebase beschrieben.

Das Training (5-Fold Cross-Validation) kann durch folgende Befehle ausgeführt werden:
```bash
python src/train.py --model show_tell --checkpoint_dir show_tell_fold0 --eval_fold 0 --epochs 10
python src/train.py --model show_tell --checkpoint_dir show_tell_fold1 --eval_fold 1 --epochs 10
python src/train.py --model show_tell --checkpoint_dir show_tell_fold2 --eval_fold 2 --epochs 10
python src/train.py --model show_tell --checkpoint_dir show_tell_fold3 --eval_fold 3 --epochs 10
python src/train.py --model show_tell --checkpoint_dir show_tell_fold4 --eval_fold 4 --epochs 10

python src/train.py --model show_attend_tell --checkpoint_dir show_attend_tell_fold0 --eval_fold 0 --epochs 10
python src/train.py --model show_attend_tell --checkpoint_dir show_attend_tell_fold1 --eval_fold 1 --epochs 10
python src/train.py --model show_attend_tell --checkpoint_dir show_attend_tell_fold2 --eval_fold 2 --epochs 10
python src/train.py --model show_attend_tell --checkpoint_dir show_attend_tell_fold3 --eval_fold 3 --epochs 10
python src/train.py --model show_attend_tell --checkpoint_dir show_attend_tell_fold4 --eval_fold 4 --epochs 10
```

In [1]:
import subprocess
import sys
import wandb
import pandas as pd

In [2]:
# model checkpoint artefacts generated during 5-fold cv trained using commands from above 
model_artefacts = {
    "show_tell": {
        0: "model-show_tell_20250107_012408_fold0:v0",
        1: "model-show_tell_20250107_014557_fold1:v0",
        2: "model-show_tell_20250107_020657_fold2:v0",
        3: "model-show_tell_20250107_022823_fold3:v0",
        4: "model-show_tell_20250107_024856_fold4:v0"
    },
    "show_attend_tell": {
        0: "model-show_attend_tell_20250107_030953_fold0:v0",
        1: "model-show_attend_tell_20250107_035920_fold1:v0",
        2: "model-show_attend_tell_20250107_044723_fold2:v0",
        3: "model-show_attend_tell_20250107_053559_fold3:v0",
        4: "model-show_attend_tell_20250107_062503_fold4:v0"
    }
}

In [None]:
# to skip the evaluation and just use results from evals already run, set this to True
SKIP_EVAL = False

results = {}

In [3]:
# evaluate each artefact using the following command
# python eval.py --model [show_tell/show_attend_tell] --eval_fold [fold] --wandb_project image-captioning-comparison --wandb_artifact [artifact_name:version]
if not SKIP_EVAL:
    results = {}
    for model_name, folds in model_artefacts.items():
        for fold, artifact in folds.items():
            if fold in results.get(model_name, {}):
                print(f"Skipping {model_name} fold {fold} as it is already evaluated.")
                continue
            cmd = [sys.executable, "eval.py", "--model", model_name, "--eval_fold", str(fold), "--wandb_project", "image-captioning-comparison", "--wandb_artifact", artifact]
            print(f"\nEvaluating {model_name} fold {fold}")
            result = subprocess.run(cmd, capture_output=True)
            out = result.stdout.decode('utf-8')
            err = result.stderr.decode('utf-8')

            # Extract the run URL from the stdout
            if "https://wandb.ai/" in err:
                url_line = [line for line in err.splitlines() if ("https://wandb.ai/" in line and "/runs/" in line)]
                if url_line:
                    run_url = url_line[0].strip()
                    run_id = run_url.split("/")[-1]  # Get the last part of the URL
                    results.setdefault(model_name, {})[fold] = run_id


Evaluating show_tell fold 0

Evaluating show_tell fold 1

Evaluating show_tell fold 2

Evaluating show_tell fold 3

Evaluating show_tell fold 4

Evaluating show_attend_tell fold 0


: 

: 

In [21]:
results

{'show_tell': {0: '2tg2yil9', 1: 'hngbhvxd'}}

In [22]:
api = wandb.Api()

# Initialize an empty list to store logs
all_logs = []

# Iterate through the results to fetch logs for each model and fold
for model_name, folds in results.items():
    for fold, run_id in folds.items():
        # run is specified by <entity>/<project>/<run_id>
        run = api.run(f"florin-barbisch/image-captioning-comparison/runs/{run_id}")

        # Fetch the logs for beam_bleu1
        logs = run.history()
        # Adding model and fold columns to the DataFrame
        logs['model'] = model_name
        logs['fold'] = fold
        
        # Append the logs to the list
        all_logs.append(logs)

# Concatenate all logs into one big DataFrame
final_logs_df = pd.concat(all_logs, ignore_index=True)

final_logs_df.head()

True
True


Unnamed: 0,beam_bleu4,beam_bleu2,_step,beam_bleu1,_runtime,greedy_bleu4,greedy_bleu1,greedy_bleu2,beam_bleu3,greedy_bleu3,_timestamp,model,fold
0,1.186993e-76,13.156025,0,35.193133,26.954806,4.842084e-153,29.284165,10.537078,4.609435,6.754901e-101,1736245000.0,show_tell,0
1,2.349674,11.070336,0,26.583963,137.737707,2.395733,26.274515,10.253199,4.918598,4.647314,1736243000.0,show_tell,1
