In [1]:
from transformers import ViTImageProcessor, AutoTokenizer
from huggingface_hub import PyTorchModelHubMixin
import torch
from model import BirdCaptioningModel  # Save model.py locally

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model
model = BirdCaptioningModel.from_pretrained("INVERTO/bird-captioning-cub200").to(device)
image_processor = ViTImageProcessor.from_pretrained("INVERTO/bird-captioning-cub200")
tokenizer = AutoTokenizer.from_pretrained("INVERTO/bird-captioning-cub200")
model.eval()

# Load species mapping
species_mapping = {}
with open("species_mapping.txt", "r") as f:
    for line in f:
        idx, name = line.strip().split(",", 1)
        species_mapping[int(idx)] = name


## TODO: in cell 5 there is code to translate the species to a label. return that too.

In [None]:
from PIL import Image

def predict_bird_image(image_path):
    image = Image.open(image_path).convert("RGB")
    pixel_values = image_processor(image, return_tensors="pt").pixel_values.to(device)
    with torch.no_grad():
        output_ids = model.base_model.generate(pixel_values, max_length=75, num_beams=4)
        _, class_logits = model(pixel_values)
        predicted_class_idx = torch.argmax(class_logits, dim=1).item()
        confidence = torch.nn.functional.softmax(class_logits, dim=1)[0, predicted_class_idx].item() * 100
        caption = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
        species = species_mapping.get(predicted_class_idx, "Unknown")
    return caption, predicted_class_idx, species, confidence

# Example
caption, predicted_class_idx, species, confidence = predict_bird_image("../datasets/train_images/train_images/39.jpg")
print(f"Caption: {caption}")
print(f"Class: {predicted_class_idx}")
print(f"Species: {species}")
print(f"Confidence: {confidence:.2f}%")


Caption: is an image of Slaty backed Gull, has attributes: has bill shape hooked seabird, has wing color white, has upperparts color white, has underparts color white, has breast pattern solid
Class: 0
Species: Black footed Albatross
Confidence: 100.00%


In [20]:
# Print first part of captions for a range of image IDs
import os

def print_captions_range(start_id=1, end_id=200):
    for file_id in range(start_id, end_id + 1):
        image_path = f"../datasets/train_images/train_images/{file_id}.jpg"
        if not os.path.exists(image_path):
            print(f"{file_id}: [missing]")
            continue
        try:
            caption, predicted_class_idx, species, confidence = predict_bird_image(image_path)
            first_part = caption.split(",", 1)[0].strip()
            print(f"{file_id}: {first_part}")
        except Exception as e:
            print(f"{file_id}: [error] {e}")

# Run for 1..200
print_captions_range(1, 200)

1: is an image of Black footed Albatross
2: is an image of Black footed Albatross
3: is an image of Black footed Albatross
4: is an image of Black footed Albatross
5: is an image of Black footed Albatross
6: is an image of Black footed Albatross
7: is an image of Black footed Albatross
8: is an image of Black footed Albatross
9: is an image of Black footed Albatross
10: is an image of Black footed Albatross
11: is an image of Black footed Albatross
12: is an image of Black footed Albatross
13: is an image of Black footed Albatross
14: is an image of Black footed Albatross
15: is an image of Black footed Albatross
16: is an image of Black footed Albatross
17: is an image of Black footed Albatross
18: is an image of Black footed Albatross
19: is an image of Black footed Albatross
20: is an image of Black footed Albatross
21: is an image of Black footed Albatross
22: is an image of Black footed Albatross
23: is an image of Black footed Albatross
24: is an image of Black footed Albatross
2

In [33]:
import re
import difflib

def _extract_lookup_from_caption(caption: str) -> str:
    s = str(caption)
    s_lower = s.lower()
    phrase = "is an image of "
    i = s_lower.rfind(phrase)
    if i != -1:
        tail = s[i + len(phrase):]
    else:
        # fallback: use text before first comma
        tail = s.split(",", 1)[0]
    # keep only up to first comma after the phrase (species part)
    tail = tail.split(",", 1)[0]
    # strip trailing punctuation/spaces
    tail = tail.strip().rstrip(".!? ")
    return tail

def debug_mapping_snapshot(mapping, sample=20):
    keys = sorted(mapping.keys())[:sample]
    print(f"species_to_label size: {len(mapping)}")
    print("sample normalized keys:")
    for i, k in enumerate(keys, 1):
        print(f"  {i:02d}. '{k}' -> {mapping[k]}")

def print_caption_labels_range_debug(start_id=1, end_id=20, topk=5):
    debug_mapping_snapshot(species_to_label, sample=20)
    print("-" * 80)

    for file_id in range(start_id, end_id + 1):
        image_path = f"../datasets/train_images/train_images/{file_id}.jpg"
        if not os.path.exists(image_path):
            print(f"{file_id}: [missing]")
            continue
        try:
            caption, predicted_class_idx, species, confidence = predict_bird_image(image_path)
            extracted = _extract_lookup_from_caption(caption)
            key = _norm_caption_name(extracted)

            found = key in species_to_label
            label = species_to_label.get(key, None)

            print(f"{file_id}:")
            print(f"  raw caption: {caption}")
            print(f"  extracted (after 'is an image of '): {extracted}")
            print(f"  lookup key (normalized): '{key}'")
            if found:
                print(f"  FOUND label: {label}")
            else:
                print("  NOT FOUND in species_to_label")
                close = difflib.get_close_matches(key, species_to_label.keys(), n=topk, cutoff=0.6)
                if close:
                    print("  close matches:")
                    for c in close:
                        print(f"    - '{c}' -> {species_to_label[c]}")
                else:
                    print("  close matches: []")
            print("-" * 80)
        except Exception as e:
            print(f"{file_id}: [error] {e}")

# Run a short debug window
print_caption_labels_range_debug(38, 72)

species_to_label size: 200
sample normalized keys:
  01. 'acadian flycatcher' -> 37
  02. 'american crow' -> 29
  03. 'american goldfinch' -> 47
  04. 'american pipit' -> 104
  05. 'american redstart' -> 109
  06. 'american three toed woodpecker' -> 187
  07. 'anna hummingbird' -> 67
  08. 'artic tern' -> 141
  09. 'baird sparrow' -> 113
  10. 'baltimore oriole' -> 95
  11. 'bank swallow' -> 135
  12. 'barn swallow' -> 136
  13. 'bay breasted warbler' -> 158
  14. 'belted kingfisher' -> 79
  15. 'bewick wren' -> 193
  16. 'black and white warbler' -> 159
  17. 'black billed cuckoo' -> 31
  18. 'black capped vireo' -> 151
  19. 'black footed albatross' -> 1
  20. 'black tern' -> 142
--------------------------------------------------------------------------------
38:
  raw caption: is an image of Laysan Albatross, has attributes: has bill shape hooked seabird, has wing color black, has upperparts color black, has underparts color white, has breast pattern solid
  extracted (after 'is an 

In [None]:
import os
import re
import pandas as pd
import numpy as np

def _norm_caption_name(s: str) -> str:
    s = str(s).replace('_', ' ')
    s = re.sub(r'\s+', ' ', s).strip().lower()
    return s

def _extract_lookup_from_caption(caption: str) -> str:
    s = str(caption)
    s_lower = s.lower()
    phrase = "is an image of "
    i = s_lower.rfind(phrase)
    if i != -1:
        tail = s[i + len(phrase):]
    else:
        tail = s.split(",", 1)[0]
    tail = tail.split(",", 1)[0]
    return tail.strip().rstrip(".!? ")

def _build_species_to_label(np_path="../datasets/class_names.npy"):
    arr = np.load(np_path, allow_pickle=True)
    if np.ndim(arr) != 0:
        raise ValueError("Expected 0-d pickle with dict")
    obj = arr.tolist()
    if not isinstance(obj, dict):
        raise ValueError("Pickle does not contain a dict")
    inverted = {v: k for k, v in obj.items()}
    cleaned = {
        k: (str(v).split('.', 1)[1] if '.' in str(v) else str(v))
        for k, v in inverted.items()
    }
    mapping = {}
    for k, v in cleaned.items():
        try:
            mapping[_norm_caption_name(v)] = int(str(k))
        except Exception:
            continue
    return mapping

# Ensure mapping exists
try:
    species_to_label
except NameError:
    species_to_label = _build_species_to_label("../datasets/class_names.npy")

def predict_from_csv(csv_path="../datasets/test_images_path.csv",
                     root_dir="../datasets/test_images",   # force double: <datasets>/test_images + "test_images/xxx.jpg"
                     out_csv="../results/test_images_prediction_INVERTO.csv",
                     debug=False):
    df = pd.read_csv(csv_path)
    root_dir = os.path.normpath(root_dir)
    os.makedirs(os.path.dirname(out_csv), exist_ok=True)

    preds = []
    for _, row in df.iterrows():
        raw_csv_path = str(row["image_path"]).strip()                # e.g. "/test_images/999.jpg"
        rel_path = os.path.normpath(raw_csv_path.lstrip("/\\"))      # -> "test_images/999.jpg"
        image_path = os.path.join(root_dir, rel_path)                # -> ..\datasets\test_images\test_images\999.jpg

        if debug:
            print(f"[debug] id={int(row['id'])} csv='{raw_csv_path}' -> rel='{rel_path}' -> full='{image_path}' exists={os.path.exists(image_path)}")

        if not os.path.exists(image_path):
            # fallback: try within root_dir with just filename
            fallback = os.path.join(root_dir, os.path.basename(rel_path))  # -> ..\datasets\test_images\999.jpg
            if os.path.exists(fallback):
                image_path = fallback
                if debug:
                    print(f"[debug] fallback used: {image_path}")
            else:
                preds.append({"id": int(row["id"]), "label": -1})
                if debug:
                    print(f"[debug] -> MISSING")
                continue

        try:
            caption, predicted_class_idx, species, confidence = predict_bird_image(image_path)
            extracted = _extract_lookup_from_caption(caption)
            key = _norm_caption_name(extracted)
            label = species_to_label.get(key, -1)
            if label == -1:
                label = int(predicted_class_idx) + 1

            preds.append({"id": int(row["id"]), "label": int(label)})

            if debug:
                print(f"        caption='{caption}'")
                print(f"        extracted='{extracted}'")
                print(f"        key='{key}' -> label={label}")
        except Exception as e:
            preds.append({"id": int(row["id"]), "label": -1})
            if debug:
                print(f"[debug] -> ERROR: {e}")

    df_out = pd.DataFrame(preds).sort_values("id")
    df_out.to_csv(out_csv, index=False)
    print(f"Saved submission file to: {out_csv}")
    return df_out.head()

# Run
predict_from_csv()
#print(predict_from_csv(debug=True))

#34min run on MU PC

Saved submission file to: ../results/test_images_prediction_INVERTO.csv


Unnamed: 0,id,label
0,1,67
1,2,38
2,3,74
3,4,12
4,5,74


In [51]:
import pandas as pd
import os
import numpy as np  # Add numpy import

# Load train CSV
train_csv = '../datasets/train_images.csv'
df_train = pd.read_csv(train_csv)

# Load class names for mapping
class_names = np.load(r'../datasets/class_names.npy', allow_pickle=True)

# List to store predictions
predictions = []

# Predict for each image in the train CSV
for idx, row in df_train.iterrows():
    image_path = os.path.join("../datasets/train_images", row['image_path'])
    if os.path.exists(image_path):
        try:
            caption, species, confidence = predict_bird_image(image_path)
            predicted_label = class_names[predicted_class_idx]  # Map using class_names.npy
            predictions.append({'id': row['id'], 'predicted_label': predicted_label})
        except Exception as e:
            print(f"Error predicting for {image_path}: {e}")
            predictions.append({'id': row['id'], 'predicted_label': -1})  # Placeholder for errors
    else:
        print(f"Image not found: {image_path}")
        predictions.append({'id': row['id'], 'predicted_label': -1})

# Create DataFrame and save to CSV
df_predictions = pd.DataFrame(predictions)
output_csv = "train_predict_vit.csv"
df_predictions.to_csv(output_csv, index=False)

print(f"Saved predictions to {output_csv}")
print(df_predictions.head())

Image not found: /train_images/1.jpg


KeyError: 'id'

## TODO: Based on previous predict
please modify this code so that the inference is done using the predict_bird_image(image_path) method instead.

In [None]:
# Load class names for mapping (if using HF model)
class_names = np.load("../datasets/class_names.npy")

# List to store predictions
predictions = []

# Predict for each image in the test CSV using predict_bird_image
for idx, row in df_unseen.iterrows():
    raw_path = row['image_path']
    path = os.path.normpath(raw_path.lstrip("/\\"))
    image_path = os.path.join("datasets", path)
    if os.path.exists(image_path):
        try:
            caption, species, confidence = predict_bird_image(image_path)
            predicted_label = class_names[predicted_class_idx]  # Map using class_names.npy
            predictions.append({'id': row['id'], 'predicted_label': predicted_label})
        except Exception as e:
            print(f"Error predicting for {image_path}: {e}")
            predictions.append({'id': row['id'], 'predicted_label': -1})  # Placeholder for errors
    else:
        print(f"Image not found: {image_path}")
        predictions.append({'id': row['id'], 'predicted_label': -1})

# Create DataFrame and save to CSV
df_out = pd.DataFrame(predictions)
output_csv = "results/test_images_prediction.csv"
df_out.to_csv(output_csv, index=False)

print("Saved submission file to:", output_csv)

In [None]:
class BirdsCSVInference(torch.utils.data.Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.df = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

        # use the correct column from test CSV
        if "image_path" not in self.df.columns:
            raise ValueError("Expected column 'image_path' in test CSV")

        raw_paths = self.df["image_path"].astype(str).str.strip().tolist()

        # remove any leading / or \ for proper joining with root_dir
        self.paths = [os.path.normpath(p.lstrip("/\\")) for p in raw_paths]

    def __len__(self):
        return len(self.paths)

    def _resolve(self, p):
        if os.path.isabs(p):
            return p
        return os.path.join(self.root_dir, p)

    def __getitem__(self, idx):
        img_path = self._resolve(self.paths[idx])
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Not found: {img_path}")

        img = Image.open(img_path).convert("RGB")
        if self.transform:
            img = self.transform(img)

        return img, -1  # dummy label for test

train_csv = 'datasets/train_images.csv'
test_csv = 'datasets/test_images_path.csv'

# ---- 1) Point to the paths CSV you uploaded ----
unseen_csv = test_csv
seen_csv = train_csv

df_unseen = pd.read_csv(unseen_csv)
df_seen = pd.read_csv(unseen_csv)

print("Unseen CSV head:")
print(df_unseen.head())

# ---- 2) Build dataset & loader for unseen test ----
# IMPORTANT: root_dir is "datasets" because image_path = "/test_images/xxx.jpg"
# After stripping leading "/", it becomes "test_images/xxx.jpg"
# So datasets + test_images/xxx.jpg = datasets/test_images/xxx.jpg (correct)
unseen_ds = BirdsCSVInference(
    unseen_csv,
    root_dir=os.path.join("datasets", "test_images"),          # <- correct root folder
    transform=transforms_eval
)

seen_ds = BirdsCSVInference(
    seen_csv,
    root_dir=os.path.join("datasets", "train_images"),          # <- correct root folder
    transform=transforms_eval
)

unseen_loader = td.DataLoader(
    unseen_ds,
    batch_size=128,
    shuffle=False,
    num_workers=0,
    pin_memory=False
)

seen_loader = td.DataLoader(
    seen_ds,
    batch_size=128,
    shuffle=False,
    num_workers=0,
    pin_memory=False
)

# Optional sanity check of paths
print("cwd:", os.getcwd())
print("root_dir:", unseen_ds.root_dir)
for p in unseen_ds.paths[:5]:
    rp = unseen_ds._resolve(p)
    print("raw:", p, "->", rp, "exists:", os.path.exists(rp))

# ---- 3) Load best model snapshot ----
print("Loading best model from:", best_snapshot_path)
model.load_state_dict(torch.load(best_snapshot_path, map_location=DEVICE))
model.to(DEVICE)
model.eval()
""" 
# ---- 4) Run inference ----
pred_indices = []   # model outputs 0..199

with torch.no_grad():
    for x, _ in seen_loader:   # labels are dummy (-1)
        x = x.to(DEVICE)
        y_pred = model(x)
        batch_pred = y_pred.argmax(dim=1).cpu().tolist()
        pred_indices.extend(batch_pred)

print("Number of predictions:", len(pred_indices), "for", len(unseen_ds), "images")

 """
# ---- 4) Run inference ----
pred_indices = []   # model outputs 0..199

with torch.no_grad():
    for x, _ in unseen_loader:   # labels are dummy (-1)
        x = x.to(DEVICE)
        y_pred = model(x)
        batch_pred = y_pred.argmax(dim=1).cpu().tolist()
        pred_indices.extend(batch_pred)

print("Number of predictions:", len(pred_indices), "for", len(unseen_ds), "images")

# 1. Model already outputs correct label range (1..200)
# No +1 needed anymore
pred_labels = [p + 1 for p in pred_indices]

# 2. Create submission DataFrame using IDs from CSV
df_unseen = pd.read_csv(unseen_csv)

df_out = pd.DataFrame({
    "id": df_unseen["id"].tolist(),
    "label": pred_labels
})

# 3. Save to CSV
output_csv = "results/test_images_prediction.csv"
df_out.to_csv(output_csv, index=False)

print("Saved submission file to:", output_csv)
print(df_out.head())

