In [1]:
%%capture
!pip install datasets

## Downloading the Dataset

In [12]:
!wget https://datasets.cms.waikato.ac.nz/taiao/data/waikato_aerial_imagery_2017/classification.tar

--2025-02-15 04:23:52--  https://datasets.cms.waikato.ac.nz/taiao/data/waikato_aerial_imagery_2017/classification.tar
Resolving datasets.cms.waikato.ac.nz (datasets.cms.waikato.ac.nz)... 104.22.23.233, 172.67.20.185, 104.22.22.233, ...
Connecting to datasets.cms.waikato.ac.nz (datasets.cms.waikato.ac.nz)|104.22.23.233|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1105899520 (1.0G) [application/x-tar]
Saving to: ‘classification.tar.1’


2025-02-15 04:24:06 (82.2 MB/s) - ‘classification.tar.1’ saved [1105899520/1105899520]



In [26]:
%%capture
# Unpack the archive into the working directory. tar's -v flag lists every
# extracted file, so the cell output is captured to keep the notebook readable.
!tar -xvf dataset/classification.tar

classification/
classification/train/
classification/train/harvested_forest/
classification/train/harvested_forest/971.png
classification/train/harvested_forest/635.png
classification/train/harvested_forest/432.png
classification/train/harvested_forest/426.png
classification/train/harvested_forest/629.png
classification/train/harvested_forest/545.png
classification/train/harvested_forest/55.png
classification/train/harvested_forest/93.png
classification/train/harvested_forest/638.png
classification/train/harvested_forest/677.png
classification/train/harvested_forest/529.png
classification/train/harvested_forest/436.png
classification/train/harvested_forest/707.png
classification/train/harvested_forest/712.png
classification/train/harvested_forest/514.png
classification/train/harvested_forest/405.png
classification/train/harvested_forest/613.png
classification/train/harvested_forest/656.png
classification/train/harvested_forest/68.png
classification/train/harvested_forest/602.png
classi

## Uploading the classification dataset to the Hub

In [2]:
from kaggle_secrets import UserSecretsClient
# Read the token from Kaggle's secret store so it is never written into the notebook.
user_secrets = UserSecretsClient()

# `HFT` is passed as `token=` to every push_to_hub call below. It is unrelated to
# the "HFT" abbreviation that the `classes` mapping assigns to harvested_forest.
HFT = user_secrets.get_secret("HFT")

In [61]:
from datasets import load_dataset

# Load the extracted folder tree with the generic `imagefolder` builder.
# In the recorded run this produced `train` (8658 rows) and `validation`
# (4342 rows) splits, each with `image` and `label` columns.
dataset = load_dataset("imagefolder", name="waikato_aerial_imagery_2017", data_dir="classification")
dataset

Resolving data files:   0%|          | 0/8658 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/4343 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/8658 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/4342 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 8658
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 4342
    })
})

In [62]:
# Publish the full classification dataset, authenticating with the token read from Kaggle secrets.
dataset.push_to_hub("dushj98/waikato_aerial_imagery_2017", token=HFT)

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/44 [00:00<?, ?ba/s]

Map:   0%|          | 0/4329 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/44 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/4342 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/44 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dushj98/waikato_aerial_imagery_2017/commit/725151fa99db2033767da9d5381eb85251454d4b', commit_message='Upload dataset', commit_description='', oid='725151fa99db2033767da9d5381eb85251454d4b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/dushj98/waikato_aerial_imagery_2017', endpoint='https://huggingface.co', repo_type='dataset', repo_id='dushj98/waikato_aerial_imagery_2017'), pr_revision=None, pr_num=None)

## Creating a sample dataset for SD Finetuning

In [27]:
# Create the output folders for the sampled dataset. Only `woc` is written to in
# this notebook; presumably woc/wc mean "without captions"/"with captions" - TODO confirm.
!mkdir -p dataset/generated/woc && mkdir -p dataset/generated/wc

In [35]:
import os
import json

# Print a JSON skeleton with one empty entry per folder found under the training
# split; the output is the template that the `classes` mapping is filled in from.
train_class_dirs = os.listdir("classification/train")
class_names = dict.fromkeys(train_class_dirs, "")
print(json.dumps(class_names, indent=2))

{
  "urban_parkland": "",
  "low_producing_grassland": "",
  "harvested_forest": "",
  "broadleaved_indigenous_hardwood": "",
  "deciduous_hardwood": "",
  "urban_build_up": "",
  "high_producing_grassland": "",
  "herbaceous_freshwater_vege": "",
  "grose_broom": "",
  "lake_pond": "",
  "manuka_kanuka": "",
  "shortrotation_cropland": "",
  "indigenous_forest": ""
}


In [3]:
# Maps each class folder name (as it appears under classification/train) to a
# three-letter abbreviation. The abbreviation is embedded in caption prompts as
# "<ABV>" and stored in the `token` column by `add_caption`.
# Keys must match the folder names exactly, including the dataset's own spelling
# of "grose_broom".
classes = {
  "urban_parkland": "UPL",
  "low_producing_grassland": "LPG",
  "harvested_forest": "HFT",
  "broadleaved_indigenous_hardwood": "BIH",
  "deciduous_hardwood": "DHW",
  "urban_build_up": "UBU",
  "high_producing_grassland": "HPG",
  "herbaceous_freshwater_vege": "HFV",
  "grose_broom": "GBM",
  "lake_pond": "LPD",
  "manuka_kanuka": "MKA",
  "shortrotation_cropland": "SRC",
  "indigenous_forest": "IFT"
}

In [4]:
# Fail loudly (and stop a Run All) if the mapping is wrong, instead of merely
# displaying False. Dict keys are unique by construction, so only the class
# count and the uniqueness of the abbreviations need to be verified.
assert len(classes) == 13, f"expected 13 classes, found {len(classes)}"
assert len(set(classes.values())) == len(classes), "class abbreviations must be unique"

True

In [71]:
import random
import shutil
import os


def create_random_dataset_from_data(
    size: int = 20,
    source_dir: str = "classification/train",
    destination_dir: str = "dataset/generated/woc",
    class_names=None,
    seed=None,
) -> None:
    """Copy a random sample of PNG images per class into a new folder tree.

    For every class, up to ``size`` ``.png`` files are picked at random from
    ``<source_dir>/<class>`` and copied to ``<destination_dir>/<class>``.
    Destination folders are created when missing; files already present in
    them are left in place.

    Args:
        size: Maximum number of images to copy per class. Classes with fewer
            images contribute all of them.
        source_dir: Root folder holding one sub-folder per class.
        destination_dir: Root folder that receives the sampled sub-folders.
        class_names: Iterable of class folder names to process. Defaults to
            the keys of the module-level ``classes`` mapping.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the selection is reproducible and the global random state
            is untouched. When ``None``, the global ``random`` module is used.

    Raises:
        FileNotFoundError: If a class folder does not exist under ``source_dir``.
        ValueError: If ``size`` is negative (raised by ``random.sample``).
    """
    if class_names is None:
        class_names = classes
    rng = random if seed is None else random.Random(seed)

    for cls in class_names:
        class_dir = os.path.join(source_dir, cls)
        dest_class_dir = os.path.join(destination_dir, cls)
        os.makedirs(dest_class_dir, exist_ok=True)

        # os.listdir order is arbitrary; sort so a fixed seed always selects the same files.
        files = sorted(
            f
            for f in os.listdir(class_dir)
            if f.endswith(".png") and os.path.isfile(os.path.join(class_dir, f))
        )
        selected_files = rng.sample(files, min(size, len(files)))

        for file in selected_files:
            shutil.copy(os.path.join(class_dir, file), os.path.join(dest_class_dir, file))

        print(f"Copied: {class_dir} -> {destination_dir}")

In [72]:
!rm -r dataset/generated/woc/*
create_random_dataset_from_data()

rm: cannot remove 'dataset/generated/woc/*': No such file or directory
Copied: classification/train/urban_parkland -> dataset/generated/woc
Copied: classification/train/low_producing_grassland -> dataset/generated/woc
Copied: classification/train/harvested_forest -> dataset/generated/woc
Copied: classification/train/broadleaved_indigenous_hardwood -> dataset/generated/woc
Copied: classification/train/deciduous_hardwood -> dataset/generated/woc
Copied: classification/train/urban_build_up -> dataset/generated/woc
Copied: classification/train/high_producing_grassland -> dataset/generated/woc
Copied: classification/train/herbaceous_freshwater_vege -> dataset/generated/woc
Copied: classification/train/grose_broom -> dataset/generated/woc
Copied: classification/train/lake_pond -> dataset/generated/woc
Copied: classification/train/manuka_kanuka -> dataset/generated/woc
Copied: classification/train/shortrotation_cropland -> dataset/generated/woc
Copied: classification/train/indigenous_forest -

In [101]:
def inspect_items(dir: str = "dataset/generated/woc"):
    """Print how many entries each sub-directory of ``dir`` contains.

    One line is printed per sub-directory, in ``os.listdir`` order, with the
    names left-aligned and padded to the longest name so the counts line up.
    Entries of ``dir`` that are not directories are skipped; an empty ``dir``
    prints nothing.

    Args:
        dir: Folder whose immediate sub-directories are inspected.
    """
    subdirs = [d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))]
    # Derive the column width from what is actually listed rather than from the
    # global `classes` mapping, so any folder layout is aligned correctly.
    col_len = max((len(d) for d in subdirs), default=0)

    for d in subdirs:
        print(f"{d:{col_len}} ->", len(os.listdir(os.path.join(dir, d))))

In [102]:
# Sanity check: list the number of files copied into each class folder of the sample.
inspect_items("dataset/generated/woc")

urban_parkland                  -> 20
low_producing_grassland         -> 20
harvested_forest                -> 20
broadleaved_indigenous_hardwood -> 20
deciduous_hardwood              -> 20
urban_build_up                  -> 20
high_producing_grassland        -> 20
herbaceous_freshwater_vege      -> 20
grose_broom                     -> 20
lake_pond                       -> 20
manuka_kanuka                   -> 20
shortrotation_cropland          -> 20
indigenous_forest               -> 20


In [75]:
from datasets import load_dataset

# Load the sampled folder tree as an imagefolder dataset. It has no split
# sub-folders, and the recorded run yielded a single `train` split.
woc_dataset = load_dataset("imagefolder", name="waikato_aerial_2017_sd_ft", data_dir="dataset/generated/woc")
woc_dataset

Resolving data files:   0%|          | 0/260 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/260 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 260
    })
})

In [76]:
# Publish the (still uncaptioned) sample to its own Hub repository.
woc_dataset.push_to_hub("dushj98/waikato_aerial_2017_sd_ft", token=HFT)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/260 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dushj98/waikato_aerial_2017_sd_ft/commit/b55afd1089ec0c359893b5abed67c590105ada2f', commit_message='Upload dataset', commit_description='', oid='b55afd1089ec0c359893b5abed67c590105ada2f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/dushj98/waikato_aerial_2017_sd_ft', endpoint='https://huggingface.co', repo_type='dataset', repo_id='dushj98/waikato_aerial_2017_sd_ft'), pr_revision=None, pr_num=None)

## Captioning Images

In [None]:
from datasets import load_dataset

# Load the published sample from the Hub, rebinding `woc_dataset`; the
# captioning cells below read their images and labels from this object.
woc_dataset = load_dataset("dushj98/waikato_aerial_2017_sd_ft")
woc_dataset

In [14]:
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import requests
from PIL import Image

# Load the BLIP captioning checkpoint and its matching processor.
# The model is moved to "cuda", so this cell needs a GPU-backed kernel.
cap_model_name = "Salesforce/blip-image-captioning-large"
cap_model = BlipForConditionalGeneration.from_pretrained(cap_model_name).to("cuda")
cap_processor = BlipProcessor.from_pretrained(cap_model_name)

In [21]:
from PIL import ImageFile


def generate_caption(image: ImageFile.ImageFile, prompt: str) -> str:
    """Caption ``image`` with the BLIP model, conditioned on a text prefix.

    Args:
        image: PIL image to describe.
        prompt: Text the caption must start with; it is passed to the
            processor together with the image.

    Returns:
        ``prompt`` followed directly by the whitespace-stripped text decoded
        from the tokens generated after the prompt.
    """
    # Follow whatever device the model lives on instead of assuming "cuda".
    inputs = cap_processor(image, prompt, return_tensors="pt").to(cap_model.device)

    out = cap_model.generate(**inputs)

    # Skip the first len(prompt ids) output tokens (assumed to echo the prompt)
    # so that only the continuation is decoded.
    prompt_len = inputs.input_ids.shape[-1]
    continuation = cap_processor.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
    return prompt + continuation

In [24]:
def add_caption(example):
    """Add a generated caption and the class token to one dataset row.

    The row's integer label is turned back into its class name through the
    `label` feature of ``woc_dataset["train"]``, and that name is looked up in
    ``classes`` to get the abbreviation. The abbreviation is placed in the
    caption prompt as ``<ABV>`` and also stored on its own.

    Args:
        example: A row with at least ``image`` and ``label`` entries.

    Returns:
        The same row, with ``caption`` and ``token`` entries set.
    """
    label_feature = woc_dataset["train"].features["label"]
    token = classes[label_feature.int2str(example["label"])].strip()

    example["caption"] = generate_caption(
        image=example["image"],
        prompt=f"A <{token}> aerial view of ",
    )
    example["token"] = token
    return example

# Caption every row; `map` returns a new dataset with `caption` and `token` added.
# NOTE: `woc_dataset` is rebound to the result, so re-running this cell captions
# the already-captioned dataset again.
woc_dataset = woc_dataset.map(add_caption)
# Show the first row to check the new columns.
print(woc_dataset["train"][0]) 

Map:   0%|          | 0/260 [00:00<?, ? examples/s]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=224x224 at 0x7D0303F7B670>, 'label': 0, 'caption': 'A <BIH> aerial view of field with trees and a road', 'token': 'BIH'}


In [25]:
# Push the captioned dataset to the same Hub repository as the uncaptioned upload.
woc_dataset.push_to_hub("dushj98/waikato_aerial_2017_sd_ft", token=HFT)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/260 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dushj98/waikato_aerial_2017_sd_ft/commit/3fb41f9a83cf9733cc4c926ac235c0b405551650', commit_message='Upload dataset', commit_description='', oid='3fb41f9a83cf9733cc4c926ac235c0b405551650', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/dushj98/waikato_aerial_2017_sd_ft', endpoint='https://huggingface.co', repo_type='dataset', repo_id='dushj98/waikato_aerial_2017_sd_ft'), pr_revision=None, pr_num=None)

In [26]:
# Round-trip check: reload the repository from the Hub and look at the first
# row to confirm that the `caption` and `token` columns were uploaded.
uploaded_dataset = load_dataset("dushj98/waikato_aerial_2017_sd_ft")
uploaded_dataset["train"][0]

README.md:   0%|          | 0.00/860 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/260 [00:00<?, ? examples/s]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=224x224>,
 'label': 0,
 'caption': 'A <BIH> aerial view of field with trees and a road',
 'token': 'BIH'}

___