In [8]:
import boto3
import os

# Initialize S3 clients
s3 = boto3.client('s3')
s3_resource = boto3.resource('s3')

# Define names
dst_bucket_name = 'wildscan-data-bucket'
region = boto3.session.Session().region_name

In [9]:
from botocore.exceptions import ClientError

try:
    if region == 'us-east-1':
        s3.create_bucket(Bucket=dst_bucket_name)
    else:
        s3.create_bucket(
            Bucket=dst_bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region}
        )
    print(f"Bucket '{dst_bucket_name}' created in region '{region}'.")
except s3.exceptions.BucketAlreadyOwnedByYou:
    print(f"Bucket '{dst_bucket_name}' already exists.")
except ClientError as e:
    print(f"Error creating bucket: {e}")

Bucket 'wildscan-data-bucket' created in region 'us-east-1'.


In [14]:
!pip install datasets



In [15]:
from datasets import load_dataset

ds = load_dataset("society-ethics/lila_camera_traps", "Caltech Camera Traps")

Downloading builder script:   0%|          | 0.00/52.3k [00:00<?, ?B/s]

Downloading and preparing dataset lila/Caltech Camera Traps to /home/sagemaker-user/.cache/huggingface/datasets/society-ethics___lila/Caltech Camera Traps/1.0.0/5ef3e1d3c631fd49b65a8ea5c84612c4ed8b1bfa7556009c3e69a6b048c19527...


Downloading data:   0%|          | 0.00/10.2M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset lila downloaded and prepared to /home/sagemaker-user/.cache/huggingface/datasets/society-ethics___lila/Caltech Camera Traps/1.0.0/5ef3e1d3c631fd49b65a8ea5c84612c4ed8b1bfa7556009c3e69a6b048c19527. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['file_name', 'width', 'height', 'seq_num_frames', 'date_captured', 'seq_id', 'location', 'rights_holder', 'frame_num', 'annotations', 'bboxes', 'image'],
        num_rows: 243100
    })
})


In [24]:
record = ds["train"][0]
print(record.keys())
print("📦 bboxes:", record.get("bboxes"))
print("📸 file_name:", record.get("file_name"))

dict_keys(['file_name', 'width', 'height', 'seq_num_frames', 'date_captured', 'seq_id', 'location', 'rights_holder', 'frame_num', 'annotations', 'bboxes', 'image'])
📦 bboxes: None
📸 file_name: 5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg


In [26]:
has_bbox = 0
no_bbox = 0

for record in ds["train"]:
    bboxes = record.get("bboxes")
    if isinstance(bboxes, list) and len(bboxes) > 0:
        has_bbox += 1
    else:
        no_bbox += 1

print(f"✅ Records with bounding boxes: {has_bbox}")
print(f"❌ Records without bounding boxes: {no_bbox}")

✅ Records with bounding boxes: 0
❌ Records without bounding boxes: 243100


In [27]:
sample = ds["train"][0]
print(sample["annotations"])

{'taxonomy': [{'kingdom': None, 'phylum': None, 'subphylum': None, 'superclass': None, 'class': None, 'subclass': None, 'infraclass': None, 'superorder': None, 'order': None, 'suborder': None, 'infraorder': None, 'superfamily': None, 'family': None, 'subfamily': None, 'tribe': None, 'genus': None, 'species': None, 'subspecies': None, 'variety': None}]}


In [37]:
import boto3

s3 = boto3.client("s3")
bucket = "us-west-2.opendata.source.coop"
prefix = "agentmorris/lila-wildlife/caltech-unzipped/cct_images/"

# List first 10 files
resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=10)
for obj in resp.get("Contents", []):
    print(obj["Key"])

agentmorris/lila-wildlife/caltech-unzipped/cct_images/5858bf1e-23d2-11e8-a6a3-ec086b02610b.jpg
agentmorris/lila-wildlife/caltech-unzipped/cct_images/5858bf20-23d2-11e8-a6a3-ec086b02610b.jpg
agentmorris/lila-wildlife/caltech-unzipped/cct_images/5858bf21-23d2-11e8-a6a3-ec086b02610b.jpg
agentmorris/lila-wildlife/caltech-unzipped/cct_images/5858bf22-23d2-11e8-a6a3-ec086b02610b.jpg
agentmorris/lila-wildlife/caltech-unzipped/cct_images/5858bf23-23d2-11e8-a6a3-ec086b02610b.jpg
agentmorris/lila-wildlife/caltech-unzipped/cct_images/5858bf24-23d2-11e8-a6a3-ec086b02610b.jpg
agentmorris/lila-wildlife/caltech-unzipped/cct_images/5858bf25-23d2-11e8-a6a3-ec086b02610b.jpg
agentmorris/lila-wildlife/caltech-unzipped/cct_images/5858bf29-23d2-11e8-a6a3-ec086b02610b.jpg
agentmorris/lila-wildlife/caltech-unzipped/cct_images/5858bf2a-23d2-11e8-a6a3-ec086b02610b.jpg
agentmorris/lila-wildlife/caltech-unzipped/cct_images/5858bf2b-23d2-11e8-a6a3-ec086b02610b.jpg


In [57]:
import boto3
import zipfile
import os

s3 = boto3.client("s3")
bucket = "us-west-2.opendata.source.coop"
key = "agentmorris/lila-wildlife/caltechcameratraps/labels/caltech_camera_traps.json.zip"
local_zip_path = "/tmp/caltech_camera_traps.json.zip"
extracted_json_path = "/tmp/caltech_camera_traps.json"

try:
    s3.download_file(bucket, key, local_zip_path)
    print("✅ Downloaded ZIP file.")
except Exception as e:
    print(f"❌ Failed to download: {e}")

try:
    with zipfile.ZipFile(local_zip_path, 'r') as zip_ref:
        zip_ref.extractall("/tmp/")
    print("✅ Extracted JSON file from ZIP.")
except Exception as e:
    print(f"❌ Failed to unzip: {e}")


✅ Downloaded ZIP file.
✅ Extracted JSON file from ZIP.


In [58]:
s3 = boto3.client("s3")
dst_bucket = "wildscan-data-bucket"

local_files = [
    "/tmp/caltech_bboxes_20200316.json",
    "/tmp/caltech_camera_traps.json"
]

for file_path in local_files:
    key = "annotations/" + os.path.basename(file_path)
    try:
        s3.upload_file(file_path, dst_bucket, key)
        print(f"✅ Uploaded {file_path} to s3://{dst_bucket}/{key}")
    except Exception as e:
        print(f"❌ Failed to upload {file_path}: {e}")

✅ Uploaded /tmp/caltech_bboxes_20200316.json to s3://wildscan-data-bucket/annotations/caltech_bboxes_20200316.json
✅ Uploaded /tmp/caltech_camera_traps.json to s3://wildscan-data-bucket/annotations/caltech_camera_traps.json


In [61]:
import zipfile

with zipfile.ZipFile("/tmp/caltech_camera_traps.json.zip", "r") as zip_ref:
    print("📦 Contents of ZIP:")
    print(zip_ref.namelist())

📦 Contents of ZIP:
['caltech_images_20210113.json']


In [63]:
import json

with zipfile.ZipFile("/tmp/caltech_camera_traps.json.zip", "r") as zip_ref:
    zip_ref.extract("caltech_images_20210113.json", "/tmp/")

os.rename("/tmp/caltech_images_20210113.json", "/tmp/caltech_camera_traps.json")

with open("/tmp/caltech_camera_traps.json", "r") as f:
    label_data = json.load(f)

print(f"✅ Successfully loaded label data: {len(label_data)} records")

✅ Successfully loaded label data: 4 records


In [64]:
s3 = boto3.client("s3")
bucket_name = "us-west-2.opendata.source.coop"
prefix = "agentmorris/lila-wildlife/caltechcameratraps/labels/"

response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

if "Contents" in response:
    print("📂 Found label files:")
    for obj in response["Contents"]:
        print("-", obj["Key"])
else:
    print("❌ No files found.")

📂 Found label files:
- agentmorris/lila-wildlife/caltechcameratraps/labels/caltech_bboxes_20200316.json
- agentmorris/lila-wildlife/caltechcameratraps/labels/caltech_camera_traps.json.zip


In [67]:
with open("/tmp/caltech_bboxes_20200316.json", "r") as f:
    bbox_data = json.load(f)

print(f"📦 Total records in bbox file: {len(bbox_data)}")
print(f"🔍 First 10 keys: {list(bbox_data.keys())[:10]}")

📦 Total records in bbox file: 4
🔍 First 10 keys: ['info', 'categories', 'annotations', 'images']


In [69]:
bucket_name = "us-west-2.opendata.source.coop"
object_key = "agentmorris/lila-wildlife/caltechcameratraps/labels/caltech_bboxes_20200316.json"
local_path = "/tmp/caltech_bboxes_20200316.json"

s3 = boto3.client("s3")

try:
    s3.download_file(bucket_name, object_key, local_path)
    print(f"✅ Downloaded {object_key} to {local_path}")
except Exception as e:
    print(f"❌ Download failed: {e}")

✅ Downloaded agentmorris/lila-wildlife/caltechcameratraps/labels/caltech_bboxes_20200316.json to /tmp/caltech_bboxes_20200316.json


In [74]:

with open("/tmp/caltech_bboxes_20200316.json", "r") as f:
    data = json.load(f)

print(f"📦 Annotations: {len(data['annotations'])}")
print(f"🖼️ Images: {len(data['images'])}")

📦 Annotations: 65112
🖼️ Images: 63025


In [76]:
import json
import boto3
import os
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm

bbox_json_path = "/tmp/caltech_bboxes_20200316.json"
image_base_url = "https://s3.us-west-2.amazonaws.com/agentmorris-lila-wildlife/caltech-unzipped/cct_images/"
target_size = (224, 224)
dst_bucket = "wildscan-data-bucket"
crop_prefix = "cct_cropped/"
resize_prefix = "cct_resized/"

s3 = boto3.client("s3")

with open(bbox_json_path, "r") as f:
    metadata = json.load(f)

annotations = metadata["annotations"]
images = metadata["images"]
image_map = {img["id"]: img["file_name"] for img in images}

print(f"📦 Total annotations: {len(annotations)}")

uploaded = 0
skipped = 0

for ann in tqdm(annotations, desc="📸 Processing"):
    try:
        image_id = ann["image_id"]
        bbox = ann.get("bbox")
        category = ann.get("category", "unknown")

        file_name = image_map.get(image_id)
        if not bbox or not file_name:
            skipped += 1
            continue

        # Try downloading image
        response = requests.get(image_base_url + file_name, timeout=5)
        if response.status_code != 200:
            skipped += 1
            continue

        img = Image.open(BytesIO(response.content)).convert("RGB")

        # Crop
        x, y, w, h = map(int, bbox)
        cropped = img.crop((x, y, x + w, y + h))
        cropped_path = f"/tmp/crop_{file_name}"
        cropped.save(cropped_path)
        s3.upload_file(cropped_path, dst_bucket, f"{crop_prefix}{category}/{file_name}")

        # Resize
        resized = cropped.resize(target_size)
        resized_path = f"/tmp/resize_{file_name}"
        resized.save(resized_path)
        s3.upload_file(resized_path, dst_bucket, f"{resize_prefix}{category}/{file_name}")

        uploaded += 1

    except Exception as e:
        skipped += 1
        print(f"❌ Failed {file_name}: {e}")

print(f"\n✅ Uploaded: {uploaded} images")
print(f"⏭️ Skipped: {skipped} images")

📦 Total annotations: 65112


📸 Processing:   5%|▍         | 2988/65112 [09:48<3:23:51,  5.08it/s]


KeyboardInterrupt: 