
# 01 — Data Intake (Presence/Absence Setup)

**Goal:** Point to two image folders and build a manifest for presence/absence learning.

- `DATA_POS_DIR`: images **with** tortoises
- `DATA_NEG_DIR`: images **without** tortoises
- We will write the manifest as both CSV and Parquet, with columns `image_id`, `path`, and `label` (1 = tortoise present, 0 = absent).
- Optional: mount Google Drive for persistence.


In [None]:

# (Optional) Colab: show GPU and mount Drive
# If using Colab, uncomment the following two lines:
# !nvidia-smi
# from google.colab import drive; drive.mount('/content/drive')


In [None]:

# Install deps (safe to re-run). On Colab, this will install into the runtime.
# You can comment out lines if already installed.
# `-q` keeps pip's output quiet; the first line upgrades pip itself before
# the packages are installed.
%pip -q install --upgrade pip
# mlflow is pinned to an exact version; every other package is unpinned.
# The cells visible here use pandas and scikit-learn directly and write Parquet.
# NOTE(review): pyarrow is presumably the Parquet engine for pandas, and
# numpy/pillow/tqdm/mlflow are presumably for later notebooks — confirm.
%pip -q install numpy pandas pyarrow pillow tqdm scikit-learn mlflow==2.14.3


In [None]:

import os, sys, uuid, pandas as pd
from pathlib import Path

# ====== USER CONFIG ======
# Every path below hangs off BASE, so relocating the project means editing
# a single line. Colab + Drive example (uncomment and adapt):
# BASE = Path('/content/drive/MyDrive/tortoise-finder')
BASE = Path('/content')  # change as needed

_data_root = BASE / 'data'
DATA_POS_DIR = _data_root / 'positives'   # images with tortoises
DATA_NEG_DIR = _data_root / 'negatives'   # images without tortoises
OUT_DIR = _data_root / 'manifests'        # manifest files are written here

# Only the output folder is created; the two input folders are expected to
# be supplied by the user.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong BASE is obvious immediately.
for _caption, _location in (
    ("Pos dir:", DATA_POS_DIR),
    ("Neg dir:", DATA_NEG_DIR),
    ("Out dir:", OUT_DIR),
):
    print(_caption, _location)


In [None]:

from pathlib import Path

def scan_images(folder: Path) -> list:
    """Recursively collect image files under ``folder`` in sorted order.

    A path is kept when it is a regular file whose suffix, compared
    case-insensitively, is one of: .jpg, .jpeg, .png, .webp, .tif, .tiff.

    Args:
        folder: Root directory to search; subdirectories are included.

    Returns:
        A list of ``Path`` objects sorted by path. The list is empty when
        ``folder`` is not an existing directory or holds no matching files.
    """
    exts = {'.jpg', '.jpeg', '.png', '.webp', '.tif', '.tiff'}

    # Make the "folder is missing" case explicit instead of relying on how
    # rglob happens to treat a non-existent root.
    if not folder.is_dir():
        return []

    # Directory traversal order depends on the filesystem. Sorting gives the
    # same file order on every machine, so a seeded shuffle of the result is
    # reproducible.
    return sorted(
        p for p in folder.rglob('*')
        if p.suffix.lower() in exts and p.is_file()
    )

# Collect the image paths for each class.
pos_files = scan_images(DATA_POS_DIR)
neg_files = scan_images(DATA_NEG_DIR)

print(f"Found positives: {len(pos_files)}")
print(f"Found negatives: {len(neg_files)}")

# Build manifest DataFrame: one row per image, label 1 for positives and 0
# for negatives.
pos_rows = [{'image_id': p.stem, 'path': str(p), 'label': 1} for p in pos_files]
neg_rows = [{'image_id': p.stem, 'path': str(p), 'label': 0} for p in neg_files]
# Explicit columns keep the schema stable when no images were found; without
# them an empty row list yields a column-less frame and df['label'] below
# raises KeyError.
df = pd.DataFrame(pos_rows + neg_rows, columns=['image_id', 'path', 'label'])

# image_id is the bare file stem, so files that share a name across
# subfolders, extensions, or the two classes get the same id. The ids are
# left unchanged, but the collision is reported instead of passing silently.
dup_mask = df['image_id'].duplicated(keep=False)
if dup_mask.any():
    print(
        f"WARNING: {int(dup_mask.sum())} manifest rows share an image_id with "
        f"another row ({df.loc[dup_mask, 'image_id'].nunique()} distinct ids); "
        "use 'path' as the unique key."
    )

# Shuffle and save (fixed seed so the row order is repeatable for the same
# input order).
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)
df.to_csv(OUT_DIR/'manifest.csv', index=False)
df.to_parquet(OUT_DIR/'manifest.parquet', index=False)
# Last expression of the cell: the notebook displays a preview and the
# per-label counts.
df.head(), df['label'].value_counts()


In [None]:

# Create stratified train/val/test splits (row-wise, roughly 70/15/15), preserving label balance
from sklearn.model_selection import train_test_split
import pandas as pd
from pathlib import Path

# Reload the manifest from disk rather than reusing the in-memory frame.
MANIFEST = Path(OUT_DIR/'manifest.parquet')
df = pd.read_parquet(MANIFEST)

# Stage 1: hold out 15% of all rows as the test set, stratified on the label.
trainval_df, test_df = train_test_split(
    df,
    test_size=0.15,
    stratify=df['label'],
    random_state=123,
)

# Stage 2: carve validation out of the remaining 85%.
# 0.1765 of that remainder is ~15% of the total.
train_df, val_df = train_test_split(
    trainval_df,
    test_size=0.1765,
    stratify=trainval_df['label'],
    random_state=123,
)

# Write each split in both formats and report its shape and label counts.
splits = {'train': train_df, 'val': val_df, 'test': test_df}
for split_name, split_df in splits.items():
    split_df.to_parquet(OUT_DIR/f'manifest_{split_name}.parquet', index=False)
    split_df.to_csv(OUT_DIR/f'manifest_{split_name}.csv', index=False)
    label_counts = split_df['label'].value_counts().to_dict()
    print(split_name, split_df.shape, label_counts)

print("Wrote:", list((OUT_DIR).glob('manifest_*.*')))
