# Get IFCB data from WHOI dashboard

* https://ifcb-data.whoi.edu
* https://ifcb.caloos.org/dashboard
* https://github.com/hsosik/ifcb-analysis/wiki

## What is IFCB data?

The **Imaging FlowCytobot (IFCB)** is an instrument that continuously samples seawater and takes images of individual particles as they flow past a camera. Each detected particle is saved as a small image called a **Region of Interest (ROI)**. An ROI usually represents a single phytoplankton cell, zooplankton, detritus particle, or other object in the water.

For each sampling period (called a **bin**), IFCB records thousands of ROIs along with metadata describing when, where, and how much water was analyzed. Automated classifiers are often applied to the ROI images, producing a table of **class scores** that estimate how likely each ROI belongs to different species or functional groups.

---

## How do we compute abundance (objects per mL)?

Abundance is calculated by combining **object counts** with the **volume of water analyzed**:

1. **Count objects**  
   Each row in a `*_class_scores.csv` file corresponds to **one ROI (one detected object)**.  
   To estimate species-level counts, each ROI is assigned to the class with the highest classification score (“winner”), often requiring the score to exceed a confidence threshold.

2. **Get analyzed volume**  
   For each bin, IFCB metadata includes volume analyzed (`ml_analyzed` in our dataset), the total volume of seawater (in milliliters) that passed the detector during that sample.

3. **Compute abundance**  

   $$
   \text{objects per mL} = \frac{\text{number of ROIs (or ROIs of a given species)}}{\text{ml\_analyzed}}
   $$

This converts image-based counts into a physically meaningful concentration: abundance in objects per mL.

## IFCB files per bin

For a given bin `DYYYYMMDDTHHMMSS_IFCBXXX`, we have:

* `*_class_scores.csv`
    - One row per ROI (detected object)
    - Columns = classifier scores (probabilities)

* `*_features.csv`
    - One row per ROI (object)
    - Contains morphological / size features, e.g.: area, equivalent spherical diameter (ESD), major/minor axis, perimeter, biovolume

This notebook is just using the class_scores.csv file.

## Things to be aware of

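* Look at the size drop-off: the size of each ROI is in the `*_features.csv` files.
* Colony-forming taxa: each ROI is counted as one object, so a colony or chain imaged as a single ROI counts once, not once per cell.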

## Look at the metadata first

The metadata for a bin includes the volume analyzed (`ml_analyzed`) and the number of objects imaged (`num_images`).

In [11]:
import json

import requests

# Which server, dataset and bin to inspect.
base_url = "https://ifcb-data.whoi.edu"
dataset = "mvco"
bin_id = "D20240215T150055_IFCB010"  # pick one bin you know exists

# The bin id goes in the URL path; the dataset is sent as a query parameter.
url = f"{base_url}/api/bin/{bin_id}"
params = {"dataset": dataset}

response = requests.get(url, params=params, timeout=30)
response.raise_for_status()  # raise on an HTTP error status instead of parsing the body
data = response.json()

# Pretty-print the JSON, truncated to the first 2000 characters
pretty = json.dumps(data, indent=2)
print("\nFull metadata (first ~2000 chars):\n")
print(pretty[:2000])


Full metadata (first ~2000 chars):

{
  "scale": 0.33,
  "shape": [
    600,
    800
  ],
  "previous_bin_id": "D20240215T143741_IFCB010",
  "next_bin_id": "D20240215T152410_IFCB010",
  "lat": 41.325,
  "lng": -70.5667,
  "lat_rounded": "41.325",
  "lng_rounded": "-70.5667",
  "depth": 4.0,
  "pages": [
    0,
    1
  ],
  "num_pages": 1,
  "tags": [],
  "coordinates": [],
  "has_blobs": false,
  "has_features": false,
  "has_class_scores": false,
  "timestamp_iso": "2024-02-15T15:00:55+00:00",
  "instrument": "IFCB10",
  "num_triggers": 7031,
  "num_images": 6521,
  "trigger_freq": 5.874,
  "ml_analyzed": "2.851 ml",
  "size": 56579281,
  "datasets": [
    "mvco"
  ],
  "primary_dataset": "mvco",
  "comments": [],
  "concentration": 2287.189,
  "skip": false,
  "sample_type": "",
  "cruise": "",
  "cast": "",
  "niskin": null
}


## Functions

The function below walks the bins from a start bin to an end bin, downloads each bin's class scores in parallel, and summarizes them by day: counts and analyzed volume are pooled over all bins in a day, giving a daily abundance (objects per mL) for each class.

In [5]:
import io
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
import pandas as pd
from tqdm.auto import tqdm


def summarize_bins_to_daily_csvs_streaming_parallel(
    start_bin_id,
    end_bin_id,
    base_url,
    dataset,
    instrument,          # if None -> no filtering; if set -> filter traversal
    output_dir,
    thresh=0.7,
    max_workers=8,
    vol_label="ml_analyzed",  # e.g. "ml_analyzed" -> "2.851 ml"
    lat_label="lat",
    lon_label="lng",
):
    """Walk a chain of IFCB bins and write one long-format abundance CSV per day.

    Starting at ``start_bin_id``, bin metadata is fetched serially from
    ``{base_url}/api/bin/{bin_id}`` and its ``next_bin_id`` field is followed
    until ``end_bin_id`` has been processed, the chain ends, or a metadata
    request fails.  For every bin with a positive analyzed volume,
    ``{base_url}/{dataset}/{bin_id}_class_scores.csv`` is downloaded on a
    worker thread; each ROI is assigned to its highest-scoring class and kept
    when that score is at least ``thresh``.  Whenever the date embedded in the
    bin id changes, the finished day's counts and volumes are pooled and
    written to ``{output_dir}/{YYYYMMDD}_daily_summary_long.csv``.

    Args:
        start_bin_id: Bin id where traversal starts (``DYYYYMMDDTHHMMSS_...``).
        end_bin_id: Bin id after which traversal stops (inclusive).
        base_url: Dashboard root URL, without a trailing slash.
        dataset: Dataset name; sent as the ``dataset`` query parameter and
            used in the class-scores URL.
        instrument: If not None, sent as the ``instrument`` query parameter on
            metadata requests.
        output_dir: Directory for the daily CSV files (created if missing).
        thresh: Minimum winning score for an ROI to be counted.
        max_workers: Number of download threads.
        vol_label: Metadata key holding the analyzed volume, either a number
            or a string such as ``"2.851 ml"``.
        lat_label: Metadata key for latitude.
        lon_label: Metadata key for longitude.

    Returns:
        List of paths of the daily CSV files written, in the order written.

    Notes:
        * A bin whose class scores cannot be downloaded or parsed contributes
          neither counts nor volume to its day.
        * A bin whose class scores were read but where no ROI reached
          ``thresh`` still contributes its volume: that water was analyzed,
          so leaving it out would overstate the daily abundance.
        * A day for which nothing was counted produces no file.
    """
    os.makedirs(output_dir, exist_ok=True)

    # ---- thread-local Session (thread-safe reuse of connections) ----
    _tls = threading.local()

    def get_session():
        """Return this thread's Session, creating it on first use."""
        if getattr(_tls, "session", None) is None:
            s = requests.Session()
            adapter = requests.adapters.HTTPAdapter(
                pool_connections=4,
                pool_maxsize=4,
                max_retries=2,
            )
            s.mount("https://", adapter)
            s.mount("http://", adapter)
            _tls.session = s
        return _tls.session

    def parse_ml(d):
        """Return the analyzed volume as a float, or None if absent/unparseable."""
        raw = d.get(vol_label)
        if raw is None:
            return None
        try:
            return float(raw.split()[0]) if isinstance(raw, str) else float(raw)
        except (TypeError, ValueError, IndexError):
            # IndexError: empty or whitespace-only string has no first token.
            return None

    def fetch_bin_meta(bin_id):
        """Serial: fetch metadata (also provides next_bin_id).

        Returns ``(metadata, next_bin_id)``, or ``(None, None)`` on any failure
        so the caller can stop and still write the day collected so far.
        """
        url = f"{base_url}/api/bin/{bin_id}"
        params = {"dataset": dataset}
        if instrument is not None:
            params["instrument"] = instrument   # <-- keep optional filtering
        try:
            # Reuse the (main-thread) session so connections are kept alive.
            r = get_session().get(url, params=params, timeout=30)
        except requests.RequestException as exc:
            tqdm.write(f"Metadata request error for {bin_id}: {exc}")
            return None, None
        if r.status_code != 200:
            return None, None
        try:
            d = r.json()
        except ValueError:
            return None, None
        if not isinstance(d, dict):
            return None, None
        return d, d.get("next_bin_id")

    def fetch_and_summarize_bin(bin_id, vol_ml):
        """Threaded: download class_scores and compute counts above thresh.

        Returns ``(vol_ml, {class: count})``, or None when the file is missing
        or unusable.
        """
        url = f"{base_url}/{dataset}/{bin_id}_class_scores.csv"
        try:
            r = get_session().get(url, timeout=60)
        except requests.RequestException as exc:
            # One unreachable bin should not abort the whole run.
            tqdm.write(f"Skipping {bin_id}: {exc}")
            return None
        if r.status_code != 200:
            return None

        try:
            df = pd.read_csv(io.StringIO(r.text))
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
            tqdm.write(f"Skipping {bin_id}: unreadable class scores ({exc})")
            return None
        if df.empty or "pid" not in df.columns:
            return None

        class_cols = [c for c in df.columns if c != "pid"]
        if not class_cols:
            return None

        # Non-numeric / missing scores become -inf so they can never win.
        scores = df[class_cols].apply(pd.to_numeric, errors="coerce").fillna(float("-inf"))
        winner = scores.idxmax(axis=1)
        keep = scores.max(axis=1) >= thresh

        # value_counts() of an empty selection is simply {}.
        return (vol_ml, winner[keep].value_counts().to_dict())

    def flush_day(date, ml_total, counts, lat, lon, inst):
        """Write one day LONG-format CSV. Returns filepath or None."""
        if date is None:
            return None
        if ml_total <= 0 or not counts:
            return None

        rows = []
        for cls, ct in sorted(counts.items()):
            per_ml = ct / ml_total
            rows.append({
                "date": date,
                "latitude": lat,
                "longitude": lon,
                "instrument": inst,              # <-- recorded instrument (from metadata)
                "class": cls,
                "count": int(ct),
                "ml_analyzed_total": float(ml_total),
                "objects_per_ml": float(per_ml),
                "objects_per_L": float(per_ml * 1000),
                "threshold": thresh,
            })

        out = pd.DataFrame(rows)
        out_file = os.path.join(output_dir, f"{date}_daily_summary_long.csv")
        out.to_csv(out_file, index=False)
        return out_file

    written = []

    def finish_day(date, day_futures, lat, lon, inst):
        """Wait for a day's downloads, pool them and write that day's CSV."""
        ml_total = 0.0
        counts = {}
        for fut in as_completed(day_futures):
            res = fut.result()
            if res is None:
                continue
            vol, per_class = res
            # Count the volume of every successfully read bin, including bins
            # where no ROI reached the threshold.
            ml_total += float(vol)
            for cls, ct in per_class.items():
                counts[cls] = counts.get(cls, 0) + int(ct)

        out_file = flush_day(date, ml_total, counts, lat, lon, inst)
        if out_file:
            written.append(out_file)
            tqdm.write(f"Wrote: {out_file}")

    current_date = None
    day_lat = None
    day_lon = None
    day_instrument = None
    futures = []

    n_meta = 0
    current_bin_id = start_bin_id

    # Context managers make sure the bar is closed and the pool is shut down
    # even if something unexpected raises.
    with tqdm(desc="Streaming metadata", unit="bin") as pbar, \
            ThreadPoolExecutor(max_workers=max_workers) as executor:
        while True:
            d, next_bin_id = fetch_bin_meta(current_bin_id)
            if d is None:
                print(f"Metadata fetch failed for {current_bin_id}. Stopping.")
                break

            bin_id = current_bin_id

            date = bin_id.split("T")[0][1:]  # YYYYMMDD
            vol_ml = parse_ml(d)

            # day change -> collect futures, write previous day, reset
            if current_date is not None and date != current_date:
                finish_day(current_date, futures, day_lat, day_lon, day_instrument)
                futures = []
                day_lat = None
                day_lon = None
                day_instrument = None

            current_date = date

            # set day lat/lon/instrument from metadata (first bin of day that has them)
            if day_lat is None:
                day_lat = d.get(lat_label)
            if day_lon is None:
                day_lon = d.get(lon_label)
            if day_instrument is None:
                day_instrument = d.get("instrument")   # <-- always record actual instrument

            # Without a positive volume an abundance cannot be computed, so
            # such bins are not downloaded at all.
            if vol_ml and vol_ml > 0:
                futures.append(executor.submit(fetch_and_summarize_bin, bin_id, vol_ml))

            n_meta += 1
            pbar.update(1)
            if n_meta % 200 == 0:
                pbar.set_postfix_str(f"day={date} queued={len(futures)}")

            if bin_id == end_bin_id:
                break

            if not next_bin_id:
                break
            current_bin_id = next_bin_id

        # flush last day
        if current_date is not None:
            finish_day(current_date, futures, day_lat, day_lon, day_instrument)

    print(f"Done. Wrote {len(written)} daily files to {output_dir}")
    return written



## Run the files

You need to look up the starting and ending `bin_id` on the dashboard. For example, open https://ifcb-data.whoi.edu/timeline?dataset=mvco and hover over the timeline to see each bin's `bin_id`. If you want to filter by instrument, look up the instrument name from a bin on the dashboard; otherwise pass `instrument=None`.

In [None]:
# Settings for this run, collected in one place and passed as keyword arguments.
run_kwargs = {
    # first and last bin of the traversal (looked up on the dashboard timeline)
    "start_bin_id": "D20240215T150055_IFCB010",
    "end_bin_id": "D20241227T181716_IFCB010",
    "base_url": "https://ifcb-data.whoi.edu",
    "dataset": "mvco",
    "instrument": None,  # None -> no instrument filtering
    "output_dir": "../../data/mvco/daily_summaries",
    "thresh": 0.7,
    "max_workers": 8,  # try 8; if rate-limited, drop to 4–6
}

# Paths of the daily CSV files that were written.
written_files = summarize_bins_to_daily_csvs_streaming_parallel(**run_kwargs)

Streaming metadata: 0bin [00:00, ?bin/s]

Wrote: ../../data/mvco/daily_summaries/20240215_daily_summary_long.csv


## Merge into one summary file with all days

In [None]:
import glob
import os

import pandas as pd

# Where your daily files are
daily_dir = "../../data/mvco/daily_summaries"
out_file  = "../../data/mvco/mvco_2024_abundance_long.csv"

# Columns carried over from each daily summary into the merged file
keep_columns = [
    "date",
    "latitude",
    "longitude",
    "threshold",
    "ml_analyzed_total",
    "class",
    "objects_per_ml",
]


def merge_daily_summaries(daily_dir, out_file):
    """Concatenate all ``*_daily_summary_long.csv`` files into one CSV.

    Args:
        daily_dir: Directory containing the daily summary files.
        out_file: Path of the merged CSV to write.

    Returns:
        The merged DataFrame, with ``date`` parsed from ``YYYYMMDD`` into a
        datetime column and the numeric columns coerced to numbers.  When no
        daily files are found, an empty DataFrame with the expected columns is
        returned and nothing is written.
    """
    # Find all daily summaries (sorted, so rows end up in date order)
    files = sorted(glob.glob(os.path.join(daily_dir, "*_daily_summary_long.csv")))
    print(f"Found {len(files)} daily files")

    if not files:
        # pd.concat() raises on an empty list, so stop here with a clear message.
        print("No daily summaries found; nothing to merge.")
        return pd.DataFrame(columns=keep_columns)

    merged = pd.concat(
        [pd.read_csv(f)[keep_columns] for f in files],
        ignore_index=True,
    )

    # Ensure types are sane.  The daily files store the day in the "date"
    # column as YYYYMMDD, which read_csv loads as an integer, so convert it to
    # text before parsing it with an explicit format.
    merged["date"] = pd.to_datetime(merged["date"].astype(str), format="%Y%m%d")
    for col in ("threshold", "ml_analyzed_total", "objects_per_ml"):
        merged[col] = pd.to_numeric(merged[col], errors="coerce")

    # os.makedirs("") fails, so only create a directory when the path has one.
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    merged.to_csv(out_file, index=False)

    print(f"Wrote merged file: {out_file}")
    return merged


merged = merge_daily_summaries(daily_dir, out_file)
print(merged.head())
