This notebook recreates a model based on the third-place solution of the [Tick Tick Bloom: Harmful Algal Bloom Detection Challenge](https://github.com/drivendataorg/tick-tick-bloom/tree/main).

In [1]:
# Load the lab_black extension (presumably auto-formats code cells with Black — confirm).
%load_ext lab_black
# Load IPython's autoreload extension; mode 2 re-imports edited modules before each cell runs,
# so local changes to the `cyano` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import yaml

from cloudpathlib import AnyPath

from cyano.config import FeaturesConfig, ModelTrainingConfig
from cyano.experiment import ExperimentConfig
from cyano.settings import REPO_ROOT




In [3]:
# Remote (S3) experiment inputs; the split CSVs used below are read from "splits".
S3_DATA_DIR = AnyPath("s3://drivendata-competition-nasa-cyanobacteria") / "experiments"
SPLITS_DIR = S3_DATA_DIR / "splits"

# Local experiment artifacts, rooted in the parent directory of REPO_ROOT.
DATA_DIR = REPO_ROOT.parent / "data/experiments"
LOCAL_CACHE_DIR = DATA_DIR / "cache"
EXPERIMENT_SAVE_DIR = DATA_DIR / "rerun_third"

# Create the save directory (and any missing parents) up front so the config can be
# written into it later; the cache directory is not created here.
EXPERIMENT_SAVE_DIR.mkdir(parents=True, exist_ok=True)

## Settings

Write a config that matches the third-place solution's code.

In [4]:
# Sentinel band / layer names that features are computed from.
use_sentinel_bands = [
    "AOT",
    "B01",
    "B02",
    "B03",
    "B04",
    "B05",
    "B06",
    "B07",
    "B08",
    "B09",
    "B11",
    "B12",
    "B8A",
    "SCL",
    "WVP",
]

# One "<band>_<statistic>" feature per band and summary statistic (band-major order),
# followed by the four NDVI features.
sat_image_fts = [
    "_".join((band_name, summary))
    for band_name in use_sentinel_bands
    for summary in ("mean", "min", "max", "range")
] + ["NDVI_B04", "NDVI_B05", "NDVI_B06", "NDVI_B07"]

# Preview the first few generated feature names.
sat_image_fts[:6]

['AOT_mean', 'AOT_min', 'AOT_max', 'AOT_range', 'B01_mean', 'B01_min']

In [5]:
# Sanity check: 15 bands x 4 statistics + 4 NDVI features = 64.
len(sat_image_fts)

64

In [6]:
features_config = FeaturesConfig(
    # Satellite search settings. NOTE(review): names suggest a 5,000 m / 15 day search
    # window and up to 15 Sentinel items per sample — confirm against FeaturesConfig.
    pc_meters_search_window=5000,
    pc_days_search_window=15,
    n_sentinel_items=15,
    # Imagery settings: which bands to use and (presumably) the size in meters of the
    # window around each sample that image features are computed over — TODO confirm.
    use_sentinel_bands=use_sentinel_bands,
    image_feature_meter_window=200,
    # Feature columns: per-band image statistics plus NDVI, satellite metadata, and
    # sample metadata.
    satellite_image_features=sat_image_fts,
    satellite_meta_features=["month", "days_before_sample"],
    metadata_features=["rounded_longitude"],
)

In [7]:
# Booster hyperparameters (the parameter names follow LightGBM conventions).
lgb_params = {
    "application": "regression",
    "metric": "rmse",
    "max_depth": -1,
    "num_leaves": 31,
    "learning_rate": 0.1,
}

# Training configuration: the hyperparameters above plus the number of boosting rounds.
model_config = ModelTrainingConfig(params=lgb_params, num_boost_round=100000)

In [8]:
# Bundle the feature settings, model training settings, input splits, and output
# locations for a single train + predict run.
experiment_config = ExperimentConfig(
    features_config=features_config,
    # `model_config` was previously built but never passed in, so the experiment would
    # have run without the training settings defined in this notebook.
    # NOTE(review): confirm that `model_training_config` is the field name on ExperimentConfig.
    model_training_config=model_config,
    train_csv=SPLITS_DIR / "competition/train.csv",
    predict_csv=SPLITS_DIR / "competition/test.csv",
    cache_dir=LOCAL_CACHE_DIR,
    save_dir=EXPERIMENT_SAVE_DIR,
)

# Write the full config into the save directory so the settings are recorded
# alongside the experiment's outputs.
with (EXPERIMENT_SAVE_DIR / "experiment_config.yaml").open("w") as fp:
    yaml.dump(experiment_config.model_dump(), fp)

## Run experiment

In [None]:
# Run the experiment configured above. The saved log output shows it loading the training
# samples, generating Sentinel candidate metadata, and selecting the satellite items used
# for feature generation; the item-selection step alone took about two minutes.
experiment_config.run_experiment()

[32m2023-08-11 17:02:42.862[0m | [1mINFO    [0m | [36mcyano.pipeline[0m:[36m_prep_train_data[0m:[36m51[0m - [1mLoaded 17,060 samples for training[0m
[32m2023-08-11 17:02:42.863[0m | [1mINFO    [0m | [36mcyano.data.satellite_data[0m:[36mgenerate_candidate_metadata[0m:[36m195[0m - [1mGenerating metadata for all satellite item candidates[0m
[32m2023-08-11 17:02:47.575[0m | [1mINFO    [0m | [36mcyano.data.satellite_data[0m:[36mgenerate_candidate_metadata[0m:[36m208[0m - [1mLoaded 56,173 rows of Sentinel candidate metadata from /Users/katewetstone/Repos/competition-nasa-cyanobacteria/data/interim/full_pc_search[0m
[32m2023-08-11 17:02:47.935[0m | [1mINFO    [0m | [36mcyano.data.satellite_data[0m:[36midentify_satellite_data[0m:[36m303[0m - [1mSelecting which items to use for feature generation[0m
100%|████████████████████████████████████████████████████████████| 17060/17060 [01:51<00:00, 152.84it/s]
[32m2023-08-11 17:04:42.606[0m | [1mINFO  

  0%|          | 0/45210 [00:00<?, ?it/s]