Recreate a model based on the third-place solution to the [Tick Tick Bloom: Harmful Algal Bloom Detection Challenge](https://github.com/drivendataorg/tick-tick-bloom/tree/main).

In [3]:
# Notebook tooling: load the lab_black extension (presumably auto-formats cells
# with Black — confirm against the environment).
%load_ext lab_black
# Load autoreload and set mode 2 so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
from cyano.pipeline import CyanoModelPipeline
from cyano.config import FeaturesConfig, ModelTrainingConfig
from cyano.settings import REPO_ROOT

In [9]:
# Cache directory for this experiment, resolved relative to the parent of the
# repository root so the notebook does not depend on a hardcoded absolute path.
LOCAL_CACHE_DIR = REPO_ROOT.parent.joinpath("data/experiment/cache")
LOCAL_CACHE_DIR

PosixPath('/Users/katewetstone/Repos/cyanobacteria-prediction/data/experiment/cache')

## Pipeline settings

In [10]:
# Sentinel band identifiers whose pixel values are summarised into features.
use_sentinel_bands = [
    "AOT",
    "B01",
    "B02",
    "B03",
    "B04",
    "B05",
    "B06",
    "B07",
    "B08",
    "B09",
    "B11",
    "B12",
    "B8A",
    "SCL",
    "WVP",
]

# Full list of satellite image feature names, built in a single expression:
# first "<band>_<stat>" for every band (grouped band by band, statistics in the
# order mean, min, max, range), then the four NDVI feature names.
sat_image_fts = [
    *(
        f"{band}_{stat}"
        for band in use_sentinel_bands
        for stat in ("mean", "min", "max", "range")
    ),
    "NDVI_B04",
    "NDVI_B05",
    "NDVI_B06",
    "NDVI_B07",
]

# Preview the first few generated names.
sat_image_fts[:6]

['AOT_mean', 'AOT_min', 'AOT_max', 'AOT_range', 'B01_mean', 'B01_min']

In [11]:
# Sanity check on the feature count: 15 bands x 4 statistics + 4 NDVI names = 64.
len(sat_image_fts)

64

In [12]:
# Feature-generation settings handed to the pipeline, grouped by what they
# appear to control (parameter semantics live in cyano.config.FeaturesConfig).
feature_config = FeaturesConfig(
    # Satellite item search. The training log reports searching "within 15 days
    # and 5000 meters", matching the two search-window values below.
    pc_days_search_window=15,
    pc_meters_search_window=5000,
    n_sentinel_items=15,
    # Image-derived features: which bands to use, the window size in meters
    # (presumably around each sample point — confirm), and the feature names.
    use_sentinel_bands=use_sentinel_bands,
    image_feature_meter_window=200,
    satellite_image_features=sat_image_fts,
    # Features that do not come from pixel values.
    satellite_meta_features=["month", "days_before_sample"],
    metadata_features=["rounded_longitude"],
)

In [13]:
# Training CSV taken from the repository's test assets; the path is relative,
# so it presumably resolves against the notebook's working directory — confirm.
train_path = "../tests/assets/train_data.csv"

## Training

In [74]:
# Assemble the pipeline from the feature settings defined earlier, a
# ModelTrainingConfig constructed with no overrides, and LOCAL_CACHE_DIR as the
# cache directory.
pipeline = CyanoModelPipeline(
    features_config=feature_config,
    model_training_config=ModelTrainingConfig(),
    cache_dir=LOCAL_CACHE_DIR,
)

# Train on the CSV referenced by `train_path` (defined in the settings section)
# rather than repeating the literal path, so the notebook has a single place to
# change the input file. The trained model is written to /tmp/model.zip.
pipeline.run_training(train_path, save_path="/tmp/model.zip")

[32m2023-08-10 17:11:22.853[0m | [1mINFO    [0m | [36mcyano.pipeline[0m:[36m_prep_train_data[0m:[36m49[0m - [1mLoaded 5 samples for training[0m
[32m2023-08-10 17:11:22.856[0m | [1mINFO    [0m | [36mcyano.data.satellite_data[0m:[36mgenerate_candidate_metadata[0m:[36m192[0m - [1mGenerating metadata for all satellite item candidates[0m
[32m2023-08-10 17:11:22.857[0m | [1mINFO    [0m | [36mcyano.data.satellite_data[0m:[36mgenerate_candidate_metadata[0m:[36m211[0m - [1mSearching ['sentinel-2-l2a'] within 15 days and 5000 meters[0m
100%|██████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  5.08it/s]
[32m2023-08-10 17:11:23.857[0m | [1mINFO    [0m | [36mcyano.data.satellite_data[0m:[36mgenerate_candidate_metadata[0m:[36m241[0m - [1mGenerated metadata for 4 Sentinel item candidates[0m
[32m2023-08-10 17:11:23.859[0m | [1mINFO    [0m | [36mcyano.data.satellite_data[0m:[36midentify_satellite_data[0m:[36m

In [75]:
# Inspect the samples loaded for training: indexed by uid, with date, latitude
# and longitude columns.
pipeline.train_samples

Unnamed: 0_level_0,date,latitude,longitude
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
e3ebefd90a00c3cc9f5aeaf32cd4c184,2015-06-29,41.424144,-73.206937
671520fa92f555ab335e0cfa888c57e7,2013-07-25,36.045,-79.091942
9c601f226c2af07d570134127a7fda27,2017-08-21,35.884524,-78.953997
3a2c48812b551d720f8d56772efa6df1,2019-08-28,41.39249,-75.3607
2543db364f727f17fe4ce7881aa180da,2013-07-11,38.3056,-122.026


In [76]:
# Inspect the generated feature matrix (image features plus month,
# days_before_sample and rounded_longitude).
# NOTE(review): in the captured output the index is sample_id, a sample_id can
# appear on several rows, and not every training uid is present — confirm this
# is the intended shape before relying on it.
pipeline.train_features

Unnamed: 0_level_0,AOT_mean,AOT_min,AOT_max,AOT_range,B01_mean,B01_min,B01_max,B01_range,B02_mean,B02_min,...,WVP_min,WVP_max,WVP_range,NDVI_B04,NDVI_B05,NDVI_B06,NDVI_B07,month,days_before_sample,rounded_longitude
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9c601f226c2af07d570134127a7fda27,95.586168,95.0,96.0,1.0,4452.640625,3186.0,7687.0,4501.0,4323.993197,2168.0,...,1138.0,4300.0,3162.0,0.121759,0.054942,0.017671,-0.007689,8.0,9.0,-8.0
3a2c48812b551d720f8d56772efa6df1,129.913742,129.0,130.0,1.0,12258.953125,11413.0,13228.0,1815.0,11568.77097,10448.0,...,2934.0,2934.0,0.0,-0.025785,-0.044395,-0.024266,-0.018886,8.0,14.0,-8.0
3a2c48812b551d720f8d56772efa6df1,96.0,96.0,96.0,0.0,289.15625,245.0,402.0,157.0,271.728138,189.0,...,897.0,1659.0,762.0,0.736759,0.519615,0.102936,0.021354,8.0,4.0,-8.0
3a2c48812b551d720f8d56772efa6df1,151.995241,151.0,152.0,1.0,678.109375,241.0,1528.0,1287.0,728.650208,275.0,...,1671.0,3353.0,1682.0,0.543555,0.382714,0.096019,0.013761,8.0,9.0,-8.0


In [77]:
# Inspect the training labels: an integer Series named `severity`, indexed by uid.
pipeline.train_labels

uid
e3ebefd90a00c3cc9f5aeaf32cd4c184    1
671520fa92f555ab335e0cfa888c57e7    1
9c601f226c2af07d570134127a7fda27    3
3a2c48812b551d720f8d56772efa6df1    2
2543db364f727f17fe4ce7881aa180da    4
Name: severity, dtype: int64