# Load Flood Dataset from W&B

In [1]:
import polars as pl
import wandb

In [2]:
# Download artifact from W&B
api = wandb.Api()
artifact = api.artifact("flood-forecasting/flood-dataset:latest")
artifact_dir = artifact.download()
print(f"Downloaded to: {artifact_dir}")

[34m[1mwandb[0m: [wandb.Api()] Loaded credentials for https://api.wandb.ai from C:\Users\connor\_netrc.
[34m[1mwandb[0m:   1 of 1 files downloaded.  


Downloaded to: d:\Flood-Forecasting\models\artifacts\flood-dataset-v2


In [3]:
# Load as Polars DataFrame
df = pl.read_parquet(f"{artifact_dir}/flood_model.parquet")
print(f"Rows: {len(df):,}")
print(f"Columns: {len(df.columns)}")
print(f"Sites: {df['site_id'].n_unique()}")
print(f"Date range: {df['observation_hour'].min()} to {df['observation_hour'].max()}")

Rows: 4,616
Columns: 47
Sites: 18
Date range: 2026-01-18 00:00:00+00:00 to 2026-01-27 23:00:00+00:00


In [4]:
df.schema

Schema([('site_id', String),
        ('observation_hour', Datetime(time_unit='us', time_zone='UTC')),
        ('latitude', Float64),
        ('longitude', Float64),
        ('streamflow_cfs_mean', Float64),
        ('streamflow_cfs_max', Float64),
        ('streamflow_cfs_min', Float64),
        ('gage_height_ft_mean', Float64),
        ('gage_height_ft_max', Float64),
        ('gage_height_ft_min', Float64),
        ('observation_count', Int64),
        ('precipitation_mm', Float32),
        ('temperature_c', Float32),
        ('wind_speed_ms', Float32),
        ('humidity_pct', Float32),
        ('station_name', String),
        ('huc_code', Int64),
        ('drainage_area_sq_km', Float64),
        ('is_reference_hcdn2009', String),
        ('elev_mean_m', Float64),
        ('elev_max_m', Int32),
        ('elev_min_m', Int32),
        ('SLOPE_PCT', Float64),
        ('ASPECT_NORTHNESS', Float64),
        ('ASPECT_EASTNESS', Float64),
        ('geology_class_reedbush', String),
      

In [5]:
df.filter(
    pl.col("streamflow_cfs_mean").is_between(0, 1000)
)

site_id,observation_hour,latitude,longitude,streamflow_cfs_mean,streamflow_cfs_max,streamflow_cfs_min,gage_height_ft_mean,gage_height_ft_max,gage_height_ft_min,observation_count,precipitation_mm,temperature_c,wind_speed_ms,humidity_pct,station_name,huc_code,drainage_area_sq_km,is_reference_hcdn2009,elev_mean_m,elev_max_m,elev_min_m,SLOPE_PCT,ASPECT_NORTHNESS,ASPECT_EASTNESS,geology_class_reedbush,geology_desc_hunt,p_mean,pet_mean,aridity_index,p_seasonality,frac_snow,high_prec_freq,low_prec_freq,hydroatlas_elev_m,hydroatlas_slope_deg,hydroatlas_temp_mean_c,hydroatlas_precip_mm_yr,hydroatlas_pet_mm_yr,hydroatlas_aridity,hydroatlas_clay_pct,hydroatlas_sand_pct,hydroatlas_forest_pct,hydroatlas_crop_pct,hydroatlas_urban_pct,streamflow_cfs_target_1h,gage_height_ft_target_1h
str,"datetime[μs, UTC]",f64,f64,f64,f64,f64,f64,f64,f64,i64,f32,f32,f32,f32,str,i64,f64,str,f64,i32,i32,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""06020600""",2026-01-20 00:00:00 UTC,45.241869,-112.111239,43.6,43.6,43.6,2.25,2.25,2.25,4,0.0,-3.393,0.608276,63.688213,"""Ruby River below reservoir nea…",10020003,1541.163,,2303.683,3214,1810,24.43341,0.938772,-0.34454,"""sedimentary""","""Sandy and stony colluvium deri…",1.527315,3.281195,2.148342,0.161187,0.360557,14.811401,235.650058,2187.945364,92.951244,20.893751,448.528752,823.928091,52.166514,16.124195,44.578658,40.784729,2.845446,0.0,43.1,2.245
"""06020600""",2026-01-20 01:00:00 UTC,45.241869,-112.111239,43.1,43.6,42.6,2.245,2.25,2.24,4,0.0,-5.093,0.608276,66.620483,"""Ruby River below reservoir nea…",10020003,1541.163,,2303.683,3214,1810,24.43341,0.938772,-0.34454,"""sedimentary""","""Sandy and stony colluvium deri…",1.527315,3.281195,2.148342,0.161187,0.360557,14.811401,235.650058,2187.945364,92.951244,20.893751,448.528752,823.928091,52.166514,16.124195,44.578658,40.784729,2.845446,0.0,42.85,2.2425
"""06020600""",2026-01-20 02:00:00 UTC,45.241869,-112.111239,42.85,43.6,42.6,2.2425,2.25,2.24,4,0.0,-5.843,0.701783,65.396736,"""Ruby River below reservoir nea…",10020003,1541.163,,2303.683,3214,1810,24.43341,0.938772,-0.34454,"""sedimentary""","""Sandy and stony colluvium deri…",1.527315,3.281195,2.148342,0.161187,0.360557,14.811401,235.650058,2187.945364,92.951244,20.893751,448.528752,823.928091,52.166514,16.124195,44.578658,40.784729,2.845446,0.0,42.6,2.24
"""06020600""",2026-01-20 03:00:00 UTC,45.241869,-112.111239,42.6,42.6,42.6,2.24,2.24,2.24,4,0.0,-6.493,0.901388,62.169052,"""Ruby River below reservoir nea…",10020003,1541.163,,2303.683,3214,1810,24.43341,0.938772,-0.34454,"""sedimentary""","""Sandy and stony colluvium deri…",1.527315,3.281195,2.148342,0.161187,0.360557,14.811401,235.650058,2187.945364,92.951244,20.893751,448.528752,823.928091,52.166514,16.124195,44.578658,40.784729,2.845446,0.0,43.1,2.245
"""06020600""",2026-01-20 04:00:00 UTC,45.241869,-112.111239,43.1,43.6,42.6,2.245,2.25,2.24,4,0.0,-7.243,1.05,58.545712,"""Ruby River below reservoir nea…",10020003,1541.163,,2303.683,3214,1810,24.43341,0.938772,-0.34454,"""sedimentary""","""Sandy and stony colluvium deri…",1.527315,3.281195,2.148342,0.161187,0.360557,14.811401,235.650058,2187.945364,92.951244,20.893751,448.528752,823.928091,52.166514,16.124195,44.578658,40.784729,2.845446,0.0,43.35,2.2475
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""06020600""",2026-01-19 19:00:00 UTC,45.241869,-112.111239,40.65,42.6,38.7,2.22,2.24,2.2,4,0.0,-2.393,1.431782,62.698296,"""Ruby River below reservoir nea…",10020003,1541.163,,2303.683,3214,1810,24.43341,0.938772,-0.34454,"""sedimentary""","""Sandy and stony colluvium deri…",1.527315,3.281195,2.148342,0.161187,0.360557,14.811401,235.650058,2187.945364,92.951244,20.893751,448.528752,823.928091,52.166514,16.124195,44.578658,40.784729,2.845446,0.0,42.6,2.24
"""06020600""",2026-01-19 20:00:00 UTC,45.241869,-112.111239,42.6,42.6,42.6,2.24,2.24,2.24,4,0.0,-0.693,1.379311,57.755257,"""Ruby River below reservoir nea…",10020003,1541.163,,2303.683,3214,1810,24.43341,0.938772,-0.34454,"""sedimentary""","""Sandy and stony colluvium deri…",1.527315,3.281195,2.148342,0.161187,0.360557,14.811401,235.650058,2187.945364,92.951244,20.893751,448.528752,823.928091,52.166514,16.124195,44.578658,40.784729,2.845446,0.0,43.1,2.245
"""06020600""",2026-01-19 21:00:00 UTC,45.241869,-112.111239,43.1,43.6,42.6,2.245,2.25,2.24,4,0.0,0.457,1.070047,58.731556,"""Ruby River below reservoir nea…",10020003,1541.163,,2303.683,3214,1810,24.43341,0.938772,-0.34454,"""sedimentary""","""Sandy and stony colluvium deri…",1.527315,3.281195,2.148342,0.161187,0.360557,14.811401,235.650058,2187.945364,92.951244,20.893751,448.528752,823.928091,52.166514,16.124195,44.578658,40.784729,2.845446,0.0,43.1,2.245
"""06020600""",2026-01-19 22:00:00 UTC,45.241869,-112.111239,43.1,43.6,42.6,2.245,2.25,2.24,4,0.0,0.557,0.894427,62.221676,"""Ruby River below reservoir nea…",10020003,1541.163,,2303.683,3214,1810,24.43341,0.938772,-0.34454,"""sedimentary""","""Sandy and stony colluvium deri…",1.527315,3.281195,2.148342,0.161187,0.360557,14.811401,235.650058,2187.945364,92.951244,20.893751,448.528752,823.928091,52.166514,16.124195,44.578658,40.784729,2.845446,0.0,43.1,2.245


In [6]:
# Check if observation_hour + site_id combination is unique (primary key)
total_rows = len(df)
unique_combinations = df.select([
    pl.col("observation_hour"),
    pl.col("site_id")
]).n_unique()

print(f"Total rows: {total_rows:,}")
print(f"Unique combinations (observation_hour, site_id): {unique_combinations:,}")
print(f"Is unique (acts as primary key): {total_rows == unique_combinations}")

# Check for duplicates if any exist
if total_rows != unique_combinations:
    duplicates = df.filter(
        pl.struct(["observation_hour", "site_id"]).is_duplicated()
    )
    print(f"\nFound {len(duplicates):,} duplicate rows:")
    print(duplicates.select(["observation_hour", "site_id"]).head(10))
else:
    print("\n✓ No duplicates found - (observation_hour, site_id) is a valid primary key!")


Total rows: 4,616
Unique combinations (observation_hour, site_id): 4,136
Is unique (acts as primary key): False

Found 720 duplicate rows:
shape: (10, 2)
┌─────────────────────────┬──────────┐
│ observation_hour        ┆ site_id  │
│ ---                     ┆ ---      │
│ datetime[μs, UTC]       ┆ str      │
╞═════════════════════════╪══════════╡
│ 2026-01-20 00:00:00 UTC ┆ 06026210 │
│ 2026-01-20 00:00:00 UTC ┆ 06026210 │
│ 2026-01-20 00:00:00 UTC ┆ 06026210 │
│ 2026-01-20 01:00:00 UTC ┆ 06026210 │
│ 2026-01-20 01:00:00 UTC ┆ 06026210 │
│ 2026-01-20 01:00:00 UTC ┆ 06026210 │
│ 2026-01-20 02:00:00 UTC ┆ 06026210 │
│ 2026-01-20 02:00:00 UTC ┆ 06026210 │
│ 2026-01-20 02:00:00 UTC ┆ 06026210 │
│ 2026-01-20 03:00:00 UTC ┆ 06026210 │
└─────────────────────────┴──────────┘


In [7]:
df.describe()

statistic,site_id,observation_hour,latitude,longitude,streamflow_cfs_mean,streamflow_cfs_max,streamflow_cfs_min,gage_height_ft_mean,gage_height_ft_max,gage_height_ft_min,observation_count,precipitation_mm,temperature_c,wind_speed_ms,humidity_pct,station_name,huc_code,drainage_area_sq_km,is_reference_hcdn2009,elev_mean_m,elev_max_m,elev_min_m,SLOPE_PCT,ASPECT_NORTHNESS,ASPECT_EASTNESS,geology_class_reedbush,geology_desc_hunt,p_mean,pet_mean,aridity_index,p_seasonality,frac_snow,high_prec_freq,low_prec_freq,hydroatlas_elev_m,hydroatlas_slope_deg,hydroatlas_temp_mean_c,hydroatlas_precip_mm_yr,hydroatlas_pet_mm_yr,hydroatlas_aridity,hydroatlas_clay_pct,hydroatlas_sand_pct,hydroatlas_forest_pct,hydroatlas_crop_pct,hydroatlas_urban_pct,streamflow_cfs_target_1h,gage_height_ft_target_1h
str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""4616""","""4616""",4616.0,4616.0,4608.0,4608.0,4608.0,4616.0,4616.0,4616.0,4616.0,4616.0,4616.0,4616.0,4616.0,"""4616""",4616.0,3896.0,"""0""",3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,"""3896""","""3896""",3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,3896.0,4590.0,4598.0
"""null_count""","""0""","""0""",0.0,0.0,8.0,8.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""0""",0.0,720.0,"""4616""",720.0,720.0,720.0,720.0,720.0,720.0,"""720""","""720""",720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,720.0,26.0,18.0
"""mean""",,"""2026-01-22 22:54:36.343154+00:…",45.563356,-112.509034,-947913.488379,-947913.475911,-947913.501497,-732667.48251,-732667.469162,-732667.493377,4.402946,0.004073,-7.756198,1.405346,62.538898,,10019000.0,6416.077049,,2191.666357,3304.246407,1634.772074,17.104779,0.339204,0.530287,,,1.591449,3.290011,2.147011,0.080167,0.374301,14.411739,234.426448,2162.151417,80.753533,18.384734,410.008634,833.242222,44.315275,14.042315,43.223678,47.258411,4.649996,0.0,-947927.105621,-732925.868015
"""std""",,,0.851097,0.401224,222228.501367,222228.554561,222228.445396,442615.310795,442615.332895,442615.292802,1.788886,0.031652,5.288546,1.212223,14.54333,,2220.632613,4669.97758,,59.147042,98.645829,192.774953,3.245334,0.431375,0.646379,,,0.377175,0.036787,0.358802,0.125592,0.048089,0.685538,11.556315,50.622589,19.595306,5.269352,56.767229,15.045359,9.427164,3.76977,10.411639,16.585862,1.428994,0.0,222201.139983,442479.544181
"""min""","""05018500""","""2026-01-18 00:00:00+00:00""",44.615792,-113.456942,-999999.0,-999999.0,-999999.0,-999999.0,-999999.0,-999999.0,1.0,0.0,-25.602999,0.0,21.222441,"""Beaverhead River at Barretts M…",10010002.0,98.6499,,2119.105,3084.0,1394.0,11.51282,-0.75227,-0.860386,"""gneiss""","""Fan gravels""",1.259084,3.211479,1.115863,-0.279911,0.315876,12.946097,201.56385,2023.0,28.050842,8.0,361.697554,803.0,12.130094,4.043365,11.624673,14.151776,1.010841,0.0,-999999.0,-999999.0
"""25%""",,"""2026-01-20 10:00:00+00:00""",45.218383,-112.701725,-999999.0,-999999.0,-999999.0,-999999.0,-999999.0,-999999.0,4.0,0.0,-11.45,0.570088,52.649288,,10020002.0,1541.163,,2154.561,3214.0,1479.0,15.20178,0.079573,0.242091,,,1.38289,3.272939,2.006891,0.0259,0.337893,13.923161,230.653708,2155.704912,72.523816,14.787101,371.974269,822.533866,43.74915,13.80914,44.578658,37.532593,4.913442,0.0,-999999.0,-999999.0
"""50%""",,"""2026-01-22 23:00:00+00:00""",45.440094,-112.452831,-999999.0,-999999.0,-999999.0,-999999.0,-999999.0,-999999.0,4.0,0.0,-7.943,1.106797,62.515141,,10020003.0,6906.721,,2177.793,3368.0,1605.0,17.15961,0.243609,0.838195,,,1.553551,3.277489,2.109676,0.058404,0.374291,14.544929,231.652978,2169.98098,85.509702,15.998913,385.361554,826.261048,47.345622,13.897497,46.974271,42.586856,4.95444,0.0,-999999.0,-999999.0
"""75%""",,"""2026-01-25 11:00:00+00:00""",45.613283,-112.329397,-999999.0,-999999.0,-999999.0,1.3975,1.41,1.38,4.0,0.0,-3.9,1.856071,72.245773,,10020004.0,7225.151,,2234.878,3369.0,1810.0,17.70469,0.616538,0.996829,,,1.621935,3.324942,2.404343,0.164652,0.392521,14.811401,238.758898,2191.44004,95.677612,21.415237,433.76029,849.0,48.64124,16.124195,49.318357,65.398606,4.997263,0.0,-999999.0,1.395
"""max""","""06027600""","""2026-01-27 23:00:00+00:00""",48.946967,-111.625669,43.6,43.6,43.6,7.835,12.57,6.33,12.0,1.1,6.45,9.550523,100.0,"""St. Mary Canal at St. Mary Cro…",10020005.0,19721.99,,2307.653,3369.0,2023.0,24.43341,0.970253,0.999747,"""sedimentary""","""Sandy and stony colluvium deri…",2.949739,3.349704,2.656656,0.231338,0.522342,15.788465,251.882643,2250.441649,96.116134,29.0,592.0,859.826053,52.490776,17.995142,49.633918,69.487486,8.813127,0.0,43.6,7.375
