In [1]:
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LassoCV
from datetime import datetime, timedelta

In [None]:
BLOOM_DIR = "data/blossoms"

NOAA_DIR = "data/noaa"

SPECIES = {
    "northamerica" : "Prunus x yedoensis",
    "switzerland": "Prunus avium",
    "japan": "Prunus x jamasakura",
    "korea" : "Prunus x yedoensis",
}


In [9]:
jp = pd.read_csv("data/japan.csv")

print(jp.location.unique())

<StringArray>
[       'Japan/Wakkanai',           'Japan/Rumoi',       'Japan/Asahikawa',
        'Japan/Abashiri',         'Japan/Sapporo',       'Japan/Iwamizawa',
         'Japan/Obihiro',         'Japan/Kushiro',          'Japan/Nemuro',
         'Japan/Muroran',
 ...
            'Japan/Naze', 'Japan/Naze/Funchatoge',    'Japan/Yonagunijima',
    'Japan/Iriomotejima',    'Japan/Ishigakijima',      'Japan/Miyakojima',
        'Japan/Kumejima',            'Japan/Naha',            'Japan/Nago',
 'Japan/Minamidaitojima']
Length: 103, dtype: str


## Set the Model Parameters


In [4]:
MODEL_OUTPUT_DIR = os.path.join("data", "model_outputs")
TRAIN_LOCATIONS = ["kyoto", "washingtondc", "liestal"]
MIN_YEAR = 1973
CHILL_TEMP_C = 4.3
FORCING_BASE_TEMP_C = 5.0

os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

In [5]:
def load_data():
    print("Loading bloom historical data...")
    bloom_dfs = []
    for f in BLOOM_FILES:
        if os.path.exists(f):
            bloom_dfs.append(pd.read_csv(f))
    
    if not bloom_dfs:
        raise FileNotFoundError("No cherry bloom CSV files found in 'data/' directory.")
        
    bloom_df = pd.concat(bloom_dfs, ignore_index=True)
    # Ensure standard column naming
    bloom_df = bloom_df.rename(columns={"bloom_date": "bloom_date", "bloom_doy": "bloom_doy"})
    bloom_df['bloom_date'] = pd.to_datetime(bloom_df['bloom_date'])
    bloom_df['year'] = bloom_df['year'].astype(int)
    
    print("Loading station climate data...")
    climate_dfs = []
    
    for f in CLIMATE_DATA_FILES:
        if os.path.exists(f):
            # Read the station CSV
            df = pd.read_csv(f)
            
            # Identify the location based on the filename
            filename = os.path.basename(f).replace(".csv", "")
            loc_name = STATION_MAPPING.get(filename, filename)
            
            # Tag the dataframe with the correct location string
            df['location'] = loc_name
            climate_dfs.append(df)
        else:
            print(f"Warning: Could not find {f}")

    if not climate_dfs:
         raise FileNotFoundError("No station climate CSV files were found.")

    climate_df = pd.concat(climate_dfs, ignore_index=True)
    
    # Standardize NOAA GHCN-Daily columns
    # We use pd.to_numeric with errors='coerce' to safely handle any missing data ("" or "NA")
    climate_df['date'] = pd.to_datetime(climate_df['DATE'])
    climate_df['year'] = climate_df['date'].dt.year
    
    # NOAA data is typically recorded in tenths of degrees (C) and tenths of millimeters
    climate_df['tmax_c'] = pd.to_numeric(climate_df['TMAX'], errors='coerce') / 10.0
    climate_df['tmin_c'] = pd.to_numeric(climate_df['TMIN'], errors='coerce') / 10.0
    climate_df['tmean_c'] = (climate_df['tmax_c'] + climate_df['tmin_c']) / 2.0
    
    # If precipitation doesn't exist in a file, create it as NaN to prevent KeyError
    if 'PRCP' in climate_df.columns:
        climate_df['prcp_mm'] = pd.to_numeric(climate_df['PRCP'], errors='coerce') / 10.0
    else:
        climate_df['prcp_mm'] = np.nan
        
    # Calculate daily thermal growth (GDD)
    climate_df['forcing_gdd'] = np.maximum(climate_df['tmean_c'] - FORCING_BASE_TEMP_C, 0)
    
    return bloom_df, climate_df

In [6]:
bloom_df, climate_df = load_data()

Loading bloom historical data...
Loading station climate data...


  df = pd.read_csv(f)
  df = pd.read_csv(f)
  df = pd.read_csv(f)


In [7]:
bloom_df

Unnamed: 0,location,lat,long,alt,year,bloom_date,bloom_doy
0,washingtondc,38.88535,-77.038628,0.0,1921,1921-03-20,79
1,washingtondc,38.88535,-77.038628,0.0,1922,1922-04-07,97
2,washingtondc,38.88535,-77.038628,0.0,1923,1923-04-09,99
3,washingtondc,38.88535,-77.038628,0.0,1924,1924-04-13,104
4,washingtondc,38.88535,-77.038628,0.0,1925,1925-03-27,86
...,...,...,...,...,...,...,...
1075,vancouver,49.22370,-123.163600,24.0,2023,2023-04-07,96
1076,vancouver,49.22370,-123.163600,24.0,2024,2024-03-23,83
1077,vancouver,49.22370,-123.163600,24.0,2025,2025-04-03,93
1078,newyorkcity,40.73040,-73.998090,8.5,2024,2024-03-28,88
