In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error

sns.set_theme(style='whitegrid')

# Load data
DATA_PATH = '../data/cherry_blossom_data.csv'
df = pd.read_csv(DATA_PATH)
numeric_cols = ['days_dec_ge_45','days_jan_ge_45','days_feb_ge_45','prec_winter','mean_temp_winter','surface_temp_chg','climate_incidents']
for c in numeric_cols + ['bloom_day','year']:
    df[c] = pd.to_numeric(df[c], errors='coerce')
df = df.sort_values('year').drop_duplicates('year').dropna(subset=['year','bloom_day']).reset_index(drop=True)
df['time_idx'] = (df['year'] - df['year'].min()).astype(int)
df['series'] = 'cherry_blossom'

# Splits
train_val_df = df[(df['year'] >= 1921) & (df['year'] <= 2015)].sort_values('time_idx').reset_index(drop=True)
train_n = int(np.floor(len(train_val_df)*0.8))
train_df = train_val_df.iloc[:train_n].reset_index(drop=True)
val_df = train_val_df.iloc[train_n:].reset_index(drop=True)
test_df = df[(df['year'] >= 2016) & (df['year'] <= 2025)].sort_values('time_idx').reset_index(drop=True)

print(f'Train rows: {len(train_df)}, Val rows: {len(val_df)}, Test rows: {len(test_df)}')

# For TimeGPT, we need time series data; use the full historical data up to test
# TimeGPT can forecast from historical data
# Install nixtla if needed: pip install nixtla
try:
    from nixtla import NixtlaClient
except ImportError:
    print("Nixtla not installed; install with: pip install nixtla")
    raise

# Initialize TimeGPT client (requires API key; set NIXTLA_API_KEY env var)
client = NixtlaClient()

# Prepare data for TimeGPT: needs 'ds' (date) and 'y' (target)
timegpt_df = df[['year', 'bloom_day']].copy()
timegpt_df['ds'] = pd.to_datetime(timegpt_df['year'], format='%Y')
timegpt_df = timegpt_df[['ds', 'bloom_day']].rename(columns={'bloom_day': 'y'})

# Split for training: use up to 2015 for training, 2016-2025 for test
train_timegpt = timegpt_df[timegpt_df['ds'].dt.year <= 2015]
test_timegpt = timegpt_df[timegpt_df['ds'].dt.year >= 2016]

# Forecast the test period using TimeGPT
# TimeGPT can do multi-step ahead, but for simplicity, we'll do one-step ahead
forecasts = []
for i in range(len(test_timegpt)):
    # Use data up to the point before the test point
    hist = timegpt_df.iloc[:len(train_timegpt) + i]
    # Forecast the next step
    fcst = client.forecast(hist, h=1, level=[80])  # h=1 for one-step
    pred = fcst['TimeGPT'].iloc[0]
    forecasts.append(pred)

test_df['timegpt_pred'] = forecasts

# Evaluate
y_test = test_df['bloom_day'].values
y_pred = test_df['timegpt_pred'].values
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_mae = mean_absolute_error(y_test, y_pred)
test_wmape = np.sum(np.abs(y_test - y_pred)) / np.sum(np.abs(y_test))

print('\nTest set performance (2016-2025)')
print('Rows:', len(test_df))
print('RMSE: {:.3f}'.format(test_rmse))
print('MAE: {:.3f}'.format(test_mae))
print('WMAPE: {:.3%}'.format(test_wmape))

# Plot
plt.figure(figsize=(10,5))
plt.plot(df['year'], df['bloom_day'].values, marker='o', color='tab:blue', label='Actual (since 1921)')
plt.plot(test_df['year'], test_df['timegpt_pred'].values, marker='o', color='tab:red', label='TimeGPT Forecast (2016-2025)')
plt.axvline(2016, color='k', linestyle='--', linewidth=1, label='Forecast start (2016)')
plt.xlabel('Year')
plt.ylabel('Bloom Day')
plt.title('Actual vs TimeGPT Forecasts (2016-2025)')
plt.legend()
plt.grid(True)
plt.show()

# Save forecasts
out_csv = 'notebooks/timegpt_test_forecasts.csv'
test_df[['year','bloom_day','timegpt_pred']].to_csv(out_csv, index=False)
print('Saved test forecasts to', out_csv)


Train rows: 76, Val rows: 19, Test rows: 10
Nixtla not installed; install with: pip install nixtla


ImportError: cannot import name 'Sentinel' from 'typing_extensions' (/Users/ecilteodoro/miniconda3/envs/cherry-blossom-env/lib/python3.12/site-packages/typing_extensions.py)