In [None]:
!pip install pytorch-forecasting pytorch-lightning
!pip install pytorch-lightning==1.9.5




In [3]:
# Final corrected XGBoost script without overwriting total_cases

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer

# Load data
train_features = pd.read_csv("dengue_features_train.csv")
train_labels   = pd.read_csv("dengue_labels_train.csv")
test_features  = pd.read_csv("dengue_features_test.csv")
submission     = pd.read_csv("submission_format.csv")

# Merge and combine
train = pd.merge(train_features, train_labels, on=["city", "year", "weekofyear"])
test  = test_features.copy()
test["total_cases"] = np.nan
full = pd.concat([train, test], ignore_index=True)

# Feature engineering
for lag in [1,2,3]:
    full[f"cases_lag_{lag}"] = full.groupby("city")["total_cases"].shift(lag)
    full[f"temp_lag_{lag}"]  = full.groupby("city")["reanalysis_avg_temp_k"].shift(lag)

full["humidity_roll3"]     = full.groupby("city")["reanalysis_specific_humidity_g_per_kg"]\
                                  .transform(lambda x: x.rolling(3).mean())
full["sin_week"]           = np.sin(2*np.pi * full["weekofyear"]/52)
full["cos_week"]           = np.cos(2*np.pi * full["weekofyear"]/52)
full["temp_high"]          = (full["reanalysis_avg_temp_k"] > 295).astype(int)
full["humid_ndvi"]         = full["reanalysis_specific_humidity_g_per_kg"] * full["ndvi_ne"]
full["case_growth"]        = full.groupby("city")["total_cases"].diff()

# Preserve train/test split mask
train_mask = full["total_cases"].notna()

# Fill missing values in feature columns only (do not overwrite total_cases)
drop_cols = ["city","year","weekofyear","week_start_date","total_cases"]
feature_cols = [c for c in full.select_dtypes(include="number").columns if c not in drop_cols]

full.loc[:, feature_cols] = (
    full[feature_cols]
        .fillna(method="ffill")
        .fillna(method="bfill")
)

X_full = full[feature_cols]
y_full = full["total_cases"]
imputer = SimpleImputer(strategy="mean")
submission_pred = submission.copy()

# Train & predict for each city
for city in ["sj","iq"]:
    mask_tr = train_mask & (full["city"] == city)
    mask_te = (~train_mask) & (full["city"] == city)

    X_tr = X_full.loc[mask_tr]
    y_tr = y_full.loc[mask_tr]
    X_te = X_full.loc[mask_te]

    X_tr_imp = imputer.fit_transform(X_tr)
    X_te_imp = imputer.transform(X_te)

    # Train XGBoost
    model = XGBRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        objective="reg:squarederror",
        random_state=42
    )
    model.fit(X_tr_imp, y_tr)

    # Predict
    preds = model.predict(X_te_imp)
    preds = np.clip(np.round(preds), 0, None).astype(int)

# Save predictions
    submission_pred.loc[submission_pred.city == city, "total_cases"] = preds

submission_pred.to_csv("dengue_xgb_predictions.csv", index=False)
print("Saved: dengue_xgb_predictions.csv")


  .fillna(method="ffill")
  .fillna(method="bfill")


Saved: dengue_xgb_predictions.csv


In [4]:
# Complete Updated Ensemble Script with Randomized Hyperparameter Search and Correct FL Forecasting

# !pip install xgboost statsmodels scikit-learn

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.statespace.sarimax import SARIMAX

train_features = pd.read_csv("dengue_features_train.csv")
train_labels   = pd.read_csv("dengue_labels_train.csv")
test_features  = pd.read_csv("dengue_features_test.csv")
submission     = pd.read_csv("submission_format.csv")

# Merge training features and labels
train = pd.merge(train_features, train_labels, on=["city","year","weekofyear"])
test  = test_features.copy()
test["total_cases"] = np.nan

# Combine for feature engineering
full = pd.concat([train, test], ignore_index=True)

# Feature Engineering
# Lag features
for lag in [1,2,3]:
    full[f"cases_lag_{lag}"] = full.groupby("city")["total_cases"].shift(lag)
    full[f"temp_lag_{lag}"]  = full.groupby("city")["reanalysis_avg_temp_k"].shift(lag)

# Rolling mean of humidity
full["humidity_roll3"] = full.groupby("city")["reanalysis_specific_humidity_g_per_kg"]\
                             .transform(lambda x: x.rolling(3).mean())

# Seasonality
full["sin_week"] = np.sin(2 * np.pi * full["weekofyear"]/52)
full["cos_week"] = np.cos(2 * np.pi * full["weekofyear"]/52)

# Threshold flag and interaction
full["temp_high"] = (full["reanalysis_avg_temp_k"] > 295).astype(int)
full["humid_ndvi"] = full["reanalysis_specific_humidity_g_per_kg"] * full["ndvi_ne"]

# Case growth
full["case_growth"] = full.groupby("city")["total_cases"].diff()

# Fill feature NaNs
feature_cols = [c for c in full.select_dtypes(include="number").columns
                if c not in ["year","weekofyear","total_cases"]]
full[feature_cols] = full[feature_cols].ffill().bfill()

# Prepare for modeling
# Masks and matrices
train_mask = full["total_cases"].notna()
X_full = full[feature_cols]
y_full = full["total_cases"]

imputer = SimpleImputer(strategy="mean")

# Hyperparameter search setup
param_dist = {
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6],
    "learning_rate": [0.01, 0.05],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}
tscv = TimeSeriesSplit(n_splits=3)
xgb = XGBRegressor(objective="reg:squarederror", random_state=42, n_jobs=1, verbosity=0)

# Randomized search dictionary
best_models = {}
for city in ["sj", "iq"]:
    # Training subset
    mask_tr = train_mask & (full["city"] == city)
    X_tr = X_full.loc[mask_tr]
    y_tr = y_full.loc[mask_tr]
    # Impute
    X_tr_imp = imputer.fit_transform(X_tr)
    # Search
    rs = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_dist,
        n_iter=20,
        cv=tscv,
        scoring="neg_mean_squared_error",
        n_jobs=-1,
        random_state=42,
        verbose=0
    )
    rs.fit(X_tr_imp, y_tr)
    best_models[city] = rs.best_estimator_
    print(f"Best XGB params for {city}: {rs.best_params_}")

# SARIMAX forecasting with correct exogenous shaping
sarimax_preds = {}
for city in ["sj", "iq"]:
    mask_tr = train_mask & (full["city"] == city)
    mask_te = (~train_mask) & (full["city"] == city)
    # Exogenous variable (humidity)
    exog_tr = full.loc[mask_tr, ["reanalysis_specific_humidity_g_per_kg"]]
    exog_te = full.loc[mask_te, ["reanalysis_specific_humidity_g_per_kg"]]
    y_tr = full.loc[mask_tr, "total_cases"]
    # Fit SARIMAX
    sar = SARIMAX(
        endog=y_tr,
        exog=exog_tr,
        order=(1,1,1),
        seasonal_order=(1,1,1,52),
        enforce_stationarity=False,
        enforce_invertibility=False
    ).fit(disp=False)
    # Forecast only test period
    n_forecast = len(exog_te)
    sar_pred = sar.predict(
        start=len(y_tr),
        end=len(y_tr) + n_forecast - 1,
        exog=exog_te
    )
    sarimax_preds[city] = sar_pred.values

# Ensemble predictions and save
submission_pred = submission.copy()
for city in ["sj", "iq"]:
    mask_te = (~train_mask) & (full["city"] == city)
    X_te = X_full.loc[mask_te]
    X_te_imp = imputer.transform(X_te)
    xgb_pred = best_models[city].predict(X_te_imp)
    sar_pred = sarimax_preds[city]
    ensemble = np.round((xgb_pred + sar_pred) / 2)
    ensemble = np.clip(ensemble, 0, None).astype(int)
    submission_pred.loc[submission_pred.city == city, "total_cases"] = ensemble


submission_pred.to_csv("dengue_ensemble_predictions.csv", index=False)
print("Saved: dengue_ensemble_predictions.csv")



Best XGB params for sj: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.05, 'colsample_bytree': 1.0}
Best XGB params for iq: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.05, 'colsample_bytree': 1.0}


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Saved: dengue_ensemble_predictions.csv


  return get_prediction_index(
  return get_prediction_index(
