In [9]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_absolute_error

In [None]:
def add_season(df, period=52):
    """Return a copy of ``df`` with cyclical week-of-year features added.

    The week number is mapped onto the unit circle so that the end of one
    cycle sits next to the start of the next one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``weekofyear`` column. The input is not modified.
    period : float, default 52
        Number of weeks that make up one full cycle. With the default, a week
        number of 53 lands on the same phase as week 1.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``sin_week`` and ``cos_week`` columns set.

    Raises
    ------
    ValueError
        If ``period`` is not strictly positive.
    KeyError
        If ``df`` has no ``weekofyear`` column.
    """
    if period <= 0:
        raise ValueError("period must be a positive number of weeks")

    out = df.copy()
    # Same operation order as 2*pi*week/period so the default output is unchanged.
    angle = 2 * np.pi * out["weekofyear"] / period
    out["sin_week"] = np.sin(angle)
    out["cos_week"] = np.cos(angle)
    return out

def interpolate_citywise(df):
    """Fill gaps in every numeric column by linear interpolation, one city at a time.

    The frame is first ordered by ``city``, ``year`` and ``weekofyear``; the
    interpolation then runs over each city's rows only, in both directions,
    so leading and trailing gaps are filled as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``city``, ``year`` and ``weekofyear``. Not modified.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df`` (original index labels kept) with the numeric
        columns interpolated per city.
    """
    ordered = df.sort_values(["city", "year", "weekofyear"]).copy()
    numeric = ordered.select_dtypes(include=["number"]).columns

    for name in ordered["city"].unique():
        in_city = ordered["city"] == name
        # Interpolate only this city's slice so values never bleed between cities.
        filled = ordered.loc[in_city, numeric].interpolate(
            method="linear", limit_direction="both"
        )
        ordered.loc[in_city, numeric] = filled

    return ordered

def time_split(city_df, val_frac=0.2):
    """Split one city's rows chronologically into (train, validation).

    Rows are ordered by ``year`` then ``weekofyear``; the earliest
    ``int((1 - val_frac) * n)`` rows form the training part and the
    remaining rows form the validation part.

    Parameters
    ----------
    city_df : pandas.DataFrame
        Rows containing ``year`` and ``weekofyear``. Not modified.
    val_frac : float, default 0.2
        Fraction of rows held out at the end of the timeline.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_part, validation_part)``.
    """
    chronological = city_df.sort_values(["year", "weekofyear"]).copy()
    boundary = int((1 - val_frac) * len(chronological))
    earlier = chronological.iloc[:boundary]
    later = chronological.iloc[boundary:]
    return earlier, later

In [11]:
def _city_rolling(df, values, window, min_periods=None):
    """Open a rolling window over ``values`` that restarts for every city of ``df``."""
    return values.groupby(df["city"]).rolling(window, min_periods=min_periods)


def _drop_city_level(grouped_result):
    """Remove the leading ``city`` index level so the result aligns with the frame's rows."""
    return grouped_result.reset_index(level=0, drop=True)


def make_train_df(features_train_path="dengue_features_train.csv",
                  labels_train_path="dengue_labels_train.csv"):
    """Build the training frame: features, labels and lag/rolling columns.

    Steps: read both CSVs, add the seasonal sin/cos columns, interpolate the
    numeric features per city, inner-join the labels on
    (city, year, weekofyear), then derive per-city lag and rolling columns.

    Every rolling statistic is computed strictly inside one city. The target
    rolling mean/std is taken over the *previous* four weeks (shift by one
    before rolling) so the current week's label never enters its own feature.

    Parameters
    ----------
    features_train_path : str
        CSV with the training features.
    labels_train_path : str
        CSV with ``city``, ``year``, ``weekofyear`` and ``total_cases``.

    Returns
    -------
    pandas.DataFrame
        Sorted by city/year/weekofyear with a fresh 0..n-1 index. Added columns:
        ``total_cases_lag_1..4``, ``total_cases_roll_mean_4``,
        ``total_cases_roll_std_4``, ``reanalysis_avg_temp_k_lag_1``,
        ``precipitation_amt_mm_lag_1``, ``temp_roll_mean_4``,
        ``precip_roll_sum_4``, ``temp_roll_mean_53``, ``precip_roll_sum_53``
        and ``humidity_roll_mean_53``. The first rows of each city hold NaN in
        the lag columns and in the target rolling columns.
    """
    keys = ["city", "year", "weekofyear"]

    Xtr = pd.read_csv(features_train_path)
    ytr = pd.read_csv(labels_train_path)

    # Seasonality first, then fill numeric gaps inside each city.
    Xtr = interpolate_citywise(add_season(Xtr))

    df = Xtr.merge(ytr, on=keys).sort_values(keys).reset_index(drop=True)

    # ---- Target lags (1-4)
    for lag in (1, 2, 3, 4):
        df[f"total_cases_lag_{lag}"] = df.groupby("city")["total_cases"].shift(lag)

    # ---- Target rolling (4) over the previous weeks only.
    # The window is grouped by city explicitly; rolling over the plain shifted
    # series would let windows span two cities and stay correct only because
    # the shift's NaN happens to sit on the boundary.
    previous_cases = df.groupby("city")["total_cases"].shift(1)
    df["total_cases_roll_mean_4"] = _drop_city_level(
        _city_rolling(df, previous_cases, 4).mean()
    )
    df["total_cases_roll_std_4"] = _drop_city_level(
        _city_rolling(df, previous_cases, 4).std()
    )

    # ---- Env lag (1)
    for col in ("reanalysis_avg_temp_k", "precipitation_amt_mm"):
        df[f"{col}_lag_1"] = df.groupby("city")[col].shift(1)

    # ---- Env rolling (4 and 53 weeks); these windows include the current week
    # and use min_periods=1, so they are defined from the first row of a city.
    env_windows = [
        ("temp_roll_mean_4", "reanalysis_avg_temp_k", 4, "mean"),
        ("precip_roll_sum_4", "precipitation_amt_mm", 4, "sum"),
        ("temp_roll_mean_53", "reanalysis_avg_temp_k", 53, "mean"),
        ("precip_roll_sum_53", "precipitation_amt_mm", 53, "sum"),
        ("humidity_roll_mean_53", "reanalysis_relative_humidity_percent", 53, "mean"),
    ]
    for new_col, source_col, window, stat in env_windows:
        roller = _city_rolling(df, df[source_col], window, min_periods=1)
        df[new_col] = _drop_city_level(getattr(roller, stat)())

    return df

In [12]:
def make_test_df(features_train_path="dengue_features_train.csv",
                 features_test_path="dengue_features_test.csv"):
    """Prepare train and test features, adding env rolling columns to the test set.

    Both CSVs get the seasonal columns and per-city interpolation. For each of
    the cities ``"sj"`` and ``"iq"`` the test rows then receive rolling
    statistics whose windows are seeded with the last ``window - 1`` training
    rows of that city, so the first test weeks already see a history.

    Parameters
    ----------
    features_train_path : str
        CSV with the training features.
    features_test_path : str
        CSV with the test features.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(Xtr, Xte2)``: the processed training features, and the test features
        sorted by city/year/weekofyear with ``temp_roll_mean_4``,
        ``precip_roll_sum_4``, ``temp_roll_mean_53``, ``precip_roll_sum_53``
        and ``humidity_roll_mean_53`` added.
    """
    Xtr = pd.read_csv(features_train_path)
    Xte = pd.read_csv(features_test_path)

    # seasonality, then interpolation, for both frames
    Xtr = interpolate_citywise(add_season(Xtr))
    Xte = interpolate_citywise(add_season(Xte))

    Xte = Xte.sort_values(["city", "year", "weekofyear"]).copy()

    # (output column, source column, window length, statistic)
    rolling_specs = (
        ("temp_roll_mean_4", "reanalysis_avg_temp_k", 4, "mean"),
        ("precip_roll_sum_4", "precipitation_amt_mm", 4, "sum"),
        ("temp_roll_mean_53", "reanalysis_avg_temp_k", 53, "mean"),
        ("precip_roll_sum_53", "precipitation_amt_mm", 53, "sum"),
        ("humidity_roll_mean_53", "reanalysis_relative_humidity_percent", 53, "mean"),
    )

    def seeded_roll(train_city, test_city, source_col, window, stat):
        # Prepend the last (window - 1) train values, roll, then keep only the
        # positions that belong to the test rows.
        seed = train_city.tail(window - 1)[source_col]
        stitched = pd.concat([seed, test_city[source_col]], ignore_index=True)
        rolled = getattr(stitched.rolling(window, min_periods=1), stat)()
        return rolled.iloc[len(seed):].values

    per_city = []
    for city in ["sj", "iq"]:
        train_city = Xtr[Xtr["city"] == city].sort_values(["year", "weekofyear"]).copy()
        test_city = Xte[Xte["city"] == city].sort_values(["year", "weekofyear"]).copy()
        test_city = test_city.reset_index(drop=True)

        for new_col, source_col, window, stat in rolling_specs:
            test_city[new_col] = seeded_roll(train_city, test_city, source_col, window, stat)

        per_city.append(test_city)

    Xte2 = pd.concat(per_city, ignore_index=True).sort_values(["city", "year", "weekofyear"])
    return Xtr, Xte2

In [13]:
def get_xgb():
    """Create an unfitted ``XGBRegressor`` with this notebook's fixed hyper-parameters.

    xgboost is imported inside the function, so the package is only required
    when a model is actually built.

    Returns
    -------
    xgboost.XGBRegressor
        A new, untrained regressor using the squared-error objective.
    """
    from xgboost import XGBRegressor

    hyperparams = {
        "n_estimators": 1500,
        "learning_rate": 0.03,
        "max_depth": 5,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "min_child_weight": 3,
        "reg_lambda": 1.0,
        "objective": "reg:squarederror",
        "n_jobs": -1,
    }
    return XGBRegressor(**hyperparams)


In [14]:
# Columns taken directly from the feature CSVs plus the seasonal sin/cos terms.
BASE_FEATURES = (
    ["year", "sin_week", "cos_week"]
    + [f"ndvi_{quadrant}" for quadrant in ("ne", "nw", "se", "sw")]
    + [
        "precipitation_amt_mm",
        "reanalysis_precip_amt_kg_per_m2",
        "reanalysis_sat_precip_amt_mm",
        "station_precip_mm",
        "reanalysis_air_temp_k",
        "reanalysis_avg_temp_k",
        "reanalysis_min_air_temp_k",
        "reanalysis_max_air_temp_k",
        "station_avg_temp_c",
        "station_min_temp_c",
        "station_max_temp_c",
        "reanalysis_relative_humidity_percent",
        "reanalysis_specific_humidity_g_per_kg",
        "reanalysis_dew_point_temp_k",
        "reanalysis_tdtr_k",
        "station_diur_temp_rng_c",
    ]
)

# Lags 1-4 of the target and the rolling mean/std over the previous four weeks.
TARGET_LAG_ROLL = [f"total_cases_lag_{lag}" for lag in (1, 2, 3, 4)] + [
    "total_cases_roll_mean_4",
    "total_cases_roll_std_4",
]

# One-week lags of two environmental columns.
ENV_LAG = [
    "reanalysis_avg_temp_k_lag_1",
    "precipitation_amt_mm_lag_1",
]

# Rolling environmental statistics over 4 and 53 weeks.
ENV_ROLL = [
    "temp_roll_mean_4",
    "precip_roll_sum_4",
    "temp_roll_mean_53",
    "precip_roll_sum_53",
    "humidity_roll_mean_53",
]

# Final model input columns, in the order the models are trained on.
FEATURES = BASE_FEATURES + TARGET_LAG_ROLL + ENV_LAG + ENV_ROLL
TARGET = "total_cases"

In [15]:
# Train one XGBoost regressor per city and report a hold-out MAE for each.
# Results are kept at notebook level: `city_models` maps city -> fitted model,
# `val_report_df` holds one row of validation metrics per city.
train_df = make_train_df()
city_models = {}
val_report = []

for city in ["sj", "iq"]:
    cdf = train_df[train_df["city"] == city].copy()

    # Need target-lags to exist: drop the rows (the first weeks of the city)
    # whose target lag/rolling columns or target are still NaN.
    needed = TARGET_LAG_ROLL + [TARGET]
    cdf = cdf.dropna(subset=needed)

    # Chronological split: the last 20% of the weeks are held out.
    tr, va = time_split(cdf, val_frac=0.2)

    # Force every feature to float; values that cannot be parsed become NaN.
    Xtr = tr[FEATURES].apply(pd.to_numeric, errors="coerce").astype(float)
    ytr = tr[TARGET].astype(float)

    Xva = va[FEATURES].apply(pd.to_numeric, errors="coerce").astype(float)
    yva = va[TARGET].astype(float)

    # If any NaN remains from coercion, fill with the training-column means.
    # The validation part is filled with the training means too, so no
    # validation statistics are used for imputation.
    Xtr = Xtr.fillna(Xtr.mean())
    Xva = Xva.fillna(Xtr.mean())

    model = get_xgb()
    model.fit(Xtr, ytr)

    # NOTE: the lag features of the validation rows come from observed
    # total_cases, so this MAE is a one-step-ahead score.
    pred = model.predict(Xva)
    mae = mean_absolute_error(yva, pred)

    # The stored model is the one fitted on the training part only; it is not
    # refitted on the held-out rows.
    city_models[city] = model
    val_report.append({"city": city, "MAE": mae, "n_train": len(tr), "n_val": len(va)})

val_report_df = pd.DataFrame(val_report)
print(val_report_df)



  city       MAE  n_train  n_val
0   sj  7.265921      745    187
1   iq  7.115772      412    104


In [16]:
def make_submission_iterative(city_models,
                              features_train_path="dengue_features_train.csv",
                              labels_train_path="dengue_labels_train.csv",
                              features_test_path="dengue_features_test.csv",
                              submission_path="submission_format.csv",
                              out_path="submission_model2_xgb_v3.csv"):
    """Forecast the test weeks recursively per city and write the submission CSV.

    For each of the cities ``"sj"`` and ``"iq"`` the test weeks are predicted
    one at a time in chronological order. The target lag/rolling features of a
    week are built from a running history that starts with the observed
    training cases and is extended with each new (non-negative, unrounded)
    prediction.

    Missing feature values are filled with the mean of the same column in that
    city's training rows, mirroring the mean imputation used when the models
    are fitted; 0.0 is only the last resort for columns without a usable mean.

    Parameters
    ----------
    city_models : dict
        Maps ``"sj"`` and ``"iq"`` to fitted regressors exposing ``predict``.
    features_train_path, labels_train_path, features_test_path : str
        Input CSV paths.
    submission_path : str
        CSV providing the row order and key columns of the submission; its
        ``total_cases`` column is replaced.
    out_path : str
        Where the finished submission is written.

    Returns
    -------
    pandas.DataFrame
        The submission frame that was written, with integer ``total_cases``.
        Rows of the template without a matching prediction get 0.

    Raises
    ------
    KeyError
        If ``city_models`` lacks one of the two cities.
    IndexError
        If a city with test rows has fewer than four training rows.
    """
    keys = ["city", "year", "weekofyear"]
    env_lag_sources = ("reanalysis_avg_temp_k", "precipitation_amt_mm")

    # make_test_df already returns the season-augmented, interpolated training
    # features, so they are reused here instead of being read and processed twice.
    Xtr_feat, Xte = make_test_df(features_train_path, features_test_path)
    ytr = pd.read_csv(labels_train_path)

    # Train history (with real total_cases)
    train_full = Xtr_feat.merge(ytr, on=keys).sort_values(keys)
    Xte = Xte.sort_values(keys).copy()

    sub = pd.read_csv(submission_path)

    preds_all = []

    for city in ["sj", "iq"]:
        model = city_models[city]

        trc = train_full[train_full["city"] == city].copy()
        tec = Xte[Xte["city"] == city].copy().reset_index(drop=True)

        # Env lag 1: previous test week, with the first test week seeded from
        # the last training week of the same city.
        for col in env_lag_sources:
            lagged = tec[col].astype(float).shift(1)
            if len(lagged):
                lagged.iloc[0] = float(trc[col].iloc[-1])
            tec[f"{col}_lag_1"] = lagged

        # Per-column training means, used to impute any value still missing.
        known_cols = [c for c in FEATURES if c in trc.columns]
        fill_values = trc[known_cols].apply(pd.to_numeric, errors="coerce").mean()

        # History of true cases (seed for target lags)
        history = trc["total_cases"].astype(float).tolist()

        yhat_list = []

        for i in range(len(tec)):
            row = tec.loc[i].copy()

            # Target lags/rolling from history (observed values first, then
            # earlier predictions as the loop advances).
            for lag in (1, 2, 3, 4):
                row[f"total_cases_lag_{lag}"] = history[-lag]
            recent = history[-4:]
            row["total_cases_roll_mean_4"] = float(np.mean(recent))
            # ddof=1 matches the sample std that pandas rolling().std() produces.
            row["total_cases_roll_std_4"] = float(np.std(recent, ddof=1)) if len(recent) >= 2 else 0.0

            # Build X row in training column order (force numeric)
            Xrow = pd.DataFrame([{k: row[k] for k in FEATURES}])
            Xrow = Xrow.apply(pd.to_numeric, errors="coerce").astype(float)
            Xrow = Xrow.fillna(fill_values).fillna(0.0)

            yhat = float(model.predict(Xrow)[0])
            yhat = max(0.0, yhat)  # no negative cases

            yhat_list.append(yhat)
            history.append(yhat)   # recursive step

        out = tec[keys].copy()
        out["total_cases"] = np.round(yhat_list).astype(int)
        preds_all.append(out)

    pred_df = pd.concat(preds_all, ignore_index=True)

    # Keep the template's row order; template rows without a prediction get 0.
    sub2 = sub.drop(columns=["total_cases"]).merge(pred_df, on=keys, how="left")
    sub2["total_cases"] = sub2["total_cases"].fillna(0).clip(lower=0).astype(int)
    sub2.to_csv(out_path, index=False)

    return sub2

# Run the recursive forecast with the per-city models and write the CSV
# (all paths are left at make_submission_iterative's defaults).
sub2 = make_submission_iterative(city_models)
print(sub2.head())
# The file name printed below is a hard-coded literal that matches the default
# `out_path`; it must be updated by hand if another out_path is ever passed.
print("Saved:", "submission_model2_xgb_v3.csv")


  city  year  weekofyear  total_cases
0   sj  2008          18            7
1   sj  2008          19            8
2   sj  2008          20            9
3   sj  2008          21            9
4   sj  2008          22            7
Saved: submission_model2_xgb_v3.csv
