In [9]:
import pandas as pd
import numpy as np

X = pd.read_csv("dengue_features_train.csv")
y = pd.read_csv("dengue_labels_train.csv")




In [10]:
import numpy as np
import pandas as pd

def force_numeric(df, cols):
    df = df.copy()
    for c in cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    return df
    
def add_season(df):
    df = df.copy()
    df["sin_week"] = np.sin(2*np.pi*df["weekofyear"]/52)
    df["cos_week"] = np.cos(2*np.pi*df["weekofyear"]/52)
    return df

def make_train_features(features_train_path="dengue_features_train.csv",
                        labels_train_path="dengue_labels_train.csv"):
    X = pd.read_csv(features_train_path)
    y = pd.read_csv(labels_train_path)

    df = X.merge(y, on=["city","year","weekofyear"]).sort_values(["city","year","weekofyear"])
    df = add_season(df)

    # Target lag: strictly past
    for l in [1,2,3,4]:
        df[f"total_cases_lag_{l}"] = df.groupby("city")["total_cases"].shift(l)

    # Rolling: geçmiş 4 hafta (current week dahil değil!)
    # shift(1) -> current week target'ı asla feature'a girmez
    df["total_cases_roll_mean_4"] = (
        df.groupby("city")["total_cases"].shift(1).rolling(4).mean().reset_index(level=0, drop=True)
    )
    df["total_cases_roll_std_4"] = (
        df.groupby("city")["total_cases"].shift(1).rolling(4).std().reset_index(level=0, drop=True)
    )

    # Environment lag (sadece 1)
    df["reanalysis_avg_temp_k_lag_1"] = df.groupby("city")["reanalysis_avg_temp_k"].shift(1)
    df["precipitation_amt_mm_lag_1"]  = df.groupby("city")["precipitation_amt_mm"].shift(1)

    # İlk haftalarda lag/rolling NaN olacak -> train/val'da drop edeceğiz
    return df


In [11]:
BASE_FEATURES = [
    "year", "sin_week", "cos_week",
    "ndvi_ne","ndvi_nw","ndvi_se","ndvi_sw",
    "precipitation_amt_mm","reanalysis_precip_amt_kg_per_m2",
    "reanalysis_sat_precip_amt_mm","station_precip_mm",
    "reanalysis_air_temp_k","reanalysis_avg_temp_k",
    "reanalysis_min_air_temp_k","reanalysis_max_air_temp_k",
    "station_avg_temp_c","station_min_temp_c","station_max_temp_c",
    "reanalysis_relative_humidity_percent",
    "reanalysis_specific_humidity_g_per_kg",
    "reanalysis_dew_point_temp_k",
    "reanalysis_tdtr_k","station_diur_temp_rng_c",
]

LAG_FEATURES = [
    "total_cases_lag_1","total_cases_lag_2","total_cases_lag_3","total_cases_lag_4",
    "total_cases_roll_mean_4","total_cases_roll_std_4",
    "reanalysis_avg_temp_k_lag_1","precipitation_amt_mm_lag_1",
]

FEATURES_V2 = BASE_FEATURES + LAG_FEATURES
TARGET = "total_cases"


In [12]:
from sklearn.metrics import mean_absolute_error

def time_split(city_df, val_frac=0.2):
    city_df = city_df.sort_values(["year","weekofyear"])
    n = len(city_df)
    cut = int((1 - val_frac) * n)
    return city_df.iloc[:cut], city_df.iloc[cut:]

def get_model():
    try:
        from xgboost import XGBRegressor
        return XGBRegressor(
            n_estimators=1200,
            learning_rate=0.03,
            max_depth=4,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            objective="reg:squarederror",
        )
    except Exception:
        # Fallback: sklearn (genelde daha zayıf ama çalışır)
        from sklearn.ensemble import HistGradientBoostingRegressor
        return HistGradientBoostingRegressor(random_state=42)

df = make_train_features()

results = []
models = {}

for city in ["sj","iq"]:
    cdf = df[df["city"] == city].copy()

    # lag/rolling NaN satırlarını at
    cdf = cdf.dropna(subset=LAG_FEATURES + [TARGET])

    tr, va = time_split(cdf, val_frac=0.2)
    Xtr, ytr = tr[FEATURES_V2], tr[TARGET]
    Xva, yva = va[FEATURES_V2], va[TARGET]

    model = get_model()
    model.fit(Xtr, ytr)

    pred = model.predict(Xva)
    mae = mean_absolute_error(yva, pred)

    results.append({"city": city, "MAE": mae})
    models[city] = model

pd.DataFrame(results)


Unnamed: 0,city,MAE
0,sj,7.458559
1,iq,4.718907


In [13]:
print(df[FEATURES_V2].dtypes.value_counts())


float64    30
int64       1
Name: count, dtype: int64


In [15]:
def make_test_predictions_iterative(models,
                                    features_test_path="dengue_features_test.csv",
                                    features_train_path="dengue_features_train.csv",
                                    labels_train_path="dengue_labels_train.csv",
                                    submission_path="submission_format.csv"):
    Xtr = pd.read_csv(features_train_path)
    ytr = pd.read_csv(labels_train_path)
    Xte = pd.read_csv(features_test_path)
    sub = pd.read_csv(submission_path)

    Xtr = add_season(Xtr)
    Xte = add_season(Xte)

    train_full = Xtr.merge(ytr, on=["city","year","weekofyear"]).sort_values(["city","year","weekofyear"])
    test_full  = Xte.sort_values(["city","year","weekofyear"])

    preds_all = []

    for city in ["sj","iq"]:
        model = models[city]

        trc = train_full[train_full["city"] == city].copy()
        tec = test_full[test_full["city"] == city].copy()

        # geçmiş total_cases listesi: train gerçekleriyle seed
        history = trc["total_cases"].tolist()

        # test satırlarını sırayla gez
        preds_city = []
        for _, row in tec.iterrows():
            # lag feature'ları history'den üret
            lag1 = history[-1]
            lag2 = history[-2]
            lag3 = history[-3]
            lag4 = history[-4]

            roll_mean_4 = np.mean(history[-4:])
            roll_std_4  = np.std(history[-4:], ddof=1) if len(history[-4:]) >= 2 else 0.0

            # env lag_1: environment için 1 hafta geriden almak için
            # pratikte: tec'te bir önceki satırın value'su; ilk satırda train'in son value'su
            # Bunun için birlikte dizi kuruyoruz:
            # (train son env değeri) + (test env değerleri)
            # ve index'i kaydırıyoruz
            # Kolay yol:
            # geçmiş env değerleri için train son + test'e kadar kümülatif ilerle
            # burada tec üzerinde i kullanmak daha kolay:
            # (aşağıda i ile yapacağız)
            preds_city.append(row)  # placeholder

        # env lag'leri için index ile tekrar döngü (daha temiz)
        tec = tec.reset_index(drop=True)
        last_temp = trc["reanalysis_avg_temp_k"].iloc[-1]
        last_prec = trc["precipitation_amt_mm"].iloc[-1]

        history = trc["total_cases"].tolist()
        yhat_list = []

        for i in range(len(tec)):
            row = tec.loc[i].copy()

            row["total_cases_lag_1"] = history[-1]
            row["total_cases_lag_2"] = history[-2]
            row["total_cases_lag_3"] = history[-3]
            row["total_cases_lag_4"] = history[-4]
            row["total_cases_roll_mean_4"] = np.mean(history[-4:])
            row["total_cases_roll_std_4"]  = np.std(history[-4:], ddof=1) if len(history[-4:]) >= 2 else 0.0

            # env lag 1
            if i == 0:
                row["reanalysis_avg_temp_k_lag_1"] = last_temp
                row["precipitation_amt_mm_lag_1"]  = last_prec
            else:
                row["reanalysis_avg_temp_k_lag_1"] = tec.loc[i-1, "reanalysis_avg_temp_k"]
                row["precipitation_amt_mm_lag_1"]  = tec.loc[i-1, "precipitation_amt_mm"]

            Xrow = row[FEATURES_V2].to_frame().T
            Xrow = Xrow.apply(pd.to_numeric, errors="coerce").astype(float)
            Xrow = Xrow.fillna(0.0)

            yhat = float(model.predict(Xrow)[0])

            # Dengue negatif olmaz
            yhat = max(0.0, yhat)

            yhat_list.append(yhat)
            history.append(yhat)  # kritik: bir sonraki haftanın lag'i bu olacak

        out = tec[["city","year","weekofyear"]].copy()
        out["total_cases"] = np.round(yhat_list).astype(int)
        preds_all.append(out)

    pred_df = pd.concat(preds_all, ignore_index=True)

    sub2 = sub.drop(columns=["total_cases"]).merge(pred_df, on=["city","year","weekofyear"], how="left")
    sub2["total_cases"] = sub2["total_cases"].fillna(0).clip(lower=0).astype(int)
    sub2.to_csv("submission_model2_xgb_v1.csv", index=False)

    return sub2

# kullanım:
sub2 = make_test_predictions_iterative(models)
sub2.head()


Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,6
1,sj,2008,19,7
2,sj,2008,20,10
3,sj,2008,21,8
4,sj,2008,22,8


In [16]:
sub2.describe()

Unnamed: 0,year,weekofyear,total_cases
count,416.0,416.0,416.0
mean,2010.766827,26.439904,9.399038
std,1.434835,14.978257,7.632486
min,2008.0,1.0,0.0
25%,2010.0,13.75,5.0
50%,2011.0,26.0,7.0
75%,2012.0,39.0,10.0
max,2013.0,53.0,46.0
