In [263]:
import polars as pl
from sklearn.ensemble import HistGradientBoostingRegressor

In [264]:
def preprocess_data(features: pl.DataFrame) -> pl.DataFrame:
    """Impute missing values and append lagged copies of three climate columns.

    Steps, in order:

    1. Float NaN values are converted to null so the fill strategies below
       treat both kinds of "missing" the same way.
    2. Rows are sorted by ``week_start_date``.
    3. Nulls are forward-filled. This happens *after* the sort so that a gap
       is filled from the chronologically previous row, not from whichever
       row happened to precede it in the input order.
    4. Three ``<column>_prev`` columns are added, each holding the source
       column's value from a fixed number of rows earlier.
    5. Remaining nulls are backward-filled: this covers the leading rows of
       every lagged column (which have no earlier row to read from) and any
       gap at the very top of the frame.

    Args:
        features: Feature rows. Must contain ``week_start_date`` and the three
            source columns listed in ``lag_rows`` below. The shift is applied
            across the whole frame, so this presumably expects rows for a
            single city -- confirm against callers.

    Returns:
        A new frame, sorted by ``week_start_date``, with the columns
        ``reanalysis_specific_humidity_g_per_kg_prev``,
        ``station_avg_temp_c_prev`` and ``reanalysis_dew_point_temp_k_prev``
        appended in that order.
    """
    # Lag, in rows, applied to each source column. Rows look weekly (they are
    # keyed by week_start_date), so these are presumably 8/9/8 weeks -- TODO
    # confirm and record how the values were chosen.
    lag_rows = {
        "reanalysis_specific_humidity_g_per_kg": 8,
        "station_avg_temp_c": 9,
        "reanalysis_dew_point_temp_k": 8,
    }
    lagged_columns = [
        pl.col(source).shift(lag).alias(f"{source}_prev")
        for source, lag in lag_rows.items()
    ]

    return (
        features.fill_nan(None)
        .sort("week_start_date")
        .fill_null(strategy="forward")
        # The three shifts are independent of each other, so one
        # with_columns call is equivalent to adding them one at a time.
        .with_columns(lagged_columns)
        .fill_null(strategy="backward")
    )

In [265]:
def _load_city_rows(csv_path, city):
    """Read ``csv_path`` and keep only the rows whose ``city`` column equals ``city``.

    Raises:
        ValueError: if no row matches, so a mistyped city code fails here
            with a clear message instead of later inside model fitting.
    """
    rows = pl.scan_csv(csv_path).filter(pl.col("city") == city).collect()
    if rows.height == 0:
        raise ValueError(f"no rows for city {city!r} in {csv_path}")
    return rows


def get_predictions(city, data_dir="data"):
    """Train a gradient-boosting model for one city and predict its test set.

    The training features and labels are read from
    ``dengue_features_train.csv`` and ``dengue_labels_train.csv``, joined on
    ``city``/``year``/``weekofyear``, and used to fit a
    ``HistGradientBoostingRegressor`` on the three lagged climate columns
    produced by ``preprocess_data`` (defined elsewhere in this notebook).
    The fitted model is then applied to ``dengue_features_test.csv``.

    Train and test features are preprocessed independently, so the lagged
    columns of the first test rows are filled from within the test set.

    Args:
        city: Value to match against the ``city`` column of each CSV.
        data_dir: Directory containing the three input CSV files. Defaults to
            ``"data"``, the location that was previously hard-coded.

    Returns:
        A frame with columns ``city``, ``year``, ``weekofyear`` and
        ``total_cases`` (predictions clipped at zero, rounded, cast to int),
        one row per test row, in the order produced by ``preprocess_data``.

    Raises:
        ValueError: if any of the three CSV files has no row for ``city``.
    """
    join_keys = ["city", "year", "weekofyear"]
    # Only the lagged columns are used as model inputs.
    selected_features = [
        "reanalysis_specific_humidity_g_per_kg_prev",
        "station_avg_temp_c_prev",
        "reanalysis_dew_point_temp_k_prev",
    ]

    # Build the training set: preprocessed features joined to their labels.
    features_train = preprocess_data(
        _load_city_rows(f"{data_dir}/dengue_features_train.csv", city)
    )
    labels_train = _load_city_rows(f"{data_dir}/dengue_labels_train.csv", city)
    features_and_labels_train = features_train.join(
        labels_train, on=join_keys, how="inner", maintain_order="left"
    )

    # Train the model. ``total_cases`` is expected to come from the labels file.
    X = features_and_labels_train[selected_features]
    y = features_and_labels_train["total_cases"]
    model = HistGradientBoostingRegressor(max_iter=100).fit(X, y)

    # Predict on the test set; case counts cannot be negative.
    features_test = preprocess_data(
        _load_city_rows(f"{data_dir}/dengue_features_test.csv", city)
    )
    y_pred = model.predict(features_test[selected_features]).clip(min=0)
    predicted_cases = y_pred.round().astype(int)

    # Select columns explicitly: indexing a frame with a tuple of names is
    # read as (rows, columns) when the tuple has exactly two items.
    return features_test.select(join_keys).with_columns(
        pl.Series("total_cases", predicted_cases)
    )

In [266]:
# Produce one prediction frame per city code.
sj_predictions = get_predictions(city="sj")
iq_predictions = get_predictions(city="iq")

# Stack the two frames ("sj" rows first, then "iq") and write a single CSV.
pl.concat(
    [sj_predictions, iq_predictions],
    how="vertical",
).write_csv("data/output.csv")