In [None]:
import polars as pl
import math
from datetime import timedelta
from sklearn.ensemble import HistGradientBoostingRegressor

In [None]:
def preprocess_features(
    features: pl.DataFrame, delta_in_weeks: int = 3
) -> pl.DataFrame:
    """Add look-ahead join keys and derived columns to a weekly features frame.

    Args:
        features: Frame containing ``week_start_date`` (cast to ``pl.Date``
            here), ``weekofyear`` and the four ``ndvi_ne`` / ``ndvi_nw`` /
            ``ndvi_se`` / ``ndvi_sw`` columns.
        delta_in_weeks: Number of weeks added to ``week_start_date`` before
            the ``next_*`` columns are derived. Defaults to 3, the value that
            used to be hard-coded.

    Returns:
        A new frame with these columns added:

        * ``next_week_year``: calendar year of the shifted date.
        * ``next_weekofyear``: ISO week number of the shifted date.
        * ``weekofyear_encoded``: ``sin(2 * pi * weekofyear / 52)``.
        * ``ndvi_aggregate``: sum of the four ``ndvi_*`` columns.
    """
    # Period of the cyclical encoding. ISO years can contain a week 53; with
    # a period of 52 that week encodes to the same value as week 1.
    max_week_value = 52

    # Date the label row is expected to carry: start of week + look-ahead.
    shifted_date = pl.col("week_start_date").cast(pl.Date) + timedelta(
        weeks=delta_in_weeks
    )

    # Only the sine component is produced, so weeks mirrored around the
    # sine's peak/trough share an encoded value.
    weekofyear_angle = 2 * math.pi * pl.col("weekofyear") / max_week_value

    return features.with_columns(
        shifted_date.dt.year().alias("next_week_year"),
        shifted_date.dt.week().alias("next_weekofyear"),
        weekofyear_angle.sin().alias("weekofyear_encoded"),
        (
            pl.col("ndvi_ne")
            + pl.col("ndvi_nw")
            + pl.col("ndvi_se")
            + pl.col("ndvi_sw")
        ).alias("ndvi_aggregate"),
    )

In [None]:
def _load_city_rows(csv_path, city):
    """Lazily scan ``csv_path`` and materialise only the rows for ``city``."""
    return pl.scan_csv(csv_path).filter(pl.col("city") == city).collect()


def _forward_fill_missing(frame):
    """Convert NaN to null, then carry the last non-null value forward."""
    return frame.fill_nan(None).fill_null(strategy="forward")


def get_predictions(city, split_training_set=False):
    """Train a gradient-boosting model for one city and predict its test rows.

    Args:
        city: Value of the ``city`` column to keep in every CSV
            (the notebook calls this with ``"sj"`` and ``"iq"``).
        split_training_set: When True, fit on the first 3/4 of the labelled
            rows only and print the model's ``score`` on the remaining rows.
            Defaults to False (fit on every labelled row).

    Returns:
        A ``pl.DataFrame`` with the columns ``city``, ``year``,
        ``weekofyear`` and ``total_cases`` (rounded, non-negative integers),
        one row per test-feature row for ``city``.

    NOTE(review): training pairs each feature row with the label found under
    its ``next_week_year`` / ``next_weekofyear`` keys, whereas predictions are
    reported against the test row's own ``year`` / ``weekofyear``. Confirm
    that this offset is intended.
    """
    average_column = "weekofyear_historical_average_total_cases"
    selected_feature_names = [
        "weekofyear_encoded",
        average_column,
        "reanalysis_specific_humidity_g_per_kg",
        "reanalysis_dew_point_temp_k",
        "station_avg_temp_c",
        "station_min_temp_c"
    ]

    # build the training data set
    train_features = preprocess_features(
        _load_city_rows("data/dengue_features_train.csv", city))
    train_labels = _load_city_rows("data/dengue_labels_train.csv", city)

    # maintain_order="left" keeps the feature rows in file order: both the
    # forward fill and the positional train/hold-out split below depend on
    # row order, which an inner join does not guarantee by default.
    train_features_and_labels = train_features.join(
        train_labels,
        left_on=['city', 'next_week_year', 'next_weekofyear'],
        right_on=['city', 'year', 'weekofyear'],
        how='inner',
        maintain_order="left",
    )
    train_features_and_labels = _forward_fill_missing(train_features_and_labels)
    train_features_and_labels = train_features_and_labels.with_columns(
        pl.col("total_cases").mean().over(pl.col("weekofyear"))
        .alias(average_column)
    )

    # One row per week of year, reused below to populate the same feature on
    # the test rows. Every training row of a given week carries the same
    # average, so it does not matter which duplicate `unique` keeps.
    weekly_average = train_features_and_labels.select(
        ["weekofyear", average_column]
    ).unique(subset=["weekofyear"])

    # train the model
    X = train_features_and_labels[selected_feature_names]
    y = train_features_and_labels["total_cases"]

    train_len = X.shape[0]
    if split_training_set:
        train_len = math.ceil(3/4 * X.shape[0])

    X_train, X_test = X[:train_len], X[train_len:]
    y_train, y_test = y[:train_len], y[train_len:]
    clf = HistGradientBoostingRegressor(max_iter=100).fit(X_train, y_train)

    if split_training_set:
        print(clf.score(X_test, y_test))

    # now make predictions on the test set
    test_features = preprocess_features(
        _load_city_rows("data/dengue_features_test.csv", city))
    # Impute exactly as for training so the model sees test inputs prepared
    # the same way; done before the lookup join so an unmatched week is not
    # filled from a neighbouring row.
    test_features = _forward_fill_missing(test_features)
    # The historical average is a per-week-of-year quantity, so it is looked
    # up by `weekofyear` alone. Joining on ["year", "weekofyear"] against the
    # training frame left it null for every test row whose (year, week) pair
    # does not occur in training. Weeks never seen in training stay null.
    test_features = test_features.join(
        weekly_average, how="left", on="weekofyear", maintain_order="left")

    y_pred = clf.predict(test_features[selected_feature_names]).clip(min=0)

    return test_features.select(["city", "year", "weekofyear"]).with_columns(
        pl.Series("total_cases", y_pred.round().astype(int))
    )

In [None]:
# Predict each city on its own, then stack the two frames ("sj" rows first,
# "iq" rows second) and write the combined result as a single CSV file.
sj_predictions = get_predictions(city="sj")
iq_predictions = get_predictions(city="iq")
pl.concat(
    [sj_predictions, iq_predictions],
    how="vertical",
).write_csv("data/output.csv")