In [38]:
import polars as pl
import math
from datetime import timedelta, datetime
from sklearn.ensemble import HistGradientBoostingRegressor

In [39]:
def preprocess_features(features: pl.DataFrame, delta_in_weeks: int = 3) -> pl.DataFrame:
    """Add the year and week number of the week ``delta_in_weeks`` ahead.

    Two columns are appended, both derived from ``week_start_date`` shifted by
    ``delta_in_weeks`` weeks:

    * ``next_week_year``  - calendar year of the shifted date
    * ``next_weekofyear`` - week number of the shifted date

    Args:
        features: Frame with a ``week_start_date`` column that can be cast to
            ``pl.Date``.
        delta_in_weeks: Number of weeks to shift forward. Defaults to 3, the
            value that used to be hard-coded here; a negative value shifts
            backwards (i.e. looks at previous weeks).

    Returns:
        A new frame with the two columns added; ``features`` is not modified.
    """
    # Build the shifted-date expression once and derive both columns from it.
    shifted_date = pl.col("week_start_date").cast(pl.Date) + timedelta(weeks=delta_in_weeks)

    # NOTE(review): `dt.week()` is the ISO week number while `dt.year()` is the
    # calendar year, so the pair can disagree with the ISO year around New
    # Year. Presumably this matches the year/weekofyear convention of the
    # label file these columns are joined against - verify before changing.
    return features.with_columns(
        [
            shifted_date.dt.year().alias("next_week_year"),
            shifted_date.dt.week().alias("next_weekofyear"),
        ]
    )

In [40]:
def _load_city_rows(csv_path, city):
    """Read ``csv_path`` and keep only the rows whose ``city`` column equals ``city``."""
    return pl.scan_csv(csv_path).filter(pl.col("city") == city).collect()


def get_predictions(city):
    """Train a gradient-boosting regressor for one city and predict its test set.

    Training features are paired with the label row whose (year, weekofyear)
    equals the feature row's (next_week_year, next_weekofyear) as produced by
    ``preprocess_features``. The first three quarters of the joined rows (in
    date order) are used for fitting and the remainder for a hold-out score,
    which is printed (``clf.score``, i.e. R^2 for scikit-learn regressors).

    Args:
        city: Value of the ``city`` column to select in all three CSV files.

    Returns:
        A polars DataFrame with columns ``city``, ``year``, ``weekofyear`` and
        ``total_cases`` (non-negative integers), one row per test-set row.

    Raises:
        ValueError: If no training rows remain for ``city`` after the join.
    """
    # build the training data set
    train_features = preprocess_features(
        _load_city_rows("data/dengue_features_train.csv", city)
    )
    train_labels = _load_city_rows("data/dengue_labels_train.csv", city)

    train_features_and_labels = train_features.join(
        train_labels,
        left_on=["city", "next_week_year", "next_weekofyear"],
        right_on=["city", "year", "weekofyear"],
        how="inner",
    # The split below is chronological, and a join does not promise to keep
    # the input row order, so restore date order explicitly.
    ).sort(pl.col("week_start_date").cast(pl.Date))

    if train_features_and_labels.height == 0:
        raise ValueError(f"no training rows found for city {city!r}")

    selected_features = [
        "weekofyear",
        "reanalysis_specific_humidity_g_per_kg",
        "reanalysis_dew_point_temp_k",
        "station_avg_temp_c",
        "station_min_temp_c",
    ]

    # train the model
    X = train_features_and_labels[selected_features]
    y = train_features_and_labels["total_cases"]
    # Earliest 75% of the weeks for fitting, latest 25% held out for scoring.
    train_len = math.ceil(3 / 4 * X.shape[0])
    X_train, X_test = X[:train_len], X[train_len:]
    y_train, y_test = y[:train_len], y[train_len:]
    # Fixed seed so any randomised step inside the estimator is repeatable.
    clf = HistGradientBoostingRegressor(max_iter=100, random_state=0).fit(X_train, y_train)
    print(clf.score(X_test, y_test))

    # now make predictions on the test set
    # NOTE(review): the model was fitted on features paired with labels from a
    # later week (next_week_year / next_weekofyear), but here each test row's
    # own features are used and the prediction is reported against that same
    # row's year/weekofyear. Confirm whether the test features should be
    # lagged by the same number of weeks.
    test_features = _load_city_rows("data/dengue_features_test.csv", city)
    y_pred = clf.predict(test_features[selected_features])
    # Case counts cannot be negative, but a regressor's raw output can be.
    total_cases = y_pred.round().clip(min=0).astype(int)
    result = test_features.with_columns([pl.Series("total_cases", total_cases)])

    return result.select(["city", "year", "weekofyear", "total_cases"])

In [41]:
# Build the prediction table for each city. Every call also prints the
# hold-out score computed inside get_predictions.
sj_predictions = get_predictions(city="sj")
iq_predictions = get_predictions(city="iq")

# Stack the "sj" rows above the "iq" rows and write a single CSV.
pl.concat(
    [sj_predictions, iq_predictions],
    how="vertical",
).write_csv("data/output.csv")

# Scratch notes kept as a string literal. Presumably each pair is the score
# printed for "sj" then "iq" with the given week shift in preprocess_features;
# the "3 weeks" pair matches the values printed by this run.
"""
1 week delta
-0.36469794209963124
-0.023403422194176793

2 weeks delta
-0.20022916400242186
-0.00998963400642272

3 weeks delta
-0.1581144756410573
-0.006391480350445011


"""

-0.1581144756410573
-0.006391480350445011


'\n1 week delta\n-0.36469794209963124\n-0.023403422194176793\n\n2 weeks delta\n-0.20022916400242186\n-0.00998963400642272\n\n3 weeks delta\n-0.1581144756410573\n-0.006391480350445011\n\n\n'