In [None]:
import polars as pl
import numpy as np
import math
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

# Pipeline mode switch, read inside get_predictions and by the final cell:
#   True  -> fit on every training row and write data/output.csv
#   False -> fit on the first three quarters of the rows, print the score
#            obtained on the remaining rows, and write no file
generate_output = True

In [None]:
def get_season(weekofyear: int, nothern_hemisphere: bool=True) -> str:
    """Map a week-of-year number to a season name.

    Args:
        weekofyear: Week number within the year, 1 through 53.
        nothern_hemisphere: When True (default) return the season as seen
            from the northern hemisphere; when False return the opposite
            season. The misspelled parameter name is kept so that existing
            keyword callers keep working.

    Returns:
        "winter", "spring", "summer" or "fall"; "unknown" when
        ``weekofyear`` is not an integer week in 1..53.
    """
    # Each entry: (weeks covered, northern-hemisphere name, southern-hemisphere name).
    season_table = (
        (range(1, 10), "winter", "summer"),
        (range(10, 22), "spring", "fall"),
        (range(22, 35), "summer", "winter"),
        (range(35, 50), "fall", "spring"),
        # Upper bound is 54 (exclusive) so that week 53, which occurs in
        # long years, is classified with the rest of the year-end weeks
        # instead of falling through to "unknown".
        (range(50, 54), "winter", "summer"),
    )

    for weeks, northern_name, southern_name in season_table:
        if weekofyear in weeks:
            return northern_name if nothern_hemisphere else southern_name

    return "unknown"

In [None]:
def preprocess_data(features: pl.DataFrame, num_weeks_backward_to_look: int = 8) -> pl.DataFrame:
    """Clean a feature table and add lagged and seasonal columns.

    Steps, in order:
      1. Sort rows by ``week_start_date``.
      2. Convert NaN to null and forward-fill nulls from the previous row.
      3. For every column from the fifth one onwards, add shifted copies
         named ``{feature}_weekminus{i}`` for ``i`` in
         ``1..num_weeks_backward_to_look``.
      4. Backward-fill the nulls that remain (e.g. the leading rows of the
         shifted columns).
      5. Add a string ``season`` column computed by ``get_season`` from
         ``weekofyear``; rows whose ``city`` equals "sj" are treated as
         northern hemisphere, all others as southern.

    Args:
        features: Input frame. Must contain ``week_start_date``,
            ``weekofyear`` and ``city``. The first four columns are skipped
            when building lags (assumed to be the identifier columns --
            TODO confirm against the CSV layout).
        num_weeks_backward_to_look: How many weekly lags to create per
            feature column. Defaults to 8, the previously hard-coded value.

    Returns:
        A new frame, sorted by ``week_start_date``, with the original
        columns, the lag columns, and ``season``.
    """
    # Sort before filling: forward-fill copies "the previous row", so the rows
    # must already be in chronological order for the fill to use the previous
    # week's value rather than whatever preceded it in the file.
    features = features.sort("week_start_date")
    features = features.fill_nan(None).fill_null(strategy="forward")

    feature_columns = features.columns[4:]

    # Build every lag expression up front and add them in a single pass; all of
    # them read only the original feature columns, so the result (including
    # column order: lag 1 for every feature, then lag 2, ...) is the same as
    # adding them one at a time, without materialising an intermediate frame
    # per column.
    lag_expressions = [
        pl.col(feature).shift(i).alias(f"{feature}_weekminus{i}")
        for i in range(1, num_weeks_backward_to_look + 1)
        for feature in feature_columns
    ]
    features = features.with_columns(lag_expressions)

    # shift() leaves nulls in the first i rows of each lag column.
    features = features.fill_null(strategy="backward")

    features = features.with_columns(
        season=pl.struct("weekofyear", "city")
        .map_elements(lambda cols: get_season(cols["weekofyear"], cols["city"]=="sj"), return_dtype=pl.String))

    return features

In [None]:
def get_best_hyperparameters(model, X_train, y_train):
    """Grid-search ``model`` on the given data and return the winning settings.

    Every combination of learning rate, iteration count, tree depth and L2
    regularization listed below is evaluated with 5-fold cross-validation,
    using all available cores, and ranked by a mean-squared-error scorer for
    which lower error is better.

    returns something like {'l2_regularization': 0.1, 'learning_rate': 0.01, 'max_depth': 3, 'max_iter': 100}
    """
    # Candidate values to try for each hyperparameter.
    search_space = dict(
        learning_rate=[0.01, 0.1, 0.2],
        max_iter=[100, 200, 300],
        max_depth=[3, 5, 7],
        l2_regularization=[0.0, 0.01, 0.1],
    )

    # greater_is_better=False tells the search that smaller MSE wins.
    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

    search = GridSearchCV(
        model,
        search_space,
        scoring=mse_scorer,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    return search.best_params_

In [None]:
def _load_city_rows(csv_path: str, city: str) -> pl.DataFrame:
    """Lazily scan ``csv_path`` and collect only the rows whose ``city`` column equals ``city``."""
    return pl.scan_csv(csv_path).filter(pl.col("city") == city).collect()


def get_predictions(city):
    """Train a gradient-boosting regressor for one city and predict its test set.

    Reads ``data/dengue_features_train.csv``, ``data/dengue_labels_train.csv``
    and ``data/dengue_features_test.csv``, keeping only rows for ``city``.
    Features are built with ``preprocess_data``; for each original feature the
    variant (the raw column or one of its ``_weekminus`` lags) with the highest
    correlation to ``total_cases`` is kept if that correlation is at least 0.3.
    At most the first five such features, plus ``weekofyear`` as a categorical
    feature, are used to fit a ``HistGradientBoostingRegressor`` whose
    hyperparameters come from ``get_best_hyperparameters``.

    Behaviour depends on the module-level ``generate_output`` flag: when it is
    falsy only the first three quarters of the rows are used for fitting and
    the model's score on the remaining rows is printed.

    Args:
        city: Value of the ``city`` column to process (the notebook calls this
            with "sj" and "iq").

    Returns:
        A polars DataFrame with columns ``city``, ``year``, ``weekofyear`` and
        ``total_cases`` (predictions clipped at zero and rounded to integers),
        one row per test-set row for ``city``.
    """
    # build the training data set
    features_train = _load_city_rows("data/dengue_features_train.csv", city)
    # Captured before preprocessing so that only the raw feature names (not
    # their lag columns) drive the selection loop below.
    original_columns = features_train.columns[4:]
    features_train = preprocess_data(features_train)

    labels_train = _load_city_rows("data/dengue_labels_train.csv", city)

    features_and_labels_train = features_train.join(
        labels_train, on=['city', 'year', 'weekofyear'], how='inner')
    # The train/hold-out split below is positional, so make the chronological
    # order explicit instead of relying on the join preserving row order.
    features_and_labels_train = features_and_labels_train.sort("week_start_date")

    # pick features with best correlation to total cases
    features_to_correlate = [col for col in features_and_labels_train.columns
                             if col not in ["city", "year", "weekofyear", "week_start_date", "season"]]
    corr = features_and_labels_train[features_to_correlate].corr()
    # corr() returns a square frame without row labels; attach the column
    # names as a "columns" column so rows can be looked up by feature name.
    corr_column_df = pl.DataFrame({"columns": corr.columns})
    corr = pl.concat([corr, corr_column_df], how="horizontal")
    corr = corr[["columns", "total_cases"]]

    best_feature_columns = []
    for column in original_columns:
        # Match the raw column or its own lag columns exactly. A substring /
        # regex match would also pick up any other feature whose name merely
        # contains this one.
        belongs_to_feature = (
            (pl.col("columns") == column)
            | pl.col("columns").str.starts_with(f"{column}_weekminus")
        )
        candidates = corr.filter(belongs_to_feature)
        best_corr = candidates.select(pl.max("total_cases")).item()
        # best_corr is None when no usable correlation exists; the negated
        # comparison also rejects NaN.
        if best_corr is None or not best_corr >= 0.3:
            continue
        # Take the first match so that tied correlations do not raise.
        best_feature_column = candidates.filter(
            pl.col("total_cases") == best_corr)["columns"][0]
        best_feature_columns.append(best_feature_column)

    # limit the number of features used
    # NOTE(review): this keeps the first five in column order, not the five
    # most correlated -- confirm that is intended.
    best_feature_columns = best_feature_columns[:5]
    best_feature_columns.append("weekofyear")

    # train the model
    X = features_and_labels_train[best_feature_columns]
    y = features_and_labels_train["total_cases"]

    train_len = X.shape[0]
    if not generate_output:
        # Hold back the last quarter of the rows for evaluation.
        train_len = math.ceil(3/4 * X.shape[0])

    # pick best hyperparameters
    clf = HistGradientBoostingRegressor(random_state=42, categorical_features=["weekofyear"])
    best_params = get_best_hyperparameters(clf, X[:train_len], y[:train_len])
    clf.set_params(**best_params)
    clf.fit(X[:train_len], y[:train_len])

    if not generate_output:
        # Scored on rows the model was not fitted on, so label it accordingly.
        score = clf.score(X[train_len:], y[train_len:])
        print(f"Hold-out set score: {score}")

    # now make predictions on the test set
    features_test = preprocess_data(
        _load_city_rows("data/dengue_features_test.csv", city))

    # Case counts cannot be negative; clip before rounding to integers.
    y_pred = clf.predict(features_test[best_feature_columns]).clip(min=0)
    result = features_test.with_columns(
        pl.Series("total_cases", y_pred.round().astype(int)))

    return result.select('city', 'year', 'weekofyear', 'total_cases')

In [None]:
# Produce the forecast frame for each city code, "sj" first and then "iq".
sj_predictions, iq_predictions = [get_predictions(code) for code in ("sj", "iq")]

# Stack both cities into a single CSV only when output generation is enabled.
if generate_output:
    pl.concat(
        [sj_predictions, iq_predictions],
        how="vertical",
    ).write_csv("data/output.csv")