In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import polars as pl

# Read the raw train and test CSV files (paths are relative to the notebook's directory).
df_train, df_test = (
    pl.read_csv(csv_path)
    for csv_path in ("../data/raw/pond_train.csv", "../data/raw/pond_test.csv")
)

In [3]:
# Preview the first five training rows (includes the weight_a / weight_b targets).
df_train.head(n=5)


id,project_a,project_b,weight_a,weight_b,total_amount_usd,funder,quarter
i64,str,str,f64,f64,i64,str,str
715,"""https://github.com/mochajs/moc…","""https://github.com/chzyer/read…",0.961581,0.038419,2681,"""opencollective""","""2016-04"""
716,"""https://github.com/mochajs/moc…","""https://github.com/gulpjs/gulp""",0.992302,0.007698,2598,"""opencollective""","""2016-04"""
717,"""https://github.com/chzyer/read…","""https://github.com/gulpjs/gulp""",0.837398,0.162602,123,"""opencollective""","""2016-04"""
718,"""https://github.com/chzyer/read…","""https://github.com/gulpjs/gulp""",0.231595,0.768405,652,"""opencollective""","""2016-07"""
719,"""https://github.com/chzyer/read…","""https://github.com/mochajs/moc…",0.078851,0.921149,1915,"""opencollective""","""2016-07"""


In [4]:
# Preview the first five test rows (same columns as train, minus the weights).
df_test.head(n=5)

id,project_a,project_b,total_amount_usd,funder,quarter
i64,str,str,i64,str,str
20884,"""https://github.com/rollup/roll…","""https://github.com/webpack/web…",29097,"""opencollective""","""2024-01"""
20885,"""https://github.com/rollup/roll…","""https://github.com/babel/babel""",26093,"""opencollective""","""2024-01"""
20886,"""https://github.com/rollup/roll…","""https://github.com/sindresorhu…",4460,"""opencollective""","""2024-01"""
20887,"""https://github.com/rollup/roll…","""https://github.com/inikulin/pa…",2454,"""opencollective""","""2024-01"""
20888,"""https://github.com/rollup/roll…","""https://github.com/zloirock/co…",13229,"""opencollective""","""2024-01"""


In [5]:
def _repo_slug(column: str) -> pl.Expr:
    """Return the part of a URL column after ``.com/`` as a categorical.

    The expression keeps the input column's name.
    """
    return pl.col(column).str.split(".com/").list.last().cast(pl.Categorical)


def _organization(column: str, alias: str) -> pl.Expr:
    """Return the second-to-last ``/``-separated segment of a URL column.

    The result is cast to categorical and named ``alias``.
    """
    return pl.col(column).str.split("/").list.get(-2).cast(pl.Categorical).alias(alias)


# Project identifiers and the funder, all encoded as categoricals.
base_columns = [
    _repo_slug("project_a"),
    _repo_slug("project_b"),
    # total_amount_usd is deliberately left out: per the original author's note it
    # will not be known at prediction time (the test CSV previewed above does
    # contain the column, so revisit this if that assumption changes).
    pl.col("funder").cast(pl.Categorical),
]

# Owning organization of each project, taken from the URL path.
organization_columns = [
    _organization("project_a", alias="project_a_organization"),
    _organization("project_b", alias="project_b_organization"),
]

# "quarter" values look like "2016-04": split once on "-" and reuse both halves.
_quarter_parts = pl.col("quarter").str.split("-")

time_columns = [
    _quarter_parts.list.get(0).cast(pl.Int32).alias("year"),
    _quarter_parts.list.get(1).cast(pl.Int32).alias("month"),
    # The raw "YYYY-MM" label is also kept as a categorical feature.
    pl.col("quarter").cast(pl.Categorical).alias("year_quarter"),
]


In [6]:
# The cross-validation below uses TimeSeriesSplit, which slices rows purely by
# position. Make the chronological order explicit instead of relying on the row
# order of the CSV. In the rows previewed above, "quarter" is a zero-padded
# "YYYY-MM" string, so a lexicographic sort is chronological.
# maintain_order=True keeps the original relative order within each quarter.
df_train_sorted = df_train.sort("quarter", maintain_order=True)

# Feature matrix and target are built from the same sorted frame so rows stay aligned.
X = df_train_sorted.select(*base_columns, *organization_columns, *time_columns)
# The target stays a one-column DataFrame holding project_a's weight.
y = df_train_sorted.select("weight_a")

In [10]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import (
    GridSearchCV,
    TimeSeriesSplit,
)
from sklearn.pipeline import Pipeline

# Single-step pipeline wrapping the LightGBM regressor (seeded for reproducibility).
pipeline = Pipeline(
    [
        ("model", LGBMRegressor(random_state=42, verbose=-1)),
    ]
)

# TimeSeriesSplit trains each fold on a leading slice of the rows and validates on
# the rows that follow, so it only preserves temporal order when X / y are already
# sorted chronologically.
cv = TimeSeriesSplit(n_splits=5)

# Hyperparameter grid; keys follow the pipeline's "<step>__<param>" convention.
param_grid = {
    "model__n_estimators": [1000, 2000],
}

# Grid search with time-ordered cross-validation. This is a single (non-nested)
# CV loop: the same folds both select the hyperparameters and produce best_score_,
# so that score is a model-selection score rather than an independent estimate of
# generalisation error. refit=True retrains the best candidate on all of X / y.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring="neg_mean_squared_error",
    refit=True,
    verbose=1,
)

# Pass the target as a 1-D Series rather than a one-column DataFrame: estimators
# expect a 1-D y, and the column-vector warning the DataFrame form raises would
# otherwise be hidden by the global warnings filter.
grid_search.fit(X.to_pandas(), y.get_column("weight_a").to_pandas())

Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [12]:
# Report the winning grid point. The scorer is *negative* MSE, so flip the sign back.
best_mse = -grid_search.best_score_
print(f"Parameters: {grid_search.best_params_}", f"MSE: {best_mse}", sep="\n")

Parameters: {'model__n_estimators': 1000}
MSE: 0.07653590509101778


In [13]:
import altair as alt
import pandas as pd

# The refit pipeline's only step is the LightGBM model; look it up once.
fitted_model = grid_search.best_estimator_.named_steps["model"]
importances = fitted_model.feature_importances_
feature_names = fitted_model.feature_names_in_

# One row per feature; an ascending sort followed by tail(20) keeps the 20 largest.
importance_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
importance_df = importance_df.sort_values("Importance", ascending=True).tail(20)

# Horizontal bars, with features ordered by descending importance (sort="-x").
importance_axis = alt.X("Importance:Q", title="Feature Importance")
feature_axis = alt.Y("Feature:N", sort="-x", title="Features")

chart = (
    alt.Chart(importance_df)
    .mark_bar()
    .encode(x=importance_axis, y=feature_axis)
    .properties(title="Feature Importance Plot", width="container", height=400)
)

chart