In [7]:
import warnings

from pfc.data import get_hf_raw_dataframes

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by the
# libraries used below (e.g. about ignored or unknown arguments); consider
# narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

# Load the raw data; the helper returns a pair that is unpacked as
# (train, test). Schema is defined in `pfc.data` and not visible here.
df_train, df_test = get_hf_raw_dataframes()

In [2]:
from pfc.data import (
    add_github_projects_data,
    extract_interaction_features,
    extract_ratio_features,
    extract_temporal_features,
    mirror_projects,
)

def _build_feature_frame(raw_df, *, mirror=False, challenge="hf"):
    """Run the shared feature-engineering pipeline on one raw frame.

    Steps, in order: optional `mirror_projects`, `add_github_projects_data`,
    `extract_ratio_features`, `extract_interaction_features`,
    `extract_temporal_features`. Keeping them in a single helper guarantees
    that train and test go through exactly the same transformations.

    Args:
        raw_df: Raw frame as returned by `get_hf_raw_dataframes`.
        mirror: When True, apply `mirror_projects` first. Only the training
            frame uses this.
        challenge: Challenge identifier forwarded to
            `add_github_projects_data`.

    Returns:
        A new frame with all engineered feature columns; `raw_df` is not
        rebound.
    """
    frame = mirror_projects(raw_df) if mirror else raw_df
    frame = add_github_projects_data(frame, challenge)
    frame = extract_ratio_features(frame)
    frame = extract_interaction_features(frame)
    return extract_temporal_features(frame)


# Training rows are mirrored before feature extraction; test rows are not.
df_train_full = _build_feature_frame(df_train, mirror=True)
df_test_full = _build_feature_frame(df_test)


In [3]:
import polars as pl

# String-valued identifier columns. They are placed first in `features`, so
# their positions are exactly 0..len-1.
_categorical_features = [
    "project_a",
    "project_b",
    "organization",
    "organization_b",
    "language",
    "language_b",
]

# Per-repository fields; each one appears twice in `features`: once
# unsuffixed and once with a `_b` suffix.
_repo_fields = [
    "is_private",
    "has_homepage",
    "size",
    "stars",
    "watchers",
    "has_projects",
    "has_pages",
    "has_wiki",
    "has_discussions",
    "forks",
    "is_archived",
    "is_disabled",
    "open_issues",
    "network_count",
    "subscribers_count",
]

# Pairwise ratio columns (single column per pair).
_ratio_features = [
    "stars_ratio",
    "watchers_ratio",
    "forks_ratio",
    "size_ratio",
    "issues_ratio",
    "subscribers_count_ratio",
]

# Columns that come as adjacent (name, name_b) pairs.
_paired_features = [
    "stars_forks_interaction",
    "engagement_score",
    "stars_watchers_interaction",
    "stars_size_interaction",
    "days_since_update",
    "days_since_creation",
]

# Final model input columns. The order is significant: it fixes the column
# positions that `categorical_indices` refers to.
features = [
    *_categorical_features,
    *_repo_fields,
    *(f"{field}_b" for field in _repo_fields),
    *_ratio_features,
    *(name for stem in _paired_features for name in (stem, f"{stem}_b")),
]

# Positions of the categorical columns inside `features` (0 through 5).
categorical_indices = list(range(len(_categorical_features)))

# Select the model inputs, cast every string (Utf8) column to Categorical and
# hand the result to pandas for LightGBM / scikit-learn.
_train_features = df_train_full.select(features)
X = _train_features.with_columns(pl.col(pl.Utf8).cast(pl.Categorical)).to_pandas()

# Regression target.
y = df_train_full.get_column("weight_a").to_pandas()


In [4]:
import lightgbm as lgb
from lightgbm import LGBMRegressor

# Base LightGBM regressor handed to the grid search below.
#
# `callbacks` is intentionally NOT passed here. It is an argument of
# `LGBMRegressor.fit()`, not of the constructor: given to the constructor it
# ends up in `**kwargs` and is treated as an (unknown) booster parameter, so
# no early stopping ever took place. Early stopping also requires a
# validation set, i.e. something like
#     fit(X, y, eval_set=[(X_val, y_val)],
#         callbacks=[lgb.early_stopping(stopping_rounds=50)])
#
# NOTE(review): `categorical_feature` is likewise documented as a `fit()`
# argument; it is kept unchanged because the parameter grid sets it as well.
# TODO confirm it has the intended effect when supplied this way.
lgbm = LGBMRegressor(
    objective="regression",
    random_state=42,
    verbose=-1,
    categorical_feature=categorical_indices,
)

In [5]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid. Only a single candidate is active; the commented-out
# entries are previously considered search dimensions left disabled.
params_lgbm = dict(
    # num_leaves=[31, 26],
    # learning_rate=[0.1, 0.05, 0.2],
    # colsample_bytree=[0.5, 0.8, 1.0],
    # subsample=[0.5, 1.0],
    # reg_alpha=[0, 0.05],
    # reg_lambda=[0, 0.05],
    n_estimators=[100],
    random_state=[42],
    categorical_feature=[categorical_indices],
)

# 5-fold cross-validated search scored by negated MSE, run in one process.
# NOTE(review): the training frame went through `mirror_projects`; if that
# adds a mirrored copy of each pair, a pair and its mirror may land in
# different folds and make the CV score optimistic — confirm.
grid_search_lgbm = GridSearchCV(
    estimator=lgbm,  # type: ignore
    param_grid=params_lgbm,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=1,
    verbose=1,
)
grid_search_lgbm.fit(X, y)  # type: ignore

# Estimator refit with the best parameter combination.
lgbm_best = grid_search_lgbm.best_estimator_

# `best_score_` is negated MSE, so flip the sign when reporting.
# NOTE(review): "Parmas" is a typo for "Params"; left unchanged here so the
# printed text still matches the recorded output.
print(f"Best Parmas: {grid_search_lgbm.best_params_}")
print(f"Best score: {-grid_search_lgbm.best_score_}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parmas: {'categorical_feature': [0, 1, 2, 3, 4, 5], 'n_estimators': 100, 'random_state': 42}
Best score: 0.029654900192253864


In [6]:
# Pair each input column name seen by the fitted model with the model's
# `feature_importances_` value for it.
_importance_columns = {
    "feature": lgbm_best.feature_names_in_,  # type: ignore
    "importance": lgbm_best.feature_importances_,  # type: ignore
}

# Most important features first.
feature_importance = pl.DataFrame(_importance_columns).sort(
    "importance",
    descending=True,
)

# Last expression of the cell: renders the bar chart as the cell output.
feature_importance.plot.bar(x="feature", y="importance")

In [8]:
# Build the test matrix with the same column selection and
# string -> Categorical cast that was used for the training matrix `X`.
_test_features = df_test_full.select(features)
X_test = _test_features.with_columns(
    pl.col(pl.Utf8).cast(pl.Categorical)
).to_pandas()

# Predict with the best estimator found by the grid search.
test_predictions = lgbm_best.predict(X_test)  # type: ignore

In [10]:
from pfc.data import generate_submission

# Label for the submission: the cross-validated MSE (sign flipped back from
# the negated score), formatted to four decimals.
extra_data = f"mse-{(-grid_search_lgbm.best_score_):.4f}"

# Hand the test predictions to the project's submission helper.
generate_submission(
    test_predictions,
    challenge="hf",
    data=extra_data,
)