In [1]:
%matplotlib inline

import warnings
import dotenv
import mlflow
import logging
from utils import (
    test_mlflow_connection,
    mlflow_cross_validate,
)

# Tracking server and experiment used by every MLflow call in this notebook.
MLFLOW_ENDPOINT = "http://10.121.252.164:5001"
MLFLOW_EXPERIMENT_NAME = "datalab_cup01"

# Load variables from the local ".env" file.
dotenv.load_dotenv(dotenv_path=".env")
# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")
# Helper from utils; presumably checks that the tracking server is reachable
# before any run is started -- TODO confirm against utils.
test_mlflow_connection(MLFLOW_ENDPOINT)


def mlflow_setup():
    """Point MLflow at this notebook's tracking server and experiment.

    Raises the threshold of the "mlflow" logger to ERROR, selects
    ``MLFLOW_ENDPOINT`` as tracking URI and ``MLFLOW_EXPERIMENT_NAME`` as the
    active experiment, and enables autologging with dataset logging turned
    off and autolog messages silenced.
    """
    logging.getLogger("mlflow").setLevel(logging.ERROR)
    mlflow.set_tracking_uri(MLFLOW_ENDPOINT)
    mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
    mlflow.autolog(silent=True, log_datasets=False)


def cross_validate(
    clf,
    X,
    y,
    columns=None,
    n_folds=5,
    seed=42,
    verbose=1,
    n_jobs=-1,
    parent_run: mlflow.ActiveRun = None,
):
    """Call ``utils.mlflow_cross_validate`` with this notebook's MLflow setup.

    Every argument is forwarded unchanged; ``parent_run`` is passed on as
    ``mlflow_parent_run`` and the notebook's ``mlflow_setup`` function is
    supplied as the ``mlflow_setup`` callback.

    Args:
        clf: Estimator to evaluate.
        X: Feature matrix.
        y: Target values.
        columns: Forwarded as ``columns`` (presumably the feature names
            matching ``X`` -- confirm against ``utils``).
        n_folds: Forwarded as ``n_folds``.
        seed: Forwarded as ``seed``.
        verbose: Forwarded as ``verbose``.
        n_jobs: Forwarded as ``n_jobs``.
        parent_run: MLflow run forwarded as ``mlflow_parent_run``; may be
            ``None``.

    Returns:
        Whatever ``mlflow_cross_validate`` returns.
    """
    forwarded_options = dict(
        columns=columns,
        n_folds=n_folds,
        seed=seed,
        verbose=verbose,
        n_jobs=n_jobs,
        mlflow_parent_run=parent_run,
        mlflow_setup=mlflow_setup,
    )
    return mlflow_cross_validate(clf, X, y, **forwarded_options)


# Apply the MLflow configuration (tracking URI, experiment, autologging) in
# this kernel before any run is started.
mlflow_setup()

In [None]:
from features import Features
from utils.plotting import plot_correlations

# Build the feature extractor from the train / test parquet files.
features = Features(
    train_path="data/train.parquet",
    test_path="data/test.parquet",
    onehot_weekday=False,
    onehot_month=False,
    category_max_features=10000,
    category_train_min=3,
    category_test_min=2,
)
plot = True  # Toggle for the correlation plots in this and later cells.

features.extract_info()
X_info = features.X_info
y = features.y
df_test = features.df_test
X_test_info = features.X_test_info
print("X_info shape: ", X_info.shape)
print("X_test_info shape: ", X_test_info.shape)
if plot:
    plot_correlations(X_info, y)

# Impute missing values with the TRAINING column means for both splits.
# Filling the test frame with its own means would make the preprocessing of
# a test row depend on which other test rows are present, and would be
# inconsistent with the scaler later on, which is fitted on training data
# only.
train_means = X_info.mean()
X_info = X_info.fillna(train_means)
X_test_info = X_test_info.fillna(train_means)

In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Work on copies so the frames produced by the previous cell stay untouched.
X_info_filtered = X_info.copy()
X_test_info_filtered = X_test_info.copy()
columns = X_info_filtered.columns.tolist()

# Column-name patterns to exclude. A column is dropped as soon as one active
# pattern matches at the start of its name; commented-out entries are
# alternative filters kept for experimentation.
regex_filters = [
    # r"^datetime_",  # All datetime columns
    # r"^datetime_month_",
    # r"^datetime_weekday_",
    # r"^datetime_year",
    # r"^datetime_day",
    # r"^datetime_hour",
    r"^datetime_minute",  # Overfitting
    r"^datetime_second",  # Overfitting
    r"^sel_",  # All selectors
    # r"^sel_.+(?<!_token)_count$",
    # r"^sel_.+(?<!_non_stop)(?<!_unique)_token_count$",
    # r"^sel_.+(?<!_non_stop)_unique_token_count$",
    # r"^sel_.+_non_stop_token_count$",
    # r"^sel_.+_non_stop_unique_token_count$",
    # r"^sel_.+_pos$",
    # r"^sel_.+_neg$", # Very Bad
    # r"^sel_.+_neu$",
    # r"^sel_.+_subjectivity$",
    # r"^sel_.+_polarity$", # Very Good
    # r"^sel_.+_compound$", # Very Good
    # r"^sel_.+_readability$", # Very Bad
    # r"^sel_html_",
    # r"^sel_h1_",
    # r"^sel_h2_",
    # r"^sel_p_",
    # r"^sel_a_",
    # r"^sel_div_",
    # r"^sel_footer_a_",
    # r"^sel_section_",
    # r"^sel_instagram_",
    # r"^sel_twitter_",
    # r"^sel_img_",
    # r"^sel_iframe_",
    # r"^sel_video_",
    # r"^channel_",  # All channel columns
    # r"^category_",  # All category columns
]

# Keep a column only when none of the exclusion patterns matches it.
exclude_patterns = [re.compile(regex) for regex in regex_filters]
filtered_columns = [
    col
    for col in columns
    if not any(pattern.match(col) for pattern in exclude_patterns)
]

# Apply the same column selection to both splits.
X_info_filtered = X_info_filtered[filtered_columns]
X_test_info_filtered = X_test_info_filtered[filtered_columns]

# Optional step: keep only the category columns most correlated with y.
top_n_categories = False
top_n_categories_count = 50
if top_n_categories:
    # Remove all category columns. `columns` is the unfiltered column list, so
    # some of these may already have been removed by regex_filters; tolerate
    # missing labels instead of raising KeyError.
    category_columns = [col for col in columns if col.startswith("category_")]
    X_info_filtered = X_info_filtered.drop(
        columns=category_columns, errors="ignore"
    )
    X_test_info_filtered = X_test_info_filtered.drop(
        columns=category_columns, errors="ignore"
    )

    # Pair the target with the rows of X_info by position (this is how the
    # models consume X and y). corrwith aligns on index labels, so give the
    # target the same index as X_info.
    target = pd.Series(np.asarray(y), index=X_info.index)

    # Get top n category columns, ranked by absolute correlation with y
    top_category_columns = (
        X_info[category_columns]
        .corrwith(target)
        .sort_values(
            ascending=False,
            key=np.abs,
        )
        .head(top_n_categories_count)
        .index.tolist()
    )

    # Put top n category columns back
    X_info_filtered = pd.concat(
        [X_info_filtered, X_info[top_category_columns]],
        axis=1,
    )
    X_test_info_filtered = pd.concat(
        [X_test_info_filtered, X_test_info[top_category_columns]],
        axis=1,
    )

print("X_info_filtered shape: ", X_info_filtered.shape)
print("X_test_info_filtered shape: ", X_test_info_filtered.shape)
if plot:
    plot_correlations(X_info_filtered, y)

# Standardise the features: the scaler is fitted on the training matrix only
# and then applied to both the training and the test matrix.
columns = X_info_filtered.columns.tolist()
train_matrix = X_info_filtered.values
test_matrix = X_test_info_filtered.values
scaler = StandardScaler().fit(train_matrix)
X = scaler.transform(train_matrix)
X_test = scaler.transform(test_matrix)

## Training

In [4]:
import os
from datetime import datetime
import mlflow


def save_prediction(
    id, y_pred, name, mlflow_logging=True, output_dir="output/minimal"
):
    """Write predictions to a timestamped CSV file and optionally log it.

    The file is named ``<name>.<YYYYmmdd-HHMMSS>.csv`` and has two columns,
    ``Id`` and ``Popularity``.

    Args:
        id: Identifiers, one per prediction (written to the ``Id`` column).
        y_pred: Predicted values (written to the ``Popularity`` column).
        name: Prefix of the output file name.
        mlflow_logging: When true, the file is also logged as an MLflow
            artifact under ``predictions``.
        output_dir: Directory that receives the CSV file. Defaults to the
            location that was previously hard-coded.

    Returns:
        The path of the CSV file that was written.
    """
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    filepath = os.path.join(output_dir, "{}.{}.csv".format(name, timestamp))

    # os.makedirs("") raises, so only create a parent when there is one.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    df_pred = pd.DataFrame({"Id": id, "Popularity": y_pred})
    df_pred.to_csv(filepath, index=False)
    print("Saved prediction to ", filepath)
    if mlflow_logging:
        mlflow.log_artifact(filepath, "predictions")
    return filepath

### Start Training

#### CatBoost

In [5]:
import os

# Baseline CatBoost hyper-parameters used by the CatBoost cells below.
cb_params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    silent=True,
    thread_count=os.cpu_count(),
    random_seed=42,  # Ensures reproducibility
    n_estimators=200,
    learning_rate=0.01,
    depth=6,
    l2_leaf_reg=10.0,
)

In [None]:
%%script false --no-raise-error

import mlflow
from catboost import CatBoostClassifier

# Tags attached to the CatBoost cross-validation run.
catboost_model_tags = dict(
    model="CatBoostClassifier",
    tuning="false",
    final="false",
)

n_folds = 5
with mlflow.start_run(
    tags=catboost_model_tags, run_name="minimal-catboost-cv"
) as run:
    # Give each fold an equal share of the CPU cores (at least one thread),
    # presumably because the folds are evaluated in parallel.
    threads_per_fold = max(os.cpu_count() // n_folds, 1)
    _cb_params = dict(cb_params, thread_count=threads_per_fold)
    cb_clf = CatBoostClassifier(**_cb_params)
    cross_validate(
        cb_clf, X, y, columns=columns, n_folds=n_folds, parent_run=run
    )

In [7]:
%%script false --no-raise-error

import mlflow
import mlflow.catboost
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

# Tags for the final CatBoost run (trained on the full training set). They
# are defined in this cell so it does not rely on a name created elsewhere;
# the values mirror the other "final" runs in this notebook.
catboost_model_tags = {
    "model": "CatBoostClassifier",
    "tuning": "false",
    "final": "true",
}

with mlflow.start_run(
    run_name="minimal-catboost", tags=catboost_model_tags
) as run:
    print("Run ID: ", run.info.run_id)
    mlflow.log_params(cb_params)

    # Fit on the full training matrix and store the fitted model.
    cb_clf = CatBoostClassifier(**cb_params)
    cb_clf.fit(X, y)
    mlflow.catboost.log_model(cb_clf, "model")

    # Training-set AUC, computed from the positive-class probabilities.
    train_auc = roc_auc_score(y, cb_clf.predict_proba(X)[:, 1])
    mlflow.log_metric("train_auc", train_auc)
    print("AUC train: {:.5f}".format(train_auc))

    # Positive-class probabilities for the test set, written to CSV.
    y_test = cb_clf.predict_proba(X_test)[:, 1]
    save_prediction(df_test["Id"], y_test, "catboost")

#### XGBoost

In [8]:
# Baseline XGBoost hyper-parameters used by the XGBoost cells below.
xgb_params = dict(
    booster="gbtree",
    objective="binary:logistic",
    eval_metric="auc",
    n_jobs=-1,
    verbosity=0,
    random_state=42,
    n_estimators=300,
    max_depth=8,
    learning_rate=0.01,
    reg_lambda=20.0,
)

In [None]:
%%script false --no-raise-error

import os
import mlflow
from xgboost import XGBClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import roc_auc_score

# Tags attached to the XGBoost cross-validation run.
xgb_model_tags = dict(
    model="XGBClassifier",
    tuning="false",
    final="false",
)

n_folds = 5
with mlflow.start_run(
    tags=xgb_model_tags,
    run_name="minimal-xgboost-cv",
) as run:
    # Give each fold an equal share of the CPU cores (at least one job),
    # presumably because the folds are evaluated in parallel.
    jobs_per_fold = max(os.cpu_count() // n_folds, 1)
    _xgb_params = dict(xgb_params, n_jobs=jobs_per_fold)
    xgb_clf = XGBClassifier(**_xgb_params)
    cv_results = cross_validate(
        xgb_clf, X, y, columns=columns, n_folds=n_folds, parent_run=run
    )

# xgb_param_grids = {
#     "max_depth": [4, 6, 8],
#     "learning_rate": [0.005, 0.01, 0.1],
#     "n_estimators": [100, 200, 300],
# }
# xgb_model_tags = {
#     "model": "XGBClassifier",
#     "tuning": "true",
#     "final": "false",
# }

# n_folds = 5
# with mlflow.start_run(
#     run_name="minimal-xgboost-gridsearch-cv", tags=xgb_model_tags
# ) as run:
#     mlflow.log_params(xgb_param_grids)
#     param_grids = ParameterGrid(xgb_param_grids)
#     best_params = None
#     best_val_auc = 0
#     for params in param_grids:
#         with mlflow.start_run(
#             run_name="minimal-xgboost-child-cv",
#             nested=True,
#             tags=xgb_model_tags,
#         ) as nested_run:
#             _xgb_params = {**xgb_params, **params}
#             mlflow.log_params(_xgb_params)
#             xgb_clf = XGBClassifier(**_xgb_params)
#             cv_results = cross_validate(
#                 xgb_clf,
#                 X,
#                 y,
#                 n_folds=n_folds,
#                 n_jobs=n_folds,
#                 verbose=0,
#                 parent_run=nested_run,
#             )
#             train_auc = cv_results["train_auc"].mean()
#             val_auc = cv_results["val_auc"].mean()
#             if val_auc > best_val_auc:
#                 best_val_auc = val_auc
#                 best_params = _xgb_params

#     print("Best params: ", best_params)

In [None]:
%%script false --no-raise-error

import mlflow
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# Tags for the final XGBoost run (trained on the full training set).
xgb_model_tags = {
    "model": "XGBClassifier",
    "tuning": "false",
    "final": "true",
}

with mlflow.start_run(
    run_name="minimal-xgboost",
    tags=xgb_model_tags,
) as run:
    print("Run ID: ", run.info.run_id)
    mlflow.log_param("columns", columns)
    mlflow.log_params(xgb_params)

    # Fit on the full training matrix.
    xgb_clf = XGBClassifier(**xgb_params)
    xgb_clf.fit(X, y)

    # Training-set AUC, computed from the positive-class probabilities.
    train_auc = roc_auc_score(y, xgb_clf.predict_proba(X)[:, 1])
    mlflow.log_metric("train_auc", train_auc)
    print("AUC train: {:.5f}".format(train_auc))

    # Positive-class probabilities for the test set, written to CSV.
    y_test = xgb_clf.predict_proba(X_test)[:, 1]
    save_prediction(df_test["Id"], y_test, "xgboost")

#### LightGBM

In [19]:
# Baseline LightGBM hyper-parameters used by the LightGBM cells below.
lgbm_params = dict(
    boosting_type="gbdt",  # Gradient boosting decision tree
    objective="binary",  # Binary classification
    metric="auc",  # Use AUC for evaluation
    verbosity=-1,  # Suppress excessive logs
    n_jobs=6,
    random_state=42,
    num_leaves=31,  # Maximum number of leaves in one tree
    learning_rate=0.01,
    n_estimators=500,
    max_depth=8,
    reg_lambda=10.0,  # Equivalent to l2 regularization
)

In [None]:
%%script false --no-raise-error

import mlflow
from lightgbm import LGBMClassifier
from sklearn.model_selection import ParameterGrid

# Tags attached to the LightGBM cross-validation run.
lgbm_model_tags = dict(
    model="LGBMClassifier",
    tuning="false",
    final="false",
)

n_folds = 5
with mlflow.start_run(
    tags=lgbm_model_tags, run_name="minimal-lightgbm-cv"
) as run:
    # Give each fold an equal share of the CPU cores (at least one job),
    # presumably because the folds are evaluated in parallel.
    jobs_per_fold = max(os.cpu_count() // n_folds, 1)
    _lgbm_params = dict(lgbm_params, n_jobs=jobs_per_fold)
    lgbm_clf = LGBMClassifier(**_lgbm_params)
    cv_results = cross_validate(
        lgbm_clf, X, y, columns=columns, n_folds=n_folds, parent_run=run
    )

# lgbm_param_grids = {
#     "n_estimators": [200, 500],
#     "num_leaves": [31, 63],
#     "max_depth": [6, 8, 10],
#     "learning_rate": [0.001, 0.01, 0.1],
#     "reg_lambda": [1.0, 10.0, 20.0],
# }
# lgbm_model_tags = {
#     "model": "LGBMClassifier",
#     "tuning": "true",
#     "final": "false",
# }

# n_folds = 5
# with mlflow.start_run(
#     run_name="minimal-lightgbm-gridsearch-cv", tags=lgbm_model_tags
# ) as run:
#     mlflow.log_params(lgbm_param_grids)
#     param_grids = ParameterGrid(lgbm_param_grids)
#     assert len(param_grids) < 50
#     best_params = None
#     best_val_auc = 0
#     for params in param_grids:
#         if params["max_depth"] ** 2 < params["num_leaves"]:
#             continue
#         with mlflow.start_run(
#             run_name="minimal-lightgbm-child-cv",
#             nested=True,
#             tags=lgbm_model_tags,
#         ) as nested_run:
#             _lgbm_params = {**lgbm_params, **params}
#             mlflow.log_params(_lgbm_params)
#             lgbm_clf = LGBMClassifier(**_lgbm_params)
#             cv_results = cross_validate(
#                 lgbm_clf,
#                 X,
#                 y,
#                 columns=columns,
#                 n_folds=n_folds,
#                 n_jobs=n_folds,
#                 verbose=0,
#                 parent_run=nested_run,
#             )
#             train_auc = cv_results["train_auc"].mean()
#             val_auc = cv_results["val_auc"].mean()
#             if val_auc > best_val_auc:
#                 best_val_auc = val_auc
#                 best_params = _lgbm_params

#     print("Best params: ", best_params)

In [None]:
%%script false --no-raise-error

import mlflow
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

# Tags for the final LightGBM run (trained on the full training set).
lgbm_model_tags = dict(
    model="LGBMClassifier",
    tuning="false",
    final="true",
)
with mlflow.start_run(tags=lgbm_model_tags, run_name="minimal-lightgbm") as run:
    print("Run ID: ", run.info.run_id)

    # Fit on the full training matrix.
    lgbm_clf = LGBMClassifier(**lgbm_params)
    lgbm_clf.fit(X, y)

    # Training-set AUC, computed from the positive-class probabilities.
    train_proba = lgbm_clf.predict_proba(X)[:, 1]
    train_auc = roc_auc_score(y, train_proba)
    mlflow.log_metric("train_auc", train_auc)
    print("AUC train: {:.5f}".format(train_auc))

    # Positive-class probabilities for the test set, written to CSV.
    y_test = lgbm_clf.predict_proba(X_test)[:, 1]
    save_prediction(df_test["Id"], y_test, "lightgbm")

#### Voting

In [14]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Hyper-parameters for the members of the voting ensemble.
# NOTE(review): these assignments rebind cb_params / xgb_params / lgbm_params
# that earlier cells also define, so the single-model cells see these values
# if they are re-run after this cell.
cb_params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    silent=True,
    thread_count=6,
    n_estimators=1000,
    learning_rate=0.01,
    # "depth": 6,
    # "l2_leaf_reg": 10.0,
    # "random_seed": 42,  # Ensures reproducibility
)
xgb_params = dict(
    booster="gbtree",
    objective="binary:logistic",
    eval_metric="auc",
    verbosity=0,
    n_jobs=6,
    # "random_state": 42,
    n_estimators=300,
    max_depth=8,
    learning_rate=0.01,
    reg_lambda=20.0,
)
lgbm_params = dict(
    boosting_type="gbdt",  # Gradient boosting decision tree
    objective="binary",  # Binary classification
    metric="auc",  # Use AUC for evaluation
    verbosity=-1,  # Suppress excessive logs
    n_jobs=6,
    # "random_state": 42,
    num_leaves=31,  # Maximum number of leaves in one tree
    learning_rate=0.01,
    n_estimators=500,
    max_depth=8,
    reg_lambda=10.0,  # Equivalent to l2 regularization
)

# Ensemble members: two XGBoost and three LightGBM classifiers, each built
# from the parameter dicts above.
estimators = [
    (f"xgboost{i}", XGBClassifier(**xgb_params)) for i in range(2)
]
estimators += [
    (f"lightgbm{i}", LGBMClassifier(**lgbm_params)) for i in range(3)
]
# Alternative line-ups, kept for reference:
# estimators.append(("catboost", CatBoostClassifier(**cb_params)))
# estimators.append(("xgboost", XGBClassifier(**xgb_params)))
# estimators.append(("lightgbm", LGBMClassifier(**lgbm_params)))

# Constructor arguments for the soft-voting classifier (nothing is trained
# in this cell).
voting_params = dict(
    estimators=estimators,
    voting="soft",
    n_jobs=-1,
)

In [15]:
%%script false --no-raise-error

import mlflow
from sklearn.ensemble import VotingClassifier

# Tags attached to the voting-ensemble cross-validation run.
voting_model_tags = dict(
    model="VotingClassifier",
    tuning="false",
    final="false",
)

n_folds = 5
with mlflow.start_run(
    tags=voting_model_tags, run_name="minimal-voting-cv"
) as run:
    # Give each fold an equal share of the CPU cores (at least one job),
    # presumably because the folds are evaluated in parallel.
    jobs_per_fold = max(os.cpu_count() // n_folds, 1)
    _voting_params = dict(voting_params, n_jobs=jobs_per_fold)
    voting_clf = VotingClassifier(**_voting_params)
    cv_results = cross_validate(
        voting_clf, X, y, n_folds=n_folds, parent_run=run
    )

In [16]:
%%script false --no-raise-error

import mlflow
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score

# Tags for the final voting-ensemble run (trained on the full training set).
voting_model_tags = {
    "model": "VotingClassifier",
    "tuning": "false",
    "final": "true",
}

with mlflow.start_run(
    run_name="minimal-voting", tags=voting_model_tags
) as run:
    # Build a fresh ensemble from voting_params rather than reusing an
    # instance left behind by the cross-validation cell: that cell may not
    # have been run, and its instance carries the per-fold n_jobs setting.
    voting_clf = VotingClassifier(**voting_params)
    voting_clf.fit(X, y)

    # Training-set AUC, computed from the positive-class probabilities.
    train_auc = roc_auc_score(y, voting_clf.predict_proba(X)[:, 1])
    mlflow.log_metric("train_auc", train_auc)
    print("AUC train: {:.5f}".format(train_auc))
    print("Run ID: ", run.info.run_id)

    # Positive-class probabilities for the test set, written to CSV.
    y_test = voting_clf.predict_proba(X_test)[:, 1]
    save_prediction(df_test["Id"], y_test, "voting")