Compute metrics for Novartis Datathon 2024.
   This auxiliary file is intended for participants who want to test
   the metric with their own train/validation splits.

In [None]:
import pandas as pd
from pathlib import Path
from typing import Tuple
from xgboost import XGBRegressor
import joblib

In [None]:
def _CYME(df: pd.DataFrame) -> float:
    """Return the CYME score: 1/2 * (median yearly error + median monthly error).

    Both errors are absolute relative errors ``|target - prediction| / target``.
    The "yearly" error is evaluated on the per-``cluster_nl`` sums of ``target``
    and ``prediction``; the "monthly" error is evaluated row by row.

    :param df: Dataframe with columns 'cluster_nl', 'target' and 'prediction'.
    :return: Average of the two median errors.
    """
    # Aggregate each cluster before measuring the relative error of its totals.
    cluster_totals = df.groupby("cluster_nl")[["target", "prediction"]].sum()
    cluster_gap = cluster_totals["target"] - cluster_totals["prediction"]
    median_yearly = (cluster_gap / cluster_totals["target"]).abs().median()

    # Row-level relative error, no aggregation.
    row_gap = df["target"] - df["prediction"]
    median_monthly = (row_gap / df["target"]).abs().median()

    return 0.5 * (median_yearly + median_monthly)


def _metric(df: pd.DataFrame) -> float:
    """Compute metric of submission.

    Rows are split by the ``zero_actuals`` flag (1 vs. 0). Each group gets its
    own CYME, weighted by the share of rows flagged 1 (and its complement);
    the CYME of the ``zero_actuals == 1`` group is capped at 1.

    :param df: Dataframe with 'target', 'prediction', 'date', 'zero_actuals'
        and the 'cluster_nl' identifier.
    :return: Performance metric, rounded to 8 decimals.
    :raises ZeroDivisionError: If ``df`` has no rows.
    """
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])

    # Split 0 actuals - rest
    zeros = df[df["zero_actuals"] == 1]
    recent = df[df["zero_actuals"] == 0]

    # weight for each group
    zeros_weight = len(zeros) / len(df)
    recent_weight = 1 - zeros_weight

    # A group with zero weight must contribute nothing. Its CYME is NaN (the
    # median of an empty group), and 0 * NaN would turn the whole score into
    # NaN, e.g. when every row has zero_actuals == 1.
    score = 0.0
    if recent_weight > 0:
        score += recent_weight * _CYME(recent)
    if zeros_weight > 0:
        # Keep `1` as the first argument: min(1, nan) evaluates to 1, so an
        # undefined CYME in this group is treated as the capped worst case.
        score += zeros_weight * min(1, _CYME(zeros))

    return round(score, 8)


def compute_metric(submission: pd.DataFrame) -> float:
    """Compute metric.

    The input dataframe is left untouched: the required columns are selected
    into a new frame and the date parsing is applied to that frame only.

    :param submission: Prediction. Requires columns:
        ['cluster_nl', 'date', 'target', 'prediction', 'zero_actuals']
    :return: Performance metric.
    :raises KeyError: If any of the required columns is missing.
    """
    required_columns = ['cluster_nl', 'date', 'target', 'prediction', 'zero_actuals']

    # Column selection returns a new frame and `assign` returns another one,
    # so the caller's dataframe is never modified.
    scored = submission[required_columns].assign(
        date=lambda frame: pd.to_datetime(frame["date"])
    )

    return _metric(scored)

## Load data

In [None]:
# Root folder that the dataset and submission files are resolved against.
PATH = Path(".")
dataset_name = 'xgb_regressor'
model_name = 'xgb_regressor'
# Build the file name before joining: `/` binds tighter than `+`, so
# `PATH / 'train_datasets' / dataset_name + '.csv'` would add a str to a Path
# and raise TypeError.
training_dataset = pd.read_csv(PATH / 'train_datasets' / f'{dataset_name}.csv')

In [None]:
# First date of the validation period; everything before it is used for training.
# NOTE(review): assumes 'date' compares chronologically against this string
# (e.g. ISO 'YYYY-MM-DD' strings) — confirm against the dataset.
validation_start = '2022-01-01'

# Take explicit copies: new columns are added to `validation_set` later on, and
# assigning into a filtered slice of `training_dataset` triggers pandas'
# SettingWithCopyWarning.
train_val_set = training_dataset[training_dataset['date'] < validation_start].copy()
validation_set = training_dataset[training_dataset['date'] >= validation_start].copy()

## Split the `Recent` items from the `Future` items

In [None]:
# Name of the column that identifies an item.
item_id_column = 'cluster_nl'
# Reference data whose item ids are compared against the validation set.
train_data = pd.read_csv('data/train_data.csv')

In [None]:
# Distinct item ids present in `train_data`.
submission_ids = train_data[item_id_column].unique()
# `assign` returns a new dataframe, so the flag is not written into a filtered
# slice of `training_dataset` (which raises SettingWithCopyWarning).
validation_set = validation_set.assign(
    item_in_submission=validation_set[item_id_column].isin(submission_ids)
)

In [None]:
# Boolean flag per validation row: True when the item id appears in `train_data`.
is_known_product = validation_set['item_in_submission']

# Rows whose item is flagged vs. rows whose item is not.
validation_known_products = validation_set.loc[is_known_product]
validation_new_products = validation_set.loc[~is_known_product]

#### THIS DOESN'T WORK IF THE RECENTS AND FUTURES USE DIFFERENT VARIABLES!

## Train the model

In [None]:
# Columns that are not model features: the date and the label itself.
non_feature_columns = ['date', 'target']

# `columns=` is required: a bare `drop([...])` looks the names up in the row
# index and raises KeyError.
X_train_val = train_val_set.drop(columns=non_feature_columns)
y_train_val = train_val_set['target']

# `item_in_submission` is a bookkeeping flag that only the validation set
# carries; remove it (when present) so both feature matrices share the same
# columns.
X_val = (
    validation_set
    .drop(columns=non_feature_columns)
    .drop(columns=['item_in_submission'], errors='ignore')
)
y_val = validation_set['target']

Copy the model-training code here, or import the model / training function:

In [None]:
# Load a previously saved estimator from the file named by `model_name`.
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load
# model files you trust.
model = joblib.load(model_name)
# Fit the loaded estimator on the training features and targets.
model.fit(X_train_val, y_train_val)

## Perform Validation

In [None]:
# Predict from the validation FEATURES (the original passed the targets,
# `y_val`, to `predict`). Selecting `X_train_val.columns` guarantees the model
# sees exactly the columns, in the same order, that it was fitted on.
validation_set['prediction'] = model.predict(X_val[X_train_val.columns])

In [None]:
# Placeholder: replace `...` with the real per-row flag before scoring.
# `_metric` weights the rows with zero_actuals == 1 and zero_actuals == 0 as
# two separate groups, so this column must hold those values.
validation_set["zero_actuals"] = ...

## Check Performance

In [None]:
# Score the validation predictions and report the result.
performance = compute_metric(validation_set)
print("Performance:", performance)

## Prepare submission

In [None]:
# Hint: if the rows to predict on are needed for `model.predict`, load them too
# (presumably a CSV, given the file extension — confirm), e.g.
# submission_data = pd.read_csv(PATH / "submission_data.csv")
template_path = PATH / "submission_template.csv"
submission = pd.read_csv(template_path)

In [None]:
# Placeholder: replace `...` with the model's predictions for the submission
# rows, e.g. model.predict(submission_data[features]).
submission["prediction"] = ...

In [None]:
# Folder and attempt label used to name the submission file.
SAVE_PATH = Path("submissions")
ATTEMPT = "attempt_1"
# `to_csv` does not create missing folders, so make sure the target exists.
SAVE_PATH.mkdir(parents=True, exist_ok=True)
submission.to_csv(SAVE_PATH / f"submission_{ATTEMPT}.csv", sep=",", index=False)