# Imports

In [None]:
import os
import random

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline


# Loading data

In [None]:
# Directory that contains every input CSV (relative to the working directory).
DATA_DIR = "../data"

# File names, resolved against DATA_DIR by load_data().
# Granular train / test features and their labels (the label files expose a "click" column).
X_TRAIN_FN = "X_train.csv"
Y_TRAIN_FN = "y_train.csv"
X_TEST_FN = "X_test.csv"
Y_TEST_FN = "y_test.csv"
# Additional granular data, used below to study the effect of the number of granular samples.
LARGE_TEST_FN = "criteo-ppml-challenge-adkdd21-dataset-additional-test-data.csv"
# Aggregated tables for single features and for pairs of features.
SINGLE_AGG_FN = "aggregated_singles.csv"
DOUBLE_AGG_FN = "aggregated_pairs.csv"


def load_data():
    """
    Read all input CSV files from ``DATA_DIR``.

    Returns a 7-tuple ``(X, y, X_test, y_test, df_single_agg, df_double_agg, df_large_test)``:
    the train features, the train ``click`` column, the test features, the test ``click``
    column, the single-feature aggregates, the pair aggregates and the additional granular data.
    In both aggregate tables the columns ``click`` and ``c`` are renamed to ``nb_clicks``
    and ``count``.
    """

    def _in_data_dir(filename):
        return os.path.join(DATA_DIR, filename)

    # Granular data: features as full frames, labels as the "click" column only.
    X = pd.read_csv(_in_data_dir(X_TRAIN_FN))
    y = pd.read_csv(_in_data_dir(Y_TRAIN_FN)).click
    X_test = pd.read_csv(_in_data_dir(X_TEST_FN))
    y_test = pd.read_csv(_in_data_dir(Y_TEST_FN)).click

    # Aggregated data: the first CSV column is used as the index.
    df_single_agg = pd.read_csv(_in_data_dir(SINGLE_AGG_FN), index_col=0)
    df_double_agg = pd.read_csv(_in_data_dir(DOUBLE_AGG_FN), index_col=0)
    df_large_test = pd.read_csv(_in_data_dir(LARGE_TEST_FN))

    new_names = {"click": "nb_clicks", "c": "count"}
    df_single_agg = df_single_agg.rename(columns=new_names)
    df_double_agg = df_double_agg.rename(columns=new_names)

    return X, y, X_test, y_test, df_single_agg, df_double_agg, df_large_test


# Load every dataset once; the cells below read these module-level objects.
X, y, X_test, y_test, df_single_agg, df_double_agg, df_large_test = load_data()


# Utils

In [None]:
# File that receives one line per logged message (appended to, never truncated).
LOGFILE = "results_agg_gbt_clicks.log"


def print_and_log(x):
    """
    Print ``x`` and append it, followed by a newline, to ``LOGFILE``.

    ``x`` may be any object: it is converted with ``str`` once so that the console and
    the log file always receive the same text (previously non-string values were printed
    but then raised ``TypeError`` when concatenated with the newline).
    """
    message = str(x)
    print(message)
    # Append-only access is enough; the encoding is explicit so the log does not depend
    # on the platform default.
    with open(LOGFILE, "a", encoding="utf-8") as handle:
        handle.write(message + "\n")


# Single source of truth for the seed used by this notebook.
SEED = 2022


def set_seed(seed):
    """
    Seed the random number generators for reproducible results.

    Seeds NumPy's global RNG and the standard-library ``random`` module, and exports
    ``PYTHONHASHSEED`` with the same value.
    """
    np.random.seed(seed)
    random.seed(seed)
    # Hash randomization is fixed when an interpreter starts, so this only affects
    # Python processes launched after this point, not the running one.
    os.environ["PYTHONHASHSEED"] = str(seed)


# Use the constant rather than a repeated literal so both cannot drift apart.
set_seed(SEED)


# Transformers

In [None]:
class AggClickEncoder:
    """
    Beta Target Encoding.
    Encode single and double features using a smoothed version of the observed CTR.

    For every modality of a feature (and every pair of modalities of two features
    ``i < j``) the aggregated tables provide a display ``count`` and a click count
    ``nb_clicks``. The encoded CTR is the posterior mean

        (nb_clicks + prior_weight * prior) / (count + prior_weight)

    where ``prior`` is the global CTR computed over the modalities of feature 0.
    Modalities with ``count <= 0`` (possible once noise is added) and modalities that
    are absent from the aggregated tables are encoded as CTR ``nan`` and count ``0``.

    Parameters
    ----------
    single_agg_data : DataFrame
        Columns ``feature_1_id``, ``feature_1_value``, ``count``, ``nb_clicks``.
    double_agg_data : DataFrame
        Columns ``feature_1_id``, ``feature_2_id``, ``feature_1_value``,
        ``feature_2_value``, ``count``, ``nb_clicks``.
    prior_weight : float
        Pseudo-count given to the prior; larger values shrink more towards ``prior``.
    n_features : int
        Number of hashed features. The encoded frames must contain the columns
        ``hash_0`` ... ``hash_{n_features - 1}``.

    Notes
    -----
    The aggregated tables are kept on the instance after fitting, so ``fit`` can be
    called more than once. ``transform`` returns a new DataFrame and leaves its input
    untouched.
    """

    def __init__(self, single_agg_data, double_agg_data, prior_weight=100, n_features=19):
        self.single_agg_data = single_agg_data
        self.double_agg_data = double_agg_data
        self.prior_weight = prior_weight
        self.n_features = n_features
        self.prior = 0
        self.single_agg_posteriors = dict()
        self.single_agg_count = dict()
        self.double_agg_posteriors = dict()
        self.double_agg_count = dict()

    def _posterior(self, clicks, count, prior):
        """Return the smoothed CTR and the usable count for one aggregated row."""
        if count <= 0:
            # A non-positive count carries no usable information.
            return np.nan, 0
        weight = self.prior_weight
        return (clicks + weight * prior) / (count + weight), count

    def _fit(self):
        """
        Build the lookup tables from the aggregated data.

        Returns ``(single_agg_posteriors, single_agg_count, double_agg_posteriors,
        double_agg_count)``. The single tables map ``feature_id -> {value: number}``,
        the double tables map ``(id_1, id_2) -> {(value_1, value_2): number}``.
        Also stores the global prior CTR in ``self.prior``.
        """
        n = self.n_features
        pair_ids = [(i, j) for i in range(n) for j in range(i + 1, n)]
        single_agg_posteriors = {i: {} for i in range(n)}
        single_agg_count = {i: {} for i in range(n)}
        double_agg_posteriors = {pair: {} for pair in pair_ids}
        double_agg_count = {pair: {} for pair in pair_ids}

        # Compute prior CTR over feature 0
        single = self.single_agg_data
        feature_0 = single[single.feature_1_id == 0]
        prior = feature_0["nb_clicks"].sum() / feature_0["count"].sum()
        self.prior = prior

        # Compute posterior CTR per single & double feature modality, using a beta prior
        for feature_id, value, count, clicks in zip(
            single["feature_1_id"].values,
            single["feature_1_value"].values,
            single["count"].values,
            single["nb_clicks"].values,
        ):
            posterior, kept_count = self._posterior(clicks, count, prior)
            single_agg_posteriors[feature_id][value] = posterior
            single_agg_count[feature_id][value] = kept_count

        double = self.double_agg_data
        for id_1, id_2, value_1, value_2, count, clicks in zip(
            double["feature_1_id"].values,
            double["feature_2_id"].values,
            double["feature_1_value"].values,
            double["feature_2_value"].values,
            double["count"].values,
            double["nb_clicks"].values,
        ):
            posterior, kept_count = self._posterior(clicks, count, prior)
            double_agg_posteriors[(id_1, id_2)][(value_1, value_2)] = posterior
            double_agg_count[(id_1, id_2)][(value_1, value_2)] = kept_count

        return (
            single_agg_posteriors,
            single_agg_count,
            double_agg_posteriors,
            double_agg_count,
        )

    def fit(self, X, y=None):
        """Fit the lookup tables. ``X`` and ``y`` are ignored; returns ``self``."""
        print("Fitting aggregated click encoder")
        (
            self.single_agg_posteriors,
            self.single_agg_count,
            self.double_agg_posteriors,
            self.double_agg_count,
        ) = self._fit()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the count / CTR columns appended."""
        return self._encode(X)

    @staticmethod
    def _lookup(table, keys, default):
        """Map every key through ``table`` as float32, using ``default`` for unseen keys."""
        return np.fromiter(
            (table.get(key, default) for key in keys),
            dtype=np.float32,
            count=len(keys),
        )

    def _encode(self, X):
        """
        Build ``feature_{i}_count`` / ``feature_{i}_ctr`` for every feature and
        ``double_feature_{i}_{j}_count`` / ``double_feature_{i}_{j}_ctr`` for every
        pair ``i < j``, and return them appended to the columns of ``X``.
        """
        n = self.n_features
        hashes = [X[f"hash_{i}"].values for i in range(n)]
        encoded = {}

        for i in range(n):
            # A missing table (e.g. encoder not fitted) behaves like "every modality unseen".
            encoded[f"feature_{i}_count"] = self._lookup(self.single_agg_count.get(i, {}), hashes[i], 0)
            encoded[f"feature_{i}_ctr"] = self._lookup(self.single_agg_posteriors.get(i, {}), hashes[i], np.nan)

        for i in range(n - 1):
            for j in range(i + 1, n):
                pairs = list(zip(hashes[i], hashes[j]))
                encoded[f"double_feature_{i}_{j}_count"] = self._lookup(
                    self.double_agg_count.get((i, j), {}), pairs, 0
                )
                encoded[f"double_feature_{i}_{j}_ctr"] = self._lookup(
                    self.double_agg_posteriors.get((i, j), {}), pairs, np.nan
                )

        # Assemble all new columns at once: this avoids mutating the caller's frame and
        # avoids inserting hundreds of columns one by one.
        encoded_frame = pd.DataFrame(encoded, index=X.index)
        already_present = [name for name in encoded_frame.columns if name in X.columns]
        base = X.drop(columns=already_present) if already_present else X
        return pd.concat([base, encoded_frame], axis=1)

    def fit_transform(self, X, y=None):
        """Fit the encoder, then encode ``X``."""
        return self.fit(X, y).transform(X)


class ColumnsSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that keeps a fixed list of columns.

    ``fit`` records the columns of the fitted frame minus ``columns_to_drop``;
    ``transform`` returns only those columns. With ``validate=True`` a frame that lacks
    any recorded column raises ``ValueError``; otherwise missing columns are skipped.
    """

    def __init__(self, columns_to_drop=None, validate=False):
        self.columns_to_drop = columns_to_drop
        self.validate = validate
        self.columns = None

    def fit(self, X, y=None):
        """Remember which columns of ``X`` to keep. ``y`` is ignored; returns ``self``."""
        kept = list(X.columns)
        if self.columns_to_drop is not None:
            kept = [name for name in X.columns if name not in self.columns_to_drop]
        self.columns = kept
        return self

    def transform(self, X):
        """Return ``X`` restricted to the recorded columns that it contains."""
        present = []
        absent = set()
        for name in self.columns:
            if name in X.columns:
                present.append(name)
            else:
                absent.add(name)
        if self.validate and absent:
            missing = absent
            raise ValueError(f"Missing columns: {missing}")
        return X[present]


# Metrics

In [None]:
def LLH(prediction, y):
    """
    Mean Bernoulli log-likelihood of the labels ``y`` under ``prediction``.

    Both arguments may be any array-like (list, NumPy array, pandas Series) of the same
    length; they are converted to NumPy arrays so the reduction runs vectorized.
    Predictions equal to exactly 0 or 1 are not clipped, so the result can be ``-inf``
    or ``nan`` in that case; an empty input yields ``nan``.
    """
    prediction = np.asarray(prediction, dtype=float)
    y = np.asarray(y, dtype=float)
    llh = np.log(prediction) * y + np.log(1 - prediction) * (1 - y)
    return np.mean(llh)


def Entropy(y):
    """
    Mean log-likelihood of ``y`` under the constant predictor equal to the positive rate.

    With ``py`` the fraction of entries of ``y`` that are ``> 0``, returns
    ``py * log(py) + (1 - py) * log(1 - py)``, i.e. the *negative* entropy of the
    labels (a value ``<= 0``). ``y`` may be any array-like. When every label belongs to
    the same class the expression evaluates to ``nan``.
    """
    y = np.asarray(y)
    py = np.mean(y > 0)
    return py * np.log(py) + (1 - py) * np.log(1 - py)


def Nllh(prediction, y):
    """
    Normalized log-likelihood of ``prediction`` against the labels ``y``.

    Computes ``(h - llh) / h`` where ``h = Entropy(y)`` and ``llh = LLH(prediction, y)``.
    Since both terms are mean log-likelihoods, the score is 0 for a model no better than
    predicting the positive rate and grows as the model improves (higher is better).
    Returns ``nan`` when any prediction lies outside ``[0, 1]``.

    Both arguments may be any array-like; they are converted to NumPy arrays up front.
    """
    prediction = np.asarray(prediction, dtype=float)
    y = np.asarray(y)
    if np.any((prediction < 0) | (prediction > 1)):
        return np.nan
    h = Entropy(y)
    llh = LLH(prediction, y)
    return (h - llh) / h


# Training code

In [None]:
def train(X_train, X_valid, y_train, y_valid, lgb_params):
    """
    Train a LightGBM model in two passes.

    1. Train on ``(X_train, y_train)`` for up to 50,000 rounds, stopping early when the
       validation metric on ``(X_valid, y_valid)`` has not improved for 100 rounds.
    2. Retrain from scratch on train + validation data for the number of rounds found
       in step 1, and return that model.

    Early stopping and periodic logging are configured through callbacks: the
    ``early_stopping_rounds`` / ``verbose_eval`` keyword arguments of ``lgb.train`` were
    deprecated in LightGBM 3.3 and removed in 4.0.
    """
    train_dataset = lgb.Dataset(X_train, y_train)
    valid_dataset = lgb.Dataset(X_valid, y_valid)
    model = lgb.train(
        params=lgb_params,
        train_set=train_dataset,
        valid_sets=[valid_dataset],
        num_boost_round=50_000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            # Log the validation metric every 200 rounds.
            lgb.log_evaluation(period=200),
        ],
    )

    best_iter = model.best_iteration
    # Retrain on all the data
    train_dataset = lgb.Dataset(
        pd.concat([X_train, X_valid], axis=0, ignore_index=True),
        pd.concat([y_train, y_valid], axis=0, ignore_index=True),
    )
    model = lgb.train(params=lgb_params, train_set=train_dataset, num_boost_round=best_iter)
    return model


def run(X, y, X_test, y_test, df_single_agg, df_double_agg, gaussian_sigma=17, prior_weight=200):
    """
    Run one full experiment and return the normalized log-likelihood on the test set.

    Steps: add Gaussian noise of standard deviation ``gaussian_sigma`` to the ``count``
    and ``nb_clicks`` columns of copies of both aggregate tables, encode ``X`` and
    ``X_test`` with an ``AggClickEncoder`` built from the noisy tables (the raw
    ``hash_*`` columns are dropped afterwards), split the encoded training data 80/20
    (stratified on ``y``), train with ``train`` and score the test predictions with
    ``Nllh``.
    """
    # Work on copies so the caller's aggregate tables are not modified.
    noisy_single = df_single_agg.copy()
    noisy_double = df_double_agg.copy()

    # Noise is drawn from NumPy's global RNG; the draw order (single table before pair
    # table, "count" before "nb_clicks") determines the values under a fixed seed.
    for noisy_table in (noisy_single, noisy_double):
        for column in ("count", "nb_clicks"):
            noise = np.random.normal(0, gaussian_sigma, noisy_table.shape[0])
            noisy_table[column] = noisy_table[column] + noise

    # Feature engineering: encode, then drop the raw hashed columns.
    hashed_columns = [f"hash_{idx}" for idx in range(19)]
    fe_pipeline = make_pipeline(
        AggClickEncoder(noisy_single, noisy_double, prior_weight=prior_weight),
        ColumnsSelector(columns_to_drop=hashed_columns, validate=True),
    )
    X_encoded = fe_pipeline.fit_transform(X, y)
    X_test_encoded = fe_pipeline.transform(X_test)

    X_train, X_valid, y_train, y_valid = train_test_split(X_encoded, y, test_size=0.2, stratify=y)

    # NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes effect
    # when bagging_freq is non-zero; it is not set here -- confirm this is intended.
    lgb_params = dict(
        objective="binary",
        learning_rate=0.01,
        boosting_type="gbdt",
        random_state=42,
        feature_fraction=0.7,
        bagging_fraction=0.8,
        deterministic=True,
        force_col_wise=True,
    )

    model = train(X_train, X_valid, y_train, y_valid, lgb_params)
    return Nllh(model.predict(X_test_encoded), y_test)


In [None]:
# Baseline: a single run on the full training set with fixed noise level and prior weight.
print_and_log("Baseline")
sigma = 17
prior_weight = 10

llh_test = run(X, y, X_test, y_test, df_single_agg, df_double_agg, gaussian_sigma=sigma, prior_weight=prior_weight)
# One log line per result, formatted as ';'-terminated key:value fields.
tolog = f"gaussianSigma:{sigma};priorWeight:{prior_weight};llh_test:{llh_test};"
print_and_log(tolog)


# Running experiments

In [None]:
print_and_log("Benching noise robustness with prior weight")

# We assume that the best prior weight is a monotonically non-decreasing function of sigma.
# For a given sigma, we therefore only explore prior weights greater than or equal to
# the best weight found for the previous (lower) sigma, and we stop as soon as the
# score no longer improves.

sigmas = [0, 10, 17, 50, 250, 1_000, 5_000, 25_000, 100_000]
prior_weights = [0, 10, 20, 50, 100, 200, 500, 1000, 5000, 10000]
last_weight = 0
for sigma in sigmas:
    # Start from -inf so the first weight tried is always accepted, even when the score
    # is <= 0 (model no better than the constant baseline, plausible under heavy noise).
    # Starting from 0 would abort the search for that sigma after a single weight.
    best_llh = -np.inf
    for prior_weight in prior_weights:
        if prior_weight < last_weight:
            continue
        # Average over 5 runs: the noise and the train/validation split are random.
        metrics = [
            run(
                X,
                y,
                X_test,
                y_test,
                df_single_agg,
                df_double_agg,
                gaussian_sigma=sigma,
                prior_weight=prior_weight,
            )
            for _ in range(5)
        ]
        mean_llh = np.mean(metrics)
        tolog = f"gaussianSigma:{sigma};priorWeight:{prior_weight};llh_test:{mean_llh}+/-{np.std(metrics)};"
        print_and_log(tolog)
        if mean_llh > best_llh:
            best_llh = mean_llh
            last_weight = prior_weight
        else:
            break


In [None]:
# Measure the test score as a function of how many granular samples are available
# for training, at a fixed noise level and prior weight.
print_and_log("Benching performances as a function of number of available granular samples")

# Sizes are written as floats and cast with int() when sampling; the last entry is the
# full additional dataset. Note that the floats are logged as e.g. "100000.0".
# NOTE(review): sampling without replacement presumably fails if a size exceeds the
# number of rows of df_large_test -- confirm the dataset has at least 2e6 rows.
nb_of_samples = [1e5, 2e5, 5e5, 1e6, 2e6, df_large_test.shape[0]]
sigma = 17
prior_weight = 10

for n in nb_of_samples:
    metrics = []
    # 5 repetitions per size, each on a fresh random subsample.
    for i in range(5):
        df_tmp = df_large_test.sample(n=int(n), replace=False)
        # Features are every column except the two label columns; the target is "click".
        X_tmp, y_tmp = df_tmp.drop(columns=["click", "sale"]), df_tmp["click"]
        llh_test = run(
            X_tmp,
            y_tmp,
            X_test,
            y_test,
            df_single_agg,
            df_double_agg,
            gaussian_sigma=sigma,
            prior_weight=prior_weight,
        )
        metrics.append(llh_test)
    tolog = f"gaussianSigma:{sigma};priorWeight:{prior_weight}; nbOfSamples:{n};llh_test:{np.mean(metrics)}+/-{np.std(metrics)};"
    print_and_log(tolog)
