This notebook reads the large training set and produces aggregated data similar to the data provided in the challenge.
The aggregated data are then saved to disk so they can be used as input to the ML algorithms.
Unlike the challenge data, however, we do not add Gaussian noise to the aggregated data, so that results can be compared across different noise levels. Remember that Gaussian noise with sigma 17 should be added to obtain a dataset truly similar to the one we provided in the challenge.

In [1]:
import numpy as np
import pandas as pd


In [2]:
# Label columns that are read from the raw CSV alongside the features.
labels = ["click", "sale"]
# Names of the 19 feature columns: "hash_0" through "hash_18".
allfeatures = [f"hash_{index}" for index in range(19)]


In [3]:
def aggregate_on_features(features, mincount, filename):
    """Aggregate the raw CSV over one combination of feature columns.

    Reads only the label columns (module-level ``labels``) and the requested
    ``features`` from ``filename`` as int32, then sums the labels and a row
    counter ``c`` for every distinct combination of feature values.

    Args:
        features: List of column names to group by.
        mincount: Groups are kept only when their row count ``c`` is
            strictly greater than this value.
        filename: Path of the CSV file to read.

    Returns:
        A DataFrame with one row per retained combination: the feature
        columns, the summed label columns and the count column ``c``.
    """
    wanted_columns = labels + features
    raw = pd.read_csv(filename, usecols=wanted_columns, dtype=np.int32)
    # A constant column of ones turns the group-wise sum into a row count.
    grouped = raw.assign(c=1).groupby(features).sum().reset_index()
    # Strict comparison: a group with exactly `mincount` rows is dropped.
    frequent_enough = grouped["c"] > mincount
    return grouped[frequent_enough].copy()


In [4]:
def aggregate_on_all_pairs(
    allfeatures,
    mincount=10,
    filename="criteo-ppml-challenge-adkdd21-dataset-raw-granular-data.csv",
    gaussian_sigma=None,
):
    """Aggregate the raw data on every pair of distinct features.

    A pair ``(f0, f1)`` is processed when the integer suffix of ``f0`` (the
    text after its last underscore) is strictly smaller than the one of
    ``f1``, so each unordered pair is aggregated exactly once.

    Args:
        allfeatures: Iterable of feature column names ending in ``_<int>``.
        mincount: Threshold forwarded to ``aggregate_on_features``.
        filename: Path of the raw CSV, forwarded to ``aggregate_on_features``.
        gaussian_sigma: When not ``None``, standard deviation of the
            zero-mean Gaussian noise added independently to the ``c``,
            ``click`` and ``sale`` columns of the result.

    Returns:
        The per-pair aggregates stacked in a single DataFrame. The feature
        columns are renamed ``feature_1_value`` / ``feature_2_value`` and the
        numeric ids of the pair are stored in ``feature_1_id`` /
        ``feature_2_id``. An empty DataFrame is returned when no pair was
        aggregated.
    """
    # Parse the numeric suffix of every feature once instead of per pair.
    feature_ids = [(name, int(name.split("_")[-1])) for name in allfeatures]

    # Collect the per-pair frames and concatenate a single time at the end:
    # concatenating inside the loop copies the accumulated data again for
    # every pair, and seeding the accumulation with an empty DataFrame relies
    # on a dtype-inference behavior that pandas has deprecated.
    frames = []
    for f0, feature_1_id in feature_ids:
        for f1, feature_2_id in feature_ids:
            if feature_1_id >= feature_2_id:
                continue
            print("aggregating on", f0, f1)
            df = aggregate_on_features([f0, f1], mincount, filename)
            df["feature_1_id"] = feature_1_id
            df["feature_2_id"] = feature_2_id
            df = df.rename(
                columns={f0: "feature_1_value", f1: "feature_2_value"}
            )
            frames.append(df)

    if not frames:
        # Fewer than two distinct feature ids: nothing to aggregate.
        return pd.DataFrame()

    # No ignore_index: each frame keeps its own row labels, as before.
    allpairsdf = pd.concat(frames)
    if gaussian_sigma is not None:
        # One independent noise vector per column, drawn in this fixed order.
        for column in ("c", "click", "sale"):
            allpairsdf[column] += np.random.normal(
                0, gaussian_sigma, len(allpairsdf)
            )
    return allpairsdf


In [5]:
## This may take several hours:
# - there are 19*18/2 = 171 pairs of features to process,
# - each of them requires reading the full csv and runs in a few minutes
# Memory requirement is about 10 GB (mostly from pandas overhead)
allpairsdf = aggregate_on_all_pairs(allfeatures)


aggregating on hash_0 hash_1
aggregating on hash_0 hash_2
aggregating on hash_0 hash_3
aggregating on hash_0 hash_4
aggregating on hash_0 hash_5
aggregating on hash_0 hash_6
aggregating on hash_0 hash_7
aggregating on hash_0 hash_8
aggregating on hash_0 hash_9
aggregating on hash_0 hash_10
aggregating on hash_0 hash_11
aggregating on hash_0 hash_12
aggregating on hash_0 hash_13
aggregating on hash_0 hash_14
aggregating on hash_0 hash_15
aggregating on hash_0 hash_16
aggregating on hash_0 hash_17
aggregating on hash_0 hash_18
aggregating on hash_1 hash_2
aggregating on hash_1 hash_3
aggregating on hash_1 hash_4
aggregating on hash_1 hash_5
aggregating on hash_1 hash_6
aggregating on hash_1 hash_7
aggregating on hash_1 hash_8
aggregating on hash_1 hash_9
aggregating on hash_1 hash_10
aggregating on hash_1 hash_11
aggregating on hash_1 hash_12
aggregating on hash_1 hash_13
aggregating on hash_1 hash_14
aggregating on hash_1 hash_15
aggregating on hash_1 hash_16
aggregating on hash_1 hash_

In [6]:
# Save the pair aggregates to disk; index=True also writes the DataFrame's
# row index as the first column of the CSV.
allpairsdf.to_csv("aggregated_pairs.csv", index=True)


In [9]:
def aggregate_on_all_single(
    allfeatures, mincount=0, filename="criteo-ppml-challenge-adkdd21-dataset-raw-granular-data.csv", gaussianSigma=None
):
    """Aggregate the raw data on every single feature.

    Args:
        allfeatures: Iterable of feature column names ending in ``_<int>``.
        mincount: Threshold forwarded to ``aggregate_on_features``.
        filename: Path of the raw CSV, forwarded to ``aggregate_on_features``.
        gaussianSigma: When not ``None``, standard deviation of the zero-mean
            Gaussian noise added independently to the ``c``, ``click`` and
            ``sale`` columns of the result. (The camelCase spelling is kept
            so existing keyword callers continue to work.)

    Returns:
        The per-feature aggregates stacked in a single DataFrame. The feature
        column is renamed ``feature_1_value`` and the numeric suffix of the
        feature name is stored in ``feature_1_id``. An empty DataFrame is
        returned when ``allfeatures`` is empty.
    """
    # Collect the per-feature frames and concatenate a single time at the
    # end: concatenating inside the loop copies the accumulated data again
    # for every feature, and seeding the accumulation with an empty DataFrame
    # relies on a dtype-inference behavior that pandas has deprecated.
    frames = []
    for f0 in allfeatures:
        print("aggregating on", f0)

        df = aggregate_on_features([f0], mincount, filename)
        df["feature_1_id"] = int(f0.split("_")[-1])
        frames.append(df.rename(columns={f0: "feature_1_value"}))

    if not frames:
        # No feature was given: nothing to aggregate.
        return pd.DataFrame()

    # No ignore_index: each frame keeps its own row labels, as before.
    allsinglesdf = pd.concat(frames)
    if gaussianSigma is not None:
        # One independent noise vector per column, drawn in this fixed order.
        for column in ("c", "click", "sale"):
            allsinglesdf[column] += np.random.normal(0, gaussianSigma, len(allsinglesdf))
    return allsinglesdf


In [10]:
# Aggregate on each feature individually, using the function's defaults
# (mincount=0, no Gaussian noise).
allsingles = aggregate_on_all_single(allfeatures)


aggregating on hash_0
aggregating on hash_1
aggregating on hash_2
aggregating on hash_3
aggregating on hash_4
aggregating on hash_5
aggregating on hash_6
aggregating on hash_7
aggregating on hash_8
aggregating on hash_9
aggregating on hash_10
aggregating on hash_11
aggregating on hash_12
aggregating on hash_13
aggregating on hash_14
aggregating on hash_15
aggregating on hash_16
aggregating on hash_17
aggregating on hash_18


In [11]:
# Save the single-feature aggregates to disk; index=True also writes the
# DataFrame's row index as the first column of the CSV.
allsingles.to_csv("aggregated_singles.csv", index=True)
