This notebook preprocesses the aggregated data and the small granular sets, to prepare for training a logistic regression with granular data and aggregated labels, following the method proposed by the challenge winners.
Here we one-hot encode, in a common space, the modalities of each feature and the pairs of modalities of each pair of features.
Following the notations of the paper, we thus specify the function x -> K(x)
We apply this encoding to:
- both aggregated data (singles and pairs) files, to get the vectors D, C, and S of aggregated display counts, clicks and sales respectively. We thus get D := Sum_{large train set} K(x),
  C := Sum_{large train set} K(x) * Click and S := Sum_{large train set} K(x) * Sale
- the 'small' test set and the 'large' test set: for each line (x, y) of those files, we compute the list of components of K(x) that are equal to 1
- we save the resulting encoded data to disk.

This encoding step is a bit slow (several hours). Note that the training after that is extremely fast (a few minutes to train a model).


In [None]:
# Common libs to import everywhere
import gc
import numpy as np


# Loading data

In [None]:
!cat '../data/aggregated_singles.csv' | head -2

In [None]:
# Column indices into Xy_agg_data_singles (the aggregated "singles" table once
# its first CSV column has been dropped at load time).
(
    COL_singles_mod,
    COL_singles_clicks,
    COL_singles_sales,
    COL_singles_counts,
    COL_singles_fid,
) = range(5)


In [None]:
# Aggregated data for single features: skip the CSV header row and drop the
# first column, so that the COL_singles_* indices address the remaining ones.
Xy_agg_data_singles = np.loadtxt(
    "../data/aggregated_singles.csv",
    dtype=np.int32,
    delimiter=",",
    skiprows=1,
)[:, 1:]


In [None]:
!cat '../data/aggregated_pairs.csv' | head -2

In [None]:
# Column indices into Xy_agg_data_full (the aggregated "pairs" table once its
# first CSV column has been dropped at load time).
(
    COL_pairs_mod1,
    COL_pairs_mod2,
    COL_pairs_clicks,
    COL_pairs_sales,
    COL_pairs_counts,
    COL_pairs_fid1,
    COL_pairs_fid2,
) = range(7)


In [None]:
# Aggregated data for pairs of features: skip the CSV header row and drop the
# first column, so that the COL_pairs_* indices address the remaining ones.
Xy_agg_data_full = np.loadtxt(
    "../data/aggregated_pairs.csv",
    dtype=np.int32,
    delimiter=",",
    skiprows=1,
)[:, 1:]
Xy_agg_data_full.shape


In [None]:
!cat '../data/X_test.csv' | head -2

In [None]:
# Granular, unlabeled samples of the test set (almost 1M rows).
# The CSV header row is skipped and every column is kept.

X_test = np.loadtxt(
    "../data/X_test.csv",
    dtype=np.int32,
    delimiter=",",
    skiprows=1,
)
X_test.shape


In [None]:
# A second granular set, used to compare performances.
# Only the first 19 columns of the file are kept here.

X_another_set = np.loadtxt(
    "../data/criteo-ppml-challenge-adkdd21-dataset-additional-test-data.csv",
    dtype=np.int32,
    delimiter=",",
    skiprows=1,
)[:, :19]
X_another_set.shape


In [None]:
# Labels  (Used for computing validation score, not used for training!)

# Each CSV is parsed once and the two label columns are split afterwards,
# instead of re-reading the whole file for every column.
# ndmin=2 keeps a (rows, 2) shape even for a single-row file; .copy() gives
# independent, contiguous 1-D arrays and lets the 2-D buffer be released.
_labels_test = np.loadtxt(
    "../data/y_test.csv", skiprows=1, delimiter=",", dtype=np.int32, usecols=(0, 1), ndmin=2
)
Y_clicks_test = _labels_test[:, 0].copy()
Y_sales_test = _labels_test[:, 1].copy()
del _labels_test

# In the additional set, columns 19 and 20 hold the click and sale labels.
_labels_another_set = np.loadtxt(
    "../data/criteo-ppml-challenge-adkdd21-dataset-additional-test-data.csv",
    skiprows=1,
    delimiter=",",
    dtype=np.int32,
    usecols=(19, 20),
    ndmin=2,
)
Y_clicks_another_set = _labels_another_set[:, 0].copy()
Y_sales_another_set = _labels_another_set[:, 1].copy()
del _labels_another_set


In [None]:
## to run faster -> remove crosses with small volume
# minDisplays = 1000
# ind = np.where(Xy_agg_data_full[:,COL_pairs_counts] > minDisplays )[0]
# print( "Proportion kept crosses"  ,  len(ind)/Xy_agg_data_full.shape[0] )

# Xy_agg_data_full = Xy_agg_data_full[ind,:]


In [None]:
# Run a full garbage-collection pass before the encoding cells
# (presumably to release temporaries left over by the loading cells).
gc.collect()


# One-hot encoding single features

In [None]:
# Feature ids are hard-coded as 0..18; they could instead be derived from the data:
# features =sorted(set( Xy_agg_data_singles[:,COL_singles_fid] ))
features = np.arange(19)
features


In [None]:
## One-hot encoding all modalities
#  Every modality of every feature receives its own id, starting from 1
#  (the first feature gets ids 1, 2, ..., n; the second one n+1, n+2, ...).
#  Id 0 is never assigned here: it is kept for the 'unknown' modality.
modalities_per_feature = {}
offset = 1
for f in features:
    rows_of_feature = Xy_agg_data_singles[:, COL_singles_fid] == f
    distinct_modalities = sorted(set(Xy_agg_data_singles[rows_of_feature, COL_singles_mod]))
    first_free_id = offset
    offset = first_free_id + len(distinct_modalities)
    # Sorted modalities are mapped to the consecutive ids first_free_id, ..., offset - 1.
    modalities_per_feature[f] = dict(zip(distinct_modalities, range(first_free_id, offset)))


In [None]:
# Number of encoded modalities for each feature, in insertion order of the features.
np.array([len(encoding) for encoding in modalities_per_feature.values()])


In [None]:
def encode_feature(modalities, dico):
    """Translate each modality into its encoded id.

    Args:
        modalities: Iterable of raw modality values.
        dico: Mapping from raw modality value to encoded id.

    Returns:
        A NumPy array with one id per input value; values missing from
        ``dico`` are encoded as 0.
    """
    lookup = dico.get
    encoded = [lookup(modality, 0) for modality in modalities]
    return np.array(encoded)


In [None]:
# Replace, in place, the raw modalities of both granular sets by their encoded ids
# (0 when a modality has no entry in the encoding of its feature).
for f in features:
    encoding_of_f = modalities_per_feature[f]
    for granular_set in (X_test, X_another_set):
        granular_set[:, f] = encode_feature(granular_set[:, f], encoding_of_f)


In [None]:
## Replacing features by one-hot index in Xy_agg_data_singles
# Each row is looked up in the encoding of its own feature id; 0 means 'unknown'.

singles_fids = Xy_agg_data_singles[:, COL_singles_fid]
singles_mods = Xy_agg_data_singles[:, COL_singles_mod]
x = np.array([modalities_per_feature[fid].get(mod, 0) for fid, mod in zip(singles_fids, singles_mods)])
Xy_agg_data_singles[:, COL_singles_mod] = x


In [None]:
## Replacing features by one-hot index in Xy_agg_data_full
# The same treatment is applied to both members of the pair: each modality
# column is looked up in the encoding of the feature id stored alongside it
# (0 means 'unknown').

for mod_column, fid_column in (
    (COL_pairs_mod1, COL_pairs_fid1),
    (COL_pairs_mod2, COL_pairs_fid2),
):
    fids_and_mods = zip(Xy_agg_data_full[:, fid_column], Xy_agg_data_full[:, mod_column])
    # The list is fully built before the column is overwritten.
    encoded_column = np.array([modalities_per_feature[fid].get(mod, 0) for fid, mod in fids_and_mods])
    Xy_agg_data_full[:, mod_column] = encoded_column


# One-hot encoding all pairs

In [None]:
# Sorted list of the distinct (feature id 1, feature id 2) couples found in the aggregated pairs data.
fid1_values = Xy_agg_data_full[:, COL_pairs_fid1]
fid2_values = Xy_agg_data_full[:, COL_pairs_fid2]
pairs = sorted(set(zip(fid1_values, fid2_values)))
len(pairs)


In [None]:
def encode_cf(x, f, f2, dico):
    """Encode the pair of columns ``f`` and ``f2`` of every row of ``x``.

    Args:
        x: 2-D array (or array-like) with one sample per row.
        f: Index of the first column of the pair.
        f2: Index of the second column of the pair.
        dico: Mapping from ``(value in column f, value in column f2)`` to the
            encoded id of that pair of values.

    Returns:
        A NumPy array with one id per row of ``x``; pairs of values missing
        from ``dico`` are encoded as 0.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to encode (also covers an empty list, which is not 2-D).
        return np.array([])
    # Walk the two columns in lockstep instead of iterating over the rows:
    # row iteration builds one array view per sample, which dominates the
    # running time on large sets. ``tolist()`` yields plain Python scalars,
    # which hash and compare equal to the matching NumPy scalars, so the
    # lookups hit the same ``dico`` entries either way.
    keys = zip(x[:, f].tolist(), x[:, f2].tolist())
    return np.array([dico.get(key, 0) for key in keys])


def appended_encoded_cf(x, f, f2, dico):
    """Return a copy of ``x`` with the encoded (``f``, ``f2``) pair as an extra last column.

    Args:
        x: Array with one sample per row.
        f: Index of the first column of the pair.
        f2: Index of the second column of the pair.
        dico: Mapping from pairs of values to encoded ids, as used by ``encode_cf``.

    Returns:
        A new array made of ``x`` followed by one column of encoded ids.
    """
    encoded_pair = encode_cf(x, f, f2, dico)
    return np.column_stack((x, encoded_pair))


In [None]:
# Column of Xy_agg_data_full that receives the encoded id of each pair of modalities.
COL_pairs_encodedpair = 0  ## reusing this column (same index as COL_pairs_mod1)


In [None]:
# Ids of pairs of modalities continue right after the ids of single modalities.
offset_pairs = offset

for pair in pairs:
    print(pair, "     ", end="\r")
    f, f2 = pair

    # Rows of the aggregated pairs data that belong to this pair of features.
    belongs_to_pair = (Xy_agg_data_full[:, COL_pairs_fid1] == f) & (Xy_agg_data_full[:, COL_pairs_fid2] == f2)
    ind = np.where(belongs_to_pair)[0]
    x = Xy_agg_data_full[ind, :]  # fancy indexing: a copy, unaffected by the write below

    # One new id per distinct (modality 1, modality 2) couple, in sorted order.
    row_keys = [(a[COL_pairs_mod1], a[COL_pairs_mod2]) for a in x]
    modalities_pairs = sorted(set(row_keys))
    dico = dict(zip(modalities_pairs, range(offset_pairs, offset_pairs + len(modalities_pairs))))
    offset_pairs += len(dico)

    # writing encoded pair in first column
    Xy_agg_data_full[ind, COL_pairs_encodedpair] = np.array([dico.get(key, 0) for key in row_keys])

    # Each granular set gains one extra column holding the id of this pair.
    X_test = appended_encoded_cf(X_test, f, f2, dico)
    X_another_set = appended_encoded_cf(X_another_set, f, f2, dico)

    gc.collect()


# Vectors of aggregated data

In [None]:
# D, C and S are indexed by encoded id (single modalities and pairs of
# modalities share one id space of size offset_pairs) and receive the
# aggregated display counts, clicks and sales respectively.
D = np.zeros(offset_pairs)
C = np.zeros(offset_pairs)
S = np.zeros(offset_pairs)

# np.add.at accumulates every row, even when the same id shows up in several
# rows (e.g. rows that fell back to the 0 'unknown' id). The buffered form
# `v[idx] += w` would keep a single contribution per repeated id. When all
# ids are distinct both forms give the same result.
np.add.at(D, Xy_agg_data_full[:, COL_pairs_encodedpair], Xy_agg_data_full[:, COL_pairs_counts])
np.add.at(D, Xy_agg_data_singles[:, COL_singles_mod], Xy_agg_data_singles[:, COL_singles_counts])


np.add.at(S, Xy_agg_data_full[:, COL_pairs_encodedpair], Xy_agg_data_full[:, COL_pairs_sales])
np.add.at(S, Xy_agg_data_singles[:, COL_singles_mod], Xy_agg_data_singles[:, COL_singles_sales])

np.add.at(C, Xy_agg_data_full[:, COL_pairs_encodedpair], Xy_agg_data_full[:, COL_pairs_clicks])
np.add.at(C, Xy_agg_data_singles[:, COL_singles_mod], Xy_agg_data_singles[:, COL_singles_clicks])


# Save / Load
- one-hot encoding is slow

In [None]:
# Total of the counts column over the rows of feature 0 in the aggregated singles data.
is_feature_zero = Xy_agg_data_singles[:, COL_singles_fid] == 0
nb_samples_agg = Xy_agg_data_singles[is_feature_zero, COL_singles_counts].sum()


In [None]:
# Persist everything produced by the encoding in one archive.
# Plain string: the original f-string had no placeholders. np.savez appends
# the ".npz" extension when the file name does not already carry it.
name = "../data/encodedAggData_C_D_S_X_Xbis_n_pairs"
np.savez(
    name,
    C=C,
    D=D,
    S=S,
    X_test=X_test,
    X_another_set=X_another_set,
    nb_samples_agg=nb_samples_agg,
    pairs=pairs,
)
