In [4]:
%load_ext autoreload
%autoreload 2
from aggregated_models.myimports  import *

plt.style.use('ggplot')

from aggregated_models.aggdataset import AggDataset
from aggregated_models.basicmodels import NaiveBayesModel, LogisticModelWithCF
from aggregated_models.validation import MetricsComputer
from aggregated_models.aggLogistic import AggLogistic
from aggregated_models.agg_mrf_model import AggMRFModel, AggMRFModelParams
from  aggregated_models import loaddata

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Metrics helper constructed with "click"; its run(model, df) is called after
# each training below and its result is printed as NLLH / NMSE.
Validation = MetricsComputer("click")

## Download Data
- downloading criteo-research-attribution-dataset
- from url http://go.criteo.net

In [6]:
# Fetch the raw dataset (the cell output reports a download followed by an unzip).
loaddata.download_dataset()

downloading dataset
unzipping


## Loading data
3 versions of the dataset are used for experiments: "small" , "sampled" and "full"
- "full" has 11 features with about 16M samples
- "sampled" has the same 11 features, but only 160k samples
- "small" also has 160k samples, but only the 5 features with the lowest modality counts, which allows for fast experiments.

In [8]:
# Dataset variant to load. Later cells branch on this value to pick the
# pre-selected regularization settings. Uncomment exactly one line.
dataset= "small" # fast experiments
# dataset= "sampled" # Training an MRF may require 5h and 16 GB
# dataset= "full"  # Training an MRF may require 32 GB, and several days

In [9]:
# Load the chosen variant: training frame, validation frame, list of feature
# column names, and the label column name.
train, valid, features, label = loaddata.getDataset(dataset)

Sampling ratio :0.01
Nb train samples: 115382 , Nb valid samples: 49770  
features:['cat1', 'cat4', 'cat6', 'cat8', 'cat9']


In [34]:
# Peek at two randomly drawn training rows (no random_state is passed, so the
# rows shown can differ from one run to the next).
train.sample(2)

Unnamed: 0,cat1,cat4,cat6,cat8,cat9,click,display
15769950,30763035,29196072,29196072,29196072,29196072,1,1
6744972,30763035,29196072,1973606,5824233,29196072,0,1


In [35]:
# Report the number of distinct values ("modalities") of every feature column.
# Series.nunique is vectorised, whereas building a Python set from .values
# touches every row in the interpreter (slow on the 16M-row "full" dataset) and
# would count each NaN as its own modality. dropna=False keeps a missing value
# counted once as a modality instead of dropping it.
for f in features:
    nbModalities = train[f].nunique(dropna=False)
    print( f"feature {f} has {nbModalities} distinct modalities" )

feature cat1 has 9 distinct modalities
feature cat4 has 14 distinct modalities
feature cat6 has 30 distinct modalities
feature cat8 has 11 distinct modalities
feature cat9 has 30 distinct modalities


## Preparing Aggregated data

- aggdata contains projections of the number of displays and clicks along each pair of features
- noise may also be added to make it differentially private
- the goal is to learn a model predicting Proba( label | features) using *only* those aggdata.

In [36]:
# Parameters of the privacy-protecting noise; both are passed to AggDataset below.
epsilon = None  # Set to None to get no noise.
delta = None 

In [37]:
# Aggregate the training frame for the "*&*" query spec using the noise
# settings chosen above (the resulting queries are listed a few cells below).
aggdata = AggDataset( train , features, "*&*",  label, epsilon, delta )

In [47]:
# Inspect the click counts stored for the 'cat1&cat4' query.
# NOTE(review): this displays the whole array; consider slicing it before sharing.
aggdata.aggClicks[ 'cat1&cat4' ].Data

array([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       4.2000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       1.0000e+00, 0.0000e+00, 2.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       1.6700e+02, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       2.0000e+01, 1.6000e+01, 2.7000e+01, 3.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       6.1000e+01, 5.2000e+01, 8.1000e+01, 2.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       2.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       5.0900e+02, 9.1000e+01, 1.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e

In [14]:
# Summary of the aggregated dataset: how many queries it holds and which noise
# distribution (if any) was applied when building it.
print( f" Nb Queries: {len(aggdata.aggDisplays)}")
print( f" Noise distribution: {aggdata.noiseDistribution}" )

 Nb Queries: 15
 Noise distribution: None


In [19]:
# aggdata.aggDisplays can be iterated like a dictionary keyed by query name;
# materialise those names (single features and feature pairs) as a list.
queries = list(aggdata.aggDisplays)
print( f"list of queries {queries}" )

list of queries ['cat1', 'cat4', 'cat6', 'cat8', 'cat9', 'cat1&cat4', 'cat1&cat6', 'cat1&cat8', 'cat1&cat9', 'cat4&cat6', 'cat4&cat8', 'cat4&cat9', 'cat6&cat8', 'cat6&cat9', 'cat8&cat9']


# Comparing models on noiseless data

In [24]:
# Rebuild the aggregated data with epsilon and delta explicitly set to None,
# so the model comparison below does not depend on the noise cell above.
aggdata = AggDataset( train , features, "*&*",  label, None, None )

In [48]:
# Set to True to run the full benchmarks over the regularization parameter.
# Keep it False to train only with the pre-selected value.
runBenchmarks = False

##### logistic Regression
- Using full log instead of aggdata. 
- Training with all "cross-features" found in the aggregated data (i.e. a quadratic kernel)
- We do not expect to do better, the goal is to get similar performances

In [26]:
## regulL2 was benchmarked to get the best validation LLH.
# Full grid in benchmark mode, otherwise the single value retained per dataset.
if runBenchmarks:
    regulL2s = [4, 8, 16, 32, 64, 128, 256, 512]
else:
    regulL2s = {"small": [16], "sampled": [128]}.get(dataset, [64])

for regulL2 in regulL2s:
    # Logistic regression on the full log with the "*&*" cross-features.
    logisticCfs = LogisticModelWithCF(
        "click", features, "*&*", train, hashspace=2**22, lambdaL2=regulL2
    )
    logisticCfs.fit(train)
    trainMetrics = Validation.run(logisticCfs, train)
    validMetrics = Validation.run(logisticCfs, valid)
    print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  trainMetrics , "valid" , validMetrics   )

Logistic(*&*), l2:16 train NLLH=0.0600, NMSE=0.0772   valid NLLH=0.0560, NMSE=0.0724  


##### logistic Regression from aggregated clicks and full display data (quadratic kernel)
 - same performance as "standard" logistic regression
 - but still using the full display data, so not really useful

In [None]:
## regulL2 was benchmarked to get the best validation LLH.
# Full grid in benchmark mode, otherwise the single value retained per dataset.
if runBenchmarks:
    regulL2s = [4, 8, 16, 32, 64, 128, 256, 512]
else:
    regulL2s = {"small": [16], "sampled": [128]}.get(dataset, [64])

for regulL2 in regulL2s:
    # Clicks come from the aggregated data; fit() still receives the raw feature columns.
    logisticCfs = AggLogistic(aggdata, features, clicksCfs="*&*", regulL2=regulL2)
    logisticCfs.fit(train[features], nbIter=200)
    trainMetrics = Validation.run(logisticCfs, train)
    validMetrics = Validation.run(logisticCfs, valid)
    print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  trainMetrics , "valid" , validMetrics   )
    

##### logistic Regression without quadratic kernel
 - still a solid baseline, but significantly weaker than logistic regression with a quadratic kernel

In [31]:
# Logistic regression using clicks aggregated on single features only
# (clicksCfs="*"), i.e. without cross-features.
# Full grid in benchmark mode, otherwise the single value retained per dataset.
regulL2s = [0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [1] if dataset == "small" else [8]

for regulL2 in regulL2s:
    logisticCfs = AggLogistic(  aggdata , features, clicksCfs = "*" , regulL2=regulL2 )
    logisticCfs.fit( train[features] , nbIter = 200 )
    # The label names the spec actually trained ("*"); it used to read "*&*",
    # which made these results indistinguishable from the quadratic-kernel runs.
    print( f"Logistic(*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )
    

Logistic(*&*), l2:1 train NLLH=0.0536, NMSE=0.0692   valid NLLH=0.0539, NMSE=0.0699  


##### Proposed MRF model
- uses only aggregated data
- almost recovers the logistic regression performance

In [49]:
# Settings shared by the MRF experiments below.
# regulL2s: candidate regularization values; outside benchmark mode only the
# value retained for the current dataset is kept.
regulL2s = [ 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    # Every branch must yield a list: the cells below iterate over regulL2s,
    # and a bare 512 for the "full" dataset would not be iterable.
    regulL2s = [16] if dataset == "small" else [128] if dataset == "sampled" else [512]
# nbSamples / nbIter: sample count passed to the MRF parameters and number of
# training iterations passed to fit().
nbSamples = 50000
nbIter = 200
if dataset == "full": # requires more samples and more training iterations to converge.
    print( "Warning: training one model with these settings may take about 1 week" )
    nbIter = 1500
    nbSamples = 400000

In [51]:
# Regularization value used by the step-by-step MRF cells that follow
# (it is passed as regulL2Click when building the parameters).
regulL2 = 16


In [52]:


    # Parameters of the MRF model trained from aggregated data only.
    # clicksCfs / displaysCfs select which aggregated queries are used for the
    # clicks and for the displays ("*&*" here for both).
    # regulL2Click receives the benchmarked value (printed as "lambda1" below);
    # regulL2 is held at 1.0.
    # NOTE(review): presumably regulL2Click regularizes P(Y|X) and regulL2
    # regularizes P(X) -- confirm against AggMRFModelParams.
    params = AggMRFModelParams(
        features=features,
        exactComputation=False ,
        clicksCfs = "*&*",
        displaysCfs="*&*",
        nbSamples = nbSamples,
        regulL2=1.0,
        regulL2Click = regulL2
    )


In [53]:
    # Build the MRF model from the aggregated data and the parameters above.
    # The notebook-level name `self` is reused for every model trained below.
    self = AggMRFModel(aggdata, params)

In [61]:
# Inspect the sample array currently held by the model.
self.samples.data

array([[ 5,  5,  8, ...,  7,  0,  5],
       [10, 10, 12, ..., 10, 12, 12],
       [23,  1, 19, ..., 23,  0, 23],
       [ 8,  8,  4, ...,  8,  0,  8],
       [25, 25,  7, ..., 25, 17, 25]])

In [54]:
        # NOTE(review): debugging leftover. When run, this cell raised
        # "NameError: name 'Optimizers' is not defined", and `alpha` is not
        # assigned in any visible cell either. The next cell trains the model
        # through self.fit(nbIter) instead; import Optimizers and define alpha,
        # or remove this cell, so that Restart & Run All succeeds.
        Optimizers.simpleGradientStep(
            self,
            nbiter=nbIter,
            alpha=alpha,
            endIterCallback=lambda: self.updateAllSamplesWithGibbs(),
        )

NameError: name 'Optimizers' is not defined

In [30]:

    # Train the MRF for nbIter iterations, then report metrics on the training
    # and validation frames.
    self.fit(nbIter)
    print( f"MRF lambda1= {regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

MRF lambda1= 16 train NLLH=0.0574, NMSE=0.0741   valid NLLH=0.0546, NMSE=0.0708  


#### Naive Bayes
- rather weak baseline
- would require only examples and labels counts aggregated on each single feature
- requires very strong L2 regularization to get Okish results when the number of features grows

In [None]:
## Implementation:  one classifier (logistic regresion) per feature.
regulL2s = [ 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [1] if dataset == "small" else [256]

for regulL2 in regulL2s:
    self = NaiveBayesModel( label, features , regulL2)
    self.fit(train)
    print( f"NaiveBayes, regulL={regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

In [None]:
## Implementation:  MRF using only aggregated data on single features
regulL2s = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [1] if dataset == "small" else [256] 

for regulL2 in regulL2s:
    print( f"nbSamples:{nbSamples} , nbIter:{nbIter}, lambda_1:{regulL2} " )
    params = AggMRFModelParams(
        exactComputation=False,
        clicksCfs = "*",
        displaysCfs="*",
        nbSamples = 50000,
        regulL2=1.0,
        regulL2Click = regulL2
    )
    self = AggMRFModel(aggdata, features, params)
    self.fit(200)
    print( f"NaiveBayes,(MRF implem) lambda1= {regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

#### MRF without "cross-features" on the P(Y|X) part of the model
Should be compared to the "simple" logistic regression with no kernel. Requires:
- count of clicks aggregated on each feature,
- count of displays aggregated on each pair of features

In [None]:
# MRF whose click part uses single-feature aggregates ("*") while the display
# part uses pairwise aggregates ("*&*").
# Full grid in benchmark mode, otherwise the single value retained per dataset.
regulL2s = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [4] if dataset == "small" else [64]

for regulL2 in regulL2s:
    print( f"nbSamples:{nbSamples} , nbIter:{nbIter}, lambda_1:{regulL2} " )
    # Same construction pattern as the executed MRF cells above: the feature
    # list goes into the parameters and the model takes (aggdata, params).
    # nbSamples / nbIter are the notebook-level settings, so the values printed
    # in the header line are the ones actually used for training.
    params = AggMRFModelParams(
        features=features,
        exactComputation=False,
        clicksCfs = "*",
        displaysCfs="*&*",
        nbSamples = nbSamples,
        regulL2=1.0,
        regulL2Click = regulL2
    )
    self = AggMRFModel(aggdata, params)
    self.fit(nbIter)
    print( f"MRF_simple lambda1= {regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

# Other results on noiseless data

## Benching MRF regularization

The MRF model has 2 distinct regularization parameters:
- one controlling the smoothness of P(Y|X). It should be set to roughly the same value as the regularization parameter of a logistic regression with the same features.
- one controlling the smoothness of P(X). This parameter should be kept at a low value.



In [None]:
# Grid over both regularization parameters: lambda1 is passed as regulL2Click,
# lambda2 as regulL2.
regulL2s = [ 0.25, 1, 4, 16, 64, 256]
lambdas = [ (l1,l2) for l1 in regulL2s for l2 in regulL2s  ]

if not runBenchmarks or dataset != "small":
    # running only with lambda_1 set to the value giving best results when lambda_2=1,
    #  and setting lambda_2 to lambda_1.
    lambdas = [(16,16)] if dataset == "small" else [(128,128)]

for lambda1,lambda2 in lambdas:
    # Same construction pattern as the executed MRF cells above: the feature
    # list goes into the parameters and the model takes (aggdata, params).
    params = AggMRFModelParams(
        features=features,
        exactComputation=False,
        clicksCfs = "*&*",
        displaysCfs="*&*",
        nbSamples = 50000,
        regulL2=lambda2,
        regulL2Click = lambda1
    )
    self = AggMRFModel(aggdata, params)
    self.fit(200)
    print( f"MRF l1= {lambda1} l2= {lambda2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

## Sampling Y in the MC estimate of the gradient
- this increases variance significantly, at least when the dataset is strongly imbalanced


In [None]:
def get_unbalanced_df(df , label_sampling_rate , label ):
    """Return a randomly thinned copy of `df`, thinning according to the label.

    One uniform draw u in [0, 1) is taken per row from NumPy's global random
    state, and a row with label value y is kept when
    ``u - y > -label_sampling_rate``. For y == 1 that happens with probability
    `label_sampling_rate`; for y == 0 it holds whenever the rate is positive.

    Args:
        df: input DataFrame containing the column `label`.
        label_sampling_rate: threshold applied as described above.
        label: name of the label column (assumed to hold 0/1 values).

    Returns:
        A new DataFrame with the kept rows. The index is reset, so the
        previous index is preserved as an extra "index" column.
    """
    label_values = df[label].values
    negated = -label_values
    draws = np.random.rand(len(label_values))
    keep_mask = (negated + draws) > -label_sampling_rate
    kept_rows = df[keep_mask]
    return kept_rows.reset_index().copy()

In [None]:
# Build thinned versions of the training and validation frames with
# get_unbalanced_df, then aggregate the thinned training frame.
label_sampling_rate = 0.02
train_unbalanced = get_unbalanced_df( train,  label_sampling_rate, label  )
valid_unbalanced = get_unbalanced_df( valid,  label_sampling_rate, label  )
# Argument order matches every other AggDataset call in this notebook:
# (dataframe, features, query spec, label, epsilon, delta). No noise is added.
aggdata_unbalanced = AggDataset( train_unbalanced , features, "*&*",  label, None, None )

# Fraction of label==1 rows retained, then fraction of label==0 rows retained.
print(train_unbalanced[label].sum() / train[label].sum())
print((1-train_unbalanced[label]).sum() /( 1-train[label]).sum())

In [None]:
# Compare the two Gibbs variants on the unbalanced data for several sample counts.
regulL2 = 16.0


def _unbalancedBenchParams(nbSamples):
    """Return a fresh AggMRFModelParams for the unbalanced-data benchmark.

    Follows the construction pattern of the executed MRF cells above: the
    feature list goes into the parameters. A new object is built on every call
    so the two models trained per sample count never share a parameter instance.
    """
    return AggMRFModelParams(
        features=features,
        exactComputation=False,
        clicksCfs = "*&*",
        displaysCfs="*&*",
        nbSamples = nbSamples,
        regulL2=1.0,
        regulL2Click = regulL2
    )


for nbSamples in [100,1000,10000]:
    print( f"nbSamples:{nbSamples} " )
    # Variant 1: model used as constructed (reported as "collapsed").
    params = _unbalancedBenchParams(nbSamples)
    self = AggMRFModel(aggdata_unbalanced, params)
    self.fit(100)
    print( f"MRF(collapsed),nbSamples= {nbSamples}",  "train",   Validation.run(self,train_unbalanced) , "valid" , Validation.run(self,valid_unbalanced)   )
    # Variant 2: same settings with decollapseGibbs switched on before fitting
    # (reported as "Sampling Y").
    params = _unbalancedBenchParams(nbSamples)
    self = AggMRFModel(aggdata_unbalanced, params)
    self.decollapseGibbs = True
    self.fit(100)
    print( f"MRF(Sampling Y),nbSamples= {nbSamples}",  "train",   Validation.run(self,train_unbalanced) , "valid" , Validation.run(self,valid_unbalanced)   )
print("")

# Learning differentially private models

In [None]:
# Grids for the differential-privacy experiments: noise parameters
# (epsilon, delta) and regularization values to try for each of them.
epsilons = [10.0 , 1.0 , 0.1]
deltas = [ None ,1e-7, 1e-4 ]
regulL2s = [4.0, 16, 64, 256, 1024 ]


# Outside benchmark mode, shrink every grid to a quick subset. The flag is
# tested with `not`, like every other cell of this notebook, rather than
# compared to False.
if not runBenchmarks:
    epsilons = [1.0]
    deltas = [ None ,1e-7 ]
    regulL2s = [ 16, 64 ]


In [None]:
# MRF trained on noisy aggregated data WITHOUT telling the model about the
# noise (no noiseDistribution is passed to the parameters).
for epsilon in epsilons:
    for delta in deltas:
        print("")
        ## seeding to ensure each algo will run on the same dataset.
        # In the article, this was not seeded, but both presented models were trained on the same instance of aggdata
        np.random.seed(0)
        aggdata = AggDataset( train , features, "*&*",  label, epsilon, delta )
        print("")
        for regulL2 in regulL2s:
            # The parameters are built inside the loop so that regulL2Click is
            # the value of the current iteration. Built before the loop, they
            # captured a stale regulL2 left over from an earlier cell and every
            # run trained with that same value, whatever the print reported.
            # Same construction pattern as the executed MRF cells above: the
            # feature list goes into the parameters and the model takes
            # (aggdata, params).
            params = AggMRFModelParams(
                features=features,
                exactComputation=False,
                clicksCfs = "*&*",
                displaysCfs="*&*",
                nbSamples = 50000,
                regulL2=1.0,
                regulL2Click = regulL2
            )
            self = AggMRFModel(aggdata, params)
            self.fit(200)
            print( f"MRF(no noise model) l1={regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )
        

In [None]:
# MRF trained on noisy aggregated data WITH the noise distribution of the
# aggregated data handed to the model parameters.
for epsilon in epsilons:
    for delta in deltas:
        print("")
        ## seeding to ensure each algo will run on the same dataset
        np.random.seed(0)
        aggdata = AggDataset( train , features, "*&*",  label, epsilon, delta )
        print("")
        for regulL2 in regulL2s:
            # Same construction pattern as the executed MRF cells above: the
            # feature list goes into the parameters and the model takes
            # (aggdata, params).
            params = AggMRFModelParams(
                features=features,
                exactComputation=False,
                clicksCfs = "*&*",
                displaysCfs="*&*",
                nbSamples = 50000,
                regulL2=1.0,
                regulL2Click = regulL2,
                noiseDistribution= aggdata.noiseDistribution
            )
            self = AggMRFModel(aggdata, params)
            self.fit(200)
            print( f"MRF(modeling noise) l1={regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )
        