In [1]:
%load_ext autoreload
%autoreload 2
from aggregated_models.myimports  import *

plt.style.use('ggplot')

from aggregated_models.aggdataset import AggDataset
from aggregated_models.basicmodels import NaiveBayesModel, LogisticModelWithCF
from aggregated_models.validation import MetricsComputer
from aggregated_models.aggLogistic import AggLogistic
from aggregated_models.agg_mrf_model import AggMRFModel, AggMRFModelParams
from  aggregated_models import loaddata
from  aggregated_models import Optimizers


failed to load pyspark
failed to load pyspark


In [2]:
Validation = MetricsComputer("click")

## Download Data
- downloading criteo-research-attribution-dataset
- from url http://go.criteo.net

In [3]:
loaddata.download_dataset()

downloading dataset
unzipping


## Loading data
3 versions of the dataset are used for experiments: "small" , "sampled" and "full"
- "full" has 11 features with about 16M samples
- "sampled" has the same 11 features, but only 160k samples
- "small" also has 160k samples, but only the 5 features with lowest modalities count, and allow for fast experiments.

In [4]:
dataset= "small" # fast experiments
# dataset= "sampled" # Training a MRF may require 5h and 16Go data
# dataset= "full"  # Training a MRF may require 32Go, and several days

In [6]:
train, valid, features, label = loaddata.getDataset(dataset)

Sampling ratio :0.01
Nb train samples: 115382 , Nb valid samples: 49770  
features:['cat1', 'cat4', 'cat6', 'cat8', 'cat9']


In [7]:
len(train), len(valid)

(115382, 49770)

In [8]:
train.sample(2)

Unnamed: 0,cat1,cat4,cat6,cat8,cat9,click
16105981,25259032,23549932,5824235,29196072,29196072,0
10532191,30763035,29196072,1973606,23998111,16022558,0


In [9]:
for f in features:
    nbModalities = len(set(train[f].values))
    print( f"feature {f} has {nbModalities} distinct modalities" )

feature cat1 has 9 distinct modalities
feature cat4 has 14 distinct modalities
feature cat6 has 30 distinct modalities
feature cat8 has 11 distinct modalities
feature cat9 has 30 distinct modalities


## Preparing Aggregated data

- aggdata contains projections of number of displays and clicks along each pair of feature
- may also add some noise to make it differential private
- the goal is to learn a model predicting Proba( label | features) using *only* those aggdata.

In [10]:
# parameters for of the privacy protecting noise.
epsilon = None  # Set to None to get no noise.
delta = None 

In [11]:
aggdata = AggDataset.FromDF( train , features, "*&*",  label, epsilon0=epsilon, delta=delta )

In [12]:
aggdata.aggClicks[ 'cat1&cat4' ].Data

array([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       4.2000e+01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       1.0000e+00, 0.0000e+00, 2.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       1.6700e+02, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       2.0000e+01, 1.6000e+01, 2.7000e+01, 3.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       6.1000e+01, 5.2000e+01, 8.1000e+01, 2.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       2.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       5.0900e+02, 9.1000e+01, 1.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e

In [13]:
print( f" Nb Queries: {len(aggdata.aggDisplays)}")
print( f" Noise distribution: {aggdata.noiseDistribution}" )

 Nb Queries: 15
 Noise distribution: None


In [14]:
# aggdata may be viewed as a dictionary queryname -> dataframe
queries = [x for x in aggdata.aggDisplays]
print( f"list of queries {queries}" )

list of queries ['cat1', 'cat4', 'cat6', 'cat8', 'cat9', 'cat1&cat4', 'cat1&cat6', 'cat1&cat8', 'cat1&cat9', 'cat4&cat6', 'cat4&cat8', 'cat4&cat9', 'cat6&cat8', 'cat6&cat9', 'cat8&cat9']


# Comparing models on noiseless data

In [15]:
aggdata = AggDataset.FromDF( train , features, "*&*",  label )

In [16]:
#  Set to True to run the full benchmarks on Regulaization parameter.
# Keep to false to get only the selected value
runBenchmarks = False

##### logistic Regression
- Using full log instead of aggdata. 
- Training with all "crossfeatures" found in agg data ( ie quadratic kernell)
- We do not expect to do better, the goal is to get similar performances

In [17]:
## regulL2 was  benched to get best validation LLH
regulL2s = [4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [16] if dataset == "small" else [128] if dataset == "sampled" else [64]

for regulL2 in regulL2s:
    logisticCfs = LogisticModelWithCF( "click" , features, "*&*"  , train ,
                                      hashspace=2**22 , lambdaL2 = regulL2  )
    logisticCfs.fit( train )
    print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )

Logistic(*&*), l2:16 train NLLH=0.0600, NMSE=0.0772   valid NLLH=0.0560, NMSE=0.0724  


##### logistic Regression from aggregated clicks and full display data (quadratic kernell)
 - same performances as "standard" logistic regression
 - but still using full display data, so not really usefull

In [18]:
## regulL2 was  benched to get best validation LLH
regulL2s = [4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [16] if dataset == "small" else [128] if dataset == "sampled" else [64]

for regulL2 in regulL2s:
    logisticCfs = AggLogistic(  aggdata , features, clicksCfs = "*&*" , rescaling=True, regulL2=regulL2 )
    logisticCfs.fit( train[features] , nbIter = 400 )
    print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )
    

Logistic(*&*), l2:16 train NLLH=0.0598, NMSE=0.0770   valid NLLH=0.0558, NMSE=0.0721  


##### logistic Regression without quadratic kernell
 - still solid baseline, but significantly weaker than logistic with quadratic kernell

In [19]:
regulL2s = [0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [1] if dataset == "small" else [8] 
    
for regulL2 in regulL2s:
    logisticCfs = AggLogistic(  aggdata , features, clicksCfs = "*" , regulL2=regulL2 )
    logisticCfs.fit( train[features] , nbIter = 400 )
    print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )
    

Logistic(*&*), l2:1 train NLLH=0.0535, NMSE=0.0691   valid NLLH=0.0538, NMSE=0.0698  


##### Proposed MRF model
- uses only aggregated data
- almost retrieves logistic performances

In [20]:
regulL2s = [ 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [16] if dataset == "small" else [128] if dataset == "sampled" else 512 
nbSamples = 50000
nbIter = 200
if dataset == "full": # requires more samples and more training iterations to converge.
    print( "Warning: training one model with these settings may take about 1 week" )
    nbIter = 1500
    nbSamples = 400000    
for regulL2 in regulL2s:
    params = AggMRFModelParams(
        features=features,
        exactComputation=False ,
        clicksCfs = "*&*",
        displaysCfs="*&*",
        nbSamples = nbSamples,
        regulL2=1.0,
        regulL2Click = regulL2 )
    self = AggMRFModel(aggdata, params)
    self.fit(nbIter)
    print( f"MRF lambda1= {regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )
    

MRF lambda1= 16 train NLLH=0.0593, NMSE=0.0764   valid NLLH=0.0556, NMSE=0.0719  


#### Naive Bayes
- rather weak baseline
- would require only examples and labels counts aggregated on each single feature
- requires very strong L2 regularization to get Okish results when the number of features grows

In [21]:
## Implementation:  one classifier (logistic regresion) per feature.
regulL2s = [ 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [1] if dataset == "small" else [256]

for regulL2 in regulL2s:
    self = NaiveBayesModel( label, features , regulL2)
    self.fit(train)
    print( f"NaiveBayes, regulL={regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

NaiveBayes, regulL=1 train NLLH=0.0488, NMSE=0.0637   valid NLLH=0.0505, NMSE=0.0661  


In [22]:
## Implementation:  MRF using only aggregated data on single features
regulL2s = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [1] if dataset == "small" else [256] 

for regulL2 in regulL2s:
    print( f"nbSamples:{nbSamples} , nbIter:{nbIter}, lambda_1:{regulL2} " )
    params = AggMRFModelParams(
        exactComputation=False,
        clicksCfs = "*",
        displaysCfs="*",
        nbSamples = 50000,
        regulL2=1.0,
        regulL2Click = regulL2,
        features = features,
    )
    self = AggMRFModel(aggdata, params)
    self.fit(200)
    print( f"NaiveBayes,(MRF implem) lambda1= {regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

nbSamples:50000 , nbIter:200, lambda_1:1 
NaiveBayes,(MRF implem) lambda1= 1 train NLLH=0.0488, NMSE=0.0637   valid NLLH=0.0505, NMSE=0.0660  


#### MRF without "cross-features" on the P(Y|X) part of the model
Should be compared to the "simple" logistic with no Kernell. Requires:
- count of clicks aggregated on each feature,
- count of displays aggregated on each pair of features

In [23]:
regulL2s = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [4] if dataset == "small" else [64] 

for regulL2 in regulL2s:
    print( f"nbSamples:{nbSamples} , nbIter:{nbIter}, lambda_1:{regulL2} " )
    params = AggMRFModelParams(
        exactComputation=False,
        clicksCfs = "*",
        displaysCfs="*&*",
        nbSamples = 50000,
        regulL2=1.0,
        regulL2Click = regulL2,
        features = features,        
    )
    self = AggMRFModel(aggdata, params)
    self.fit(200)
    print( f"MRF_simple lambda1= {regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

nbSamples:50000 , nbIter:200, lambda_1:4 
MRF_simple lambda1= 4 train NLLH=0.0526, NMSE=0.0681   valid NLLH=0.0532, NMSE=0.0690  


# Other results on noiseless data

## Benching MRF regularization

the MRF model has 2 distinct regularization parameters:
- one controling the smoothness of the P(Y|X) . This one should be set roughly with the same value as the regularization parameter of a logistic regression with the same features.
- one controling the smoothness of the P(X). This parameter should be kept to a low value.



In [24]:
regulL2s = [ 0.25, 1, 4, 16, 64, 256]
lambdas = [ (l1,l2) for l1 in regulL2s for l2 in regulL2s  ]

if not runBenchmarks or dataset != "small":
    # running only with lambda_1 set to the value giving best results when lambda_2=1,  
    #  and setting lambda_2 to lambda_1. 
    lambdas = [(16,16)] if dataset == "small" else [(128,128)] 

for lambda1,lambda2 in lambdas:
    params = AggMRFModelParams(
        exactComputation=False,
        clicksCfs = "*&*",
        displaysCfs="*&*",
        nbSamples = 50000,
        regulL2=lambda2,
        regulL2Click = lambda1,
        features = features,
    )
    self = AggMRFModel(aggdata, params)
    self.fit(200)
    print( f"MRF l1= {lambda1} l2= {lambda2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

MRF l1= 16 l2= 16 train NLLH=0.0593, NMSE=0.0763   valid NLLH=0.0557, NMSE=0.0720  


# Learning differential private models

In [25]:
epsilons = [10.0 , 1.0 , 0.1]
deltas = [ None ,1e-7, 1e-4 ]
regulL2s = [4.0, 16, 64, 256, 1024 ]


if runBenchmarks == False:
    epsilons = [1.0]
    deltas = [ None ,1e-7 ]
    regulL2s = [ 16, 64 ]


In [26]:
for epsilon in epsilons:
    for delta in deltas:
        print("")
        ## seeding to ensure each algo will run on the same dataset. 
        # In the article, this was not seeded, but both presented models were trained on the same instance of aggdata 
        np.random.seed(0)
        aggdata = AggDataset.FromDF( train , features, "*&*",  label, epsilon0=epsilon, delta=delta )
        print("")
        params = AggMRFModelParams(
            exactComputation=False,
            clicksCfs = "*&*",
            displaysCfs="*&*",
            nbSamples = 50000,
            regulL2=1.0,
            regulL2Click = regulL2,
            features = features,
            gaussiansigma = 0
        )
        for regulL2 in regulL2s:
            self = AggMRFModel(aggdata, params)
            self.fit(200)
            print( f"MRF(no noise model) l1={regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )
        


LaplaceMechanism epsilon:1.0  scale:0.03333333333333333 sigma:42.42444274934185

MRF(no noise model) l1=16 train NLLH=-0.0899, NMSE=-0.0253   valid NLLH=-0.1095, NMSE=-0.0365  
MRF(no noise model) l1=64 train NLLH=-0.0899, NMSE=-0.0253   valid NLLH=-0.1095, NMSE=-0.0364  

GaussianMechanism epsilon:1.0 delta:1e-07 sigma:30.226839722275574

MRF(no noise model) l1=16 train NLLH=0.0500, NMSE=0.0646   valid NLLH=0.0496, NMSE=0.0641  
MRF(no noise model) l1=64 train NLLH=0.0500, NMSE=0.0646   valid NLLH=0.0496, NMSE=0.0641  


In [27]:
for epsilon in epsilons:
    for delta in deltas:
        print("")
        ## seeding to ensure each algo will run on the same dataset
        np.random.seed(0)
        aggdata = AggDataset.FromDF( train , features, "*&*",  label, epsilon0=epsilon, delta=delta )
        print("")
        for regulL2 in regulL2s:
            params = AggMRFModelParams(
                exactComputation=False,
                clicksCfs = "*&*",
                displaysCfs="*&*",
                nbSamples = 50000,
                regulL2=1.0,
                regulL2Click = regulL2,
                features = features,
                gaussiansigma = aggdata.noiseDistribution.sigma /2
            )
            self = AggMRFModel(aggdata, params)
            self.fit(200)
            print( f"MRF(modeling noise) l1={regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )
        


LaplaceMechanism epsilon:1.0  scale:0.03333333333333333 sigma:42.42444274934185

MRF(modeling noise) l1=16 train NLLH=0.0484, NMSE=0.0626   valid NLLH=0.0467, NMSE=0.0605  
MRF(modeling noise) l1=64 train NLLH=0.0490, NMSE=0.0632   valid NLLH=0.0488, NMSE=0.0630  

GaussianMechanism epsilon:1.0 delta:1e-07 sigma:30.226839722275574

MRF(modeling noise) l1=16 train NLLH=0.0497, NMSE=0.0641   valid NLLH=0.0492, NMSE=0.0636  
MRF(modeling noise) l1=64 train NLLH=0.0505, NMSE=0.0650   valid NLLH=0.0509, NMSE=0.0657  
