In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../src")
from myimports  import *
import myJupyterUtils ## Remove stacktraces on Keyboardinterupt
plt.style.use('ggplot')

from featuremappings import AggDataset
from basicmodels import LogisticModel, NaiveBayesModel, LogisticModelWithCF
from validation import MetricsComputer,  LLH
from aggLogistic import AggLogistic
import Optimizers
from agg_mrf_model import AggMRFModel
import loaddata

In [2]:
# Metrics helper shared by every experiment below, where it is called as
# Validation.run(model, dataframe) and reports NLLH / NMSE.
# It is constructed with "click" — presumably the name of the label column; confirm in validation.py.
Validation = MetricsComputer("click")

## Download Data
- downloading criteo-research-attribution-dataset
- from url http://go.criteo.net

In [3]:
# Fetch the dataset through the project's loaddata helper.
# The recorded output shows a download step followed by an unzip step.
loaddata.download_dataset()

downloading dataset
unzipping


## Loading data
3 versions of the dataset are used for experiments: "small" , "sampled" and "full"
- "full" has 11 features with about 16M samples
- "sampled" has the same 11 features, but only 160k samples
- "small" also has 160k samples, but only the 5 features with the lowest modality counts, which allows for fast experiments.

In [4]:
# Choose which version of the dataset the rest of the notebook runs on
# (uncomment exactly one of the lines below).
dataset= "small" # fast experiments
# dataset= "sampled" # Training a MRF may require 5h and 16 GB of data
# dataset= "full"  # Training a MRF may require 32 GB, and several days

In [5]:
# Load the selected dataset version: the train and validation dataframes,
# the list of feature column names, and the label (used below as a column name, e.g. train[label]).
train, valid, features, label = loaddata.getDataset(dataset)

165153
Nb train samples: 115383 , Nb valid samples: 49770  
features:['cat1', 'cat4', 'cat6', 'cat8', 'cat9']


In [6]:
# Peek at two randomly drawn training rows (the selection changes on every run).
train.sample(2)

Unnamed: 0,cat1,cat4,cat6,cat8,cat9,click
147515,30763035,29196072,1973606,20754144,29520629,0
148118,30763035,29196072,1973606,9312274,18291877,0


In [7]:
# Report, for every feature column, how many distinct values ("modalities") appear in the training set.
for featureName in features:
    distinctValues = set(train[featureName].values)
    print( f"feature {featureName} has {len(distinctValues)} distinct modalities" )

feature cat1 has 9 distinct modalities
feature cat4 has 13 distinct modalities
feature cat6 has 30 distinct modalities
feature cat8 has 11 distinct modalities
feature cat9 has 30 distinct modalities


## Preparing Aggregated data

- aggdata contains the numbers of displays and clicks projected along each feature and each pair of features
- noise may also be added to make it differentially private
- the goal is to learn a model predicting Proba( label | features) using *only* those aggdata.

In [8]:
# Parameters of the privacy-protecting noise added to the aggregated data.
epsilon = None  # Set to None to get no noise.
# In the recorded runs at the end of the notebook (epsilon=1.0), delta=None reports a
# LaplaceMechanism and delta=1e-07 reports a GaussianMechanism.
delta = None 

In [9]:
# Aggregate the training data, passing the noise parameters chosen above.
# With "*&*", the recorded query list below contains every single feature and every pair of features.
aggdata = AggDataset( features, "*&*", train , label, epsilon, delta )

In [10]:
# Summarize the aggregated dataset: number of queries and the noise distribution in use.
print(
    f" Nb Queries: {len(aggdata.aggDisplays)}",
    f" Noise distribution: {aggdata.noiseDistribution}",
    sep="\n",
)

 Nb Queries: 15
 Noise distribution: None


In [11]:
# aggdata can be exported as a dictionary: query name -> dataframe
aggdata_datframe_dico = aggdata.toDFs()
# Collect the query names in the order the dictionary yields them.
queries = list( aggdata_datframe_dico.keys() )
print( f"list of queries {queries}" )

list of queries ['cat1', 'cat4', 'cat6', 'cat8', 'cat9', 'cat1&cat4', 'cat1&cat6', 'cat1&cat8', 'cat1&cat9', 'cat4&cat6', 'cat4&cat8', 'cat4&cat9', 'cat6&cat8', 'cat6&cat9', 'cat8&cat9']


In [12]:
# Dataframe for the query  " select 'cat1', 'cat8' , count, sum(label) group by 'cat1', 'cat8' "
# Three random rows are shown; column "c" presumably holds the count and "click" the summed label — confirm in featuremappings.
aggdata_datframe_dico["cat1&cat8"].sample(3)

Unnamed: 0,cat1&cat8,cat1,cat8,c,click
68,68,8,6,1106.0,426.0
117,117,7,11,0.0,0.0
16,16,6,1,391.0,84.0


# Comparing models on noiseless data

In [13]:
# Rebuild the aggregated data with epsilon and delta explicitly set to None (no noise),
# so this section does not depend on the values chosen in the privacy-parameter cell.
aggdata = AggDataset( features, "*&*", train , label, None, None )

In [14]:
# Set to True to run the full benchmarks on the regularization parameter.
# Keep it False to train only with the pre-selected value for each model.
runBenchmarks = False

##### Logistic regression
- Using the full log instead of aggdata.
- Training with all "crossfeatures" found in aggdata (i.e. quadratic kernel)
- We do not expect the models trained on aggregated data to do better than this baseline; the goal is to get similar performance

In [15]:
## regulL2 was benched to get the best validation LLH
if runBenchmarks:
    regulL2s = [4, 8, 16, 32, 64, 128, 256, 512]
else:
    # One pre-selected value per dataset version; any version other than "small"/"sampled" uses 64.
    regulL2s = {"small": [16], "sampled": [128]}.get(dataset, [64])

for regulL2 in regulL2s:
    # Logistic regression on the full training log, with "*&*" cross-features.
    logisticCfs = LogisticModelWithCF(
        "click", features, "*&*", train,
        hashspace=2**22, lambdaL2=regulL2,
    )
    logisticCfs.fit( train )
    trainMetrics = Validation.run(logisticCfs, train)
    validMetrics = Validation.run(logisticCfs, valid)
    print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  trainMetrics , "valid" , validMetrics   )

Logistic(*&*), l2:16 train NLLH=0.0620, NMSE=0.0799   valid NLLH=0.0561, NMSE=0.0724  


##### Logistic regression from aggregated clicks and full display data (quadratic kernel)
 - same performance as "standard" logistic regression
 - but still uses the full display data, so not really useful

In [16]:
## regulL2 was benched to get the best validation LLH
if runBenchmarks:
    regulL2s = [4, 8, 16, 32, 64, 128, 256, 512]
else:
    # One pre-selected value per dataset version; any version other than "small"/"sampled" uses 64.
    regulL2s = {"small": [16], "sampled": [128]}.get(dataset, [64])

for regulL2 in regulL2s:
    # Click information comes from aggdata; the fit still receives the full display features.
    logisticCfs = AggLogistic( aggdata, features, clicksCfs="*&*", regulL2=regulL2 )
    logisticCfs.fit( train[features], nbIter=200 )
    trainMetrics = Validation.run(logisticCfs, train)
    validMetrics = Validation.run(logisticCfs, valid)
    print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  trainMetrics , "valid" , validMetrics   )
    

Logistic(*&*), l2:16 train NLLH=0.0622, NMSE=0.0801   valid NLLH=0.0561, NMSE=0.0725  


##### Logistic regression without quadratic kernel
 - still a solid baseline, but significantly weaker than logistic regression with the quadratic kernel

In [17]:
# Regularization values to benchmark; a single pre-selected value is used unless runBenchmarks is set.
regulL2s = [0.5, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [1] if dataset == "small" else [8]

for regulL2 in regulL2s:
    # clicksCfs = "*": this baseline is trained without the "*&*" cross-features.
    logisticCfs = AggLogistic(  aggdata , features, clicksCfs = "*" , regulL2=regulL2 )
    logisticCfs.fit( train[features] , nbIter = 200 )
    # Label the result with the feature set actually used ("*"), so that this line cannot be
    # mistaken for the "Logistic(*&*)" results printed by the quadratic-kernel cells.
    print( f"Logistic(*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )
    

Logistic(*&*), l2:1 train NLLH=0.0544, NMSE=0.0705   valid NLLH=0.0538, NMSE=0.0696  


##### Proposed MRF model
- uses only aggregated data
- almost retrieves the logistic regression performance

In [18]:
# Values of the click regularization (regulL2Click, printed as "lambda1") to benchmark.
regulL2s = [ 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    # Every branch must yield a list because regulL2s is iterated below
    # (the "full" branch previously produced the bare int 512, which is not iterable).
    regulL2s = [16] if dataset == "small" else [128] if dataset == "sampled" else [512]
nbSamples = 50000
nbIter = 200
if dataset == "full": # requires more samples and more training iterations to converge.
    print( "Warning: training one model with these settings may take about 1 week" )
    nbIter = 1500
    nbSamples = 400000

for regulL2 in regulL2s:
    # MRF trained from aggregated data only, with "*&*" aggregates for both clicks and displays.
    # regulL2 is held at 1.0 here; only regulL2Click is varied.
    self = AggMRFModel( aggdata, features ,
                           exactComputation=False , clicksCfs = "*&*", displaysCfs="*&*",
                          nbSamples = nbSamples, regulL2=1.0 , regulL2Click = regulL2)
    self.fit(nbIter)
    print( f"MRF lambda1= {regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

llh:0.0E+00 a:1.1E-01(0), n:200, g:1.4E+03  --
MRF lambda1= 16 train NLLH=0.0614, NMSE=0.0791   valid NLLH=0.0555, NMSE=0.0717  


#### Naive Bayes
- rather weak baseline
- would require only the example and label counts aggregated on each single feature
- requires very strong L2 regularization to get acceptable results when the number of features grows

In [19]:
## Implementation:  one classifier (logistic regression) per feature.
if runBenchmarks:
    regulL2s = [ 4, 8, 16, 32, 64, 128, 256, 512]
elif dataset == "small":
    regulL2s = [1]
else:
    regulL2s = [256]

for regulL2 in regulL2s:
    self = NaiveBayesModel( label, features, regulL2 )
    self.fit( train )
    trainMetrics = Validation.run(self, train)
    validMetrics = Validation.run(self, valid)
    print( f"NaiveBayes, regulL={regulL2}",  "train",   trainMetrics , "valid" , validMetrics   )

NaiveBayes, regulL=1 train NLLH=0.0500, NMSE=0.0655   valid NLLH=0.0482, NMSE=0.0631  


In [20]:
## Implementation:  MRF using only aggregated data on single features
regulL2s = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [1] if dataset == "small" else [256]

# Training budget used by this cell. It is named here so that the log line below reports the
# values actually passed to the model, instead of the notebook-wide nbSamples / nbIter
# (which other cells set to different values).
mrfNbSamples = 50000
mrfNbIter = 200

for regulL2 in regulL2s:
    print( f"nbSamples:{mrfNbSamples} , nbIter:{mrfNbIter}, lambda_1:{regulL2} " )
    # "*" for both clicks and displays: only single-feature aggregates are used.
    self = AggMRFModel( aggdata, features ,
                           exactComputation=False , clicksCfs = "*", displaysCfs="*",
                          nbSamples = mrfNbSamples, regulL2=1.0 , regulL2Click = regulL2)
    self.fit(mrfNbIter)
    print( f"NaiveBayes,(MRF implem) lambda1= {regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

nbSamples:50000 , nbIter:200, lambda_1:1 
llh:0.0E+00 a:2.5E-01(0), n:200, g:6.6E+02  --
NaiveBayes,(MRF implem) lambda1= 1 train NLLH=0.0490, NMSE=0.0642   valid NLLH=0.0468, NMSE=0.0615  


#### MRF without "cross-features" on the P(Y|X) part of the model
Should be compared to the "simple" logistic regression with no kernel. Requires:
- counts of clicks aggregated on each feature,
- counts of displays aggregated on each pair of features

In [21]:
# Values of the click regularization (regulL2Click, printed as "lambda1") to benchmark.
regulL2s = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
if not runBenchmarks:
    regulL2s = [4] if dataset == "small" else [64]

# Training budget used by this cell. It is named here so that the log line below reports the
# values actually passed to the model, instead of the notebook-wide nbSamples / nbIter
# (which other cells set to different values).
mrfNbSamples = 50000
mrfNbIter = 200

for regulL2 in regulL2s:
    print( f"nbSamples:{mrfNbSamples} , nbIter:{mrfNbIter}, lambda_1:{regulL2} " )
    # Clicks use single-feature aggregates ("*"); displays use the "*&*" aggregates.
    self = AggMRFModel( aggdata, features ,
                           exactComputation=False , clicksCfs = "*", displaysCfs="*&*",
                          nbSamples = mrfNbSamples, regulL2=1.0 , regulL2Click = regulL2)
    self.fit(mrfNbIter)
    print( f"MRF_simple lambda1= {regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

nbSamples:50000 , nbIter:200, lambda_1:4 
llh:0.0E+00 a:1.7E-01(0), n:200, g:1.3E+03  --
MRF_simple lambda1= 4 train NLLH=0.0529, NMSE=0.0687   valid NLLH=0.0518, NMSE=0.0672  


# Other results on noiseless data

## Benching MRF regularization

The MRF model has 2 distinct regularization parameters:
- one controlling the smoothness of P(Y|X). This one should be set to roughly the same value as the regularization parameter of a logistic regression with the same features.
- one controlling the smoothness of P(X). This parameter should be kept at a low value.



In [22]:
regulL2s = [ 0.25, 1, 4, 16, 64, 256]

if runBenchmarks and dataset == "small":
    # Full grid of (lambda_1, lambda_2) pairs, lambda_1 varying slowest.
    lambdas = [ (l1,l2) for l1 in regulL2s for l2 in regulL2s  ]
else:
    # running only with lambda_1 set to the value giving best results when lambda_2=1,
    #  and setting lambda_2 to lambda_1.
    lambdas = [(16,16)] if dataset == "small" else [(128,128)]

for lambda1, lambda2 in lambdas:
    # lambda1 goes to regulL2Click, lambda2 to regulL2.
    self = AggMRFModel(
        aggdata, features,
        exactComputation=False, clicksCfs="*&*", displaysCfs="*&*",
        nbSamples=50000, regulL2=lambda2, regulL2Click=lambda1,
    )
    self.fit(200)
    trainMetrics = Validation.run(self, train)
    validMetrics = Validation.run(self, valid)
    print( f"MRF l1= {lambda1} l2= {lambda2}",  "train",   trainMetrics , "valid" , validMetrics   )

llh:0.0E+00 a:1.1E-01(0), n:200, g:2.8E+03  --
MRF l1= 16 l2= 16 train NLLH=0.0591, NMSE=0.0762   valid NLLH=0.0538, NMSE=0.0695  


## Sampling Y in the MC estimate of the gradient
- this increases variance significantly, at least when the dataset is strongly imbalanced


In [23]:
def get_unbalanced_df(df , label_sampling_rate , label ):
    """Return a randomly sub-sampled copy of ``df`` where the keep decision depends on the label.

    One uniform number ``u`` in [0, 1) is drawn per row with ``np.random.rand`` and the row is
    kept when ``u - y > -label_sampling_rate``, ``y`` being the row's value in column ``label``.
    For 0/1 labels and a rate in (0, 1], rows with y == 0 are therefore always kept, while
    rows with y == 1 are kept with probability ``label_sampling_rate``.

    Args:
        df: input dataframe; it is not modified.
        label_sampling_rate: threshold used in the keep test above.
        label: name of the label column in ``df``.

    Returns:
        A new dataframe with the kept rows. Its index is reset, so the previous index
        is added as a leading column by ``reset_index()``.
    """
    labelValues = df[label].values
    uniformDraws = np.random.rand( len(labelValues) )
    keptRows = ( uniformDraws - labelValues ) > -label_sampling_rate
    subsampled = df[ keptRows ].reset_index()
    return subsampled.copy()

In [24]:
label_sampling_rate = 0.02
# Sub-sample train first, then valid (the order matters for the random draws).
train_unbalanced = get_unbalanced_df( train, label_sampling_rate, label )
valid_unbalanced = get_unbalanced_df( valid, label_sampling_rate, label )
aggdata_unbalanced = AggDataset( features, "*&*", train_unbalanced, label )

# Sanity check: fraction of the label mass kept, then fraction of (1 - label) kept.
keptLabelFraction = train_unbalanced[label].sum() / train[label].sum()
keptComplementFraction = (1 - train_unbalanced[label]).sum() / (1 - train[label]).sum()
print(keptLabelFraction)
print(keptComplementFraction)

0.020789821149921978
1.0


In [25]:
regulL2 = 16.0
# The two variants compared for each sample count: (name printed in the results, set decollapseGibbs?).
variants = [ ("MRF(collapsed)", False), ("MRF(Sampling Y)", True) ]

# The loop variable is deliberately not called nbSamples, so that this experiment does not
# overwrite the notebook-wide nbSamples setting defined in the main MRF cell.
for sampleCount in [100,1000,10000]:
    print( f"nbSamples:{sampleCount} " )
    for variantName, sampleY in variants:
        self = AggMRFModel( aggdata_unbalanced, features ,
                               exactComputation=False , clicksCfs = "*&*", displaysCfs="*&*",
                              nbSamples = sampleCount, regulL2=1.0 , regulL2Click = regulL2)
        if sampleY:
            # Only the "Sampling Y" variant switches this flag on; the other keeps the model default.
            self.decollapseGibbs = True
        self.fit(100)
        print( f"{variantName},nbSamples= {sampleCount}",  "train",   Validation.run(self,train_unbalanced) , "valid" , Validation.run(self,valid_unbalanced)   )
print("")

nbSamples:100 
llh:0.0E+00 a:1.7E-01(0), n:100, g:5.3E+04  --
MRF(collapsed),nbSamples= 100 train NLLH=0.0047, NMSE=-0.0010   valid NLLH=-0.0218, NMSE=-0.0064  
llh:0.0E+00 a:1.1E-01(0), n:100, g:5.9E+04  --
MRF(Sampling Y),nbSamples= 100 train NLLH=-0.3144, NMSE=-0.2526   valid NLLH=-0.4064, NMSE=-0.2969  
nbSamples:1000 
llh:0.0E+00 a:1.7E-01(0), n:100, g:1.3E+04  --
MRF(collapsed),nbSamples= 1000 train NLLH=0.0363, NMSE=0.0054   valid NLLH=0.0172, NMSE=0.0024  
llh:0.0E+00 a:1.1E-01(0), n:100, g:1.5E+04  --
MRF(Sampling Y),nbSamples= 1000 train NLLH=0.0074, NMSE=0.0016   valid NLLH=-0.0080, NMSE=0.0001  
nbSamples:10000 
llh:0.0E+00 a:1.7E-01(0), n:100, g:1.8E+03  --
MRF(collapsed),nbSamples= 10000 train NLLH=0.0365, NMSE=0.0059   valid NLLH=0.0144, NMSE=0.0020  
llh:0.0E+00 a:1.1E-01(0), n:100, g:2.4E+03  --
MRF(Sampling Y),nbSamples= 10000 train NLLH=0.0196, NMSE=0.0026   valid NLLH=-0.0162, NMSE=-0.0053  



# Learning differentially private models

In [26]:
# Grid of privacy parameters and click regularizations for the differential privacy experiments.
if runBenchmarks:
    epsilons = [10.0 , 1.0 , 0.1]
    deltas = [ None ,1e-7, 1e-4 ]
    regulL2s = [4.0, 16, 64, 256, 1024 ]
else:
    # Reduced grid used when the full benchmarks are disabled.
    epsilons = [1.0]
    deltas = [ None ,1e-7 ]
    regulL2s = [ 16, 64 ]


In [27]:
# NOTE: this loop rebinds the notebook-level names epsilon, delta and aggdata.
for epsilon in epsilons:
    for delta in deltas:
        print("")
        ## Seeding to ensure each algo will run on the same dataset.
        # In the article, this was not seeded, but both presented models were trained on the same instance of aggdata
        np.random.seed(0)
        aggdata = AggDataset( features, "*&*", train, label, epsilon, delta )
        print("")
        for regulL2 in regulL2s:
            # No noiseDistribution is passed: the model is fitted without a noise model.
            self = AggMRFModel(
                aggdata, features,
                exactComputation=False, clicksCfs="*&*", displaysCfs="*&*",
                nbSamples=50000, regulL2=1.0, regulL2Click=regulL2,
            )
            self.fit(200)
            trainMetrics = Validation.run(self, train)
            validMetrics = Validation.run(self, valid)
            print( f"MRF(no noise model) l1={regulL2}",  "train",   trainMetrics , "valid" , validMetrics   )
        


LaplaceMechanism epsilon:1.0  scale:0.06666666666666667 sigma:21.209275573763474

llh:0.0E+00 a:1.1E-01(0), n:200, g:1.8E+03  --
MRF(no noise model) l1=16 train NLLH=0.0419, NMSE=0.0575   valid NLLH=0.0371, NMSE=0.0520  
llh:0.0E+00 a:1.6E-01(1), n:200, g:1.7E+03  --
MRF(no noise model) l1=64 train NLLH=0.0547, NMSE=0.0707   valid NLLH=0.0521, NMSE=0.0675  

GaussianMechanism epsilon:1.0 delta:1e-07 sigma:21.373603341459955

llh:0.0E+00 a:1.0E-01(1), n:200, g:2.6E+03  --
MRF(no noise model) l1=16 train NLLH=0.0396, NMSE=0.0558   valid NLLH=0.0328, NMSE=0.0479  
llh:0.0E+00 a:1.0E-01(1), n:200, g:1.6E+03  --
MRF(no noise model) l1=64 train NLLH=0.0551, NMSE=0.0711   valid NLLH=0.0516, NMSE=0.0667  


In [28]:
# NOTE: this loop rebinds the notebook-level names epsilon, delta and aggdata.
for epsilon in epsilons:
    for delta in deltas:
        print("")
        ## Seeding to ensure each algo will run on the same dataset
        np.random.seed(0)
        aggdata = AggDataset( features, "*&*", train, label, epsilon, delta )
        print("")
        for regulL2 in regulL2s:
            # The noise distribution of the aggregated data is handed to the model.
            self = AggMRFModel(
                aggdata, features,
                exactComputation=False, clicksCfs="*&*", displaysCfs="*&*",
                nbSamples=50000, regulL2=1.0, regulL2Click=regulL2,
                noiseDistribution=aggdata.noiseDistribution,
            )
            self.fit(200)
            trainMetrics = Validation.run(self, train)
            validMetrics = Validation.run(self, valid)
            print( f"MRF(modeling noise) l1={regulL2}",  "train",   trainMetrics , "valid" , validMetrics   )
        


LaplaceMechanism epsilon:1.0  scale:0.06666666666666667 sigma:21.209275573763474

llh:0.0E+00 a:1.1E-01(0), n:200, g:1.9E+03  --
MRF(modeling noise) l1=16 train NLLH=0.0546, NMSE=0.0704   valid NLLH=0.0521, NMSE=0.0675  
llh:0.0E+00 a:1.1E-01(0), n:200, g:1.3E+03  --
MRF(modeling noise) l1=64 train NLLH=0.0537, NMSE=0.0693   valid NLLH=0.0522, NMSE=0.0675  

GaussianMechanism epsilon:1.0 delta:1e-07 sigma:21.373603341459955

llh:0.0E+00 a:1.1E-01(0), n:200, g:1.6E+03  --
MRF(modeling noise) l1=16 train NLLH=0.0556, NMSE=0.0718   valid NLLH=0.0528, NMSE=0.0682  
llh:0.0E+00 a:1.6E-01(1), n:200, g:1.4E+03  --
MRF(modeling noise) l1=64 train NLLH=0.0539, NMSE=0.0696   valid NLLH=0.0523, NMSE=0.0677  
