In [None]:
%load_ext autoreload
%autoreload 2
import sys
from aggregated_models.myimports  import *
import aggregated_models.myJupyterUtils as myJupyterUtils ## Remove stacktraces on Keyboardinterupt
plt.style.use('ggplot')

from datetime import date

# helpers to compute metrics
from aggregated_models.validation import MetricsComputer,  LLH  

# baselines
from aggregated_models.basicmodels import LogisticModel, NaiveBayesModel, LogisticModelWithCF 
from aggregated_models.aggLogistic import AggLogistic

# loading public "criteo attribution dataset"
import aggregated_models.loaddata as loaddata

In [None]:
# code to prepare the aggregated dataset
from aggregated_models.aggdataset import AggDataset 

In [None]:
## Most relevant code is there:
from aggregated_models.agg_mrf_model import AggMRFModel, fastGibbsSample, fastGibbsSampleFromPY0
import aggregated_models.agg_mrf_model
# also in https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/baseaggmodel.py

## Download Data
- downloading criteo-research-attribution-dataset
- from url http://go.criteo.net

In [None]:
#loaddata.download_dataset()

## Loading data
3 versions of the dataset are used for experiments: "small" , "sampled" and "full"
- "full" has 11 features with about 16M samples
- "sampled" has the same 11 features, but only 160k samples
- "small" also has 160k samples, but only the 5 features with the lowest modality counts, which allows for fast experiments.

In [None]:
# Choose which version of the dataset to load: keep exactly one assignment active.
# dataset= "medium_tb" # fast experiments
dataset= "small_tb" # fast experiments
# dataset= "small" # fast experiments
# dataset= "sampled" # Training an MRF may require 5h and 16 GB
# dataset= "full"  # Training an MRF may require 32 GB, and several days

In [None]:
# Load the dataset selected above; the returned 4-tuple is unpacked into the
# training set, the validation set, the features and the label.
# NOTE(review): splitOnDate presumably sends the rows on either side of 2015-03-03
# to train / valid — confirm in loaddata.getDataset.
train, valid, features, label = loaddata.getDataset(dataset, splitOnDate=date(2015, 3,3))

In [None]:
# Six hard-coded ids (presumably feature ids — TODO confirm) that are combined pairwise in the next cell.
fids = [-60029,-60036,-60040,-60042,-60049,-160020]

In [None]:
# Build the "a&b" name of every pair of distinct ids in fids: the id that
# comes later in the list is written first, the earlier one second.
crosses = [
    f"{fids[later]}&{fids[earlier]}"
    for later in range(len(fids))
    for earlier in range(later)
]

In [None]:
# Number of cross names built above (one per pair of distinct ids).
len(crosses)

In [None]:
# All cross names joined into a single "|"-separated string.
'|'.join(crosses)

In [None]:
# Metrics helper built from the label; the (commented-out) evaluation lines
# further down call it as Validation.run(model, data).
Validation = MetricsComputer(label)

In [None]:
# Column types of the training set.
train.dtypes

In [None]:
# Report the number of distinct modalities of each feature in the training set.
# Series.nunique counts the distinct values directly instead of building a
# Python set from the raw values; dropna=False makes missing values count as
# one single modality (a set of numpy floats may count every NaN separately).
for f in features:
    nbModalities = train[f].nunique(dropna=False)
    print( f"feature {f} has {nbModalities} distinct modalities" )

## Preparing Aggregated data

- aggdata contains projections of the number of displays and clicks along each pair of features
- noise may also be added to make it differentially private
- the goal is to learn a model predicting Proba( label | features) using *only* those aggdata.

In [None]:
# Parameters of the privacy-protecting noise.
epsilon = None  # Set to None to get no noise.
delta = None  # passed to AggDataset together with epsilon

In [None]:
# Build the aggregated dataset from the training set. "*&*" is the
# cross-feature specification (described as "all pairs + all single" in the
# MRF model cell further down); epsilon and delta are the noise parameters.
aggdata = AggDataset( train , features, "*&*",  label, epsilon, delta )

# Source: https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/featuremappings.py#L205

In [None]:
# Short summary of the aggregated dataset, one item per output line.
print(
    f" Label: {aggdata.label}",
    f" Nb Queries: {len(aggdata.aggDisplays)}",
    f" Noise distribution: {aggdata.noiseDistribution}",
    sep="\n",
)

In [None]:
# The aggregated data can be exported as a mapping: query name -> dataframe.
aggdata_datframe_dico = aggdata.toDFs()
# Keep the query names in a list so that they can be picked by position.
queries = list(aggdata_datframe_dico.keys())
print( f"list of queries {queries}" )

In [None]:
# Three random rows of the dataframe of one query (the third from the end of `queries`).
# Such a dataframe is the result of a query of the form
#   " select 'cat1', 'cat8' , count, sum(label) group by 'cat1', 'cat8' "
# NOTE(review): 'cat1' / 'cat8' are example names; the pair actually selected depends on the order of `queries`.
aggdata_datframe_dico[queries[-3]].sample(3)

In [None]:
# Display the whole aggDisplays container (indexed by query name in the next cells).
aggdata.aggDisplays

# Dictionary of projections 

In [None]:
# Entry for one pair of features: the key is the two feature names joined by "&".
aggdata.aggDisplays["categorical_feature_6&integer_feature_10"]

In [None]:
# Data of the entry for a single feature.
aggdata.aggDisplays["categorical_feature_6"].Data

# A "projection" contains counts stored in an array.
# There is a dictionary: modality -> index in the array.

In [None]:
# The dictionary is stored in this class:
aggdata.aggDisplays["categorical_feature_6&integer_feature_10"].feature
## https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/featuremappings.py#L32

# Each modality of "cat8" found in train was assigned an id, from 0 to NbCat8-1.
# At index NbCat8 is the count for the modality "not found in train" (which may still appear in valid).

In [None]:
# Work on a copy of 4 random rows of the training set.
df = train.sample(4).copy()
## Replace the original modalities by their index
aggdata.aggDisplays["categorical_feature_6"].feature.Map( df ) ## replaces the modalities of cat8 by their ids (NOTE(review): written here as 1..NbCat8, but the previous cell says 0..NbCat8-1 — confirm)
aggdata.aggDisplays["integer_feature_10"].feature.Map( df )

aggdata.aggDisplays["categorical_feature_6&integer_feature_10"].feature.Map( df ) ##  cat8&cat9 = cat8 + nbCat8 * cat9    (or is it the opposite? TODO confirm)

##### logistic Regression
- Using full log instead of aggdata. 
- Training with all "crossfeatures" found in agg data (i.e. quadratic kernel)
- We do not expect to do better, the goal is to get similar performances

In [None]:
# Logistic regression on the full log, with the "*&*" cross-features.
regulL2 = 16
logisticCfs = LogisticModelWithCF(
    label,
    features,
    "*&*",
    train,
    hashspace=2**22,
    lambdaL2=regulL2,
)

In [None]:
# logisticCfs.fit( train )
# print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )

##### logistic Regression from aggregated clicks and full display data (quadratic kernel)
 - same performances as "standard" logistic regression
 - but still using full display data, so not really useful

In [None]:
# Logistic model built on the aggregated data, with the "*&*" click cross-features.
# NOTE: this rebinds logisticCfs, replacing the LogisticModelWithCF instance
# created in the previous section.
regulL2 = 16
logisticCfs = AggLogistic(
    aggdata,
    features,
    clicksCfs="*&*",
    regulL2=regulL2,
)

In [None]:
# logisticCfs.fit( train[features] , nbIter = 200 )
# print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )

##### Proposed MRF model
- uses only aggregated data
- almost retrieves logistic performances

In [None]:
# Hyper-parameters of the MRF experiment.
regulL2 = 16      # passed below as regulL2Click (parameter "lambda_1")
nbSamples = 3000  # number of Gibbs samples used to estimate the gradient
nbIter = 200      # argument of the (commented-out) self.fit(nbIter) call in the next cell

# NOTE: the model is bound to the name `self`; the inspection cells at the end
# of the notebook rely on that name.
self = AggMRFModel(
    aggdata,
    features,
    exactComputation=False,  # use Gibbs sampling; exact=True is actually broken in the latest code
    clicksCfs="*&*",         # cross-features used by the P(Y|X) part of the model
    displaysCfs="*&*",       # cross-features used by the P(X) part of the model: all pairs + all single
    nbSamples=nbSamples,
    regulL2=1.0,             # parameter "lambda_2"
    regulL2Click=regulL2,    # parameter "lambda_1"
)

In [None]:
# self.fit(nbIter)
# print( f"MRF lambda1= {regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

## Results, random split 1/3 validation, 6 features
### Small sampled TB ###

|Model|train NLLH|train NMSE|valid NLLH|valid NMSE|
|-|-|-|-|-|
|Logistic(QKernel)l2:16|0.0653|0.0261|0.0382|0.0110|
|MRF 3000 lambda1 16|0.0644|0.0257|0.0363|0.0103|

### Medium sampled TB ###

|Model|train NLLH|train NMSE|valid NLLH|valid NMSE|
|-|-|-|-|-|
|Logistic(QKernel)l2:16|0.0486|0.0179|0.0424|0.0143|
|MRF 30000 lambda1 16|0.0484|0.0177|0.0421|0.0140|


## Results, date split 1/3 validation, 6 features

### Small sampled TB ###

|Model|train NLLH|train NMSE|valid NLLH|valid NMSE|Training time|
|-|-|-|-|-|-|
|Logistic(QKernel)l2:16|0.0647 |0.0237|0.0384|0.0144|27s|
|MRF 3000 lambda1 16|0.0635|0.0226|0.0363|0.0137|20s|

### Medium sampled TB ###

|Model|train NLLH|train NMSE|valid NLLH|valid NMSE|Training time|
|-|-|-|-|-|-|
|Logistic(QKernel)l2:16|0.0482|0.0175|0.0435|0.0156|39s|
|MRF 3000 lambda1 16|0.0452|0.0155|0.0409|0.0139|29s|
|MRF 30000 lambda1 16|0.0473|0.0171|0.0428|0.0154|2m50|

### Full sampled TB ###

|Model|train NLLH|train NMSE|valid NLLH|valid NMSE|Training time|
|-|-|-|-|-|-|
|Logistic(QKernel)l2:16|0.0469|0.0164|0.0446|0.0160|OOM->moz upscale|
|AggLogistic(QKernel)|0.0472|0.0165|0.0447|0.0161|7min48s|
|MRF 3000 lambda1 16|0.0419|0.121|0.0391|0.0115|48s|
|MRF 30000 lambda1 16|0.0464|0.0161|0.0439|0.0157|3m28s|
|MRF 300000 lambda1 16|0.0470|0.0164|0.0444|0.0160|20min45s|
|Baba (with crosses)|0.0472|-|0.0447|-|7min|

In [None]:
# All parameters mu and theta, concatenated in a single vector
self.parameters

# This vector is the concatenation of the parameters associated with each projection

In [None]:
# Features and cross-features carrying the "mu" parameters
# (indexed by feature name further down, e.g. self.displayWeights["integer_feature_10"])
self.displayWeights

In [None]:
# Features and cross-features carrying the "theta" parameters
self.clickWeights
# class WeightsSet : https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/baseaggmodel.py#L8

## In the parameter vector, indices from 3719 to 3729 are the parameters "theta"
##   associated with the values of the single feature "cat1"
## NOTE(review): these indices come from one particular run — confirm them for the dataset loaded here.

In [None]:
# There are also two 'intercept' parameters:
self.muIntercept, self.lambdaIntercept
#  ...  thus P(Y = 1 |X =x) = sigmoid( K(x) . self.parameters[someOffset:] +  self.lambdaIntercept )

#  TODO: rename self.lambdaIntercept to self.thetaIntercept to get coherent notations

In [None]:
## Shape of the data holding the samples of "X"

self.samples.data.shape

In [None]:
## Computing dot products between K(x) and mu or theta:

## https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/baseaggmodel.py#L62

# Dot products with the display weights, shifted by the mu intercept
# (presumably one value per sample — TODO confirm).
mus    = self.dotproducts( self.displayWeights, self.samples.data ) + self.muIntercept
mus

## https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/agg_mrf_model.py#L145

## NOTE: comments were added to the code since these links were written, which shifted all the line numbers.

In [None]:
d = self.Data
d
# Vector with the click or display counts from the aggregated data.
# Same indexing as self.parameters

In [None]:
p = self.getPredictionsVector( self.samples )
p
# Expected counts according to the model, computed by Monte Carlo on the samples

In [None]:
## https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/agg_mrf_model.py#L187
# Project the samples, weighted by self.samples.pdisplays, on the single feature "integer_feature_10".
w = self.displayWeights["integer_feature_10"]
w.feature.Project_(  self.samples.data  , self.samples.pdisplays ) # Correct for grads

# A bit needlessly complicated: self.samples.pdisplays is constant.
# This allows having samples with different 'weights', for example one sample for each possible modality of X

In [None]:
# After fitting the model, "data" and "prediction" should be equal:
# the points are expected to lie on the diagonal.
plt.plot( d, p, "x" )
# Label the figure so that it can be read on its own.
plt.xlabel("aggregated data (d)")
plt.ylabel("model prediction (p)")
plt.title("Aggregated data vs. model predictions");  # trailing ';' hides the repr of the last call


In [None]:
# ... up to the noise of the sampling / convergence of the optimizer.
# Same comparison on a log scale; np.log1p(x) computes log(1 + x) and stays
# accurate for small x.
plt.plot( np.log1p(d), np.log1p(p), "x" )
# Label the figure so that it can be read on its own.
plt.xlabel("log(1 + aggregated data)")
plt.ylabel("log(1 + model prediction)")
plt.title("Aggregated data vs. model predictions (log scale)");  # trailing ';' hides the repr of the last call