In [1]:
# Reload edited project modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2
import sys
# NOTE(review): wildcard import; `plt` used below is presumably provided by it — confirm.
from agg_models.myimports  import *
# import agg_models.myJupyterUtils as myJupyterUtils ## Removes stack traces on KeyboardInterrupt
plt.style.use('ggplot')

from datetime import date

# Helpers to compute validation metrics
from agg_models.validation import MetricsComputer,  LLH  

# Baseline models
from agg_models.basicmodels import LogisticModel, NaiveBayesModel, LogisticModelWithCF 
from agg_models.aggLogistic import AggLogistic

# Loader for the public "criteo attribution dataset"
import agg_models.loaddata as loaddata

In [2]:
%matplotlib inline

In [3]:
# code to prepare the aggregated dataset
from agg_models.featuremappings import AggDataset 

In [4]:
## Most relevant code is there:
from agg_models.agg_mrf_model import AggMRFModel, fastGibbsSample, fastGibbsSampleFromPY0
import agg_models.agg_mrf_model
# also in https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/baseaggmodel.py

In [5]:
import thx.hadoop.hdfs_cache as hdfs
from thx.hadoop.spark_config_builder import create_remote_spark_session, SparkSession
import pyspark
from pyspark.sql import functions as F
from thx.datasources.parquet import create_df_from_parquet
from datetime import datetime, timedelta
import os



In [6]:
# Open the remote Spark session used by the Spark-backed model further down.
# NOTE(review): the positional 20 and 1 are presumably executor / core counts — confirm
# against create_remote_spark_session's signature.
ss = create_remote_spark_session(
    'Test Spark parallelize',
    20,
    1,
    memory='4g',
    memoryOverhead='2g',
    driver_memory='12g',
    hadoop_file_systems=['viewfs://root', 'viewfs://prod-am6'],
)
ss

## Download Data
- downloading criteo-research-attribution-dataset
- from url http://go.criteo.net

In [7]:
# Uncomment the next line to download the dataset:
# loaddata.download_dataset()

## Loading data
Three versions of the dataset are used for experiments: "small", "sampled" and "full".
- "full" has 11 features with about 16M samples
- "sampled" has the same 11 features, but only 160k samples
- "small" also has 160k samples, but only the 5 features with the lowest modality counts, which allows for fast experiments.

In [8]:
# Choose the dataset variant to load; exactly one assignment should be active.
# dataset= "small_tb" # fast experiments
# dataset= "medium_tb" # fast experiments
dataset= "small" # fast experiments
# dataset= "sampled" # Training a MRF may require 5h and 16 GB
# dataset= "full"  # Training a MRF may require 32 GB, and several days

In [9]:
# Load the train / validation splits together with the feature column names and the label name.
train, valid, features, label = loaddata.getDataset(dataset)

Sampling ratio :0.01
Nb train samples: 115382 , Nb valid samples: 49770  
features:['cat1', 'cat4', 'cat6', 'cat8', 'cat9']


In [10]:
# Display the feature columns returned for the chosen dataset variant.
features

['cat1', 'cat4', 'cat6', 'cat8', 'cat9']

In [11]:
# Display the name of the label column.
label

'click'

In [12]:
# NOTE(review): hard-coded ids, presumably internal feature identifiers — confirm their origin.
# In the visible cells they are only used to build the `crosses` strings.
fids = [-60029,-60036,-60040,-60042,-60049,-160020]

In [13]:
# Every unordered pair of ids, formatted "later&earlier" with respect to their position in `fids`.
crosses = [f"{later}&{earlier}" for pos, later in enumerate(fids) for earlier in fids[:pos]]

In [14]:
# 6 ids give 6*5/2 = 15 pairwise crosses.
len(crosses)

15

In [15]:
# Pipe-separated string listing all the crosses.
'|'.join(crosses)

'-60036&-60029|-60040&-60029|-60040&-60036|-60042&-60029|-60042&-60036|-60042&-60040|-60049&-60029|-60049&-60036|-60049&-60040|-60049&-60042|-160020&-60029|-160020&-60036|-160020&-60040|-160020&-60042|-160020&-60049'

In [16]:
# Metrics helper built from the label name; `Validation.run(model, dataframe)` is used
# further down to report train / validation metrics.
Validation = MetricsComputer(label)

In [17]:
# Column types of the training data (features and label).
train.dtypes

cat1     int64
cat4     int64
cat6     int64
cat8     int64
cat9     int64
click    int64
dtype: object

In [18]:
# Report how many distinct values each feature takes in the training data.
for column in features:
    distinct_values = set(train[column].values)
    print( f"feature {column} has {len(distinct_values)} distinct modalities" )

feature cat1 has 9 distinct modalities
feature cat4 has 14 distinct modalities
feature cat6 has 30 distinct modalities
feature cat8 has 11 distinct modalities
feature cat9 has 30 distinct modalities


## Preparing Aggregated data

- aggdata contains projections of the number of displays and clicks along each pair of features
- noise may also be added to make it differentially private
- the goal is to learn a model predicting Proba( label | features) using *only* those aggdata.

In [19]:
# Parameters of the privacy-protecting noise; both are passed to AggDataset below.
epsilon = None  # Set to None to get no noise.
delta = None 

In [20]:
# Build the aggregated dataset from the training data. With "*&*" the resulting queries
# are every single feature plus every pair of features (see the query list printed below).
aggdata = AggDataset(
    features,
    "*&*",
    train,
    label,
    epsilon,
    delta,
    maxNbModalities=10000,
)

# Implementation: https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/featuremappings.py#L205

In [21]:
# Short summary of the aggregated dataset: label, number of queries, noise distribution.
label_summary = f" Label: {aggdata.label}"
print(label_summary)
queries_summary = f" Nb Queries: {len(aggdata.aggDisplays)}"
print(queries_summary)
noise_summary = f" Noise distribution: {aggdata.noiseDistribution}"
print(noise_summary)

 Label: click
 Nb Queries: 15
 Noise distribution: None


In [22]:
# The aggregated data can be exported as a mapping: query name -> dataframe.
aggdata_datframe_dico = aggdata.toDFs()
# Keep the query names in a list so that they can be indexed by position.
queries = list(aggdata_datframe_dico.keys())
print( f"list of queries {queries}" )

list of queries ['cat1', 'cat4', 'cat6', 'cat8', 'cat9', 'cat1&cat4', 'cat1&cat6', 'cat1&cat8', 'cat1&cat9', 'cat4&cat6', 'cat4&cat8', 'cat4&cat9', 'cat6&cat8', 'cat6&cat9', 'cat8&cat9']


In [23]:
# Dataframe for the query  " select 'cat6', 'cat8' , count, sum(label) group by 'cat6', 'cat8' "
# (with the "small" dataset, queries[-3] is 'cat6&cat8' in the list printed above).
# Shows 3 randomly sampled rows, so the displayed rows change from run to run.
aggdata_datframe_dico[queries[-3]].sample(3)

Unnamed: 0,cat6&cat8,cat6,cat8,c,click
201,201,15,6,0.0,0.0
243,243,26,7,0.0,0.0
55,55,24,1,0.0,0.0


In [24]:
# Display the mapping: query name -> projection object (one entry per query).
aggdata.aggDisplays

# Dictionary of projections 

{'cat1': Projection c on cat1(10),
 'cat4': Projection c on cat4(15),
 'cat6': Projection c on cat6(31),
 'cat8': Projection c on cat8(12),
 'cat9': Projection c on cat9(31),
 'cat1&cat4': Projection c on cat1(10)xcat4(15),
 'cat1&cat6': Projection c on cat1(10)xcat6(31),
 'cat1&cat8': Projection c on cat1(10)xcat8(12),
 'cat1&cat9': Projection c on cat1(10)xcat9(31),
 'cat4&cat6': Projection c on cat4(15)xcat6(31),
 'cat4&cat8': Projection c on cat4(15)xcat8(12),
 'cat4&cat9': Projection c on cat4(15)xcat9(31),
 'cat6&cat8': Projection c on cat6(31)xcat8(12),
 'cat6&cat9': Projection c on cat6(31)xcat9(31),
 'cat8&cat9': Projection c on cat8(12)xcat9(31)}

In [25]:
# Disabled baseline, kept as a bare string (the cell only echoes the text): logistic model
# with "*&*" cross-features built from the non-aggregated `train` data.
"""
regulL2 = 16
logisticCfs = LogisticModelWithCF(label , features, "*&*"  , train ,
                                       hashspace=2**22 , lambdaL2 = regulL2  )
"""

'\nregulL2 = 16\nlogisticCfs = LogisticModelWithCF(label , features, "*&*"  , train ,\n                                       hashspace=2**22 , lambdaL2 = regulL2  )\n'

In [26]:
# Disabled (bare string, not executed): fit the logistic baseline on `train` and report
# train / validation metrics.
"""
logisticCfs.fit( train )
print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )
"""

'\nlogisticCfs.fit( train )\nprint( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )\n'

##### Logistic regression from aggregated clicks and full display data (quadratic kernel)
 - same performance as "standard" logistic regression
 - but still uses the full display data, so not really useful

In [27]:
# Disabled (bare string, not executed): AggLogistic model built from `aggdata` with "*&*"
# click cross-features. It would rebind the name `logisticCfs` used by the baseline above.
"""
regulL2 = 16
logisticCfs = AggLogistic(  aggdata , features, clicksCfs = "*&*" , regulL2=regulL2 )
"""

'\nregulL2 = 16\nlogisticCfs = AggLogistic(  aggdata , features, clicksCfs = "*&*" , regulL2=regulL2 )\n'

In [28]:
# Disabled (bare string, not executed): fit AggLogistic on the feature columns of `train`
# for 200 iterations and report train / validation metrics.
"""logisticCfs.fit( train[features] , nbIter = 200 )
print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )
"""

'logisticCfs.fit( train[features] , nbIter = 200 )\nprint( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )\n'

##### Proposed MRF model
- uses only aggregated data
- almost recovers the performance of logistic regression

In [29]:
# Hyper-parameters shared by the two MRF models built below.
regulL2 = 16       # passed as `regulL2Click` ("lambda_1"); `regulL2` itself is fixed to 1.0 in the model calls
nbSamples = 10000  # number of Gibbs samples used to estimate the gradient
nbIter = 50        # argument given to `fit(...)` (presumably the number of training iterations — confirm)

In [30]:
# MRF model trained from the aggregated data only; no Spark session is passed to this instance.
memMrf = AggMRFModel(
    aggdata,
    features,
    exactComputation=False,      # Gibbs sampling; exact=True is reported broken in the latest code
    clicksCfs="*&*",             # cross-features used by the P(Y|X) part of the model
    displaysCfs="*&*",           # cross-features used by the P(X) part: all pairs + all single features
    nbSamples=nbSamples,         # number of Gibbs samples used to estimate the gradient
    regulL2=1.0,                 # parameter "lambda_2"
    regulL2Click=regulL2,        # parameter "lambda_1"
    sampleFromPY0=True,
    maxNbRowsperGibbsUpdate=50,
    # sparkSession=ss  (left disabled for this instance)
)

In [31]:
# Train the model, then report its metrics on the training and validation sets.
memMrf.fit(nbIter)
print(
    f"MRF lambda1= {regulL2}",
    "train",
    Validation.run(memMrf, train),
    "valid",
    Validation.run(memMrf, valid),
)

llh:0.0E+00 a:9.2E-02(0), n:50, g:3.8E+03  --
MRF lambda1= 16 train NLLH=0.0569, NMSE=0.0732   valid NLLH=0.0532, NMSE=0.0686  


llh:2.7E+01 a:1.4E-01(0), n:50, g:3.6E+03  --  
MRF lambda1= 16 train NLLH=0.0567, NMSE=0.0728   valid NLLH=0.0527, NMSE=0.0681  

In [32]:
# Directory used by Spark for checkpointing. The path was hard-coded to one user's folder;
# it can now be overridden with the SPARK_CHECKPOINT_DIR environment variable, and the
# previous location remains the default so existing runs behave the same.
checkpoint_dir = os.environ.get(
    "SPARK_CHECKPOINT_DIR",
    "viewfs://prod-am6/tmp/j.rioufougeras/load/",
)
ss.sparkContext.setCheckpointDir(checkpoint_dir)

In [33]:
# Same MRF configuration as the model built without Spark, but with the Spark session supplied.
rddMrf = AggMRFModel(
    aggdata,
    features,
    exactComputation=False,      # Gibbs sampling; exact=True is reported broken in the latest code
    clicksCfs="*&*",             # cross-features used by the P(Y|X) part of the model
    displaysCfs="*&*",           # cross-features used by the P(X) part: all pairs + all single features
    nbSamples=nbSamples,         # number of Gibbs samples used to estimate the gradient
    regulL2=1.0,                 # parameter "lambda_2"
    regulL2Click=regulL2,        # parameter "lambda_1"
    sampleFromPY0=True,
    maxNbRowsperGibbsUpdate=50,
    sparkSession=ss,
)

In [34]:
# Train the Spark-backed model, then report its metrics on the training and validation sets.
rddMrf.fit(nbIter)
print(
    f"MRF lambda1= {regulL2}",
    "train",
    Validation.run(rddMrf, train),
    "valid",
    Validation.run(rddMrf, valid),
)

llh:0.0E+00 a:1.2E-05(0), n:50, g:1.1E+04  --
MRF lambda1= 16 train NLLH=0.0451, NMSE=0.0586   valid NLLH=0.0437, NMSE=0.0569  


Small llh:2.5E+01 a:1.4E-01(0), n:50, g:3.6E+03  --  
MRF lambda1= 16 train NLLH=0.0575, NMSE=0.0740   valid NLLH=0.0543, NMSE=0.0701  