In [None]:
%load_ext autoreload
%autoreload 2
import sys
from aggregated_models.myimports  import *
# import aggregated_models.myJupyterUtils as myJupyterUtils ## Remove stacktraces on Keyboardinterupt
plt.style.use('ggplot')

from datetime import date

# helpers to compute metrics
from aggregated_models.validation import MetricsComputer,  LLH  

# baselines
from aggregated_models.basicmodels import LogisticModel, NaiveBayesModel, LogisticModelWithCF 
from aggregated_models.aggLogistic import AggLogistic

# loading public "criteo attribution dataset"
import aggregated_models.loaddata as loaddata

In [None]:
%matplotlib inline

In [None]:
# code to prepare the aggregated dataset
from aggregated_models.aggdataset import AggDataset 

In [None]:
## Most relevant code is there:
from aggregated_models.agg_mrf_model import AggMRFModel, fastGibbsSample, fastGibbsSampleFromPY0
import aggregated_models.agg_mrf_model
# also in https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/baseaggmodel.py

In [None]:
import thx.hadoop.hdfs_cache as hdfs
from thx.hadoop.spark_config_builder import create_remote_spark_session, SparkSession
import pyspark
from pyspark.sql import functions as F
from thx.datasources.parquet import create_df_from_parquet
from datetime import datetime, timedelta
import os

In [None]:
# Remote Spark session; it is handed to AggMRFModel further down through `sparkSession`.
# NOTE(review): the positional 20 and 1 are presumably the executor count and cores per
# executor — confirm against create_remote_spark_session.
ss = create_remote_spark_session(
    'Test Spark parallelize',
    20,
    1,
    memory='4g',
    memoryOverhead='2g',
    driver_memory='12g',
    hadoop_file_systems=['viewfs://root', 'viewfs://prod-am6'],
)
ss

## Download Data
- downloading criteo-research-attribution-dataset
- from url http://go.criteo.net

In [None]:
# Disabled on purpose: uncomment to call loaddata.download_dataset() and fetch the dataset.
#loaddata.download_dataset()

## Loading data
3 versions of the dataset are used for experiments: "small" , "sampled" and "full"
- "full" has 11 features with about 16M samples
- "sampled" has the same 11 features, but only 160k samples
- "small" also has 160k samples, but only the 5 features with lowest modalities count, and allow for fast experiments.

In [None]:
# Choose which dataset variant to load; keep exactly one assignment active.
# dataset= "small_tb" # fast experiments
# dataset= "medium_tb" # fast experiments
dataset= "small" # fast experiments
# dataset= "sampled" # Training a MRF may require 5h and 16 GB of data
# dataset= "full"  # Training a MRF may require 32 GB, and several days

In [None]:
# Load the selected variant: training set, validation set, the feature column names
# (used below to index `train`) and the label passed to the metrics and aggregation helpers.
train, valid, features, label = loaddata.getDataset(dataset)

In [None]:
# Show the features returned by loaddata.getDataset.
features

In [None]:
# Show the label returned by loaddata.getDataset.
label

In [None]:
# Hard-coded feature ids; they are only used to build the pairwise cross names below.
# NOTE(review): what these ids refer to is not visible in this notebook — confirm.
fids = [
    -60029,
    -60036,
    -60040,
    -60042,
    -60049,
    -160020,
]

In [None]:
# One "f&g" name per unordered pair of ids: each id is paired with every id that
# precedes it in `fids`, the later id being written first.
crosses = [
    f"{f}&{g}"
    for i, f in enumerate(fids)
    for g in fids[:i]
]

In [None]:
# Number of crosses: n * (n - 1) / 2 for n ids.
len(crosses)

In [None]:
# All cross names as a single pipe-separated string.
'|'.join(crosses)

In [None]:
# Metrics helper built from the label; used below as Validation.run(model, data).
Validation = MetricsComputer(label)

In [None]:
# Column types of the training set.
train.dtypes

In [None]:
# Report how many distinct modalities each feature takes in the training set.
# Series.nunique replaces len(set(values)): it avoids building a Python set per
# column and, with dropna=False, counts missing values as a single modality
# instead of once per NaN (NaN != NaN, so a set keeps every NaN separately).
for f in features:
    nbModalities = train[f].nunique(dropna=False)
    print( f"feature {f} has {nbModalities} distinct modalities" )

## Preparing Aggregated data

- aggdata contains projections of the number of displays and clicks along each pair of features
- noise may also be added to make it differentially private
- the goal is to learn a model predicting Proba( label | features) using *only* those aggdata.

In [None]:
# Parameters of the privacy-protecting noise; both are passed to AggDataset below.
epsilon = None  # Set to None to get no noise.
delta = None 

In [None]:
# Build the aggregated dataset from the training set.
# "*&*" is the cross-feature spec; further down in this notebook the same spec is
# described as "all pairs + all single" features.
# NOTE(review): maxNbModalities=10000 presumably caps the modalities kept per feature — confirm.
aggdata = AggDataset( train , features, "*&*",  label, epsilon, delta, maxNbModalities=10000)

# https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/featuremappings.py#L205

In [None]:
# Quick summary of the aggregated dataset: label, number of queries, noise distribution.
print( f" Label: {aggdata.label}")
print( f" Nb Queries: {len(aggdata.aggDisplays)}")
print( f" Noise distribution: {aggdata.noiseDistribution}" )

In [None]:
# The aggregated data can be read as a mapping {query name: dataframe};
# collect the query names so individual queries can be picked by position.
aggdata_datframe_dico = aggdata.toDFs()
queries = list(aggdata_datframe_dico.keys())
print( f"list of queries {queries}" )

In [None]:
# Preview 3 rows of one aggregated query (the third from last entry of `queries`).
# Such a dataframe corresponds to a query of the form
#   select 'cat1', 'cat8', count, sum(label) group by 'cat1', 'cat8'
# NOTE(review): which feature pair queries[-3] actually is depends on the dataset — confirm.
# random_state is fixed so the preview is the same on every Restart & Run All.
aggdata_datframe_dico[queries[-3]].sample(3, random_state=0)

In [None]:
# Inspect the display aggregates; len() of this object is reported above as the number of queries.
aggdata.aggDisplays

# Dictionary of projections 

In [None]:
"""
regulL2 = 16
logisticCfs = LogisticModelWithCF(label , features, "*&*"  , train ,
                                       hashspace=2**22 , lambdaL2 = regulL2  )
"""

In [None]:
"""
logisticCfs.fit( train )
print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )
"""

##### Logistic regression from aggregated clicks and full display data (quadratic kernel)
 - same performance as "standard" logistic regression
 - but still uses the full display data, so not really useful

In [None]:
"""
regulL2 = 16
logisticCfs = AggLogistic(  aggdata , features, clicksCfs = "*&*" , regulL2=regulL2 )
"""

In [None]:
"""logisticCfs.fit( train[features] , nbIter = 200 )
print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )
"""

##### Proposed MRF model
- uses only aggregated data
- almost matches the performance of logistic regression

In [None]:
# Hyper-parameters shared by the two MRF runs below.
regulL2 = 16  # passed to AggMRFModel as regulL2Click ("lambda_1")
nbSamples = 10000  # number of Gibbs samples used to estimate the gradient
nbIter = 50  # first argument of .fit()

In [None]:
# Directory Spark uses for checkpoints.  The historical default is kept, but it points
# at one user's tmp folder, so it can be overridden through the AGG_MRF_CHECKPOINT_DIR
# environment variable without editing the notebook.
checkpoint_dir = os.environ.get(
    "AGG_MRF_CHECKPOINT_DIR",
    "viewfs://prod-am6/tmp/j.rioufougeras/load/",
)
ss.sparkContext.setCheckpointDir(checkpoint_dir)

In [None]:
# MRF model built without a Spark session (rddMrf below passes `sparkSession`).
memMrf = AggMRFModel(
    aggdata,
    features,
    # Use Gibbs sampling; exact computation is reported as broken in the latest code.
    exactComputation=False,
    # Cross-features used by the P(Y|X) part of the model.
    clicksCfs="*&*",
    # Cross-features used by the P(X) part of the model: all pairs + all singles.
    displaysCfs="*&*",
    # Number of Gibbs samples used to estimate the gradient.
    nbSamples=nbSamples,
    # Parameter "lambda_2".
    regulL2=1.0,
    # Parameter "lambda_1".
    regulL2Click=regulL2,
    sampleFromPY0=True,
    maxNbRowsperGibbsUpdate=500,
)

In [None]:
# Fit the model.
# NOTE(review): the arguments are presumably the iteration count and a step size (0.05) —
# confirm against AggMRFModel.fit.
memMrf.fit(nbIter,0.05)

In [None]:
# Metrics of memMrf on the training and validation sets.
print( f"MRF" ,  "train",  Validation.run(memMrf,train) , "valid" , Validation.run(memMrf,valid)   )

In [None]:
# Same MRF configuration as memMrf, except that a Spark session is supplied and
# maxNbRowsperGibbsUpdate is 1 instead of 500.
rddMrf = AggMRFModel(
    aggdata,
    features,
    # Use Gibbs sampling; exact computation is reported as broken in the latest code.
    exactComputation=False,
    # Cross-features used by the P(Y|X) part of the model.
    clicksCfs="*&*",
    # Cross-features used by the P(X) part of the model: all pairs + all singles.
    displaysCfs="*&*",
    # Number of Gibbs samples used to estimate the gradient.
    nbSamples=nbSamples,
    # Parameter "lambda_2".
    regulL2=1.0,
    # Parameter "lambda_1".
    regulL2Click=regulL2,
    sampleFromPY0=True,
    maxNbRowsperGibbsUpdate=1,
    sparkSession=ss,
)

In [None]:
# Fit the Spark-session variant with the same arguments as memMrf.
# NOTE(review): the arguments are presumably the iteration count and a step size (0.05) —
# confirm against AggMRFModel.fit.
rddMrf.fit(nbIter,0.05)

In [None]:
# Metrics of rddMrf on the training and validation sets.
# NOTE(review): the printed tag is the same "MRF" as for memMrf, so the two output lines
# can only be told apart by their position in the notebook.
print( f"MRF" ,  "train",  Validation.run(rddMrf,train) , "valid" , Validation.run(rddMrf,valid)   )

In [None]:
# Run PredictInternal on rddMrf's current samples; the result is read from
# `rddMrf.samples.prediction` in the next cell.
rddMrf.samples.PredictInternal(rddMrf)

In [None]:
# Keep the predictions computed on the original samples for the comparison below.
rdd_pred = rddMrf.samples.prediction

In [None]:
# Convert rddMrf's samples with memMrf.buildSamplesSetFromSampleRdd, recompute the
# predictions on the converted samples, and keep them to compare with rdd_pred.
# NOTE(review): this overwrites rddMrf.samples, so the cell is not idempotent — running it
# a second time feeds the already-converted samples back into buildSamplesSetFromSampleRdd.
rddMrf.samples = memMrf.buildSamplesSetFromSampleRdd(rddMrf.samples)
rddMrf.samples.PredictInternal(rddMrf)
mem_pred = rddMrf.samples.prediction

In [None]:
# Largest absolute gap between the two prediction paths.
# The absolute value matters: the max of the signed difference stays small (or negative)
# when rdd_pred is below mem_pred, which would hide a real discrepancy.
abs(rdd_pred - mem_pred).max()