In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before each cell executes (useful while editing aggregated_models).
%load_ext autoreload
%autoreload 2
# NOTE(review): star import -- `plt` (used just below) is not imported explicitly in
# this cell, so it presumably comes from here; confirm against myimports.
from aggregated_models.myimports  import *
# import aggregated_models.myJupyterUtils as myJupyterUtils ## Remove stacktraces on KeyboardInterrupt
plt.style.use('ggplot')
# getpass.getuser() is used further down to build a per-user Spark checkpoint path.
import getpass

# helpers to compute metrics (in-memory variant)
from aggregated_models.validation import MetricsComputer

# helpers to compute metrics (Spark variant)
from aggregated_models.validation import SparkMetricsComputer

# loading public "criteo attribution dataset"
import aggregated_models.loaddata as loaddata

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# code to prepare the aggregated dataset
from aggregated_models.aggdataset import AggDataset 

In [None]:
## Most relevant code is there:
from aggregated_models.agg_mrf_model import AggMRFModel, AggMRFModelParams
# also in https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/baseaggmodel.py

In [None]:
from thx.hadoop.spark_config_builder import create_remote_spark_session

In [None]:
# Open the remote Spark session used by every Spark-backed cell of this notebook.
# NOTE(review): the two positional integers (20, 1) are not named at the call site --
# presumably executor count and cores per executor; confirm against
# create_remote_spark_session.
ss = create_remote_spark_session(
    'Test Spark parallelize',
    20,
    1,
    memory='4g',
    memoryOverhead='2g',
    driver_memory='12g',
    hadoop_file_systems=['viewfs://root', 'viewfs://prod-am6'],
)
# Show the session object as the cell output.
ss

## Download Data
- downloading criteo-research-attribution-dataset
- from url http://go.criteo.net

In [None]:
# Disabled by default: uncomment the call below to run loaddata.download_dataset().
# NOTE(review): presumably only needed once, before the first getDataset() call -- confirm.
#loaddata.download_dataset()

## Loading data
3 versions of the dataset are used for experiments: "small", "sampled" and "full"
- "full" has 11 features with about 16M samples
- "sampled" has the same 11 features, but only 160k samples
- "small" also has 160k samples, but only the 5 features with the lowest modality counts, which allows for fast experiments.

In [None]:
# Choose the dataset variant to load; exactly one assignment is left active.
# Alternatives (uncomment one and comment out the active line):
#   dataset = "small_tb"    # fast experiments
#   dataset = "medium_tb"   # fast experiments
#   dataset = "sampled"     # training an MRF may require 5h and 16 GB
#   dataset = "full"        # training an MRF may require 32 GB and several days
dataset = "small"  # fast experiments

In [None]:
# Load the selected dataset variant. The call returns four values: the training
# split, the validation split, the feature names and the label name, all of
# which are reused by the cells below.
train, valid, features, label = loaddata.getDataset(dataset)

In [None]:
# Spark DataFrame copy of the training split, for the Spark-backed code paths.
df_train = ss.createDataFrame(train)

In [None]:
# Spark DataFrame copy of the validation split, for the Spark-backed code paths.
df_valid = ss.createDataFrame(valid)

In [None]:
# Display the `features` value returned by loaddata.getDataset.
features

In [None]:
# Display the `label` value returned by loaddata.getDataset.
label

In [None]:
# Metrics helper for the in-memory splits (`train` / `valid`); used via Validation.run(model, data).
Validation = MetricsComputer(label)

In [None]:
# Metrics helper for the Spark DataFrames (`df_train` / `df_valid`); used via SparkValidation.run(model, df).
SparkValidation = SparkMetricsComputer(label)

## Preparing Aggregated data

- aggdata contains projections of the number of displays and clicks along each pair of features
- noise may also be added to make it differentially private
- the goal is to learn a model predicting Proba( label | features) using *only* those aggdata.

In [None]:
# Parameters of the privacy-protecting noise; None means no noise is added.
# NOTE(review): neither value is passed to the AggDataset / AggDatasetSpark
# constructors in the cells visible in this notebook -- confirm whether that is intended.
epsilon, delta = None, None

In [None]:
# Build the aggregated dataset from `train`, the non-Spark training split
# returned by loaddata.getDataset.
aggdata = AggDataset(
    features=features,
    dataframe=train,
    label=label,
    maxNbModalities=10000,
)


In [None]:
# Build the aggregated dataset from the Spark DataFrame `df_train`.
# NOTE(review): AggDatasetSpark has no explicit import in the cells visible here;
# presumably it is provided by the `myimports` star import -- confirm, or import it explicitly.
sparkdata = AggDatasetSpark(
    features=features,
    train=df_train,
    label=label,
    maxNbModalities=10000,
)

#https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/featuremappings.py#L205

##### Proposed MRF model
- uses only aggregated data
- almost recovers the performance of logistic regression

In [None]:
# Training hyper-parameters shared by the in-memory and Spark-backed models below.
nbIter = 50          # first argument of the .fit(...) calls
nbSamples = 10_000   # forwarded as `nbSamples` to AggMRFModelParams
regulL2 = 16         # forwarded as `regulL2Click` to AggMRFModelParams

In [None]:
# Model parameters for the in-memory MRF.
# NOTE(review): the notebook-level variable `regulL2` feeds `regulL2Click`, while
# the `regulL2` keyword itself is fixed at 1.0 -- easy to misread.
params = AggMRFModelParams(
    exactComputation=False,  # use Gibbs sampling; exact=True is actually broken in the latest code
    clicksCfs="*&*",         # cross-features used by the P(Y|X) part of the model
    displaysCfs="*&*",       # cross-features used by the P(X) part of the model: all pairs + all singles
    nbSamples=nbSamples,     # number of Gibbs samples used to estimate the gradient
    regulL2=1.0,             # parameter "lambda_2"
    regulL2Click=regulL2,    # parameter "lambda_1"
    sampleFromPY0=True,
)
# In-memory model, built on the AggDataset instance `aggdata`.
memMrf = AggMRFModel(aggdata, features, params)

In [None]:
# Train the in-memory model; nbIter is the first argument (presumably the iteration count).
# NOTE(review): the meaning of the second positional argument (0.05) is not visible
# here -- presumably a step size; confirm against AggMRFModel.fit.
memMrf.fit(nbIter, 0.05)

In [None]:
# Report what Validation.run returns for the in-memory model on each split.
print(
    "MRF",
    "train", Validation.run(memMrf, train),
    "valid", Validation.run(memMrf, valid),
)

In [None]:
# Point the already-fitted in-memory model at the Spark-backed aggregated dataset,
# presumably so it can be scored through SparkValidation in the next cell.
# NOTE(review): this mutates memMrf in place; re-running an earlier cell that uses
# memMrf after this one will see `sparkdata` instead of `aggdata`.
memMrf.aggdata = sparkdata

In [None]:
# Report what SparkValidation.run returns for the in-memory model on each Spark DataFrame.
print(
    "MRF",
    "train", SparkValidation.run(memMrf, df_train),
    "valid", SparkValidation.run(memMrf, df_valid),
)

In [None]:
# Set the Spark checkpoint directory to a per-user path; the user name comes
# from getpass.getuser().
ss.sparkContext.setCheckpointDir(
    f"viewfs://prod-am6/tmp/{getpass.getuser()}/load/"
)

In [None]:
# Model parameters for the Spark-backed MRF. The values are the same as those
# used for the in-memory model; `params` is rebuilt here so this cell is
# self-contained.
params = AggMRFModelParams(
    exactComputation=False,  # use Gibbs sampling; exact=True is actually broken in the latest code
    clicksCfs="*&*",         # cross-features used by the P(Y|X) part of the model
    displaysCfs="*&*",       # cross-features used by the P(X) part of the model: all pairs + all singles
    nbSamples=nbSamples,     # number of Gibbs samples used to estimate the gradient
    regulL2=1.0,             # parameter "lambda_2"
    regulL2Click=regulL2,    # parameter "lambda_1"
    sampleFromPY0=True,
)
# Spark-backed model, built on `sparkdata` and given the Spark session.
rddMrf = AggMRFModel(sparkdata, features, params, sparkSession=ss)

In [None]:
# Train the Spark-backed model with the same arguments as the in-memory model.
# NOTE(review): the meaning of the second positional argument (0.05) is not visible
# here -- presumably a step size; confirm against AggMRFModel.fit.
rddMrf.fit(nbIter, 0.05)

In [None]:
# Report what SparkValidation.run returns for the Spark-backed model on each Spark DataFrame.
print(
    "MRF",
    "train", SparkValidation.run(rddMrf, df_train),
    "valid", SparkValidation.run(rddMrf, df_valid),
)

In [None]:
# Point the already-fitted Spark-backed model at the in-memory aggregated dataset,
# presumably so it can be scored through the in-memory Validation helper in the next cell.
# NOTE(review): this mutates rddMrf in place; re-running an earlier cell that uses
# rddMrf after this one will see `aggdata` instead of `sparkdata`.
rddMrf.aggdata = aggdata

In [None]:
# Report what Validation.run returns for the Spark-trained model on each in-memory split.
print(
    "MRF",
    "train", Validation.run(rddMrf, train),
    "valid", Validation.run(rddMrf, valid),
)