In [1]:
%load_ext autoreload
%autoreload 2
import sys
from agg_models.myimports  import *
import agg_models.myJupyterUtils as myJupyterUtils ## Remove stacktraces on Keyboardinterupt
plt.style.use('ggplot')

from datetime import date

# helpers to compute metrics
from agg_models.validation import MetricsComputer,  LLH  

# baselines
from agg_models.basicmodels import LogisticModel, NaiveBayesModel, LogisticModelWithCF 
from agg_models.aggLogistic import AggLogistic

# loading public "criteo attribution dataset"
import agg_models.loaddata as loaddata

In [2]:
# code to prepare the aggregated dataset
from agg_models.featuremappings import AggDataset 

In [3]:
## Most relevant code is there:
from agg_models.agg_mrf_model import AggMRFModel, fastGibbsSample, fastGibbsSampleFromPY0
import agg_models.agg_mrf_model
# also in https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/baseaggmodel.py

In [4]:
import thx.hadoop.hdfs_cache as hdfs
from thx.hadoop.spark_config_builder import create_remote_spark_session, SparkSession
import pyspark
from pyspark.sql import functions as F
from thx.datasources.parquet import create_df_from_parquet
from datetime import datetime, timedelta
import os



In [5]:
ss = create_remote_spark_session('Test Spark parallelize', 10, 1, memory='4g', memoryOverhead='2g', driver_memory='16g', hadoop_file_systems=['viewfs://root', 'viewfs://prod-am6'])
ss

## Download Data
- downloading criteo-research-attribution-dataset
- from url http://go.criteo.net

In [6]:
#loaddata.download_dataset()

## Loading data
3 versions of the dataset are used for experiments: "small" , "sampled" and "full"
- "full" has 11 features with about 16M samples
- "sampled" has the same 11 features, but only 160k samples
- "small" also has 160k samples, but keeps only the 5 features with the fewest modalities, which allows for fast experiments.

In [7]:
# Select which version of the dataset to load (uncomment exactly one assignment).
# dataset= "small_tb" # fast experiments
# dataset= "medium_tb" # fast experiments
dataset= "small" # fast experiments
# dataset= "sampled" # Training a MRF may require 5h and 16 GB of data
# dataset= "full"  # Training a MRF may require 32 GB, and several days

In [None]:
train, valid, features, label = loaddata.getDataset(dataset)

Sampling ratio :0.01


In [None]:
fids = [-60029,-60036,-60040,-60042,-60049,-160020]

In [None]:
# Every unordered pair of feature ids, written "later&earlier" (later = higher position in `fids`).
crosses = [f"{fids[hi]}&{fids[lo]}" for hi in range(len(fids)) for lo in range(hi)]

In [None]:
len(crosses)

In [None]:
'|'.join(crosses)

In [None]:
Validation = MetricsComputer(label)

In [None]:
train.dtypes

In [None]:
for f in features:
    nbModalities = len(set(train[f].values))
    print( f"feature {f} has {nbModalities} distinct modalities" )

## Preparing Aggregated data

- aggdata contains projections of the number of displays and clicks along each pair of features
- noise may also be added to make it differentially private
- the goal is to learn a model predicting Proba(label | features) using *only* this aggregated data.

In [None]:
# Parameters of the privacy-protecting noise; both are passed to AggDataset.
epsilon = None  # Set to None to get no noise.
delta = None 

In [None]:
aggdata = AggDataset( features, "*&*", train , label, epsilon, delta )

#https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/featuremappings.py#L205

In [None]:
print( f" Label: {aggdata.label}")
print( f" Nb Queries: {len(aggdata.aggDisplays)}")
print( f" Noise distribution: {aggdata.noiseDistribution}" )

In [None]:
# The aggregated data can be exported as a mapping: query name -> dataframe.
aggdata_datframe_dico = aggdata.toDFs()
queries = list(aggdata_datframe_dico.keys())
print( f"list of queries {queries}" )

In [None]:
# Dataframe for the query  " select 'cat1', 'cat8' , count, sum(label) group by 'cat1', 'cat8' "
aggdata_datframe_dico[queries[-3]].sample(3)

In [None]:
aggdata.aggDisplays

# Dictionary of projections 

In [None]:
regulL2 = 16
logisticCfs = LogisticModelWithCF(label , features, "*&*"  , train ,
                                      hashspace=2**22 , lambdaL2 = regulL2  )

In [None]:
# logisticCfs.fit( train )
# print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )

##### Logistic regression from aggregated clicks and full display data (quadratic kernel)
 - same performance as "standard" logistic regression
 - but still uses the full display data, so not really useful

In [None]:
regulL2 = 16
logisticCfs = AggLogistic(  aggdata , features, clicksCfs = "*&*" , regulL2=regulL2 )

In [None]:
# logisticCfs.fit( train[features] , nbIter = 200 )
# print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,train) , "valid" , Validation.run(logisticCfs,valid)   )

##### Proposed MRF model
- uses only aggregated data
- almost recovers the performance of logistic regression

In [None]:
# Hyper-parameters of this run.
regulL2 = 16  # passed below as regulL2Click ("lambda_1")
nbSamples = 3000  # passed below as nbSamples
nbIter = 200  # number of iterations, used by the fit(...) call in the next cell

# Build the MRF model from the aggregated data, handing it the Spark session `ss`.
# NOTE(review): the model is bound to the module-level name `self`, presumably so
# that method bodies can be pasted into cells unchanged -- confirm.
self = AggMRFModel( aggdata,
                    features , 
                    exactComputation=False ,  ## Using Gibbs sampling. Actually, exact=True is broken in the latest code
                    clicksCfs = "*&*", ## cross-features used by the P(Y|X) part of the model
                    displaysCfs="*&*", ## cross-features used by the P(X) part of the model. Here, all pairs + all single features.
                    nbSamples = nbSamples, ## Number of Gibbs samples used to estimate the gradient
                    regulL2=1.0 ,  ## parameter "lambda_2"
                    regulL2Click = regulL2,  ## parameter "lambda_1"
                    sampleFromPY0 = True,
                    maxNbRowsperGibbsUpdate = 300,
                    sparkSession = ss
                  )

In [None]:
self.fit(nbIter)
print( f"MRF lambda1= {regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

In [None]:
self.samples.use_spark_rdd = True
gRdd = self.computeGradient()

In [None]:
gRdd

In [None]:
final_samples = np.vstack(self.samples.samplesRdd.map(lambda t: t[0]).collect()).transpose()

In [None]:
# Install the samples collected from the RDD as the local sample matrix, refresh
# the quantities derived from them, then recompute the gradient with the Spark
# path switched off.
self.samples.data = final_samples
self.computedotprods(self.samples)
self.samples.computeProbaSamples(self.muIntercept, self.lambdaIntercept)
self.samples.setweights()
self.samples.applyreweighting(self.muIntercept, self.lambdaIntercept)
self.samples.use_spark_rdd = False
# NOTE(review): presumably `g` is meant to be compared with `gRdd` -- confirm.
g = self.computeGradient()

In [None]:
# Hyper-parameters of this run (more samples, fewer iterations than the Spark-backed run).
regulL2 = 16  # passed below as regulL2Click ("lambda_1")
nbSamples = 10000  # passed below as nbSamples
nbIter = 50  # number of iterations, used by the fit(...) call in the next cell

# Same model as before, but built without a `sparkSession` argument.
self = AggMRFModel( aggdata,
                    features , 
                    exactComputation=False ,  ## Using Gibbs sampling. Actually, exact=True is broken in the latest code
                    clicksCfs = "*&*", ## cross-features used by the P(Y|X) part of the model
                    displaysCfs="*&*", ## cross-features used by the P(X) part of the model. Here, all pairs + all single features.
                    nbSamples = nbSamples, ## Number of Gibbs samples used to estimate the gradient
                    regulL2=1.0 ,  ## parameter "lambda_2"
                    regulL2Click = regulL2,  ## parameter "lambda_1"
                    sampleFromPY0 = True,
                    maxNbRowsperGibbsUpdate = 300
                  )

In [None]:
self.fit(nbIter)
print( f"MRF lambda1= {regulL2}",  "train",   Validation.run(self,train) , "valid" , Validation.run(self,valid)   )

In [None]:
# all parameters mu and theta concatenated in a  single vector
self.parameters

# This vector is the concatenation of the parameters associated with each projection

In [None]:
# Cut the sample matrix into blocks of at most `maxNbRows` rows and distribute
# the blocks with Spark: each element of `rdd` is one block.
maxNbRows = 100
# presumably one sample per row after the transpose -- TODO confirm
rows = self.samples.data.transpose()
starts = np.arange(0,  len(rows) , maxNbRows)  # index of the first row of every block
slices =[ (rows[start:start+maxNbRows]) for start in starts  ]
rdd = ss.sparkContext.parallelize(slices)

In [None]:
rdd = ss.sparkContext.parallelize(slices)

In [None]:
def RunRddGibbsSampler(self, rdd, nbsteps=1):
    """Run Gibbs-sampling steps on every element of `rdd` and cache the result.

    Parameters
    ----------
    self : model object exposing `exportWeightsAll()`; it is called once, before
        the mapping is set up.
    rdd : object with an RDD-like `map(...)`; each element is handed to
        `fastGibbsSampleFromPY0` as its samples argument.
    nbsteps : forwarded unchanged to `fastGibbsSampleFromPY0`.

    Returns
    -------
    The mapped RDD, after calling `.cache()` on it.
    """
    # The exported click weights are unpacked but not needed by this sampler.
    exported = self.exportWeightsAll()
    display_weights, _click_weights, modalities_by_var, params = exported

    def _resample(samples):
        return fastGibbsSampleFromPY0(
            display_weights, modalities_by_var, params, samples, nbsteps
        )

    resampled = rdd.map(_resample)
    return resampled.cache()

In [None]:
rdd = RunRddGibbsSampler(self, rdd, nbsteps=1)

In [None]:
def compute_expdotproducts(self, rdd):
    """Map every block of samples to its exponentiated dot-products.

    Each element `x` of `rdd` (rows along axis 0) is mapped to the tuple
    `(x, exp_mu, exp_mu * exp_lambda)`, where for every row

    - `exp_mu = exp(sum of the display parameters selected by the row + self.muIntercept)`
    - `exp_lambda = exp(sum of the click parameters selected by the row + self.lambdaIntercept)`

    NOTE(review): the mapped closure reads `self`, so the model object is
    presumably serialized along with it when `rdd` is a Spark RDD -- confirm.
    """

    def _sum_selected_params(weights, transposed, nb_rows):
        # For every weight set, add the parameter picked by each row.
        total = np.zeros(nb_rows)
        for weight in weights.values():
            total += self.parameters[weight.feature.Values_(transposed) + weight.offset]
        return total

    def _exp_dot_products(x):
        transposed = x.transpose()
        nb_rows = x.shape[0]
        dot_mu = _sum_selected_params(self.displayWeights, transposed, nb_rows)
        dot_lambda = _sum_selected_params(self.clickWeights, transposed, nb_rows)
        exp_mu = np.exp(dot_mu + self.muIntercept)
        return x, exp_mu, exp_mu * np.exp(dot_lambda + self.lambdaIntercept)

    return rdd.map(_exp_dot_products)

In [None]:
x_mu_lambdas = compute_expdotproducts(self, rdd) 

In [None]:
from operator import add
def compute_pdisplays(self, xmulambdas):
    """Turn `(x, expmu, explambda)` blocks into `(x, enoclick, eclick)` blocks.

    Parameters
    ----------
    self : model object; `self.samples.Size`, `self.muIntercept` and
        `self.lambdaIntercept` are read.
    xmulambdas : RDD-like object (needs `map` and `reduce`) whose elements are
        tuples `(x, expmu, explambda)`, with one entry of `expmu` / `explambda`
        per row of `x` (the output of `compute_expdotproducts`).

    Returns
    -------
    The mapped RDD of tuples `(x, enoclick, eclick)`: `enoclick` is a single
    scalar shared by all rows, `eclick` has one entry per row of `x`.

    Notes
    -----
    The normalisation `z0_on_z` is `count / sum(1 + explambda / expmu)` with the
    sum taken over every row of every block. Each block is therefore summed to
    a scalar before the `reduce`; reducing the per-block vectors directly would
    add blocks element-wise (and fail on blocks of different lengths).
    Assumes `self.samples.Size` equals the total number of rows in
    `xmulambdas` -- TODO confirm.
    """
    count = self.samples.Size
    n = np.exp(self.muIntercept)

    def _block_total(tuple_x_mu_lambda):
        # tuple layout: (x, expmu, explambda)
        return np.sum(1 + tuple_x_mu_lambda[2] / tuple_x_mu_lambda[1])

    z0_on_z = count / xmulambdas.map(_block_total).reduce(add)
    enoclick = z0_on_z * (1 + np.exp(self.lambdaIntercept)) * n / count

    def _computePDisplays(tuple_x_mu_lambda):
        # tuple layout: (x, expmu, explambda)
        eclick = enoclick * tuple_x_mu_lambda[2] / tuple_x_mu_lambda[1]
        return tuple_x_mu_lambda[0], enoclick, eclick

    return xmulambdas.map(_computePDisplays)

In [None]:
x_enoclick_eclick = compute_pdisplays(self, x_mu_lambdas)

In [None]:
def getPredictionsVectorRdd(self , x_enoclick_eclick):
    """Compute the prediction vector by summing per-block projections.

    Parameters
    ----------
    self : model object; `self.parameters`, `self.displayWeights` and
        `self.clickWeights` are read.
    x_enoclick_eclick : RDD-like object (needs `map` and `reduce`) whose
        elements are tuples `(x, enoclick, eclick)`.

    Returns
    -------
    A vector shaped like `self.parameters`: for each block, the entries at
    `w.indices` are filled with the projection of `enoclick + eclick` (display
    weights) or of `eclick` (click weights); the per-block vectors are then
    added together.
    """

    def _project_block(block):
        samples, enoclick, eclick = block
        transposed = samples.transpose()
        # Start from zeros with the same shape and dtype as the parameters.
        predictions = self.parameters * 0
        for weight in self.displayWeights.values():
            predictions[weight.indices] = weight.feature.Project_(transposed, enoclick + eclick)
        for weight in self.clickWeights.values():
            predictions[weight.indices] = weight.feature.Project_(transposed, eclick)
        return predictions

    per_block = x_enoclick_eclick.map(_project_block)
    return per_block.reduce(add)

In [None]:
# Use the RDD helper defined above; `getPredictionsVector` only exists as a method of the model.
prediction = getPredictionsVectorRdd(self, x_enoclick_eclick)

In [None]:
prediction

In [None]:
# Alternate one Gibbs-sampling step on the distributed samples with a
# recomputation of the prediction vector, using the helpers defined in the
# cells above (the names previously called here are not defined in this notebook).
for a in range(200):
    rdd = RunRddGibbsSampler(self, rdd, nbsteps=1)
    x_mu_lambdas = compute_expdotproducts(self, rdd)
    x_enoclick_eclick = compute_pdisplays(self, x_mu_lambdas)
    prediction = getPredictionsVectorRdd(self, x_enoclick_eclick)
    print(a)

In [None]:
len(self.samples.Eclick)

In [None]:
# List of features and crossfeatures for mu
self.displayWeights

In [None]:
# List of features and cfs for theta
self.clickWeights
# class WeightsSet : https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/baseaggmodel.py#L8

## In parameter vector, indices from 3719 to 3729 are the parameters "theta" 
##   associated to values of the single feature "cat1"

In [None]:
# There are also two 'intercept' parameters:
self.muIntercept, self.lambdaIntercept
#  ...  thus P(Y = 1 | X = x) = sigmoid( K(x) . self.parameters[someOffset:] + self.lambdaIntercept )

#  TODO: rename self.lambdaIntercept to self.thetaIntercept to get consistent notation

In [None]:
## samples of "X"

self.samples.data.shape

In [None]:
## Computing dotproducts between K(x) and mu or theta:

## https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/baseaggmodel.py#L62

mus    = self.dotproducts( self.displayWeights, self.samples.data ) + self.muIntercept
mus

## https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/agg_mrf_model.py#L145

## note: I just added some comments in the code, translating all the line numbers ...

In [None]:
d = self.Data
d
#  vector with  the counts of click or display  from aggregated data.
# Same indexing as self.parameters

In [None]:
p = self.getPredictionsVector( self.samples )
p
# expected counts according to the model, computed by MC on the samples 

In [None]:
## https://gitlab.criteois.com/a.gilotte/aggdata/-/blob/master/src/agg_mrf_model.py#L187
w = self.displayWeights["integer_feature_10"]
w.feature.Project_(  self.samples.data  , self.samples.pdisplays ) # Correct for grads

# a bit uselessly complicated :  self.samples.pdisplays  is constant
# This allows having samples with different 'weights', for example one sample for each possible modality of X

In [None]:
# After fitting the model, "data" (d) and "prediction" (p) should be equal
plt.plot( d,p, "x" )


In [None]:
# ... up to the noise of the sampling / convergence of optimizer
plt.plot( np.log (1+d), np.log( 1+p), "x" )