In [None]:
pip install diffprivlib

In [None]:
# autoreload mode 2: re-import edited project modules without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Make the modules under ../src importable from this notebook.
sys.path.append("../src")
# NOTE(review): star import — presumably where `pd` and `plt` come from; confirm in myimports.
from myimports  import *

plt.style.use('ggplot')

In [None]:
from aggregated_models.aggdataset import AggDataset
from validation import MetricsComputer


In [None]:
from RandomProjectedPrivateLogistic import RPPrivateLogistic

In [None]:
from aggLogistic import AggLogistic
from agg_mrf_model import AggMRFModel, AggMRFModelParams

In [None]:
# Shared metrics helper (constructed with "click"); addToResults calls its getLLH method.
Validation = MetricsComputer("click")

## Download Data

In [None]:
import loaddata
# NOTE(review): `train` is later indexed with a list of feature names, so it is presumably
# a DataFrame; what `allvars` contains is not visible here — confirm in loaddata.
train, valid, allvars = loaddata.run()
## features def: https://confluence.criteois.com/pages/viewpage.action?spaceKey=RSC&title=Dataset+06+%3A+Attribution+Modeling+for+Bidding


In [None]:
def addToResults(model, name, epsilon, regulL2, otherparams=None):
    """Score ``model`` and return the global ``results`` table with one extra row.

    Parameters
    ----------
    model : object exposing ``features``; it is passed to ``Validation.getLLH``.
    name : value stored in the "model" column.
    epsilon : value stored in the "epsilon" column.
    regulL2 : value stored in the "regul" column.
    otherparams : optional dict of extra columns (e.g. ``{"dimensions": p}``).
        It is copied, never modified.

    Returns
    -------
    pandas.DataFrame
        The rows of the global ``results`` followed by the new row. The global
        itself is not modified, so callers write ``results = addToResults(...)``.
    """
    # Copy so the caller's dict is left untouched; extra columns come first.
    row = dict(otherparams) if otherparams else {}
    row["model"] = name
    row["valid"] = Validation.getLLH(model, valid)
    row["train"] = Validation.getLLH(model, train)
    row["epsilon"] = epsilon
    row["regul"] = regulL2
    row["features"] = ",".join(model.features)
    res = pd.DataFrame([row])
    print(res)
    # ignore_index gives every run its own row label instead of a repeated 0.
    return pd.concat([results, res], ignore_index=True)

In [None]:
# Accumulator with one row per fitted model; extended via `results = addToResults(...)`.
results = pd.DataFrame()

# Data set with limited number of features

In [None]:
# Feature subset used for the experiments in this section.
features = ['cat1', 'cat2', 'cat8', 'cat9']
# Noise-free aggregated data. Arguments are given in the same (train, features, "*&*")
# order as every other AggDataset call in this notebook.
aggdata_nonoise = AggDataset(train, features, "*&*")
epsilons = [0.1, 1.0, 10.0]
# One noised aggregated dataset per epsilon value.
aggdata_noised = {}
for epsilon0 in epsilons:
    aggdata_noised[epsilon0] = AggDataset(train, features, "*&*", epsilon0=epsilon0)


In [None]:
# Show the features used in this section. `logistic` is only created in a later cell,
# so reading `logistic.features` here would raise NameError on a fresh kernel.
features

##### Logistic regression
- Requires full logs, so it is not privacy-preserving.
- Serves as the "skyline": the non-private reference that the private models are compared against.

In [None]:
# Non-private baseline: one fit per L2 regularisation strength.
regulL2s = [1.0, 4.0, 16.0, 64.0, 256]
for regulL2 in regulL2s:
    logistic = AggLogistic(
        aggdata_nonoise,
        features,
        clicksCfs="*&*",
        regulL2=regulL2,
    )
    logistic.fit(train[features], nbIter=500)
    # epsilon is recorded as 0 for this baseline.
    results = addToResults(logistic, "logistic", 0, regulL2)

In [None]:
# Display every row collected so far.
results

##### Private logistic regression
- From IBM's diffprivlib
- Adds some noise to the loss
- Does not scale well with the dimension of the input vector, so random projections are used to limit dimensionality

In [None]:
%%time
for epsilon in epsilons:
    aggdata = aggdata_noised[epsilon]
    regulL2s = [ 1.0,  4.0 , 16.0 , 64.0, 256, 1024 , 1024*4 ]    
    for l2 in regulL2s:
        for p in [ 10,30,100,300, 1000 ]:
            logistic = RPPrivateLogistic("click" , features, "*&*",  train, lambdaL2=l2,epsilon= epsilon,
                                     projectionSize=p )
            logistic.fit(train)
            results = addToResults( logistic, "privatelogistic",  epsilon , l2, {"dimensions":p}  )

##### Models learned from aggregated data only

In [None]:
%%time
for epsilon in epsilons:
    aggdata = aggdata_noised[epsilon]
    for l2 in [  4.0 , 16.0 , 64.0, 256, 1024 , 1024*4 ]:
        params = AggMRFModelParams(
            exactComputation=False,
            clicksCfs = "*&*",
            displaysCfs="*&*",
            nbSamples = 50000,
            regulL2=1.0,
            regulL2Click = l2,
            laplaceEpsilon = aggdata.epsilon
        )
        self = AggMRFModel(aggdata, features, params)
        self.fit(200)
        results = addToResults( self, "aggmodel",  epsilon , l2  )

In [None]:
# Runs at epsilon = 1.0, ordered by the "valid" column.
results.query("epsilon == 1.0").sort_values("valid")

In [None]:
# Aggregated-data runs at epsilon = 0.1 whose "train" value is positive.
a = results[
    (results.model == "aggmodel")
    & (results.epsilon == 0.1)
    & (results.train > 0)
]
a

In [None]:
# Train vs. validation LLH per model family (private models shown at epsilon = 0.1).
fig, ax = plt.subplots()
series = [
    # (row mask, matplotlib format string, legend label)
    (results.model == "logistic", "x", "logistic"),
    (
        (results.model == "aggmodel") & (results.epsilon == 0.1) & (results.train > 0),
        "x",
        "aggmodel, epsilon=0.1",
    ),
    (
        (results.model == "privatelogistic") & (results.epsilon == 0.1) & (results.train > 0),
        "kx",
        "privatelogistic, epsilon=0.1",
    ),
]
for mask, marker, label in series:
    a = results[mask]
    ax.plot(a.train.values, a.valid.values, marker, label=label)
# Labels and a legend so the figure can be read on its own.
ax.set_xlabel("train LLH")
ax.set_ylabel("valid LLH")
ax.legend();



## Big dataset (11 features)

In [None]:
# Full feature list for the large experiment.
features = ['campaign', 'time_since_last_click', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']
# Noise-free aggregated data. Arguments are given in the same (train, features, "*&*")
# order as every other AggDataset call in this notebook.
aggdata_nonoise = AggDataset(train, features, "*&*")
epsilons = [1.0, 10.0]

In [None]:
# One noised aggregated dataset per epsilon value, keyed by epsilon.
aggdata_noised = {
    eps: AggDataset(train, features, "*&*", epsilon0=eps)
    for eps in epsilons
}

In [None]:
# Non-private baseline on the full feature list: one fit per L2 strength.
regulL2s = [40.0, 160.0, 640.0, 2560]
for regulL2 in regulL2s:
    logistic = AggLogistic(
        aggdata_nonoise,
        features,
        clicksCfs="*&*",
        regulL2=regulL2,
    )
    logistic.fit(train[features], nbIter=50)
    # epsilon is recorded as 0 for this baseline.
    results = addToResults(logistic, "logistic", 0, regulL2)

In [None]:
%%time
for epsilon in epsilons:
    regulL2s = [ 160.0 , 640.0, 2560, 2560*4 , 2560*16 ]    
    for l2 in regulL2s:
        for p in [ 30,100,300, 1000, 3000 ]:
            print( epsilon, l2,p)
            logistic = RPPrivateLogistic("click" , features, "*&*",  train, lambdaL2=l2,epsilon= epsilon,
                                     projectionSize=p )
            logistic.fit(train)
            results = addToResults( logistic, "privatelogistic",  epsilon , l2, {"dimensions":p}  )

In [None]:
%%time
for epsilon in epsilons:
    regulL2s = [ 40.0, 10.0  ]    
    for l2 in regulL2s:
        for p in [ 3000,1000,300, 100, 30 ]:
            print( epsilon, l2,p)
            logistic = RPPrivateLogistic("click" , features, "*&*",  train, lambdaL2=l2,epsilon= epsilon,
                                     projectionSize=p )
            logistic.fit(train)
            results = addToResults( logistic, "privatelogistic",  epsilon , l2, {"dimensions":p}  )

In [None]:
# Runs at epsilon = 10.0, ordered by the "valid" column.
results[results.epsilon == 10.0].sort_values("valid")

In [None]:
%%time
for epsilon in epsilons:
    aggdata = AggDataset( train , features, "*&*",  epsilon0=epsilon)
    for l2 in [ 160.0 , 640.0, 2560, 2560*4 , 2560*16 ]:
        params = AggMRFModelParams(
            exactComputation=False,
            clicksCfs = "*&*",
            displaysCfs="*&*",
            nbSamples = 50000,
            regulL2=1.0,
            regulL2Click = l2,
            noiseDistribution = aggdata.mechanism.getNoise()
        )
        self = AggMRFModel(aggdata, features, params)
        self.fit(200)
        results = addToResults( self, "aggmodel",  epsilon , l2  )

In [None]:
%%time
for epsilon in epsilons[1:]:
    aggdata = AggDataset( train , features, "*&*",  epsilon0=epsilon)
    for l2 in [ 160.0 , 640.0] :# , 2560, 2560*4 , 2560*16 ]:
        params = AggMRFModelParams(
            exactComputation=False,
            clicksCfs = "*&*",
            displaysCfs="*&*",
            nbSamples = 50000,
            regulL2=1.0,
            regulL2Click = l2,
            noiseDistribution = aggdata.mechanism.getNoise()
        )
        self = AggMRFModel(aggdata, features, params)
        self.fit(200)
        results = addToResults( self, "aggmodel",  epsilon , l2  )

In [None]:
%%time
## GAUSSIAN noise
for delta in [ 0.000001 , 0.001]:
    for epsilon in [10.0, 1.0]:
        for l2 in [ 160.0 , 640.0] :# , 2560, 2560*4 , 2560*16 ]:
            aggdata = AggDataset( train , features, "*&*",  epsilon0=epsilon ,delta = delta)
            params = AggMRFModelParams(
                exactComputation=False,
                clicksCfs = "*&*",
                displaysCfs="*&*",
                nbSamples = 50000,
                regulL2=1.0,
                regulL2Click = l2,
                noiseDistribution = aggdata.mechanism.getNoise()
            )
            self = AggMRFModel(aggdata, features, params)
            self.fit(200)
            results = addToResults( self, "aggmodel",  epsilon , l2 ,  {"delta":delta}  )

In [None]:
# Private logistic runs at epsilon = 0.1, ordered by the "valid" column.
results.query('model == "privatelogistic" and epsilon == 0.1').sort_values("valid")

In [None]:
# All runs at epsilon = 1.0, ordered by the "valid" column.
results.query("epsilon == 1.0").sort_values("valid")

In [None]:
# All runs at epsilon = 10.0, ordered by the "valid" column.
results.query("epsilon == 10.0").sort_values("valid")