In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from itertools import combinations 
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_auc_score

import sys
from aggregated_models.myimports  import *
import aggregated_models.myJupyterUtils as myJupyterUtils ## Remove stacktraces on Keyboardinterupt
plt.style.use('classic')
from aggregated_models.agg_mrf_model import *
from aggregated_models.validation import * 
from aggregated_models.aggLogistic import AggLogistic

# loading public "criteo attribution dataset"
from aggregated_models.aggdataset import *
from aggregated_models.FeatureEncodings import *


  from cryptography.hazmat.backends import default_backend


In [2]:
from aggregated_models import loaddata

## Datasets
#### banking
more information about this data can be seen here: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

#### adult
more information about this data can be seen here: https://archive.ics.uci.edu/ml/datasets/Adult


In [5]:
#dftrain, dftest, features = loaddata.load_banking_dataset()
dftrain, dftest, features = loaddata.load_adult_dataset()


In [6]:
print("Positive ratio" ,   dftrain.label.sum() *1.0/len(dftrain) , dftest.label.sum() *1.0/len(dftest)  )
features

Positive ratio 0.2408095574460244 0.23622627602727106


['age',
 'workClass',
 'fnlwgt',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country']

In [7]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             32561 non-null  int64
 1   workClass       32561 non-null  int64
 2   fnlwgt          32561 non-null  int64
 3   education       32561 non-null  int64
 4   education-num   32561 non-null  int64
 5   marital-status  32561 non-null  int64
 6   occupation      32561 non-null  int64
 7   relationship    32561 non-null  int64
 8   race            32561 non-null  int64
 9   sex             32561 non-null  int64
 10  capital-gain    32561 non-null  int64
 11  capital-loss    32561 non-null  int64
 12  hours-per-week  32561 non-null  int64
 13  native-country  32561 non-null  int64
 14  label           32561 non-null  int64
dtypes: int64(15)
memory usage: 3.7 MB


In [8]:
aggdata = AggDataset.FromDF( dftrain , features, "*&*",  "label")
Validation = MetricsComputer("label")

In [9]:
## Logistic model, with cross features
regulL2s = [10,100,1000]
for regulL2 in regulL2s:
    logisticCfs = AggLogistic(  aggdata , features, clicksCfs = "*&*" , rescaling=True, regulL2=regulL2 )
    logisticCfs.fit( dftrain[features] , nbIter = 200 )
    print( f"Logistic(*&*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,dftrain) , "valid" , Validation.run(logisticCfs,dftest)   )
    

Logistic(*&*), l2:10 train NLLH=0.4482, NMSE=0.4671   valid NLLH=0.4206, NMSE=0.4381  
Logistic(*&*), l2:100 train NLLH=0.4188, NMSE=0.4399   valid NLLH=0.4072, NMSE=0.4273  
Logistic(*&*), l2:1000 train NLLH=0.3592, NMSE=0.3857   valid NLLH=0.3565, NMSE=0.3827  


In [10]:
## Logistic model, no cross features
regulL2s = [1,10,100]
for regulL2 in regulL2s:
    logisticCfs = AggLogistic(  aggdata , features, clicksCfs = "*" , rescaling=True, regulL2=regulL2 )
    logisticCfs.fit( dftrain[features] , nbIter = 200 )
    print( f"Logistic(*), l2:{regulL2}" ,  "train",  Validation.run(logisticCfs,dftrain) , "valid" , Validation.run(logisticCfs,dftest)   )
    

Logistic(*), l2:1 train NLLH=0.4179, NMSE=0.4372   valid NLLH=0.4111, NMSE=0.4302  
Logistic(*), l2:10 train NLLH=0.4098, NMSE=0.4301   valid NLLH=0.4048, NMSE=0.4248  
Logistic(*), l2:100 train NLLH=0.3710, NMSE=0.3963   valid NLLH=0.3690, NMSE=0.3941  


In [11]:
## Random Markov Field
regulL2s = [ 100, 10, 1000 ]
for regulL2 in regulL2s:
    nbSamples = 10_000
    nbIter = 200
    params = AggMRFModelParams(
        features=features,
        exactComputation=False ,
        clicksCfs = "*&*",
        displaysCfs="*&*",
        nbSamples = nbSamples,
        regulL2=1.0,
        regulL2Click = regulL2 )
    self = AggMRFModel(aggdata, params)
    self.fit(nbIter, alpha=0.01)
    print( f"MRF lambda= {regulL2}",  "train",   Validation.run(self,dftrain) , "valid" , Validation.run(self,dftest)   )

MRF lambda= 100 train NLLH=0.4270, NMSE=0.4477   valid NLLH=0.4125, NMSE=0.4319  
MRF lambda= 10 train NLLH=0.4540, NMSE=0.4717   valid NLLH=0.4187, NMSE=0.4355  
MRF lambda= 1000 train NLLH=0.3725, NMSE=0.3990   valid NLLH=0.3690, NMSE=0.3950  


In [13]:
## Random Markov Field, No cross features on label model
regulL2s = [ 1, 10, 100 ]
for regulL2 in regulL2s:
    nbSamples = 10_000
    nbIter = 200
    params = AggMRFModelParams(
        features=features,
        exactComputation=False ,
        clicksCfs = "*",
        displaysCfs="*&*",
        nbSamples = nbSamples,
        regulL2=1.0,
        regulL2Click = regulL2 )
    self = AggMRFModel(aggdata, params)
    self.fit(nbIter)
    print( f"MRF, no cfs, lambda= {regulL2}",  "train",   Validation.run(self,dftrain) , "valid" , Validation.run(self,dftest)   )    

MRF, no cfs, lambda= 1 train NLLH=0.4173, NMSE=0.4369   valid NLLH=0.4105, NMSE=0.4298  
MRF, no cfs, lambda= 10 train NLLH=0.4100, NMSE=0.4303   valid NLLH=0.4051, NMSE=0.4249  
MRF, no cfs, lambda= 100 train NLLH=0.3721, NMSE=0.3971   valid NLLH=0.3701, NMSE=0.3950  


In [14]:
## Naive Bayes
regulL2s = [ 1, 10, 100 ]
for regulL2 in regulL2s:
    nbSamples = 10_000
    nbIter = 200
    params = AggMRFModelParams(
        features=features,
        exactComputation=False ,
        clicksCfs = "*",
        displaysCfs="*",
        nbSamples = nbSamples,
        regulL2=1.0,
        regulL2Click = regulL2 )
    self = AggMRFModel(aggdata, params)
    self.fit(nbIter)
    print( f"NB lambda= {regulL2}",  "train",   Validation.run(self,dftrain) , "valid" , Validation.run(self,dftest)   )    

NB lambda= 1 train NLLH=0.1787, NMSE=0.2784   valid NLLH=0.1767, NMSE=0.2805  
NB lambda= 10 train NLLH=0.1962, NMSE=0.2799   valid NLLH=0.1948, NMSE=0.2822  
NB lambda= 100 train NLLH=0.2455, NMSE=0.2774   valid NLLH=0.2435, NMSE=0.2787  


#### effect of regularizing mu and theta
(table 5)

In [15]:
regulL2s=[1,4,16,64,128]
regulL2Clicks=[1,4,16,64,128]
nbSamples=10_000
nbIter = 200
for regulL2 in regulL2s:
    for regulL2Click in regulL2Clicks:
            #try:
            params = AggMRFModelParams(
                features=features,
                exactComputation=False ,
                clicksCfs = "*&*",
                displaysCfs="*&*",
                nbSamples = nbSamples,
                regulL2=1.0,
                regulL2Click = regulL2 )
            self = AggMRFModel(aggdata, params)
            self.fit(nbIter, alpha=0.01)
            print( f"MRF lambda_mu:{regulL2} l_theta:{regulL2Click}",  "train",   Validation.run(self,dftrain) , "valid" , Validation.run(self,dftest)   )
            #except: print(f"error while computing rmf with {regulL2} {regulL2Click}")

MRF lambda_mu:1 l_theta:1 train NLLH=0.4751, NMSE=0.4910   valid NLLH=0.4091, NMSE=0.4268  
MRF lambda_mu:1 l_theta:4 train NLLH=0.4742, NMSE=0.4900   valid NLLH=0.4086, NMSE=0.4261  
MRF lambda_mu:1 l_theta:16 train NLLH=0.4749, NMSE=0.4909   valid NLLH=0.4089, NMSE=0.4264  
MRF lambda_mu:1 l_theta:64 train NLLH=0.4745, NMSE=0.4903   valid NLLH=0.4089, NMSE=0.4262  
MRF lambda_mu:1 l_theta:128 train NLLH=0.4750, NMSE=0.4909   valid NLLH=0.4091, NMSE=0.4265  
MRF lambda_mu:4 l_theta:1 train NLLH=0.4629, NMSE=0.4798   valid NLLH=0.4160, NMSE=0.4328  
MRF lambda_mu:4 l_theta:4 train NLLH=0.4629, NMSE=0.4801   valid NLLH=0.4161, NMSE=0.4327  
MRF lambda_mu:4 l_theta:16 train NLLH=0.4628, NMSE=0.4799   valid NLLH=0.4156, NMSE=0.4320  
MRF lambda_mu:4 l_theta:64 train NLLH=0.4632, NMSE=0.4803   valid NLLH=0.4163, NMSE=0.4329  
MRF lambda_mu:4 l_theta:128 train NLLH=0.4628, NMSE=0.4798   valid NLLH=0.4158, NMSE=0.4322  
MRF lambda_mu:16 l_theta:1 train NLLH=0.4493, NMSE=0.4678   valid NLLH=0

In [None]:
#### increasing the number of Gibbs samples


In [17]:
nbIter = 200
regulL2 = 1
regulL2Clicks = 100
for nbSamples in [100, 200, 500, 1000,2000,5000,10_000, 20_000, 50_000,100_000]:
            params = AggMRFModelParams(
                features=features,
                exactComputation=False ,
                clicksCfs = "*&*",
                displaysCfs="*&*",
                nbSamples = nbSamples,
                regulL2=regulL2,
                regulL2Click = regulL2Clicks )
            self = AggMRFModel(aggdata, params)
            self.fit(nbIter, alpha=0.01)
            print( f"MRF nbSamples = {nbSamples}",  "train",   Validation.run(self,dftrain) , "valid" , Validation.run(self,dftest)   )


MRF nbSamples = 100 train NLLH=0.3978, NMSE=0.4137   valid NLLH=0.3853, NMSE=0.4009  
MRF nbSamples = 200 train NLLH=0.4184, NMSE=0.4357   valid NLLH=0.4046, NMSE=0.4216  
MRF nbSamples = 500 train NLLH=0.4250, NMSE=0.4440   valid NLLH=0.4108, NMSE=0.4285  
MRF nbSamples = 1000 train NLLH=0.4273, NMSE=0.4466   valid NLLH=0.4129, NMSE=0.4311  
MRF nbSamples = 2000 train NLLH=0.4276, NMSE=0.4475   valid NLLH=0.4132, NMSE=0.4319  
MRF nbSamples = 5000 train NLLH=0.4271, NMSE=0.4478   valid NLLH=0.4127, NMSE=0.4321  
MRF nbSamples = 10000 train NLLH=0.4271, NMSE=0.4478   valid NLLH=0.4124, NMSE=0.4318  
MRF nbSamples = 50000 train NLLH=0.4273, NMSE=0.4478   valid NLLH=0.4125, NMSE=0.4318  
MRF nbSamples = 100000 train NLLH=0.4272, NMSE=0.4479   valid NLLH=0.4125, NMSE=0.4318  


#### Gradient rescaling
the model usually requires significantly less iterations to converge when using the "gradient rescaling".
(Note: on banking dataset, the effect is actually quite limited. It is more notieceable on the "adult" dataset, and on the Criteo AdKdd challenge)

In [18]:
for modifiedGradient in [True, False]:
    params = AggMRFModelParams(
                features=features,
                exactComputation=False ,
                clicksCfs = "*&*",
                displaysCfs="*&*",
                nbSamples = nbSamples,
                regulL2=1.0,
                modifiedGradient =modifiedGradient,
                regulL2Click = 100 )
    self = AggMRFModel(aggdata, params)
        
    nbIterPerStep = 50
    for i in range(0,10):
        self.fit(nbIterPerStep, alpha=0.01)
        totalIters = (i+1) * nbIterPerStep
        print( f"MRF modifiedGradient: {modifiedGradient} iters:{totalIters} ",  "train",   Validation.run(self,dftrain) , "valid" , Validation.run(self,dftest)   )


MRF modifiedGradient: True iters:50  train NLLH=0.4001, NMSE=0.4155   valid NLLH=0.3919, NMSE=0.4081  
MRF modifiedGradient: True iters:100  train NLLH=0.4195, NMSE=0.4385   valid NLLH=0.4075, NMSE=0.4257  
MRF modifiedGradient: True iters:150  train NLLH=0.4248, NMSE=0.4451   valid NLLH=0.4110, NMSE=0.4302  
MRF modifiedGradient: True iters:200  train NLLH=0.4272, NMSE=0.4479   valid NLLH=0.4125, NMSE=0.4318  
MRF modifiedGradient: True iters:250  train NLLH=0.4287, NMSE=0.4496   valid NLLH=0.4133, NMSE=0.4326  
MRF modifiedGradient: True iters:300  train NLLH=0.4297, NMSE=0.4505   valid NLLH=0.4138, NMSE=0.4332  
MRF modifiedGradient: True iters:350  train NLLH=0.4304, NMSE=0.4513   valid NLLH=0.4143, NMSE=0.4336  
MRF modifiedGradient: True iters:400  train NLLH=0.4309, NMSE=0.4518   valid NLLH=0.4146, NMSE=0.4338  
MRF modifiedGradient: True iters:450  train NLLH=0.4313, NMSE=0.4522   valid NLLH=0.4148, NMSE=0.4341  
MRF modifiedGradient: True iters:500  train NLLH=0.4316, NMSE=0.4

#### increasing step size on mu produces a 'good' model faster

In [19]:
    params = AggMRFModelParams(
                features=features,
                exactComputation=False ,
                clicksCfs = "*&*",
                displaysCfs="*&*",
                nbSamples = 10_000,
                regulL2=1.0,
                muStepSizeMultiplier = 5,
                modifiedGradient = True,
                regulL2Click = 100 )
    self = AggMRFModel(aggdata, params)
        
    nbIterPerStep = 50
    for i in range(0,10):
        self.fit(nbIterPerStep, alpha=0.01)
        totalIters = (i+1) * nbIterPerStep
        print( f"MRF with muStepSizeMultiplier. iters:{totalIters} ",  "train",   Validation.run(self,dftrain) , "valid" , Validation.run(self,dftest)   )


MRF with muStepSizeMultiplier. iters:50  train NLLH=0.4079, NMSE=0.4287   valid NLLH=0.3996, NMSE=0.4199  
MRF with muStepSizeMultiplier. iters:100  train NLLH=0.4198, NMSE=0.4409   valid NLLH=0.4078, NMSE=0.4278  
MRF with muStepSizeMultiplier. iters:150  train NLLH=0.4244, NMSE=0.4454   valid NLLH=0.4108, NMSE=0.4305  
MRF with muStepSizeMultiplier. iters:200  train NLLH=0.4270, NMSE=0.4480   valid NLLH=0.4124, NMSE=0.4319  
MRF with muStepSizeMultiplier. iters:250  train NLLH=0.4285, NMSE=0.4495   valid NLLH=0.4132, NMSE=0.4328  
MRF with muStepSizeMultiplier. iters:300  train NLLH=0.4295, NMSE=0.4504   valid NLLH=0.4138, NMSE=0.4332  
MRF with muStepSizeMultiplier. iters:350  train NLLH=0.4302, NMSE=0.4510   valid NLLH=0.4142, NMSE=0.4334  
MRF with muStepSizeMultiplier. iters:400  train NLLH=0.4306, NMSE=0.4514   valid NLLH=0.4146, NMSE=0.4338  
MRF with muStepSizeMultiplier. iters:450  train NLLH=0.4308, NMSE=0.4517   valid NLLH=0.4146, NMSE=0.4338  
MRF with muStepSizeMultiplier

#### but increasing the step size on both mu and theta makes the model diverge

In [20]:
    params = AggMRFModelParams(
                features=features,
                exactComputation=False ,
                clicksCfs = "*&*",
                displaysCfs="*&*",
                nbSamples = 10_000,
                regulL2=1.0,
                muStepSizeMultiplier = 1,
                modifiedGradient = True,
                regulL2Click = 100 )
    self = AggMRFModel(aggdata, params)
        
    nbIterPerStep = 50
    for i in range(0,10):
        self.fit(nbIterPerStep, alpha=0.01 * 5)
        totalIters = (i+1) * nbIterPerStep
        print( f"MRF with stepsize * 5. iters:{totalIters} ",  "train",   Validation.run(self,dftrain) , "valid" , Validation.run(self,dftest)   )


MRF with stepsize * 5. iters:50  train NLLH=0.3738, NMSE=0.3893   valid NLLH=0.3580, NMSE=0.3710  
MRF with stepsize * 5. iters:100  train NLLH=0.3726, NMSE=0.3863   valid NLLH=0.3560, NMSE=0.3673  
MRF with stepsize * 5. iters:150  train NLLH=0.3803, NMSE=0.3956   valid NLLH=0.3630, NMSE=0.3757  
MRF with stepsize * 5. iters:200  train NLLH=0.3738, NMSE=0.3873   valid NLLH=0.3564, NMSE=0.3673  
MRF with stepsize * 5. iters:250  train NLLH=0.3782, NMSE=0.3929   valid NLLH=0.3607, NMSE=0.3724  
MRF with stepsize * 5. iters:300  train NLLH=0.3857, NMSE=0.4014   valid NLLH=0.3680, NMSE=0.3809  
MRF with stepsize * 5. iters:350  train NLLH=0.3860, NMSE=0.4014   valid NLLH=0.3688, NMSE=0.3814  
MRF with stepsize * 5. iters:400  train NLLH=0.3854, NMSE=0.4004   valid NLLH=0.3683, NMSE=0.3808  
MRF with stepsize * 5. iters:450  train NLLH=0.3886, NMSE=0.4046   valid NLLH=0.3715, NMSE=0.3846  
MRF with stepsize * 5. iters:500  train NLLH=0.3733, NMSE=0.3875   valid NLLH=0.3558, NMSE=0.3670  
