In [1]:
from sklearn.datasets import fetch_openml
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [2]:
def unison_shuffled_copies(a, b, rng=None):
    """Return copies of ``a`` and ``b`` shuffled by one shared permutation.

    The same random permutation indexes both inputs, so ``a[i]`` and ``b[i]``
    stay paired in the result. The inputs themselves are not modified.

    Parameters
    ----------
    a, b : numpy.ndarray
        Arrays of equal length along their first axis. They must support
        indexing with an integer index array (``a[p]``).
    rng : None, int, or numpy.random.Generator, optional
        Source of randomness. ``None`` (the default) draws from NumPy's
        global RNG exactly as before, so ``np.random.seed`` still controls
        the result. An int seed or a ``Generator`` makes the shuffle
        reproducible without touching global state.

    Returns
    -------
    tuple
        ``(a[p], b[p])`` for a random permutation ``p`` of ``range(len(a))``.

    Raises
    ------
    AssertionError
        If ``a`` and ``b`` differ in length.
    """
    assert len(a) == len(b), "a and b must have the same length"
    n = len(a)
    if rng is None:
        # Legacy path: keep using the global RNG so existing callers are unaffected.
        p = np.random.permutation(n)
    else:
        # default_rng seeds a new Generator from an int and passes an
        # existing Generator through unchanged.
        p = np.random.default_rng(rng).permutation(n)
    return a[p], b[p]

In [3]:
# Alternative dataset (swap in together with its class names):
# dataset = fetch_openml(name="California-Housing-Classification", parser="auto")
# low_class_name = 'False'
# high_class_name = 'True'

dataset = fetch_openml(name="spambase", parser="auto")
low_class_name = '0'
high_class_name = '1'

SEED = 0  # fixes the shuffle below so the same N-example subset is drawn on every run
N = 1000  # number of shuffled examples kept for the experiment

X = dataset.data.values.astype(float)

# Compare labels as strings so the class names above match whether the target
# arrives as string or numeric categories.
labels = dataset.target.astype(str).to_numpy()

# Encode the high class as +1 and the low class as -1.
Y = (labels == high_class_name).astype(int) - (labels == low_class_name).astype(int)

# A 0 would mean a label matched neither class name; fail loudly rather than
# feed an undefined label to the classifier.
assert np.all(Y != 0), (
    f"labels other than {low_class_name!r}/{high_class_name!r} found: "
    f"{sorted(set(labels[Y == 0]))[:5]}"
)

# Shuffle before truncating so the first N rows do not depend on the
# dataset's storage order; seeding makes the subset reproducible.
np.random.seed(SEED)
X, Y = unison_shuffled_copies(X, Y)

X = X[:N]
Y = Y[:N]

# Conformal test

In [4]:
from classifiers import (
    ConformalPassiveAggressive,
    ConformalPassiveAggressive_I,
    ConformalPassiveAggressive_II,
    ConformalOneNearestNeighbours,
)
from martingale import PluginMartingale

# Online conformal run over the stream (X, Y): for each example, ask for a
# prediction set first, then reveal the label to the predictor.

epsilon = 0.1  # value passed to predict_set as `epsilon`

cp = ConformalPassiveAggressive(X.shape[1])
# cp = ConformalOneNearestNeighbours()
martingale = PluginMartingale()  # created here; its updates are currently disabled

p_vals = []  # p-value of the true label at each step
sizes = []   # prediction-set size at each step

for i, (x, y) in tqdm(enumerate(zip(X, Y)), total=len(X)):
    # Predict before learning so the current label is not yet known to cp.
    Gamma, p_values = cp.predict_set(x, return_p_values=True, epsilon=epsilon)

    cp.learn_one(x, y)

    # Per-step bookkeeping inside the predictor.
    # NOTE(review): presumably accumulates cp.Err / cp.OE / cp.OF - confirm in `classifiers`.
    cp.err(Gamma, y)
    cp.oe(Gamma, y)
    cp.of(p_values, y)

    sizes.append(Gamma.shape[0])

    # p_values is indexed by label: 1 for the high class, -1 otherwise.
    p_vals.append(p_values[1] if y == 1 else p_values[-1])
    # martingale.update_log_martingale(p_vals[-1])

print(f'epsilon: {epsilon}')
# Each accumulated total is reported divided by cp.y.shape[0].
for metric_name, metric_total in (('Err', cp.Err), ('OE', cp.OE), ('OF', cp.OF)):
    print(f'{metric_name}: {metric_total / cp.y.shape[0]}')
print(f'Avg size: {np.mean(sizes)}')

  0%|          | 0/1000 [00:00<?, ?it/s]

epsilon: 0.1
Err: 0.098
OE: 0.899
OF: 0.4972895245176404
Avg size: 1.801
