In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
import numpy as np
from online_cp import ConformalRidgeRegressor
from online_cp.CPS import RidgePredictionMachine
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from online_cp.evaluation import Evaluation, Err, OF, OE, WinklerScore, Width, CRPS

# Classification

In [None]:
# Synthetic three-class problem with two features, both informative.
N = 250
X, Y = make_classification(
    n_samples=N,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_classes=3,
    n_clusters_per_class=1,
    random_state=2024,
)

# One marker/colour pair per class, so classes differ by shape as well as colour.
markers = ['o', 's', 'D']
colors = ['red', 'blue', 'green']

fig, ax = plt.subplots(figsize=(8, 6))
for label, marker, color in zip(np.unique(Y), markers, colors):
    in_class = Y == label  # boolean mask selecting the rows of this class
    ax.scatter(X[in_class, 0], X[in_class, 1], label=f'Class {label}',
               marker=marker, color=color, edgecolor='k')

# Titles, axis labels, legend and grid.
ax.set_title('Synthetic Classification Dataset with 3 Classes')
ax.set_xlabel(r'$x_1$')
ax.set_ylabel(r'$x_2$')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
from online_cp import ConformalNearestNeighboursClassifier

# Significance level used for every prediction set in this cell.
epsilon = 0.1

# The first ceil(1/epsilon) examples form the initial training set; the
# remaining ones are processed one at a time in the loop below.
init_train = int(np.ceil(1/epsilon))
X_train = X[:init_train]
y_train = Y[:init_train]
X_run = X[init_train:]
y_run = Y[init_train:]

cp = ConformalNearestNeighboursClassifier(k=1, label_space=np.unique(Y))

# Tracks the error (Err), OE and OF criteria over the run.
efficiency = Evaluation(err=Err, oe=OE, of=OF)

cp.learn_initial_training_set(X=X_train, y=y_train)

for x, y in zip(X_run, y_run):

    # Make prediction. Use the `epsilon` defined above (not a second
    # hard-coded literal) so that changing the significance level in one
    # place changes it everywhere.
    # `D` is whatever `predict` hands back for the update step when
    # return_update=True; it is passed straight to `learn_one`.
    Gamma, p_values, D = cp.predict(x, epsilon=epsilon, return_p_values=True, return_update=True)

    # Learn the label
    cp.learn_one(x, y, D)

    # Update efficiency criteria
    efficiency.update(y=y, Gamma=Gamma, p_values=p_values)


In [None]:
# Plot the cumulative results recorded by `efficiency` during the classification run (Err, OE, OF).
efficiency.plot_cumulative_results()

In [None]:
# Summarise the criteria recorded by `efficiency` during the classification run (Err, OE, OF).
efficiency.summarize()

# Regression

In [None]:
# California housing regression data. Only a random subsample of N examples
# goes into X / Y; the remainder is returned in X_other / Y_other.
N = 300
housing = fetch_california_housing()
data, target = housing.data, housing.target

X, X_other, Y, Y_other = train_test_split(
    data,
    target,
    train_size=N,
    random_state=42,
)

In [None]:
cp = ConformalRidgeRegressor(studentised=True)

# Significance level used for every prediction set in this cell.
epsilon = 0.1

# Ensure that we can get informative prediction sets: the first
# ceil(2/epsilon) examples are used as the initial training set.
n_init = int(np.ceil(2/epsilon))
X_init_train, y_init_train = X[:n_init], Y[:n_init]

# The remaining examples are processed one at a time below.
X_process, y_process = X[n_init:], Y[n_init:]

cp.learn_initial_training_set(X_init_train, y_init_train)

# Tracks the error, Winkler score and interval width over the run.
efficiency = Evaluation(err=Err, winkler=WinklerScore, width=Width)

for x, y in zip(X_process, y_process):

    # Make prediction. Use the `epsilon` defined above (not a second
    # hard-coded literal) so the prediction and the evaluation below always
    # refer to the same significance level.
    Gamma, precomputed = cp.predict(x, epsilon=epsilon, return_update=True)
    # To avoid repeating computations, we return some precomputed arrays if return_update=True

    # Learn the label
    cp.learn_one(x, y, precomputed)
    # We do not have to invert a matrix at each step n. The hat matrix can be efficiently updated online using the Sherman-Morrison formula

    # Update efficiency criteria
    efficiency.update(y=y, Gamma=Gamma, epsilon=epsilon)

# Keep the arrays precomputed for the last processed object; later cells
# reuse them when computing p-values for that object.
precomputed_cp_for_later = precomputed

In [None]:
# Plot the cumulative results recorded by `efficiency` during the conformal ridge run (Err, Winkler score, width).
efficiency.plot_cumulative_results()

In [None]:
# Summarise the criteria recorded by `efficiency` during the conformal ridge run (Err, Winkler score, width).
efficiency.summarize()

# CPS

In [None]:
cps = RidgePredictionMachine()

cps.learn_initial_training_set(X_init_train, y_init_train)

# Tracks the error, Winkler score, width and CRPS over the run.
efficiency = Evaluation(err=Err, winkler=WinklerScore, width=Width, crps=CRPS)

# Seeded generator for the random numbers tau, so that a fresh
# "Restart & Run All" reproduces the same prediction sets and results.
rng = np.random.default_rng(2024)

for x, y in zip(X_process, y_process):
    # Fresh random number in [0, 1) for this step.
    tau = rng.uniform(0, 1)

    # Compute CPD
    cpd, precomputed = cps.predict_cpd(x, return_update=True)
    # To avoid repeating computations, we return some precomputed arrays if return_update=True

    # Prediction set extracted from the CPD at significance level epsilon
    # (`epsilon` is the value defined in the regression cell above).
    Gamma = cpd.predict_set(tau=tau, epsilon=epsilon, minimise_width=True)

    # Learn the label
    cps.learn_one(x, y, precomputed)
    # We do not have to invert a matrix at each step n. The hat matrix can be efficiently updated online using the Sherman-Morrison formula

    # Update efficiency
    efficiency.update(y=y, Gamma=Gamma, epsilon=epsilon, cpd=cpd)

    # Compute p-value
    p = cpd(y, tau)

In [None]:
# Plot the cumulative results recorded by `efficiency` during the CPS run (Err, Winkler score, width, CRPS).
efficiency.plot_cumulative_results()

In [None]:
# Summarise the criteria recorded by `efficiency` during the CPS run (Err, Winkler score, width, CRPS).
efficiency.summarize()

# Are there analogous criteria in regression?

* The OE criterion is the size of the set of p-values larger than $\varepsilon$. If we have a cpd, that would be $|\{p(y): p(y)\neq p(y_i), p(y) > \varepsilon\}| = |\{p(y):p(y)>\varepsilon\}|$. This could be computed as the integral of $\Pi$ from the $\varepsilon$-quantile to $\sup\{y:\Pi(y,\tau)\leq 1\}$, but unfortunately, that does not converge.
* The E criterion in regression is just the M criterion unless $\Gamma=\emptyset$.
* We don't really compute p-values in regression, but for a prediction interval we could compute the p-values at its lower and upper bounds. All $y\in\mathbb{R}\backslash\Gamma$ have smaller p-values.

# FILL IN LATER, BUT DO IT!

# There are some interesting integrals to consider in regression

Below is a plot of the Studentized CLS p-value for varying $y$-values (for $x_n$)

In [None]:
def func(y):
    """Return the non-smoothed p-value of candidate label `y` for the object `x`.

    `x` is the last object processed in the loops above, and
    `precomputed_cp_for_later` holds the arrays `cp.predict` returned for it.
    """
    return cp.compute_p_value(x, y, precomputed=precomputed_cp_for_later, smoothed=False)


yrange = np.linspace(-4, 8, 100)
plt.plot(yrange, [func(candidate) for candidate in yrange])

# Mark the observed label y_n. The horizontal axis is y, so the vertical line
# must sit at y_n itself; placing it at the p-value (a number in [0, 1]) would
# mix up the two axes. The p-value of y_n is shown as a point on the curve.
plt.axvline(y, color='red', linestyle='--', label=r'$y_n$')
plt.plot(y, func(y), 'o', color='red', label=r'$p(y_n)$')

plt.xlabel(r'$y$')
plt.ylabel(r'$p(y)$')
plt.legend()
plt.show()

We know that, under natural assumptions, the prediction interval is (almost surely) finite for $\varepsilon \geq \frac{2}{n}$, and infinite otherwise. Thus, the integral of the p-value from the corresponding lower bound to the corresponding upper bound may be seen as some kind of $\varepsilon$-free width criterion. It is a little tricky to compute, but in principle it should be possible.

In [None]:
# Prediction interval at significance level 2/n, where n is taken from
# cp.X.shape[0] (presumably the number of examples learned so far - confirm).
interval = cp.predict(x, epsilon=2/cp.X.shape[0])
finite_yrange = np.linspace(interval.lower, interval.upper, endpoint=True, num=1000)

# Evaluate the p-value curve once and reuse it for both the line and the
# shaded area, instead of computing all 1000 p-values twice.
p_curve = [func(candidate) for candidate in finite_yrange]

plt.plot(finite_yrange, p_curve)
plt.fill_between(finite_yrange, 0, p_curve, color='green', alpha=0.5, label=r'$\int p(y)dy$')
plt.xlabel(r'$y$')
plt.ylabel(r'$p(y)$')
plt.legend()
plt.show()

## The width criterion for CPD is $\varepsilon$-dependent
But the dependence is empirically small. However, if we want to get rid of it, we could integrate the width over $\varepsilon$. Again there may be issues, since there will be values of $\varepsilon$ that give infinite intervals. At least in principle, we could consider

$\int_{\varepsilon^*}^1\text{W}(\Gamma^{\varepsilon})d\varepsilon$,
where $\varepsilon^* = \inf\{\varepsilon : |\Gamma^{\varepsilon}|<\infty\}$ to be an efficiency criterion. Perhaps we might consider the number $(1-\varepsilon^*)\int_{\varepsilon^*}^1\text{W}(\Gamma^{\varepsilon})d\varepsilon$. It will typically be expensive to compute.

A compromise would be to average the width over several values of $\varepsilon$.

In [None]:
# Draw tau from a seeded generator so the width curve below is reproducible
# across kernel restarts.
tau = np.random.default_rng(2024).uniform(0, 1)


def func(epsilon):
    """Return the width of the prediction set of `cpd` at significance level `epsilon`.

    `cpd` is the last predictive distribution produced in the CPS loop above;
    `tau` is the fixed random number drawn in this cell.
    """
    return cpd.predict_set(tau=tau, epsilon=epsilon).width()

In [None]:
# Build the epsilon grid and evaluate the width curve once, then reuse both
# for the line and the shaded area (the curve costs 1000 calls to `func`).
eps_grid = np.linspace(0, 1, 1000, endpoint=True)
widths = [func(eps) for eps in eps_grid]

plt.plot(eps_grid, widths)
plt.fill_between(eps_grid, widths, alpha=0.5, label=r'$\int W(\varepsilon)d\varepsilon$')
plt.xlabel(r'$\epsilon$')
plt.ylabel(r'Width')
plt.legend()
plt.show()