In [1]:
import os,time
os.environ["OPENBLAS_NUM_THREADS"] = "32"
os.environ["OMP_NUM_THREADS"] = "4"

import pandas as pd
import numpy as np
import random
# We use a custom hot encoder for performances reasons.
from codpy.data_processing import hot_encoder
# Standard codpy kernel class.
from codpy.kernel import Kernel
# A multi scale kernel method.
from codpy.multiscale_kernel import *
from sklearn.metrics import confusion_matrix

In [2]:
def get_MNIST_data(N=-1):
    import tensorflow as tf

    (x, fx), (z, fz) = tf.keras.datasets.mnist.load_data()
    x, z = x / 255.0, z / 255.0
    x, z, fx, fz = (
        x.reshape(len(x), -1),
        z.reshape(len(z), -1),
        fx.reshape(len(fx), -1),
        fz.reshape(len(fz), -1),
    )
    fx, fz = (
        hot_encoder(pd.DataFrame(data=fx), cat_cols_include=[0], sort_columns=True),
        hot_encoder(pd.DataFrame(data=fz), cat_cols_include=[0], sort_columns=True),
    )
    x, fx, z, fz = (x, fx.values, z, fz.values)
    if N != -1:
        indices = random.sample(range(x.shape[0]), N)
        x, fx = x[indices], fx[indices]

    return x, fx, z, fz




def one_experiment(N_partition,get_predictor,**kwargs):
    def get_score(predictor):
        f_z = predictor(z).argmax(axis=-1)
        ground_truth = fz.argmax(axis=-1)
        out = confusion_matrix(ground_truth, f_z)
        return np.trace(out) / np.sum(out)

    elapsed_time = time.time()
    predictor = get_predictor(N_partition,**kwargs)
    score = get_score(predictor)
    elapsed_time = time.time()-elapsed_time
    print("N_partitions:",N_partitions," time:",elapsed_time)
    return score, elapsed_time

def run_experiment(N_partitions,get_predictors,labels):

    results=[]
    for N_partition in N_partitions:
        for get_predictor,label in zip(get_predictors,labels):
            score, elapsed_time = one_experiment(N_partition,get_predictor,all=True)
            results.append(
                {
                    "Ny": N_partition,
                    "Method": label,
                    "Execution Time (s)": elapsed_time,
                    "score": score
                }
    )
    out =   pd.DataFrame(results)
    print(out)
    out.to_csv("results_MNISTMultiscale.csv")
    return out


class Random_clusters:
    def __init__(self,x, N,**kwargs):
        self.x = x
        self.indices = random.sample(range(self.x.shape[0]), N)
        self.cluster_centers_ = self.x[self.indices]
    def __call__(self,z, **kwargs):
        return self.distance(z,self.cluster_centers_).argmin(axis=1)
    def distance(self,x,y):
        return core.op.Dnm(x, y, distance="norm22")

In [3]:
N_partitions=[5,10,20,40,80]
N_MNIST_pics=40000
x, fx, z, fz = get_MNIST_data(N_MNIST_pics)
core.kernel_interface.set_verbose(False)
labels = ["random","Sharp Disc.","Greedy","K-Means"]
get_predictors = [
    lambda N_partition,**kwargs: MultiScaleKernelClassifier(x=x,fx=fx,N=N_partition,method=Random_clusters,**kwargs),
    lambda N_partition,**kwargs: MultiScaleKernelClassifier(x=x,fx=fx,N=N_partition,method=SharpDiscrepancy,**kwargs),
    lambda N_partition,**kwargs: MultiScaleKernelClassifier(x=x,fx=fx,N=N_partition,method=GreedySearch,**kwargs),
    lambda N_partition,**kwargs: MultiScaleKernelClassifier(x=x,fx=fx,N=N_partition,method=MiniBatchkmeans,**kwargs)
]

# %% [markdown]
# Select a multi scale kernel method where the centers are given by a k-mean algorithm.
core.kernel_interface.set_verbose()
run_experiment(N_partitions=N_partitions,get_predictors=get_predictors,labels=labels)

N_partitions: [5, 10, 20, 40, 80]  time: 280.7652635574341
N_partitions: [5, 10, 20, 40, 80]  time: 474.53085374832153
N_partitions: [5, 10, 20, 40, 80]  time: 304.68677043914795
N_partitions: [5, 10, 20, 40, 80]  time: 274.20725440979004
N_partitions: [5, 10, 20, 40, 80]  time: 177.13222789764404
N_partitions: [5, 10, 20, 40, 80]  time: 377.9698736667633
N_partitions: [5, 10, 20, 40, 80]  time: 183.89475536346436
N_partitions: [5, 10, 20, 40, 80]  time: 167.36298441886902
N_partitions: [5, 10, 20, 40, 80]  time: 126.7184362411499
N_partitions: [5, 10, 20, 40, 80]  time: 360.46877455711365
N_partitions: [5, 10, 20, 40, 80]  time: 217.24539518356323
N_partitions: [5, 10, 20, 40, 80]  time: 205.60803699493408
N_partitions: [5, 10, 20, 40, 80]  time: 199.363915681839
N_partitions: [5, 10, 20, 40, 80]  time: 495.6520960330963
N_partitions: [5, 10, 20, 40, 80]  time: 186.55568647384644
N_partitions: [5, 10, 20, 40, 80]  time: 172.03288674354553
N_partitions: [5, 10, 20, 40, 80]  time: 179.8

Unnamed: 0,Ny,Method,Execution Time (s),score
0,5,random,280.765264,0.9751
1,5,Sharp Disc.,474.530854,0.976
2,5,Greedy,304.68677,0.9764
3,5,K-Means,274.207254,0.9763
4,10,random,177.132228,0.9734
5,10,Sharp Disc.,377.969874,0.9723
6,10,Greedy,183.894755,0.9734
7,10,K-Means,167.362984,0.9733
8,20,random,126.718436,0.9698
9,20,Sharp Disc.,360.468775,0.9705
