<a href="https://colab.research.google.com/github/fagonzalezo/sklearn-kdcrf/blob/master/examples/approximation_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup code

In [4]:
# Install kdrcf if running in Google Colab

try:
  import google.colab
  IN_COLAB = True
  
except:
  IN_COLAB = False

if IN_COLAB:
    
    !git clone https://github.com/fagonzalezo/sklearn-kdcrf.git
    !mv sklearn-kdcrf/kdcrf .
    

In [5]:
%matplotlib inline
import gzip

import pandas as pd
import pylab as pl


In [None]:
##exact kernel

In [6]:
##approximate kernel RFF

from sklearn.model_selection import train_test_split

import numpy as np
from kdcrf import KDClassifierRF
from kdcrf import RBFSamplerORF
from sklearn import datasets, svm
# Import datasets, classifiers and performance metrics
!pip install wget
import wget
from sklearn.preprocessing import MinMaxScaler

import h5py



In [27]:
def classify(data_train, targets_train, data_test, targets_test, gammas):
    
    scores = {}
    
    classifiers = {#'svm':('gamma', svm.SVC()), 
                   'kdc exact':('gamma', KDClassifierRF(approx='exact')),
                   'lrff+ 2000':('gamma', KDClassifierRF(approx='lrff+', 
                                                        n_components=data_train.shape[1]*4,
                                                        random_state=1)),            
                   'dmrff 2000':('gamma', KDClassifierRF(approx='dmrff', 
                                                        n_components=data_train.shape[1]*4,
                                                        random_state=1)),            
                   'dmorf 2000':('gamma', KDClassifierRF(approx='dmrff', 
                                                        n_components=data_train.shape[1]*4,
                                                        random_state=1, 
                                                        sampler=RBFSamplerORF(n_components=data_train.shape[1]*4, random_state=1))),            
                   'lrff+ orf 2000':('gamma', KDClassifierRF(approx='lrff+', 
                                                        n_components=data_train.shape[1]*4,
                                                        random_state=1, 
                                                        sampler=RBFSamplerORF(n_components=data_train.shape[1]*4, random_state=1))),
    
    }
    
    for clfn in classifiers.keys():
        scores[clfn] = []
    
    for gamma in gammas:
        print('gamma:', gamma,' ',end='')
        for clfn, (gname, clf) in classifiers.items():
            print('clfn:', clfn)
            clf.set_params(**{gname:gamma})
            clf.fit(data_train, targets_train)
            scores[clfn].append(clf.score(data_test, targets_test))
            
    return classifiers, scores

## Kernel Density Classification  for letters

In [None]:
## https://archive.ics.uci.edu/ml/datasets/Letter+Recognition
letter = wget.download("https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data")

In [None]:

letters = pd.read_csv("letter-recognition.data", header=None)
print(letters.head())
print(letters.describe())

vector = letters.values[:,1:]
labels = letters.values[:,0]

X_train, X_test, y_train, y_test = train_test_split(vector, labels, test_size=0.3, random_state=42)

In [None]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
gammas = [2**i for i in range(-7,9)]
classifiers, scores = classify(X_train, y_train, X_test, y_test, gammas)
pl.rcParams["figure.figsize"] = (15,8)

for clfn in classifiers.keys(): 
    pl.plot(np.arange(len(gammas)), scores[clfn], label=clfn)
pl.axes().set_xticks(np.arange(len(gammas)))
pl.axes().set_xticklabels(gammas)
pl.setp(pl.axes().get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
pl.legend()



In [None]:
with h5py.File("usps.h5", 'r') as hf:
        train = hf.get('train')
        X_tr = train.get('data')[:]
        y_tr = train.get('target')[:]
        test = hf.get('test')
        X_te = test.get('data')[:]
        y_te = test.get('target')[:]

In [None]:
scaler = MinMaxScaler()
scaler.fit(X_tr)
X_tr = scaler.transform(X_tr)
X_te = scaler.transform(X_te)


In [None]:
gammas = [2**i for i in range(-7,4)]
classifiers, scores = classify(X_tr, y_tr, X_te, y_te, gammas)
pl.rcParams["figure.figsize"] = (15,8)

for clfn in classifiers.keys(): 
    pl.plot(np.arange(len(gammas)), scores[clfn], label=clfn)
pl.axes().set_xticks(np.arange(len(gammas)))
pl.axes().set_xticklabels(gammas)
pl.setp(pl.axes().get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
pl.legend()



In [None]:
# Import datasets, classifiers and performance metrics


# The digits dataset
digits = datasets.load_digits(n_class=9)

n_samples = len(digits.data)
data = digits.data / 16.
data -= data.mean(axis=0)

# We learn the digits on the first half of the digits
data_train, targets_train = (data[:n_samples // 2],
                             digits.target[:n_samples // 2])


# Now predict the value of the digit on the second half:
data_test, targets_test = (data[n_samples // 2:],
                           digits.target[n_samples // 2:])

In [None]:
gammas = [2**i for i in range(-7,4)]
classifiers, scores = classify(data_train, targets_train, data_test, targets_test, gammas)
pl.rcParams["figure.figsize"] = (15,8)

for clfn in classifiers.keys(): 
    pl.plot(np.arange(len(gammas)), scores[clfn], label=clfn)
pl.axes().set_xticks(np.arange(len(gammas)))
pl.axes().set_xticklabels(gammas)
pl.setp(pl.axes().get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
pl.legend()


## Moon Database

In [None]:
from sklearn.datasets import  make_moons
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=1000, noise=0.2, random_state=0)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
#y = y[:, np.newaxis]
X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

In [None]:

gammas = [2**i for i in range(-7,10)]
classifiers, scores = classify(X_train, y_train, X_test, y_test, gammas)
pl.rcParams["figure.figsize"] = (15,8)

for clfn in classifiers.keys(): 
    pl.plot(np.arange(len(gammas)), scores[clfn], label=clfn)
pl.axes().set_xticks(np.arange(len(gammas)))
pl.axes().set_xticklabels(gammas)
pl.setp(pl.axes().get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
pl.legend()


## Forest database

In [None]:
forest = wget.download("http://archive.ics.uci.edu/ml//machine-learning-databases/covtype/covtype.data.gz")

In [None]:
dataset = pd.read_csv('covtype.data.gz', nrows=100, compression='gzip',
                   error_bad_lines=False)

dataset = dataset.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
        dataset[:,:-1], dataset[:, -1], test_size=0.33, random_state=42)


In [None]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:

gammas = [2**i for i in range(-9,4)]
classifiers, scores = classify(X_train, y_train, X_test, y_test, gammas)
pl.rcParams["figure.figsize"] = (15,8)

for clfn in classifiers.keys(): 
    pl.plot(np.arange(len(gammas)), scores[clfn], label=clfn)
pl.axes().set_xticks(np.arange(len(gammas)))
pl.axes().set_xticklabels(gammas)
pl.setp(pl.axes().get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
pl.legend()



## Cifar database


In [None]:
from examples.load_cifar_10 import cifar10

train_images, train_labels, test_images, test_labels = cifar10(is_one_hot=False)

In [None]:
train_images.shape

In [None]:
random_train = np.random.choice(range(train_images.shape[0]), 10000, replace=False)
random_test = np.random.choice(range(test_images.shape[0]), 5000, replace=False)


train_images = train_images[random_train,:]
train_labels = train_labels[random_train]
test_images = test_images[random_test,:]
test_labels = test_labels[random_test]


In [None]:
scaler = MinMaxScaler()
scaler.fit(train_images)
train_images = scaler.transform(train_images)
test_images = scaler.transform(test_images)


In [None]:
gammas = [2**i for i in range(-8,8)]
classifiers, scores = classify(train_images, train_labels, test_images, test_labels, gammas)
pl.rcParams["figure.figsize"] = (15,8)

for clfn in classifiers.keys(): 
    pl.plot(np.arange(len(gammas)), scores[clfn], label=clfn)
pl.axes().set_xticks(np.arange(len(gammas)))
pl.axes().set_xticklabels(gammas)
pl.setp(pl.axes().get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
pl.legend()



## MNIST

In [None]:
from requests import get

def download_file(url, file_name):
    with open(file_name, "wb") as file:
        response = get(url)
        file.write(response.content)
        

In [None]:
download_file('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'train-images-idx3-ubyte.gz')
download_file('http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', 'train-labels-idx1-ubyte.gz')
download_file('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 't10k-images-idx3-ubyte.gz')
download_file('http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', 't10k-labels-idx1-ubyte.gz')

In [None]:
def read_mnist(images_path: str, labels_path: str):
    with gzip.open(labels_path, 'rb') as labelsFile:
        labels = np.frombuffer(labelsFile.read(), dtype=np.uint8, offset=8)

    with gzip.open(images_path,'rb') as imagesFile:
        length = len(labels)
        # Load flat 28x28 px images (784 px), and convert them to 28x28 px
        features = np.frombuffer(imagesFile.read(), dtype=np.uint8, offset=16) \
                        .reshape(length, 784) 
        
    return features, labels

train = {}
test = {}

train['features'], train['labels'] = read_mnist('train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz')
test['features'], test['labels'] = read_mnist('t10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz')

In [None]:
random_train = np.random.choice(range(train['features'].shape[0]), 10000, replace=False)
random_test = np.random.choice(range(test['features'].shape[0]), 10000, replace=False)

In [None]:
train_images = train['features'][random_train,:]
train_labels = train['labels'][random_train]
test_images = test['features'][random_test,:]
test_labels = test['labels'][random_test]


In [None]:
scaler = MinMaxScaler()
scaler.fit(train_images)
train_images = scaler.transform(train_images)
test_images = scaler.transform(test_images)

In [None]:
train_labels

In [None]:
gammas = [2**i for i in range(-8,8)]
classifiers, scores = classify(train_images, train_labels, test_images, test_labels, gammas)
pl.rcParams["figure.figsize"] = (15,8)

for clfn in classifiers.keys(): 
    pl.plot(np.arange(len(gammas)), scores[clfn], label=clfn)
pl.axes().set_xticks(np.arange(len(gammas)))
pl.axes().set_xticklabels(gammas)
pl.setp(pl.axes().get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
pl.legend()



##Gisette

In [None]:

wget.download("https://archive.ics.uci.edu/ml/machine-learning-databases/gisette/GISETTE/gisette_train.data")
wget.download("https://archive.ics.uci.edu/ml/machine-learning-databases/gisette/GISETTE/gisette_train.labels")
wget.download("https://archive.ics.uci.edu/ml/machine-learning-databases/gisette/GISETTE/gisette_valid.data")
wget.download("https://archive.ics.uci.edu/ml/machine-learning-databases/gisette/gisette_valid.labels")

In [None]:

train_data = pd.read_csv("gisette_train.data", header=None, sep=" ")
train_labels = pd.read_csv("gisette_train.labels", header=None, sep=" ")
test_data = pd.read_csv("gisette_valid.data", header=None, sep=" ")
test_labels = pd.read_csv("gisette_valid.labels", header=None, sep=" ")
print(train_data.head())
print(train_data.describe())

In [None]:
scaler = MinMaxScaler()
scaler.fit(train_data)
train_data = scaler.transform(train_data)
test_data = scaler.transform(test_data)

In [None]:
gammas = [2**i for i in range(-3,16)]
classifiers, scores = classify(train_data, train_labels, test_data, test_labels, gammas)
pl.rcParams["figure.figsize"] = (15,8)

for clfn in classifiers.keys(): 
    pl.plot(np.arange(len(gammas)), scores[clfn], label=clfn)
pl.axes().set_xticks(np.arange(len(gammas)))
pl.axes().set_xticklabels(gammas)
pl.setp(pl.axes().get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
pl.legend()



# Extract Features from cifar with BIT - GOOGLE

In [10]:
#@title Imports
import tensorflow as tf
import tensorflow_hub as hub

import tensorflow_datasets as tfds

import time

from PIL import Image
import requests
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np

import os




In [11]:
# Load model into KerasLayer
model_url = "https://tfhub.dev/google/bit/m-r101x3/1"
module = hub.KerasLayer(model_url)


In [12]:
def preprocess_image(image):
  image = np.array(image)
  # reshape into shape [batch_size, height, width, num_channels]
  img_reshaped = tf.reshape(image, [1, image.shape[0], image.shape[1], image.shape[2]])
  # Use `convert_image_dtype` to convert to floats in the [0,1] range.
  image = tf.image.convert_image_dtype(img_reshaped, tf.float32)  
  return image

def preprocess_batch_images(image):
  image = np.array(image)
  # reshape into shape [batch_size, height, width, num_channels]
  img_reshaped = tf.reshape(image, [image.shape[0], image.shape[1], image.shape[2], image.shape[3]])
  # Use `convert_image_dtype` to convert to floats in the [0,1] range.
  image = tf.image.convert_image_dtype(img_reshaped, tf.float32)  
  return image

In [13]:
module.build([None, 32, 32, 3])

In [14]:
module.get_weights()

[array([[[[ 8.43182113e-03, -2.28414889e-02,  2.59402464e-03, ...,
            1.16767373e-03,  3.36902356e-03, -1.58912167e-02],
          [-1.19150477e-03,  7.35813240e-03,  5.53361373e-03, ...,
            9.48418688e-04, -7.30680209e-03, -6.31890818e-03],
          [ 3.68011091e-03,  1.48907648e-02, -2.69974303e-03, ...,
           -3.20098433e-03,  3.88700305e-03,  2.11484674e-02]],
 
         [[-4.79558017e-03,  4.49497961e-02, -4.81343921e-03, ...,
           -1.50131842e-03, -1.14646647e-03, -2.35074176e-03],
          [ 1.78922049e-03, -2.94188466e-02, -4.62803012e-03, ...,
           -1.35786051e-03,  2.92567816e-03, -9.30050295e-03],
          [ 1.94709946e-03, -1.44465277e-02, -6.83916360e-03, ...,
           -1.70795573e-03, -1.33966441e-05,  1.20829809e-02]],
 
         [[-1.00813070e-02, -5.41460216e-02, -2.16419878e-03, ...,
           -7.75655406e-03, -5.83050167e-03,  9.17073861e-02],
          [ 6.90415560e-04,  3.12203094e-02, -5.29428991e-03, ...,
           -7.381

In [15]:

reshape_images_train = preprocess_batch_images(np.reshape(train_images, (train_images.shape[0], 32,32,3)))

features_images_train = module(reshape_images_train)

In [16]:
features_images_train.shape


TensorShape([50000, 6144])

In [17]:

reshape_images_test = preprocess_batch_images(np.reshape(test_images, (test_images.shape[0], 32,32,3)))

features_images_test = module(reshape_images_test)

In [18]:
features_images_test.shape

TensorShape([10000, 6144])

In [26]:
from sklearn.linear_model import RidgeClassifier
ridge_classifier = RidgeClassifier()
ridge_classifier.fit(features_images_train, train_labels)
print(ridge_classifier.score(features_images_test, test_labels))


  overwrite_a=True).T


0.4209


In [None]:
gammas = [2**i for i in range(-15,15)]
classifiers, scores = classify(features_images_train, train_labels, features_images_test, test_labels, gammas)
pl.rcParams["figure.figsize"] = (15,8)

for clfn in classifiers.keys(): 
    pl.plot(np.arange(len(gammas)), scores[clfn], label=clfn)
pl.axes().set_xticks(np.arange(len(gammas)))
pl.axes().set_xticklabels(gammas)
pl.setp(pl.axes().get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
pl.legend()




gamma: 3.0517578125e-05  clfn: kdc exact
clfn: lrff+ 2000
clfn: dmrff 2000




In [26]:

module = hub.KerasLayer("https://tfhub.dev/google/imagenet/resnet_v2_152/feature_vector/4",
                   trainable=False)
features = module(reshape_images_train)   # A batch with shape [batch_size, num_features].
