# Using SVM + Histogram to classify the map

[Nathanaël Perraudin](http://perraudin.info), [Michaël Defferrard](http://deff.ch), Tomasz Kacprzak

In this notebook, we use the full augmented dataset to fit a linear SVM to the histogram of the maps to build a classifier.

A few remarks:
* Kernelized SVM is worse than linear SVM
* This notebook might require a lot of RAM (32 GB)

In [None]:
# Enable the autoreload extension (mode 2) so that edited modules are re-imported
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Display matplotlib figures inline in the notebook.
%matplotlib inline

In [None]:
import os
import shutil
os.environ["CUDA_VISIBLE_DEVICES"]=""

import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import healpy as hp
from tqdm import tqdm_notebook

from scnn import models
from scnn import utils
from scnn.data import LabeledDatasetWithNoise

In [None]:
# Default figure size used by every plot in this notebook.
plt.rcParams['figure.figsize'] = (17, 5)

## 2 Data preparation

In [None]:
# Experiment configuration.
Nside = 1024  # only used in EXP_NAME here; presumably the HEALPix resolution of the maps -- confirm
order = 2  # 1,2,4,8 correspond to 12,48,192,768 parts of the sphere.
sigma_noise = 5  # scale of the Gaussian noise added to the normalized maps
# Name under which the final results are saved.
EXP_NAME = '40sim_{}sides_1arcmin_{}noise_{}order'.format(Nside, sigma_noise, order)
# Directory holding the maps used for training and validation.
data_path = 'data/same_psd/'

### 2.1 Load data

In [None]:
# Load the smoothed maps of both classes.
# np.load returns an NpzFile for .npz archives, which holds the file open until
# it is closed; the context manager releases the handle as soon as the array
# has been read into memory.
with np.load(data_path+'smoothed_class1.npz') as archive:
    ds1 = archive['arr_0']
with np.load(data_path+'smoothed_class2.npz') as archive:
    ds2 = archive['arr_0']

### 2.2 Samples creation

Here we create samples by dividing the complete spheres of the two classes into patches (based on the HEALPix sampling).

In [None]:
# Split every full map with utils.hp_split and stack the resulting pieces,
# one stacked array per class.
datasample = {
    'class1': np.vstack([utils.hp_split(sphere, order=order) for sphere in ds1]),
    'class2': np.vstack([utils.hp_split(sphere, order=order) for sphere in ds2]),
}
# The full maps are not needed any more; free their memory.
del ds1, ds2

print('The data is of shape {}'.format(datasample['class1'].shape))

### 2.3 Normalization and train / test split 

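Let us split the data into training and validation sets. The raw data is stored in `x_raw` and the histograms in `x_trans`. We cannot use the power spectral density (PSD) as a feature transformation here (the data folder is named `same_psd`, i.e. both classes presumably share the same PSD). Hence, we use a histogram of the data instead.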

In [None]:
# Stack both classes (class 1 first) and rescale by the global standard
# deviation of the stacked data.
x_raw = np.concatenate((datasample['class1'], datasample['class2']), axis=0)
x_raw_std = x_raw.std()
x_raw = x_raw / x_raw_std

# Noisy copy of the data; the fixed seed makes the noise reproducible.
rs = np.random.RandomState(0)
x_noise = x_raw + sigma_noise*rs.randn(*x_raw.shape)

# Histogram range, taken from the noise-free normalized data.
cmin = x_raw.min()
cmax = x_raw.max()

# Labels: 0 for the rows coming from class 1, 1 for the rows from class 2.
labels = np.zeros(len(x_raw), dtype=int)
labels[len(datasample['class1']):] = 1

# Shuffle and keep 80% of the samples for training; raw data, noisy data and
# labels are split consistently.
ret = train_test_split(
    x_raw, x_noise, labels,
    train_size=0.8, shuffle=True, random_state=0,
)
(x_raw_train, x_raw_validation,
 x_noise_train, x_noise_validation,
 labels_train, labels_validation) = ret

# Class balance of each split.
print('Class 1 VS class 2')
print('  Training set: {} / {}'.format((labels_train == 0).sum(), (labels_train == 1).sum()))
print('  Validation set: {} / {}'.format((labels_validation == 0).sum(), (labels_validation == 1).sum()))

## Build an augmented dataset

In [None]:
# Dataset built from the noise-free training maps and their labels.
# NOTE(review): presumably LabeledDatasetWithNoise adds fresh noise to each
# batch, with a level between start_level and end_level -- confirm in scnn.data.
# Both levels are set to sigma_noise here.
training = LabeledDatasetWithNoise(x_raw_train, 
                                   labels_train, 
                                   start_level=sigma_noise, 
                                   end_level=sigma_noise)

In [None]:
nloop = 10
ntrain = len(x_raw_train)
N = ntrain * nloop
nbatch = ntrain // 4
it = training.iter(nbatch)

# Draw 4 * nloop batches of nbatch samples from the dataset iterator and keep
# only the histogram of each batch together with its labels.
# Note that labels_train is rebound here to the labels of the drawn batches.
x_trans_train, labels_train = [], []
for step in range(4 * nloop):
    batch, batch_labels = next(it)
    x_trans_train.append(utils.histogram(batch, cmin, cmax))
    labels_train.append(batch_labels)
x_trans_train = np.concatenate(x_trans_train, axis=0)
labels_train = np.concatenate(labels_train, axis=0)

# Standardize the histograms with a single global mean and standard deviation
# (computed over all entries, not per histogram bin).
x_trans_train_mean = x_trans_train.mean()
x_trans_train = x_trans_train - x_trans_train_mean
x_trans_train_std = x_trans_train.std()
x_trans_train = x_trans_train / x_trans_train_std

# Apply the training statistics to the histograms of the noisy validation data.
x_trans_validation = utils.histogram(x_noise_validation, cmin, cmax)
x_trans_validation = (x_trans_validation - x_trans_train_mean) / x_trans_train_std

In [None]:
def err_svc_rbf(x_train, label_train, x_test, label_test):
    """Fit an RBF-kernel SVM and return its training and test error rates.

    Parameters
    ----------
    x_train, label_train
        Features and labels the classifier is fitted on.
    x_test, label_test
        Features and labels the fitted classifier is evaluated on.

    Returns
    -------
    error_train, error_test : float
        Fraction of misclassified samples on the training and on the test data.
    """
    clf = SVC(kernel='rbf')
    clf.fit(x_train, label_train)
    # Count mismatches with a vectorised comparison: this avoids iterating over
    # the array with the builtin sum() and stays a valid error rate whatever
    # the label encoding (for 0/1 labels it equals mean(|pred - label|)).
    error_train = np.mean(clf.predict(x_train) != np.asarray(label_train))
    error_test = np.mean(clf.predict(x_test) != np.asarray(label_test))
    return error_train, error_test
def err_svc_linear(x_train, label_train, x_test, label_test):
    """Fit a linear-kernel SVM and return its training and test error rates.

    Parameters
    ----------
    x_train, label_train
        Features and labels the classifier is fitted on.
    x_test, label_test
        Features and labels the fitted classifier is evaluated on.

    Returns
    -------
    error_train, error_test : float
        Fraction of misclassified samples on the training and on the test data.
    """
    clf = SVC(kernel='linear')
    clf.fit(x_train, label_train)
    # Count mismatches with a vectorised comparison: this avoids iterating over
    # the array with the builtin sum() and stays a valid error rate whatever
    # the label encoding (for 0/1 labels it equals mean(|pred - label|)).
    error_train = np.mean(clf.predict(x_train) != np.asarray(label_train))
    error_test = np.mean(clf.predict(x_test) != np.asarray(label_test))
    return error_train, error_test

In [None]:
nsamples = list(ntrain//12 * np.linspace(1,6,num=6).astype(np.int))
nsamples +=list(ntrain//2 * np.linspace(1,12,num=12).astype(np.int))
err_train = np.zeros(shape=[len(nsamples)])
err_validation = np.zeros(shape=[len(nsamples)])
err_train[:] = np.nan
err_validation[:] = np.nan

for i, n in enumerate(nsamples):
    print('{} Solve it for {} samples'.format(i,n))
    %time err_train[i], err_validation[i] = err_svc_linear(x_trans_train[:n], labels_train[:n] ,x_trans_validation, labels_validation)
    

In [None]:
# Training error (as a fraction, not in percent) versus number of training samples.
plt.plot(nsamples,err_train)

In [None]:
# Validation error in percent versus number of training samples.
plt.plot(nsamples,err_validation*100)

The variations we observe are due to slight overfitting of the validation set. Clearly, we have enough samples to reach the saturation of our classifier. Hence, for the testing set, we simply use all the data.

Computation might take some time.

In [None]:
def get_testing_dataset(order, sigma_noise, x_raw_std, data_path='data/same_psd_testing/'):
    """Load the testing maps and return their noisy patches with labels.

    Parameters
    ----------
    order
        Passed to ``utils.hp_split`` as its ``order`` argument when splitting
        each map.
    sigma_noise
        Scale of the Gaussian noise added to the normalized data.
    x_raw_std
        Value the stacked data is divided by before the noise is added (the
        caller passes the standard deviation computed on the training data).
    data_path
        Directory containing ``smoothed_class1.npz`` and
        ``smoothed_class2.npz``. Defaults to the testing data folder.

    Returns
    -------
    x_noise : ndarray
        Normalized samples with added noise, class 1 rows first.
    labels : ndarray of int
        0 for the rows coming from class 1, 1 for the rows from class 2.
    """
    # Close each archive as soon as its array has been read, so that no file
    # handle stays open.
    with np.load(os.path.join(data_path, 'smoothed_class1.npz')) as archive:
        ds1 = archive['arr_0']
    with np.load(os.path.join(data_path, 'smoothed_class2.npz')) as archive:
        ds2 = archive['arr_0']

    class1 = np.vstack([utils.hp_split(el, order=order) for el in ds1])
    class2 = np.vstack([utils.hp_split(el, order=order) for el in ds2])

    x_raw = np.vstack((class1, class2))
    x_raw = x_raw / x_raw_std  # same normalization as the training data

    # Fixed seed so that the test noise is reproducible.
    rs = np.random.RandomState(1)
    x_noise = x_raw + sigma_noise*rs.randn(*x_raw.shape)

    # Create the label vector.
    labels = np.zeros([x_raw.shape[0]], dtype=int)
    labels[len(class1):] = 1

    return x_noise, labels

In [None]:
# Build the noisy testing set, normalized with the standard deviation of the training data.
x_noise_test, labels_test = get_testing_dataset(order, sigma_noise, x_raw_std)
# Histogram features, standardized with the mean and std of the training histograms.
x_trans_test = (utils.histogram(x_noise_test, cmin, cmax) - x_trans_train_mean) / x_trans_train_std

In [None]:
# Fit a linear SVM on all training histograms and report the validation error.
e_train, e_validation = err_svc_linear(x_trans_train, labels_train ,x_trans_validation, labels_validation)
print('The validation error is {}%'.format(e_validation*100))

In [None]:
# Fit a linear SVM on all training histograms and report the error on the testing set.
# NOTE(review): err_svc_linear fits a new classifier on every call, so this
# repeats the fit on the full training set.
e_train, e_test = err_svc_linear(x_trans_train, labels_train ,x_trans_test, labels_test)
print('The test error is {}%'.format(e_test*100))


In [None]:
# Save the learning curve and the final test error.
# The four entries have different lengths, and NumPy >= 1.24 refuses to build
# such a ragged array implicitly. The object array is therefore built by hand;
# it is still stored under the default key 'arr_0', so the file layout is
# unchanged (loading it requires allow_pickle=True).
results = np.empty(4, dtype=object)
for k, item in enumerate((nsamples, err_train, err_validation, e_test)):
    results[k] = item
np.savez(EXP_NAME, results)