# SVM
Supervised learning method

### Classification code

In [298]:
import h5py
import toyplot
import numpy as np
import pandas as pd

### Load in databases from a simulation run

In [299]:
# load in the counts data
with h5py.File("../databases/tr5-t5-r2-s1000.counts.h5") as io5:
    counts = io5["counts"][:]
    
# make a flattened array for models that are non-spatial    
fcounts = np.array([i.flatten() for i in counts[:]])

# print data shape
print(counts.shape)
print(fcounts.shape)

(320, 5, 16, 16)
(320, 1280)


In [300]:
# load in the features database
with h5py.File("../databases/tr5-t5-r2-s1000.labels.h5") as io5:
    df = pd.DataFrame({
        "theta": io5["thetas"][:],
        "asource": io5["admix_sources"][:].flatten(),
        "atarget": io5["admix_targets"][:].flatten(),
        "atimes": io5["admix_times"][:].flatten(),
        "aprops": io5["admix_props"][:].flatten(),
    })
    
# head of dataframe
df.sample(10)

Unnamed: 0,theta,asource,atarget,atimes,aprops
91,0.183463,3,6,1.875,0.355433
79,0.581656,7,4,2.625,0.115908
32,0.746617,4,2,0.75,0.141993
101,0.809721,3,2,0.75,0.060375
249,0.876769,1,4,0.375,0.352441
234,0.174951,5,2,1.125,0.221935
54,0.103757,4,1,0.375,0.013617
229,0.694997,5,3,1.125,0.390672
46,0.299389,4,5,1.125,0.096493
140,0.261816,6,4,1.875,0.478644


## Unsupervised learning to get features

NMF

In [301]:
from sklearn import svm
X = [[0, 0], [1, 1]]
y = [0, 1]
clf = svm.SVC(gamma='scale')
clf.fit(X, y)  



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [302]:
clf.predict([[2., 2.]])

array([1])

In [327]:
X = fcounts
y = pd.Categorical([(i, j) for (i, j) in zip(df.asource, df.atarget)]).codes

In [328]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

In [336]:
clf = svm.SVC(gamma="auto").fit(X_train, y_train)
clf.predict(X_test)

array([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], dtype=int8)

In [337]:
y_test

array([11, 26, 24, 19, 25, 17, 12, 30,  7, 27, 18, 31, 16,  7, 10, 17,  8,
       31, 20, 15,  8, 28,  9, 13, 23, 28,  2, 21, 31, 12, 31, 25, 11, 29,
        1, 31, 10,  3, 27, 16, 25,  9,  3,  3, 23, 25,  5,  9, 27, 10, 20,
       18,  3, 23, 22, 12,  5, 19, 24, 17, 28, 30,  1,  6, 13,  0, 26,  0,
       26, 14, 15, 15,  6, 29,  3, 21,  5,  9, 30, 28], dtype=int8)

In [227]:
df.loc[(df.asource==4) & (df.atarget==7), "color"] = 3


In [297]:
c, a, m = toyplot.scatterplot(
    tcounts[0], 
    tcounts[1],
    width=400, 
    height=350,
    opacity=0.5,
    size=8,
    #color=[toyplot.color.Palette()[i] for i in df["color"]],
);

### PCA
"relative pixel position is ignored by this model"

In [232]:
from __future__ import print_function

from time import time
import logging
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.svm import SVC


print(__doc__)

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')


# #############################################################################
# Download the data, if not already on disk and load it as numpy arrays

lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = lfw_people.images.shape

# for machine learning we use the 2 data directly (as relative pixel
# positions info is ignored by this model)
X = lfw_people.data
n_features = X.shape[1]

# the label to predict is the id of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)

Downloading LFW metadata: https://ndownloader.figshare.com/files/5976012
2019-03-17 02:25:10,437 Downloading LFW metadata: https://ndownloader.figshare.com/files/5976012


Automatically created module for IPython interactive environment


Downloading LFW metadata: https://ndownloader.figshare.com/files/5976009
2019-03-17 02:25:11,202 Downloading LFW metadata: https://ndownloader.figshare.com/files/5976009
Downloading LFW metadata: https://ndownloader.figshare.com/files/5976006
2019-03-17 02:25:11,885 Downloading LFW metadata: https://ndownloader.figshare.com/files/5976006
Downloading LFW data (~200MB): https://ndownloader.figshare.com/files/5976015
2019-03-17 02:25:12,742 Downloading LFW data (~200MB): https://ndownloader.figshare.com/files/5976015


Total dataset size:
n_samples: 1288
n_features: 1850
n_classes: 7


In [250]:
# split into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)


In [251]:
lfw_people.keys()

dict_keys(['data', 'images', 'target', 'target_names', 'DESCR'])

In [252]:
lfw_people['images'].shape

(1288, 50, 37)

In [257]:
pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True).fit(X_train)

eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))


Projecting the input data on the eigenfaces orthonormal basis
done in 0.007s


In [258]:

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                   param_grid, cv=5)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)


Fitting the classifier to the training set
done in 95.072s
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)




In [259]:
clf

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [260]:

print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))



Predicting people's names on the test set
done in 0.025s
                   precision    recall  f1-score   support

     Ariel Sharon       0.10      0.15      0.12        13
     Colin Powell       0.43      0.53      0.48        60
  Donald Rumsfeld       0.26      0.33      0.30        27
    George W Bush       0.66      0.58      0.62       146
Gerhard Schroeder       0.17      0.20      0.18        25
      Hugo Chavez       0.25      0.13      0.17        15
       Tony Blair       0.50      0.39      0.44        36

        micro avg       0.46      0.46      0.46       322
        macro avg       0.34      0.33      0.33       322
     weighted avg       0.49      0.46      0.47       322

[[ 2  6  1  3  1  0  0]
 [ 8 32  2 10  4  1  3]
 [ 2  9  9  5  1  0  1]
 [ 7 21 10 84 14  3  7]
 [ 0  1  4 10  5  2  3]
 [ 1  2  2  8  0  2  0]
 [ 1  3  6  7  5  0 14]]
