In [1]:
# import required modules
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras import backend as K
from keras.layers import Dense, Dropout, Activation

from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn import datasets

from scipy import stats

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
iris = datasets.load_iris()

In [3]:
# get Iris data from UCI or from scikit-learn datasets
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
dataframe = pd.read_csv(url,header=None)
dataset = dataframe.values
X = dataset[:,0:4].astype(float)
Y = dataset[:,4]

from sklearn.datasets import load_iris
iris  = load_iris()
df = pd.DataFrame(iris.data)
target = pd.DataFrame(iris.target)

In [4]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [5]:
# run standard SVM classifiers available in scikit-learn
kfold_b1 = KFold(n_splits=10, shuffle=True, random_state=seed)

C = 1.0  # SVM regularization parameter
from sklearn import svm
# SVC with linear kernel
svc = svm.SVC(kernel='linear', C=C)
results_a1 = cross_val_score(svc, X, Y, cv=kfold_b1)
print("Baseline_svm: %.2f%% (%.2f%%)" % (results_a1.mean()*100, results_a1.std()*100))
lin_svc = svm.LinearSVC(C=C)
results_a1 = cross_val_score(lin_svc, X, Y, cv=kfold_b1)
print("Baseline_Lin_svm: %.2f%% (%.2f%%)" % (results_a1.mean()*100, results_a1.std()*100))
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C)
results_a1 = cross_val_score(rbf_svc, X, Y, cv=kfold_b1)
print("Baseline_rbf_svm: %.2f%% (%.2f%%)" % (results_a1.mean()*100, results_a1.std()*100))
poly_svc = svm.SVC(kernel='poly', degree=3, C=C)
results_a1 = cross_val_score(poly_svc, X, Y, cv=kfold_b1)
print("Baseline_poly_svm: %.2f%% (%.2f%%)" % (results_a1.mean()*100, results_a1.std()*100))

Baseline_svm: 97.33% (3.27%)
Baseline_Lin_svm: 93.33% (8.94%)
Baseline_rbf_svm: 97.33% (4.42%)
Baseline_poly_svm: 94.00% (6.96%)


In [6]:
# define baseline model (non-linear logistic regression with DNNs)
def baseline_model_logistic():
    # create model
    model = Sequential()
    model.add(Dense(8, input_dim=4, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(16, input_dim=8, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(3, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [7]:
# map data by non-linear transforms
# non-linear maps approximate the kernels
d = 150
X.shape
from sklearn.kernel_approximation import (RBFSampler,Nystroem)
feature_map_fourier = RBFSampler(gamma=.2, random_state=1,n_components=d)
feature_map_nystroem = Nystroem(gamma=.2, random_state=1,n_components=d)

X_rbf = feature_map_fourier.fit_transform(X)
X_nys = feature_map_nystroem.fit_transform(X)

# LinearSVC (linear kernel)
lin_svc_rbf = svm.LinearSVC(C=C).fit(X_rbf, Y)
print("rbf map SVC: " + str(lin_svc_rbf.score(X_rbf, Y)))
# LinearSVC (linear kernel)
lin_svc_nys = svm.LinearSVC(C=C).fit(X_nys, Y)
print("nys map SVC: " + str(lin_svc_nys.score(X_nys, Y)))

rbf map SVC: 0.986666666667
nys map SVC: 0.986666666667


In [8]:
# define baseline model
# this is a single layer feedforward network (SLFN)
def baseline_model_RKS():
    # create model
    model = Sequential()
    model.add(Dense(3, input_dim=150, activation='softmax'))
    #model.add(Dropout(0.2))
    # Compile model
    # model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [9]:
# fit SLFNs with non-linear feature maps
estimator_b1 = KerasClassifier(build_fn=baseline_model_logistic, epochs=200, batch_size=5, verbose=0)
estimator_b2 = KerasClassifier(build_fn=baseline_model_RKS, epochs=200, batch_size=50, verbose=0)
estimator_b3 = KerasClassifier(build_fn=baseline_model_RKS, epochs=200, batch_size=50, verbose=0)

kfold_b1 = KFold(n_splits=10, shuffle=True, random_state=seed)
results_b1 = cross_val_score(estimator_b1, X, dummy_y, cv=kfold_b1)
print("Baseline_logistic: %.2f%% (%.2f%%)" % (results_b1.mean()*100, results_b1.std()*100))

kfold_b2 = KFold(n_splits=10, shuffle=True, random_state=seed)
results_b2 = cross_val_score(estimator_b2, X_rbf, dummy_y, cv=kfold_b2)
print("rks_rbf: %.2f%% (%.2f%%)" % (results_b2.mean()*100, results_b2.std()*100))

kfold_b3 = KFold(n_splits=10, shuffle=True, random_state=seed)
results_b3 = cross_val_score(estimator_b3, X_nys, dummy_y, cv=kfold_b3)
print("rks_nys: %.2f%% (%.2f%%)" % (results_b3.mean()*100, results_b3.std()*100))

Baseline_logistic: 97.33% (3.27%)
rks_rbf: 94.67% (7.77%)
rks_nys: 95.33% (6.70%)


In [10]:
# vanilla SVMs with errors measured on entire dataset
C = 1.0  # SVM regularization parameter
from sklearn import svm
# SVC with linear kernel
svc = svm.SVC(kernel='linear', C=C).fit(X, Y)
# LinearSVC (linear kernel)
lin_svc = svm.LinearSVC(C=C).fit(X, Y)
# SVC with RBF kernel
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, Y)
# SVC with polynomial (degree 3) kernel
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, Y)
print("linear SVC: " + str(svc.score(X, Y)))
print("rbf SVC: " + str(rbf_svc.score(X, Y)))
print("poly SVC: " + str(poly_svc.score(X, Y)))

linear SVC: 0.993333333333
rbf SVC: 0.98
poly SVC: 0.98


Setting up DNNs with "cosine" activation functions to mimic Kernel Approximations

In [11]:
# just to scale feature to [-1,1] for improved stability
from sklearn.datasets import load_iris
iris  = load_iris()
df = pd.DataFrame(iris.data)
target = pd.DataFrame(iris.target)

# standardize features (rescale to -1 to 1)
df=(df-df.min())/(df.max()-df.min())
df = (2*df-1)
print(df.describe())

                0           1           2           3
count  150.000000  150.000000  150.000000  150.000000
mean    -0.142593   -0.121667   -0.064859   -0.084444
std      0.460037    0.361329    0.598109    0.635967
min     -1.000000   -1.000000   -1.000000   -1.000000
25%     -0.555556   -0.333333   -0.796610   -0.833333
50%     -0.166667   -0.166667    0.135593    0.000000
75%      0.166667    0.083333    0.389831    0.416667
max      1.000000    1.000000    1.000000    1.000000


In [12]:
# see how to access weights and biases of a layer in keras
# test the weight sizes of a known model
model = Sequential()
model.add(Dense(8, input_dim=4, activation='relu',name="h1"))
model.add(Dense(16, input_dim=8, activation='relu',name="h2"))
model.add(Dense(3, activation='softmax',name="out"))
# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(X, dummy_y, epochs=150, batch_size=10,verbose=0)
w1 = model.layers[0].get_weights()[0]
b1 = model.layers[0].get_weights()[1]
w2 = model.layers[1].get_weights()[0]
b2  = model.layers[1].get_weights()[1]
print(w1.shape)
print(b1.shape)
print(w2.shape)
print(b2.shape)

(4, 8)
(8,)
(8, 16)
(16,)


In [13]:
# test random kitchen sink idea
# sample bias(b) uniformly on (0,2pi)and weights(w) on normal
# feature map = cos(wx+b)
from math import pi
b1 = np.random.uniform(low=0.0, high=pi, size=[150,])
w1 = np.random.normal(loc=0.0, scale=1.0,size=[4,150])
weights = [w1,b1]
X_rks = np.cos(np.matmul(X,w1)+b1)
print(weights[0].shape)
print(weights[1].shape)
print(X_rks.shape)

lin_svc = svm.LinearSVC(C=C).fit(X_rks, Y)
print("linear SVC with RKS: " + str(lin_svc.score(X_rks, Y)))
Yhat = lin_svc.predict(X_rks)
encoded_Yhat = encoder.fit_transform(Yhat)
# make sure that weights given and accessed are same (almost)
print(stats.describe(encoded_Yhat-encoded_Y))

(4, 150)
(150,)
(150, 150)
linear SVC with RKS: 0.993333333333
DescribeResult(nobs=150, minmax=(0, 1), mean=0.0066666666666666671, variance=0.0066666666666666619, skewness=12.124632423681811, kurtosis=145.00671140939625)


set DNN where first layer is frozen and initialized with random_kitchen_sink weights and activation is cosine

In [14]:
# define a custom activaiton function
def cos_rks(x):
    return K.cos(x)
# define a custom DNN where first layer gives non-linear maps
# second layer simply weights them linearly
def rks_logistic():
    model = Sequential()
    model.add(Dense(150, input_dim=4, activation=cos_rks,name="rks",trainable=False))
    model.add(Dense(3, input_dim=150, activation='softmax',name="out"))
    model.layers[0].set_weights(weights)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# Fit the model
# model.fit(X, dummy_y, epochs=10, batch_size=10,verbose=0)

estimator_rks = KerasClassifier(build_fn=baseline_model_logistic, epochs=50, batch_size=20, verbose=0)
kfold_b1 = KFold(n_splits=10, shuffle=True, random_state=seed)
results_b1 = cross_val_score(estimator_rks, X, dummy_y, cv=kfold_b1)
print("rks_with_logistic: %.2f%% (%.2f%%)" % (results_b1.mean()*100, results_b1.std()*100))

#w1r = model.layers[0].get_weights()[0]
#b1r = model.layers[0].get_weights()[1]
#w2r = model.layers[1].get_weights()[0]
#b2r  = model.layers[1].get_weights()[1]
#print(w1r.shape)
#print(b1r.shape)
#print(w2r.shape)
#print(b2r.shape)

rks_with_logistic: 90.00% (7.45%)
