In [1]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import *
from tqdm import tqdm_notebook as tqdm

In [2]:
X = np.load('features.npy')
y = np.load('targets.npy')
print('X shape:', X.shape)
print('y shape:', y.shape)

def l2_normalized(X):
    norms = np.linalg.norm(X, axis=1).reshape(-1,1)
    return X/norms

X shape: (326, 4556)
y shape: (326,)


In [16]:
def performance(X, y):
    X = l2_normalized(X)
    model = SVC(C=5, kernel='rbf', gamma='scale')
    model.fit(X, y)
    return model.score(X, y)

def cvPerf(X,y):
    X = l2_normalized(X)
    model = SVC(C=5, kernel='rbf', gamma='scale')
    N_SPLITS = 2
    shuffler = StratifiedShuffleSplit(test_size=0.4, n_splits=N_SPLITS)
    acc = np.empty(N_SPLITS)
    for i, (train_index, test_index) in enumerate(shuffler.split(X, y)):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        model.fit(X_train, y_train)
        acc[i] = model.score(X_test, y_test)
    return acc.max()

In [12]:
# perform a greedy search on features
finalized_features = []
n = X.shape[0]
benchmark = 0
while True:
    # iterate over features that are not finalized
    acc = []
    for index in tqdm(range(n)):
        if index not in finalized_features:
            # get the accuracy of adding index as a feature
            features = finalized_features + [index]
            acc.append( (index, performance(X[:, features], y)) )
    # we have index vs acc, for every index not in finalized features
    optimal_index, optimal_acc = max(acc, key=lambda x: x[1])
    if optimal_acc > benchmark:
        benchmark = optimal_acc
        finalized_features.append(optimal_index)
        print(f"Accuracy: {finalized_features} to {optimal_acc}")
    else:
        print("Finished with benchmark of", benchmark, f"optim accuracy: {optimal_acc}")
        break        

HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47] to 0.29141104294478526


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225] to 0.3282208588957055


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272] to 0.38650306748466257


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272, 197] to 0.4447852760736196


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272, 197, 302] to 0.5306748466257669


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272, 197, 302, 138] to 0.588957055214724


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272, 197, 302, 138, 29] to 0.6441717791411042


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272, 197, 302, 138, 29, 264] to 0.6901840490797546


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272, 197, 302, 138, 29, 264, 56] to 0.7269938650306749


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272, 197, 302, 138, 29, 264, 56, 260] to 0.7760736196319018


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272, 197, 302, 138, 29, 264, 56, 260, 42] to 0.8067484662576687


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272, 197, 302, 138, 29, 264, 56, 260, 42, 36] to 0.8312883435582822


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272, 197, 302, 138, 29, 264, 56, 260, 42, 36, 77] to 0.8496932515337423


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272, 197, 302, 138, 29, 264, 56, 260, 42, 36, 77, 32] to 0.8742331288343558


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Accuracy: [47, 225, 272, 197, 302, 138, 29, 264, 56, 260, 42, 36, 77, 32, 103] to 0.8834355828220859


HBox(children=(IntProgress(value=0, max=326), HTML(value='')))


Finished with benchmark of 0.8834355828220859 optim accuracy: 0.8834355828220859


In [17]:
print(finalized_features, cvPerf(X[:,finalized_features], y))

[47, 225, 272, 197, 302, 138, 29, 264, 56, 260, 42, 36, 77, 32, 103] 0.21374045801526717


In [6]:
# [47, 225, 272, 197, 302, 138, 29, 264, 56, 260, 42, 36, 77, 32, 103] 0.8834355828220859

0.8834355828220859
