In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import FeatureAgglomeration

In [3]:
from sklearn.svm import SVC

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [5]:
# Display names for the scaling options (used as the row index when plotting);
# the last entry, 'none', stands for "no scaling".
s_names = ['standard', 'min max', 'max abs', 'none']

In [6]:
# Scaling options to compare: one fresh instance of each scaler class,
# followed by None, which means "use the features unscaled".
scalers = [cls() for cls in (StandardScaler, MinMaxScaler, MaxAbsScaler)] + [None]

In [7]:
# Display names for the dimensionality-reduction options (used as the column
# labels when plotting); the last entry, 'none', stands for "no reduction".
d_names = ['pca', 'rand proj gaussian', 'rand proj sparse', 'feature agglomeration', 'none']

In [8]:
# Load the keypoint table. The file has no header row, so column names are
# supplied here: 'label' first, followed by the integers 0..41.
data = pd.read_csv(
    '../data/keypoints.csv',
    names=['label'] + list(np.arange(42)),
    header=None,
)

In [9]:
# Split the table into the target vector and the feature matrix.
# Only the columns named 2..41 are used as features; columns 0 and 1 are not selected.
# NOTE(review): confirm that leaving out columns 0 and 1 is intentional.
y = data['label'].to_numpy()
X = data.loc[:, list(np.arange(2,42))].to_numpy()

In [10]:
# Hold out 25% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

In [11]:
# Single classifier instance that is re-fitted for every preprocessing combination below.
model = SVC(gamma=32, C=1)

In [12]:
# Evaluate every (scaler, n_components, dimension reducer) combination.
#
# For each combination the classifier is re-fitted on the transformed training
# split and evaluated on the transformed test split. Results are appended in
# loop order (scaler outermost, reducer innermost) to:
#   preds  - list of test-set prediction arrays
#   scores - list of test-set accuracies
#
# Fixed seed for the stochastic reducers so that a re-run reproduces the same
# scores (without it the random projections are drawn afresh on every run).
RANDOM_STATE = 42

preds = []
scores = []

for s in scalers:
    if s is not None:
        # fit the scaler on the training split only, then apply it to the test split
        X_train_ = s.fit_transform(X_train)
        X_test_ = s.transform(X_test)
    else: # compare to no scaling
        X_train_ = X_train
        X_test_ = X_test

    for n_comp in range(2,6):
        # Fresh reducer instances for this target dimensionality. The trailing
        # None is the "no reduction" baseline; it does not depend on n_comp but
        # is repeated for every n_comp so the results form a regular grid.
        dr = [
            PCA(n_components=n_comp, random_state=RANDOM_STATE),
            GaussianRandomProjection(n_components=n_comp, random_state=RANDOM_STATE),
            SparseRandomProjection(n_components=n_comp, random_state=RANDOM_STATE),
            FeatureAgglomeration(n_clusters=n_comp),
            None
            ]

        for d in dr:
            if d is not None:
                # fit the reducer on the (scaled) training split only
                X_train_1 = d.fit_transform(X_train_)
                X_test_1 = d.transform(X_test_)
            else:
                X_train_1 = X_train_
                X_test_1 = X_test_

            model.fit(X_train_1, y_train)
            pred = model.predict(X_test_1)
            preds.append(pred)

            # Accuracy computed from the predictions already in hand, instead of
            # calling model.score, which would run the prediction a second time.
            score = np.mean(pred == y_test)
            scores.append(score)

In [13]:
# Arrange the flat score list as a grid indexed by
# (scaler, n_components, dimension reducer).
# The outer and inner axis sizes come from the option lists instead of being
# hard-coded; -1 lets numpy infer the number of n_components values tried.
scores = np.array(scores).reshape(len(scalers), -1, len(d_names))

In [14]:
# Accuracy per scaler (x axis) and dimension reducer (bar colour), with one
# panel for each of the first four reduced dimensionalities.
f,axs=plt.subplots(2,2,sharex=True,sharey=True,figsize=(14,8))
# Panels are filled row by row; index k along the n_components axis of
# `scores` corresponds to k+2 components because the search started at 2.
for k, panel in enumerate(axs.flat):
    pd.DataFrame(scores[:,k,:],index=s_names,columns=d_names).plot.bar(
        ax=panel,title='ndim='+str(k+2),legend=False
    )
# single shared legend, placed outside the plotting area
plt.legend(bbox_to_anchor=(1.05, 1.3))
axs[0,0].set_ylabel('accuracy')
axs[1,0].set_ylabel('accuracy')

Text(0, 0.5, 'accuracy')

In [15]:
# Best accuracy over the reduced dimensionalities for each
# (scaler, dimension reducer) pair; each bar is labelled with the number of
# components that achieved that best accuracy.
ax=pd.DataFrame(scores.max(axis=1),index=s_names,columns=d_names).plot.bar(title='Max Over Reduced Dim',legend=False,figsize=(14,5))
ax.legend(bbox_to_anchor=(1.02,1))
ax.set_ylabel('accuracy')
# argmax gives an index along the n_components axis; +2 converts it to a
# component count because the search started at 2. The transpose makes the
# flattened order reducer-major, which is the order assumed for ax.patches.
# For the 'none' reducer the runs do not depend on n_components, so its label
# carries no information.
for p,i in zip(ax.patches,(scores.argmax(axis=1)+2).T.flatten()):
    # Centre the label on top of its bar. A multiplicative x offset would shift
    # labels by an amount that depends on where the bar sits on the axis.
    ax.annotate(str(i), (p.get_x() + p.get_width() / 2, p.get_height()), ha='center', va='bottom')

In [16]:
# Accuracy for: scaler index 1 (min-max), n_components index 3 (5 components),
# reducer index 2 (sparse random projection).
scores[1, 3, 2]

0.9423013517969008