In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import linalg as LA
from sklearn import decomposition
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn import svm

# Load data - 150 observations, 4 features, 3 classes, 
df = pd.read_csv("wine_dataset5.csv", header=None)
print(df.describe())
data = df.values


                0           1           2           3           4
count  178.000000  178.000000  178.000000  178.000000  178.000000
mean    13.000618   19.494944    2.029270    5.058090    1.938202
std      0.811827    3.339564    0.998859    2.318286    0.775035
min     11.030000   10.600000    0.340000    1.280000    1.000000
25%     12.362500   17.200000    1.205000    3.220000    1.000000
50%     13.050000   19.500000    2.135000    4.690000    2.000000
75%     13.677500   21.500000    2.875000    6.200000    3.000000
max     14.830000   30.000000    5.080000   13.000000    3.000000


In [0]:
## Setup

# Shuffle data randomly
shuffled_data = data;
np.random.shuffle(shuffled_data)
X_df = shuffled_data[:,0:4]
y = shuffled_data[:,4]   # labels

# Zero out mean of data
M = np.mean(X_df, axis=0)  # column mean
X = X_df - M

In [0]:
# Perform PCA
pca = decomposition.PCA(n_components=4).fit(X)
X_dim = pca.transform(X)   # principal components
p_axis = pca.components_   # principal axis
p_axisT = np.transpose(p_axis)
#print(pca.singular_values_)

In [0]:
# Evaluate reconstruction error
Num_PC = [4, 3, 2, 1]  # number of PC's to used: 1, 2, 3, 4 use 1,2,3,4 features

for k in Num_PC:
  Xhat = np.matmul(X_dim[:,0:k], p_axisT[0:k,:])
  print('Reconstruction error = {0:0.4f} with {1:1d} PCs'.format((np.linalg.norm(X) - np.linalg.norm(Xhat)), k))

Reconstruction error = -0.0000 with 4 PCs
Reconstruction error = 0.4985 with 3 PCs
Reconstruction error = 1.9231 with 2 PCs
Reconstruction error = 11.8974 with 1 PCs


In [0]:
'''
Determine classification accuracy using SVM classifier with 10-fold cross validation
''' 

Num_PC_arr = [4, 1]
model = svm.SVC(kernel='linear', C=1).fit(Xhat, y)
print('Accuracy comparison using principal components vs original data:')

for k in Num_PC_arr:
  scores1 = cross_val_score(model, X_dim[:,0:k], y, cv=10)
  scores2 = cross_val_score(model, X[:,0:k], y, cv=10)
  print('{0:0.4f} with {1:1d} PCs vs {2:0.4f} with {3:1d} features'.format(scores1.mean(), k, scores2.mean(), k))

Accuracy comparison using principal components vs original data:
0.9368 with 4 PCs vs 0.9368 with 4 features
0.5097 with 1 PCs vs 0.6704 with 1 features
