Principal Component Analysis on the Wine Dataset using sklearn

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [2]:
# Set the seed for reproducibility
np.random.seed(1000)

# Load the data and drop the 'style' column before scaling.
# drop() returns a new frame, so no in-place mutation is needed.
df = pd.read_csv('wine_dataset.csv')
df = df.drop(columns='style')


# Standardize every remaining column and keep the result as a DataFrame with
# the original column names (built in one step so the name `df_standardized`
# never refers to a bare ndarray).
scaler = StandardScaler()
df_standardized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)


# get the covariance matrix
# NOTE: StandardScaler scales by the population standard deviation (ddof=0)
# while DataFrame.cov() uses the sample estimator (ddof=1), so the diagonal
# comes out as n / (n - 1) (slightly above 1) rather than exactly 1.
covariance_matrix = df_standardized.cov().to_numpy()
print("covariance matrix: \n", covariance_matrix)


# get the eigenvalues and eigenvectors
# A covariance matrix is symmetric, so use eigh rather than eig: it is the
# routine intended for symmetric/Hermitian input, always yields real
# eigenvalues and orthonormal eigenvectors (one per column).
# eigh returns the eigenvalues in ascending order, so they must be sorted
# before the leading components are selected.
eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
print("eigenvalues: \n", eigenvalues)
print("eigenvectors: \n", eigenvectors)

covariance matrix: 
 [[ 1.00015394  0.21904197  0.32448567 -0.11199852  0.29824068 -0.28277895
  -0.32910456  0.45898063 -0.25273937  0.29961386 -0.09546622 -0.07675502]
 [ 0.21904197  1.00015394 -0.3780395  -0.19604135  0.37718233 -0.35261158
  -0.41454     0.27133741  0.26149465  0.22601847 -0.03764618 -0.26574038]
 [ 0.32448567 -0.3780395   1.00015394  0.14247316  0.03900402  0.1331463
   0.19527203  0.09616873 -0.32985896  0.05620595 -0.01049511  0.08554488]
 [-0.11199852 -0.19604135  0.14247316  1.00015394 -0.12896035  0.40293266
   0.49555786  0.55260201 -0.26736099 -0.18595603 -0.3594701  -0.03698618]
 [ 0.29824068  0.37718233  0.03900402 -0.12896035  1.00015394 -0.19507481
  -0.27967349  0.36267048  0.04471486  0.3956542  -0.25695513 -0.20069639]
 [-0.28277895 -0.35261158  0.1331463   0.40293266 -0.19507481  1.00015394
   0.72104506  0.0257208  -0.14587635 -0.18848626 -0.17986612  0.0554716 ]
 [-0.32910456 -0.41454     0.19527203  0.49555786 -0.27967349  0.72104506
   1.0001539

In [3]:
# Keep the leading half of the spectrum: k = d // 2 components.
k = len(eigenvalues) // 2

# Positions of the k largest eigenvalues, ordered from largest to smallest.
sort_indices = np.argsort(eigenvalues)[::-1][:k]

# Pick out the selected eigenvalues and the matching eigenvector columns
# (eigenvectors are stored column-wise, hence axis=1).
eigenvalues_k = np.take(eigenvalues, sort_indices)
eigenvectors_k = np.take(eigenvectors, sort_indices, axis=1)

# Report the largest eigenvalues together with their eigenvectors.
print("eigenvalues_k: \n", eigenvalues_k)
print("eigenvectors_k: \n", eigenvectors_k)

# The projection matrix is the d x k matrix of selected eigenvectors
# (12 x 6 for this dataset).
projection_matrix = eigenvectors_k
print("projection_matrix: \n", projection_matrix)


eigenvalues_k: 
 [3.04201535 2.65026192 1.64175951 1.06878976 0.84062985 0.66050084]
eigenvectors_k: 
 [[ 0.25692873  0.26184306 -0.46748619 -0.14396377  0.16536261 -0.03003708]
 [ 0.39493118  0.10519825  0.27968932 -0.08005785  0.14777408  0.38266373]
 [-0.14646061  0.14409348 -0.58807557  0.05551036 -0.23462139 -0.36224839]
 [-0.31890519  0.34258497  0.0755017   0.11245623  0.50792118  0.06331719]
 [ 0.31344994  0.2697701  -0.04676921  0.16529004 -0.3938966   0.42544212]
 [-0.42269137  0.11117878  0.09899801  0.30330631 -0.24845196  0.28318017]
 [-0.47441968  0.14394753  0.10128143  0.13223199 -0.22396681  0.10676882]
 [ 0.09243753  0.55492047  0.05156338  0.15057853  0.3303573  -0.15455292]
 [ 0.20806957 -0.15292185  0.40678741  0.47147768 -0.0014575  -0.56089714]
 [ 0.29985192  0.1196342  -0.16869128  0.58801992 -0.19324555  0.02014082]
 [ 0.05892408 -0.49272747 -0.21293142  0.08003179  0.11602319  0.16947538]
 [-0.08747571 -0.29660091 -0.29583773  0.47243936  0.45912914  0.2778883

In [4]:
# Fit sklearn's PCA with the same number of components as the manual
# eigendecomposition and project the standardized data onto them.
pca = PCA(n_components=k)
X_pca = pca.fit_transform(df_standardized)

# sklearn stores one component per row; transpose so that each column is a
# principal direction, matching the d x k layout of the manual projection
# matrix. Individual columns may differ from the manual result by a factor
# of -1, since an eigenvector's sign is arbitrary.
projection_matrix_sklearn = np.transpose(pca.components_)

print("projection_matrix_sklearn: \n", projection_matrix_sklearn)

projection_matrix_sklearn: 
 [[-0.25692873  0.26184306  0.46748619 -0.14396377  0.16536261 -0.03003708]
 [-0.39493118  0.10519825 -0.27968932 -0.08005785  0.14777408  0.38266373]
 [ 0.14646061  0.14409348  0.58807557  0.05551036 -0.23462139 -0.36224839]
 [ 0.31890519  0.34258497 -0.0755017   0.11245623  0.50792118  0.06331719]
 [-0.31344994  0.2697701   0.04676921  0.16529004 -0.3938966   0.42544212]
 [ 0.42269137  0.11117878 -0.09899801  0.30330631 -0.24845196  0.28318017]
 [ 0.47441968  0.14394753 -0.10128143  0.13223199 -0.22396681  0.10676882]
 [-0.09243753  0.55492047 -0.05156338  0.15057853  0.3303573  -0.15455292]
 [-0.20806957 -0.15292185 -0.40678741  0.47147768 -0.0014575  -0.56089714]
 [-0.29985192  0.1196342   0.16869128  0.58801992 -0.19324555  0.02014082]
 [-0.05892408 -0.49272747  0.21293142  0.08003179  0.11602319  0.16947538]
 [ 0.08747571 -0.29660091  0.29583773  0.47243936  0.45912914  0.27788835]]
