Principal Component Analysis on the Wine Dataset using sklearn

In [21]:
import os 
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [28]:
# Set the seed for reproducibility
np.random.seed(1000)

# Load the data. The location can be overridden through the WINE_DATASET_CSV
# environment variable; when it is unset the original hard-coded path is used.
DATA_PATH = os.environ.get(
    'WINE_DATASET_CSV',
    'C:\\Users\\esteb\\PythonProjects\\projects\\Machine-Learning\\Training&DimensionalityReduction\\wine_dataset.csv',
)
df = pd.read_csv(DATA_PATH)

# Drop the 'style' column (rebinding `df` rather than mutating it in place)
df = df.drop(columns='style')

# Standardize the data and wrap the result back into a DataFrame that keeps
# the original column names
scaler = StandardScaler()
df_standardized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# Display the first few rows of the standardized data
print(df_standardized.head())

# get the covariance matrix (columns are the variables, rows the observations)
# NOTE(review): this is computed from the unstandardized `df`, so
# `df_standardized` above is never used by the decomposition. It is kept on
# `df` here because the sklearn PCA cell in this notebook also fits on `df`;
# if PCA on standardized features is intended, switch both places together.
covariance_matrix = np.cov(df.to_numpy(), rowvar=False)
print("covariance matrix: \n", covariance_matrix)

# get the eigenvalues and eigenvectors
# A covariance matrix is symmetric, so use `eigh`: it returns real eigenvalues
# (in ascending order) and avoids the complex-valued results that the general
# `eig` solver can produce. Eigenvectors are the columns of `eigenvectors`.
eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
print("eigenvalues: \n", eigenvalues)
print("eigenvectors: \n", eigenvectors)


   fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  \
0       0.142473          2.188833    -2.192833       -0.744778   0.569958   
1       0.451036          3.282235    -2.192833       -0.597640   1.197975   
2       0.451036          2.553300    -1.917553       -0.660699   1.026697   
3       3.073817         -0.362438     1.661085       -0.744778   0.541412   
4       0.142473          2.188833    -2.192833       -0.744778   0.569958   

   free_sulfur_dioxide  total_sulfur_dioxide   density        pH  sulphates  \
0            -1.100140             -1.446359  1.034993  1.813090   0.193097   
1            -0.311320             -0.862469  0.701486 -0.115073   0.999579   
2            -0.874763             -1.092486  0.768188  0.258120   0.797958   
3            -0.762074             -0.986324  1.101694 -0.363868   0.327510   
4            -1.100140             -1.446359  1.034993  1.813090   0.193097   

    alcohol   quality  
0 -0.915464 -0.937230  
1 -0.580

In [29]:
# Keep the k = d/2 largest eigenvalues (d = number of eigenvalues) together
# with their matching eigenvector columns, and report them.
k = len(eigenvalues) // 2

# Indices of the eigenvalues ordered from largest to smallest, cut to the top k
descending_order = np.argsort(eigenvalues)[::-1]
sort_indices = descending_order[:k]

eigenvalues_k = eigenvalues[sort_indices]
eigenvectors_k = eigenvectors[:, sort_indices]

# The projection matrix is simply the selected eigenvector columns
projection_matrix = eigenvectors_k

print("eigenvalues_k: \n", eigenvalues_k)
print("eigenvectors_k: \n", eigenvectors_k)
print("projection_matrix: \n", projection_matrix)

eigenvalues_k: 
 [3.37210738e+03 1.43655406e+02 1.70649469e+01 1.74849230e+00
 1.22601776e+00 4.82398206e-01]
eigenvectors_k: 
 [[-7.40794747e-03  5.37151136e-03 -2.38512534e-02 -7.13391954e-01
   6.93896945e-01 -5.68311269e-02]
 [-1.18432245e-03  7.86983127e-04 -9.04663220e-04 -2.40023102e-02
  -1.79724761e-02 -4.20670532e-02]
 [ 4.86867386e-04  2.47169465e-04 -1.92231722e-03 -2.40290581e-02
   5.09750479e-02  3.09345706e-03]
 [ 4.10197243e-02 -1.86280097e-02 -9.95215659e-01  7.05007767e-02
   3.55509694e-02 -3.49629770e-02]
 [-1.68197676e-04 -6.68445065e-05 -1.76598728e-04 -9.90529594e-03
  -3.42814494e-03  2.40484439e-03]
 [ 2.30481538e-01 -9.72618836e-01  2.71252260e-02 -1.08118777e-02
  -6.79198388e-04 -6.09737975e-03]
 [ 9.72166737e-01  2.31394621e-01  3.58535508e-02 -2.26122774e-03
   7.39713625e-03 -3.89552188e-05]
 [ 1.77246490e-06 -1.27799964e-06 -4.60800041e-04 -1.43929562e-03
  -1.70548725e-04  4.73059696e-04]
 [-6.55520709e-04 -6.48013240e-04  6.91136883e-03  2.76138928e-0

In [34]:
# Fit sklearn's PCA on `df`, keeping half as many components as there are
# eigenvalues, and project the data onto those components.
n_components = len(eigenvalues) // 2
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(df)

# sklearn's projection matrix is the fitted `components_` attribute
projection_matrix_sklearn = pca.components_

print("projection_matrix_sklearn: \n", projection_matrix_sklearn)


projection_matrix_sklearn: 
 [[-7.40794747e-03 -1.18432245e-03  4.86867386e-04  4.10197243e-02
  -1.68197676e-04  2.30481538e-01  9.72166737e-01  1.77246490e-06
  -6.55520709e-04 -7.04339125e-04 -5.45180769e-03 -5.32679832e-04]
 [-5.37151136e-03 -7.86983127e-04 -2.47169465e-04  1.86280097e-02
   6.68445065e-05  9.72618836e-01 -2.31394621e-01  1.27799964e-06
   6.48013240e-04  3.46546062e-04  2.87899807e-03  9.15205691e-03]
 [ 2.38512534e-02  9.04663220e-04  1.92231722e-03  9.95215659e-01
   1.76598728e-04 -2.71252260e-02 -3.58535508e-02  4.60800041e-04
  -6.91136883e-03 -1.93636054e-03 -8.25958966e-02 -8.79199781e-03]
 [ 7.13391954e-01  2.40023102e-02  2.40290581e-02 -7.05007767e-02
   9.90529594e-03  1.08118777e-02  2.26122774e-03  1.43929562e-03
  -2.76138928e-02  2.23568727e-02 -6.09807112e-01 -3.34064046e-01]
 [ 6.93896945e-01 -1.79724761e-02  5.09750479e-02  3.55509694e-02
  -3.42814494e-03 -6.79198388e-04  7.39713625e-03 -1.70548725e-04
  -3.77127643e-02  1.67223779e-02  5.856645