Principle Component Analysis on the Wine Dataset using sklearn

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [8]:
# Set the seed for reproducibility
np.random.seed(1000)

# Load the data
df = pd.read_csv('wine_dataset.csv')

# Drop the 'style' column
df.drop('style', axis=1, inplace=True)


# Standardize the data
scaler = StandardScaler()
df_standardized = scaler.fit_transform(df)


# Convert the standardized data back to a DataFrame
df_standardized = pd.DataFrame(df_standardized, columns=df.columns)


# get the covariance matrix
covariance_matrix = np.asarray(df_standardized.cov())
print("covariance matrix: \n", covariance_matrix)


# get the eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
print("eigenvalues: \n", eigenvalues)
print("eigenvectors: \n", eigenvectors)

covariance matrix: 
 [[ 1.00015394  0.21904197  0.32448567 -0.11199852  0.29824068 -0.28277895
  -0.32910456  0.45898063 -0.25273937  0.29961386 -0.09546622 -0.07675502]
 [ 0.21904197  1.00015394 -0.3780395  -0.19604135  0.37718233 -0.35261158
  -0.41454     0.27133741  0.26149465  0.22601847 -0.03764618 -0.26574038]
 [ 0.32448567 -0.3780395   1.00015394  0.14247316  0.03900402  0.1331463
   0.19527203  0.09616873 -0.32985896  0.05620595 -0.01049511  0.08554488]
 [-0.11199852 -0.19604135  0.14247316  1.00015394 -0.12896035  0.40293266
   0.49555786  0.55260201 -0.26736099 -0.18595603 -0.3594701  -0.03698618]
 [ 0.29824068  0.37718233  0.03900402 -0.12896035  1.00015394 -0.19507481
  -0.27967349  0.36267048  0.04471486  0.3956542  -0.25695513 -0.20069639]
 [-0.28277895 -0.35261158  0.1331463   0.40293266 -0.19507481  1.00015394
   0.72104506  0.0257208  -0.14587635 -0.18848626 -0.17986612  0.0554716 ]
 [-0.32910456 -0.41454     0.19527203  0.49555786 -0.27967349  0.72104506
   1.0001539

In [5]:
# sort the eigenvalues and select k = d/2 largest eigenvalues
# report the largest eigenvalues and their corresponding eigenvectors
k = (len(eigenvalues) // 2)

sort_indices = np.argsort(eigenvalues)[::-1][:k]

eigenvalues_k = eigenvalues[sort_indices]
eigenvectors_k = eigenvectors[:, sort_indices]

print("eigenvalues_k: \n", eigenvalues_k)
print("eigenvectors_k: \n", eigenvectors_k)

# construct the projection matrix using d * k which should be 12 * 6 
projection_matrix = eigenvectors_k
print("projection_matrix: \n", projection_matrix)


eigenvalues_k: 
 [3.04201535 2.65026192 1.64175951 1.06878976 0.84062985 0.66050084]
eigenvectors_k: 
 [[ 0.25692873  0.26184306 -0.46748619 -0.14396377  0.16536261 -0.03003708]
 [ 0.39493118  0.10519825  0.27968932 -0.08005785  0.14777408  0.38266373]
 [-0.14646061  0.14409348 -0.58807557  0.05551036 -0.23462139 -0.36224839]
 [-0.31890519  0.34258497  0.0755017   0.11245623  0.50792118  0.06331719]
 [ 0.31344994  0.2697701  -0.04676921  0.16529004 -0.3938966   0.42544212]
 [-0.42269137  0.11117878  0.09899801  0.30330631 -0.24845196  0.28318017]
 [-0.47441968  0.14394753  0.10128143  0.13223199 -0.22396681  0.10676882]
 [ 0.09243753  0.55492047  0.05156338  0.15057853  0.3303573  -0.15455292]
 [ 0.20806957 -0.15292185  0.40678741  0.47147768 -0.0014575  -0.56089714]
 [ 0.29985192  0.1196342  -0.16869128  0.58801992 -0.19324555  0.02014082]
 [ 0.05892408 -0.49272747 -0.21293142  0.08003179  0.11602319  0.16947538]
 [-0.08747571 -0.29660091 -0.29583773  0.47243936  0.45912914  0.2778883

In [6]:
# Perform PCA
pca = PCA(n_components=k)
X_pca = pca.fit_transform(df_standardized)

# Construct the projection matrix using sklearn
projection_matrix_sklearn = pca.components_.T

print("projection_matrix_sklearn: \n", projection_matrix_sklearn)

projection_matrix_sklearn: 
 [[-0.25692873  0.26184306  0.46748619 -0.14396377  0.16536261 -0.03003708]
 [-0.39493118  0.10519825 -0.27968932 -0.08005785  0.14777408  0.38266373]
 [ 0.14646061  0.14409348  0.58807557  0.05551036 -0.23462139 -0.36224839]
 [ 0.31890519  0.34258497 -0.0755017   0.11245623  0.50792118  0.06331719]
 [-0.31344994  0.2697701   0.04676921  0.16529004 -0.3938966   0.42544212]
 [ 0.42269137  0.11117878 -0.09899801  0.30330631 -0.24845196  0.28318017]
 [ 0.47441968  0.14394753 -0.10128143  0.13223199 -0.22396681  0.10676882]
 [-0.09243753  0.55492047 -0.05156338  0.15057853  0.3303573  -0.15455292]
 [-0.20806957 -0.15292185 -0.40678741  0.47147768 -0.0014575  -0.56089714]
 [-0.29985192  0.1196342   0.16869128  0.58801992 -0.19324555  0.02014082]
 [-0.05892408 -0.49272747  0.21293142  0.08003179  0.11602319  0.16947538]
 [ 0.08747571 -0.29660091  0.29583773  0.47243936  0.45912914  0.27788835]]


Question 4. Linear Discriminant Analysis 

Following steps of this model.

In [5]:
from sklearn.preprocessing import LabelEncoder  

In [9]:
# make a new dataframe and standardize data 
lda_df = pd.read_csv('C:\\Users\\esteb\\PythonProjects\\projects\\Machine-Learning\\Training&DimensionalityReduction\\wine_dataset.csv')

# Drop the 'style' column for the X data
X = lda_df.iloc[:, :-1].values
y = lda_df.iloc[:, -1].values

# encode the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# (Step 1) Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

unique_labels = np.unique(y)
mean_vectors = []

# (Step 2) for each class, compute the d-dimensional mean vector
for label in unique_labels:
    mean_vectors.append(np.mean(X[y == label], axis=0))

# reported vectors 
print("mean_vectors: \n", mean_vectors)

# (Step 3) construct the between-class scatter matrix and the within-class scatter matrix and report them

general_mean = np.mean(X, axis=0)
S_b = np.zeros((X.shape[1], X.shape[1]))
S_w = np.zeros((X.shape[1], X.shape[1]))

# for each class i compute the within class scatter matrix use the formula sum of (x - mean_vector_i) * (x - mean_vector_i).T
for cls, mean_vectors in zip(unique_labels, mean_vectors):
    S_i = np.zeros((X.shape[1], X.shape[1]))
    for row in X[y == cls]:
        row = row.reshape(-1, 1)
        S_i += (row - mean_vectors).dot((row - mean_vectors).T)
        
    S_w += S_i

print("S_w: \n", S_w)

# for each class i compute the between class scatter matrix use the formula sum of n_i * (mean_vector_i - mean_vector) * (mean_vector_i - mean_vector).T
for cls, mean_vectors in zip(unique_labels, mean_vectors):
    n_i = X[y == cls].shape[0]
    mean_vector_i = mean_vectors.reshape(-1, 1)
    general_mean = general_mean.reshape(-1, 1)
    S_b += n_i * (mean_vector_i - general_mean).dot((mean_vector_i - general_mean).T)

print("S_b: \n", S_b)

mean_vectors: 
 [array([ 0.85188711,  1.14293625, -0.32797945, -0.61050298,  0.89728426,
       -0.82546595, -1.22575798,  0.68370347,  0.57603762,  0.85272394,
       -0.05770297, -0.2088384 ]), array([-0.27810688, -0.37312272,  0.1070721 ,  0.19930467, -0.29292722,
        0.26948143,  0.40016068, -0.22320168, -0.18805311, -0.27838007,
        0.0188377 ,  0.06817734])]
S_w: 
 [[ 86651.43119776  24682.91861377  38356.9348606    5379.77685766
   31767.54606218  -7135.77096977  -9262.57621049  45089.54985747
   -9991.19939809  32039.82766839   4618.60264716   6637.62008053]
 [ 24682.91861377  84492.88663349 -17485.5797232   -2250.80202333
   36841.93368691 -13658.63615171 -17001.71217179  29383.1136059
   29015.09868699  25223.64737055   8046.51782894  -9173.43881264]
 [ 38356.9348606  -17485.5797232   95401.82472589  29591.54960255
   15934.72681572  29661.63134498  35988.82432958  21182.83368859
  -11627.6291356   17440.89111714  15617.46729256  23664.41910457]
 [  5379.77685766  -22

In [14]:
# Step 4 compute the eigenvalues and eigenvectors of S_w^-1 * S_b

eigenvalues, eigenvectors = np.linalg.eig(np.linalg.inv(S_w).dot(S_b))

# reported eigenvalues and eigenvectors
print("eigenvalues: \n", eigenvalues)
print("eigenvectors: \n", eigenvectors)



eigenvectors shape:  (12, 12)
eigenvalues: 
 [ 4.22135349e-02+0.00000000e+00j  4.05328121e-17+0.00000000e+00j
 -1.31636557e-17+1.49094221e-17j -1.31636557e-17-1.49094221e-17j
 -1.70638958e-17+0.00000000e+00j -1.46326127e-18+8.28603892e-18j
 -1.46326127e-18-8.28603892e-18j -2.69365543e-18+3.46978628e-18j
 -2.69365543e-18-3.46978628e-18j  2.55603894e-18+0.00000000e+00j
  1.93032456e-18+0.00000000e+00j  1.51219031e-19+0.00000000e+00j]
eigenvectors: 
 [[ 0.39182636+0.j         -0.58942112+0.j          0.01085165-0.01839055j
   0.01085165+0.01839055j -0.2097326 +0.j          0.01478323+0.09876707j
   0.01478323-0.09876707j -0.06089186+0.14494832j -0.06089186-0.14494832j
  -0.21500466+0.j         -0.2284147 +0.j         -0.00326998+0.j        ]
 [ 0.37385948+0.j          0.04683815+0.j         -0.56000814+0.j
  -0.56000814-0.j         -0.1365262 +0.j          0.46486966-0.30468041j
   0.46486966+0.30468041j  0.5704538 +0.j          0.5704538 -0.j
  -0.50223489+0.j         -0.28543731+0.j    

In [15]:
# Step 5 sort the eigenvalues and select k = d/2 largest eigenvalues
# report the largest eigenvalues and their corresponding eigenvectors

k = (len(eigenvalues)) // 2

sort_indices = np.argsort(eigenvalues)[::-1][:k]
eigenvalues_k = eigenvalues[sort_indices]
eigenvectors_k = eigenvectors[:, sort_indices]



print("eigenvalues_k: \n", eigenvalues_k)
print("eigenvectors_k: \n", eigenvectors_k)

eigenvalues_k: 
 [ 4.22135349e-02+0.00000000e+00j  4.05328121e-17+0.00000000e+00j
  2.55603894e-18+0.00000000e+00j  1.93032456e-18+0.00000000e+00j
  1.51219031e-19+0.00000000e+00j -1.46326127e-18+8.28603892e-18j]
eigenvectors_k: 
 [[ 0.39182636+0.j         -0.58942112+0.j         -0.21500466+0.j
  -0.2284147 +0.j         -0.00326998+0.j          0.01478323+0.09876707j]
 [ 0.37385948+0.j          0.04683815+0.j         -0.50223489+0.j
  -0.28543731+0.j         -0.02980218+0.j          0.46486966-0.30468041j]
 [ 0.13176493+0.j          0.11004922+0.j         -0.35925643+0.j
  -0.42714159+0.j         -0.67093758+0.j          0.10300738-0.17291467j]
 [ 0.2478929 +0.j         -0.4218124 +0.j          0.44968431+0.j
   0.42568229+0.j         -0.01088997+0.j         -0.58274273+0.j        ]
 [ 0.29423976+0.j          0.11937805+0.j          0.44708902+0.j
   0.41931389+0.j          0.73349833+0.j          0.04281078-0.00281586j]
 [ 0.11515205+0.j         -0.12276908+0.j          0.24382344+0.

In [12]:
# Step 6 construct the projection matrix


projection_matrix: 
 [[ 0.39182636+0.j         -0.58942112+0.j         -0.21500466+0.j
  -0.2284147 +0.j         -0.00326998+0.j          0.01478323+0.09876707j]
 [ 0.37385948+0.j          0.04683815+0.j         -0.50223489+0.j
  -0.28543731+0.j         -0.02980218+0.j          0.46486966-0.30468041j]
 [ 0.13176493+0.j          0.11004922+0.j         -0.35925643+0.j
  -0.42714159+0.j         -0.67093758+0.j          0.10300738-0.17291467j]
 [ 0.2478929 +0.j         -0.4218124 +0.j          0.44968431+0.j
   0.42568229+0.j         -0.01088997+0.j         -0.58274273+0.j        ]
 [ 0.29423976+0.j          0.11937805+0.j          0.44708902+0.j
   0.41931389+0.j          0.73349833+0.j          0.04281078-0.00281586j]
 [ 0.11515205+0.j         -0.12276908+0.j          0.24382344+0.j
   0.08981015+0.j         -0.01141248+0.j         -0.02314619-0.03664367j]
 [ 0.37430172+0.j          0.36834036+0.j          0.08325444+0.j
   0.08678764+0.j         -0.01122921+0.j         -0.15811878+0.062