Question 4. Linear Discriminant Analysis 

Following steps of this model.

In [2]:
# imports 
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder 

Preprocess by standardizing data and computing and gathering mean vectors 

In [5]:
# Set the seed for reproducibility
np.random.seed(1000)

# Load the data
df = pd.read_csv('wine_dataset.csv')

# Drop the 'style' column for the X data
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# encode the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# (Step 1) Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

unique_labels = np.unique(y)
mean_vectors = []


# (Step 2) for each class, compute the d-dimensional mean vector
for label in unique_labels:
    mean_vectors.append(np.mean(X[y == label], axis=0))

# reported vectors 
print("mean_vectors: \n", mean_vectors)

mean_vectors: 
 [array([ 0.85188711,  1.14293625, -0.32797945, -0.61050298,  0.89728426,
       -0.82546595, -1.22575798,  0.68370347,  0.57603762,  0.85272394,
       -0.05770297, -0.2088384 ]), array([-0.27810688, -0.37312272,  0.1070721 ,  0.19930467, -0.29292722,
        0.26948143,  0.40016068, -0.22320168, -0.18805311, -0.27838007,
        0.0188377 ,  0.06817734])]


Construct the in-between matrix and within-class scatter matrix 

In [6]:
# (Step 3) construct the between-class scatter matrix and the within-class scatter matrix and report them
import numpy as np

# Assuming X is your data matrix and y is the class labels
unique_labels = np.unique(y)
general_mean = np.mean(X, axis=0)
S_b = np.zeros((X.shape[1], X.shape[1]))
S_w = np.zeros((X.shape[1], X.shape[1]))

# Compute the within-class scatter matrix
for cls in unique_labels:
    X_cls = X[y == cls]
    mean_vector_cls = np.mean(X_cls, axis=0).reshape(-1, 1)
    for row in X_cls:
        row = row.reshape(-1, 1)
        S_w += (row - mean_vector_cls).dot((row - mean_vector_cls).T)

# Compute the between-class scatter matrix
for cls in unique_labels:
    X_cls = X[y == cls]
    n_cls = X_cls.shape[0]
    mean_vector_cls = np.mean(X_cls, axis=0).reshape(-1, 1)
    general_mean = general_mean.reshape(-1, 1)
    S_b += n_cls * (mean_vector_cls - general_mean).dot((mean_vector_cls - general_mean).T)

print("S_w: \n", S_w)
print("S_b: \n", S_b)


S_w: 
 [[ 4.95775893e+03 -6.42229598e+02  2.70047199e+03  3.75551274e+02
   3.16104047e+02 -3.45430353e+02  7.69103351e+01  1.74618144e+03
  -2.68261450e+03  4.05538526e+02 -5.15887327e+02 -1.21258789e+02]
 [-6.42229598e+02  3.72631874e+03 -1.66066395e+03  2.06483483e+02
   2.74999396e+02 -2.89488179e+02  2.78604048e+02  1.05188718e+02
   3.02251358e+02 -5.98938903e+02 -1.04667293e+02 -1.21998799e+03]
 [ 2.70047199e+03 -1.66066395e+03  6.26884194e+03  5.00810783e+02
   8.77563588e+02  2.90685117e+02  4.15791730e+02  1.10032864e+03
  -1.74204473e+03  9.58309078e+02 -1.08317143e+02  4.10421670e+02]
 [ 3.75551274e+02  2.06483483e+02  5.00810783e+02  5.70647041e+03
   3.24151203e+02  1.54856919e+03  1.63193139e+03  4.47501824e+03
  -9.90875981e+02 -1.03793096e+02 -2.40983633e+03 -5.10683386e+02]
 [ 3.16104047e+02  2.74999396e+02  8.77563588e+02  3.24151203e+02
   4.78933511e+03  3.03778036e+02  5.16040024e+02  1.05471843e+03
  -8.05817195e+02  9.47309713e+02 -1.55936322e+03 -9.06273339e+02

Compute the eigenvalues and eignevectors 

In [5]:
# Step 4 compute the eigenvalues and eigenvectors of S_w^-1 * S_b

eigenvalues, eigenvectors = np.linalg.eig(np.matmul(np.linalg.inv(S_w), S_b))

# we should have a max of c-1 eigenvalues be non-zero

#reported eigenvalues and eigenvectors
print("eigenvalues: \n", eigenvalues)
print("eigenvectors: \n", eigenvectors)


eigenvalues: 
 [ 2.41977984e-02+0.00000000e+00j -1.16875452e-17+0.00000000e+00j
 -2.84673453e-17+0.00000000e+00j  4.93018449e-18+0.00000000e+00j
  1.95465950e-18+3.90700292e-18j  1.95465950e-18-3.90700292e-18j
 -1.85608094e-18+1.73749669e-18j -1.85608094e-18-1.73749669e-18j
  2.97338632e-18+0.00000000e+00j  3.21615036e-19+7.56901394e-19j
  3.21615036e-19-7.56901394e-19j  1.46808261e-18+0.00000000e+00j]
eigenvectors: 
 [[ 0.12653425+0.00000000e+00j  0.4159696 +0.00000000e+00j
  -0.41201565+0.00000000e+00j  0.19212247+0.00000000e+00j
  -0.09752898-1.33134724e-01j -0.09752898+1.33134724e-01j
   0.01394392-2.89994763e-01j  0.01394392+2.89994763e-01j
  -0.02010796+0.00000000e+00j -0.08139666+4.44986761e-02j
  -0.08139666-4.44986761e-02j -0.21007425+0.00000000e+00j]
 [-0.13671375+0.00000000e+00j -0.27774313+0.00000000e+00j
  -0.14794455+0.00000000e+00j -0.10092829+0.00000000e+00j
   0.0320059 +5.83254243e-03j  0.0320059 -5.83254243e-03j
  -0.08458614+7.85067371e-02j -0.08458614-7.85067371e-0

Sort the largest eigenvalues and corresponding eigenvectors.

In [6]:
# Step 5 sort the eigenvalues and select k = d/2 largest eigenvalues
# report the largest eigenvalues and their corresponding eigenvectors
k = (len(eigenvalues)) // 2


# get the nonzero eigenvalues should be 1
non_zero_eigenvals = []

# get the real numbers of the eigenvectors and eigenvalues 
eigenvectors = np.abs(eigenvectors)
eigenvalues = np.abs(eigenvalues)

# iterate and find largest
for val in eigenvalues:
    if val > 1e-5:
        non_zero_eigenvals.append(val)
        

non_zero_eigenvals = np.array(non_zero_eigenvals)

k_largest_eigenvalues = non_zero_eigenvals[np.argsort(-non_zero_eigenvals)][:k]
k_largest_eigenvectors = eigenvectors[:, np.argsort(-non_zero_eigenvals)][:k]

print("k_largest_eigenvalues: \n", k_largest_eigenvalues)
print("k_largest_eigenvectors: \n", k_largest_eigenvectors)


k_largest_eigenvalues: 
 [0.0241978]
k_largest_eigenvectors: 
 [[0.12653425]
 [0.13671375]
 [0.03732081]
 [0.47035028]
 [0.04282246]
 [0.08758026]]


Construct the Projection Matrix 

In [7]:
# construct the projection matrix
projection_matrix = k_largest_eigenvectors


print(projection_matrix)


[[0.12653425]
 [0.13671375]
 [0.03732081]
 [0.47035028]
 [0.04282246]
 [0.08758026]]
