Question 4. Linear Discriminant Analysis 

Following steps of this model.

In [2]:
# imports 
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder 

Preprocess by standardizing data and computing and gathering mean vectors 

In [3]:
# Set the seed for reproducibility
np.random.seed(1000)

# Load the data
df = pd.read_csv('wine_dataset.csv')

# Drop the 'style' column for the X data
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# encode the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# (Step 1) Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

unique_labels = np.unique(y)
mean_vectors = []

# (Step 2) for each class, compute the d-dimensional mean vector
for label in unique_labels:
    mean_vectors.append(np.mean(X[y == label], axis=0))

# reported vectors 
print("mean_vectors: \n", mean_vectors)

mean_vectors: 
 [array([ 0.85188711,  1.14293625, -0.32797945, -0.61050298,  0.89728426,
       -0.82546595, -1.22575798,  0.68370347,  0.57603762,  0.85272394,
       -0.05770297, -0.2088384 ]), array([-0.27810688, -0.37312272,  0.1070721 ,  0.19930467, -0.29292722,
        0.26948143,  0.40016068, -0.22320168, -0.18805311, -0.27838007,
        0.0188377 ,  0.06817734])]


Construct the in-between matrix and within-class scatter matrix 

In [4]:
# (Step 3) construct the between-class scatter matrix and the within-class scatter matrix and report them

general_mean = np.mean(X, axis=0)
S_b = np.zeros((X.shape[1], X.shape[1]))
S_w = np.zeros((X.shape[1], X.shape[1]))

# for each class i compute the within class scatter matrix use the formula sum of (x - mean_vector_i) * (x - mean_vector_i).T
for cls, mean_vectors in zip(unique_labels, mean_vectors):
    S_i = np.zeros((X.shape[1], X.shape[1]))
    for row in X[y == cls]:
        row = row.reshape(-1, 1)
        S_i += (row - mean_vectors).dot((row - mean_vectors).T)
        
    S_w += S_i

print("S_w: \n", S_w)

mean_vector_i = mean_vectors.reshape(-1, 1)
general_mean = general_mean.reshape(-1, 1)

# for each class i compute the between class scatter matrix use the formula sum of n_i * (mean_vector_i - mean_vector) * (mean_vector_i - mean_vector).T
for cls, mean_vectors in zip(unique_labels, mean_vectors):
    n_i = X[y == cls].shape[0]
    S_b += n_i * (mean_vector_i - general_mean).dot((mean_vector_i - general_mean).T)

print("S_b: \n", S_b)

S_w: 
 [[ 86651.43119776  24682.91861377  38356.9348606    5379.77685766
   31767.54606218  -7135.77096977  -9262.57621049  45089.54985747
   -9991.19939809  32039.82766839   4618.60264716   6637.62008053]
 [ 24682.91861377  84492.88663349 -17485.5797232   -2250.80202333
   36841.93368691 -13658.63615171 -17001.71217179  29383.1136059
   29015.09868699  25223.64737055   8046.51782894  -9173.43881264]
 [ 38356.9348606  -17485.5797232   95401.82472589  29591.54960255
   15934.72681572  29661.63134498  35988.82432958  21182.83368859
  -11627.6291356   17440.89111714  15617.46729256  23664.41910457]
 [  5379.77685766  -2250.80202333  29591.54960255  97497.13971354
    3889.22600458  51739.6748069   60444.36285737  57810.37776953
   -5708.12959065   -388.4619228  -10538.17378172  15160.53529964]
 [ 31767.54606218  36841.93368691  15934.72681572   3889.22600458
   86314.74656455   -467.3999829   -5577.66837098  37413.63876623
   13027.61051479  39358.0222804   -8138.12344132  -3192.19985053]

Compute the eigenvalues and eignevectors 

In [5]:
# Step 4 compute the eigenvalues and eigenvectors of S_w^-1 * S_b

eigenvalues, eigenvectors = np.linalg.eig(np.matmul(np.linalg.inv(S_w), S_b))

# we should have a max of c-1 eigenvalues be non-zero

#reported eigenvalues and eigenvectors
print("eigenvalues: \n", eigenvalues)
print("eigenvectors: \n", eigenvectors)


eigenvalues: 
 [ 2.41977984e-02+0.00000000e+00j -1.16875452e-17+0.00000000e+00j
 -2.84673453e-17+0.00000000e+00j  4.93018449e-18+0.00000000e+00j
  1.95465950e-18+3.90700292e-18j  1.95465950e-18-3.90700292e-18j
 -1.85608094e-18+1.73749669e-18j -1.85608094e-18-1.73749669e-18j
  2.97338632e-18+0.00000000e+00j  3.21615036e-19+7.56901394e-19j
  3.21615036e-19-7.56901394e-19j  1.46808261e-18+0.00000000e+00j]
eigenvectors: 
 [[ 0.12653425+0.00000000e+00j  0.4159696 +0.00000000e+00j
  -0.41201565+0.00000000e+00j  0.19212247+0.00000000e+00j
  -0.09752898-1.33134724e-01j -0.09752898+1.33134724e-01j
   0.01394392-2.89994763e-01j  0.01394392+2.89994763e-01j
  -0.02010796+0.00000000e+00j -0.08139666+4.44986761e-02j
  -0.08139666-4.44986761e-02j -0.21007425+0.00000000e+00j]
 [-0.13671375+0.00000000e+00j -0.27774313+0.00000000e+00j
  -0.14794455+0.00000000e+00j -0.10092829+0.00000000e+00j
   0.0320059 +5.83254243e-03j  0.0320059 -5.83254243e-03j
  -0.08458614+7.85067371e-02j -0.08458614-7.85067371e-0

Sort the largest eigenvalues and corresponding eigenvectors.

In [6]:
# Step 5 sort the eigenvalues and select k = d/2 largest eigenvalues
# report the largest eigenvalues and their corresponding eigenvectors
k = (len(eigenvalues)) // 2


# get the nonzero eigenvalues should be 1
non_zero_eigenvals = []

# get the real numbers of the eigenvectors and eigenvalues 
eigenvectors = np.abs(eigenvectors)
eigenvalues = np.abs(eigenvalues)

# iterate and find largest
for val in eigenvalues:
    if val > 1e-5:
        non_zero_eigenvals.append(val)
        

non_zero_eigenvals = np.array(non_zero_eigenvals)

k_largest_eigenvalues = non_zero_eigenvals[np.argsort(-non_zero_eigenvals)][:k]
k_largest_eigenvectors = eigenvectors[:, np.argsort(-non_zero_eigenvals)][:k]

print("k_largest_eigenvalues: \n", k_largest_eigenvalues)
print("k_largest_eigenvectors: \n", k_largest_eigenvectors)


k_largest_eigenvalues: 
 [0.0241978]
k_largest_eigenvectors: 
 [[0.12653425]
 [0.13671375]
 [0.03732081]
 [0.47035028]
 [0.04282246]
 [0.08758026]]


Construct the Projection Matrix 

In [7]:
# construct the projection matrix
projection_matrix = k_largest_eigenvectors


print(projection_matrix)


[[0.12653425]
 [0.13671375]
 [0.03732081]
 [0.47035028]
 [0.04282246]
 [0.08758026]]
