In [3]:
from sklearn.datasets import load_wine
import pandas as pd
import numpy as np
np.set_printoptions(precision=4)
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Create Linear Discriminant Analysis From Scratch

Use the scikit-learn Wine dataset as the input for a from-scratch LDA implementation.

In [2]:
# Load the Wine dataset bundle, then split it into a feature table and a label vector.
wine = load_wine()

# Features: one named column per entry in wine.feature_names.
X = pd.DataFrame(data=wine.data, columns=wine.feature_names)

# Labels: integer target codes mapped onto the class names as a Categorical.
y = pd.Categorical.from_codes(codes=wine.target, categories=wine.target_names)

In [None]:
# Display the (rows, columns) dimensions of the feature frame.
X.shape

In [None]:
# Build one table holding the features plus a 'wine' column with each row's class label.
labels = pd.Series(y, name='wine')
df = pd.concat([X, labels], axis=1)

# Preview the first rows of the combined table.
df.head()

# Linear Discriminant Analysis Steps

https://towardsdatascience.com/linear-discriminant-analysis-in-python-76b8b17817c2

Linear Discriminant Analysis can be broken up into the following steps:
- Compute the within class and between class scatter matrices
- Compute the eigenvectors and corresponding eigenvalues for the scatter matrices
- Sort the eigenvalues and select the top k
- Create a new matrix containing eigenvectors that map to the k eigenvalues
- Obtain the new features (i.e. LDA components) by taking the dot product of the data and the eigenvector matrix built in the previous step

Linear discriminant analysis of the form discussed above has its roots in an approach developed by the famous statistician R.A. Fisher, who arrived at linear discriminants from a different perspective. He was interested in finding a linear projection for data that maximizes the variance between classes relative to the variance for data from the same class. This approach is known as Fisher's linear discriminant analysis, and can be formulated for two classes or multiple classes.

# Within Class Scatter Matrix
We calculate the within class scatter matrix using the following formula.

A scatter matrix is a statistic that is used to make estimates of the covariance matrix, for instance of the multivariate normal distribution.

Let's define the within-class scatter matrix $S_W$ as

$$S_W = \sum_{i=1}^{c}{S_i}$$

where c is the total number of distinct classes and

$$S_i = \sum_{x\in D_i}{(x-m_i)(x-m_i)^T}$$

$$m_i = \frac{1}{n_i} \sum_{x\in D_i}{x}$$

where $x$ is a sample (i.e. a row), $D_i$ is the set of samples belonging to class $i$, and $n_i$ is the number of samples in that class.

For every class, we create a vector with the means of each feature.

The full equation can be written as

$$S_W = \sum_{i=1}^{c}{\sum_{x\in D_i}{(x-m_i)(x-m_i)^T}}$$

$$S_W = \sum_{i=1}^{c}{\sum_{x\in D_i}{\left(x-\frac{1}{n_i} \sum_{x'\in D_i}{x'}\right)\left(x-\frac{1}{n_i} \sum_{x'\in D_i}{x'}\right)^T}}$$


In [None]:
# Start with an empty frame that has one column per class name; the per-class
# feature means are written into these columns in the following step.
class_feature_means = pd.DataFrame(columns=wine.target_names)
class_feature_means

# STEP 1 - Create vector of feature means

In [None]:
# Fill one column per class with that class's mean feature vector m_i.
# observed=True: only iterate over classes that actually occur in the data
# (and avoid the pandas warning about the default for categorical groupers).
for c, rows in df.groupby('wine', observed=True):
    # numeric_only=True skips the categorical 'wine' column, which cannot be
    # averaged (pandas >= 2.0 raises TypeError instead of silently dropping it).
    class_feature_means[c] = rows.mean(numeric_only=True)

# Spot-check: mean of the first feature for class_0. The Series is indexed by
# feature name, so positional access must go through .iloc.
class_feature_means['class_0'].iloc[0]

# STEP 1 - Within class scatter matrix

In [None]:
# Within-class scatter matrix: S_W = sum_i sum_{x in D_i} (x - m_i)(x - m_i)^T.
# Take the feature columns from the data rather than hard-coding their count (13).
feature_cols = df.columns.drop('wine')
n_features = len(feature_cols)
within_class_scatter_matrix = np.zeros((n_features, n_features))

for c, rows in df.groupby('wine', observed=True):
    # Class mean vector m_c, selected by feature name so it lines up with the columns.
    mc = class_feature_means.loc[feature_cols, c]
    # Centre every sample of class c on m_c (one row per sample).
    centered = (rows[feature_cols] - mc).to_numpy(dtype=float)
    # The sum of the per-sample outer products (x - m_c)(x - m_c)^T equals
    # centered^T @ centered, which replaces the row-by-row Python loop.
    within_class_scatter_matrix += centered.T @ centered

# STEP 1 - Between class scatter matrix

We calculate the between class scatter matrix using the following formula:

$$S_B = \sum_{i=1}^{c}n_i{(m_i-m)(m_i-m)^T}$$

where

$$m_i = \frac{1}{n_i} \sum_{x\in D_i}{x}$$

and

$$m = \frac{1}{n} \sum_{k=1}^{n}{x_k}$$

In [None]:
# calculate the feature means
feature_means= df.mean()
between_sclass_catter_matrix = np.zeros((13,13))

for c in class_feature_means:
    n = df[df['wine']==c].shape[0]
    
    mc  = class_feature_means[c].values.reshape(13,1)
    m = feature_means.values.reshape(13,1)
    
    between_sclass_catter_matrix += n * (mc-m).dot((mc-m).T)

# Step 2

Compute the eigenvectors and corresponding eigenvalues for the scatter matrices.

We solve the generalised eigenvalue problem for

$$S_W^{-1}S_B$$

to obtain the linear discriminants.

In [None]:
# Form S_W^{-1} S_B and take its eigen-decomposition to obtain the linear discriminants.
sw_inverse = np.linalg.inv(within_class_scatter_matrix)
eig_val, eig_vec = np.linalg.eig(sw_inverse @ between_sclass_catter_matrix)

The eigenvectors with the highest eigenvalues carry the most information about the distribution of the data. Thus, we sort the eigenvalues from highest to lowest and select the first $k$ eigenvectors. In order to ensure that the eigenvalue maps to the same eigenvector after sorting, we place them in a temporary array.

In [None]:
# Pair each eigenvalue magnitude with its eigenvector so they stay together when sorted.
# np.linalg.eig returns the eigenvectors as COLUMNS: eig_vec[:, i] belongs to eig_val[i]
# (eig_vec[i] would be a row, i.e. the i-th component of every eigenvector).
pairs = [(np.abs(eig_val[i]), eig_vec[:, i]) for i in range(len(eig_val))]

In [None]:
# Display the eigenvalues as a list in ascending order.
# NOTE(review): np.linalg.eig can return complex eigenvalues for a non-symmetric
# matrix — confirm the ordering shown here is the one intended.
sorted(eig_val)

In [None]:
# Display the eigenvector matrix returned by np.linalg.eig
# (per the NumPy docs, column i corresponds to eig_val[i]).
eig_vec

In [None]:
%matplotlib inline