In [1]:
from sklearn.datasets import load_wine
import pandas as pd
import numpy as np
np.set_printoptions(precision=4)
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Create Linear Discriminant Analysis From Scratch

Use sklearn Wine dateset to run an LDA algo against

In [2]:
wine = load_wine()
X = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.Categorical.from_codes(wine.target, wine.target_names)

In [3]:
X.shape

(178, 13)

In [10]:
# create a dataframe with both features and classes

df = X.join(pd.Series(y, name='class'))
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,class_0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,class_0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,class_0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,class_0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,class_0


# Linear Discriminant Analysis Steps

Linear Discriminant Analysis can be broken up into the following steps:
- Compute the within class and between class scatter matrices
- Compute the eigenvectors and corresponding eigenvalues for the scatter matrices
- Sort the eigenvalues and select the top k
- Create a new matrix containing eigenvectors that map to the k eigenvalues
- Obtain the new features (i.e. LDA components) by taking the dot product of the data and the matrix from step 4

Linear discriminant analysis of the form discussed above has its roots in an approach developed by the famous statistician R.A. Fisher, who arrived at linear discriminants from a different perspective. He was interested in finding a linear projection for data that maximizes the variance between classes relative to the variance for data from the same class. This approach is known as Fisher’s linear discriminant analysis, and can be formulate for two classes or multiple classes.

# Within Class Scatter Matrix
We calculate the within class scatter matrix using the following formula.

A scatter matrix is a statistic that is used to make estimates of the covariance matrix, for instance of the multivariate normal distribution.

Lets define the scatter matrix (within) $S_W$ as

$$S_W = \sum_{i=1}^{c}{S_i}$$

where c is the total number of distinct classes and

$$S_i = \sum_{x\in D_i}^{c}{(x-m_i)(x-m_i)^T}$$

$$m_i = \frac{1}{n_i} \sum_{x\in D_i}^{n}{x_k}$$

where x is a sample (i.e. row) and n is the total number of samples with a given class.

For every class, we create a vector with the means of each feature.

The full equation can be written as

$$S_W = \sum_{i=1}^{c}{\sum_{x\in D_i}^{c}{(x-m_i)(x-m_i)^T}}$$

$$S_W = \sum_{i=1}^{c}{\sum_{x\in D_i}^{c}{(x-\frac{1}{n_i} \sum_{x\in D_i}^{n}{x_k})(x-\frac{1}{n_i} \sum_{x\in D_i}^{n}{x_k})^T}}$$


In [25]:
class_feature_means = pd.DataFrame(columns=wine.target_names)
class_feature_means

c = 'class_0'
df=df[df['class']==c]
df.shape

n=59
feastures=14-1
df


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,class_0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,class_0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,class_0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,class_0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,class_0
5,14.2,1.76,2.45,15.2,112.0,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450.0,class_0
6,14.39,1.87,2.45,14.6,96.0,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290.0,class_0
7,14.06,2.15,2.61,17.6,121.0,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295.0,class_0
8,14.83,1.64,2.17,14.0,97.0,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045.0,class_0
9,13.86,1.35,2.27,16.0,98.0,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045.0,class_0


In [20]:
for c, rows in df.groupby('class'):
    class_feature_means[c] = rows.mean()
    #print(rows)
    break
df.mean()

alcohol                          13.000618
malic_acid                        2.336348
ash                               2.366517
alcalinity_of_ash                19.494944
magnesium                        99.741573
total_phenols                     2.295112
flavanoids                        2.029270
nonflavanoid_phenols              0.361854
proanthocyanins                   1.590899
color_intensity                   5.058090
hue                               0.957449
od280/od315_of_diluted_wines      2.611685
proline                         746.893258
dtype: float64

Then, we plug the mean vectors ($m_i$) into the equation from before in order to obtain the within class scatter matrix.

In [35]:
within_scatter_matrix = np.zeros((13,13))

In [34]:
within_scatter_matrix

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [5]:
within_class_scatter_matrix = np.zeros((13,13))

for c, rows in df.groupby('class'):
    
    rows = rows.drop(['class'], axis=1)
    
    s = np.zeros((13,13))
    
    for index, row in rows.iterrows():
        
        #print('------>', row)
        
        x, mc = row.values.reshape(13,1), class_feature_means[c].values.reshape(13,1)
        
        #print(mc)
        
        s+= (x - mc).dot((x - mc).T)
    
    within_class_scatter_matrix += s

NameError: name 'class_feature_means' is not defined

In [86]:
data = [
    [1,8,1,5, 'class_0'],
    [3,4,2,4, 'class_0'],
    [2,2,3,6, 'class_0'],
    [10,20,30,40, 'class_1'],
    [10,20,30,40, 'class_1'],
    [10,20,30,40, 'class_1'],
    [100,200,300,400, 'class_2'],
    [100,200,300,400, 'class_2'],
    [100,200,300,400, 'class_2']
]

df_try = pd.DataFrame(data, columns=['a', 'b', 'c', 'd', 'class'])

try_class_feature_means = pd.DataFrame(columns=[df_try['class'].unique()])
for c, row in df_try.groupby('class'):
    try_class_feature_means[c] = row.mean()
    
try_scatter_matrix = np.zeros((4,4))

for c, rows in df_try.groupby('class'):
    
    rows = rows.drop(['class'], axis=1)
    
    s = np.zeros((4,4))
    
    for index, row in rows.iterrows():
        
        x, mc = row.values.reshape(4,1), try_class_feature_means[c].values.reshape(4,1)
        
        s+= (x - mc).dot((x - mc).T)
    
    try_scatter_matrix += s

try_scatter_matrix

array([[ 2.    , -4.    ,  1.    , -1.    ],
       [-4.    , 18.6667, -6.    , -2.    ],
       [ 1.    , -6.    ,  2.    ,  1.    ],
       [-1.    , -2.    ,  1.    ,  2.    ]])