In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
from math import pi
from math import exp

In [3]:
SEED = 13
np.random.seed(SEED)
np.set_printoptions(suppress=True)

In [4]:
df = pd.read_csv('iris.data',sep=',',header=None, names=['sepal length','sepal width','petal length',
                                                         'petal width','class'])

FileNotFoundError: [Errno 2] File b'iris.data' does not exist: b'iris.data'

In [None]:
df.head()

In [None]:
df.isna().sum()

In [None]:
df.describe()

In [None]:
df['class'] = LabelEncoder().fit_transform(df['class'])

In [None]:
df.sample(5,random_state=SEED)

In [None]:
df['class'].value_counts()

In [None]:
sns.countplot(df['class'])

In [None]:
f,ax = plt.subplots(2,2,figsize=(10,7))
ax = ax.flatten()
for i in range(4):
    sns.distplot(df.iloc[:,i],ax=ax[i])

Data is all cleaned and with no NULL values. We won't be perfoming the EDA or preprocessing for this data as we are not building a model just applying the LDA

In [None]:
class LDA():
    
    def __init__(self,df,class_name='class'):
        self.df = df
        self.class_name = class_name
        self.n = df.shape[1]-1
    
        
    def find_class_vise_mean(self):
        class_vise_mean = self.df.groupby(self.class_name).mean().T
        return class_vise_mean
    
    
    def find_within_class_scatter(self):
        class_vise_mean = self.find_class_vise_mean()
        within_class_scatter_matrix = np.zeros((self.n,self.n))
        for class_, rows in self.df.groupby('class'):
            rows = rows.drop([self.class_name], axis=1)
            dot_product = np.zeros((self.n,self.n))
            class_mean = class_vise_mean[class_].values.reshape(self.n,1)

            for index, row in rows.iterrows():
                n_th_row = row.values.reshape(self.n,1) 
                # get all the elements in the columns row-vise that belong to current class in a form of 2-D array
                dot_product += (n_th_row - class_mean).dot((n_th_row - class_mean).T)

                # for each column element 'X', subtract it's 'own' class mean for all the x_i
                # i.e for a row element say 'flower' at index 13, it has 4 ATTRIBUTES corresponding to sepal and 
                # petal's width and heights and suppose it belongs to class 2. for each ATTRIBUTE in this flower,
                # subtract the attribute from the class' mean it belong to i.e class-2 mean because each has
                # individual mean for each ATTRIBUTE. Get a Transpose to get a DOT*

            within_class_scatter_matrix += dot_product
        return within_class_scatter_matrix
    
    
    def find_bw_class_scatter(self):
        class_vise_mean = self.find_class_vise_mean()
        feature_means = self.df.drop(self.class_name,axis=1).mean() # means of individual features/columns
        between_class_scatter_matrix = np.zeros((self.n,self.n))

        for class_ in class_vise_mean:    
            total_elements_in_class = len(self.df.loc[self.df[self.class_name] == class_].index)

            class_m = class_vise_mean[class_].values.reshape(self.n,1)
            feat_m = feature_means.values.reshape(self.n,1)
            # mean that belongs to current class(0,1,2), mean of individual features(sep_h,pet_l,sep_l....)

            between_class_scatter_matrix += total_elements_in_class * (class_m - feat_m).dot((class_m - feat_m).T)
        return between_class_scatter_matrix
    
    
    def get_eign_value_vector(self):
        within_class_scatter_matrix = self.find_within_class_scatter()
        between_class_scatter_matrix = self.find_bw_class_scatter()
        eign_values, eign_vectors = np.linalg.eig(np.linalg.inv(within_class_scatter_matrix).\
                                                            dot(between_class_scatter_matrix))
        return eign_values,eign_vectors
    
    
    def explained_var(self,display_only=False):
        
        eign_values,eign_vectors = self.get_eign_value_vector()
        eign_value_vector_pair = [(np.abs(eign_values[i]), 
                               eign_vectors[:,i]) for i in range(len(eign_values))]
        eign_value_vector_pair = sorted(eign_value_vector_pair, key=lambda x: x[0], reverse=True)
        eign_value_sums = sum(eign_values)
        
        if display_only:
            print('Explained Variance by each EignVector in terms of total info')
            for i, pair in enumerate(eign_value_vector_pair):
                print('{}: {:.2f}%'.format(i+1, (pair[0]/eign_value_sums).real*100))
            return None
        else:
            return eign_value_vector_pair
      
        
    def get_lda(self):
        X = self.df.drop(self.class_name,axis=1).values
        eign_value_vector_pair = self.explained_var()
        W_matrix = np.hstack((eign_value_vector_pair[0][1].reshape(self.n,1), 
                      eign_value_vector_pair[1][1].reshape(self.n,1))).real
        lda = np.array(X.dot(W_matrix))
        return lda
    
    
    def plot(self):
        lda = self.get_lda()
        plt.xlabel('LD1')
        plt.ylabel('LD2')
        plt.scatter(lda[:,0],lda[:,1],c=self.df[self.class_name],
                    cmap='rainbow',alpha=0.7,edgecolors='b')

In [None]:
lda = LDA(df)
lda.plot()

In [None]:
df_lda = pd.DataFrame(LDA(df).get_lda(),columns=['X','Y'])
df_lda['class'] = df['class']

Calculate `base rate` which is the probability of data to the class they belong to. So we have to seperate the data by the class

In [19]:
dic = {'a':[1,2,3,4,5,6,7,8,9,0],
       'b':[2,3,4,5,6,7,8,9,0,1],
       'c':[4,5,6,6,7,5,4,3,2,1],
       'class':[4,0,0,1,4,2,1,0,1,2]}
df = pd.DataFrame(dic)

In [20]:
df.head()

Unnamed: 0,a,b,c,class
0,1,2,4,4
1,2,3,5,0
2,3,4,6,0
3,4,5,6,1
4,5,6,7,4


In [21]:
def stats_by_class(df,col):
        '''
        get statistics by each class present in the dataset
        '''
        
        result_df = df.groupby(col).agg(['mean','std','count'])
        class_stats = {}
        for i in range(df[col].nunique()):
            x = result_df.loc[i].values.tolist()
            if i not in class_stats:
                class_stats[i] = []
            class_stats[i].append(x[:3])
            class_stats[i].append(x[3:])
        return class_stats

In [18]:
result_df = df.groupby('class').agg(['mean','std','count'])
class_stats = {}
for i in range(df['class'].nunique()):
    x = result_df.loc[i].values.tolist()
    print(i)
    print(x[:3])
    print(x[3:])

0
[2.6, 2.701851217221259, 5.0]
[3.6, 2.701851217221259, 5.0, 4.0, 1.8708286933869707, 5.0]
1
[6.0, 2.0, 3.0]
[7.0, 2.0, 3.0, 4.666666666666667, 1.5275252316519468, 3.0]
2
[7.0, 2.8284271247461903, 2.0]
[3.0, 4.242640687119285, 2.0, 4.5, 3.5355339059327378, 2.0]


In [15]:
result_df

Unnamed: 0_level_0,a,a,a,b,b,b,c,c,c
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0,2.6,2.701851,5,3.6,2.701851,5,4.0,1.870829,5
1,6.0,2.0,3,7.0,2.0,3,4.666667,1.527525,3
2,7.0,2.828427,2,3.0,4.242641,2,4.5,3.535534,2


In [22]:
stats_by_class(df,'class')

KeyError: 3

In [23]:
class BAYES():
    '''
    Implement Bayesian Classification 
    '''
    def __init__(df,col='class'):
        '''
        args:
            df: dataframe with target label included
            col:  column name of the target
        '''
        self.df = df
        self.col = col

    def segregate_data(self):
        '''
        create a dictionary where each key is the class value and list of all the records which belong to the
        class as the value in the dictionary.
        '''
        segregated = {}
        for class_,row in self.df.groupby('class'):
            if class_ not in segregated:
                segregated[class_] = []
            segregated[class_] = row.values.tolist()
        return segregated
    
    
    def col_stats(self):
        '''
        get the statistics of each of the columns
        '''
        return self.df.describe().drop(self.col,axis=1).T[['mean','std','count']].values.tolist()
    
    
    def stats_by_class(self):
        '''
        get statistics by each class present in the dataset
        '''
        
        result_df = self.df.groupby('class').agg(['mean','std','count'])
        class_stats = {}
        for i in range(self.df['class'].nunique()):
            x = result_df.loc[i].values.tolist()
            if i not in class_stats:
                class_stats[i] = []
            class_stats[i].append(x[:3])
            class_stats[i].append(x[3:])
        return class_stats
    
    
    def gauss_prob(self,x,mean,std):
        e = exp(-((x-mean)**2 / (2 * std**2 )))
        return (1 / (sqrt(2 * pi) * std)) * e
    
    
    def class_prob(self,per_class_stat, element):
        total_rows = sum([per_class_stat[label][0][2] for label in per_class_stat])
        probabilities = {}
        for class_value, class_summaries in per_class_stat.items():
            probabilities[class_value] = per_class_stat[class_value][0][2]/float(total_rows)
            for i in range(len(class_summaries)):
                mean, std, count = class_summaries[i]
                probabilities[class_value] *= gauss_prob(element[i], mean, std)
        return probabilities

In [28]:
class_stats = stats_by_class(df_lda)
result = pd.DataFrame(class_prob(class_stats,df_lda.iloc[0,:2]),index=range(1))
for i in range(1,df_lda.shape[0]):
    ser = pd.Series(class_prob(class_stats,df_lda.iloc[i,:2]),name=str(i))
    result = result.append(ser)

In [34]:
result.sample(9)

Unnamed: 0,0,1,2
48,0.605094,3.4971640000000004e-22,1.3355609999999999e-36
46,0.6872538,2.3413779999999998e-21,1.803559e-35
91,6.368394e-31,0.704228,0.002976482
30,0.363913,9.910352e-16,2.047128e-29
13,0.5635474,1.0898380000000001e-18,3.3336130000000005e-33
104,3.17876e-65,1.350193e-06,0.361201
90,2.757687e-32,0.4520624,0.00105509
12,0.3820665,8.524231e-18,3.717175e-32
61,2.524427e-28,0.4208713,0.001030749


In [31]:
log_2_result = np.log2(result)

In [32]:
log_2_result

Unnamed: 0,0,1,2
0,-0.249208,-67.042992,-114.940093
1,-1.016389,-54.131809,-100.844752
2,-0.193216,-58.818569,-105.969426
3,-1.239334,-50.479407,-96.011135
4,-0.402727,-68.482649,-116.126469
...,...,...,...
145,-181.993063,-15.579540,-1.446725
146,-167.822917,-7.965296,-1.346717
147,-162.258741,-9.148570,-1.179383
148,-190.165926,-20.211450,-2.469821
