# Gaussian Bayes classifier

In this assignment we will use a Gaussian Bayes classifier to classify our data points.

# Import packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from sklearn.metrics import classification_report
from matplotlib import cm

# Load training data

Our data has a 2D feature $(x_1, x_2)$. Data from the two classes is in $\texttt{class1_train}$ and $\texttt{class2_train}$ respectively. Each file has two columns corresponding to the 2D feature.

In [None]:
# Download the per-class training sets and convert each DataFrame to a NumPy array.
# NOTE(review): read_csv is called with its default header handling, so the first
# line of each file is consumed as column names — confirm the files really start
# with a header row, otherwise one sample per class is silently dropped.
class1_train = pd.read_csv('https://raw.githubusercontent.com/shala2020/shala2020.github.io/master/Lecture_Materials/Assignments/MachineLearning/L3/class1_train').to_numpy()
class2_train = pd.read_csv('https://raw.githubusercontent.com/shala2020/shala2020.github.io/master/Lecture_Materials/Assignments/MachineLearning/L3/class2_train').to_numpy()

In [None]:
# Show the first ten rows of the class-1 training array as a quick sanity check.
class1_train[:10]

In [None]:
# Dimensions of the class-1 training array.
class1_train.shape

In [None]:
# Dimensions of the class-2 training array.
class2_train.shape

# Visualize training data
Generate 2D scatter plot of the training data. Plot the points from class 1 in red and the points from class 2 in blue.

In [None]:
import seaborn as sns
classes = ['class-1','class-2']

# Scatter plot of the training data: class 1 in red, class 2 in blue.
# Each class is drawn with a single vectorized call (column 0 on the x axis,
# column 1 on the y axis) instead of one scatter call per point.
plt.scatter(class1_train[:, 0], class1_train[:, 1], c="red", alpha=0.6,
            edgecolors='none', label=classes[0])
# The blue points must be read from class2_train; indexing class1_train here
# would redraw class 1 and never show class 2.
plt.scatter(class2_train[:, 0], class2_train[:, 1], c="blue", alpha=0.6,
            edgecolors='none', label=classes[1])

# NOTE(review): these axis labels look inherited from another dataset — the
# notebook text only describes the columns as features x1 and x2. Confirm.
plt.xlabel('Growth %')
plt.ylabel('Population')
plt.legend(loc='best', fontsize=16)
plt.show()


# Maximum likelihood estimate of parameters

We will model the likelihoods $P(\mathbf{x}|C_1)$ and $P(\mathbf{x}|C_2)$ as $\mathcal{N}(\mathbf{\mu_1},\Sigma_1)$ and $\mathcal{N}(\mathbf{\mu_2},\Sigma_2)$ respectively. The prior probabilities of the classes are denoted $P(C_1)=\pi_1$ and $P(C_2)=\pi_2$.

The maximum likelihood estimates of the parameters are as follows:
\begin{align*}
\pi_k &= \frac{\sum_{i=1}^N \mathbb{1}(t^i=k)}{N}\\
\mathbf{\mu_k} &= \frac{\sum_{i=1}^N \mathbb{1}(t^i=k)\mathbf{x}^i}{\sum_{i=1}^N \mathbb{1}(t^i=k)}\\
\Sigma_k &= \frac{\sum_{i=1}^N \mathbb{1}(t^i=k)(\mathbf{x}^i-\mathbf{\mu_k})(\mathbf{x}^i-\mathbf{\mu_k})^T}{\sum_{i=1}^N \mathbb{1}(t^i=k)}\\
\end{align*}

Here, $t^i$ is the target or class of $i^{th}$ sample. $\mathbb{1}(t^i=k)$ is 1 if $t^i=k$ and 0 otherwise.

Compute the maximum likelihood estimates of $\pi_1$, $\mu_1$, $\Sigma_1$ and $\pi_2$, $\mu_2$, $\Sigma_2$.

Also print these values


$\pi$ = `Prior` <br/>
$\mu$ and $\Sigma$ = `Likelihood`


In [None]:
def calculate_pi_1():
    """Return the prior of class 1.

    The prior is the number of rows in ``class1_train`` divided by the total
    number of rows in ``class1_train`` and ``class2_train``.
    """
    n_class1 = len(class1_train)
    n_class2 = len(class2_train)
    return n_class1 / (n_class1 + n_class2)

def calculate_pi_2():
    """Return the prior of class 2.

    The prior is the number of rows in ``class2_train`` divided by the total
    number of rows in ``class1_train`` and ``class2_train``.
    """
    n_class1 = len(class1_train)
    n_class2 = len(class2_train)
    return n_class2 / (n_class1 + n_class2)

def calculate_mu_1():
    """Return the column-wise mean of ``class1_train`` (the class-1 mean vector)."""
    return np.mean(class1_train, axis=0)

def calculate_mu_2():
    """Return the column-wise mean of ``class2_train`` (the class-2 mean vector)."""
    return np.mean(class2_train, axis=0)

def calculate_cov_1():
    """Return the maximum-likelihood covariance matrix of ``class1_train``.

    Rows of ``class1_train`` are treated as samples and columns as features
    (``rowvar=False``). The estimate is normalized by the number of samples N,
    as in the maximum-likelihood formula for Sigma_k, rather than by N - 1.

    Returns:
        A (features x features) covariance matrix.
    """
    # np.cov subtracts the mean itself, so no manual centering is required.
    # bias=True selects the 1/N normalization; the default is 1/(N - 1).
    return np.cov(class1_train, rowvar=False, bias=True)

def calculate_cov_2():
    """Return the maximum-likelihood covariance matrix of ``class2_train``.

    Rows of ``class2_train`` are treated as samples and columns as features
    (``rowvar=False``). The estimate is normalized by the number of samples N,
    as in the maximum-likelihood formula for Sigma_k, rather than by N - 1.

    Returns:
        A (features x features) covariance matrix.
    """
    # np.cov subtracts the mean itself, so no manual centering is required.
    # bias=True selects the 1/N normalization; the default is 1/(N - 1).
    return np.cov(class2_train, rowvar=False, bias=True)


# Report the estimated priors, mean vectors and covariance matrices.
print(f'pi_1 : {calculate_pi_1()} and pi_2 : {calculate_pi_2()}')
print(f'mu_1 : {calculate_mu_1()} and mu_2 : {calculate_mu_2()}')
print(f'sigma_1 : \n{calculate_cov_1()} \n sigma_2 : \n{calculate_cov_2()}')

In [None]:
## Another way to get pi, mu and sigma: compute them inline, without helpers.

# Total number of training samples across both classes.
n_total = len(class1_train) + len(class2_train)

# Priors: fraction of the training samples belonging to each class.
pi1 = len(class1_train) / n_total
pi2 = len(class2_train) / n_total

# Mean vectors: column-wise mean of each class.
mu1 = class1_train.mean(axis=0)
mu2 = class2_train.mean(axis=0)

# Covariances: rows are samples (rowvar=False). bias=True normalizes by N,
# matching the maximum-likelihood formula; np.cov divides by N - 1 by default.
sig1 = np.cov(class1_train, rowvar=False, bias=True)
sig2 = np.cov(class2_train, rowvar=False, bias=True)

print("Pi-1 {} and Pi-2 {}".format(pi1,pi2))
print("mu-1 {} and mu-2 {}".format(mu1,mu2))
print("sig-1 {} and sig-2 {}".format(sig1,sig2))


# Visualize the likelihood
Now that you have the parameters, let us visualize what the likelihood looks like.

1. Use $\texttt{np.mgrid}$ to generate points uniformly spaced in -5 to 5 along 2 axes
1. Use $\texttt{multivariate_normal.pdf}$ to compute the Gaussian likelihood for each class  
1. Use $\texttt{plot_surface}$ to plot the likelihood of each class.
1. Use $\texttt{contourf}$ to plot the likelihood of each class. 

You may find the code in the lecture notebook helpful.
 
For the plots, use $\texttt{cmap=cm.Reds}$ for class 1 and $\texttt{cmap=cm.Blues}$ for class 2. Use $\texttt{alpha=0.5}$ to overlay both plots together.

In [None]:
from matplotlib import cm

# Evaluation grid over [-5, 5) on both axes with step 0.01; `pos` stacks the
# two coordinate arrays along a trailing axis so pos[i, j] = (x[i, j], y[i, j]).
x, y = np.mgrid[-5:5:.01, -5:5:.01]
pos = np.dstack((x, y))

# Class-conditional Gaussians built from the estimated parameters.
mu1 = calculate_mu_1()
mu2 = calculate_mu_2()
cov1 = calculate_cov_1()
cov2 = calculate_cov_2()
rv1 = multivariate_normal(mean=mu1, cov=cov1)
rv2 = multivariate_normal(mean=mu2, cov=cov2)

# Evaluate each density on the grid once and reuse it for both plots.
grid_pdf1 = rv1.pdf(pos)
grid_pdf2 = rv2.pdf(pos)
class_layers = ((grid_pdf1, cm.Reds), (grid_pdf2, cm.Blues))

fig = plt.figure(figsize=(20, 10))

# Left panel: both likelihood surfaces overlaid in 3D.
ax = fig.add_subplot(121, projection='3d')
plt.xlabel('x')
plt.ylabel('y')
for layer, colormap in class_layers:
    ax.plot_surface(x, y, layer, cmap=colormap, alpha=0.5)

# Right panel: the same likelihoods as filled contours.
plt.subplot(122)
for layer, colormap in class_layers:
    plt.contourf(x, y, layer, cmap=colormap, alpha=0.5)

plt.colorbar()
plt.xlabel('x')
plt.ylabel('y')

# Visualize the posterior
Use the prior and the likelihood you've computed to obtain the posterior distribution for each class.

As in the case of the likelihood above, make similar surface and contour plots for the posterior.

In [None]:
# Evaluation grid over [-5, 5) on both axes with step 0.01; `pos` stacks the
# two coordinate arrays along a trailing axis so pos[i, j] = (x[i, j], y[i, j]).
x, y = np.mgrid[-5:5:.01, -5:5:.01]
pos = np.dstack((x, y))

# Class-conditional likelihoods on the grid.
likelihood1 = rv1.pdf(pos)
likelihood2 = rv2.pdf(pos)

# Bayes' rule: posterior = likelihood * prior / evidence, where the evidence
# is the sum of the two likelihood * prior terms.
joint1 = likelihood1 * pi1
joint2 = likelihood2 * pi2
evidence = joint1 + joint2
p1 = joint1 / evidence
p2 = joint2 / evidence

fig = plt.figure(figsize=(20, 10))

# First panel: both posterior surfaces overlaid in 3D.
ax = fig.add_subplot(131, projection='3d')
plt.xlabel('x')
plt.ylabel('y')
for posterior, colormap in ((p1, cm.Reds), (p2, cm.Blues)):
    ax.plot_surface(x, y, posterior, cmap=colormap, alpha=0.5)

# Second panel: the same posteriors as filled contours.
plt.subplot(132)
for posterior, colormap in ((p1, cm.Reds), (p2, cm.Blues)):
    plt.contourf(x, y, posterior, cmap=colormap, alpha=0.5)
plt.xlabel('x')
plt.ylabel('y')

# Decision boundary
1. Decision boundary can be obtained by $P(C_2|x)>P(C_1|x)$ in python. Use $\texttt{contourf}$ to plot the decision boundary. Use $\texttt{cmap=cm.Blues}$ and $\texttt{alpha=0.5}$
1. Also overlay the scatter plot of train data points from the 2 classes on the same plot. Use red color for class 1 and blue color for class 2 

In [None]:
# Decision mask on the grid: True wherever class 2 has the larger posterior.
des = np.greater(p2, p1)

# Filled contours, drawn in order: class-1 posterior, class-2 posterior, then
# the decision mask on top with a lighter alpha.
contour_layers = (
    (p1, cm.Reds, 0.5),
    (p2, cm.Blues, 0.5),
    (des, cm.Greens, 0.3),
)
for layer, colormap, opacity in contour_layers:
    plt.contourf(x, y, layer, cmap=colormap, alpha=opacity)

plt.xlabel('x')
plt.ylabel('y')

# Overlay the training points: class 1 as red stars, class 2 as blue pluses.
plt.scatter(class1_train.T[0], class1_train.T[1], marker='*', color='red')
plt.scatter(class2_train.T[0], class2_train.T[1], marker='+', color='blue')

# Test Data
Now let's use our trained model to classify test data points

1. $\texttt{test_data}$ contains the $x1,x2$ features of different data points
1. $\texttt{test_label}$ contains the true class of the data points. 0 means class 1. 1 means class 2.  
1. Classify the test points based on whichever class has higher posterior probability for each data point
1. Use $\texttt{classification_report}$ to test the classification performance

In [None]:
# Download the test set and convert the DataFrame to a NumPy array.
# NOTE(review): read_csv is called with its default header handling, so the first
# line of the file is consumed as column names — confirm the file really starts
# with a header row, otherwise one test sample is silently dropped.
test = pd.read_csv('https://raw.githubusercontent.com/shala2020/shala2020.github.io/master/Lecture_Materials/Assignments/MachineLearning/L3/test').to_numpy()
# The first two columns are taken as the features, the third column as the label.
test_data, test_label = test[:,:2], test[:,2]

# Display the feature array.
test_data


In [None]:
## Likelihood: density of every test point under each class's fitted Gaussian
l1 = rv1.pdf(test_data)
l2 = rv2.pdf(test_data)

In [None]:
## Posterior: likelihood * prior for each class, normalized by their sum
joint1_test = l1 * pi1
joint2_test = l2 * pi2
evidence_test = joint1_test + joint2_test
p1_test = joint1_test / evidence_test
p2_test = joint2_test / evidence_test

In [None]:
## Decision rule: True where the class-2 posterior exceeds the class-1 posterior
test_data_predict=p2_test>p1_test
test_data_predict

In [None]:
# Turn the boolean decisions into integer labels: True -> 1, False -> 0.
# A direct dtype conversion avoids comparing the array against True.
test_data_predict = test_data_predict.astype(int)
test_data_predict

In [None]:
from sklearn.metrics import classification_report,accuracy_score

In [None]:
# Accuracy of the predicted labels against the true test labels.
print(accuracy_score(test_label,test_data_predict))

In [None]:
# Text report from sklearn's classification_report for the predicted vs. true test labels.
print(classification_report(test_label,test_data_predict))