In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff  # This library helps us to plot the distribution function of data
from scipy.stats import multivariate_normal as mvn  # Evaluates normal distributions

# **Data**
We'll generate random points (as we did last time) for our dataset, but this time every point is labeled with a class from the start.

In [None]:
# Dataset
SEED = 42  #Fixed seed so that "Restart & Run All" always generates the same points
np.random.seed(SEED)

D = 2 #Number of dimensions
K = 3 #Number of classes
N = int(K * 1e3)  #Number of observations
n_per_class = N // K  #Observations generated for each class

#Create random observations for every class: a standard-normal cloud shifted to a different centre
x0 = np.random.randn(n_per_class, D) + np.array([2,2])
x1 = np.random.randn(n_per_class, D) + np.array([0,-2])
x2 = np.random.randn(n_per_class, D) + np.array([-2,2])

#Stack everything together vertically as a single dataset
x = np.vstack((x0,x1,x2))
#Labels follow the stacking order: first all 0s, then all 1s, then all 2s
y = np.array([0]*n_per_class + [1]*n_per_class + [2]*n_per_class)

print(x.shape,y.shape)
x, y

(3000, 2) (3000,)


(array([[ 1.78862276,  2.33369042],
        [ 1.52030085,  2.39366114],
        [ 0.7737928 ,  2.01164325],
        ...,
        [-1.02162132,  2.68837381],
        [-0.89070391,  2.20481085],
        [-1.83808973,  4.29664252]]), array([0, 0, 0, ..., 2, 2, 2]))

In [None]:
# Scatter of both features, coloured by class label; the colour bar is hidden
scatter_args = dict(x = x[:,0],
                    y = x[:,1],
                    color = y,
                    title = 'Random dataset (dataset 1)')
fig = px.scatter(**scatter_args)
fig.update_coloraxes(showscale=False)
fig.show()

# **KNN**
K Nearest Neighbors is a supervised learning algorithm that evaluates one data point with its $k$ nearest neighbors to assign it a class.

The distance will be based on the squared distance between two points (you can use different distance functions, though).

## Step 1: Select one datapoint and get the distance with the rest of points

In [None]:
# Arbitrary observation used as the worked example.
# NOTE: this rebinds `x1`, which the dataset cell used for the class-1 samples.
x1 = x[1027]  #Random point for the example
x1

array([-2.02778702, -0.98827651])

In [None]:
# Squared Euclidean distance from x1 to every observation (x1 is broadcast over the rows of x)
offsets = x - x1
dist_sqr = np.square(offsets).sum(axis=1)
dist_sqr

array([25.60044789, 24.02642981, 16.84836802, ..., 14.53012697,
       11.48876486, 27.96635415])

## Step 2: Select the $k$ closest neighbours of your point

The ```np.argsort``` function returns the indices that would sort the array in ascending order, i.e. from the smallest distance to the largest.


In [None]:
#Indices of the observations ordered from the closest to the furthest from x1
idx = dist_sqr.argsort()
idx

array([1027, 1657, 2257, ...,  106,  238,  406])

We ignore the first position because it will be the exact same point that we´re evaluating

In [None]:
#Keep only the k closest
k = 10  #We'll work with the 10 closest neighbors for this example

# Position 0 of the sorted order is the evaluated point itself (distance 0),
# so the k nearest neighbors are positions 1..k.
# Sorting dist_sqr again here, instead of slicing the previous `idx` in place,
# keeps the cell idempotent: re-running it always yields the same k indices.
idx = np.argsort(dist_sqr)[1:k+1]
idx

array([1657, 2257, 1843, 1791, 1679, 2819, 1876, 1715, 1487, 1778])

In [None]:
# Coordinates and class labels of the selected neighbors
neighbors, neigh_y = x[idx], y[idx]
neighbors, neigh_y

(array([[-1.91873756, -0.88409188],
        [-1.85920974, -0.88049224],
        [-1.82667138, -0.82407671],
        [-1.75462257, -0.95499807],
        [-1.66534663, -0.90174283],
        [-1.95915252, -1.36731075],
        [-1.66757923, -1.17421459],
        [-1.61436101, -1.05096383],
        [-2.23312416, -1.35453491],
        [-1.68307461, -1.24573199]]), array([1, 2, 1, 1, 1, 2, 1, 1, 1, 1]))

In [None]:
# Three layers, drawn in this order:
#   1. every datapoint as a faint grey backdrop
#   2. the selected neighbors, coloured by their class
#   3. the evaluated point in black
layers = [
    go.Scatter(x = x[:,0],
               y = x[:,1],
               mode = 'markers',
               marker = dict(color ='grey', opacity=0.2),
               showlegend = False),
    go.Scatter(x = neighbors[:,0],
               y = neighbors[:,1],
               mode = 'markers',
               marker = dict(color =neigh_y),
               showlegend = False),
    go.Scatter(x = [x1[0]],
               y = [x1[1]],
               mode = 'markers',
               marker = dict(color = 'black'),
               showlegend = False),
]

fig = go.Figure()
for layer in layers:
  fig.add_trace(layer)

fig.show()

## Step 3: Assign to the class that occurred the most from the neighbors
The original algorithm just counts the number of occurrences of each class among the neighbors and assigns the most frequent one. However, we'll weight every vote by the inverse of the neighbor's distance, so that the closest neighbors have more influence.

In [None]:
# Weight of each neighbor: the inverse of its distance to the evaluated point
epsilon = 1e-3  #Small offset so a neighbor at distance 0 never causes a division by zero
neighbor_dist = np.sqrt(dist_sqr[idx] + epsilon)
gamma_k = 1 / neighbor_dist
gamma_k

array([6.48937317, 4.93649642, 3.823342  , 3.61017254, 2.67403469,
       2.58736195, 2.45942654, 2.38466768, 2.37484435, 2.31800929])

In [None]:
# Regular (unweighted) vote: number of neighbors that belong to each class
np.bincount(y[idx])

array([0, 8, 2])

In [None]:
# Weighted vote: each neighbor adds its weight instead of 1, which also breaks ties between classes with the same count
np.bincount(y[idx], weights=gamma_k)

array([ 0.        , 26.13387025,  7.52385837])

In [None]:
# Get the class of the point: the label whose neighbors add up to the largest total weight
weighted_votes = np.bincount(y[idx], weights=gamma_k)
y_hat = weighted_votes.argmax()
y_hat

1

In [None]:
# Add the evaluated point, labeled with its predicted class, after the neighbors
# so that it is drawn together with them.
# Both arrays are rebuilt from x[idx] / y[idx] instead of extending `neighbors`
# in place, so re-running this cell does not keep appending the same point.
neighbors = np.concatenate((x[idx], np.array([x1])))
neigh_y = np.concatenate((y[idx], np.array([y_hat])))
neighbors, neigh_y

(array([[-1.91873756, -0.88409188],
        [-1.85920974, -0.88049224],
        [-1.82667138, -0.82407671],
        [-1.75462257, -0.95499807],
        [-1.66534663, -0.90174283],
        [-1.95915252, -1.36731075],
        [-1.66757923, -1.17421459],
        [-1.61436101, -1.05096383],
        [-2.23312416, -1.35453491],
        [-1.68307461, -1.24573199],
        [-2.02778702, -0.98827651]]), array([1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1]))

In [None]:
# Backdrop with every datapoint in faint grey
backdrop = go.Scatter(x = x[:,0],
                      y = x[:,1],
                      mode = 'markers',
                      marker = dict(color ='grey', opacity=0.2),
                      showlegend = False)

# Neighbors plus the evaluated point, coloured by class
highlighted = go.Scatter(x = neighbors[:,0],
                         y = neighbors[:,1],
                         mode = 'markers',
                         marker = dict(color =neigh_y),
                         showlegend = False)

fig = go.Figure()
fig.add_trace(backdrop)
fig.add_trace(highlighted)

fig.show()

## Step 4: Repeat for every single datapoint

In [None]:
class KNNClassifier():
  """Brute-force k-nearest-neighbors classifier with distance-weighted voting.

  Every neighbor votes for its own class with weight
  1 / sqrt(squared_distance + epsilon); the class with the largest total
  weight is assigned to the evaluated point.
  """

  def fit(self, x, y):
    """Store the training set (KNN has no real training step).

    Args:
      x: training observations, one row per observation.
      y: class label of every row of ``x``; must be non-negative integers
        because the vote is counted with ``np.bincount``.
    """
    self.x = np.asarray(x)
    self.y = np.asarray(y)

  def predict(self, x, k, epsilon=1e-3, exclude_self=True):
    """Predict the class of every row of ``x``.

    Args:
      x: observations to classify, one row per observation.
      k: number of neighbors that take part in the vote (at least 1).
      epsilon: small value added to the squared distance so a neighbor at
        distance 0 does not cause a division by zero.
      exclude_self: when True (default, the original behavior) the closest
        training point is skipped, which is what you want when ``x`` is the
        training set itself and each point would otherwise vote for itself.
        Pass False when classifying points that are not in the training set,
        otherwise their true nearest neighbor is thrown away.

    Returns:
      Float array of length ``len(x)`` holding the predicted class labels.

    Raises:
      ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
      raise ValueError("k must be at least 1")

    x = np.asarray(x)
    first = 1 if exclude_self else 0  #Position of the first neighbor allowed to vote
    N = len(x)  #Number of rows
    y_hat = np.zeros(N)

    for i in range(N):
      dist_sqr = np.sum((self.x - x[i])**2, axis=1) #Get the squared distance of each point
      idxt = np.argsort(dist_sqr)[first:first+k] #Get the indexes of the K nearest neighbors
      gamma_k = 1 / (np.sqrt(dist_sqr[idxt]+epsilon)) #Get the weights

      # Weighted vote: the class with the largest accumulated weight wins
      y_hat[i] = np.bincount(self.y[idxt], weights=gamma_k).argmax()

    return y_hat

In [None]:
# Fit on the whole dataset and classify those same points using the 10 nearest neighbors
knn = KNNClassifier()
knn.fit(x,y)

y_hat = knn.predict(x, 10)

In [None]:
# Same scatter twice: true labels on the first row, KNN predictions on the second
fig = make_subplots(rows=2, cols=1, subplot_titles=("Original", "KNN"))

for subplot_row, point_colors in enumerate((y, y_hat), start=1):
  fig.add_trace(go.Scatter(x = x[:,0],
                           y = x[:,1],
                           mode = 'markers',
                           marker = dict(color = point_colors),
                           showlegend = False),
                row = subplot_row,
                col = 1)

fig.update_layout(height=500, width=800)
fig.show()

## Evaluate the metrics

In [None]:
def metrics(y_hat, y):
  """Print binary classification metrics and plot the confusion matrix.

  The problem is evaluated as a binary one: label 0 is the negative class and
  every other label counts as positive, so with more than two classes all the
  non-zero classes are collapsed into a single "positive" class.

  Args:
    y_hat: predicted labels, same length as ``y``.
    y: true labels.

  Prints accuracy, precision, recall and F1-score, then shows a heatmap of the
  confusion matrix. A metric whose denominator is zero (e.g. nothing was
  predicted positive) is reported as 0.0 instead of raising
  ZeroDivisionError. Returns nothing.
  """
  y = np.asarray(y)
  y_hat = np.asarray(y_hat)
  total = len(y)

  actual_pos = y != 0    # True label is positive (anything but 0)
  pred_pos = y_hat != 0  # Predicted label is positive

  # Count the correct and incorrect classifications
  tp = int(np.sum(actual_pos & pred_pos))    # True positive (original and predicted != 0)
  tn = int(np.sum(~actual_pos & ~pred_pos))  # True negative (original and predicted = 0)
  fp = int(np.sum(~actual_pos & pred_pos))   # False positive (original = 0 but predicted != 0)
  fn = int(np.sum(actual_pos & ~pred_pos))   # False negative (original != 0 but predicted = 0)

  # Get the metrics
  acc = (tp + tn) / total if total else 0.0 # Accuracy
  prec = tp / (tp + fp) if (tp + fp) else 0.0 # Precision
  rec = tp / (tp + fn) if (tp + fn) else 0.0  # Recall
  f = 2 * ((prec * rec) / (prec + rec)) if (prec + rec) else 0.0 # F1-score
  print(f"Accuracy: {acc}")
  print(f"Precision: {prec}")
  print(f"Recall: {rec}")
  print(f"F1-score: {f}")

  # Create the confusion matrix
  # Rows of z follow the `y` categories (true label), columns follow the `x`
  # categories (predicted label):
  #   true=True,  predicted=False -> FN    true=True,  predicted=True -> TP
  #   true=False, predicted=False -> TN    true=False, predicted=True -> FP
  fig = go.Figure()

  fig.add_trace(go.Heatmap(z = [[fn, tp], [tn, fp]],
                           x = ['False', 'True'],
                           y = ['True', 'False'],
                           text = [[f'FN:{fn}', f'TP:{tp}'], [f'TN:{tn}', f'FP:{fp}']],
                           texttemplate = '%{text}',
                           colorscale = 'blues',
                           showscale = False))

  fig.update_layout(title = 'Confusion Matrix',
                    yaxis_title = 'True Label',
                    xaxis_title = 'Predicted Label')
  fig.show()

In [None]:
# Evaluate the KNN predictions; metrics() treats label 0 as negative and every other label as positive
metrics(y_hat, y)

Accuracy: 0.977
Precision: 0.9805873568939771
Recall: 0.985
F1-score: 0.9827887253679222


## How many neighbors?
Just like with the number of clusters in K-Means, there is no easy way to determine the optimal number of neighbors $k$. In this particular case, you just need to vary the number of neighbors and see which one gives the best fit.

In [None]:
# Predictions for every neighborhood size from 1 to 30, stored in that order
neighbor_counts = range(1,31)
y_hats = []

for n_neighbors in neighbor_counts:
  print(f"Checking {n_neighbors} neighbors...")
  predictions = knn.predict(x, n_neighbors)
  y_hats.append(predictions)

Checking 1 neighbors...
Checking 2 neighbors...
Checking 3 neighbors...
Checking 4 neighbors...
Checking 5 neighbors...
Checking 6 neighbors...
Checking 7 neighbors...
Checking 8 neighbors...
Checking 9 neighbors...
Checking 10 neighbors...
Checking 11 neighbors...
Checking 12 neighbors...
Checking 13 neighbors...
Checking 14 neighbors...
Checking 15 neighbors...
Checking 16 neighbors...
Checking 17 neighbors...
Checking 18 neighbors...
Checking 19 neighbors...
Checking 20 neighbors...
Checking 21 neighbors...
Checking 22 neighbors...
Checking 23 neighbors...
Checking 24 neighbors...
Checking 25 neighbors...
Checking 26 neighbors...
Checking 27 neighbors...
Checking 28 neighbors...
Checking 29 neighbors...
Checking 30 neighbors...


In [None]:
def accuracy(y,y_hat):
  """Return the fraction of predictions in y_hat that match the true labels in y."""
  matches = (y == y_hat)
  return np.mean(matches)

In [None]:
# Accuracy obtained with each number of neighbors.
# A comprehension is used so that no loop variable overwrites the global `y_hat`.
accuracies = [accuracy(y, predictions) for predictions in y_hats]

# y_hats[0] was computed with 1 neighbor, y_hats[1] with 2, and so on;
# the x axis is derived from the list length so it cannot drift out of sync
fig = px.line(x = range(1, len(accuracies) + 1),
              y = accuracies,
              markers = True,
              title = 'KNN Accuracy')

fig.update_layout(xaxis_title = "K Neighbors",
                  yaxis_title = "Accuracy")
fig.show()

# **Naive Bayes Classification**
Another supervised approach is based on the Bayes Theorem, which determines the probability of an event happening, based on particular situations. The overall equation is described as:

$P(A|B)=\frac{P(B|A)P(A)}{P(B)}$

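The classifier built on this equation is called *naive* because it assumes that the features (here, the words of a message) are independent of each other given the class, so their probabilities can simply be multiplied regardless of order or context. We'll see it with an example.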

If you want to understand more how the equation is created, you can check this [video](https://www.youtube.com/watch?v=9wCnvr7Xw4E).

## Naive Classification Example
This example comes from this [video](https://www.youtube.com/watch?v=O2L2Uv9pdDA).

Suppose you have a total of 12 letters, from which 8 are letters from people you know, and the other 4 are spam.

Our objective will be to classify a letter based on its message. So first, we´ll need the letters.

In [None]:
# Words found in the normal letters, flattened into one array.
# NOTE(review): each source row appears to hold the words of one letter
# (8 rows here, matching the 8 labels equal to 0 below) - confirm with the author.
normal = np.array(['Dear', 'Friend', 'Lunch',
                  'Friend', 'Lunch', 'Dear',
                  'Dear',
                  'Friend',
                  'Dear', 'Lunch',
                  'Dear',
                  'Dear', 'Friend', 'Money',
                  'Dear', 'Dear', 'Friend', 'Lunch'])

# Words found in the spam letters (4 rows, matching the 4 labels equal to 1 below)
spam =  np.array(['Friend', 'Money', 'Friend','Money',
                  'Money', 'Friend',
                  'Money', 'Dear', 'Money', 'Friend',
                  'Money'])

# One label per letter: 0 = Normal, and 1 = Spam
labels = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1])

### Step 1
We need to count the occurrences of every single word from all the letters, and see how much they´re used in the two situations (normal or spam)

In [None]:
# Each result is a tuple (sorted unique words, number of occurrences of each word)
normal_counts = np.unique(normal, return_counts=True)
spam_counts = np.unique(spam, return_counts=True)

normal_counts, spam_counts

((array(['Dear', 'Friend', 'Lunch', 'Money'], dtype='<U6'),
  array([8, 5, 4, 1])),
 (array(['Dear', 'Friend', 'Money'], dtype='<U6'), array([1, 4, 6])))

In [None]:
# Word frequencies side by side: normal letters on the left, spam letters on the right.
# The subplot titles name the two letter types ("Span" was a typo for "Spam").
fig = make_subplots(rows=1, cols=2, subplot_titles=("Normal", "Spam"))

fig.add_trace(go.Histogram(x = normal,
                           showlegend = False),
              row = 1,
              col = 1)

fig.add_trace(go.Histogram(x = spam,
                           showlegend = False),
              row = 1,
              col = 2)

fig.update_layout(height=500, width=800)
fig.show()

### Step 2
Get the probabilities of every word, in every type of letter. Get the general probabilities of getting a normal and spam letter individually as well.

In [None]:
# Letter probabilities (Prior probability): share of letters of each type
n_letters = len(labels)
p_original = len(labels[labels==0]) / n_letters
p_spam = len(labels[labels==1]) / n_letters

# Word prob. in original
# np.unique returned the words sorted, so the counts are ordered Dear, Friend, Lunch, Money
p_dear_orig, p_friend_orig, p_lunch_orig, p_money_orig = normal_counts[1] / sum(normal_counts[1])

# Word prob. in spam
# 'Lunch' never appears in spam, so only Dear, Friend and Money have a count
p_dear_spam, p_friend_spam, p_money_spam = spam_counts[1] / sum(spam_counts[1])
p_lunch_spam = 0

### Step 3
Given a new message: *Dear Friend*, we will use the probabilities to determine if its a normal message or spam. We just need to obtain two probabilities:


*   $P(Original|Dear\ Friend)\propto P(Original)\times P(Dear|Original)\times P(Friend|Original)$
*   $P(Spam|Dear\ Friend)\propto P(Spam)\times P(Dear|Spam)\times P(Friend|Spam)$

After this, we just assign the class with the highest probability.



In [None]:
# Score of each class for the message "Dear Friend":
# prior of the class times the probability of every word under that class
p_orig_newMsg = p_original * p_dear_orig * p_friend_orig
p_spam_newMsg = p_spam * p_dear_spam * p_friend_spam

print(f"Prob. of being original = {p_orig_newMsg:0.2f}")
print(f"Prob. of being spam = {p_spam_newMsg:0.2f}")

Prob. of being original = 0.08
Prob. of being spam = 0.01


In this case, we can see that *Dear Friend* is classified as a normal message, which seems to make sense.

However, what will happen if the message is *Lunch Money Money Money*?



In [None]:
# Score of each class for "Lunch Money Money Money"; 'Money' appears three times, hence the cube
p_orig_newMsg = p_original * p_lunch_orig * (p_money_orig**3)
p_spam_newMsg = p_spam * p_lunch_spam * (p_money_spam**3)

print(f"Prob. of being original = {p_orig_newMsg}")
print(f"Prob. of being spam = {p_spam_newMsg}")

Prob. of being original = 2.540263171264542e-05
Prob. of being spam = 0.0


If any word of the message never appears in one of the classes (here, "Lunch" never appears in spam), its probability is 0 and the whole product for that class collapses to 0, no matter what the other words are. A common fix is to add an ```epsilon``` value to every word count so that no probability is exactly 0.

In this case, that epsilon will be equal to 1, but in practice it is often a much smaller value.

In [None]:
# Additive (Laplace) smoothing: add `alpha` to every count and add `alpha` times the
# number of possible outcomes to the denominator, so the results are still
# probabilities (each group sums to 1) and none of them is exactly 0.
alpha = 1  # The "epsilon" of this example
n_classes = 2  # Normal and spam
n_words = len(set(normal_counts[0]) | set(spam_counts[0]))  # Size of the vocabulary

# Word -> number of occurrences, so a word missing from a class simply counts as 0
normal_freq = dict(zip(*normal_counts))
spam_freq = dict(zip(*spam_counts))

# Letter probabilities (Prior probability)
# The numerator is parenthesized: `count + 1 / total` would add 1/total to the raw count
p_original = (len(labels[labels==0]) + alpha) / (len(labels) + alpha * n_classes)
p_spam = (len(labels[labels==1]) + alpha) / (len(labels) + alpha * n_classes)

# Word prob. in original
normal_total = sum(normal_counts[1]) + alpha * n_words
p_dear_orig = (normal_freq.get('Dear', 0) + alpha) / normal_total
p_friend_orig = (normal_freq.get('Friend', 0) + alpha) / normal_total
p_lunch_orig = (normal_freq.get('Lunch', 0) + alpha) / normal_total
p_money_orig = (normal_freq.get('Money', 0) + alpha) / normal_total

# Word prob. in spam
spam_total = sum(spam_counts[1]) + alpha * n_words
p_dear_spam = (spam_freq.get('Dear', 0) + alpha) / spam_total
p_friend_spam = (spam_freq.get('Friend', 0) + alpha) / spam_total
p_lunch_spam = (spam_freq.get('Lunch', 0) + alpha) / spam_total
p_money_spam = (spam_freq.get('Money', 0) + alpha) / spam_total

In [None]:
# Score "Lunch Money Money Money" again, now with the smoothed probabilities from the previous cell
p_orig_newMsg = p_original * p_lunch_orig * (p_money_orig**3)
p_spam_newMsg = p_spam * p_lunch_spam * (p_money_spam**3)

print(f"Prob. of being original = {p_orig_newMsg:0.3f}")
print(f"Prob. of being spam = {p_spam_newMsg:0.3f}")

Prob. of being original = 0.003
Prob. of being spam = 0.096


## Continuous Data Implementation
The previous example used discrete data (punctual observations) for the classification.

When dealing with continuous data, it is possible to express the probability of an occurrence based on the Normal Probabilistic Density Function (PDF) described as

$f(X_{i}=x|Y=y)=\frac{1}{\sqrt{2\pi \sigma^{2}}}e^{-\frac{(x-\mu)^{2}}{2 \sigma^{2}}}$

Where $\mu$ represents the mean and $\sigma^{2}$ the variance of the data distribution.

The main advantage of this distribution is that there are already functions capable of determining such probabilities by knowing just these two values. At the same time, Naive Bayes assumes independence of the features, which means the covariance matrices are diagonal matrices, which helps us to create an equation system through matrices to get every single probability.

Finally, multiplying many small probabilities quickly underflows to zero in floating point, so we apply a logarithm to every single probability. By doing this, we just need to add the log-probabilities, which is numerically stable and keeps the same ranking of the classes.

In [None]:
class NaiveBayes():
  """Gaussian Naive Bayes classifier.

  Every feature is modeled, per class, as an independent normal distribution,
  so each class is described by a mean vector and a vector of per-feature
  variances (a diagonal covariance).
  """

  # Obtain all your probabilities
  def fit(self, x, y, epsilon=1e-3):
    """Estimate the per-class mean, variances and prior.

    Args:
      x: observations, one row per observation and one column per feature.
      y: class label of every row; the labels are cast to int and may be any
        integers (they do not need to be 0..K-1).
      epsilon: value added to every variance so it is never exactly 0.
    """
    x = np.asarray(x)
    y = np.asarray(y).astype(int)

    self.likelihoods = dict() # For each element belonging to every class
    self.priors = dict()  # For every individual class probability

    #Determine your classes
    self.classes = np.unique(y)  # Sorted labels; column j of the score matrix belongs to classes[j]
    self.K = set(self.classes)

    #Assign the x values to every given class
    for k in self.classes:
      x_k = x[y==k,:]

      #Populate likelihoods
      self.likelihoods[k] = {"Mean" : x_k.mean(axis=0),
                             "Covariance" : x_k.var(axis=0) + epsilon}  # Only the per-feature variances (features assumed independent)
      #Populate priors (probability of the class itself, P(y))
      self.priors[k] = len(x_k) / len(x)

  def predict(self, x):
    """Return the most probable class label for every row of ``x``.

    The score of a class is the log-density of the observation under that
    class plus the log of the class prior; logs are added instead of
    multiplying probabilities.
    """
    x = np.asarray(x)
    N = x.shape[0]  #Number of observations

    #Score of every observation (rows) for every class (columns)
    p_hat = np.zeros((N,len(self.classes)))

    for col, k in enumerate(self.classes):
      l = self.likelihoods[k]
      # A 1-D "Covariance" is interpreted by scipy as the diagonal of the covariance matrix
      p_hat[:,col] = mvn.logpdf(x, l['Mean'], l['Covariance']) + np.log(self.priors[k])

    # Map the winning column back to its class label
    return self.classes[p_hat.argmax(axis=1)]

Going back to our first dataset, we can then "train" a Naive Bayes classifier to label each datapoint.

In [None]:
# Fit Naive Bayes on the first dataset and classify those same points
nb = NaiveBayes()
nb.fit(x,y)
y_hat = nb.predict(x)

In [None]:
# True labels on the first row, Naive Bayes predictions on the second.
# The second title names the model that produced y_hat in the previous cell.
fig = make_subplots(rows=2, cols=1, subplot_titles=("Original", "Naive Bayes"))

fig.add_trace(go.Scatter(x = x[:,0],
                         y = x[:,1],
                         mode = 'markers',
                         marker = dict(color = y),
                         showlegend = False),
              row = 1,
              col = 1)

fig.add_trace(go.Scatter(x = x[:,0],
                         y = x[:,1],
                         mode = 'markers',
                         marker = dict(color = y_hat),
                         showlegend = False),
              row = 2,
              col = 1)

fig.update_layout(height=500, width=800)
fig.show()

In [None]:
# Evaluate the Naive Bayes predictions; metrics() treats label 0 as negative and every other label as positive
metrics(y_hat, y)

Accuracy: 0.9793333333333333
Precision: 0.9835329341317365
Recall: 0.9855
F1-score: 0.9845154845154845


## Conclusion
It seems that the Naive Bayes approach achieved a slightly better classification than KNN, since it evaluates every point through probabilistic inference rather than just the distance between points.

This method is called the **naive** version of the Bayes Theorem because it assumes that the value of one particular feature does not affect the others, meaning the features are treated as independent of each other given the class.

You can get more information about the Naive classification [here](https://towardsdatascience.com/naive-bayes-explained-9d2b96f4a9c0).

# **Gaussian Bayes Classification**

## Gaussian Classification Example
In order to evaluate the context and importance of every single probability, the best approach is to evaluate every single observation inside a Gaussian distribution. This avoids having to assume independence between variables.

We will follow now the example from this [video](https://www.youtube.com/watch?v=H3EjCKtlVog). In this case, we will assume a Gaussian distribution for the data.

Based on the products consumed by people inside a movie theater, we´ll determine if they love Star Wars (0) or not (1).

In [None]:
# Import the dataset (the path is relative to the notebook's working directory)
movie_df = pd.read_csv('data/gaussian_movies.csv')
movie_df

Unnamed: 0,Popcorn (g),Soda (ml),Candy (g),Loves SW
0,25.727634,250.157250,14.432435,0
1,21.557809,250.065078,15.909503,0
2,27.400900,250.141092,15.065682,0
3,27.424493,249.810856,14.425514,0
4,21.870992,249.709464,14.068983,0
...,...,...,...,...
94,20.500352,244.995105,9.409072,1
95,19.859327,245.562050,12.090903,1
96,20.281539,244.445921,14.675605,1
97,19.901450,245.694587,8.674708,1


### Step 1
Get and compare the Gaussian distribution of each feature

In [None]:
# Split every product by group: label 0 = loves Star Wars, label 1 = does not
loves_mask = movie_df['Loves SW'] == 0
dislikes_mask = movie_df['Loves SW'] == 1

pop_love = movie_df.loc[loves_mask, 'Popcorn (g)'].to_numpy()
pop_no = movie_df.loc[dislikes_mask, 'Popcorn (g)'].to_numpy()
soda_love = movie_df.loc[loves_mask, 'Soda (ml)'].to_numpy()
soda_no = movie_df.loc[dislikes_mask, 'Soda (ml)'].to_numpy()
candy_love = movie_df.loc[loves_mask, 'Candy (g)'].to_numpy()
candy_no = movie_df.loc[dislikes_mask, 'Candy (g)'].to_numpy()

In [None]:
# Create the Gaussian plots
# Popcorn consumption of both groups on one distribution plot (histogram shown, rug hidden)
distplot_options = dict(group_labels = ['Loves SW', 'Doesn´t love SW'],
                        show_hist = True,
                        show_rug = False)
fig = ff.create_distplot([pop_love, pop_no], **distplot_options)
fig.update_layout(title = 'Popcorn')
fig.show()

In [None]:
# Soda consumption of both groups on one distribution plot (histogram shown, rug hidden)
distplot_options = dict(group_labels = ['Loves SW', 'Doesn´t love SW'],
                        show_hist = True,
                        show_rug = False)
fig = ff.create_distplot([soda_love, soda_no], **distplot_options)
fig.update_layout(title = 'Soda')
fig.show()

In [None]:
# Candy consumption of both groups on one distribution plot (histogram shown, rug hidden)
distplot_options = dict(group_labels = ['Loves SW', 'Doesn´t love SW'],
                        show_hist = True,
                        show_rug = False)
fig = ff.create_distplot([candy_love, candy_no], **distplot_options)
fig.update_layout(title = 'Candy')
fig.show()

### Step 2
Given a new observation, we need to find its location on the Gaussian plots.

Assume in this case that the new person consumed the following:


*   20g of popcorn
*   248ml of soda
*   13g of candy



In [None]:
# Create the Gaussian plots
# Popcorn distributions with a vertical line at the new customer's 20g
distplot_options = dict(group_labels = ['Loves SW', 'Doesn´t love SW'],
                        show_hist = True,
                        show_rug = False)
fig = ff.create_distplot([pop_love, pop_no], **distplot_options)
fig.add_vline(x=20)
fig.update_layout(title = 'Popcorn')
fig.show()

In [None]:
# Soda distributions with a vertical line at the new customer's 248ml
distplot_options = dict(group_labels = ['Loves SW', 'Doesn´t love SW'],
                        show_hist = True,
                        show_rug = False)
fig = ff.create_distplot([soda_love, soda_no], **distplot_options)
fig.add_vline(x=248)
fig.update_layout(title = 'Soda')
fig.show()

In [None]:
# Candy distributions with a vertical line at the new customer's 13g
distplot_options = dict(group_labels = ['Loves SW', 'Doesn´t love SW'],
                        show_hist = True,
                        show_rug = False)
fig = ff.create_distplot([candy_love, candy_no], **distplot_options)
fig.add_vline(x=13)
fig.update_layout(title = 'Candy')
fig.show()

### Step 3
Get the corresponding probabilities.

Since we're still using a normal distribution, the logic behind getting the probability density of our data remains the same as for the Naive classifier. The only difference is that the spread of the data is now described by the sample covariance of the Gaussian distribution rather than by the plain per-feature variance.

You can get more insight about the process [here](https://towardsdatascience.com/gaussian-naive-bayes-4d2895d139a). In the meantime, it is enough to know that the sample covariance of a Gaussian distribution is estimated as

$\Sigma=\frac{1}{N-1}\left(x-\mu \right)^{T}\left(x-\mu \right)$

In [None]:
# Prior probabilities: share of customers in each group (0 = loves Star Wars, 1 = does not)
n_customers = len(movie_df)
p_likes = int((movie_df['Loves SW'] == 0).sum()) / n_customers
p_no = int((movie_df['Loves SW'] == 1).sum()) / n_customers

print(f"Probability they like Star Wars: {p_likes:0.2f}")
print(f"Probability they don´t like Star Wars: {p_no:0.2f}")

Probability they like Star Wars: 0.70
Probability they don´t like Star Wars: 0.30


In [None]:
# Density of the new customer's consumption, using only the customers who like Star Wars (label 0).
# For every product: mean, deviations from the mean, sample variance (1/(N-1) * dev^T dev),
# and the normal density evaluated at the amount consumed by the new customer.
like_sw = len(movie_df['Loves SW'][movie_df['Loves SW'] == 0]) #How many like Star Wars

mu_pop_like = movie_df['Popcorn (g)'][movie_df['Loves SW'] == 0].mean(axis=0) #Mean of popcorn eaten
pop_like_x_mu = movie_df['Popcorn (g)'][movie_df['Loves SW'] == 0] - mu_pop_like #Popcorn eaten minus the mean
var_pop_like = (1 / (like_sw - 1)) * np.matmul(pop_like_x_mu.T, pop_like_x_mu)  #Sample variance of popcorn eaten
p_new_pop_like = mvn.pdf(20, mu_pop_like, var_pop_like) #Density of eating 20g of popcorn given the person likes Star Wars

mu_soda_like = movie_df['Soda (ml)'][movie_df['Loves SW'] == 0].mean(axis=0) #Mean of soda drunk
soda_like_x_mu = movie_df['Soda (ml)'][movie_df['Loves SW'] == 0] - mu_soda_like #Soda drunk minus the mean
var_soda_like = (1 / (like_sw - 1)) * np.matmul(soda_like_x_mu.T, soda_like_x_mu)  #Sample variance of soda drunk
p_new_soda_like = mvn.pdf(248, mu_soda_like, var_soda_like) #Density of drinking 248ml of soda given the person likes Star Wars

mu_candy_like = movie_df['Candy (g)'][movie_df['Loves SW'] == 0].mean(axis=0) #Mean of candy eaten
candy_like_x_mu = movie_df['Candy (g)'][movie_df['Loves SW'] == 0] - mu_candy_like #Candy eaten minus the mean
var_candy_like = (1 / (like_sw - 1)) * np.matmul(candy_like_x_mu.T, candy_like_x_mu)  #Sample variance of candy eaten
p_new_candy_like = mvn.pdf(13, mu_candy_like, var_candy_like) #Density of eating 13g of candy given the person likes Star Wars

In [None]:
# Density of the new customer's consumption, using only the customers who do NOT like Star Wars.
# Every statistic below must be computed on label 1; selecting label 0 here would
# reuse the Star Wars fans' data and divide it by the wrong group size.
nolike_mask = movie_df['Loves SW'] == 1
nolike_sw = len(movie_df['Loves SW'][nolike_mask]) #How many don't like Star Wars

mu_pop_nolike = movie_df['Popcorn (g)'][nolike_mask].mean(axis=0) #Mean of popcorn eaten
pop_nolike_x_mu = movie_df['Popcorn (g)'][nolike_mask] - mu_pop_nolike #Popcorn eaten minus the mean
var_pop_nolike = (1 / (nolike_sw - 1)) * np.matmul(pop_nolike_x_mu.T, pop_nolike_x_mu)  #Sample variance of popcorn eaten
p_new_pop_nolike = mvn.pdf(20, mu_pop_nolike, var_pop_nolike) #Density of eating 20g of popcorn given the person doesn't like Star Wars

mu_soda_nolike = movie_df['Soda (ml)'][nolike_mask].mean(axis=0) #Mean of soda drunk
soda_nolike_x_mu = movie_df['Soda (ml)'][nolike_mask] - mu_soda_nolike #Soda drunk minus the mean
var_soda_nolike = (1 / (nolike_sw - 1)) * np.matmul(soda_nolike_x_mu.T, soda_nolike_x_mu)  #Sample variance of soda drunk
p_new_soda_nolike = mvn.pdf(248, mu_soda_nolike, var_soda_nolike) #Density of drinking 248ml of soda given the person doesn't like Star Wars

mu_candy_nolike = movie_df['Candy (g)'][nolike_mask].mean(axis=0) #Mean of candy eaten
candy_nolike_x_mu = movie_df['Candy (g)'][nolike_mask] - mu_candy_nolike #Candy eaten minus the mean
var_candy_nolike = (1 / (nolike_sw - 1)) * np.matmul(candy_nolike_x_mu.T, candy_nolike_x_mu)  #Sample variance of candy eaten
p_new_candy_nolike = mvn.pdf(13, mu_candy_nolike, var_candy_nolike) #Density of eating 13g of candy given the person doesn't like Star Wars

### Step 4
Once you have the probabilities, you just need to get the total probability for each class, and assign the one with the biggest likelihood.

In [None]:
# Score of each class for the new customer: class prior times the density of every product consumed
prob_new_like = p_likes * p_new_pop_like * p_new_soda_like * p_new_candy_like
prob_new_nolike = p_no * p_new_pop_nolike * p_new_soda_nolike * p_new_candy_nolike

print(f"Probability new customer likes Star Wars: {prob_new_like}")
print(f"Probability new customer doesn´t like Star Wars: {prob_new_nolike}")

Probability new customer likes Star Wars: 2.069931892514577e-17
Probability new customer doesn´t like Star Wars: 2.16913830111256e-09


## Continuous Data Implementation
From the previous example, we can observe that the probabilities are often very small (close to zero). To avoid this and get a better numerical representation, it is better to use the logarithmic PDF of the distribution. Either way, both implementations should show that, in this case, the new customer most likely doesn't like Star Wars ☹.

Anyways, once we understood how the method works, we just need to create a function that generalizes everything. Basically, it´ll be the same procedure of the Naive classifier, but changing the variance for the Gaussian representation.

In [None]:
class GaussBayes():
  """Gaussian Bayes classifier with a full covariance matrix per class.

  Unlike the naive version, the features of a class are modeled jointly by a
  multivariate normal distribution, so correlations between features are kept.
  """

  def fit(self, x, y, epsilon=1e-3):
    """Estimate the per-class mean, covariance matrix and prior.

    Args:
      x: observations, one row per observation and one column per feature.
      y: class label of every row; the labels are cast to int and may be any
        integers (they do not need to be 0..K-1).
      epsilon: value added to the diagonal of every covariance matrix.
    """
    x = np.asarray(x)
    y = np.asarray(y).astype(int)

    self.likelihoods = dict()
    self.priors = dict()
    self.classes = np.unique(y)  # Sorted labels; column j of the score matrix belongs to classes[j]
    self.K = set(self.classes)

    #Set the covariance matrix
    for k in self.classes:
      x_k = x[y==k,:]
      N_k, D = x_k.shape
      mu_k = x_k.mean(axis=0)
      # Sample covariance divides by N_k - 1; a class with a single observation
      # would divide by zero, so the divisor is never allowed below 1
      dof = max(N_k - 1, 1)

      self.likelihoods[k] = {'Mean' : mu_k,
                             'Covariance' : (1/dof) * np.matmul((x_k - mu_k).T, x_k-mu_k) + epsilon*np.identity(D)}
      self.priors[k] = len(x_k) / len(x)

  def predict(self, x):
    """Return the most probable class label for every row of ``x``.

    The score of a class is the log-density of the observation under that
    class plus the log of the class prior.
    """
    x = np.asarray(x)
    N = x.shape[0]  #Number of observations

    #Score of every observation (rows) for every class (columns)
    p_hat = np.zeros((N,len(self.classes)))

    for col, k in enumerate(self.classes):
      l = self.likelihoods[k]
      p_hat[:,col] = mvn.logpdf(x, l['Mean'], l['Covariance']) + np.log(self.priors[k])

    # Map the winning column back to its class label
    return self.classes[p_hat.argmax(axis=1)]

Now we can compare this Gaussian classifier with our first dataset.

In [None]:
# Fit the Gaussian Bayes classifier on the first dataset and classify those same points
gbayes = GaussBayes()
gbayes.fit(x,y)

y_hat = gbayes.predict(x)

In [None]:
# True labels on the first row, Gaussian Bayes predictions on the second.
# The second title names the model that produced y_hat in the previous cell.
fig = make_subplots(rows=2, cols=1, subplot_titles=("Original", "Gaussian Bayes"))

fig.add_trace(go.Scatter(x = x[:,0],
                         y = x[:,1],
                         mode = 'markers',
                         marker = dict(color = y),
                         showlegend = False),
              row = 1,
              col = 1)

fig.add_trace(go.Scatter(x = x[:,0],
                         y = x[:,1],
                         mode = 'markers',
                         marker = dict(color = y_hat),
                         showlegend = False),
              row = 2,
              col = 1)

fig.update_layout(height=500, width=800)
fig.show()

In [None]:
# Evaluate the Gaussian Bayes predictions; metrics() treats label 0 as negative and every other label as positive
metrics(y_hat, y)

Accuracy: 0.979
Precision: 0.9825610363726955
Recall: 0.986
F1-score: 0.9842775143498876


## Conclusion
It might seem that the accuracy in this case was the same as with the Naive approach. However, it is also important to consider the distribution and behavior of data since it might not always follow a normal distribution.

# **What's next?**
Now that we know three different supervised methods, we can begin to compare how they behave on more realistic data rather than on small, almost random datapoints.

The next step will be then, to specify the rules for training and testing any model.

# **Assignment**
Run the Classification of the Donut and XOR datasets with the three algorithms (KNN, Naive Bayes, and Gaussian Bayes)