# Algoritmo Naive-Bayes

In [10]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

#data = pd.read_csv('data/vertebralcolumn-3C.csv')
#data = pd.read_csv('data/BreastCancer.csv')
data = pd.read_csv('data/Iris.csv')
#data = pd.read_csv('data/Vehicle.csv')

data = data.dropna(axis = 'rows') 
classes = np.array(pd.unique(data[data.columns[-1]]), dtype = str)  
nrow, ncol = data.shape
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [2]:
data = data.to_numpy()
y = data[:, -1]
X = data[:, 0:ncol-1]

scaler = MinMaxScaler().fit(X)
X = scaler.transform(X)

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size = 0.80, random_state = 5)

In [4]:
# Função para o cálculo da verossimilhança. 

def verossimilhanca(y, Z):
    def gaussiana(x, mu, sig):
        return np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))
    prob = 1
    for j in np.arange(0, Z.shape[1]):
        m = np.mean(Z[:,j])
        s = np.std(Z[:,j])      
        prob = prob*gaussiana(y[j], m, s)
    return prob

Inicialmente, definimos uma função para calcular a densidade de probabilidade conjunta: $$p(\vec{x}|C_i) = \prod_{j=1}^d p(x_j|C_i), \quad i=1,\ldots, k$$ 
onde $C_i$ são as classes. 

Se a distribuição for normal, temos que cada atributo $X_j$ tem a seguinte função densidade de probabilidade associada, para cada classe:
$$
p(x_j|C_i) = \frac{1}{\sqrt{2\pi\sigma_{C_i}}}\exp \left[ -\frac{1}{2}\left( \frac{x_j-\mu_{C_i}}{\sigma_{C_i}}\right)^2 \right], \quad i=1,2,\ldots, k.
$$

In [13]:
def NB_gaussiano(x_train, x_test, y_train, y_test):
    P = pd.DataFrame(data=np.zeros((x_test.shape[0], len(classes))), columns = classes) 
    for i in np.arange(0, len(classes)):
        elements = np.where(y_train == classes[i])
        Z = x_train[elements,:][0]
        for j in np.arange(0, x_test.shape[0]):
            x = x_test[j,:]
            pj = verossimilhanca(x,Z)
            P[classes[i]][j] = pj*len(elements)/x_train.shape[0]
            
    y_pred = []
    for i in np.arange(0, P.shape[0]):
        c = np.argmax(np.array(P.iloc[[i]]))
        y_pred.append(P.columns[c])
    y_pred = np.array(y_pred, dtype = str)
    return y_pred 

#### Experimental 

In [15]:
y_test_pred = NB_gaussiano(x_train, x_test, y_train, y_test)
y_test_prediction = np.asarray(y_test_pred)
y_test_prediction

array(['versicolor', 'virginica', 'virginica', 'setosa', 'virginica',
       'versicolor', 'setosa', 'virginica', 'setosa', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'virginica', 'setosa',
       'setosa', 'virginica', 'virginica', 'setosa', 'setosa',
       'versicolor', 'virginica', 'setosa', 'versicolor', 'versicolor',
       'virginica', 'versicolor', 'versicolor', 'versicolor', 'virginica'],
      dtype='<U10')

#### Framework 

In [12]:
model = GaussianNB()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
y_pred

array(['versicolor', 'versicolor', 'virginica', 'setosa', 'virginica',
       'versicolor', 'setosa', 'virginica', 'setosa', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'virginica', 'setosa',
       'setosa', 'virginica', 'virginica', 'setosa', 'setosa',
       'versicolor', 'virginica', 'setosa', 'versicolor', 'versicolor',
       'virginica', 'versicolor', 'versicolor', 'versicolor', 'virginica'],
      dtype='<U10')

In [16]:
# Comparando os resultados do modelo contruído vs o modelo do sklearn. 
y_test_prediction == y_pred

array([ True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

**Autor**: Alan Gomes 

**E-mail**: gomes-alan@hotmail.com 

#### Referências 

[1] Curso de ciência de dados do professor Dr. Francisco Rodrigues - USP
- https://www.youtube.com/watch?v=lm2IagDGDAU&list=PLSc7xcwCGNh1PJrPfLaH4MMjfDl48tmGM

[2] Curso Machine Learning - Data Science Academy 