#### Importamos librerías

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import *
from sklearn.metrics import pairwise_distances_argmin

#### Leemos datos para entrenar el modelo

In [2]:
# Load the training samples. The file has no header row, so pandas labels
# the columns with integers.
iris_data = pd.read_csv(
    'Iris_data.txt',
    header=None,
    sep=',',
)
iris_data

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
95,5.7,3.0,4.2,1.2,Iris-versicolor
96,5.7,2.9,4.2,1.3,Iris-versicolor
97,6.2,2.9,4.3,1.3,Iris-versicolor
98,5.1,2.5,3.0,1.1,Iris-versicolor


#### Discretizamos la clase: si es Iris Setosa la etiqueta será 1; si es Versicolor, 0

In [3]:
# Encode the class column (column 4) as 1 for 'Iris-setosa' and 0 otherwise,
# then convert the whole frame to a NumPy array for the functions below.
iris_data[4] = (iris_data[4] == 'Iris-setosa').astype(int)
iris_data = iris_data.to_numpy()
# Preview the first ten encoded rows.
iris_data[:10]

array([[5.1, 3.5, 1.4, 0.2, 1. ],
       [4.9, 3. , 1.4, 0.2, 1. ],
       [4.7, 3.2, 1.3, 0.2, 1. ],
       [4.6, 3.1, 1.5, 0.2, 1. ],
       [5. , 3.6, 1.4, 0.2, 1. ],
       [5.4, 3.9, 1.7, 0.4, 1. ],
       [4.6, 3.4, 1.4, 0.3, 1. ],
       [5. , 3.4, 1.5, 0.2, 1. ],
       [4.4, 2.9, 1.4, 0.2, 1. ],
       [4.9, 3.1, 1.5, 0.1, 1. ]])

#### Separamos los datos por clase: clase 1 (Iris Setosa) y clase 0 (Iris Versicolor)

In [4]:
def separate_by_class(data):
    """Group the rows of ``data`` by their class label.

    The label of a row is its last element (``row[-1]``).

    Args:
        data: Sequence of rows (for example a 2-D NumPy array or a list of
            lists), each with the class label in the last position.

    Returns:
        A dict mapping each label to the list of rows that carry it. Rows
        keep their original relative order, and labels appear in order of
        first occurrence.
    """
    # Named ``separated`` rather than ``dict`` so the builtin is not shadowed.
    separated = {}
    for row in data:
        # setdefault creates the per-class list the first time a label is seen.
        separated.setdefault(row[-1], []).append(row)
    return separated

#### Funciones probabilísticas

In [5]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers``, as computed by ``np.mean``."""
    average = np.mean(numbers)
    return average

In [6]:
def stdev(numbers):
    """Return the standard deviation of ``numbers``, as computed by ``np.std``.

    ``np.std`` is called with its default arguments, so this is the
    population form (``ddof=0``), not the sample form.
    """
    spread = np.std(numbers)
    return spread

In [7]:
def summarize_dataset(dataset):
    """Compute per-column statistics for the feature columns of ``dataset``.

    Args:
        dataset: Sequence of rows; the last element of each row is the class
            label and is excluded from the result.

    Returns:
        A list with one ``(mean, stdev, count)`` tuple per feature column,
        in column order.
    """
    summaries = []
    # zip(*dataset) transposes the rows so each iteration sees one column.
    for column in zip(*dataset):
        summaries.append((mean(column), stdev(column), len(column)))
    # The final column holds the class label, not a feature, so drop its stats.
    del summaries[-1]
    return summaries

In [8]:
def summarize_by_class(dataset):
    """Build the per-class feature statistics for ``dataset``.

    Args:
        dataset: Sequence of rows with the class label as the last element.

    Returns:
        A dict mapping each class label to the list returned by
        ``summarize_dataset`` for the rows of that class.
    """
    return {
        label: summarize_dataset(class_rows)
        for label, class_rows in separate_by_class(dataset).items()
    }

In [9]:
def calculate_probability(x, mean, stdev, min_stdev=1e-9):
    """Evaluate the Gaussian (normal) probability density at ``x``.

    Args:
        x: Value at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.
        min_stdev: Lower bound applied to ``stdev`` before it is used. It
            keeps the density finite when a feature is constant within a
            class (``stdev == 0``). Defaults to ``1e-9``.

    Returns:
        The density
        ``exp(-(x - mean)**2 / (2 * stdev**2)) / (sqrt(2 * pi) * stdev)``,
        computed with ``stdev`` clamped to at least ``min_stdev``.
    """
    # A zero spread would divide by zero: ZeroDivisionError with Python
    # floats, NaN for NumPy floats. A NaN score would then never compare as
    # greater than another score, so clamp the spread instead.
    stdev = max(stdev, min_stdev)
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

In [10]:
def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class described by ``summaries``.

    Each score is the class prior (its share of the training rows) multiplied
    by the value of ``calculate_probability`` for every feature. The scores
    are not normalised to sum to one.

    Args:
        summaries: Dict mapping each class label to a list of
            ``(mean, stdev, count)`` tuples, one per feature.
        row: Indexable sample; ``row[i]`` is matched with the i-th feature
            summary.

    Returns:
        A dict mapping each class label to its score.
    """
    # The count stored with a class's first feature is used as that class's
    # number of training rows.
    class_counts = {label: stats[0][2] for label, stats in summaries.items()}
    total_rows = sum(class_counts.values())

    probabilities = {}
    for label, stats in summaries.items():
        score = class_counts[label] / float(total_rows)
        for idx, (mu, sigma, _) in enumerate(stats):
            score *= calculate_probability(row[idx], mu, sigma)
        probabilities[label] = score
    return probabilities

#### Función que predice a qué clase pertenece una muestra

In [11]:
def predict(summaries, row):
    """Return the class label with the highest score for ``row``.

    Args:
        summaries: Per-class feature statistics, as consumed by
            ``calculate_class_probabilities``.
        row: Sample to classify.

    Returns:
        The label whose score is largest; on ties the label seen first wins.
        ``None`` when there are no classes to score.
    """
    winner, winner_score = None, -1
    scores = calculate_class_probabilities(summaries, row)
    for label in scores:
        score = scores[label]
        # Keep the current winner unless none is set yet or this score is
        # strictly greater.
        if winner is not None and not score > winner_score:
            continue
        winner, winner_score = label, score
    return winner

#### Función que entrena el algoritmo y devuelve las predicciones para los datos de la variable test

In [12]:
def naive_bayes(train, test):
    """Fit the per-class statistics on ``train`` and classify every row of ``test``.

    Args:
        train: Training rows with the class label as the last element.
        test: Iterable of samples to classify.

    Returns:
        A list with one predicted class label per row of ``test``, in order.
    """
    model = summarize_by_class(train)
    return [predict(model, sample) for sample in test]

#### Mostramos los datos de test

In [13]:
# Read the test samples; the file has no header row.
df = pd.read_csv(
    'Iris_test.txt',
    header=None,
    sep=',',
)
# Keep every column except the last one as the array to classify.
iris_test = df.iloc[:, :-1].to_numpy()
df

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,6.9,3.1,4.9,1.5,Iris-versicolor
2,5.0,3.4,1.5,0.2,Iris-setosa


#### Aplicamos el algoritmo de Bayes

In [14]:
# Fit on the training array and classify each test row.
naive_bayes(train=iris_data, test=iris_test)

[1.0, 0.0, 1.0]

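#### Si es Iris Setosa la etiqueta será 1; si es Versicolor, 0. Las predicciones [1, 0, 1] coinciden con las clases reales de los datos de test (setosa, versicolor, setosa), por lo que son válidas.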