# Ejercicio 1

## Leer datos

In [69]:
import pandas as pd

# Load the labelled training examples from the Excel workbook.
training_set = pd.read_excel(io='datasets/PreferenciasBritanicos.xlsx')
# Keep the frame as the cell's last expression so the notebook renders it.
training_set

Unnamed: 0,scones,cerveza,wiskey,avena,futbol,Nacionalidad
0,0,0,1,1,1,I
1,1,0,1,1,0,I
2,1,1,0,0,1,I
3,1,1,0,0,0,I
4,0,1,0,0,1,I
5,0,0,0,1,0,I
6,1,0,0,1,1,E
7,1,1,0,0,1,E
8,1,1,1,1,0,E
9,1,1,0,1,0,E


## Implementación de Naive Bayes

Se instancia la clase con un set de entrenamiento y el nombre de la columna donde están las clases. Opcionalmente se puede deshabilitar la corrección de Laplace.

Luego se utiliza el método `classify`, que recibe un diccionario (atributo → valor) con los atributos del sujeto que se quiere clasificar. El método retorna un diccionario con la probabilidad condicional (a posteriori, normalizada) de cada clase dado ese sujeto.

In [70]:
class NaiveBayes:
    """Naive Bayes classifier for categorical attributes held in a pandas DataFrame.

    Class priors are computed once at construction time; the conditional
    probability of each attribute value is counted from the training set
    every time it is requested.
    """

    def __init__(self, training_set, class_key, laplace = True):
        """Store the training data and precompute the class priors.

        Args:
            training_set: DataFrame with one row per training example.
            class_key: name of the column that holds the class label.
            laplace: when True (default), apply add-one (Laplace) smoothing
                to both the priors and the conditional probabilities.
        """
        self.ts = training_set
        self.class_key = class_key
        self.classes = training_set[class_key].unique()
        self.laplace = laplace
        self.class_probabilities = self.__class_probabilities()

    def __class_probabilities(self):
        """Return a dict mapping each class label to its prior probability."""
        total_rows = len(self.ts)
        class_count = len(self.classes)
        priors = {}
        for value in self.classes:
            rows_in_class = int((self.ts[self.class_key] == value).sum())
            if self.laplace:
                # Add-one smoothing over the set of classes.
                priors[value] = (rows_in_class + 1) / (total_rows + class_count)
            else:
                priors[value] = rows_in_class / total_rows
        return priors

    def p_attr_given_class(self, attr, value, class_name):
        """Estimate P(attr == value | class == class_name) from the training set.

        Args:
            attr: name of the attribute column.
            value: attribute value whose probability is requested.
            class_name: class label to condition on.

        Returns:
            The relative frequency of ``value`` among the rows of
            ``class_name``, add-one smoothed when ``self.laplace`` is set.

        Raises:
            ZeroDivisionError: without smoothing, when no training row
                belongs to ``class_name``.
        """
        in_class = self.ts[self.class_key] == class_name
        class_size = int(in_class.sum())
        matches = int(((self.ts[attr] == value) & in_class).sum())

        if self.laplace:
            # The smoothing term must be the number of distinct values the
            # attribute takes (not the number of classes); otherwise the
            # smoothed probabilities of an attribute's values do not sum to 1
            # unless both counts happen to coincide.
            distinct_values = int(self.ts[attr].nunique())
            return (matches + 1) / (class_size + distinct_values)
        return matches / class_size

    def classify(self, subject, decimals=None):
        """Compute the normalised posterior probability of every class.

        Args:
            subject: dict mapping attribute name to the observed value.
            decimals: number of decimals to round each probability to;
                ``None`` (default) returns the unrounded values.

        Returns:
            Dict mapping each class label to its posterior probability.

        Raises:
            ZeroDivisionError: when every class ends up with probability
                zero, so the posteriors cannot be normalised.
        """
        unnormalised = {}
        for klass in self.classes:
            p = 1
            for key, val in subject.items():
                p *= self.p_attr_given_class(key, val, klass)
            p *= self.class_probabilities[klass]
            unnormalised[klass] = p

        total_probability = sum(unnormalised.values())
        if total_probability == 0:
            raise ZeroDivisionError(
                "every class has zero probability for this subject; "
                "enable the Laplace correction to avoid zero counts"
            )

        # Compare against None explicitly so that decimals=0 still rounds.
        if decimals is None:
            return {k: v / total_probability for k, v in unnormalised.items()}
        return {k: round(v / total_probability, decimals)
                for k, v in unnormalised.items()}


## Set de prueba

In [71]:
# Two unlabelled subjects, one per row, described by the five attribute columns.
testing_set = pd.DataFrame(
    [[1, 0, 1, 1, 0], [0, 1, 1, 0, 1]],
    columns=["scones", "cerveza", "wiskey", "avena", "futbol"],
)

# The Laplace correction is left at its default (enabled).
naive_bayes = NaiveBayes(training_set=training_set, class_key="Nacionalidad")

# Classify every row and print its class probabilities rounded to two decimals.
for k, subject in enumerate(testing_set.to_dict('records')):
    print(f'subject {k}: can be classified as {naive_bayes.classify(subject, 2)}')

subject 0: can be classified as {'I': 0.24, 'E': 0.76}
subject 1: can be classified as {'I': 0.83, 'E': 0.17}
