# Ejercicio 1

## Leer datos

In [2]:
import pandas as pd

# Read the training examples from the Excel workbook (path is relative to the
# notebook's working directory).
training_set = pd.read_excel(
    io='datasets/PreferenciasBritanicos.xlsx',
)
# Bare last expression so the notebook renders the loaded frame.
training_set

Unnamed: 0,scones,cerveza,wiskey,avena,futbol,Nacionalidad
0,0,0,1,1,1,I
1,1,0,1,1,0,I
2,1,1,0,0,1,I
3,1,1,0,0,0,I
4,0,1,0,0,1,I
5,0,0,0,1,0,I
6,1,0,0,1,1,E
7,1,1,0,0,1,E
8,1,1,1,1,0,E
9,1,1,0,1,0,E


## Implementación de Naive Bayes

Se instancia la clase con un set de entrenamiento y el nombre de la columna donde están las clases. Opcionalmente se puede deshabilitar la corrección de Laplace.

Luego se utiliza el método `classify`, que recibe un diccionario (atributo → valor) con la hipótesis que se quiere clasificar y retorna un diccionario con la probabilidad a posteriori de cada clase, normalizada por la probabilidad total.

Clasificación:
$$
\begin{aligned}
V_{NB} &= \arg\max_{v_j \in \{I, E\}} P(v_j)\prod_{i=1}^{5} P(a_i \mid v_j)
\\
a_i &\in \{\text{scones}, \text{cerveza}, \text{wiskey}, \text{avena}, \text{futbol}\}
\end{aligned}
$$
Probabilidad Total:

$$
P(a_1, a_2, ..., a_5) = P(I)\prod_{i=1}^{5} P(a_i \mid I) + P(E)\prod_{i=1}^{5} P(a_i \mid E)
$$

In [3]:
class NaiveBayes:
    """Naive Bayes classifier for categorical attributes held in a DataFrame.

    All probabilities are estimated by counting rows of the training set.
    With ``laplace=True`` (the default) add-one smoothing is applied to both
    the class priors and the per-attribute conditional probabilities.
    """

    def __init__(self, training_set, class_key, laplace = True):
        """Store the training data and precompute the class priors.

        Args:
            training_set: DataFrame with one column per attribute plus the
                class column.
            class_key: Name of the column that holds the class labels.
            laplace: Whether to apply add-one (Laplace) smoothing.
        """
        self.ts = training_set
        self.class_key = class_key
        self.classes = training_set[class_key].unique()
        self.laplace = laplace
        self.class_probabilities = self.__class_probabilities()

    def __class_probabilities(self):
        """Return a dict mapping each class label to its prior P(class)."""
        n_rows = len(self.ts)
        n_classes = len(self.classes)
        to_return = {}
        for value in self.classes:
            n_in_class = len(self.ts[self.ts[self.class_key] == value])
            if self.laplace:
                # Add-one smoothing over the possible classes.
                to_return[value] = (n_in_class + 1) / (n_rows + n_classes)
            else:
                to_return[value] = n_in_class / n_rows
        return to_return

    def p_attr_given_class(self, attr, value, class_name):
        """Estimate P(attr == value | class == class_name) from the training set.

        Args:
            attr: Name of the attribute column.
            value: Attribute value whose conditional probability is wanted.
            class_name: Class label to condition on.

        Returns:
            The relative frequency of ``value`` among the rows of
            ``class_name``, add-one smoothed when ``self.laplace`` is true.
        """
        in_class = self.ts[self.class_key] == class_name
        n_in_class = len(self.ts[in_class])
        n_matching = len(self.ts[(self.ts[attr] == value) & in_class])

        if self.laplace:
            # The smoothing term in the denominator is the number of distinct
            # values the *attribute* takes in the training set (not the number
            # of classes); only then do the smoothed conditionals over those
            # values sum to 1.
            n_values = int(self.ts[attr].nunique())
            return (n_matching + 1) / (n_in_class + n_values)
        return n_matching / n_in_class

    def classify(self, subject, decimals=None):
        """Return the normalised posterior probability of every class.

        Args:
            subject: Dict mapping attribute name -> observed value.
            decimals: Number of decimals to round each probability to;
                ``None`` (default) disables rounding. ``0`` is honoured.

        Returns:
            Dict mapping each class label to P(class | subject).

        Raises:
            ZeroDivisionError: If every class has zero joint probability, so
                the posteriors cannot be normalised (only possible without
                Laplace smoothing).
        """
        probabilities = {}
        total_probability = 0
        for klass in self.classes:
            # Joint probability: product of the conditionals times the prior.
            p = 1
            for key, val in subject.items():
                p *= self.p_attr_given_class(key, val, klass)
            p *= self.class_probabilities[klass]
            total_probability += p
            probabilities[klass] = p

        if probabilities and total_probability == 0:
            raise ZeroDivisionError(
                "all classes have zero probability for this subject; "
                "enable Laplace smoothing to classify unseen combinations"
            )

        for k, v in probabilities.items():
            posterior = v / total_probability
            # Compare against None so that decimals=0 still rounds.
            probabilities[k] = posterior if decimals is None else round(posterior, decimals)
        return probabilities


## Set de prueba

In [10]:
# Two hand-written subjects to classify, using the same attribute columns as
# the training set.
feature_columns = ["scones", "cerveza", "wiskey", "avena", "futbol"]
testing_set = pd.DataFrame(data=[[1,0,1,1,0], [0,1,1,0,1]],
                columns=feature_columns)

naive_bayes = NaiveBayes(training_set, "Nacionalidad")

# Convert once; each record is an {attribute: value} dict as classify expects.
subjects = testing_set.to_dict(orient='records')

for subject in subjects:
    print(f'{tuple(subject.values())}: Tiene las siguientes probabilidades {naive_bayes.classify(subject, 2)}')

# One row per subject: attribute values followed by the posterior of each
# class, in the classifier's own class order.
result_set = []
for subject in subjects:
    posteriors = naive_bayes.classify(subject, 3)
    result_set.append(list(subject.values()) + [posteriors[klass] for klass in naive_bayes.classes])

# Derive the probability column labels from the classifier's classes so they
# cannot be mismatched with the values appended above.
pd.DataFrame(data=result_set,
                columns=feature_columns + [f"P({klass})" for klass in naive_bayes.classes])


(1, 0, 1, 1, 0): Tiene las siguientes probabilidades {'I': 0.24, 'E': 0.76}
(0, 1, 1, 0, 1): Tiene las siguientes probabilidades {'I': 0.83, 'E': 0.17}


Unnamed: 0,scones,cerveza,wiskey,avena,futbol,P(I),P(E)
0,1,0,1,1,0,0.24,0.76
1,0,1,1,0,1,0.835,0.165
