In [110]:
import pandas as pd

class Classifier():
    data = None
    class_attr = None
    prior = {}
    cp = {}
    hypothesis = None
    sizes = {}
    k = 0

    def __init__(self,filename = None, class_attr = None, k = 1.0):
        self.data = pd.read_csv(filename, sep=',')
        self.data = self.data.drop(columns=['animal_name'])
        self.class_attr = class_attr
        self.k = float(k)


    def calculate_prior(self):          # calculate probabilities for the class_attribute i.e. 'Play'
        class_values = list(set(self.data[self.class_attr]))
        class_data =  list(self.data[self.class_attr])
        for i in class_values:
            self.prior[i]  = class_data.count(i)/float(len(class_data))
        print (self.prior)

    def set_sizes(self):
        names = list(self.data)
        self.sizes = {}
        for i in names:
            self.sizes.update({i : len(set(self.data[i]))})
        print(self.sizes)

    def get_cp(self, attr, attr_type, class_value):
        data_attr = list(self.data[attr])
        class_data = list(self.data[self.class_attr])
        total = int(self.k)
        for i in range(0, len(data_attr)):
            if (class_data[i] == class_value and data_attr[i] == attr_type):
                total+=1
        calculation = total/float(class_data.count(class_value) + self.k*self.sizes[attr])

        print("P (", attr, "=", attr_type, "| Play = ", class_value, ") = (", str(total), "+", self.k, ") / (" , self.sizes[attr], "+",
            str(self.k), "*", str(float(class_data.count(class_value))),") = ",(calculation))
        return calculation

    '''
        Here we calculate Likelihood of Evidence and multiple all individual probabilities with prior
        (Outcome|Multiple Evidence) = P(Evidence1|Outcome) x P(Evidence2|outcome) x ... x P(EvidenceN|outcome) x P(Outcome)
        scaled by P(Multiple Evidence)
    '''
    def calculate_conditional_probabilities(self, hypothesis):
        for i in self.prior:
            self.cp[i] = {}
            for j in hypothesis:
                self.cp[i].update({ hypothesis[j]: self.get_cp(j, hypothesis[j], i)})
 

    def classify(self):
        print ("\nResult with Laplacian smoothing:")
        h = []
        for i in self.cp:                   
            agg = 1
            f = self.cp[i].values()        # can also be reduced by a simple lambda reduce(lambda x, y: x*y, self.cp[i].values())*self.prior[i]
            for j in f:
                agg = agg*j                     
            g = self.prior[i]
            h.append(g*agg)
            print (i, " --> ", g*agg)
            

# if __name__ == "__main__":
#     print("\n\tWelcome to the naive bayes classifying program for tennis.csv\n")
#     k_lap = input("Please enter the value of 'k' for Laplacian smoothing: ")
#     c = Classifier(filename = "tennis.csv", class_attr = "Play", k = k_lap)
#     print("The prior probabilities for 'yes' and 'no' in the 'Play' column are:")
#     c.calculate_prior()
#     print("\nThe sizes of the rest of the attribute columns are: ")
#     c.set_sizes()

#     outlook = input("\nProvide conditions for Outlook (sunny/overcast/rainy): ")
#     temp = input("Provide conditions for temperature (hot/mild/cool): ")
#     humidity = input("Provide conditions for humidity (high/normal): ")
#     windy = input("Is it windy? (true/false): ")
#     c.hypothesis = {"Outlook":outlook, "Temp.":temp, "Humidity":humidity , "Windy":windy}

#     print("\nThe conditional probabilities are as follows:")
#     c.calculate_conditional_probabilities(c.hypothesis)
#     c.classify()

In [131]:
c = Classifier(filename = "train.data", class_attr = "classtype", k = 1)
c.calculate_prior()
c.set_sizes()

{1: 0.3, 2: 0.1, 3: 0.1, 4: 0.1, 5: 0.039603960396039604, 6: 0.2, 7: 0.2}
{'hair': 2, 'feathers': 2, 'eggs': 2, 'milk': 2, 'airborne': 2, 'aquatic': 2, 'predator': 2, 'toothed': 2, 'backbone': 2, 'breathes': 2, 'venomous': 1, 'fins': 2, 'legs': 4, 'tail': 2, 'domestic': 1, 'catsize': 2, 'classtype': 6}


In [132]:
test = pd.read_csv('test.data')

In [133]:
test = test.drop(columns=['animal_name', 'classtype'])

In [134]:
test = list(test.T.to_dict().values())[0]

In [135]:
c.calculate_conditional_probabilities(test)

P ( hair = 0 | Play =  1 ) = ( 1 + 1.0 ) / ( 2 + 1.0 * 3.0 ) =  0.2
P ( feathers = 0 | Play =  1 ) = ( 4 + 1.0 ) / ( 2 + 1.0 * 3.0 ) =  0.8
P ( eggs = 1 | Play =  1 ) = ( 1 + 1.0 ) / ( 2 + 1.0 * 3.0 ) =  0.2
P ( milk = 0 | Play =  1 ) = ( 1 + 1.0 ) / ( 2 + 1.0 * 3.0 ) =  0.2
P ( airborne = 0 | Play =  1 ) = ( 3 + 1.0 ) / ( 2 + 1.0 * 3.0 ) =  0.6
P ( aquatic = 1 | Play =  1 ) = ( 1 + 1.0 ) / ( 2 + 1.0 * 3.0 ) =  0.2
P ( predator = 1 | Play =  1 ) = ( 2 + 1.0 ) / ( 2 + 1.0 * 3.0 ) =  0.4
P ( toothed = 1 | Play =  1 ) = ( 4 + 1.0 ) / ( 2 + 1.0 * 3.0 ) =  0.8
P ( backbone = 1 | Play =  1 ) = ( 4 + 1.0 ) / ( 2 + 1.0 * 3.0 ) =  0.8
P ( breathes = 0 | Play =  1 ) = ( 1 + 1.0 ) / ( 2 + 1.0 * 3.0 ) =  0.2
P ( venomous = 0 | Play =  1 ) = ( 4 + 1.0 ) / ( 1 + 1.0 * 3.0 ) =  1.0
P ( fins = 1 | Play =  1 ) = ( 1 + 1.0 ) / ( 2 + 1.0 * 3.0 ) =  0.2
P ( legs = 0 | Play =  1 ) = ( 1 + 1.0 ) / ( 4 + 1.0 * 3.0 ) =  0.14285714285714285
P ( tail = 1 | Play =  1 ) = ( 3 + 1.0 ) / ( 2 + 1.0 * 3.0 ) =  0.6
P 

In [136]:
c.classify()


Result with Laplacian smoothing:
1  -->  0.072
2  -->  0.044444444444444446
3  -->  0.044444444444444446
4  -->  0.044444444444444446
5  -->  0.009900990099009901
6  -->  0.037500000000000006
7  -->  0.037500000000000006
