### Objective
* Recap of Naive Bayes algorithm
* Implement Naive Bayes using Python/pandas

<hr>

### Recap of Naive Bayes Algorithm
* Assumption: the data is categorical in nature
* For numerical data, we have to do binning first
* If the data is categorical, we have to calculate a frequency table as well as a likelihood table for each feature
* For test data, we can calculate the probability of each class using the likelihood tables

### Loading Tennis Data

In [1]:
import pandas as pd

In [2]:
# Download the tennis dataset (a CSV file hosted on GitHub) into a DataFrame.
tennis_data = pd.read_csv(
    'https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/tennis.csv.txt'
)

In [3]:
tennis_data

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


### Calculate Frequency Table

In [5]:
# Count how often each outlook category occurs together with each play outcome.
outlook_frequency_tab = pd.crosstab(index=tennis_data['outlook'], columns=tennis_data['play'])

In [6]:
# Count how often each temp category occurs together with each play outcome.
temp_frequency_tab = pd.crosstab(index=tennis_data['temp'], columns=tennis_data['play'])

In [7]:
# Count how often each humidity category occurs together with each play outcome.
humidity_frequency_tab = pd.crosstab(index=tennis_data['humidity'], columns=tennis_data['play'])

In [8]:
# Count how often each windy value occurs together with each play outcome.
windy_frequency_tab = pd.crosstab(index=tennis_data['windy'], columns=tennis_data['play'])

In [88]:
windy_frequency_tab

play,no,yes
windy,Unnamed: 1_level_1,Unnamed: 2_level_1
False,2,6
True,3,3


### Generate Likelihood Table

In [11]:
# Total number of 'no' rows: the denominator of the 'no' likelihood column.
outlook_frequency_tab['no'].sum()

5

In [12]:
# Total number of 'yes' rows: the denominator of the 'yes' likelihood column.
outlook_frequency_tab['yes'].sum()

9

In [17]:
# Work on a copy: plain assignment would only alias the frequency table, so
# writing likelihoods would overwrite the counts in outlook_frequency_tab.
outlook_likelihood_tab = outlook_frequency_tab.copy()
# P(outlook category | play == 'no') = count in category / total 'no' count.
outlook_likelihood_tab['no'] = outlook_frequency_tab.no/outlook_frequency_tab.no.sum()

In [18]:
# P(outlook category | play == 'yes') = count in category / total 'yes' count.
outlook_likelihood_tab['yes'] = outlook_frequency_tab['yes'].div(outlook_frequency_tab['yes'].sum())

In [19]:
outlook_likelihood_tab

play,no,yes
outlook,Unnamed: 1_level_1,Unnamed: 2_level_1
overcast,0.0,0.444444
rainy,0.4,0.333333
sunny,0.6,0.222222


In [133]:
class MyNaiveBayes:
    """Naive Bayes classifier for categorical features and a 'yes'/'no' target.

    Training builds one likelihood table per feature column (rows are the
    feature's categories, columns are the target classes) plus the prior
    probability of each class. Prediction multiplies the matching likelihoods
    with the prior for each class and prints the more probable class.

    No smoothing is applied: a category that never occurs together with a
    class gets likelihood 0 for that class, which zeroes the whole product.
    """

    def __init__(self):
        # Maps feature column name -> likelihood table for that feature.
        self.likelihood_tables = {}
        # Series of class priors indexed by class label; None until myfit() runs.
        self.class_prior_probability = None

    def get_frequency_tables(self, feature_data, target_data):
        """Cross-tabulate every feature column against the target.

        Parameters
        ----------
        feature_data : pandas.DataFrame
            One column per feature.
        target_data : pandas.Series
            Class label of each row of ``feature_data``.

        Returns
        -------
        dict
            Feature column name -> ``pd.crosstab`` count table.
        """
        return {
            col: pd.crosstab(feature_data[col], target_data)
            for col in feature_data.columns
        }

    def get_likelihood_tables(self, frequency_tables):
        """Turn count tables into per-class likelihood tables.

        Each class column is divided by its own total, so every column of a
        returned table sums to 1.

        Parameters
        ----------
        frequency_tables : dict
            Feature column name -> count table, as produced by
            ``get_frequency_tables``. The tables are not modified.

        Returns
        -------
        dict
            Feature column name -> likelihood table (a new DataFrame).
        """
        # The division builds a new DataFrame, so the caller's count tables
        # stay intact instead of being overwritten in place.
        return {
            col: freq_table / freq_table.sum(axis=0)
            for col, freq_table in frequency_tables.items()
        }

    def myfit(self, feature_data, target_data):
        """Learn the likelihood tables and class priors from training data.

        Stores the results in ``self.likelihood_tables`` and
        ``self.class_prior_probability``; returns nothing.
        """
        frequency_tables = self.get_frequency_tables(feature_data, target_data)
        self.likelihood_tables = self.get_likelihood_tables(frequency_tables)

        class_counts = target_data.value_counts()
        self.class_prior_probability = class_counts / class_counts.sum()

    def _joint_probability(self, record, label):
        """Return prior(label) times the product of the record's likelihoods."""
        joint = 1
        for col, val in record.items():
            joint *= self.likelihood_tables[col].loc[val][label]
        return joint * self.class_prior_probability[label]

    def mypredict(self, feature_data):
        """Print ``'Yes'`` or ``'No'`` for every row of ``feature_data``.

        Parameters
        ----------
        feature_data : pandas.DataFrame
            Rows to classify; its columns must be features seen by ``myfit``.

        Raises
        ------
        KeyError
            If a column or a category value has no entry in the learned
            likelihood tables.
        """
        for record in feature_data.to_dict(orient='records'):
            p_yes = self._joint_probability(record, 'yes')
            p_no = self._joint_probability(record, 'no')

            # Dividing both by (p_yes + p_no) would not change which one is
            # larger, and would yield NaN when both are 0, so compare the
            # unnormalised values directly. Ties go to 'No'.
            if p_yes > p_no:
                print('Yes')
            else:
                print('No')
    

In [134]:
# Every column except the target 'play' is used as a feature.
feature_data = tennis_data.drop(columns=['play'])

In [135]:
# The 'play' column is the class label to predict.
target_data = tennis_data['play']

In [136]:
# Create the classifier and fit it on the feature columns and the target column.
mynb = MyNaiveBayes()
mynb.myfit(feature_data, target_data)

In [137]:
# Spot check one learned likelihood: P(outlook == 'sunny' | play == 'yes').
mynb.likelihood_tables['outlook'].loc['sunny', 'yes']

0.2222222222222222

In [138]:
mynb.likelihood_tables['temp']

play,no,yes
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
cool,0.2,0.333333
hot,0.4,0.222222
mild,0.4,0.444444


In [139]:
# Take the first five feature rows as a small prediction sample.
test = feature_data.iloc[:5]

In [140]:
mynb.mypredict(test)

No
No
Yes
Yes
Yes


In [93]:
# Number of rows per class label in the target column.
s = target_data.value_counts()

In [95]:
# Class priors: divide by the actual number of rows rather than a hard-coded 14,
# so the result stays correct if the dataset changes.
s / s.sum()

yes    0.642857
no     0.357143
Name: play, dtype: float64

In [121]:
mynb.class_prior_probability

yes    0.642857
no     0.357143
Name: play, dtype: float64

In [132]:
# True labels of the first five rows, for comparison with the predictions.
target_data.head(5)

0     no
1     no
2    yes
3    yes
4    yes
Name: play, dtype: object

In [141]:
mynb.mypredict(feature_data)

No
No
Yes
Yes
Yes
Yes
Yes
No
Yes
Yes
Yes
Yes
Yes
No
