# Lab 09 (Module 10)
Hao-Tien Kuo
## Naïve Bayes with Categorical Data: Manual Calculation

In [6]:
# Suppose tomorrow will be mild, rainy, and windy, with high humidity. Should we play golf tomorrow? 

# P(golf | mild, rainy, windy, high) ∝ P(mild | golf) * P(rainy | golf) * P(windy | golf) * P(high | golf)

p_golf = 4/9 * 3/9 * 3/9 * 3/9
p_not_golf = 2/5 * 2/5 * 3/5 * 4/5

p_golf = p_golf / (p_golf + p_not_golf)
p_not_golf = 1 - p_golf

print(p_golf, p_not_golf)

# The probability of golf given tomorrow is less than the probability of not golf given tomorrow, so we shouldn't play golf tomorrow

0.1765038124823496 0.8234961875176504


In [8]:
# Is the assumption that outlook and humidity are independent a good assumption?

# Outlook and humidity could have some corelation, so the assumption might not hold
# E.g., Rainy days generally have higher humidity

## Naïve Bayes with Categorical Data: Sklearn

In [12]:
import numpy as np
import pandas as pd
import statistics
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('PlayGolf.csv')

In [13]:
df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,PlayGolf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes
5,Sunny,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Rainy,Mild,High,False,No
8,Rainy,Cool,Normal,False,Yes
9,Sunny,Mild,Normal,False,Yes


In [14]:
# Temperature and Outlook are ordinal variables. 
# Instead of converting them to dummy variables, they need to be recoded as ordinal variables. 

scale_temperature = {'Cool':1, 'Mild':2, 'Hot':3}
scale_outlook = {'Rainy':1, 'Overcast':2, 'Sunny':3}
df['Temperature_scale'] = df['Temperature'].replace(scale_temperature)
df['Outlook_scale'] = df['Outlook'].replace(scale_outlook)

df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,PlayGolf,Temperature_scale,Outlook_scale
0,Rainy,Hot,High,False,No,3,1
1,Rainy,Hot,High,True,No,3,1
2,Overcast,Hot,High,False,Yes,3,2
3,Sunny,Mild,High,False,Yes,2,3
4,Sunny,Cool,Normal,False,Yes,1,3
5,Sunny,Cool,Normal,True,No,1,3
6,Overcast,Cool,Normal,True,Yes,1,2
7,Rainy,Mild,High,False,No,2,1
8,Rainy,Cool,Normal,False,Yes,1,1
9,Sunny,Mild,Normal,False,Yes,2,3


In [32]:
# The other variables need to be recoded to binary variables. 
# Because Windy is Boolean, it does not need to be recoded.

dummies = pd.get_dummies(df[['Humidity', 'PlayGolf']])
df = pd.concat([df, dummies], axis=1)

df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,PlayGolf,Temperature_scale,Outlook_scale,Humidity_High,Humidity_Normal,PlayGolf_No,PlayGolf_Yes
0,Rainy,Hot,High,False,No,3,1,1,0,1,0
1,Rainy,Hot,High,True,No,3,1,1,0,1,0
2,Overcast,Hot,High,False,Yes,3,2,1,0,0,1
3,Sunny,Mild,High,False,Yes,2,3,1,0,0,1
4,Sunny,Cool,Normal,False,Yes,1,3,0,1,0,1
5,Sunny,Cool,Normal,True,No,1,3,0,1,1,0
6,Overcast,Cool,Normal,True,Yes,1,2,0,1,0,1
7,Rainy,Mild,High,False,No,2,1,1,0,1,0
8,Rainy,Cool,Normal,False,Yes,1,1,0,1,0,1
9,Sunny,Mild,Normal,False,Yes,2,3,0,1,0,1


In [38]:
# Fit the data using CategoricalNB.

from sklearn.naive_bayes import CategoricalNB

X = df[['Windy', 'Temperature_scale', 'Outlook_scale', 'Humidity_High']]
y = df['PlayGolf_Yes']

clf = CategoricalNB()
clf.fit(X, y)
clf.score(X, y)

0.9285714285714286

In [44]:
# Using the data set, PlayGolfNext.csv, use your Naïve Bayes model to predict the next few days. 

df_next = pd.read_csv('PlayGolfNext.csv')

df_next

Unnamed: 0,Day,Outlook,Temperature,Humidity,Windy
0,Day After Tomorrow,Overcast,Cool,High,False
1,Tomorrow,Rainy,Mild,High,True
2,Today,Sunny,Hot,Normal,False


In [45]:
df_next['Temperature_scale'] = df_next['Temperature'].replace(scale_temperature)
df_next['Outlook_scale'] = df_next['Outlook'].replace(scale_outlook)
dummies = pd.get_dummies(df_next[['Humidity']])
df_next = pd.concat([df_next, dummies], axis=1)

df_next

Unnamed: 0,Day,Outlook,Temperature,Humidity,Windy,Temperature_scale,Outlook_scale,Humidity_High,Humidity_Normal
0,Day After Tomorrow,Overcast,Cool,High,False,1,2,1,0
1,Tomorrow,Rainy,Mild,High,True,2,1,1,0
2,Today,Sunny,Hot,Normal,False,3,3,0,1


In [46]:
X_next = df_next[['Windy', 'Temperature_scale', 'Outlook_scale', 'Humidity_High']]
df_next['PlayGolf_Yes'] = clf.predict(X_next)

df_next

# Which days should you play golf?
# Today and the day after tomorrow

Unnamed: 0,Day,Outlook,Temperature,Humidity,Windy,Temperature_scale,Outlook_scale,Humidity_High,Humidity_Normal,PlayGolf_Yes
0,Day After Tomorrow,Overcast,Cool,High,False,1,2,1,0,1
1,Tomorrow,Rainy,Mild,High,True,2,1,1,0,0
2,Today,Sunny,Hot,Normal,False,3,3,0,1,1


In [None]:
# Does the recommendation (Yes or No to play golf) for today and tomorrow match the class example and your manual prediction above?
# Yes!