In [None]:
## import Library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

## Load Dataset

In [None]:
# Read the tennis dataset from a CSV file located relative to the working directory.
data = pd.read_csv("tennis.csv")
# Preview the first rows to sanity-check the columns and values.
data.head()

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


## Data description

In [None]:
# Number of missing values in each column.
data.isna().sum()

outlook     0
temp        0
humidity    0
windy       0
play        0
dtype: int64

In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   outlook   14 non-null     object
 1   temp      14 non-null     object
 2   humidity  14 non-null     object
 3   windy     14 non-null     bool  
 4   play      14 non-null     object
dtypes: bool(1), object(4)
memory usage: 590.0+ bytes


In [None]:
# List the column labels of the dataset.
data.columns

Index(['outlook', 'temp', 'humidity', 'windy', 'play'], dtype='object')

In [None]:
# Frequency of each distinct value in the 'humidity' column.
data['humidity'].value_counts()

normal    7
high      7
Name: humidity, dtype: int64

In [None]:
# Frequency of each distinct value in the 'outlook' column.
data['outlook'].value_counts()

rainy       5
sunny       5
overcast    4
Name: outlook, dtype: int64

In [None]:
# Frequency of each distinct value in the 'temp' column.
data['temp'].value_counts()

mild    6
hot     4
cool    4
Name: temp, dtype: int64

In [None]:
# Frequency of each distinct value in the 'windy' column.
data['windy'].value_counts()

False    8
True     6
Name: windy, dtype: int64

## Convert to dummy variables

In [None]:
# One-hot encode every categorical column (the features and the 'play' target alike).
# The dtype is pinned because pandas >= 2.0 returns bool dummies by default,
# whereas this notebook displays 0/1 integer indicators.
data_copy = data.copy()
data_with_dummies = pd.get_dummies(
    data=data_copy,
    columns=['outlook', 'temp', 'humidity', 'windy', 'play'],
    prefix=['outlook', 'temp', 'humidity', 'windy', 'play'],
    dtype=np.uint8,
)
data_with_dummies.head()

Unnamed: 0,outlook_overcast,outlook_rainy,outlook_sunny,temp_cool,temp_hot,temp_mild,humidity_high,humidity_normal,windy_False,windy_True,play_no,play_yes
0,0,0,1,0,1,0,1,0,1,0,1,0
1,0,0,1,0,1,0,1,0,0,1,1,0
2,1,0,0,0,1,0,1,0,1,0,0,1
3,0,1,0,0,0,1,1,0,1,0,0,1
4,0,1,0,1,0,0,0,1,1,0,0,1


In [None]:
# 'play_yes' already encodes the label, so its complement 'play_no' is redundant.
# The keyword form is required: pandas 2.0 no longer accepts `axis` positionally.
data_with_dummies = data_with_dummies.drop(columns='play_no')

In [None]:
# Preview the encoded frame after removing 'play_no'.
data_with_dummies.head()

Unnamed: 0,outlook_overcast,outlook_rainy,outlook_sunny,temp_cool,temp_hot,temp_mild,humidity_high,humidity_normal,windy_False,windy_True,play_yes
0,0,0,1,0,1,0,1,0,1,0,0
1,0,0,1,0,1,0,1,0,0,1,0
2,1,0,0,0,1,0,1,0,1,0,1
3,0,1,0,0,0,1,1,0,1,0,1
4,0,1,0,1,0,0,0,1,1,0,1


In [None]:
# Take a real copy (not an alias) so this frame is independent of data_with_dummies.
data_with_dummies_copy = data_with_dummies.copy()
# Features: every dummy column except the target.
# `columns=` is used because pandas 2.0 no longer accepts `axis` positionally.
x = data_with_dummies_copy.drop(columns='play_yes')
# Target: indicator column for play == "yes".
y = data_with_dummies_copy['play_yes']

In [None]:
# Display the feature matrix.
x

Unnamed: 0,outlook_overcast,outlook_rainy,outlook_sunny,temp_cool,temp_hot,temp_mild,humidity_high,humidity_normal,windy_False,windy_True
0,0,0,1,0,1,0,1,0,1,0
1,0,0,1,0,1,0,1,0,0,1
2,1,0,0,0,1,0,1,0,1,0
3,0,1,0,0,0,1,1,0,1,0
4,0,1,0,1,0,0,0,1,1,0
5,0,1,0,1,0,0,0,1,0,1
6,1,0,0,1,0,0,0,1,0,1
7,0,0,1,0,0,1,1,0,1,0
8,0,0,1,1,0,0,0,1,1,0
9,0,1,0,0,0,1,0,1,1,0


In [None]:
# Display the target vector.
y

0     0
1     0
2     1
3     1
4     1
5     0
6     1
7     0
8     1
9     1
10    1
11    1
12    1
13    0
Name: play_yes, dtype: uint8

### Train/test split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Report the shape of each split: train features, train labels,
# test features, test labels (in that order).
for split_part in (train_x, train_y, test_x, test_y):
    print(split_part.shape)

(11, 10)
(11,)
(3, 10)
(3,)


## Naive Bayes Classifier

Naive Bayes is a classification algorithm for binary (two-class) and multi-class classification problems. The technique is easiest to understand when described using binary or categorical input values.

It is called naive Bayes (or idiot Bayes) because the calculation of the probabilities for each hypothesis is simplified to make it tractable. Rather than attempting to calculate the joint probability of the attribute values, $P(d_1, d_2, d_3 \mid h)$, the attributes are assumed to be conditionally independent given the target value, so the joint probability is calculated as $P(d_1 \mid h) \cdot P(d_2 \mid h) \cdot P(d_3 \mid h)$.

This is a very strong assumption that is most unlikely in real data, i.e. that the attributes do not interact. Nevertheless, the approach performs surprisingly well on data where this assumption does not hold.

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split.
# NOTE(review): every feature is a 0/1 dummy, so a Gaussian likelihood is a loose
# fit; BernoulliNB or CategoricalNB may suit this data better — confirm.
gb = GaussianNB()
gb_model = gb.fit(train_x,train_y)  # fit() returns the fitted estimator itself


# Accuracy on the rows the model was trained on (optimistic by construction).
gb_train = gb.score(train_x,train_y)
print("Training Score:",gb_train)

# Accuracy on the held-out rows, as an estimate of generalisation.
# The test split is tiny here, so treat this number as a rough indication only.
gb_test = accuracy_score(test_y, gb_model.predict(test_x))
print("Test Score:",gb_test)


Training Score: 0.9090909090909091


In [None]:
# Predict for one observation: outlook=sunny, temp=hot, humidity=high, windy=False.
# The model was fitted on a DataFrame, so the sample is wrapped in a one-row
# DataFrame carrying the training column labels. This makes the column order
# explicit and avoids scikit-learn's "X does not have valid feature names" warning.
sample = pd.DataFrame(
    [[0, 0, 1, 0, 1, 0, 1, 0, 1, 0]],
    columns=train_x.columns,
)
gb_model.predict(sample)

array([0], dtype=uint8)