In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tabulate import tabulate

import warnings
def warn(*args, **kwargs):
    pass
warnings.warn = warn

In [2]:
# Load the golf dataset from a CSV located one level above the current
# working directory (relative path), then preview the first five rows.
df = pd.read_csv(r"../golf-dataset.csv")
df.head()

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes


In [3]:
# Print the column names, dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Outlook    14 non-null     object
 1   Temp       14 non-null     object
 2   Humidity   14 non-null     object
 3   Windy      14 non-null     bool  
 4   Play Golf  14 non-null     object
dtypes: bool(1), object(4)
memory usage: 594.0+ bytes


In [4]:
# Per-column summary; for non-numeric columns pandas reports
# count / unique / top / freq rather than mean and quantiles.
df.describe()

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
count,14,14,14,14,14
unique,3,3,2,2,2
top,Rainy,Mild,High,False,Yes
freq,5,6,7,8,9


In [5]:
# Per-column flag: True if the column contains at least one missing value.
df.isnull().any()

Outlook      False
Temp         False
Humidity     False
Windy        False
Play Golf    False
dtype: bool

In [6]:
# Total number of missing cells across the whole frame (column sums, then summed).
df.isnull().sum().sum()

0

In [7]:
# Integer-encode every column of `df`, using a separate LabelEncoder per column.
# Keeping one fitted encoder per column (instead of refitting a single shared
# instance) preserves each column's label <-> code mapping, so values can be
# decoded later with encoders[col].inverse_transform(...).
encoders = {col: LabelEncoder().fit(df[col]) for col in df.columns}

# Build the encoded frame in one step, with the original column names and index,
# rather than transposing a list of arrays and renaming columns afterwards.
encoded_df = pd.DataFrame(
    {col: enc.transform(df[col]) for col, enc in encoders.items()},
    index=df.index,
)

# Names retained for any later cell that expects them: `cols` is the column
# list and `encoder` is the encoder fitted on the last column of `df`.
cols = df.columns.tolist()
encoder = encoders[cols[-1]]

encoded_df

Unnamed: 0,Outlook,Temp,Humidity,Windy,Play Golf
0,1,1,0,0,0
1,1,1,0,1,0
2,0,1,0,0,1
3,2,2,0,0,1
4,2,0,1,0,1
5,2,0,1,1,0
6,0,0,1,1,1
7,1,2,0,0,0
8,1,0,1,0,1
9,2,2,1,0,1


In [8]:
# Separate the target column from the predictors.
y = encoded_df['Play Golf']
X = encoded_df.drop(columns='Play Golf')

In [9]:
# Display the encoded feature matrix (the frame with the target column removed).
X

Unnamed: 0,Outlook,Temp,Humidity,Windy
0,1,1,0,0
1,1,1,0,1
2,0,1,0,0
3,2,2,0,0
4,2,0,1,0
5,2,0,1,1
6,0,0,1,1
7,1,2,0,0
8,1,0,1,0
9,2,2,1,0


In [10]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=72018,
)

# Print the shape of each partition: train/test features, then train/test labels.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(9, 4)
(5, 4)
(9,)
(5,)


In [11]:
# Sweep the test fraction from 10% to 90% in steps of 10%, fitting GaussianNB
# on each split and collecting accuracy, classification report and confusion
# matrix into one results table.
#
# NOTE(review): the dataset is tiny, so the largest test fractions leave only
# one or two rows for training - treat those rows of the table with caution.
splits = [tenths / 10 for tenths in range(1, 10)]
result = []
gnb = GaussianNB()

for sno, split in enumerate(splits, start=1):
    # This deliberately rebinds the notebook-level X_train / X_test / y_train /
    # y_test (and y_pred below); after the loop they hold the values from the
    # final split, which is what later cells read.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split, random_state=72018
    )
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    score = accuracy_score(y_test, y_pred)
    cr = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Format percentages with one decimal place: raw float arithmetic such as
    # (1 - 0.7) * 100 yields 30.000000000000004, which would otherwise leak
    # into the table labels.
    train_pct = f"{(1 - split) * 100:.1f}%"
    test_pct = f"{split * 100:.1f}%"
    result.append([sno, train_pct, test_pct, cr, cm, score])

print('Results Table\n')
print(tabulate(result, headers=['SNo','Train %','Test %','Report','Matrix','Accuracy'], tablefmt='grid'))

Results Table

+-------+---------------------+----------+-------------------------------------------------------+----------+------------+
|   SNo | Train %             | Test %   | Report                                                | Matrix   |   Accuracy |
|     1 | 90.0%               | 10.0%    | precision    recall  f1-score   support               | [[0 1]   |   0.5      |
|       |                     |          |                                                       |  [0 1]]  |            |
|       |                     |          |            0       0.00      0.00      0.00         1 |          |            |
|       |                     |          |            1       0.50      1.00      0.67         1 |          |            |
|       |                     |          |                                                       |          |            |
|       |                     |          |     accuracy                           0.50         2 |          |            |
|

  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)


In [12]:
# Tabulate the classification report for the current y_test / y_pred:
# request it as a nested dict and hand that straight to pandas.
cr_df = pd.DataFrame(
    classification_report(y_test, y_pred, output_dict=True)
)
cr_df

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.0,0.615385,0.615385,0.307692,0.378698
recall,0.0,1.0,0.615385,0.5,0.615385
f1-score,0.0,0.761905,0.615385,0.380952,0.468864
support,5.0,8.0,0.615385,13.0,13.0


In [13]:
# Plain-text classification report for the current y_test / y_pred. When the
# notebook is run top to bottom these are the values left behind by the last
# iteration of the split loop, not the 70/30 split made earlier.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.62      1.00      0.76         8

    accuracy                           0.62        13
   macro avg       0.31      0.50      0.38        13
weighted avg       0.38      0.62      0.47        13

