## (1) Build the Naïve Bayes classifier

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw data and discard rows with missing values in one chain.
df = pd.read_csv('cardata.csv').dropna()
# Preview the first ten records.
df.head(10)

Unnamed: 0,Income,Has Child (under 10),Age,Buy Car
0,Low,No,Young,No
1,Mid-Range,Yes,Old,No
2,High,Yes,Mid-aged,Yes
3,High,No,Young,Yes
4,Mid-Range,Yes,Mid-aged,Yes
5,High,No,Old,No
6,Low,Yes,Young,Yes
7,Mid-Range,Yes,Mid-aged,No
8,High,Yes,Young,Yes
9,Low,No,Old,Yes


### Rename attribute

In [3]:
# Map the verbose CSV headers onto short, code-friendly column names.
short_names = {
    "Income": "income",
    "Has Child (under 10)": "child",
    "Age": "age",
    "Buy Car": "buy",
}
df = df.rename(columns=short_names)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   income  10 non-null     object
 1   child   10 non-null     object
 2   age     10 non-null     object
 3   buy     10 non-null     object
dtypes: object(4)
memory usage: 400.0+ bytes


## Encoding child, buy

In [5]:
# Binary-encode both yes/no columns the same way:
# the value 'No' becomes 0, every other value becomes 1.
for column in ('child', 'buy'):
    df[column] = np.where(df[column] == 'No', 0, 1)

## Categorical Feature Encoding

In [6]:
# One-hot encode the multi-valued columns income and age.
# The dtype is pinned explicitly: pandas 2.0 changed the get_dummies default
# from uint8 to bool, which would mix bool indicator columns with the integer
# 'child'/'buy' columns and change the dtypes shown by df.info() on a re-run.
df = pd.get_dummies(df, columns=['income', 'age'], dtype='uint8')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   child             10 non-null     int32
 1   buy               10 non-null     int32
 2   income_High       10 non-null     uint8
 3   income_Low        10 non-null     uint8
 4   income_Mid-Range  10 non-null     uint8
 5   age_Mid-aged      10 non-null     uint8
 6   age_Old           10 non-null     uint8
 7   age_Young         10 non-null     uint8
dtypes: int32(2), uint8(6)
memory usage: 220.0 bytes


In [8]:
df.head(10)

Unnamed: 0,child,buy,income_High,income_Low,income_Mid-Range,age_Mid-aged,age_Old,age_Young
0,0,0,0,1,0,0,0,1
1,1,0,0,0,1,0,1,0
2,1,1,1,0,0,1,0,0
3,0,1,1,0,0,0,0,1
4,1,1,0,0,1,1,0,0
5,0,0,1,0,0,0,1,0
6,1,1,0,1,0,0,0,1
7,1,0,0,0,1,1,0,0
8,1,1,1,0,0,0,0,1
9,0,1,0,1,0,0,1,0


In [9]:
df.shape

(10, 8)

### X is the feature matrix used to train the algorithm
### y is the actual outcome: whether the customer bought a car (1) or not (0)

In [10]:
# Feature columns in the order the model sees them; any hand-built input
# vector passed to predict() has to follow this same order.
feature_columns = [
    "child",
    "income_High", "income_Low", "income_Mid-Range",
    "age_Young", "age_Old", "age_Mid-aged",
]
X = df[feature_columns].values
# Target labels taken from the encoded 'buy' column.
y = df["buy"].values

In [11]:
X

array([[0, 0, 1, 0, 1, 0, 0],
       [1, 0, 0, 1, 0, 1, 0],
       [1, 1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 1, 0, 0],
       [1, 0, 0, 1, 0, 0, 1],
       [0, 1, 0, 0, 0, 1, 0],
       [1, 0, 1, 0, 1, 0, 0],
       [1, 0, 0, 1, 0, 0, 1],
       [1, 1, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 1, 0]], dtype=int32)

In [12]:
y

array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1])

### import sklearn library

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

### Gaussian Naive Bayes

In [14]:
# Create an unfitted Gaussian Naive Bayes classifier with default settings.
# NOTE(review): GaussianNB models every feature as a continuous, normally
# distributed value, while X holds only 0/1 indicators; BernoulliNB or
# CategoricalNB may suit this data better — confirm the model choice.
gnb = GaussianNB()

In [15]:
gnb.fit(X,y)

GaussianNB()

### Prediction on the whole dataset X (the same data used for training)

In [16]:
naive = gnb.predict(X)

## list the result of the prediction

In [17]:
print (naive)

[1 0 1 1 0 0 1 0 1 0]


## (2) Predict “Income = High, Has Child = No, Age = Mid-aged” 

#### Feature order of the input vector: "child", "income_High", "income_Low", "income_Mid-Range", "age_Young", "age_Old", "age_Mid-aged"

In [18]:
# Build the one-row query from named features so each 0/1 position is explicit.
_feature_order = ["child", "income_High", "income_Low", "income_Mid-Range",
                  "age_Young", "age_Old", "age_Mid-aged"]
# Income = High and Age = Mid-aged are switched on; child stays 0 (Has Child = No).
_active = {"income_High", "age_Mid-aged"}
TEST = [[1 if name in _active else 0 for name in _feature_order]]

### prediction result: 1 - buy; 0 - do not buy

In [19]:
# Classify the single hand-built customer profile and show the predicted label.
naiveTest = gnb.predict(TEST)
print(naiveTest)

[1]


### Evaluate Naive Bayes Classifier

### Confusion Matrix

In [20]:
from sklearn.metrics import confusion_matrix

# Rows are the actual classes, columns the predicted classes (0 first, then 1).
confusion_matrix(y_true=y, y_pred=naive)

array([[3, 1],
       [2, 4]], dtype=int64)

### Accuracy

In [21]:
from sklearn.metrics import accuracy_score

# Share of rows whose predicted label equals the actual label.
nb_accuracy = accuracy_score(y_true=y, y_pred=naive)
print("Naive Bayes Classifier:\t", nb_accuracy)

Naive Bayes Classifier:	 0.7


### Precision

In [30]:
from sklearn.metrics import precision_score

# Of the rows predicted as 1 (buy), the share that actually are 1.
nb_precision = precision_score(y_true=y, y_pred=naive)
print("Naive Bayes Classifier:\t", nb_precision)

Naive Bayes Classifier:	 0.8


### Recall (Sensitivity)

In [31]:
from sklearn.metrics import recall_score

# Of the rows that actually are 1 (buy), the share predicted as 1.
nb_recall = recall_score(y_true=y, y_pred=naive)
print("Naive Bayes Classifier:\t", nb_recall)

Naive Bayes Classifier:	 0.6666666666666666


### F1 Score

In [32]:
from sklearn.metrics import f1_score

# Harmonic mean of precision and recall for the positive class (1 = buy).
nb_f1 = f1_score(y_true=y, y_pred=naive)
print("Naive Bayes Classifier:\t", nb_f1)


Naive Bayes Classifier:	 0.7272727272727272


In [25]:
from sklearn.metrics import classification_report

### Classification Report - precision, recall, F1-score and support per class

In [26]:
print(classification_report(y, naive))

              precision    recall  f1-score   support

           0       0.60      0.75      0.67         4
           1       0.80      0.67      0.73         6

    accuracy                           0.70        10
   macro avg       0.70      0.71      0.70        10
weighted avg       0.72      0.70      0.70        10



### Improved Performance with Bagging Classifier (measured on the training data)

In [27]:

from sklearn.ensemble import BaggingClassifier

# Bagging fits each base estimator on a random resample of the data, so the
# ensemble is seeded to make a fresh "Restart & Run All" give the same result.
# The output recorded below came from an unseeded run and may differ after re-running.
bc = BaggingClassifier(random_state=42)
bc.fit(X, y)
# NOTE: predictions are made on the same rows the model was fitted on, so any
# score derived from them measures training fit, not generalization.
bc_predictions = bc.predict(X)
print(bc_predictions)

[0 0 1 1 1 0 1 1 1 1]


In [28]:
print(classification_report(y, bc_predictions))

              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.86      1.00      0.92         6

    accuracy                           0.90        10
   macro avg       0.93      0.88      0.89        10
weighted avg       0.91      0.90      0.90        10



### Second prediction for the same test case,
### using the bagging classifier (higher accuracy on the training data) as a comparison
### prediction result: 1 - buy; 0 - do not buy

In [29]:
# Classify the same hand-built customer profile with the bagging ensemble.
bcTest = bc.predict(TEST)
print(bcTest)

[1]
