# Lab2 - Naive Bayes classifier

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes  import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

## Introduction

**Goal** : Predict whether a player's career length is greater than or equal to 5 years

**Assumption of distribution** : Gaussian distribution (for numeric features)

### Processing flow
1. Read raw data
2. Data preprocess
3. Build model

### Dataset - [NBA Prediction for Naive Bayes](https://www.kaggle.com/wtamubuff/nba-prediction-for-naive-bayes)
#### Files ####
1. nba_longevity.csv

<img src="Data_Dictionary_nba_longevity.png" style="width: 400px;"/>

## Read raw data
1. read data
1. basic observation

In [None]:
# Load the NBA longevity table from the CSV file into a DataFrame.
df_raw_data = pd.read_csv(filepath_or_buffer="nba_longevity.csv")

# Preview the first five rows as the cell output.
df_raw_data.head(n=5)

In [None]:
# list every column label of the raw table
df_raw_data.columns

In [None]:
# look up the dtype of a single column via the per-column dtype table
df_raw_data.dtypes['PTS']

## Data preprocess
### Select the features used for prediction and the ground-truth label.
   - **GP** : games played
   - **MIN** : minutes played
   - **PTS** : points per game
   - **TARGET_5Yrs** : outcome: 1 if career length >= 5 yrs, 0 if < 5 yrs

In [None]:
# Keep the three predictor columns plus the label; the label stays last so
# it can be split off by position later on.
selected_columns = ['GP', 'MIN', 'PTS', 'TARGET_5Yrs']
df_selected_data = df_raw_data[selected_columns]

# report how many rows were selected
print(f'There are {df_selected_data.shape[0]} pieces of data.\n')

df_selected_data.head()

### process missing data
1. check nan value
2. fill a cell manually
2. fillna by column
3. dropna

In [None]:
# count the missing (NaN) entries in each selected column
df_selected_data.isna().sum()

### Other preprocess
Nothing to do.

In [None]:
# Convert the selected DataFrame into a NumPy array; the model-building cell
# slices this array by column position to separate features from the label.
preprocessed_data = df_selected_data.to_numpy()

## Build model
1. Split dataset to training set and testing set.
2. Training model
3. Evaluation

In [None]:
# Every column except the last is a feature; the last column is the label.
X, Y = preprocessed_data[:, :-1], preprocessed_data[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

### Build model

In [None]:
# fit() returns the estimator itself, so construction and training are chained.
gaussian_nb_model = GaussianNB().fit(X_train, Y_train)

# Show the fitted estimator as the cell output.
gaussian_nb_model

### Evaluation
1. calculating $\text{accuracy} = \frac{TP + TN}{\text{Number of samples}}$

2. calculating $\text{precision} = \frac{TP}{TP + FP}$

2. calculating $\text{recall} = \frac{TP}{TP + FN}$

2. calculating $\text{F1 score} = \frac{2 \times \text{precision} \times \text{recall}}{\text{precision} + \text{recall}}$

1. [Other metrics](https://scikit-learn.org/stable/modules/model_evaluation.html)

3. drawing confusion matrix

In [None]:
# Predict the label of every held-out sample with the trained model.
Y_pred = gaussian_nb_model.predict(X_test)

# Scalar metrics comparing the predictions with the ground truth.
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred)
recall = recall_score(Y_test, Y_pred)
# Store the result under `f1`, NOT `f1_score`: reusing the function's name
# would overwrite the imported sklearn function with a float, and running
# this cell a second time would then fail with "object is not callable".
f1 = f1_score(Y_test, Y_pred)

# Show the individual metrics so the values computed above are visible.
print(f'accuracy : {accuracy:.3f}')
print(f'precision: {precision:.3f}')
print(f'recall   : {recall:.3f}')
print(f'F1 score : {f1:.3f}\n')

# Per-class summary; names are listed in label order (0 first, then 1).
target_names = ['< 5 yrs', '>= 5 yrs']
print(classification_report(Y_test, Y_pred, target_names=target_names))

In [None]:
# Compute the confusion matrix once and plot that same matrix, instead of
# letting the display helper recompute it from the predictions.
cm = confusion_matrix(Y_test, Y_pred)

# Tick labels follow the sorted label order used by confusion_matrix (0, then 1).
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['< 5 yrs', '>= 5 yrs'],
)
disp.plot()
disp.ax_.set_title('Gaussian naive Bayes - confusion matrix (test set)')
plt.show()

## Practice - adding new feature into naive Bayes classifier

**Hint :** 
1. Make sure that the naive (conditional independence) assumption makes sense for the added feature.
1. Make sure that a Gaussian distribution is suitable for modeling the distribution of the added feature.