In [2]:
import pickle
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix


In [3]:
# Local alternative (Colab path), kept for reference:
# df = pd.read_csv('/content/breast-cancer-wisconsin-data1.csv')

# Remote read-in (for those who don't have the data downloaded locally).
# The raw UCI file has no header row, so column names are supplied explicitly.

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
columns = ['id', 'clump_thickness', 'uniform_cell_size', 'uniform_cell_shape',
           'marginal_adhesion', 'single_epithelial_size', 'bare_nuclei',
           'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']

# The dataset encodes missing values as '?'. Declaring that marker here makes
# pandas parse the affected column as numeric with NaN, instead of leaving it
# as a string column that has to be repaired afterwards.
df = pd.read_csv(url, names=columns, na_values="?")

In [4]:
# Force 'bare_nuclei' to a numeric dtype; entries that cannot be parsed as a
# number become NaN (errors='coerce') rather than raising.
bare_nuclei_numeric = pd.to_numeric(df['bare_nuclei'], errors='coerce')
df['bare_nuclei'] = bare_nuclei_numeric

In [5]:
# Feature subset: the variables most highly correlated with the class label.
feature_columns = [
    'clump_thickness',
    'uniform_cell_size',
    'uniform_cell_shape',
    'bland_chromatin',
]
X = df[feature_columns]

# Target column.
y = df['class']

In [6]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=31,
)

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split
# (fit() returns the estimator itself, so the call can be chained).
nb_model = GaussianNB().fit(X_train, y_train)

# Predict on the held-out split and compute the evaluation metrics.
nb_y_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_y_pred)
nb_confusion = confusion_matrix(y_test, nb_y_pred)

print('Naive Bayes')
print(f'Accuracy: {nb_accuracy}')
print(f'Confusion Matrix:\n {nb_confusion}')

Naive Bayes
Accuracy: 0.9371428571428572
Confusion Matrix:
 [[103  10]
 [  1  61]]


In [None]:
# Serialize the fitted model to 'nb_model.pkl' in the current working directory.
with open('nb_model.pkl', 'wb') as model_file:
    pickle.dump(nb_model, model_file)