Loading the Dataset and Functions

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load scikit-learn's bundled breast cancer dataset. Later cells read its
# `data`, `target`, `target_names` and `DESCR` attributes.
breast_cancer = load_breast_cancer()

In [None]:
# Print the dataset's built-in description text.
print(breast_cancer.DESCR)

Checking the Sample and Target Sizes

In [None]:
# Peek at every 100th target label to see how the classes are encoded.
breast_cancer.target[::100]

In [None]:
# Feature matrix and class labels taken from the loaded dataset.
X, y = breast_cancer.data, breast_cancer.target

# Display both shapes to confirm there is one label per sample row.
(X.shape, y.shape)

Splitting the Data for Training and Testing

In [None]:
# Seeded split, so the same train/test partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=11,
)

Training and Testing Set Sizes

In [None]:
# Display the sizes of the training and testing partitions side by side.
(X_train.shape, X_test.shape)

Creating the Model (GaussianNB)

In [None]:
# Create an unfitted Gaussian naive Bayes classifier with default settings.
nb = GaussianNB()

Training the Model

In [None]:
# Fit the classifier on the training partition only.
nb.fit(X_train, y_train)

Predicting Breast Cancer Classes

In [None]:
# Labels the fitted model assigns to the held-out samples.
predicted = nb.predict(X_test)

# True labels for the same samples, under a name that pairs with `predicted`.
expected = y_test

In [None]:
# First 20 predicted labels, for a quick visual comparison with `expected`.
predicted[:20]

In [None]:
# First 20 true labels, aligned position by position with `predicted[:20]`.
expected[:20]

In [None]:
# Collect every (predicted, expected) pair where the model's label differs
# from the true label.
wrong = [
    (guess, truth)
    for guess, truth in zip(predicted, expected)
    if guess != truth
]

wrong

Estimator Method score

In [None]:
# Accuracy on the held-out test set, shown as a percentage with two decimals.
print(format(nb.score(X_test, y_test), '.2%'))

Confusion Matrix

In [None]:
# Rows correspond to the true labels and columns to the predicted labels
# (first argument is y_true, second is y_pred).
confusion = confusion_matrix(expected, predicted)

confusion

Classification Report

In [None]:
# Class names as plain strings, used as the row labels of the report.
# The loop variable is deliberately not called `breast_cancer`, so it cannot
# be mistaken for (or shadow) the dataset object it iterates over.
names = [str(class_name) for class_name in breast_cancer.target_names]

# Per-class precision, recall and F1 for the held-out predictions.
print(classification_report(expected, predicted, target_names=names))

Visualizing the Confusion Matrix

In [None]:
import pandas as pd

In [None]:
# Wrap the confusion matrix in a DataFrame with integer row/column labels,
# sized from the matrix itself rather than a hard-coded class count.
n_classes = len(confusion)
confusion_df = pd.DataFrame(confusion, index=range(n_classes), columns=range(n_classes))

import seaborn as sns

# fmt='d' prints whole counts; the default '.2g' would switch to scientific
# notation for cells of 100 or more. Tick labels show the class names, and
# the axes are labelled to match confusion_matrix's layout (rows = true
# labels, columns = predicted labels).
axes = sns.heatmap(
    confusion_df,
    annot=True,
    fmt='d',
    cmap='nipy_spectral_r',
    xticklabels=breast_cancer.target_names,
    yticklabels=breast_cancer.target_names,
)
axes.set(xlabel='Predicted', ylabel='Expected');

K-Fold Cross-Validation

In [None]:
# Ten shuffled folds; the fixed seed keeps the fold assignment reproducible.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=11,
)

Using the KFold Object with Function cross_val_score

In [None]:
# Score the naive Bayes model on each fold of the full dataset; the result
# holds one score per fold.
scores = cross_val_score(
    nb,
    breast_cancer.data,
    breast_cancer.target,
    cv=kfold,
)

scores

In [None]:
# Average of the per-fold scores, shown as a percentage.
print(f'Mean accuracy: {scores.mean():.2%}')

In [None]:
# Spread of the per-fold scores; a small value means the folds agree closely.
print(f'Accuracy standard deviation: {scores.std():.2%}')

Running Multiple Models to Find the Best One

In [None]:
# Candidate classifiers to compare, keyed by the display name used in the
# results table. `nb` is the Gaussian naive Bayes estimator created earlier.
estimators = {
    'GaussianNB': nb,
    'KNeighborsClassifier': KNeighborsClassifier(),
    # `multi_class` is intentionally not passed: it is deprecated in recent
    # scikit-learn releases and has no effect on a two-class target.
    # max_iter is raised well above the default so lbfgs can converge.
    'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=10000),
    'SVC': SVC(gamma='scale'),
}

In [None]:
# Build the splitter once: it is seeded, so every estimator is evaluated on
# exactly the same ten folds and there is no need to recreate it per model.
kfold = KFold(n_splits=10, random_state=11, shuffle=True)

for estimator_name, estimator_object in estimators.items():
    # One score per fold for this estimator on the full dataset.
    scores = cross_val_score(
        estimator=estimator_object,
        X=breast_cancer.data,
        y=breast_cancer.target,
        cv=kfold,
    )
    # Right-align the name in a 20-character column so the rows line up.
    print(
        f'{estimator_name:>20}: '
        f'mean_accuracy={scores.mean():.2%}: '
        f'standard_deviation={scores.std():.2%}'
    )

### References:

Huan, X. (2024). *ex15_15.ipynb*.

Huan, X. (2024). *KNN_Iris.ipynb*.

Deitel, P., & Deitel, H. (2020). *Intro to Python for Computer Science and Data Science: Learning to Program with AI, Big Data and the Cloud*. Pearson.