# Breast Cancer Detection

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier as kNN
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [2]:
np.set_printoptions(suppress=True)
plt.style.use("bmh")
config = {'figure.figsize': (16, 3),
          'axes.titlesize': 18,
          'axes.labelsize': 10,
          'lines.linewidth': 2,
          'lines.markersize': 10,
          'xtick.labelsize': 10,
          'ytick.labelsize': 10,
          'axes.prop_cycle': plt.cycler(color=["darkmagenta", "saddlebrown", "darkcyan", "olivedrab", "darkseagreen", "darkkhaki", "darkgoldenrod", "deepskyblue", "firebrick", "palevioletred"]),}
plt.rcParams.update(config)

<img src="https://www.techexplorist.com/wp-content/uploads/2020/02/cancer-cells-moving-1024x682.jpg" style="width: 50%;">

# Data set

The data set contains information about approximately 700 cells, which includes factors which are properties that would be valuable for a pathologist:

* Clump thickness
* Uniform cell size
* Uniform cell shape
* Marginal adhesion
* Single epithelial_size
* Bare nuclei
* Bland chromatin
* Normal nucleoli
* Mitoses

The target variable states whether the cell is malignant ($y=4$) or benign ($y=2$).

In [3]:
# Load Dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
names = ['id', 'clump_thickness', 'uniform_cell_size', 'uniform_cell_shape', 'marginal_adhesion', 'single_epithelial_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']
df = pd.read_csv(url, names=names)

In [4]:
df.head()

Unnamed: 0,id,clump_thickness,uniform_cell_size,uniform_cell_shape,marginal_adhesion,single_epithelial_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
df.replace('?',-99999, inplace=True)
df.drop(['id'], axis=1, inplace=True)

In [6]:
# Create X and Y datasets for training
X = np.array(df.drop(['class'], axis=1))
y = np.array(df['class'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
model = kNN(n_neighbors=5)

In [15]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=30, random_state=1)
cv_results = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
print(f'kNN CV mean accuracy: {cv_results.mean():.2f}')

kNN CV mean accuracy: 0.97


In [16]:
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [18]:
print(f'kNN test accuracy: {accuracy_score(y_test, predictions):.2f}')
print(classification_report(y_test, predictions))

kNN test accuracy: 0.98
              precision    recall  f1-score   support

           2       0.99      0.98      0.98        93
           4       0.96      0.98      0.97        47

    accuracy                           0.98       140
   macro avg       0.97      0.98      0.98       140
weighted avg       0.98      0.98      0.98       140

