## Training a KNN model to diagnose breast cancer

In [None]:
#Library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot appearance: reversed "Dark2" colour cycle on a dark grid background
sns.set_palette(palette="Dark2_r")
sns.set_style(style="darkgrid")

#### Reading data

In [None]:
# Load the raw CSV (path is relative to the notebook's working directory)
# and preview the first rows.
DATA_PATH = '../data/breast cancer data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Column names, dtypes and non-null counts: used to spot missing values
df.info()

#### As we can see above, there are no missing values, apart from a mysterious column with no name and no data

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

#### The dataset has 32 columns, with 1 column referring to the patient ID and a last column with only null values

In [None]:
# Remove the patient identifier and the empty trailing column before analysis.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because the columns have already been dropped.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

## Exploratory analysis

In [None]:
def plot_feature_by_diagnosis(data, feature, label):
    """Draw a box plot of one numeric feature grouped by diagnosis.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'diagnosis' column and the `feature` column.
    feature : str
        Name of the column plotted on the y-axis.
    label : str
        Human-readable feature name used in the title and the y-axis label.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the box plot was drawn on.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.boxplot(data=data, x='diagnosis', y=feature, ax=ax)
    # Same wording for every figure: "<feature> versus Diagnosis" / "Diagnosis"
    ax.set_title(f'{label} versus Diagnosis', fontsize=20)
    ax.set_xlabel('Diagnosis', fontsize=16)
    ax.set_ylabel(label, fontsize=16)
    return ax


# One figure per feature; each is shown before the next one is created.
for feature, label in [
    ('radius_mean', 'Radius Mean'),
    ('perimeter_mean', 'Perimeter Mean'),
    ('concave points_mean', 'Concave Points Mean'),
]:
    ax = plot_feature_by_diagnosis(df, feature, label)
    plt.show()


#### We can see here that features such as the average perimeter and the average radius of a tumor can be quite discriminative for the diagnosis

#### Checking how balanced the target classes are

In [None]:
# Bar chart with one bar per diagnosis label, showing how many rows fall in each class.
# Note: catplot returns a FacetGrid (not a matplotlib Axes), bound here to `ax`.
ax = sns.catplot(data=df, kind='count', x='diagnosis')
ax;

In [None]:
# Pairwise correlations between the numeric columns.
# Non-numeric columns (the 'diagnosis' label is still un-encoded at this point)
# are excluded explicitly: since pandas 2.0 DataFrame.corr() no longer drops
# them silently and raises instead.
numeric_features = df.select_dtypes(include='number')

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(numeric_features.corr(), annot=True, linewidths=.5, ax=ax)
ax.set_title('Correlation map between variables', fontsize=20)
ax;

### Variance

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
 
# Variance of every predictor on its original (unscaled) scale.
# VarianceThreshold is fitted with its default settings purely to read
# back the per-column `variances_` it computes.
features = df.drop(columns=['diagnosis'])
Var = VarianceThreshold().fit(features)

pd.DataFrame({'Variance': Var.variances_.round(5)}, index=features.columns)

## Training

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Fixed seed so the train/test split, and therefore every metric reported
# below, is identical on each Restart & Run All.
RANDOM_STATE = 42

# Predictors are every column except the label; the label is encoded as
# integers. The fitted encoder is kept so the original class names can be
# recovered later via `label_encoder.classes_`.
X = df.drop(columns=['diagnosis'])
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['diagnosis'])

# Stratified 70/30 split: both subsets keep the class proportions of `y`
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=RANDOM_STATE
)

# Fit the scaler on the training data only and reuse it for the test data,
# so no test-set statistics leak into training
scaler = StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KNN classifier with scikit-learn's default hyperparameters
model = KNeighborsClassifier()
model.fit(X_scaled, y_train)

## Model Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Training set: predictions on the same data the model was fitted on
pred = model.predict(X_scaled)
print(classification_report(y_train, pred))

# confusion_matrix puts the true classes on rows and the predicted classes on
# columns; label the axes so the heatmap can be read on its own.
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_train, pred), annot=True, fmt='d', ax=ax)
ax.set_title('Confusion Matrix - Training set')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

In [None]:
# Test set: predictions on the held-out data
pred = model.predict(X_test_scaled)
print(classification_report(y_test, pred))

# confusion_matrix puts the true classes on rows and the predicted classes on
# columns; label the axes so the heatmap can be read on its own.
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test, pred), annot=True, fmt='d', ax=ax)
ax.set_title('Confusion Matrix - Test set')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()