# BREAST CANCER DATASET

A first look at the breast cancer dataset, a classic (binary) classification dataset via kNN method.

scikit learn includes a data set for breast cancer (diagnostic) in Wisconsin, 

In [37]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

#LOADING DATA
cancer = load_breast_cancer()

#print(cancer.DESCR) # data set description

In [38]:
print('Keywords used in the data set:')
print(cancer.keys())

Keywords used in the data set:
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [39]:
# CONVERTING IN DATAFRAME
# "target" contains 1 if benign, or 0 if malignant
data = np.c_[cancer.data, cancer.target]
columns = np.append(cancer.feature_names, ["target"])
cancer_df = pd.DataFrame(data, columns=columns)

print('Number of features: ', len(cancer['feature_names']))

Number of features:  30


In [40]:
# split the data in malignant or benign and each case:

counts = cancer_df.target.value_counts(ascending=True) #organise by target, 
# ascending=True sets target=0 (malignant) first

counts.index = "malignant benign".split()  #splitting data for counting, first malignant, and then benign

counts = [counts.malignant, counts.benign]; #counts

print("Dataset contains [malignant, benign] cases: ", counts)

Dataset contains [malignant, benign] cases:  [212, 357]


## Starting treatment for applying ML techniques

*We need to rerun cancer_df because it is ordered by target, splitting data for train/test will be problematic*

In [41]:
cancer_df = pd.DataFrame(data, columns=columns)

In [42]:
# separate variables

#X = pd shape, matrix
X = cancer_df[cancer_df.columns[:-1]] #ignores target

# y = pd series, vector
y = cancer_df.target # target
    

### Importing kNN model 

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Auto amount of data for train/test (75/25)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = KNeighborsClassifier(n_neighbors=1)

model.fit(X_train, y_train) # fitting model

# now, model is fitted and working, should predict correctly
print("Accuracy: ", model.score(X_test, y_test))


Accuracy:  0.916083916083916


### Accuracy in function of k

In [64]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

k = [1,3,6,11]
for index in [1,3,6,11, 15, 20, 30]:
    model = KNeighborsClassifier(n_neighbors = index)

    model.fit(X_train, y_train) # fitting model
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    # now, model is fitted and working, should predict correctly
    print('For K= ', index)
    print("Accuracy train: ", train_score)
    print("Accuracy test: ", test_score)
    print('\n')

For K=  1
Accuracy train:  1.0
Accuracy test:  0.916083916083916


For K=  3
Accuracy train:  0.9577464788732394
Accuracy test:  0.9230769230769231


For K=  6
Accuracy train:  0.9413145539906104
Accuracy test:  0.9230769230769231


For K=  11
Accuracy train:  0.9366197183098591
Accuracy test:  0.958041958041958


For K=  15
Accuracy train:  0.9366197183098591
Accuracy test:  0.958041958041958


For K=  20
Accuracy train:  0.9272300469483568
Accuracy test:  0.951048951048951


For K=  30
Accuracy train:  0.9131455399061033
Accuracy test:  0.9440559440559441




### Prediction for mean values

*i.e. if we take mean values for each feature, will the cancer be malignant(0), or benign (1)?*

In [10]:
cancer_df = pd.DataFrame(data, columns=columns)

means = cancer_df.mean()[:-1].values.reshape(1, -1) # reshape into a row with no target column

model.predict(means)

if model.predict(means) == 1.:
    kind = "benign"
else:
    kind= "malignant"

print("Prediction for mean values: ", model.predict(means), ", cancer is ", kind + ".")


Prediction for mean values:  [1.] , cancer is  benign.


### knn regression 

In [28]:
from sklearn.neighbors import KNeighborsRegressor
import plotly
import plotly.graph_objs as go
#from plotly.plotly import iplot, plot  # versions being updated, need load various modules
from chart_studio.plotly import plot, iplot
from sklearn.datasets import make_regression


#Generating data:
#X, y = pd shape, matrix
X_knnreg, y_knnreg = make_regression(n_samples = 100, n_features=1,
                            n_informative=1, bias = 150.0,
                            noise = 30, random_state=0)

# Visualize data: need to reshape data
trace = go.Scatter(x = X_knnreg.reshape(-1),
                   y = y_knnreg.reshape(-1),
                   mode = 'markers'
                   )

fig = go.Figure(data = trace)
#fig.show()  # uncomment this line for checking

# train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# model, knn regresion
knnreg = KNeighborsRegressor(n_neighbors = 5).fit(X_train, y_train)

print('R² for test: {:.3f}'
     .format(knnreg.score(X_test, y_test)))

# *testing means every n values*

cancer_df = pd.DataFrame(data, columns=columns)

n = 5
n = int(n)

n_mean = cancer_df.groupby(np.arange(len(cancer_df))//n).mean()[:-1].values.reshape(1, -1)

model.predict(n_mean)