In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
% matplotlib inline

## Wisconsin Breast Cancer Database

In [2]:
# Column labels for the (header-less) Wisconsin Breast Cancer file, in file order.
bcw_columns = [
    "ID",
    "THICKNESS",
    "CELL_SIZE_UNIFORMITY",
    "CELL_SHAPE_UNIFORMITY",
    "MARGINAL_ADHESION",
    "EPI_CELL_SIZE",
    "BARE_NUCLEI",
    "BLAND_CHROMATIN",
    "NORMAL_NUCLEOLI",
    "MITOSES",
    "CLASS",
]

# Read the raw file; "?" entries are parsed as NaN.
bcw_path = os.path.join("data", "uci", "breast-cancer-wisconsin.data")
bcw_data = pd.read_csv(bcw_path, na_values="?", header=None)
bcw_data.columns = bcw_columns

# Discard every row containing a missing value, then the identifier column.
bcw_data = bcw_data.dropna(axis=0, how="any").drop("ID", axis=1)

# Target is the CLASS column; everything that remains is the feature matrix.
# (pop() removes CLASS from bcw_data in place, so X1 is the same object.)
y1 = bcw_data.pop("CLASS")
X1 = bcw_data

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_tr, X_te, y_tr, y_te = train_test_split(X1, y1, test_size=0.2, random_state=0)
for label, part in (("X_tr:", X_tr), ("X_te:", X_te), ("y_tr:", y_tr), ("y_te:", y_te)):
    print(label, part.shape)

X_tr: (546, 9)
X_te: (137, 9)
y_tr: (546,)
y_te: (137,)


In [23]:
# k-NN grid search on the BCW split.
# NOTE: X_tr / X_te / y_tr / y_te and test_score are re-bound by every dataset
# section further down, so this cell must be run directly after the BCW split
# cell -- run out of order, it silently fits on another dataset's split.
pipe1 = Pipeline([
    ('knn', KNeighborsClassifier())
])

# Search over the neighbourhood size and the Minkowski power parameter p.
grid1 = {
    'knn__n_neighbors': [1, 3, 5, 10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated search, fitted on the training split only.
grid_cv1 = GridSearchCV(pipe1, grid1, cv=5)

grid_cv1.fit(X_tr, y_tr)

# Score of the fitted search object on the held-out split.
test_score = grid_cv1.score(X_te, y_te)

# Collect results and sort them.
# pd.DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0;
# a dict plus an explicit `columns` list gives the same frame and column order.
df1 = pd.DataFrame(
    {
        'neighbors': grid_cv1.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv1.cv_results_['param_knn__p'],
        'mean_te': grid_cv1.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df1.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
2,3,1,0.938462
6,10,1,0.938462
4,5,1,0.936264
3,3,2,0.931868
7,10,2,0.931868
8,15,1,0.92967
0,1,1,0.927473
9,15,2,0.923077
12,25,1,0.923077
1,1,2,0.920879


In [24]:
# Report the held-out score stored in `test_score` by the grid-search cell.
print(f"Test accuracy: {test_score}")

Test accuracy: 0.9473684210526315


## Wisconsin Diagnostic Breast Cancer (WDBC)

In [25]:
# Column labels for the (header-less) WDBC file, listed by column position.
wdbc_columns = [
    "ID", "CLASS",
    "MEAN_RADIUS", "MEAN_TEXTURE", "MEAN_PERIMETER", "MEAN_AREA",
    "MEAN_SMOOTHNESS", "MEAN_COMPACTNESS", "MEAN_CONCAVITY",
    "MEAN_CONCAVE_POINTS", "MEAN_SYMMETRY", "MEAN_FRACTAL_DIMENSIONS",
    "RADIUS_SE", "TEXTURE_SE", "PERIMETER_SE", "AREA_SE",
    "SMOOTHNESS_SE", "COMPACTNESS_SE", "CONCAVITY_SE",
    "CONCAVE_POINTS_SE", "SYMMETRY_SE", "FRACTAL_DIMENSIONS_SE",
    "WORST_RADIUS", "WORST_TEXTURE", "WORST_PERIMETER", "WORST_AREA",
    "WORST_SMOOTHNESS", "WORST_COMPACTNESS", "WORST_CONCAVITY",
    "WORST_CONCAVE_POINTS", "WORST_SYMMETRY", "WORST_FRACTAL_DIMENSIONS",
]

# Read the raw file; "?" entries are parsed as NaN.
wdbc_path = os.path.join("data", "uci", "wdbc.data")
wdbc_data = pd.read_csv(wdbc_path, na_values="?", header=None)

# Map positional labels 0..31 onto the descriptive names above,
# then discard every row containing a missing value.
wdbc_data = wdbc_data.rename(columns=dict(enumerate(wdbc_columns)))
wdbc_data = wdbc_data.dropna(axis=0, how="any")

# Target is the CLASS column (popped out of wdbc_data in place);
# features are all remaining columns except the identifier.
y2 = wdbc_data.pop("CLASS")
X2 = wdbc_data.drop(["ID"], axis=1)

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_tr, X_te, y_tr, y_te = train_test_split(X2, y2, test_size=0.2, random_state=0)
for label, part in (("X_tr:", X_tr), ("X_te:", X_te), ("y_tr:", y_tr), ("y_te:", y_te)):
    print(label, part.shape)

X_tr: (455, 30)
X_te: (114, 30)
y_tr: (455,)
y_te: (114,)


In [26]:
# k-NN grid search on the WDBC split.
# NOTE: X_tr / X_te / y_tr / y_te and test_score are shared names that the other
# dataset sections also assign, so run this cell directly after the WDBC split cell.
pipe2 = Pipeline([
    ('knn', KNeighborsClassifier())
])

# Search over the neighbourhood size and the Minkowski power parameter p.
grid2 = {
    'knn__n_neighbors': [3, 5, 10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated search, fitted on the training split only.
grid_cv2 = GridSearchCV(pipe2, grid2, cv=5)

grid_cv2.fit(X_tr, y_tr)

# Score of the fitted search object on the held-out split.
test_score = grid_cv2.score(X_te, y_te)

# Collect results and sort them.
# pd.DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0;
# a dict plus an explicit `columns` list gives the same frame and column order.
df2 = pd.DataFrame(
    {
        'neighbors': grid_cv2.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv2.cv_results_['param_knn__p'],
        'mean_te': grid_cv2.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df2.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
0,3,1,0.938462
4,10,1,0.938462
2,5,1,0.936264
1,3,2,0.931868
5,10,2,0.931868
6,15,1,0.92967
7,15,2,0.923077
10,25,1,0.923077
3,5,2,0.920879
8,20,1,0.918681


In [27]:
# Report the held-out score stored in `test_score` by the grid-search cell.
print(f"Test accuracy: {test_score}")

Test accuracy: 0.9473684210526315


## Wisconsin Prognostic Breast Cancer (WPBC)

In [28]:
# import the data ("?" entries are parsed as NaN)
wpbc_data = pd.read_csv(os.path.join("data", "uci", "wpbc.data"), na_values="?", header=None)

# set the column names
# NOTE(review): "MEAN_PERIMITER" is misspelled (the WDBC section uses
# "MEAN_PERIMETER"); the label is left unchanged here in case other cells
# reference it -- confirm before renaming.
wpbc_data = wpbc_data.rename(columns={
    0: 'ID', 1: "CLASS", 2: "TIME",
    3: "MEAN_RADIUS", 4: "MEAN_TEXTURE", 5: "MEAN_PERIMITER", 6: "MEAN_AREA",
    7: "MEAN_SMOOTHNESS", 8: "MEAN_COMPACTNESS", 9: "MEAN_CONCAVITY",
    10: "MEAN_CONCAVE_POINTS", 11: "MEAN_SYMMETRY", 12: "MEAN_FRACTAL_DIMENSIONS",
    13: "RADIUS_SE", 14: "TEXTURE_SE", 15: "PERIMETER_SE", 16: "AREA_SE",
    17: "SMOOTHNESS_SE", 18: "COMPACTNESS_SE", 19: "CONCAVITY_SE",
    20: "CONCAVE_POINTS_SE", 21: "SYMMETRY_SE", 22: "FRACTAL_DIMENSIONS_SE",
    23: "WORST_RADIUS", 24: "WORST_TEXTURE", 25: "WORST_PERIMETER", 26: "WORST_AREA",
    27: "WORST_SMOOTHNESS", 28: "WORST_COMPACTNESS", 29: "WORST_CONCAVITY",
    30: "WORST_CONCAVE_POINTS", 31: "WORST_SYMMETRY", 32: "WORST_FRACTAL_DIMENSIONS",
    33: "TUMOR_SIZE", 34: "LYMPH_STATUS",
})

# remove rows with any missing value
wpbc_data = wpbc_data.dropna(axis=0, how="any")

# Binary target: 1 where CLASS == "R" and TIME <= 24, otherwise 0.
# Assign through .loc in a single indexing step: the previous chained form
# (wpbc_data['OUTCOME'][mask] = 1) raised SettingWithCopyWarning and is a
# silent no-op when pandas Copy-on-Write is active.
recurred_early = (wpbc_data.CLASS == "R") & (wpbc_data.TIME <= 24)
wpbc_data['OUTCOME'] = 0
wpbc_data.loc[recurred_early, 'OUTCOME'] = 1

# Pull both label columns out of the frame; the raw CLASS labels are kept
# alongside the derived OUTCOME so they are split consistently below.
y_class = wpbc_data.pop("CLASS")
y3 = wpbc_data.pop("OUTCOME")
# TIME is dropped from the features because OUTCOME is derived from it.
X3 = wpbc_data.drop(["ID","TIME"], axis=1)

# split the data
X_tr, X_te, y_tr, y_te, y_class_tr, y_class_te = train_test_split(X3, y3, y_class, test_size=0.2, random_state=0)
print("X_tr:", X_tr.shape)
print("X_te:", X_te.shape)
print("y_tr:", y_tr.shape)
print("y_te:", y_te.shape)
print("y_class_tr:", y_class_tr.shape)
print("y_class_te:", y_class_te.shape)

X_tr: (155, 32)
X_te: (39, 32)
y_tr: (155,)
y_te: (39,)
y_class_tr: (155,)
y_class_te: (39,)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [30]:
# k-NN grid search on the WPBC split.
# NOTE: X_tr / X_te / y_tr / y_te and test_score are shared names that the other
# dataset sections also assign, so run this cell directly after the WPBC split cell.
pipe3 = Pipeline([
    ('knn', KNeighborsClassifier())
])

# Search over the neighbourhood size and the Minkowski power parameter p.
grid3 = {
    'knn__n_neighbors': [3, 5, 10, 15, 20, 25, 30],
    'knn__p': [1, 2],
}

# 5-fold cross-validated search, fitted on the training split only.
grid_cv3 = GridSearchCV(pipe3, grid3, cv=5)

grid_cv3.fit(X_tr, y_tr)

# Score of the fitted search object on the held-out split.
test_score = grid_cv3.score(X_te, y_te)

# Collect results and sort them.
# pd.DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0;
# a dict plus an explicit `columns` list gives the same frame and column order.
df3 = pd.DataFrame(
    {
        'neighbors': grid_cv3.cv_results_['param_knn__n_neighbors'],
        'distance_metric': grid_cv3.cv_results_['param_knn__p'],
        'mean_te': grid_cv3.cv_results_['mean_test_score'],
    },
    columns=['neighbors', 'distance_metric', 'mean_te'],
)

df3.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,neighbors,distance_metric,mean_te
3,5,2,0.83871
5,10,2,0.83871
6,15,1,0.83871
7,15,2,0.83871
8,20,1,0.83871
9,20,2,0.83871
10,25,1,0.83871
11,25,2,0.83871
12,30,1,0.83871
13,30,2,0.83871


In [31]:
# Report the held-out score stored in `test_score` by the grid-search cell.
print(f"Test accuracy: {test_score}")

Test accuracy: 0.9230769230769231
