In [526]:
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import scale
import pandas as pd

In [527]:
bc = load_breast_cancer()
X = bc.data
y = bc.target

In [528]:
print(X.shape)
print(y.shape)

(569, 30)
(569,)


In [529]:
X[:5]

array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, 1.203e+03, 1.096e-01, 1.599e-01,
        1.974e-01, 1.279e-01, 2.069e-01, 5.999e-02, 7.456e-01, 7.869e-01,
        4.585e+00, 9.403e+01, 6.150e-03, 4.006e-02, 3.832e-02, 2.058e-02,
        2.250e-02, 4.571e-03, 2.357e

In [530]:
y[:5]

array([0, 0, 0, 0, 0])

In [531]:
# scale features
X = scale(bc.data)
X[:5]

array([[ 1.09706398e+00, -2.07333501e+00,  1.26993369e+00,
         9.84374905e-01,  1.56846633e+00,  3.28351467e+00,
         2.65287398e+00,  2.53247522e+00,  2.21751501e+00,
         2.25574689e+00,  2.48973393e+00, -5.65265059e-01,
         2.83303087e+00,  2.48757756e+00, -2.14001647e-01,
         1.31686157e+00,  7.24026158e-01,  6.60819941e-01,
         1.14875667e+00,  9.07083081e-01,  1.88668963e+00,
        -1.35929347e+00,  2.30360062e+00,  2.00123749e+00,
         1.30768627e+00,  2.61666502e+00,  2.10952635e+00,
         2.29607613e+00,  2.75062224e+00,  1.93701461e+00],
       [ 1.82982061e+00, -3.53632408e-01,  1.68595471e+00,
         1.90870825e+00, -8.26962447e-01, -4.87071673e-01,
        -2.38458552e-02,  5.48144156e-01,  1.39236330e-03,
        -8.68652457e-01,  4.99254601e-01, -8.76243603e-01,
         2.63326966e-01,  7.42401948e-01, -6.05350847e-01,
        -6.92926270e-01, -4.40780058e-01,  2.60162067e-01,
        -8.05450380e-01, -9.94437403e-02,  1.80592744e+

In [532]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [533]:
# create and train mode
model = KMeans(n_clusters=2)
model.fit(X_train, y_train)



In [534]:
predictions = model.predict(X_test)

In [535]:
labels = model.labels_

In [536]:
print('labels: ', labels)
print('predictions: ', predictions)
print('actuals: ', y_test)

labels:  [0 0 0 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 1 1 1 1 0 0 1 0 0
 0 0 0 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 0
 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1
 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 0 1 1 1 1 0 0 1 1 1 0 1 0 1 1 0 1
 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 0 0 1 0 1 0 1 0 0 1 1 0 0 1 1 0 1 1 1 1
 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1 0 0 1 0 0 1 1 1 0 1 0 0 1 1 1
 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0
 1 1 1 1 0 1 1 1 1 0 1 0 0 1 0 1 1 1 0 0 1 1 0 1 0 1 1 0 0 0 1 1 1 1 1 1 1
 1 0 0 1 0 0 1 0 0 0 1 1 1 1 0 1 1 0 0 0 0 0 1 0 1 1 1 0 1 0 1 1 1 0 1 1 1
 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1
 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 0 1
 0 1 1 1 1 0 0 1 0 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 0 0 0 1 1 1 0 0 0 0 0 1 0
 1 1 0 0 0 1 0 0 0 1 1]
predictions:  [1 0 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 1 

In [537]:
# note k-means can assign labels 0 or 1 to clusters. So if it labels a cluster 0 as 1, then we might get a seemingly very low accuracy but it's actually high.
print('accuracy: ', round(accuracy_score(y_test, predictions),2))

accuracy:  0.91


In [538]:
print(pd.crosstab(y_train, labels))

col_0    0    1
row_0          
0      139   30
1       11  275
