# K-Means Clustering Example

Let's make some fake data that includes people clustered by income and age, randomly:

In [64]:
from numpy import random, array

#Create fake income/age clusters for N people in k clusters
def createClusteredData(N, k):
    random.seed(10)
    pointsPerCluster = float(N)/k
    X = []
    for i in range (k):
        incomeCentroid = random.uniform(20000.0, 200000.0)
        ageCentroid = random.uniform(20.0, 70.0) 
        print(incomeCentroid,ageCentroid)
        for j in range(int(pointsPerCluster)):
            X.append([random.normal(incomeCentroid, 10000.0), random.normal(ageCentroid, 2.0)])
    X = array(X)
    return X

We'll use k-means to rediscover these clusters in unsupervised learning:

In [66]:
%matplotlib inline

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from numpy import random, float

data = createClusteredData(1000, 7)

model = KMeans(n_clusters=7)

# Note I'm scaling the data to normalize it! Important for good results.
model = model.fit(scale(data))

# We can look at the clusters each data point was assigned to
print(model.labels_)
print(model.centroid_)

# And we'll visualize it:
plt.figure(figsize=(8, 6))
plt.scatter(data[:,0], data[:,1], c=model.labels_.astype(float))
plt.show()

158837.7157880143 21.037597467970073
113022.07000767955 51.30718345632117
179356.56878498103 32.652636739771246
189155.6034622904 50.670189924001924
172481.43619514283 41.90256562996446
116005.50769408407 68.35072696104791
86666.39395085273 54.63804434389909
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 3 5 5 5 5 5 5 5 5 5 3 5 3 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 3 5 5 5 3 5 5 5 5 5 5 5 5 5 5 5 5 5 3 5 5
 5 5 3 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 3 5 5 5 5 3 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4

AttributeError: 'KMeans' object has no attribute 'centroid_'

## Activity

Things to play with: what happens if you don't scale the data? What happens if you choose different values of K? In the real world, you won't know the "right" value of K to start with - you'll need to converge on it yourself.

In [61]:
data

array([], dtype=float64)

In [62]:
data[:,0]

IndexError: too many indices for array

In [40]:
data[:,1]

array([22.46815542, 21.02082977, 19.59742635, 21.25469452, 20.68839705,
       23.44367222, 23.09414562, 21.92787269, 21.30787122, 18.8779877 ,
       17.55085288, 25.80753213, 24.38284189, 23.83359022, 22.26400584,
       19.93897944, 20.08531344, 21.42762403, 20.36233279, 19.57365846,
       20.33585369, 20.05892303, 20.61220219, 21.66193734, 20.74275695,
       21.61578588, 22.45391751, 21.44475906, 22.87251534, 20.31323657,
       20.03413967, 19.64197741, 19.97900531, 18.20048541, 20.79378609,
       21.95940327, 23.01574238, 25.97289958, 22.27879879, 19.44157983,
       24.52722576, 20.59205007, 16.77417325, 21.8242799 , 17.04880993,
       21.52668542, 19.52981155, 22.87413577, 21.21677269, 17.12857323,
       17.22268369, 21.97710034, 20.23732076, 22.73401469, 19.46305962,
       20.09598296, 19.55888399, 20.33983363, 21.3198058 , 17.80045597,
       18.39670238, 25.96824763, 21.7300637 , 21.371218  , 22.37337669,
       18.77849508, 21.66525851, 20.5940112 , 21.09621689, 23.93

In [41]:
len(data[:,1])

994

In [42]:
incomeCentroid = random.uniform(20000.0, 200000.0)
ageCentroid = random.uniform(20.0, 70.0)
incomeCentroid


174041.47169033138

In [43]:
ageCentroid

32.28351675000191