In [1]:
import numpy as np 

In [75]:
def loadDataSet(fileName):
    dataMat = []
    with open(fileName) as fr:
        for line in fr.readlines():
            curLine = line.strip().split('\t')
            fltLine = list(map(float, curLine))
            dataMat.append(fltLine)
    return dataMat

def distEclud(vecA, vecB):
    """计算两个向量的欧氏距离"""
    return np.sqrt(np.sum(np.power(vecA - vecB, 2)))

def randCent(dataSet, k):
    n = dataSet.shape[1]
    centroids = np.mat(np.zeros((k, n)))
    for j in range(n):
        minJ = dataSet[:,j].min()
        rangeJ = float(dataSet[:,j].max() - minJ)
        centroids[:,j] = minJ + rangeJ * np.random.rand(k,1)
    return centroids 

In [76]:
datMat = np.mat(loadDataSet('testSet.txt'))

In [77]:
print(datMat[:,0].min(), datMat[:,0].max())
print(datMat[:,1].min(), datMat[:,1].max())

-5.379713 4.838138
-4.232586 5.1904


In [78]:
randCent(datMat, 2)

matrix([[ 2.76180247,  1.68363201],
        [ 0.3497634 , -1.43300819]])

In [79]:
distEclud(datMat[0], datMat[1])

5.184632816681332

In [80]:
# k均值聚类算法

def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    m = dataSet.shape[0]
    clusterAssment = np.mat(np.zeros((m, 2)))
    centroids = createCent(dataSet, k)
    clusterChanged = True 
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = np.inf; minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j,:], dataSet[i,:])
                if distJI < minDist:            # 寻找最近的质心
                    minDist = distJI; minIndex = j 
            if clusterAssment[i,0] != minIndex:
                clusterChanged = True 
            clusterAssment[i,:] = minIndex, minDist ** 2
        # print(centroids)
        for cent in range(k):
            pstInClust = dataSet[np.nonzero(clusterAssment[:,0].A == cent)[0]]  
            centroids[cent, :] = np.mean(pstInClust, axis=0)       # 更新质心的质量
    return centroids, clusterAssment

In [81]:
datMat = np.mat(loadDataSet('testSet.txt'))
_,_ = kMeans(datMat, 4)

In [140]:
# 二分K-均值聚类算法

def biKmeans(dataSet, k, distMeas=distEclud):
    m = dataSet.shape[0]
    clusterAssment = np.mat(np.zeros((m, 2)))
    centroid0 = np.mean(dataSet, axis=0).tolist()[0]
    centList = []
    centList.append(centroid0)
    for j in range(m):
        clusterAssment[j,1] = distMeas(np.mat(centroid0), dataSet[j,:]) ** 2
    while (len(centList) < k):
        lowestSSE = np.inf 
        for i in range(len(centList)):
            pstInCurrClutser = \
                dataSet[np.nonzero(clusterAssment[:,0].A==i)[0], :]
            centroidMat, splitClustAss = \
                kMeans(pstInCurrClutser, 2, distMeas)
            sseSplit = np.sum(splitClustAss[:,1])
            sseNotSplit = \
                np.sum(dataSet[np.nonzero(clusterAssment[:,0].A!=i)[0], 1])
            print("sseSplit, and not Split: ", sseSplit, sseNotSplit) 
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i 
                bestNewCents = centroidMat 
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit 
        bestClustAss[np.nonzero(bestClustAss[:,0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:,0].A == 0)[0], 0] = bestCentToSplit 
        print("the bestCentToSplit is: ", bestCentToSplit)
        print("the len of bestClustAss is: ", len(bestClustAss))
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]
        centList.append(bestNewCents[1,:].tolist()[0])
        clusterAssment[np.nonzero(clusterAssment[:,0].A == \
                            bestCentToSplit)[0],:] = bestClustAss 
    return np.mat(centList), clusterAssment

In [141]:
datMat3 = np.mat(loadDataSet('testSet2.txt'))

In [142]:
centList, myNewAssements = biKmeans(datMat3, 3)

sseSplit, and not Split:  541.2976292649145 0.0
the bestCentToSplit is:  0
the len of bestClustAss is:  60
sseSplit, and not Split:  67.2202000797829 62.556557
sseSplit, and not Split:  25.194262086233078 10.96325
the bestCentToSplit is:  1
the len of bestClustAss is:  20


In [143]:
centList

matrix([[-1.70351595,  0.27408125],
        [ 1.788374  ,  2.990118  ],
        [ 3.55066577,  3.20197931]])

In [144]:
import urllib
import json

In [150]:
def geoGrab(stAddress, city):
    apiStem = 'http://where.yahooapis.com/geocode?'  #create a dict and constants for the goecoder
    params = {}
    params['flags'] = 'J'#JSON return type
    params['appid'] = 'aaa0VN6k'
    params['location'] = '%s %s' % (stAddress, city)
    url_params = urllib.parse.urlencode(params)
    yahooApi = apiStem + url_params      #print url_params
    print(yahooApi)
    c=urllib.request.urlopen(yahooApi)
    return json.loads(c.read())