Permalink
Browse files

k-means

  • Loading branch information...
goog committed Apr 4, 2013
1 parent c4be58f commit 17d3c605bdfd8ce928b0340728301f333a00017b
Showing with 253 additions and 0 deletions.
  1. +34 −0 cluster/kmeans++.py
  2. +51 −0 cluster/kmeans.py
  3. +38 −0 pageRANK/PageRank.py
  4. +54 −0 pageRANK/pagerank.py
  5. +76 −0 pageRANK/pagerank_graph.py
View
@@ -0,0 +1,34 @@
+import random
+'''
+The exact algorithm is as follows:
+
+ 1,Choose one center uniformly at random from among the data points.
+ 2,For each data point x, compute D(x), the distance between x and the nearest center that has already been chosen.
+ 3,Choose one new data point at random as a new center, using a weighted probability distribution
+ where a point x is chosen with probability proportional to D(x)**2.
+ 4,Repeat Steps 2 and 3 until k centers have been chosen.
+ 5,Now that the initial centers have been chosen, proceed using standard k-means clustering.
+
+'''
+
def initialize(X, K):
    """k-means++ seeding: choose K initial centers from the points X.

    The first center is X[0]; each later center is drawn from X with
    probability proportional to D(x)**2, the squared distance from x to
    the nearest center already chosen.

    X: sequence of equal-length numeric points (tuples or lists)
    K: number of centers to pick (1 <= K <= len(X))
    Returns a list of K elements of X (the original objects).
    """
    # NOTE: the original called scipy.array/scipy.inner/scipy.rand, none of
    # which were imported (only `random` is) and which modern SciPy no longer
    # exposes; this version needs only the stdlib `random` module.
    C = [X[0]]
    for _ in range(1, K):
        # D(x)^2 for every point: squared distance to its nearest center.
        D2 = [min(sum((a - b) ** 2 for a, b in zip(x, c)) for c in C)
              for x in X]
        # Inverse-CDF sampling from the D^2-weighted distribution.
        r = random.random() * float(sum(D2))
        cum = 0.0
        i = len(X) - 1  # safe fallback (original left `i` possibly unbound)
        for j, d in enumerate(D2):
            cum += d
            if r < cum:
                i = j
                break
        C.append(X[i])
    return C
+
# Reference: http://stackoverflow.com/questions/5466323/how-exactly-does-k-means-work

if __name__ == '__main__':
    # Demo: pick k seed centers from a small 2-D data set.
    # (The original called kmeans(k, data), which is not defined in this
    # module — it lives in kmeans.py — and raised a NameError.)
    k = 2
    data = [(1, 1), (1, 2), (1, 3), (4, 2), (4, 3), (4, 4)]
    print(initialize(data, k))
View
@@ -0,0 +1,51 @@
+import random
+
def kmeans(k, data):
    """Lloyd's algorithm for k-means clustering of 2-D points.

    k:    number of clusters (1 <= k <= len(data))
    data: list of (x, y) tuples

    Prints each converged cluster and returns the list of k point sets.
    """
    # Initial centroids: k distinct data positions chosen at random.
    init_ids = random.sample(range(len(data)), k)
    centroid_list = [data[i] for i in init_ids]

    while True:
        # Fresh, empty clusters each pass.  (The original appended new sets
        # to the existing list instead of resetting it, so stale point
        # assignments accumulated forever and the loop never terminated
        # correctly — see the owner's own commit comment.)
        cluster = [set() for _ in range(k)]

        # Assignment step: each point joins its nearest centroid's cluster.
        for point in data:
            best = min(
                range(k),
                key=lambda j: (point[0] - centroid_list[j][0]) ** 2
                            + (point[1] - centroid_list[j][1]) ** 2,
            )
            cluster[best].add(point)

        # Update step: recompute each centroid as its cluster's mean.
        old_centroids = centroid_list
        centroid_list = []
        for i in range(k):
            members = cluster[i]
            if not members:
                # Empty cluster: keep the previous centroid (the original
                # divided by zero here).
                centroid_list.append(old_centroids[i])
                continue
            n = float(len(members))
            x = sum(p[0] for p in members) / n
            y = sum(p[1] for p in members) / n
            centroid_list.append((x, y))

        # Converged when no centroid moved.
        if old_centroids == centroid_list:
            for i in range(k):
                print("the %s cluster is :" % (i + 1))
                print(cluster[i])
            return cluster
+
if __name__ == '__main__':
    # Demo run on a tiny 2-D data set; prints the converged clusters.
    k = 2
    data = [(1, 1), (1, 2), (1, 3), (4, 2), (4, 3), (4, 4)]
    kmeans(k, data)
View
@@ -0,0 +1,38 @@
+from numpy import *
+import sys
+import ast
+# Parameter M adjacency matrix where M_i,j represents the link from 'j' to 'i', such that for all 'j' sum(i, M_i,j) = 1
+# Parameter d damping factor
+# Parameter v_quadratic_error quadratic error for v
+# Return v, a vector of ranks such that v_i is the i-th rank from [0, 1]
+
def pagerank(d, v_quadratic_error, M=None):
    """Power-iteration PageRank.

    d:                 damping factor in (0, 1)
    v_quadratic_error: stop once ||v - last_v|| falls below this
    M:                 optional column-stochastic adjacency matrix (nested
                       list or array) where M[i][j] is the link weight from
                       page j to page i; if None, the matrix is read
                       interactively (the original's only mode).

    Returns the rank vector v (unit L2 norm, shape (N, 1)).
    """
    if M is None:
        eg = '''[[0,0,0,0,1],
      [0.5,0,0,0,0],
      [0.5,0,0,0,0],
      [0,1,0.5,0,0],
      [0,0,0.5,1,0]]'''
        print("input a matrix like:\n%s" % eg)
        try:
            read_line = raw_input  # Python 2
        except NameError:
            read_line = input      # Python 3
        M = read_line('input matrix:\n')
        while not M:
            M = read_line('no input! please input OR press exit to quit \n')
            if M == 'exit':
                sys.exit()
        M = ast.literal_eval(M)
    M = array(M, dtype=float)
    N = M.shape[1]
    # Start from a random unit-norm vector.
    v = random.rand(N, 1)
    v = v / linalg.norm(v)
    # Infinite "previous" vector forces at least one iteration.
    a = array([[inf]])
    last_v = dot(ones((N, 1)), a)
    # Damped transition matrix: follow a link with prob d, teleport with 1-d.
    M_hat = (d * M) + (((1 - d) / N) * ones((N, N)))
    i = 0
    while linalg.norm(v - last_v) > v_quadratic_error:
        i += 1
        last_v = v
        v = dot(M_hat, v)
        v = v / linalg.norm(v)
    print("after %s iters,it converges" % i)
    print("The PageRank is:\n%s" % v)
    return v
+
+pagerank(0.85,0.001)
View
@@ -0,0 +1,54 @@
+import numpy as np
+from scipy.sparse import csc_matrix
+
def pageRank(G, s = .85, maxerr = .001):
    '''
    Computes the pagerank for each of the n states.

    Used in webpage ranking and text summarization using unweighted
    or weighted transitions respectively.

    G:      (n, n) adjacency/weight matrix; G[i, j] != 0 means state i
            links to state j.
    s:      damping: probability of following a link (1 - s teleports).
    maxerr: iterate until the L1 change in the rank vector is below this.

    Returns the rank vector, normalised to sum to 1.
    '''

    n = G.shape[0]

    # transform G into markov matrix M (rows scaled to sum to 1), stored
    # column-wise for fast access to each state's in-links.
    # np.float was removed in NumPy 1.20; the builtin float is equivalent.
    M = csc_matrix(G, dtype=float)
    # sum(1): sum of each row, i.e. the out-weight of each state
    rsums = np.array(M.sum(1))[:, 0]
    ri, ci = M.nonzero()  # indices of the non-zero entries
    # sink rows contribute no entries, so this never divides by zero
    M.data /= rsums[ri]
    # bool array of sink states (no out-links), rsums==0
    sink = rsums == 0

    # Compute pagerank r until we converge
    ro, r = np.zeros(n), np.ones(n)
    while np.sum(np.abs(r - ro)) > maxerr:
        ro = r.copy()
        # calculate each pagerank one state at a time
        # (xrange is Python-2-only; the stray debug print is removed)
        for i in range(n):
            # in-links of state i
            Ii = np.array(M[:, i].todense())[:, 0]  # select column i
            # account for sink states: they redistribute rank uniformly
            Si = sink / float(n)
            # account for teleportation to state i
            Ti = np.ones(n) / float(n)
            r[i] = ro.dot(Ii * s + Si * s + Ti * (1 - s))

    # return normalized pagerank
    return r / sum(r)
+
+
+
+
if __name__ == '__main__':
    # Example 7-page web graph; row i links to the columns with a 1.
    G = np.array([[0, 0, 1, 0, 0, 0, 0],
                  [0, 1, 1, 0, 0, 0, 0],
                  [1, 0, 1, 1, 0, 0, 0],
                  [0, 0, 0, 1, 1, 0, 0],
                  [0, 0, 0, 0, 0, 0, 1],
                  [0, 0, 0, 0, 0, 1, 1],
                  [0, 0, 0, 1, 1, 0, 1]])

    # print() call instead of the Python-2-only print statement.
    print(pageRank(G, s=.86))
View
@@ -0,0 +1,76 @@
+# Copyright (c) 2010 Pedro Matiello <pmatiello@gmail.com>
+# Juarez Bochi <jbochi@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+
"""
PageRank algorithm

@sort: pagerank
"""
+
def pagerank(graph, damping_factor=0.85, max_iterations=100, min_delta=0.00001):
    """
    Compute and return the PageRank of every node in a directed graph.

    @type graph: digraph
    @param graph: Digraph.

    @type damping_factor: number
    @param damping_factor: PageRank damping factor.

    @type max_iterations: number
    @param max_iterations: Maximum number of iterations.

    @type min_delta: number
    @param min_delta: Smallest variation required to have a new iteration.

    @rtype: Dict
    @return: Dict containing all the nodes PageRank.
    """

    nodes = graph.nodes()
    node_count = len(nodes)
    if node_count == 0:
        return {}

    # Rank share every node receives regardless of inbound links.
    base_rank = (1.0 - damping_factor) / node_count

    # Every node starts at the uniform rank 1/N.
    ranks = {node: 1.0 / node_count for node in nodes}

    for _ in range(max_iterations):
        total_change = 0  # accumulated |delta| over this sweep
        # Recompute each node's rank from the ranks of its in-neighbours,
        # each divided by that neighbour's out-degree.
        for node in nodes:
            inbound = sum(
                ranks[referrer] / len(graph.neighbors(referrer))
                for referrer in graph.incidents(node)
            )
            new_rank = base_rank + damping_factor * inbound
            total_change += abs(ranks[node] - new_rank)
            ranks[node] = new_rank

        # Stop as soon as the whole sweep barely moved anything.
        if total_change < min_delta:
            break

    return ranks

1 comment on commit 17d3c60

Owner

goog commented on 17d3c60 Apr 4, 2013

Line 44 needs `cluster = []` added, so the clusters are reset on each iteration instead of accumulating stale assignments.

Please sign in to comment.