# goog/Algorithms

k-means

goog committed Apr 4, 2013
1 parent c4be58f commit 17d3c605bdfd8ce928b0340728301f333a00017b
Showing with 253 additions and 0 deletions.
1. +34 −0 cluster/kmeans++.py
2. +51 −0 cluster/kmeans.py
3. +38 −0 pageRANK/PageRank.py
4. +54 −0 pageRANK/pagerank.py
5. +76 −0 pageRANK/pagerank_graph.py
 @@ -0,0 +1,34 @@ +import random +''' +The exact algorithm is as follows: + + 1,Choose one center uniformly at random from among the data points. + 2,For each data point x, compute D(x), the distance between x and the nearest center that has already been chosen. + 3,Choose one new data point at random as a new center, using a weighted probability distribution + where a point x is chosen with probability proportional to D(x)**2. + 4,Repeat Steps 2 and 3 until k centers have been chosen. + 5,Now that the initial centers have been chosen, proceed using standard k-means clustering. + +''' + +def initialize(X, K): + C = [X[0]] + for k in range(1, K): + D2 = scipy.array([min([scipy.inner(c-x,c-x) for c in C]) for x in X]) + probs = D2/D2.sum() #### + cumprobs = probs.cumsum() + r = scipy.rand() + for j,p in enumerate(cumprobs): + if r < p: + i = j + break + C.append(X[i]) + return C + + #... + +##http://stackoverflow.com/questions/5466323/how-exactly-does-k-means-work + +k =2 +data = [(1,1),(1,2),(1,3),(4,2),(4,3),(4,4)] +kmeans(k,data)
 @@ -0,0 +1,51 @@ +import random + +def kmeans(k,data): + ### the data :::list + centroidLIST= [] + initID = random.sample(range(len(data)),k) + cluster = [] + for i in range(k): + cluster.append(set()) + centroidLIST.append(data[initID[i]]) ### initial cluster center + + + while(True): + + #### reassign the points + for i in data: + min = float('inf'); index = 0 + for j,l in enumerate(centroidLIST): + distance = (i[0]-l[0]) **2 + (i[1]-l[1]) ** 2 + if distance < min: + min = distance + index = j + cluster[index].add(i) + + oldCENTROID = centroidLIST + ##### compute the centroid + centroidLIST = [] + for i in range(k): + ### cluster[i] + x = 0;y = 0 + n = len(cluster[i]) + for j in cluster[i]: + x+=j[0] + y+=j[1] + x/=float(n) + y/=float(n) + centroidLIST.append((x,y)) + if oldCENTROID == centroidLIST: + for i in range(k): + print "the %s cluster is :" % (i+1) + print cluster[i] + break + + ## clear the cluster data + for i in range(k): + cluster.append(set()) + + +k =2 +data = [(1,1),(1,2),(1,3),(4,2),(4,3),(4,4)] +kmeans(k,data)
 @@ -0,0 +1,38 @@ +from numpy import * +import sys +import ast +# Parameter M adjacency matrix where M_i,j represents the link from 'j' to 'i', such that for all 'j' sum(i, M_i,j) = 1 +# Parameter d damping factor +# Parameter v_quadratic_error quadratic error for v +# Return v, a vector of ranks such that v_i is the i-th rank from [0, 1] + +def pagerank(d,v_quadratic_error): + eg= '''[[0,0,0,0,1], + [0.5,0,0,0,0], + [0.5,0,0,0,0], + [0,1,0.5,0,0], + [0,0,0.5,1,0]]''' + print "input a matrix like:\n",eg + M = raw_input('input matrix:\n') + while(not M): + M = raw_input('no input! please input OR press exit to quit \n') + if M=='exit': + sys.exit() + M = array(ast.literal_eval(M)) + N=M.shape[1] + v=random.rand(N,1) + # norm:sqrt(dot(v,v)) to regularization + v=v/linalg.norm(v) + a=array([[inf]]);last_v = dot(ones((N,1)),a) + # here d is smooth parameter + M_hat=(d * M)+(((1-d)/N)*ones((N,N))) + i=0 + while(linalg.norm(v - last_v) > v_quadratic_error): + i+=1 + last_v = v + v = dot(M_hat,v) + v = v/linalg.norm(v) + print "after %s iters,it converges" %i + print "The PageRank is:\n",v + +pagerank(0.85,0.001)
 @@ -0,0 +1,54 @@ +import numpy as np +from scipy.sparse import csc_matrix + +def pageRank(G, s = .85, maxerr = .001): + ''' + Computes the pagerank for each of the n states. + + Used in webpage ranking and text summarization using unweighted + or weighted transitions respectively.''' + + n = G.shape[0] + + # transform G into markov matrix M + M = csc_matrix(G,dtype=np.float) + # sum(1):sum of each row + rsums = np.array(M.sum(1))[:,0] + # rsums : [ 1. 2. 3. 2. 1. 2. 3.] + ri, ci = M.nonzero() #Return the indices of non-zero + M.data /= rsums[ri] + # bool array of sink states, rsums==0 + sink = rsums==0 + + + # Compute pagerank r until we converge + ro, r = np.zeros(n), np.ones(n) + while np.sum(np.abs(r-ro)) > maxerr: + ro = r.copy() + # calculate each pagerank at a time + for i in xrange(0,n): + # inlinks of state i + Ii = np.array(M[:,i].todense())[:,0] # select i column + print Ii + # account for sink states + Si = sink / float(n) + # account for teleportation to state i + Ti = np.ones(n) / float(n) + r[i] = ro.dot( Ii*s + Si*s + Ti*(1-s) ) + + # return normalized pagerank + return r/sum(r) + + + + +if __name__=='__main__': + G = np.array([[0,0,1,0,0,0,0], + [0,1,1,0,0,0,0], + [1,0,1,1,0,0,0], + [0,0,0,1,1,0,0], + [0,0,0,0,0,0,1], + [0,0,0,0,0,1,1], + [0,0,0,1,1,0,1]]) + + print pageRank(G,s=.86)
 @@ -0,0 +1,76 @@ +# Copyright (c) 2010 Pedro Matiello +# Juarez Bochi +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: + +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + + +""" +PageRank algoritm + +@sort: pagerank +""" + +def pagerank(graph, damping_factor=0.85, max_iterations=100, min_delta=0.00001): + """ + Compute and return the PageRank in an directed graph. + + @type graph: digraph + @param graph: Digraph. + + @type damping_factor: number + @param damping_factor: PageRank dumping factor. + + @type max_iterations: number + @param max_iterations: Maximum number of iterations. + + @type min_delta: number + @param min_delta: Smallest variation required to have a new iteration. + + @rtype: Dict + @return: Dict containing all the nodes PageRank. 
+ """ + + nodes = graph.nodes() + graph_size = len(nodes) + if graph_size == 0: + return {} + min_value = (1.0-damping_factor)/graph_size #value for nodes without inbound links + + # itialize the page rank dict with 1/N for all nodes + pagerank = dict.fromkeys(nodes, 1.0/graph_size) + + for i in range(max_iterations): + diff = 0 #total difference compared to last iteraction + # computes each node PageRank based on inbound links + for node in nodes: + rank = min_value + for referring_page in graph.incidents(node): + rank += damping_factor * pagerank[referring_page] / len(graph.neighbors(referring_page)) + + diff += abs(pagerank[node] - rank) + pagerank[node] = rank + + #stop if PageRank has converged + if diff < min_delta: + break + + return pagerank

Owner

### goog commented on `17d3c60` on Apr 4, 2013

 Line 44: need to add `cluster = []` to reset the clusters before appending the k fresh empty sets (otherwise the old sets — and their stale point assignments — are kept and the list keeps growing).