Skip to content

Commit

Permalink
Merge branch 'release/1.3.0'
Browse files Browse the repository at this point in the history
Conflicts:
	MANIFEST.in
  • Loading branch information
exhuma committed Mar 26, 2015
2 parents e5fce3d + 837f29f commit bfcd9f9
Show file tree
Hide file tree
Showing 17 changed files with 498 additions and 339 deletions.
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
include README.rst LICENSE CHANGELOG
include *.py cluster.bmp
include cluster.bmp
4 changes: 4 additions & 0 deletions cluster/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
#


from pkg_resources import resource_string

from .method.hierarchical import HierarchicalClustering
from .method.kmeans import KMeansClustering
from .util import ClusteringError

# Package version, read from the ``version.txt`` resource of the ``cluster``
# package; the raw bytes are decoded as ASCII and surrounding whitespace
# (e.g. a trailing newline) is stripped.
__version__ = resource_string('cluster', 'version.txt').decode('ascii').strip()
89 changes: 25 additions & 64 deletions cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

from __future__ import print_function

from .util import fullyflatten


class Cluster(object):
"""
Expand All @@ -27,7 +29,7 @@ class Cluster(object):
"""

def __repr__(self):
return "<Cluster@%s(%s)>" % (self.__level, self.__items)
return "<Cluster@%s(%s)>" % (self.level, self.items)

def __str__(self):
    """
    Return the informal string form of the cluster.

    This is the same text as ``repr()``. The method must delegate to
    ``__repr__``: calling ``self.__str__()`` here would recurse until the
    interpreter raises a recursion error.
    """
    return self.__repr__()
Expand All @@ -44,67 +46,26 @@ def __init__(self, level, *args):
will get added as item to the cluster. You could also pass a list as
second parameter to initialise the cluster with that list as content
"""
self.__level = level
if len(args) == 0:
self.__items = []
else:
self.__items = list(args)

def append(self, item):
"""
Appends a new item to the cluster
:param item: The item that is to be appended.
"""
self.__items.append(item)

def items(self, new_items=None):
"""
Sets or gets the items of the cluster
:param new_items: if set, the items of the cluster will be replaced with
that argument.
"""
if new_items is None:
return self.__items
else:
self.__items = new_items

def fullyflatten(self, *args):
"""
Completely flattens out this cluster and returns a one-dimensional
list containing the cluster's items. This is useful in cases where
some items of the cluster are clusters in their own right and you only
want the items.
:param *args: only used for recursion.
"""
flattened_items = []
self.level = level
if len(args) == 0:
collection = self.__items
self.items = []
else:
collection = args[0].items()
self.items = args

for item in collection:
def __iter__(self):
for item in self.items:
if isinstance(item, Cluster):
flattened_items = flattened_items + self.fullyflatten(item)
for recursed_item in item:
yield recursed_item
else:
flattened_items.append(item)

return flattened_items

def level(self):
"""
Returns the level associated with this cluster.
"""
return self.__level
yield item

def display(self, depth=0):
"""
Pretty-prints this cluster. Useful for debugging.
"""
print(depth * " " + "[level %s]" % self.__level)
for item in self.__items:
print(depth * " " + "[level %s]" % self.level)
for item in self.items:
if isinstance(item, Cluster):
item.display(depth + 1)
else:
Expand Down Expand Up @@ -132,8 +93,8 @@ def topology(self):
('.idlerc', '.pylint.d')))))))
"""

left = self.__items[0]
right = self.__items[1]
left = self.items[0]
right = self.items[1]

if isinstance(left, Cluster):
first = left.topology()
Expand Down Expand Up @@ -166,27 +127,27 @@ def getlevel(self, threshold):
useful approach.
"""

left = self.__items[0]
right = self.__items[1]
left = self.items[0]
right = self.items[1]

# if this object itself is below the threshold value we only need to
# return its contents as a list
if self.level() <= threshold:
return [self.fullyflatten()]
if self.level <= threshold:
return [fullyflatten(self.items)]

# if this cluster's level is higher than the threshold we will
# investigate its left and right part. Their level could be below the
# threshold
if isinstance(left, Cluster) and left.level() <= threshold:
if isinstance(left, Cluster) and left.level <= threshold:
if isinstance(right, Cluster):
return [left.fullyflatten()] + right.getlevel(threshold)
return [fullyflatten(left.items)] + right.getlevel(threshold)
else:
return [left.fullyflatten()] + [[right]]
elif isinstance(right, Cluster) and right.level() <= threshold:
return [fullyflatten(left.items)] + [[right]]
elif isinstance(right, Cluster) and right.level <= threshold:
if isinstance(left, Cluster):
return left.getlevel(threshold) + [right.fullyflatten()]
return left.getlevel(threshold) + [fullyflatten(right.items)]
else:
return [[left]] + [right.fullyflatten()]
return [[left]] + [fullyflatten(right.items)]

# Alright. We covered the cases where one of the clusters was below
# the threshold value. Now we'll deal with the clusters that are above
Expand Down
100 changes: 100 additions & 0 deletions cluster/linkage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from __future__ import division

from collections import Counter
from functools import wraps


def cached(fun):
    """
    Memoizing decorator for linkage functions.

    The wrapped function must have the fixed signature
    ``fun(a, b, distance_function)``; parameters are hardcoded on purpose
    (no ``*args``/``**kwargs`` magic). Every decorated function gets its own
    cache, which is unbounded and lives as long as the decorated function.

    The cache key is built from:

    * the *contents* of ``a`` and ``b``, each treated as a multiset: the
      order of the elements is irrelevant, but duplicates are significant
      (``[1, 1, 2]`` and ``[1, 2]`` have different mean/median distances
      and therefore must not share a cache entry);
    * ``distance_function`` itself, so that the same collections measured
      with a different metric never yield a stale result.

    If no key can be built because an element (or the distance function) is
    unhashable, the call is forwarded to ``fun`` without caching.

    :param fun: The linkage function to memoize.
    :return: A wrapper with the same signature as ``fun``.
    """

    _cache = {}

    @wraps(fun)
    def newfun(a, b, distance_function):
        try:
            # ``iter()`` makes ``Counter`` count the elements even when a
            # mapping is passed (it would otherwise read it as counts).
            key = (frozenset(Counter(iter(a)).items()),
                   frozenset(Counter(iter(b)).items()),
                   distance_function)
            hash(key)  # also rejects an unhashable distance function
        except TypeError:
            return fun(a, b, distance_function)
        if key not in _cache:
            _cache[key] = fun(a, b, distance_function)
        return _cache[key]
    return newfun


@cached
def single(a, b, distance_function):
    """
    Given two collections ``a`` and ``b``, this will return the distance of
    the points which are closest together. ``distance_function`` is used to
    determine the distance between two elements.

    Every pair ``(x, y)`` with ``x`` from ``a`` and ``y`` from ``b`` is
    evaluated. Comparing only the ``min``/``max`` elements of each
    collection is not sufficient: it gives wrong answers for interleaved
    values (``[1, 10]`` vs ``[5, 6]``) and for multi-dimensional points.

    Example::

        >>> single([1, 2], [3, 4], lambda x, y: abs(x-y))
        1  # (distance between 2 and 3)

    :param a: First collection of elements.
    :param b: Second collection of elements.
    :param distance_function: Callable returning the distance between two
        elements.
    :return: The smallest pairwise distance between ``a`` and ``b``.
    :raises ValueError: If ``a`` or ``b`` is empty.
    """
    return min(distance_function(x, y) for x in a for y in b)


@cached
def complete(a, b, distance_function):
    """
    Given two collections ``a`` and ``b``, this will return the distance of
    the points which are farthest apart. ``distance_function`` is used to
    determine the distance between two elements.

    Every pair ``(x, y)`` with ``x`` from ``a`` and ``y`` from ``b`` is
    evaluated. Comparing only the ``min``/``max`` elements of each
    collection is not sufficient for multi-dimensional points, whose
    ``min``/``max`` are lexicographic and unrelated to the distance.

    Example::

        >>> complete([1, 2], [3, 4], lambda x, y: abs(x-y))
        3  # (distance between 1 and 4)

    :param a: First collection of elements.
    :param b: Second collection of elements.
    :param distance_function: Callable returning the distance between two
        elements.
    :return: The largest pairwise distance between ``a`` and ``b``.
    :raises ValueError: If ``a`` or ``b`` is empty.
    """
    return max(distance_function(x, y) for x in a for y in b)


@cached
def average(a, b, distance_function):
    """
    Given two collections ``a`` and ``b``, this will return the mean of all
    pairwise distances between an element of ``a`` and an element of ``b``.
    ``distance_function`` is used to determine the distance between two
    elements.

    Example::

        >>> average([1, 2], [3, 100], lambda x, y: abs(x-y))
        50.0  # (2 + 99 + 1 + 98) / 4

    :raises ZeroDivisionError: If there is no pair to measure (``a`` or
        ``b`` is empty).
    """
    total = 0
    pair_count = 0
    for left in a:
        for right in b:
            total += distance_function(left, right)
            pair_count += 1
    return total / pair_count


@cached
def uclus(a, b, distance_function):
    """
    Given two collections ``a`` and ``b``, this will return the *median* of
    all pairwise distances between an element of ``a`` and an element of
    ``b``. ``distance_function`` is used to determine the distance between
    two elements.

    For an even number of distances the two middle values are averaged; for
    an odd number the middle value is returned unchanged.

    Example::

        >>> uclus([1, 2], [3, 100], lambda x, y: abs(x-y))
        50.0  # distances are [1, 2, 98, 99] -> (2 + 98) / 2
    """
    distances = [distance_function(x, y) for x in a for y in b]
    distances.sort()
    half, is_odd = divmod(len(distances), 2)
    if is_odd:
        return distances[half]
    # Even count: mean of the two values around the middle.
    return sum(distances[half - 1:half + 1]) / 2
8 changes: 8 additions & 0 deletions cluster/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ def worker(self):
tasks_completed = 0
for task in iter(self.task_queue.get, 'STOP'):
col_index, item, item2 = task
if not hasattr(item, '__iter__'):
item = [item]
if not hasattr(item2, '__iter__'):
item2 = [item2]
result = (col_index, self.combinfunc(item, item2))
self.done_queue.put(result)
tasks_completed += 1
Expand Down Expand Up @@ -119,6 +123,10 @@ def genmatrix(self, num_processes=1):
num_tasks_completed += 1
else:
# Otherwise do it here, in line
if not hasattr(item, '__iter__'):
item = [item]
if not hasattr(item2, '__iter__'):
item2 = [item2]
row[col_index] = self.combinfunc(item, item2)

if self.symmetric:
Expand Down
3 changes: 2 additions & 1 deletion cluster/method/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@ class BaseClusterMethod(object):
they are comparable, it's ok.
"""

def __init__(self, input, distance_function):
def __init__(self, input, distance_function, progress_callback=None):
self.distance = distance_function
self._input = input # the original input
self._data = input[:] # clone the input so we can work with it
# without destroying the original data.
self.progress_callback = progress_callback

def topo(self):
"""
Expand Down
Loading

0 comments on commit bfcd9f9

Please sign in to comment.