Merge branch 'release/1.3.0'

Conflicts: MANIFEST.in
exhuma · Mar 26, 2015 · bfcd9f9 · bfcd9f9
2 parents e5fce3d + 837f29f
commit bfcd9f9
Show file tree

Hide file tree

Showing 17 changed files with 498 additions and 339 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,2 +1,2 @@
 include README.rst LICENSE CHANGELOG
-include *.py cluster.bmp
+include cluster.bmp
diff --git a/cluster/__init__.py b/cluster/__init__.py
@@ -16,6 +16,10 @@
 #
 
 
+from pkg_resources import resource_string
+
 from .method.hierarchical import HierarchicalClustering
 from .method.kmeans import KMeansClustering
 from .util import ClusteringError
+
+__version__ = resource_string('cluster', 'version.txt').decode('ascii').strip()
diff --git a/cluster/cluster.py b/cluster/cluster.py
@@ -17,6 +17,8 @@
 
 from __future__ import print_function
 
+from .util import fullyflatten
+
 
 class Cluster(object):
     """
@@ -27,7 +29,7 @@ class Cluster(object):
     """
 
     def __repr__(self):
-        return "<Cluster@%s(%s)>" % (self.__level, self.__items)
+        return "<Cluster@%s(%s)>" % (self.level, self.items)
 
     def __str__(self):
         return self.__str__()
@@ -44,67 +46,26 @@ def __init__(self, level, *args):
             will get added as item to the cluster. You could also pass a list as
             second parameter to initialise the cluster with that list as content
         """
-        self.__level = level
-        if len(args) == 0:
-            self.__items = []
-        else:
-            self.__items = list(args)
-
-    def append(self, item):
-        """
-        Appends a new item to the cluster
-
-        :param item: The item that is to be appended.
-        """
-        self.__items.append(item)
-
-    def items(self, new_items=None):
-        """
-        Sets or gets the items of the cluster
-
-        :param new_items: if set, the items of the cluster will be replaced with
-            that argument.
-        """
-        if new_items is None:
-            return self.__items
-        else:
-            self.__items = new_items
-
-    def fullyflatten(self, *args):
-        """
-        Completely flattens out this cluster and returns a one-dimensional
-        list containing the cluster's items. This is useful in cases where
-        some items of the cluster are clusters in their own right and you only
-        want the items.
-
-        :param *args: only used for recursion.
-        """
-        flattened_items = []
+        self.level = level
         if len(args) == 0:
-            collection = self.__items
+            self.items = []
         else:
-            collection = args[0].items()
+            self.items = args
 
-        for item in collection:
+    def __iter__(self):
+        for item in self.items:
             if isinstance(item, Cluster):
-                flattened_items = flattened_items + self.fullyflatten(item)
+                for recursed_item in item:
+                    yield recursed_item
             else:
-                flattened_items.append(item)
-
-        return flattened_items
-
-    def level(self):
-        """
-        Returns the level associated with this cluster.
-        """
-        return self.__level
+                yield item
 
     def display(self, depth=0):
         """
         Pretty-prints this cluster. Useful for debuging.
         """
-        print(depth * "    " + "[level %s]" % self.__level)
-        for item in self.__items:
+        print(depth * "    " + "[level %s]" % self.level)
+        for item in self.items:
             if isinstance(item, Cluster):
                 item.display(depth + 1)
             else:
@@ -132,8 +93,8 @@ def topology(self):
                 ('.idlerc', '.pylint.d')))))))
         """
 
-        left = self.__items[0]
-        right = self.__items[1]
+        left = self.items[0]
+        right = self.items[1]
 
         if isinstance(left, Cluster):
             first = left.topology()
@@ -166,27 +127,27 @@ def getlevel(self, threshold):
             useful approach.
         """
 
-        left = self.__items[0]
-        right = self.__items[1]
+        left = self.items[0]
+        right = self.items[1]
 
         # if this object itself is below the threshold value we only need to
         # return it's contents as a list
-        if self.level() <= threshold:
-            return [self.fullyflatten()]
+        if self.level <= threshold:
+            return [fullyflatten(self.items)]
 
         # if this cluster's level is higher than the threshold we will
         # investgate it's left and right part. Their level could be below the
         # threshold
-        if isinstance(left, Cluster) and left.level() <= threshold:
+        if isinstance(left, Cluster) and left.level <= threshold:
             if isinstance(right, Cluster):
-                return [left.fullyflatten()] + right.getlevel(threshold)
+                return [fullyflatten(left.items)] + right.getlevel(threshold)
             else:
-                return [left.fullyflatten()] + [[right]]
-        elif isinstance(right, Cluster) and right.level() <= threshold:
+                return [fullyflatten(left.items)] + [[right]]
+        elif isinstance(right, Cluster) and right.level <= threshold:
             if isinstance(left, Cluster):
-                return left.getlevel(threshold) + [right.fullyflatten()]
+                return left.getlevel(threshold) + [fullyflatten(right.items)]
             else:
-                return [[left]] + [right.fullyflatten()]
+                return [[left]] + [fullyflatten(right.items)]
 
         # Alright. We covered the cases where one of the clusters was below
         # the threshold value. Now we'll deal with the clusters that are above

diff --git a/cluster/linkage.py b/cluster/linkage.py
@@ -0,0 +1,100 @@
+from __future__ import division
+from functools import wraps
+
+
+def cached(fun):
+    """
+    Memoizing decorator for linkage functions.
+
+    Parameters have been hardcoded (no ``*args``, ``**kwargs`` magic) because
+    keying on frozensets of ``a`` and ``b`` is only valid for this specific
+    case: duplicates and ordering inside a collection are ignored. The key
+    also holds ``distance_function`` so one metric never sees another's result.
+    """
+
+    _cache = {}
+
+    @wraps(fun)
+    def newfun(a, b, distance_function):
+        # Items of ``a``/``b`` and the callable itself must be hashable.
+        key = (frozenset(a), frozenset(b), distance_function)
+        if key not in _cache:
+            _cache[key] = fun(a, b, distance_function)
+        return _cache[key]
+    return newfun
+
+
+@cached
+def single(a, b, distance_function):
+    """
+    Given two collections ``a`` and ``b``, this will return the distance of the
+    points which are closest together.  ``distance_function`` is used to
+    determine the distance between two elements.
+
+    Example::
+
+        >>> single([1, 2], [3, 4], lambda x, y: abs(x-y))
+        1  # (distance between 2 and 3)
+    """
+    # Compare every pair. Looking only at min()/max() of each side is wrong
+    # when the collections overlap, or when the items (e.g. tuples) have no
+    # ordering that agrees with ``distance_function``.
+    return min(distance_function(x, y)
+               for x in a for y in b)
+
+
+@cached
+def complete(a, b, distance_function):
+    """
+    Given two collections ``a`` and ``b``, this will return the distance of the
+    points which are farthest apart.  ``distance_function`` is used to determine
+    the distance between two elements.
+
+    Example::
+
+        >>> complete([1, 2], [3, 4], lambda x, y: abs(x-y))
+        3  # (distance between 1 and 4)
+    """
+    # Compare every pair. The extremes found by min()/max() are only the
+    # farthest points for scalars; for items such as tuples that ordering
+    # says nothing about what ``distance_function`` measures.
+    return max(distance_function(x, y)
+               for x in a for y in b)
+
+
+@cached
+def average(a, b, distance_function):
+    """
+    Given two collections ``a`` and ``b``, this will return the mean of all
+    distances. ``distance_function`` is used to determine the distance between
+    two elements.
+
+    Example::
+
+        >>> average([1, 2], [3, 100], lambda x, y: abs(x-y))
+        50.0  # (2 + 99 + 1 + 98) / 4
+    """
+    distances = [distance_function(x, y)
+                 for x in a for y in b]
+    return sum(distances) / len(distances)
+
+
+@cached
+def uclus(a, b, distance_function):
+    """
+    Given two collections ``a`` and ``b``, this will return the *median* of all
+    distances. ``distance_function`` is used to determine the distance between
+    two elements.
+
+    Example::
+
+        >>> uclus([1, 2], [3, 100], lambda x, y: abs(x-y))
+        50.0  # (mean of the two middle distances, 2 and 98)
+    """
+    distances = sorted(distance_function(x, y)
+                       for x in a for y in b)
+    midpoint, is_odd = divmod(len(distances), 2)
+    if is_odd:
+        return distances[midpoint]
+    # Even count: the median is the mean of the two middle values.
+    return sum(distances[midpoint-1:midpoint+1]) / 2
diff --git a/cluster/matrix.py b/cluster/matrix.py
@@ -57,6 +57,10 @@ def worker(self):
         tasks_completed = 0
         for task in iter(self.task_queue.get, 'STOP'):
             col_index, item, item2 = task
+            if not hasattr(item, '__iter__'):
+                item = [item]
+            if not hasattr(item2, '__iter__'):
+                item2 = [item2]
             result = (col_index, self.combinfunc(item, item2))
             self.done_queue.put(result)
             tasks_completed += 1
@@ -119,6 +123,10 @@ def genmatrix(self, num_processes=1):
                         num_tasks_completed += 1
                 else:
                     # Otherwise do it here, in line
+                    if not hasattr(item, '__iter__'):
+                        item = [item]
+                    if not hasattr(item2, '__iter__'):
+                        item2 = [item2]
                     row[col_index] = self.combinfunc(item, item2)
 
             if self.symmetric:

diff --git a/cluster/method/base.py b/cluster/method/base.py
@@ -39,11 +39,12 @@ class BaseClusterMethod(object):
         they are comparable, it's ok.
     """
 
-    def __init__(self, input, distance_function):
+    def __init__(self, input, distance_function, progress_callback=None):
         self.distance = distance_function
         self._input = input    # the original input
         self._data = input[:]  # clone the input so we can work with it
                                # without distroying the original data.
+        self.progress_callback = progress_callback
 
     def topo(self):
         """