From 943134c593f475a63c971a4aaf01c5eec81bce75 Mon Sep 17 00:00:00 2001 From: Tom De Smedt Date: Mon, 27 Oct 2014 16:35:10 +0100 Subject: [PATCH] Python 3 support: use next() instead of .next() + define xrange() --- pattern/graph/__init__.py | 2 +- pattern/metrics.py | 76 +++++++++++++++++++++++++++++--- pattern/text/search.py | 11 ++--- pattern/vector/__init__.py | 29 +++++++++--- pattern/vector/stemmer.py | 2 +- pattern/web/__init__.py | 2 +- pattern/web/json/ordered_dict.py | 4 +- 7 files changed, 104 insertions(+), 22 deletions(-) diff --git a/pattern/graph/__init__.py b/pattern/graph/__init__.py index 826b34c3..38f80729 100644 --- a/pattern/graph/__init__.py +++ b/pattern/graph/__init__.py @@ -1077,7 +1077,7 @@ def clique(graph, id): for n in graph.nodes: try: # Raises StopIteration if all nodes in the clique are connected to n: - (id for id in a if n.id==id or graph.edge(n.id, id) is None).next() + next(id for id in a if n.id==id or graph.edge(n.id, id) is None) except StopIteration: a.append(n.id) return a diff --git a/pattern/metrics.py b/pattern/metrics.py index 2c71d25a..8e8ce682 100644 --- a/pattern/metrics.py +++ b/pattern/metrics.py @@ -8,15 +8,18 @@ import sys from time import time -from math import sqrt, floor, modf, exp, pi, log +from math import sqrt, floor, ceil, modf, exp, pi, log from collections import defaultdict, deque from itertools import chain -from operator import itemgetter +from operator import itemgetter, lt, le from heapq import nlargest from bisect import bisect_right from random import gauss +if sys.version > "3": + xrange = range + #################################################################################################### # Simple implementation of Counter for Python 2.5 and 2.6. # See also: http://code.activestate.com/recipes/576611/ @@ -280,7 +283,7 @@ def levenshtein(string1, string2): if n > m: # Make sure n <= m to use O(min(n,m)) space. 
string1, string2, n, m = string2, string1, m, n - current = range(n+1) + current = list(xrange(n+1)) for i in xrange(1, m+1): previous, current = current, [i]+[0]*n for j in xrange(1, n+1): @@ -570,6 +573,65 @@ def cooccurrence(iterable, window=(-1,-1), term1=lambda x: True, term2=lambda x: # Adjectives preceding nouns: # {("cat", "NN"): {("black", "JJ"): 1}} +#### INTERPOLATION ################################################################################# + +def lerp(a, b, t): + """ Returns the linear interpolation between a and b at time t between 0.0-1.0. + For example: lerp(100, 200, 0.5) => 150. + """ + if t < 0.0: + return a + if t > 1.0: + return b + return a + (b - a) * t + +def smoothstep(a, b, x): + """ Returns the Hermite interpolation (cubic spline) for x between a and b. + The return value between 0.0-1.0 eases (slows down) as x nears a or b. + """ + if x < a: + return 0.0 + if x >= b: + return 1.0 + x = float(x - a) / (b - a) + return x * x * (3 - 2 * x) + +def smoothrange(a=None, b=None, n=10): + """ Returns an iterator of approximately n values v1, v2, ... vn, + so that v1 <= a, and vn >= b, and all values are multiples of 1, 2, 5 and 10. 
+ For example: list(smoothrange(1, 123)) => [0, 20, 40, 60, 80, 100, 120, 140]. + """ + def _multiple(v, round=False): + e = floor(log(v, 10)) # exponent + m = pow(10, e) # magnitude + f = v / m # fraction + if round is True: + op, x, y, z = lt, 1.5, 3.0, 7.0 + if round is False: + op, x, y, z = le, 1.0, 2.0, 5.0 + if op(f, x): + return m * 1 + if op(f, y): + return m * 2 + if op(f, z): + return m * 5 + else: + return m * 10 + if a is None and b is None: + a, b = 0, 1 + if a is None: + a, b = 0, b + if b is None: + a, b = 0, a + if a == b: + yield float(a); return + r = _multiple(b - a) + t = _multiple(r / (n - 1), round=True) + a = floor(a / t) * t + b = ceil(b / t) * t + for i in range(int((b - a) / t) + 1): + yield a + i * t + #### STATISTICS #################################################################################### #--- MEAN ------------------------------------------------------------------------------------------ @@ -688,7 +750,7 @@ def kurtosis(iterable, sample=False): #a = 1 #b = 1000 -#U = [float(i-a)/(b-a) for i in range(a,b)] # uniform distribution +#U = [float(i-a)/(b-a) for i in xrange(a,b)] # uniform distribution +#print(abs(-1.2 - kurtosis(U)) < 0.0001) #--- QUANTILE -------------------------------------------------------------------------------------- @@ -715,7 +777,7 @@ def quantile(iterable, p=0.5, sort=True, a=1, b=-1, c=0, d=1): i = int(floor(i)) return s[i] + (s[i+1] - s[i]) * (c + d * f) -#print(quantile(range(10), p=0.5) == median(range(10))) +#print(quantile(xrange(10), p=0.5) == median(xrange(10))) def boxplot(iterable, **kwargs): """ Returns a tuple (min(list), Q1, Q2, Q3, max(list)) for the given list of values. @@ -918,7 +980,7 @@ def gammaln(x): y = x + 5.5 y = (x + 0.5) * log(y) - y n = 1.0 - for i in range(6): + for i in xrange(6): x += 1 n += ( 76.18009173, @@ -1048,6 +1110,6 @@ def kolmogorov(x): return 0.0 x = -2.0 * x * x k = 0 - for i in reversed(range(1, 27+1, 2)): # 27 25 23 ... 
1 + for i in reversed(xrange(1, 27+1, 2)): # 27 25 23 ... 1 k = (1 - k) * exp(x * i) return 2.0 * k diff --git a/pattern/text/search.py b/pattern/text/search.py index dd1c937b..5410a863 100644 --- a/pattern/text/search.py +++ b/pattern/text/search.py @@ -174,7 +174,7 @@ def variations(iterable, optional=lambda x: False): for p in product([False, True], repeat=sum(o)): p = list(p) v = [b and (b and p.pop(0)) for b in o] - v = tuple(iterable[i] for i in xrange(len(v)) if not v[i]) + v = tuple(iterable[i] for i in range(len(v)) if not v[i]) a.add(v) # Longest-first. return sorted(a, cmp=lambda x, y: len(y) - len(x)) @@ -567,7 +567,7 @@ def match(self, word): - the word (or lemma) occurs in Constraint.taxa taxonomy tree, AND - the word and/or chunk tags match those defined in the constraint. Individual terms in Constraint.words or the taxonomy can contain wildcards (*). - Some part-of-speech-tags can also contain wildcards: NN*, VB*, JJ*, RB* + Some part-of-speech-tags can also contain wildcards: NN*, VB*, JJ*, RB*, PR*. If the given word contains spaces (e.g., proper noun), the entire chunk will also be compared. For example: Constraint(words=["Mac OS X*"]) @@ -609,8 +609,8 @@ def match(self, word): try: if " " in w and (s1 in w or s2 and s2 in w or "*" in w): s1 = word.chunk and word.chunk.string.lower() or s1 - s2 = word.chunk and " ".join([x or "" for x in word.chunk.lemmata]) or s2 - except: + s2 = word.chunk and " ".join(x or "" for x in word.chunk.lemmata) or s2 + except Exception as e: s1 = s1 s2 = None # Compare the word to the allowed words (which can contain wildcards). @@ -620,6 +620,7 @@ def match(self, word): # if "was" is not in the constraint, perhaps "be" is, which is a good match. if s2 and _match(s2, w): b=True; break + # If the constraint defines allowed taxonomy terms, # and the given word did not match an allowed word, traverse the taxonomy. # The search goes up from the given word to its parents in the taxonomy. 
@@ -804,7 +805,7 @@ def match(self, sentence, start=0, _v=None, _u=None): if sentence.__class__.__name__ == "Sentence": pass elif isinstance(sentence, list) or sentence.__class__.__name__ == "Text": - return find(lambda m,s: m is not None, ((self.match(s, start, _v), s) for s in sentence))[0] + return find(lambda m: m is not None, (self.match(s, start, _v) for s in sentence)) elif isinstance(sentence, basestring): sentence = Sentence(sentence) elif isinstance(sentence, Match) and len(sentence) > 0: diff --git a/pattern/vector/__init__.py b/pattern/vector/__init__.py index 35af9007..511236c8 100644 --- a/pattern/vector/__init__.py +++ b/pattern/vector/__init__.py @@ -45,6 +45,7 @@ if sys.version > "3": long = int + xrange = range try: MODULE = os.path.dirname(os.path.realpath(__file__)) @@ -1717,7 +1718,7 @@ def transform(self, document): _lsa_transform_cache = {} #def iter2array(iterator, typecode): -# a = numpy.array([iterator.next()], typecode) +# a = numpy.array([next(iterator)], typecode) # shape0 = a.shape[1:] # for (i, item) in enumerate(iterator): # a.resize((i+2,) + shape0) @@ -1961,7 +1962,7 @@ def hierarchical(vectors, k=1, iterations=1000, distance=COSINE, **kwargs): id = sequence() features = kwargs.get("features", _features(vectors)) clusters = Cluster((v for v in shuffled(vectors))) - centroids = [(id.next(), v) for v in clusters] + centroids = [(next(id), v) for v in clusters] map = {} for _ in range(iterations): if len(clusters) <= max(k, 1): @@ -1988,7 +1989,7 @@ def hierarchical(vectors, k=1, iterations=1000, distance=COSINE, **kwargs): v = centroid(merged.flatten(), features) centroids.pop(j) centroids.pop(i) - centroids.append((id.next(), v)) + centroids.append((next(id), v)) return clusters #from pattern.vector import Vector @@ -2081,6 +2082,16 @@ def baseline(self): return self._baseline return ([(0, None)] + sorted([(v, k) for k, v in self._classes.items()]))[-1][1] + @property + def weighted_random_baseline(self): + """ Yields the weighted 
random baseline: + accuracy with classes predicted randomly according to their distribution. + """ + n = float(sum(self.distribution.values())) or 1 + return sum(map(lambda x: (x / n) ** 2, self.distribution.values())) + + wrb = weighted_random_baseline + @property def skewness(self): """ Yields 0.0 if the trained classes are evenly distributed. @@ -2974,8 +2985,6 @@ def finalize(self): ANN = NN = NeuralNetwork = BPNN - - #nn = BPNN() #nn._weight_initialization(2, 1, hidden=2) #nn._train([ @@ -3070,6 +3079,16 @@ def __init__(self, *args, **kwargs): ( "shrinking", "h", True)): v = kwargs.get(k2, kwargs.get(k1, v)) setattr(self, "_"+k1, v) + # SVC/SVR/SVO alias. + if self._type == "svc": + self._type = SVC + if self._type == "svr": + self._type = SVR + if self._type == "svo": + self._type = SVO + # RBF alias. + if self._kernel == "rbf": + self._kernel = RBF Classifier.__init__(self, train=kwargs.get("train", []), baseline=MAJORITY) @property diff --git a/pattern/vector/stemmer.py b/pattern/vector/stemmer.py index 0c7f3c02..430a3e3a 100644 --- a/pattern/vector/stemmer.py +++ b/pattern/vector/stemmer.py @@ -286,7 +286,7 @@ def case_sensitive(stem, word): Ponies => Poni """ ch = [] - for i in xrange(len(stem)): + for i in range(len(stem)): if word[i] == word[i].upper(): ch.append(stem[i].upper()) else: diff --git a/pattern/web/__init__.py b/pattern/web/__init__.py index dca0082d..f15810ca 100644 --- a/pattern/web/__init__.py +++ b/pattern/web/__init__.py @@ -2357,7 +2357,7 @@ def articles(self, **kwargs): while True: batch, done = [], False try: - for i in range(10): batch.append(iterator.next()) + for i in range(10): batch.append(next(iterator)) except StopIteration: done = True # No more articles, finish batch and raise StopIteration. 
url = URL(self._url.replace("api.php", "wikia.php"), method=GET, query={ diff --git a/pattern/web/json/ordered_dict.py b/pattern/web/json/ordered_dict.py index 87ad8882..c1b5492e 100644 --- a/pattern/web/json/ordered_dict.py +++ b/pattern/web/json/ordered_dict.py @@ -66,9 +66,9 @@ def popitem(self, last=True): # Modified from original to support Python 2.4, see # http://code.google.com/p/simplejson/issues/detail?id=53 if last: - key = reversed(self).next() + key = next(reversed(self)) else: - key = iter(self).next() + key = next(iter(self)) value = self.pop(key) return key, value