In [1]:
import re
import pygtrie

from collections import Counter
from tqdm import tqdm

from litecoder.db import City, session

In [2]:
def keyify(text, lower=True):
    """Normalize text string -> index key.
    """
    text = text.strip()
    
    # Remove periods
    text = re.sub('\.', '', text)
    
    # Comma -> space
    text = re.sub(',', ' ', text)
    
    # 2+ whitespace -> space
    text = re.sub('\s{2,}', ' ', text)

    if lower:
        text = text.lower()

    return text

In [3]:
City.query.count()

344249

In [4]:
names = [keyify(r[0]) for r in session.query(City.name)]

In [5]:
name_counts = Counter(names)

In [6]:
name_counts[keyify('tuscaloosa')]

1

In [7]:
def keys_iter(row):
    
    states = (row.name_a1, row.us_state_abbr)
    
    for state in states:
        yield '%s %s' % (row.name, state)
    
    if row.population and row.population > 500000:
        yield row.name
        
    elif name_counts[keyify(row.name)] == 1:
        yield row.name

In [9]:
idx = pygtrie.StringTrie(separator=' ')

cities = City.query.filter(City.country_iso=='US')

for c in tqdm(cities):
    for key_raw in keys_iter(c):
        
        key = keyify(key_raw)
        
        if key in idx:
            idx[key] = (*idx[key], c.wof_id)

        else:
            idx[key] = (c.wof_id,)


0it [00:00, ?it/s][A
1it [00:01,  1.50s/it][A
1963it [00:01, 1230.51it/s][A
3902it [00:01, 2301.35it/s][A
5833it [00:02, 2836.69it/s][A
7690it [00:02, 3566.67it/s][A
9822it [00:02, 4353.50it/s][A
13760it [00:02, 5603.70it/s]Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/local/Cellar/python/3.6.5/Frameworks/Python.framework/Versions/3.6/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/Users/dclure/Projects/litecoder/env/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/local/bin/../Cellar/python/3.6.5/bin/../Frameworks/Python.framework/Versions/3.6/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

54727it [00:04, 11443.07it/s]


In [70]:
%time idx.longest_prefix(keyify('mobile al'))

CPU times: user 26 µs, sys: 16 µs, total: 42 µs
Wall time: 44.1 µs


('mobile al', (85913749,))

In [51]:
list(idx.prefixes('lodi'))

[]

In [66]:
idx.prefixes?

[0;31mSignature:[0m [0midx[0m[0;34m.[0m[0mprefixes[0m[0;34m([0m[0mkey[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Walks towards the node specified by key and yields all found items.

Example:

    >>> import pygtrie
    >>> t = pygtrie.StringTrie()
    >>> t['foo'] = 'Foo'
    >>> t['foo/bar/baz'] = 'Baz'
    >>> list(t.prefixes('foo/bar/baz/qux'))
    [('foo', 'Foo'), ('foo/bar/baz', 'Baz')]
    >>> list(t.prefixes('does/not/exist'))
    []

Args:
    key: Key to look for.

Yields:
    ``(k, value)`` pairs denoting keys with associated values
    encountered on the way towards the specified key.
[0;31mFile:[0m      ~/Projects/litecoder/env/lib/python3.6/site-packages/pygtrie.py
[0;31mType:[0m      method


In [42]:
idx.has_node('red level') == idx.HAS_SUBTRIE

False

In [48]:
idx.longest_prefix('red')

(None, None)

In [60]:
idx.has_subtrie('level al')

False

In [61]:
idx.traverse?

[0;31mSignature:[0m [0midx[0m[0;34m.[0m[0mtraverse[0m[0;34m([0m[0mnode_factory[0m[0;34m,[0m [0mprefix[0m[0;34m=[0m[0;34m<[0m[0mobject[0m [0mobject[0m [0mat[0m [0;36m0x103eccbf0[0m[0;34m>[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Traverses the tree using node_factory object.

node_factory is a callable function which accepts (path_conv, path,
children, value=...) arguments, where path_conv is a lambda converting
path representation to key, path is the path to this node, children is
an iterable of children nodes constructed by node_factory, optional
value is the value associated with the path.

node_factory's children argument is a generator which has a few
consequences:

* To traverse into node's children, the generator must be iterated over.
  This can by accomplished by a simple "children = list(children)"
  statement.
* Ignoring the argument allows node_factory to stop the traversal from
  going into the children of the node.  In other words, w