In [98]:
import re

from tqdm import tqdm
from anytree import Node, RenderTree, search
from collections import defaultdict

from litecoder.db import City, session

In [3]:
City.query.count()

344249

In [4]:
def keyify(text, lower=True):
    """Normalize text string -> index key.
    """
    text = text.strip()
    
    # Remove periods
    text = re.sub('\.', '', text)
    
    # Comma -> space
    text = re.sub(',', ' ', text)
    
    # 2+ whitespace -> space
    text = re.sub('\s{2,}', ' ', text)

    if lower:
        text = text.lower()

    return text.split()

In [107]:
def keys_iter(row):
    
    states = (row.name_a1, row.us_state_abbr)
    
    for state in states:
        yield '%s %s' % (row.name, state)
    
    if row.population and row.population > 500000:
        yield row.name

In [134]:
idx = defaultdict(list)

cities = City.query.filter(City.country_iso=='US')

for c in tqdm(cities):
    for key_raw in keys_iter(c):
        
        tokens = keyify(key_raw)
        
        parent = Node(tokens[0])
        idx[tokens[0]].append(parent)
        
        for token in tokens[1:]:
            parent = Node(token, parent=parent)

54727it [00:08, 6667.31it/s]


In [135]:
def find_locs(text):
    
    active = []
    closed = []
    for token in keyify(text):

        new_active, new_closed = [], []
        for an in active:
            for match in search.findall(an, lambda n: n.name==token, maxlevel=an.depth+2):
                if match.is_leaf:
                    new_closed.append(match)
                else:
                    new_active.append(match)

        active = new_active

        if not active:
            closed += new_closed
            active = idx[token]
            
    return closed

In [141]:
%time find_locs('I was born in Atmore, Alabama in June, but went to high school in Mobile AL and in chicago test')

CPU times: user 1.06 ms, sys: 5 µs, total: 1.06 ms
Wall time: 1.07 ms


[Node('/atmore/alabama'), Node('/mobile/al')]

In [142]:
%time find_locs('I was born in Chicago')

CPU times: user 24 µs, sys: 1 µs, total: 25 µs
Wall time: 26.9 µs


[]