In [56]:
import re

from tqdm import tqdm
from anytree import Node, RenderTree, search
from collections import Counter, defaultdict
from wordfreq import top_n_list

from litecoder.db import City, session

In [57]:
# The 1000 highest-ranked English entries from wordfreq, as a set for fast
# membership tests. keys_iter() consults it before yielding a bare city name.
FREQ_WORDS = set(top_n_list('en', n=1000))

In [35]:
def normalize(text, lower=True):
    """Normalize text string -> index key.

    Periods are removed, commas are turned into spaces, every run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space,
    and leading/trailing whitespace is dropped.

    Args:
        text (str): Raw text, e.g. a city name or a free-text sentence.
        lower (bool): When true (the default), lowercase the result.

    Returns:
        str: The normalized key. Tokens are separated by exactly one space,
        so ``normalize(x).split()`` and ``normalize(x).split(' ')`` agree.
    """
    # Remove periods
    text = text.replace('.', '')

    # Comma -> space
    text = text.replace(',', ' ')

    # Any whitespace run -> single space. Using \s+ (not \s{2,}) also turns a
    # lone tab/newline into a space; the raw string avoids the invalid "\s"
    # escape in a normal string literal.
    text = re.sub(r'\s+', ' ', text)

    # Strip last: a leading/trailing comma has just become a space, and
    # stripping before the substitutions would leave it in the key.
    text = text.strip()

    if lower:
        text = text.lower()

    return text

In [36]:
# Normalized name for every row returned by the (unfiltered) City.name query;
# each result row holds a single column, unpacked here as `name`.
names = [normalize(name) for (name,) in session.query(City.name)]

In [37]:
# How many times each normalized name occurs in `names`. Looking up a name
# that never occurred returns 0 (Counter semantics).
name_counts = Counter()
name_counts.update(names)

In [58]:
def keys_iter(row):
    """Yield the raw (not yet normalized) lookup keys for one city row.

    Keys produced, in order:

    * ``"<name> <state>"`` for each of ``row.name_a1`` and
      ``row.us_state_abbr`` that is set.
    * the bare ``row.name``, but only when the city has a population above
      500,000, or when its normalized name occurs exactly once in the
      module-level ``name_counts`` and is not in ``FREQ_WORDS``.

    Args:
        row: A city record exposing ``name``, ``name_a1``, ``us_state_abbr``
            and ``population`` attributes.

    Yields:
        str: Raw key strings; callers are expected to normalize them.
    """
    name_norm = normalize(row.name)

    for state in (row.name_a1, row.us_state_abbr):
        # Skip missing values: '%s' would render None as the literal word
        # "None" and produce a bogus "<name> None" key.
        if state:
            yield '%s %s' % (row.name, state)

    # Large cities may be referenced by name alone.
    if row.population and row.population > 500000:
        yield row.name

    # So may names that are unambiguous and not a common English word.
    elif name_counts[name_norm] == 1 and name_norm not in FREQ_WORDS:
        yield row.name

In [59]:
# First token of a key -> list of root nodes, one root per indexed key.
idx = defaultdict(list)

cities = City.query.filter(City.country_iso=='US')

for city in tqdm(cities):
    for raw_key in keys_iter(city):

        key_tokens = normalize(raw_key).split()
        first = key_tokens[0]

        # Every key gets its own root, even if another key shares the token.
        tail = Node(first)
        idx[first].append(tail)

        # Hang the remaining tokens below the root as a single chain, so each
        # node ends up with at most one child and the last token is the leaf.
        for tok in key_tokens[1:]:
            tail = Node(tok, parent=tail)

54727it [00:06, 8933.85it/s]


In [60]:
def find_locs(text):
    """Find indexed location keys mentioned in free text.

    The text is normalized and split into tokens. Tokens are matched against
    the token chains stored in the module-level ``idx``: a chain is started
    whenever no active chain can be continued, and it is advanced one node
    per token. A chain is reported once it has been followed to its leaf and
    the next token (or the end of the text) fails to extend any active chain.
    If some active chain can be extended, leaf candidates that cannot are
    dropped, so longer matches win over shorter ones.

    Args:
        text (str): Free text to scan.

    Returns:
        list: Leaf nodes of the matched chains, in the order the matches
        were completed.
    """
    active = []
    closed = []
    for token in normalize(text).split():

        # Advance each partial match by exactly one token: only *direct*
        # children count. (anytree's ``maxlevel`` is counted from the node
        # the search starts at, so a depth-based maxlevel would also match
        # deeper descendants and the node itself.)
        extensions = [
            child
            for an in active
            for child in an.children
            if child.name == token
        ]

        # Nothing continued: whatever reached a leaf is a finished match.
        if not extensions:
            closed += [n for n in active if n.is_leaf]

        active = extensions

        # Try to start new matches at this token. Use .get() so that unseen
        # tokens do not insert empty lists into the defaultdict index.
        if not active:
            active = idx.get(token, [])

    # Flush matches that end exactly at the end of the text.
    closed += [n for n in active if n.is_leaf]

    return closed

In [63]:
# Smoke test: a sentence mentioning a bare city name and two "city state" pairs.
find_locs(
    'I grew up in Tuscaloosa, but went to high school down in Mobile AL, '
    'and then was in New Haven CT for college'
)

[Node('/tuscaloosa'), Node('/mobile/al'), Node('/new/haven/ct')]