In [20]:
import re

from tqdm import tqdm
from anytree import Node, RenderTree, search
from collections import Counter, defaultdict
from wordfreq import top_n_list
from textblob import TextBlob

from litecoder.db import City, session

In [21]:
def tokenize(text):
    """Split ``text`` into tokens with TextBlob and return them as plain strings."""
    blob = TextBlob(text)
    return list(map(str, blob.tokens))

In [207]:
class ToponymToken(Node):
    """One token of a multi-token place name, stored as a node in an anytree tree.

    A path from a root node down to a leaf spells out one complete toponym;
    each node knows how to decide whether an input token matches it.

    Args:
        name: The surface form of the token (also used as the node name).
        parent: Parent node in the tree, or None for a root.
        ignore_case: If true, tokens are lower-cased before comparison.
        allow_comma: If true, a bare ',' token is also accepted by this node.
        scrub_re: Regex whose matches are removed from tokens before
            comparison; a falsy value disables scrubbing. The default strips
            literal periods.
    """

    # The default pattern is a raw string: a non-raw '\.' is an invalid string
    # escape (a SyntaxWarning on recent Pythons) even though the value is equal.
    def __init__(self, name, parent=None, ignore_case=True, allow_comma=False, scrub_re=r'\.'):

        super().__init__(name, parent)

        self.ignore_case = ignore_case
        self.allow_comma = allow_comma
        self.scrub_re = scrub_re

        # Normalised form of `name`, computed once; must be set after the
        # options above because _clean_token reads them.
        self._cleaned = self._clean_token(name)

    def _clean_token(self, token):
        """Return `token` normalised according to this node's options."""

        if self.ignore_case:
            token = token.lower()

        if self.scrub_re:
            token = re.sub(self.scrub_re, '', token)

        return token

    def match_token(self, token):
        """Return True if `token` matches this node, otherwise False.

        A token matches when its normalised form equals the node's normalised
        name, or when the node allows commas and the raw token is ','.
        """

        if self._clean_token(token) == self._cleaned:
            return True

        # Always return a real bool instead of falling through to None.
        return bool(self.allow_comma and token == ',')

In [208]:
# Example tree: South -> Lake -> Tahoe -> {CA, California}.
# The "Tahoe" node additionally accepts a bare comma token.
south = ToponymToken('South')
lake = ToponymToken('Lake', parent=south)
tahoe = ToponymToken('Tahoe', allow_comma=True, parent=lake)
ca, california = (
    ToponymToken(state_name, parent=tahoe)
    for state_name in ('CA', 'California')
)

In [209]:
# Print the tree rooted at `south` as indented text to check its structure.
print(RenderTree(south))

ToponymToken('/South', allow_comma=False, ignore_case=True, scrub_re='\\.')
└── ToponymToken('/South/Lake', allow_comma=False, ignore_case=True, scrub_re='\\.')
    └── ToponymToken('/South/Lake/Tahoe', allow_comma=True, ignore_case=True, scrub_re='\\.')
        ├── ToponymToken('/South/Lake/Tahoe/CA', allow_comma=False, ignore_case=True, scrub_re='\\.')
        └── ToponymToken('/South/Lake/Tahoe/California', allow_comma=False, ignore_case=True, scrub_re='\\.')


In [210]:
# Index from a lower-cased first token to the root nodes that start with it;
# find_locs uses it to begin a new match.
idx = defaultdict(list, {'south': [south]})

In [219]:
def find_locs(text):
    """Scan ``text`` for toponyms and return the leaf nodes that were reached.

    The text is tokenized and walked left to right while a "frontier" of
    tree nodes is maintained. For each token, every non-leaf frontier node
    either stays in place (if it matches the token itself) or is replaced by
    those of its children that match. When nothing advances, any leaves in
    the frontier are recorded as finished matches and the frontier is
    re-seeded from the module-level ``idx`` using the lower-cased token.

    Returns:
        A list of leaf nodes, in the order their matches were completed.
    """
    matched = []
    frontier = []

    for tok in tokenize(text):

        advanced = []
        for node in frontier:
            # Leaves cannot be extended any further.
            if node.is_leaf:
                continue

            if node.match_token(tok):
                # The node itself absorbs this token and stays on the frontier.
                advanced.append(node)
            else:
                advanced.extend(
                    child for child in node.children if child.match_token(tok)
                )

        if not advanced:
            # Nothing continued, so completed paths on the frontier are final.
            matched.extend(node for node in frontier if node.is_leaf)

        # Keep advancing if possible; otherwise try to start a new match here.
        frontier = advanced or idx.get(tok.lower(), [])

    # Flush matches that run up to the end of the text.
    matched.extend(node for node in frontier if node.is_leaf)

    return matched

In [220]:
# Try the matcher on a sentence with a comma inside the place name and a
# dotted state abbreviation.
find_locs('I grew up in South Lake, Tahoe C.A. in the 90s.')

[ToponymToken('/South/Lake/Tahoe/CA', allow_comma=False, ignore_case=True, scrub_re='\\.')]