In [6]:
import re
import attr
import us

from itertools import groupby
from boltons.iterutils import windowed

In [14]:
def isplit(it, keyfunc):
    """Split an iterable into lists, using matching items as separators.

    Consecutive items are grouped by the value of ``keyfunc(item)``. Runs
    whose key is truthy are treated as separators and dropped; each run
    whose key is falsy is returned as a list, in original order.
    """
    pieces = []
    for is_separator, run in groupby(it, keyfunc):
        if is_separator:
            continue
        pieces.append(list(run))
    return pieces

In [15]:
class StateIndex(dict):
    """Dictionary mapping lowercased state labels to ``us`` state objects.

    Every entry of ``us.STATES`` is registered twice: once under its
    lowercased abbreviation and once under its lowercased name.
    """

    def __init__(self):
        """Populate the index from ``us.STATES``."""
        for state in us.STATES:
            # Abbreviation first, then full name, so insertion order is stable.
            for label in (state.abbr, state.name):
                self[label.lower()] = state

In [16]:
# Module-level lookup built once; query() consults it via state_index.get(...).
state_index = StateIndex()

In [17]:
@attr.s(repr=False)
class Token:
    """A token string paired with an index.

    The attrs-generated repr is switched off; the compact
    ``<token>_<index>`` form below is used instead.
    """

    token = attr.ib()
    index = attr.ib()

    def __repr__(self):
        return '{}_{}'.format(self.token, self.index)

In [18]:
class TokenList:
    """A sequence of tokens with helpers for parsing and key generation."""

    def __init__(self, tokens):
        self.tokens = tokens

    @classmethod
    def from_text(cls, text):
        """Tokenize ``text`` and wrap the resulting tokens in ``cls``.

        Periods are removed first. Runs of ASCII letters/hyphens
        (case-insensitive) and individual commas then become tokens. Commas
        are kept as tokens but do not advance the running index, so a comma
        carries the same index as the word that follows it.
        """
        cleaned = text.replace('.', '')

        collected = []
        position = 0
        for match in re.finditer('[a-z-]+|,', cleaned, re.I):
            piece = match.group()
            collected.append(Token(piece, position))
            if piece == ',':
                continue
            position += 1

        return cls(collected)

    def __repr__(self):
        class_name = self.__class__.__name__
        return '{}({})'.format(class_name, self.tokens)

    def key(self):
        """Return the token strings joined by single spaces, lowercased."""
        joined = ' '.join(tok.token for tok in self.tokens)
        return joined.lower()

In [19]:
class LocationField(TokenList):
    """Token list for a location string that can enumerate candidate place names."""

    def candidate_toponyms(self, maxn=4):
        """Yield a ``TokenList`` for each contiguous token window.

        The tokens are split into segments at comma tokens (the commas are
        dropped). For each segment, windows of width 1 through ``maxn`` are
        produced via ``windowed``, narrowest width first, before moving on to
        the next segment.
        """
        def is_comma(tok):
            return tok.token == ','

        for segment in isplit(self.tokens, is_comma):
            for width in range(1, maxn + 1):
                yield from (TokenList(span) for span in windowed(segment, width))

In [20]:
# Sample input for the demo below: a multi-word place name, a comma, and a state abbreviation.
lf = LocationField.from_text('South Lake Tahoe, CA')

In [21]:
# Print the lowercased key of every candidate window, one per line.
for ct in lf.candidate_toponyms():
    print(ct.key())

south
lake
tahoe
south lake
lake tahoe
south lake tahoe
ca


In [22]:
def query(text):
    """Print the states matched by any candidate toponym in ``text``.

    ``text`` is parsed into a ``LocationField``; the key of each candidate
    window is looked up in ``state_index``, and every truthy hit is collected
    in encounter order (duplicates are kept). The resulting list is printed;
    nothing is returned.
    """
    field = LocationField.from_text(text)

    lookups = (state_index.get(cand.key()) for cand in field.candidate_toponyms())
    states = [hit for hit in lookups if hit]

    print(states)