In [1]:
import re
import attr
import us

from itertools import groupby
from boltons.iterutils import windowed

In [2]:
class StateIndex(dict):
    """Dict mapping lowercased state abbreviations and names to state objects.

    Every entry of ``us.STATES`` is stored twice: once under its lowercased
    ``abbr`` and once under its lowercased ``name``. Lookups must therefore
    use lowercased keys.
    """

    def __init__(self):
        """Fill the index from ``us.STATES``."""
        for st in us.STATES:
            # Abbreviation first, then full name, so key order is stable.
            for label in (st.abbr, st.name):
                self[label.lower()] = st

In [5]:
# Shared lookup table: lowercased state abbreviation or name -> state.
state_index = StateIndex()

In [119]:
@attr.s(repr=False)
class Token:
    """A single token paired with an index.

    The attrs-generated repr is disabled in favour of the compact
    ``<token>_<index>`` form defined below.
    """

    # Token value; LocationField.from_text passes the matched substring.
    token = attr.ib()
    # Index attached to the token; LocationField.from_text passes a running
    # counter that advances only on non-comma tokens.
    index = attr.ib()

    def __repr__(self):
        """Return ``<token>_<index>``."""
        return f'{self.token}_{self.index}'

In [171]:
class TokenList:
    """Minimal container around a sequence of tokens."""

    def __init__(self, tokens):
        """Keep a reference to ``tokens`` exactly as given (no copy)."""
        self.tokens = tokens

    def __repr__(self):
        """Return ``<ClassName>(<tokens>)`` using the runtime class name.

        Subclasses therefore show their own name without overriding this.
        """
        cls_name = self.__class__.__name__
        return f'{cls_name}({self.tokens})'

In [173]:
class CandidateToponym(TokenList):
    """Token span treated as a possible place name."""

    def key(self):
        """Return the tokens' ``token`` values joined by spaces, lowercased."""
        joined = ' '.join(t.token for t in self.tokens)
        return joined.lower()

In [174]:
class LocationField(TokenList):
    """Token list built from a free-text location such as 'South Lake Tahoe, CA'."""

    @classmethod
    def from_text(cls, text):
        """Tokenize ``text`` into word and comma tokens.

        Periods are removed first. Each run of letters and hyphens becomes
        one token, and each comma becomes its own token. Digits and any
        other characters are skipped.

        Args:
            text: The raw location string.

        Returns:
            A new instance whose ``tokens`` is a list of ``Token`` objects.
            A word token's index is its position among the word tokens; a
            comma carries the index of the word that follows it.
        """
        text = text.replace('.', '')

        tokens = []
        word_index = 0
        # `[^\W\d_]` matches any Unicode letter, so accented place names
        # ("Cañon City") stay whole instead of being cut at the first
        # non-ASCII character. ASCII input tokenizes exactly as before.
        for token in re.findall(r'(?:[^\W\d_]|-)+|,', text):
            tokens.append(Token(token, word_index))
            if token != ',':
                word_index += 1

        return cls(tokens)

    def candidate_toponyms(self, maxn=4):
        """Yield every span of 1..``maxn`` adjacent tokens within a comma part.

        Spans never cross a comma. For each comma-delimited part, all
        1-token spans are yielded first, then all 2-token spans, and so on.

        Args:
            maxn: Largest span length, in tokens, to generate.

        Yields:
            ``CandidateToponym`` instances wrapping each window of tokens.
        """
        # Split on commas with itertools.groupby (imported at the top of the
        # notebook) so this method does not rely on a helper that is not
        # defined anywhere in the notebook.
        for is_comma, run in groupby(self.tokens, key=lambda t: t.token == ','):
            if is_comma:
                continue

            # Materialize the group: it is rescanned once per window size.
            comma_part = list(run)

            # Slide window across tokens.
            for n in range(1, maxn + 1):
                for w in windowed(comma_part, n):
                    yield CandidateToponym(w)

In [180]:
# Example input: a multi-word place name, a comma, then a state abbreviation.
lf = LocationField.from_text('South Lake Tahoe, CA')

In [181]:
# Print the lowercased lookup key of every candidate span.
for ct in lf.candidate_toponyms():
    print(ct.key())

south
lake
tahoe
south lake
lake tahoe
south lake tahoe
ca
