In [1]:
import re
import attr
import us

from itertools import groupby
from boltons.iterutils import windowed

In [2]:
class StateIndex(dict):
    """Lookup table from lowercased state abbreviation or name to a state.

    The values are the objects found in ``us.STATES``; every state is
    reachable under two keys (its abbreviation and its full name).
    """

    def __init__(self):
        """Fill the index from ``us.STATES``."""
        for entry in us.STATES:
            # Register the abbreviation first, then the full name, both
            # lowercased so lookups can be done case-insensitively.
            for field in ('abbr', 'name'):
                self[getattr(entry, field).lower()] = entry

In [3]:
# Build the shared state lookup once; keys are lowercased abbreviations and names.
state_index = StateIndex()

In [5]:
def isplit(it, keyfunc):
    """Split an iterable on separator items.

    Consecutive items are grouped by ``keyfunc`` (via ``itertools.groupby``).
    Groups whose key is truthy are treated as separators and dropped; each
    remaining group is returned as a list.

    Args:
        it: The iterable to split.
        keyfunc: Callable returning a truthy value for separator items.

    Returns:
        A list of lists holding the non-separator runs, in order.
    """
    chunks = []
    for is_separator, run in groupby(it, keyfunc):
        if is_separator:
            # Separator run: discard it entirely.
            continue
        chunks.append(list(run))
    return chunks

In [6]:
@attr.s
class Token:
    """A piece of text paired with a position value."""
    # The token text; LocationFieldTokens.from_text stores the regex match as-is.
    token = attr.ib()
    # Position value; LocationFieldTokens.from_text passes a counter that
    # advances only after non-comma tokens.
    p = attr.ib()

In [7]:
class LocationFieldTokens(list):
    """List of ``Token`` objects produced from a free-text location string."""

    @classmethod
    def from_text(cls, text):
        """Tokenize ``text`` into a new instance.

        Periods are removed first. Tokens are then either a run of the
        letters a-z (case-insensitive) and hyphens, or a single comma; any
        other character is skipped. Each token records the value of a
        counter that advances only after non-comma tokens, so a comma
        carries the same position as the word that follows it.

        Args:
            text: The raw location string.

        Returns:
            An instance of ``cls`` containing the tokens in order.
        """
        stripped = text.replace('.', '')

        result = cls()
        position = 0
        for match in re.finditer('[a-z-]+|,', stripped, re.I):
            raw = match.group()
            result.append(Token(raw, position))
            if raw != ',':
                position += 1

        return result

    def comma_phrases(self):
        """Return the comma-delimited runs of tokens as ``CommaPhraseTokens``.

        Comma tokens themselves are not included in any phrase.
        """
        segments = isplit(self, lambda tok: tok.token == ',')
        return [CommaPhraseTokens(segment) for segment in segments]

    def candidate_toponyms(self, *args, **kwargs):
        """Yield the candidate toponyms of every comma phrase, in order.

        Positional and keyword arguments are forwarded unchanged to
        ``CommaPhraseTokens.candidate_toponyms``.
        """
        for phrase in self.comma_phrases():
            yield from phrase.candidate_toponyms(*args, **kwargs)

In [8]:
class CommaPhraseTokens(list):
    """Tokens belonging to one comma-delimited phrase."""

    def candidate_toponyms(self, maxn=4):
        """Yield every contiguous window of 1 to ``maxn`` tokens.

        Windows are produced shortest first; within one size they follow
        the order returned by ``windowed``. Each window is wrapped in a
        ``CandidateToponymTokens``.

        Args:
            maxn: Largest window size to generate (inclusive).
        """
        for size in range(1, maxn + 1):
            yield from map(CandidateToponymTokens, windowed(self, size))

In [9]:
class CandidateToponymTokens(list):
    """A sequence of tokens considered together as one possible place name."""

    def key(self):
        """Return the token texts joined by single spaces, lowercased."""
        words = (tok.token for tok in self)
        joined = ' '.join(words)
        return joined.lower()

In [10]:
# Example input: tokenize a short "city state-abbreviation" string.
q = LocationFieldTokens.from_text('Birmingham AL')

In [11]:
# Print the lowercased, space-joined key of each candidate window.
for c in q.candidate_toponyms():
    print(c.key())

birmingham
al
birmingham al


In [12]:
# Materialize the generator so the candidates can be inspected.
cts = list(q.candidate_toponyms())

In [13]:
# Display the candidates: each entry is a list of Token objects.
cts

[[Token(token='Birmingham', p=0)],
 [Token(token='AL', p=1)],
 [Token(token='Birmingham', p=0), Token(token='AL', p=1)]]