In [1]:
import re

from tqdm import tqdm
from collections import defaultdict, Counter, UserDict
from itertools import product
from cached_property import cached_property

from litecoder.db import session, City, CityAltName

In [53]:
def keyify(text):
    """Normalize free text into a canonical index key.

    Lowercases the text, drops periods, turns commas and hyphens into
    spaces, collapses every run of whitespace into one space, and trims
    the ends. The result is stable under repeated application.

    Args:
        text (str)

    Returns: str
    """
    text = text.lower()

    # Periods vanish ("St." -> "st"); commas / hyphens act as word breaks.
    text = text.replace('.', '')
    text = re.sub(r'[,-]', ' ', text)

    # Any whitespace run (including a lone tab or newline) -> 1 space.
    text = re.sub(r'\s+', ' ', text)

    # Trim last, so a separator at either edge ("Tuscaloosa,") leaves no
    # leading / trailing space in the key.
    return text.strip()

In [54]:
# Hyphens are treated as word breaks and become spaces.
keyify('la-la land')

'la la land'

In [55]:
# Text is lowercased; the comma becomes a space and the doubled space collapses.
keyify('Tuscaloosa, AL')

'tuscaloosa al'

In [56]:
# Country-name variants that get appended to city names when building keys.
USA_NAMES = ('USA', 'United States', 'United States of America', 'US', 'America')

class USCityKeyGen:
    """Generate index keys for a city: names combined with state / country.

    Calling an instance with a city yields the keyified form of every key
    produced by raw_keys_iter().

    Args:
        big_population (int): A city whose population is strictly greater
            than this is given bare-name keys even if its name is shared.
    """

    def __init__(self, big_population=500000):
        self.big_population = big_population

    # TODO: Separate class?
    @cached_property
    def int_name_counts(self):
        """Count keyified primary names over all rows of City.name.

        The query is not filtered by country. Computed on first access and
        then cached on the instance.

        Returns: Counter
        """
        names = [keyify(r[0]) for r in session.query(City.name)]
        return Counter(names)

    def raw_keys_iter(self, city):
        """Enumerate un-normalized index keys for a city.

        Bare names (alone and with each USA_NAMES suffix) are emitted only
        when the city is big or its keyified primary name occurs exactly once
        in int_name_counts. Name + state and name + state + country keys are
        always emitted.

        Args:
            city (db.City)

        Yields: str
        """
        # TODO: Alt names
        names = (city.name, *city.alt_name_strings)

        # A missing (None) or zero population never counts as big.
        is_big = city.population and city.population > self.big_population

        name_is_unique = self.int_name_counts[keyify(city.name)] == 1

        # If unique or high population.
        if is_big or name_is_unique:

            # Bare name
            yield from names

            # Bare name + USA
            for name, usa in product(names, USA_NAMES):
                yield ' '.join((name, usa))

        # NOTE(review): if either state value (or any name) is None, the
        # joins below raise TypeError -- confirm every US city has both.
        states = (city.name_a1, city.us_state_abbr)

        # Name + state
        for name, state in product(names, states):
            yield ' '.join((name, state))

        # Name + state + USA
        for name, state, usa in product(names, states, USA_NAMES):
            yield ' '.join((name, state, usa))

    def __call__(self, city):
        """Yield the keyified form of each raw key for a city.

        Duplicates are not removed.

        Args:
            city (db.City)

        Yields: str
        """
        for text in self.raw_keys_iter(city):
            yield keyify(text)

In [6]:
# Key generator instance; calling it with a city yields that city's keys.
city_key_iter = USCityKeyGen()

In [7]:
# Fetch one US row per example city (first match for country + name).
la = City.query.filter(City.country_iso == 'US', City.name == 'Los Angeles').first()
tuscaloosa = City.query.filter(City.country_iso == 'US', City.name == 'Tuscaloosa').first()

In [8]:
# Materialize every key generated for Los Angeles.
list(city_key_iter(la))

['los angeles',
 'the plains of id',
 'tehrangeles',
 'shangri la',
 'lotus land',
 'la',
 'lotus',
 'la la land',
 'lax',
 'southland',
 'el lay',
 'southern california',
 'smell a',
 'city of angels',
 'tinseltown',
 'hell a',
 'lost angels',
 'the big orange',
 'sc',
 'la',
 'lost angeles',
 'los angeles usa',
 'los angeles united states',
 'los angeles united states of america',
 'los angeles us',
 'los angeles america',
 'the plains of id usa',
 'the plains of id united states',
 'the plains of id united states of america',
 'the plains of id us',
 'the plains of id america',
 'tehrangeles usa',
 'tehrangeles united states',
 'tehrangeles united states of america',
 'tehrangeles us',
 'tehrangeles america',
 'shangri la usa',
 'shangri la united states',
 'shangri la united states of america',
 'shangri la us',
 'shangri la america',
 'lotus land usa',
 'lotus land united states',
 'lotus land united states of america',
 'lotus land us',
 'lotus land america',
 'la usa',
 'la unit

In [9]:
# Materialize every key generated for Tuscaloosa.
list(city_key_iter(tuscaloosa))

['tuscaloosa',
 'tcl',
 'tus',
 'tuscaloosa usa',
 'tuscaloosa united states',
 'tuscaloosa united states of america',
 'tuscaloosa us',
 'tuscaloosa america',
 'tcl usa',
 'tcl united states',
 'tcl united states of america',
 'tcl us',
 'tcl america',
 'tus usa',
 'tus united states',
 'tus united states of america',
 'tus us',
 'tus america',
 'tuscaloosa alabama',
 'tuscaloosa al',
 'tcl alabama',
 'tcl al',
 'tus alabama',
 'tus al',
 'tuscaloosa alabama usa',
 'tuscaloosa alabama united states',
 'tuscaloosa alabama united states of america',
 'tuscaloosa alabama us',
 'tuscaloosa alabama america',
 'tuscaloosa al usa',
 'tuscaloosa al united states',
 'tuscaloosa al united states of america',
 'tuscaloosa al us',
 'tuscaloosa al america',
 'tcl alabama usa',
 'tcl alabama united states',
 'tcl alabama united states of america',
 'tcl alabama us',
 'tcl alabama america',
 'tcl al usa',
 'tcl al united states',
 'tcl al united states of america',
 'tcl al us',
 'tcl al america',
 

In [65]:
class CityIndex(KeyIndex):
    """Map keyified strings to sets of city.wof_id values.

    NOTE(review): KeyIndex is not defined in the visible cells -- confirm
    it is in scope before this cell runs.
    """

    def __init__(self):
        self._idx = defaultdict(set)

        # (wof_id, repr(exception)) for each city skipped by build().
        self.build_errors = []

    def __getitem__(self, text):
        """Return the id set stored under the keyified form of `text`.

        Because the backing store is a defaultdict, looking up a key that is
        not present inserts (and returns) a new empty set.

        Args:
            text (str)

        Returns: set
        """
        return self._idx[keyify(text)]

    def build(self):
        """Index all US cities.

        A city whose key generation or indexing raises is skipped; the
        failure is recorded in self.build_errors rather than discarded.
        """
        iter_keys = USCityKeyGen()

        cities = City.query.filter(City.country_iso=='US')

        # Start each build with a clean failure log.
        self.build_errors = []

        for city in tqdm(cities):

            try:

                # Generate the full key set up front, so that a city whose
                # generator raises part-way contributes no keys at all.
                keys = list(iter_keys(city))

                # Index the already-materialized key set (no second pass
                # through the generator).
                for key in keys:
                    self[key].add(city.wof_id)

            except Exception as exc:
                # Best-effort: keep going, but remember what was skipped.
                self.build_errors.append(
                    (getattr(city, 'wof_id', None), repr(exc))
                )

In [66]:
# Build the index over all US cities (about two minutes in the recorded run).
ci = CityIndex()
ci.build()

54727it [01:59, 457.72it/s]


In [67]:
# Number of distinct keys currently held by the index.
len(ci._idx)

965166

In [69]:
# Look up the set of city ids indexed under 'nyc'.
ci['nyc']

{85977539}