In [1]:
import re

from tqdm import tqdm
from collections import defaultdict, Counter, UserDict
from itertools import product
from cached_property import cached_property

from litecoder.db import session, City, CityAltName

In [2]:
def keyify(text):
    """Normalize free text into a canonical index / lookup key.

    Lowercases the text, drops periods, turns commas and hyphens into
    spaces, collapses every run of whitespace into a single space, and
    strips leading / trailing whitespace. The result is idempotent:
    keyify(keyify(x)) == keyify(x).

    Args:
        text (str)

    Returns: str
    """
    text = text.lower()

    # Periods are dropped outright ("St. Louis" -> "st louis").
    text = text.replace('.', '')

    # Commas and hyphens act as token separators.
    text = re.sub(r'[,-]', ' ', text)

    # Any whitespace run (including a lone tab / newline) -> 1 space.
    text = re.sub(r'\s+', ' ', text)

    # Strip last, so separators at the edges ("Tuscaloosa,") don't leave
    # stray spaces in the key.
    return text.strip()

In [3]:
# Hyphens are treated as separators and become spaces.
keyify('la-la land')

'la la land'

In [44]:
# Commas become spaces and the text is lowercased.
keyify('Tuscaloosa, AL')

'tuscaloosa al'

In [55]:
# Country-name variants that get appended to city keys, in emission order.
USA_NAMES = ('USA', 'United States', 'United States of America', 'US', 'America')

class USCityKeyGen:
    """Generate index keys for US cities.

    Calling an instance with a city yields the normalized (`keyify`-ed)
    keys; `raw_keys_iter` yields the un-normalized variants.

    Args:
        big_population (int): Cities with a population above this value
            get bare-name keys even when the name is not unique.
    """

    def __init__(self, big_population=500000):
        self.big_population = big_population

    # TODO: Separate class?
    @cached_property
    def int_name_counts(self):
        """Count occurrences of each normalized city name.

        Reads every `City.name` row (no country filter is applied); the
        result is cached on the instance by `cached_property`.

        Returns: Counter
        """
        return Counter(keyify(row[0]) for row in session.query(City.name))

    def raw_keys_iter(self, city):
        """Enumerate raw (un-normalized) index keys for a city.

        Bare-name keys, with and without a USA suffix, are only emitted
        when the city is above the population threshold or its normalized
        name occurs exactly once in `int_name_counts`. Name + state keys
        are emitted for every state name that is set on the city.

        Args:
            city (db.City)

        Yields: str
        """
        names = (city.name,)

        # A missing / zero population never counts as "big".
        is_big = bool(city.population) and city.population > self.big_population

        name_is_unique = self.int_name_counts[keyify(city.name)] == 1

        # If unique or high population.
        if is_big or name_is_unique:

            # Bare name
            for name in names:
                yield name

            # Bare name + USA
            for name, usa in product(names, USA_NAMES):
                yield ' '.join((name, usa))

        # Drop missing state names: joining a None would raise TypeError
        # part-way through and lose every key for the city.
        states = tuple(
            state
            for state in (city.name_a1, city.us_state_abbr)
            if state
        )

        # Name + state
        for name, state in product(names, states):
            yield ' '.join((name, state))

        # Name + state + USA
        for name, state, usa in product(names, states, USA_NAMES):
            yield ' '.join((name, state, usa))

    def __call__(self, city):
        """Yield the normalized keys for a city.

        Args:
            city (db.City)

        Yields: str
        """
        for text in self.raw_keys_iter(city):
            yield keyify(text)

In [56]:
# Key generator instance; calling it with a city yields that city's normalized keys.
city_key_iter = USCityKeyGen()

In [57]:
# Fetch one US row per name to use as key-generation examples.
la = (
    City.query
    .filter(City.country_iso == 'US')
    .filter(City.name == 'Los Angeles')
    .first()
)
tuscaloosa = (
    City.query
    .filter(City.country_iso == 'US')
    .filter(City.name == 'Tuscaloosa')
    .first()
)

In [58]:
# Every key generated for the Los Angeles row fetched above.
list(city_key_iter(la))

['los angeles',
 'los angeles usa',
 'los angeles united states',
 'los angeles united states of america',
 'los angeles us',
 'los angeles america',
 'los angeles california',
 'los angeles ca',
 'los angeles california usa',
 'los angeles california united states',
 'los angeles california united states of america',
 'los angeles california us',
 'los angeles california america',
 'los angeles ca usa',
 'los angeles ca united states',
 'los angeles ca united states of america',
 'los angeles ca us',
 'los angeles ca america']

In [59]:
# Every key generated for the Tuscaloosa row fetched above.
list(city_key_iter(tuscaloosa))

['tuscaloosa',
 'tuscaloosa usa',
 'tuscaloosa united states',
 'tuscaloosa united states of america',
 'tuscaloosa us',
 'tuscaloosa america',
 'tuscaloosa alabama',
 'tuscaloosa al',
 'tuscaloosa alabama usa',
 'tuscaloosa alabama united states',
 'tuscaloosa alabama united states of america',
 'tuscaloosa alabama us',
 'tuscaloosa alabama america',
 'tuscaloosa al usa',
 'tuscaloosa al united states',
 'tuscaloosa al united states of america',
 'tuscaloosa al us',
 'tuscaloosa al america']

In [60]:
class CityIndex:
    """In-memory index mapping normalized keys to sets of city `wof_id`s."""

    def __init__(self):
        self._idx = defaultdict(set)

        # (wof_id, exception) for each city skipped by the last `build`.
        self.build_errors = []

    def __getitem__(self, text):
        """Look up the ids indexed under a key.

        Args:
            text (str): Raw query text; normalized with `keyify`.

        Returns: set of wof_ids (empty when the key is not indexed)
        """
        # .get() rather than [] so a miss does not insert an empty entry
        # into the defaultdict and grow the index on every failed lookup.
        return self._idx.get(keyify(text), set())

    def query(self, text):
        """Load the City rows indexed under a key.

        Args:
            text (str)

        Returns: list of db.City
        """
        return [City.query.get(cid) for cid in self[text]]

    def build(self):
        """Index all US cities.

        Best-effort: a city whose key generation raises is skipped and
        recorded in `build_errors` instead of aborting the build.
        """
        iter_keys = USCityKeyGen()

        cities = City.query.filter(City.country_iso=='US')

        self.build_errors = []

        for city in tqdm(cities):

            try:
                # Materialize the complete key set first, so a city that
                # fails part-way is never half-indexed.
                keys = list(iter_keys(city))

            except Exception as e:
                self.build_errors.append((city.wof_id, e))
                continue

            # Index complete key set, normalizing on insert exactly as
            # lookups do so stored and queried keys share one code path.
            for key in keys:
                self._idx[keyify(key)].add(city.wof_id)

In [61]:
# Build the in-memory index over all US cities.
ci = CityIndex()
ci.build()

54727it [00:16, 3325.72it/s]


In [62]:
# Number of distinct keys in the index (reads the private dict directly).
len(ci._idx)

787008

In [68]:
# NOTE(review): the recorded output is an empty list. Presumably this is because
# keys are generated from City.name only and alternate names (CityAltName is
# imported but unused here) are never indexed — confirm.
ci.query('new york city')

[]