In [67]:
import re
import unicodedata
from collections import defaultdict, Counter, UserDict
from itertools import product

from cached_property import cached_property
from tqdm import tqdm

from litecoder.db import session, City

In [21]:
def keyify(text):
    """Normalize free text into a lookup key.

    The text is NFKD-normalized, lower-cased, and then stripped of every
    character other than ``a-z``, ``0-9`` and the space.

    The NFKD step decomposes accented letters into a base letter plus
    combining marks, so the filter drops only the marks and keeps the base
    letter ('San José' -> 'san jose'). Without it the whole accented letter
    is deleted ('san jos'). Pure-ASCII input is unaffected by normalization.

    Args:
        text: The string to normalize.

    Returns:
        The normalized key string (possibly empty).
    """
    text = unicodedata.normalize('NFKD', text)
    text = text.lower()
    return re.sub('[^a-z0-9 ]', '', text)

In [85]:
# Country-name spellings used as the trailing part of "<city> <state> <country>"
# keys. Order matters: keys are generated in this order.
USA_NAMES = (
    'USA', 'United States', 'United States of America',
    'US', 'America',
)

class USCityKeyGen:
    """Generate lookup keys for a US city row.

    Calling an instance with a city yields normalized keys of the forms
    "<name>", "<name> <state>" and "<name> <state> <country>".

    Args:
        big_population: A city whose population is strictly greater than
            this value gets a bare-name key even if its name is not unique.
            Defaults to 500000.
    """

    def __init__(self, big_population=500000):
        self.big_population = big_population

    @cached_property
    def int_name_counts(self):
        """Counter mapping each keyified ``City.name`` to its number of rows.

        The query is not filtered by country, so the counts cover every row
        returned by ``session.query(City.name)``. Computed once per instance.
        """
        return Counter(keyify(row[0]) for row in session.query(City.name))

    def raw_keys_iter(self, city):
        """Yield the un-normalized key strings for ``city``.

        Args:
            city: Object exposing ``name``, ``population``, ``name_a1`` and
                ``us_state_abbr`` attributes.

        Yields:
            The bare name (only when the city is big or its name is unique),
            then every "<name> <state>" combination, then every
            "<name> <state> <country>" combination.
        """
        # TODO: Alt names
        names = (city.name,)

        # `population` may be None; treat that as "not big".
        is_big = bool(city.population and city.population > self.big_population)

        name_is_unique = self.int_name_counts[keyify(city.name)] == 1

        # Bare name, if unique or high population.
        if is_big or name_is_unique:
            yield from names

        # Skip missing/empty state values: joining None would raise TypeError
        # and the city would end up with no usable keys at all.
        states = tuple(
            state for state in (city.name_a1, city.us_state_abbr) if state
        )

        for name, state in product(names, states):
            yield ' '.join((name, state))

        for name, state, usa in product(names, states, USA_NAMES):
            yield ' '.join((name, state, usa))

    def __call__(self, city):
        """Yield the keyified form of every key from ``raw_keys_iter(city)``."""
        for text in self.raw_keys_iter(city):
            yield keyify(text)

In [86]:
# Key generator instance used by the example cells below.
city_key_iter = USCityKeyGen()

In [87]:
# Sample rows: the first US city named 'Los Angeles' and the first named
# 'Tuscaloosa' (`.first()` returns None when nothing matches).
la = City.query.filter(City.country_iso=='US').filter(City.name=='Los Angeles').first()
tuscaloosa = City.query.filter(City.country_iso=='US').filter(City.name=='Tuscaloosa').first()

In [88]:
# Show every normalized key generated for the Tuscaloosa row.
list(city_key_iter(tuscaloosa))

['tuscaloosa',
 'tuscaloosa alabama',
 'tuscaloosa al',
 'tuscaloosa alabama usa',
 'tuscaloosa alabama united states',
 'tuscaloosa alabama united states of america',
 'tuscaloosa alabama us',
 'tuscaloosa alabama america',
 'tuscaloosa al usa',
 'tuscaloosa al united states',
 'tuscaloosa al united states of america',
 'tuscaloosa al us',
 'tuscaloosa al america']

In [91]:
class CityIndex:
    """In-memory index from normalized key text to lists of city ``wof_id``s."""

    def __init__(self):
        # Normalized key -> list of wof_id values, in insertion order.
        self._idx = defaultdict(list)
        # (wof_id, exception) pairs for cities whose keys could not be
        # generated during build().
        self.skipped = []

    def __getitem__(self, text):
        """Return the ids stored under ``keyify(text)``.

        Uses ``.get`` rather than ``[]`` so that a miss returns an empty list
        without inserting a new empty entry into the underlying defaultdict.
        """
        return self._idx.get(keyify(text), [])

    def build(self):
        """Populate the index from every ``City`` row with country_iso 'US'.

        Key generation is best-effort: if it raises for a city, that city
        contributes no keys and the failure is recorded in ``self.skipped``
        instead of being silently discarded.
        """
        iter_keys = USCityKeyGen()
        for city in tqdm(City.query.filter(City.country_iso == 'US')):

            try:
                wof_id = city.wof_id
                # Materialize once; all keys must succeed before any is stored.
                keys = list(iter_keys(city))
            except Exception as exc:
                self.skipped.append((getattr(city, 'wof_id', None), exc))
                continue

            # Keys are already normalized, so write to the dict directly.
            for key in keys:
                self._idx[key].append(wof_id)

In [92]:
# Create an empty index and populate it from the US rows of City.
ci = CityIndex()
ci.build()

54727it [00:09, 5897.33it/s]


In [72]:
# Number of keys currently held in the index.
# NOTE(review): this cell's execution count (72) is lower than the build
# cell's (92), so the displayed value may predate the latest build — re-run
# to confirm.
len(ci._idx)

635781

In [94]:
# Look up the ids indexed under the normalized form of 'Tuscaloosa'.
ci['Tuscaloosa']

[85914453]