In [11]:
import re

from tqdm import tqdm
from collections import defaultdict, Counter, UserDict
from itertools import product
from cached_property import cached_property

from litecoder.models import session, City
from litecoder import logger

In [2]:
def keyify(text):
    """Normalize free text into a canonical index key.

    The text is lower-cased, periods are dropped, commas and hyphens are
    treated as separators, every run of whitespace is collapsed to a single
    space, and leading / trailing whitespace is removed.

    Args:
        text (str)

    Returns: str
    """
    text = text.lower()

    # Periods vanish ("D.C." -> "dc"); commas / hyphens separate tokens.
    text = text.replace('.', '')
    text = re.sub(r'[,-]', ' ', text)

    # Any whitespace run (including a lone tab or newline) -> 1 space.
    text = re.sub(r'\s+', ' ', text)

    # Strip last, so a leading / trailing separator doesn't leave a stray
    # space behind. This also makes the function idempotent, which matters
    # because callers apply it to text that was already keyified.
    return text.strip()

In [3]:
# Hyphens are treated as separators and become spaces.
keyify('la-la land')

'la la land'

In [4]:
# The comma becomes a space and the resulting double space is collapsed.
keyify('Tuscaloosa, AL')

'tuscaloosa al'

In [5]:
# A comma with no following space still separates the two tokens.
keyify('Washington,DC')

'washington dc'

In [12]:
class NameCounts(Counter):
    """Counter of keyified `City.name` values.

    Item lookups keyify the requested text first, so any spelling that
    normalizes to the same key returns the same count.
    """

    def __init__(self):
        """Count every city name returned by the `City.name` query.
        """
        logger.info('Indexing name -> counts.')

        # Each result row is a 1-tuple holding the name.
        super().__init__(
            keyify(row[0])
            for row in session.query(City.name)
        )

    def __getitem__(self, text):
        """Return the count stored under the keyified form of `text`.
        """
        key = keyify(text)
        return super().__getitem__(key)

In [18]:
class NamePopulations(defaultdict):
    """Map keyified city name -> list of populations of cities with that name.

    Item access keyifies the requested text first. Because this is a
    defaultdict, indexing a missing name with `[]` creates an empty list.
    """

    def __init__(self):
        """Index name -> [pops].
        """
        super().__init__(list)

        logger.info('Indexing name -> populations.')

        for city in tqdm(City.query):

            # A missing population is recorded as 0.
            pop = city.population or 0

            for name in city.names:
                self[keyify(name)].append(pop)

    def __getitem__(self, text):
        """Return the population list stored under the keyified `text`.
        """
        return super().__getitem__(keyify(text))

    def p2_ratio(self, name, pop):
        """Ratio between given pop and second-highest pop for the name.

        Args:
            name (str)
            pop (int)

        Returns: float, or None when the ratio is undefined (fewer than two
        populations are indexed for the name, or both `pop` and the
        second-highest population are 0). When only the second-highest
        population is 0 the ratio is reported as infinity.
        """
        # Read via .get() so asking about an unknown name doesn't insert an
        # empty list into the index.
        pops = sorted(self.get(keyify(name), ()), reverse=True)

        # pops[1] exists as soon as there are two entries.
        if len(pops) < 2:
            return None

        second = pops[1]

        # Unknown populations are stored as 0; avoid dividing by them.
        if second == 0:
            return float('inf') if pop else None

        return pop / second

In [19]:
class AllowBareName:
    """Predicate: may a city be referred to by a bare name (no state)?

    A name is allowed when fewer than two cities are counted under it, or
    when the city's population exceeds `min_ratio` times the second-highest
    population indexed for that name.
    """

    def __init__(self, min_ratio=10):
        """Build the name -> count and name -> populations indexes.

        Args:
            min_ratio (number): How many times larger than the second-highest
                population a city must be to claim a shared name.
        """
        self.min_ratio = min_ratio
        self.name_counts = NameCounts()
        self.name_pops = NamePopulations()

    def __call__(self, city, name):
        """Decide whether `name` alone may identify `city`.

        Args:
            city: Object with a `population` attribute (may be None).
            name (str)

        Returns: bool
        """
        # Fewer than two cities counted under this name.
        if self.name_counts[name] < 2:
            return True

        # A missing population counts as 0.
        pop = city.population or 0

        all_pops = sorted(self.name_pops[name], reverse=True)

        # A second-highest population exists as soon as two are indexed.
        if len(all_pops) < 2:
            return False

        # Same test as `pop / all_pops[1] > min_ratio`, written as a product
        # so a second-highest population of 0 can't raise ZeroDivisionError.
        return pop > self.min_ratio * all_pops[1]

In [20]:
# Constructing the predicate builds both name indexes (counts, populations).
allow_bare = AllowBareName()

2018-07-19 11:29:07,298 | INFO : Indexing name -> counts.
2018-07-19 11:29:08,968 | INFO : Indexing name -> populations.


344249it [00:11, 30364.90it/s]


In [25]:
# Upper-case input works because both indexes keyify the lookup text.
# NOTE(review): the recorded result is False. name_counts reports 2 for this
# name while name_pops holds a single population, which is too few for the
# population-ratio test. The two indexes disagree; worth investigating.
allow_bare(City.query.filter(City.name=='New York').first(), 'NEW YORK')

[8175133] 8175133


False

In [31]:
# Populations indexed under this name.
allow_bare.name_pops['new york']

[8175133]

In [32]:
# Number of City.name values that keyify to this name.
allow_bare.name_counts['new york']

2

In [7]:
# Ways of writing the country. Each one is appended to city names (and to
# name + state combinations) when index keys are generated, in this order.
USA_NAMES = (
    'USA',
    'United States',
    'United States of America',
    'US',
    'America',
)

class USCityKeyGen:
    """Generate the index keys under which a US city can be looked up.

    Calling an instance with a city yields keyified keys; `raw_keys_iter`
    yields the same keys before normalization.
    """

    def __init__(self, big_pop_threshold=500000):
        """
        Args:
            big_pop_threshold (int): A city whose population is strictly
                above this value gets bare-name keys even when its name is
                not unique.
        """
        self.big_pop_threshold = big_pop_threshold

    # TODO: Separate class?
    @cached_property
    def int_name_counts(self):
        """Counter of keyified `City.name` values, computed on first access.
        """
        # The query isn't filtered by country, so counts cover every city.
        names = [keyify(r[0]) for r in session.query(City.name)]
        return Counter(names)

    def raw_keys_iter(self, city):
        """Enumerate index keys for a city.

        Args:
            city (db.City)

        Yields: str
        """
        names = (city.name, *city.alt_names)

        is_big = bool(
            city.population and
            city.population > self.big_pop_threshold
        )

        name_is_unique = self.int_name_counts[keyify(city.name)] == 1

        # If unique or high population.
        if is_big or name_is_unique:

            # Bare name
            yield from names

            # Bare name + USA
            for name, usa in product(names, USA_NAMES):
                yield ' '.join((name, usa))

        # Skip a missing state name / abbreviation instead of letting
        # ' '.join() raise TypeError and lose every key for the city.
        states = tuple(
            state for state in (city.name_a1, city.us_state_abbr)
            if state
        )

        # Name + state
        for name, state in product(names, states):
            yield ' '.join((name, state))

        # Name + state + USA
        for name, state, usa in product(names, states, USA_NAMES):
            yield ' '.join((name, state, usa))

    def __call__(self, city):
        """Yield the keyified form of every raw key for `city`.
        """
        for text in self.raw_keys_iter(city):
            yield keyify(text)

In [8]:
# Key generator instance for trying out individual cities.
city_key_iter = USCityKeyGen()

In [9]:
# Two US cities to generate keys for: Los Angeles and Tuscaloosa.
la = City.query.filter(City.country_iso=='US').filter(City.name=='Los Angeles').first()
tt = City.query.filter(City.country_iso=='US').filter(City.name=='Tuscaloosa').first()

In [10]:
# Every index key generated for Los Angeles.
list(city_key_iter(la))

['los angeles',
 'la',
 'la la land',
 'east los angeles',
 'south los angeles',
 'west los angeles',
 'south central los angeles',
 'south central',
 'los angeles usa',
 'los angeles united states',
 'los angeles united states of america',
 'los angeles us',
 'los angeles america',
 'la usa',
 'la united states',
 'la united states of america',
 'la us',
 'la america',
 'la la land usa',
 'la la land united states',
 'la la land united states of america',
 'la la land us',
 'la la land america',
 'east los angeles usa',
 'east los angeles united states',
 'east los angeles united states of america',
 'east los angeles us',
 'east los angeles america',
 'south los angeles usa',
 'south los angeles united states',
 'south los angeles united states of america',
 'south los angeles us',
 'south los angeles america',
 'west los angeles usa',
 'west los angeles united states',
 'west los angeles united states of america',
 'west los angeles us',
 'west los angeles america',
 'south central 

In [11]:
# Every index key generated for Tuscaloosa.
list(city_key_iter(tt))

['tuscaloosa',
 'tuscaloosa usa',
 'tuscaloosa united states',
 'tuscaloosa united states of america',
 'tuscaloosa us',
 'tuscaloosa america',
 'tuscaloosa alabama',
 'tuscaloosa al',
 'tuscaloosa alabama usa',
 'tuscaloosa alabama united states',
 'tuscaloosa alabama united states of america',
 'tuscaloosa alabama us',
 'tuscaloosa alabama america',
 'tuscaloosa al usa',
 'tuscaloosa al united states',
 'tuscaloosa al united states of america',
 'tuscaloosa al us',
 'tuscaloosa al america']

In [12]:
class CityIndex:
    """In-memory index: keyified text -> set of city `wof_id` values.
    """

    def __init__(self):
        self._idx = defaultdict(set)

    def __getitem__(self, text):
        """Return the id set stored under the keyified form of `text`.

        The backing store is a defaultdict, so looking up an unknown key
        creates (and returns) an empty set.
        """
        return self._idx[keyify(text)]

    def query(self, text):
        """Load the City rows whose ids are indexed under `text`.

        Args:
            text (str)

        Returns: list of City
        """
        return [City.query.get(cid) for cid in self[text]]

    def build(self):
        """Index all US cities.

        A city whose keys can't be generated is skipped entirely, so it is
        never partially indexed. Skipped cities are logged, not ignored.
        """
        iter_keys = USCityKeyGen()

        cities = City.query.filter(City.country_iso=='US')

        skipped = 0

        for city in tqdm(cities):

            try:

                # Generate the complete key set up front, so a failure
                # part-way through leaves nothing indexed for this city.
                keys = list(iter_keys(city))
                wof_id = city.wof_id

            except Exception as e:
                skipped += 1
                logger.debug('Skipping %r: %r', city, e)
                continue

            # Reuse the keys generated above rather than generating twice.
            for key in keys:
                self[key].add(wof_id)

        if skipped:
            logger.warning(
                'Skipped %d cities whose keys could not be generated.',
                skipped,
            )

In [13]:
# Build the index over all US cities.
ci = CityIndex()
ci.build()

54727it [00:16, 3334.18it/s]


In [14]:
# Number of distinct keys in the index.
len(ci._idx)

787800

In [22]:
# Cities indexed under this key.
ci.query('east los angeles')

[City<East Los Angeles, California, United States>,
 City<Los Angeles, California, United States>]