In [85]:
import pandas as pd
from geopy import distance
from decimal import Decimal

In [86]:
# Load the source table. The latitude/longitude columns are parsed into
# Decimal objects by the converters instead of pandas' default float parsing.
geo_data = pd.read_csv(
    'geo-by-area.csv',
    converters=dict.fromkeys(('latitude', 'longitude'), Decimal),
)

In [87]:
# Start with an all-missing object column; the following cells fill it in
# category by category.
geo_data['new_area'] = pd.Series(index=geo_data.index, dtype='object')

In [88]:
# Every row whose town is exactly 'Minsk' becomes the 'capital' category.
is_minsk = geo_data['town'].eq('Minsk')
geo_data.loc[is_minsk, 'new_area'] = 'capital'

In [89]:
# Town names that get the 'town_over100' category. Matching is done on the
# exact spelling used in the 'town' column.
towns_over100 = [
    'Orsha',
    'Borisov',
    'Soligorsk',
    'Navapolack',
    'Babruysk',
    'Mazyr',
    'Baranavičy',
    'Pinsk',
    'Lida',
]

In [90]:
# Rows are matched on town name only; the original 'area' value is not consulted here.
is_town_over100 = geo_data['town'].isin(towns_over100)
geo_data.loc[is_town_over100, 'new_area'] = 'town_over100'

In [91]:
# Rows originally tagged 'town' or 'region-center' whose town is NOT one of
# the names listed in towns_over100.
mask_below100 = (
    ~geo_data['town'].isin(towns_over100)
    & geo_data['area'].isin(['town', 'region-center'])
)

In [92]:
# Label every row selected by mask_below100. Note this overwrites any value
# already present in 'new_area' for those rows.
below100_labels = geo_data.index[mask_below100]
geo_data.loc[below100_labels, 'new_area'] = 'town_below100'

In [93]:
# Rows still without a new category keep their original one, but only when
# that original value is 'city', 'embassy' or 'village'.
remaining_mask = (
    geo_data['new_area'].isna()
    & geo_data['area'].isin(['city', 'embassy', 'village'])
)
geo_data.loc[remaining_mask, 'new_area'] = geo_data.loc[remaining_mask, 'area']

In [94]:
# Sanity check: display the rows that still have no category.
# The recorded output of this cell shows only the header, i.e. no such rows.
geo_data.loc[geo_data['new_area'].isna()]

Unnamed: 0,id,town,area,latitude,longitude,new_area


In [95]:
# Replace the original 'area' values with the newly derived categories.
geo_data = geo_data.assign(area=geo_data['new_area'])

In [96]:
# Keep only these five columns, in this order; every other column
# (including the helper 'new_area') is dropped.
geo_data = geo_data.loc[:, ['id', 'town', 'area', 'latitude', 'longitude']]

In [97]:
# Radii compared against geopy distances expressed in kilometres.
CITY_RADIUS = 10
MINSK_RADIUS = 20
# NOTE(review): not referenced by any code visible in this notebook section;
# presumably also in kilometres — confirm before relying on it.
EXTENDED_MINSK_RADIUS = 40

# Reference points as (latitude, longitude) pairs, the order in which the
# classification cells build their own points.
BREST_CENTER = (52.100681, 23.703214)
VICIEBSK_CENTER = (55.183909, 30.197171)
HOMEL_CENTER = (52.431225, 30.992659)
HRODNA_CENTER = (53.672348, 23.826575)
MAHILIOU_CENTER = (53.893391, 30.329823)
MINSK_CENTER = (53.905320, 27.553474)

# Minsk is deliberately kept out of this tuple: it has its own centre and radius.
CITY_CENTERS = (BREST_CENTER, VICIEBSK_CENTER, HOMEL_CENTER, HRODNA_CENTER, MAHILIOU_CENTER)

In [98]:
def is_city_suburb(point):
    """Tell whether ``point`` lies close to one of the centres in CITY_CENTERS.

    Args:
        point: A ``(latitude, longitude)`` pair that
            ``geopy.distance.distance`` accepts as a location.

    Returns:
        bool: True when the distance from ``point`` to at least one entry of
        CITY_CENTERS is strictly less than CITY_RADIUS kilometres, otherwise
        False.
    """
    # any() stops at the first centre in range and yields an explicit False
    # (instead of an implicit None) when no centre is close enough.
    return any(
        distance.distance(point, city).km < CITY_RADIUS
        for city in CITY_CENTERS
    )

In [100]:
def _get_area(point):
    """Derive a proximity-based area label for ``point``.

    Args:
        point: A ``(latitude, longitude)`` pair.

    Returns:
        'city' when ``is_city_suburb(point)`` is truthy; otherwise
        'minsk_suburb' when the point is at most MINSK_RADIUS kilometres from
        MINSK_CENTER; otherwise None. The city check takes precedence.
    """
    if not is_city_suburb(point):
        km_to_minsk = distance.distance(point, MINSK_CENTER).km
        return 'minsk_suburb' if km_to_minsk <= MINSK_RADIUS else None
    return 'city'
    
# Re-label rows that lie near a regional centre or near Minsk. Rows already
# marked 'capital' or 'city' are skipped. The candidate selection is taken
# once, up front, so writes made inside the loop do not change which rows
# are visited.
candidates = geo_data.loc[~geo_data['area'].isin(['capital', 'city'])]
for row_label, row in candidates.iterrows():
    point = (row['latitude'], row['longitude'])
    new_area = _get_area(point)

    # _get_area returns None when the point is near neither; keep the old label.
    if new_area is not None:
        # Write by index label instead of rescanning the whole 'id' column for
        # each row. Assumes the frame's index is unique (it is the default
        # index produced by read_csv unless changed elsewhere).
        geo_data.loc[row_label, 'area'] = new_area
    
    

In [101]:
# The region number is the integer value of the id's first '-'-separated
# segment (e.g. 6 for '06-135-0025').
geo_data['region'] = [
    int(record_id.partition('-')[0]) for record_id in geo_data['id']
]

# Embassy rows that landed in region 7 are moved to region 8.
geo_data.loc[
    geo_data['region'].eq(7) & geo_data['area'].eq('embassy'),
    'region',
] = 8

# Hand-picked corrections, keyed by record id and applied in this order.
manual_area_overrides = {
    '06-135-0025': 'town_below100',  # Jalizava
    '02-077-0019': 'town_below100',  # Varapajeva
    '04-112-0065': 'town_below100',  # Šajbaki
    '06-146-0024': 'city',           # Palykavičy
    '05-141-0025': 'minsk_suburb',   # Kopišča
}
for override_id, override_area in manual_area_overrides.items():
    geo_data.loc[geo_data['id'] == override_id, 'area'] = override_area

# Disabled step, kept for reference:
# geo_data.loc[(geo_data['region'] == 5) & (geo_data['area'] == 'village'), 'area'] = 'minsk_village'

In [102]:
# Persist the categorised table; the DataFrame index is not written out.
geo_data.to_csv(path_or_buf='geo-categorized.csv', index=False)