In [1]:
import pandas as pd
import numpy as np
import json

## Восстановление названий городов через координаты

## Вычисление расстояний до центров городов / до центров столиц регионов

In [2]:
# Load the reference list of Russian cities and flatten it into a table.
# The file contains Cyrillic names, so the encoding is pinned to UTF-8 rather
# than left to the platform default (which is not UTF-8 everywhere).
with open('../data/russian-cities.json', 'r', encoding='utf-8') as json_file:
    cities_raw = json.load(json_file)

# One row per city; the nested 'coords' mapping is lifted into flat columns.
cities_data = pd.DataFrame(
    [
        [
            city_dict['name'],
            city_dict['subject'],
            city_dict['district'],
            city_dict['population'],
            city_dict['coords']['lat'],
            city_dict['coords']['lon'],
        ]
        for city_dict in cities_raw
    ],
    columns=[
        'city_name', 'rf_subject', 'rf_district',
        'city_population',
        'city_lat', 'city_lon',
    ],
)

# Respell two city names.
# NOTE(review): presumably this aligns them with the spellings used in
# rf_subjects.txt so the capital lookup by name succeeds - confirm.
cities_data.loc[cities_data['city_name'] == 'Орёл', 'city_name'] = 'Орел'
cities_data.loc[cities_data['city_name'] == 'Великий Новгород', 'city_name'] = 'Новгород'

# Row label -> city name. The frame has a default 0..n-1 index, so the labels
# coincide with the positional indices returned by a KDTree built on this frame.
mapping_idx_to_city_name = cities_data['city_name'].to_dict()

In [3]:
# Parse the tab-separated list of federal subjects and their capitals.
# Encoding is pinned to UTF-8 (Cyrillic content) instead of the platform default.
with open('../data/rf_subjects.txt', 'r', encoding='utf-8') as txt_file:
    rf_subject_lines = txt_file.readlines()

# Skip the first line and the last three lines, and keep the fields from the
# third one onward; exactly two fields must remain (subject, capital).
# NOTE(review): the skipped lines are presumably a header and a footer - verify
# against the file.
rf_subject_rows = [line.split('\t')[2:] for line in rf_subject_lines[1:-3]]
rf_subjects = pd.DataFrame(rf_subject_rows, columns=['subject', 'capital'])

# Keep the text after the last 'г. ' marker and drop the final character
# (the trailing newline left by readlines()).
rf_subjects['capital'] = rf_subjects['capital'].apply(lambda x: x.split('г. ')[-1][:-1])

# city_name is not guaranteed to be unique in cities_data. Joining on it
# directly would emit one row per namesake, duplicating the subject and adding
# a wrong coordinate pair to the capital index. Keep a single candidate per name.
# NOTE(review): assumes the regional capital is the most populous city bearing
# its name - verify for the namesakes actually present in the data.
capital_candidates = (
    cities_data[['city_name', 'city_population', 'city_lat', 'city_lon']]
    .sort_values('city_population', ascending=False, kind='mergesort')
    .drop_duplicates(subset='city_name', keep='first')
    [['city_name', 'city_lat', 'city_lon']]
)

# Left join: every subject is kept; capitals with no match get NaN coordinates.
# validate= documents (and enforces) that the right-hand keys are unique.
rf_subjects = pd.merge(
    rf_subjects,
    capital_candidates,
    left_on=['capital'],
    right_on=['city_name'],
    how='left',
    validate='many_to_one'
).drop(columns=['city_name'])

# Row label -> capital name; merge() returns a fresh 0..n-1 index, so labels
# coincide with positional indices of a KDTree built on this frame.
mapping_idx_to_capital_name = rf_subjects['capital'].to_dict()

In [4]:
from sklearn.neighbors import KDTree

# Nearest-neighbour indexes over the (lat, lon) pairs of all cities and of the
# regional capitals. Row order of each tree matches the row order of its frame.
# NOTE(review): the metric is plain Euclidean on the raw coordinate values, so
# distances come out in coordinate units (presumably degrees), not kilometres,
# and longitude differences are not scaled by latitude - confirm this is intended.
city_coords = cities_data[['city_lat', 'city_lon']].to_numpy()
kdtree_city = KDTree(city_coords, metric='euclidean')

capital_coords = rf_subjects[['city_lat', 'city_lon']].to_numpy()
kdtree_capital = KDTree(capital_coords, metric='euclidean')

In [5]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids columns holding a mix of types.
# NOTE(review): the stray output line under this cell looks like the tail of a
# pandas DtypeWarning ("Columns ... have mixed types") - confirm; an explicit
# dtype= mapping would be stricter still.
train = pd.read_csv('../data/train.csv', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
# For every train row, find the single nearest city and the single nearest
# regional capital. Each query returns a (distances, indices) pair of 2-D
# arrays with one column, because only one neighbour (k=1) is requested.
query = train.loc[:, ['lat', 'lng']].to_numpy()

city_distances, city_indicies = kdtree_city.query(query, k=1)
capital_distances, capital_indicies = kdtree_capital.query(query, k=1)

In [7]:
# Flatten the single-neighbour query results to one value per train row.
nearest_city_pos = city_indicies[:, 0].tolist()
nearest_capital_pos = capital_indicies[:, 0].tolist()

# Translate tree positions to names and store the matching distances.
train['rebuilt_city_name'] = list(map(mapping_idx_to_city_name.__getitem__, nearest_city_pos))
train['distance_to_city_center'] = city_distances[:, 0]
train['rebuilt_capital_name'] = list(map(mapping_idx_to_capital_name.__getitem__, nearest_capital_pos))
train['distance_to_capital_center'] = capital_distances[:, 0]

In [17]:
# Attach the region, federal district and population of the nearest city to
# every train row.
#
# The attributes are taken by position from the nearest-city indices rather
# than joined on the city name: city_name is not guaranteed to be unique in
# cities_data, and a name-based left join would emit one output row per
# namesake, silently duplicating train rows and attaching the wrong region.
# The positional lookup always yields exactly one row per train row.
city_attr_cols = ['rf_subject', 'rf_district', 'city_population']

# Guard against stale state: city_indicies must be the query result for train.
assert len(city_indicies) == len(train), (
    'city_indicies does not match train; re-run the nearest-city query for train first'
)

nearest_cities = cities_data.iloc[city_indicies[:, 0]]
train = train.assign(
    **{col: nearest_cities[col].to_numpy() for col in city_attr_cols}
)

In [19]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of types.
# NOTE(review): the train.csv load in this notebook shows what looks like a
# mixed-type warning in its output; test.csv presumably shares that schema, so
# it is read with whole-file inference as well - confirm.
test = pd.read_csv('../data/test.csv', low_memory=False)

In [20]:
# For every test row, find the single nearest city and the single nearest
# regional capital. Each query returns a (distances, indices) pair of 2-D
# arrays with one column, because only one neighbour (k=1) is requested.
# Note: this rebinds the same result names that were used for train.
query = test.loc[:, ['lat', 'lng']].to_numpy()

city_distances, city_indicies = kdtree_city.query(query, k=1)
capital_distances, capital_indicies = kdtree_capital.query(query, k=1)

In [21]:
# Flatten the single-neighbour query results to one value per test row.
nearest_city_pos = city_indicies[:, 0].tolist()
nearest_capital_pos = capital_indicies[:, 0].tolist()

# Translate tree positions to names and store the matching distances.
test['rebuilt_city_name'] = list(map(mapping_idx_to_city_name.__getitem__, nearest_city_pos))
test['distance_to_city_center'] = city_distances[:, 0]
test['rebuilt_capital_name'] = list(map(mapping_idx_to_capital_name.__getitem__, nearest_capital_pos))
test['distance_to_capital_center'] = capital_distances[:, 0]

In [22]:
# Attach the region, federal district and population of the nearest city to
# every test row.
#
# The attributes are taken by position from the nearest-city indices rather
# than joined on the city name: city_name is not guaranteed to be unique in
# cities_data, and a name-based left join would emit one output row per
# namesake, silently duplicating test rows and attaching the wrong region.
# The positional lookup always yields exactly one row per test row.
city_attr_cols = ['rf_subject', 'rf_district', 'city_population']

# Guard against stale state: city_indicies must be the query result for test.
assert len(city_indicies) == len(test), (
    'city_indicies does not match test; re-run the nearest-city query for test first'
)

nearest_cities = cities_data.iloc[city_indicies[:, 0]]
test = test.assign(
    **{col: nearest_cities[col].to_numpy() for col in city_attr_cols}
)