In [1]:
import datetime
import re
from functools import reduce
from pathlib import Path

import geopy
import numpy as np
import pandas as pd
import tqdm
from geopy.distance import geodesic as GD
from geopy.extra.rate_limiter import RateLimiter

import preprocessing as prep

In [None]:
# Load the parsed dataset that the rest of this notebook enriches with geo features.
df = pd.read_csv('output/parsed_data.csv')

In [3]:
# Lift the column limit so wide frames are displayed without truncation.
pd.set_option('display.max_columns', None)

# 1. Latitude and Longitude

In [4]:
def street_preprocessing(street: str):
    """Rewrite a street name into the spelling used for geocoder queries.

    Missing values are returned unchanged. Otherwise a fixed list of literal
    substitutions is applied in order, and names that start with a street-type
    word and contain a "<digit>-" fragment have that leading word moved to
    the end.
    """
    if pd.isna(street):
        return street

    # (old, new) pairs, applied in exactly this order.
    substitutions = (
        (' Генерал-фельдмаршала', ""),
        ('бульвар Ф. Лефорта', "бульвар Франца Лефорта"),
        ('бульвар Рижский', 'Рижский бульвар'),
        ('бульвар Борисовский', 'Борисовский бульвар'),
        ('улица 9 Апреля', 'улица 9-го Апреля'),
        ('улица Летний проезд', 'Летний проезд'),
        ('Московское кв-л', 'Московский район, Московское'),
        ('улица А. Толстого', 'улица Алексея Толстого'),
        ("улица Воскресенская", "Воскресенская улица"),
        ("улица Д. Давыдова", "улица Давыдова"),
        ("улица Флагманская", "Флагманская улица"),
    )
    for old, new in substitutions:
        street = street.replace(old, new)

    leading_words = ("улица", "проезд", "переулок")
    for digit in "0123456789":
        words = street.split()
        # The first word is only inspected when the "<digit>-" fragment is present.
        if f"{digit}-" in street and words[0] in leading_words:
            street = " ".join(words[1:] + words[:1])

    # The rotation above also moves the word in "улица 9-го Апреля"; put it back.
    return street.replace("9-го Апреля улица", 'улица 9-го Апреля')

def house_number_preprocessing(row: pd.Series):
    """Split a row's 'house_number' into a house number and a corpus part.

    The value is split once on the last 'к' and each piece is split again on
    'Дс'. When that yields more than one piece, the first becomes
    'house_number' and the second becomes 'corpus'; otherwise (including for
    missing values) only 'house_number' is returned, unchanged.
    """
    raw_number = row['house_number']
    result = {'house_number': raw_number}

    if not pd.isna(raw_number):
        pieces = []
        for chunk in raw_number.rsplit('к', 1):
            pieces.extend(chunk.split('Дс'))
        if len(pieces) != 1:
            result = {'house_number': pieces[0], 'corpus': pieces[1]}

    return pd.Series(result)
    
def settlement_preprocessing(setl: str):
    """Clean a settlement name: drop the 'пос.' marker, respell
    'Ново-Дорожный' as 'Новодорожный' and remove any ' (...)' suffix.

    Missing values are returned unchanged.
    """
    if pd.isna(setl):
        return setl

    for fragment, substitute in (
        (' пос.', ''),
        ('пос. ', ''),
        ('Ново-Дорожный', 'Новодорожный'),
    ):
        setl = setl.replace(fragment, substitute)

    # Strip a parenthesised remark together with the space in front of it.
    return re.sub(r' \([^)]*\)', '', setl)

In [5]:
def _strip_marker(marker):
    """Build a mapper that removes `marker` from non-missing values."""
    def strip(value):
        return value if pd.isna(value) else value.replace(marker, '')
    return strip


# Normalise each address component in place.
df['settlement'] = df['settlement'].map(settlement_preprocessing)
df['microdistrict'] = df['microdistrict'].map(_strip_marker(' мкр'))
df['district'] = df['district'].map(_strip_marker('р-н '))
df['street'] = df['street'].map(street_preprocessing)

# Replace the raw house number with the split columns returned per row.
house_parts = df.apply(house_number_preprocessing, axis=1)
df = df.drop('house_number', axis=1).join(house_parts)

In [6]:
# Settlements left out of the query when the row also names a city.
_REDUNDANT_SETTLEMENTS = ('Чайковского', 'Лермонтовский', 'Воздушный', 'Андреевский Посад')

# Settlements for which the query stops at the settlement itself: no
# microdistrict, street or house number is appended.
# NOTE(review): `settlement_preprocessing` rewrites 'Ново-Дорожный' to
# 'Новодорожный', so that entry presumably never matches once the settlement
# column has been cleaned - confirm which spelling is intended.
_SETTLEMENT_ONLY = (
    'Прибрежное',
    'Большое Исаково',
    'Васильково',
    'Малое Васильково',
    'Ласкино',
    'Ново-Дорожный',
    'Дружный',
    'Колосовка',
    'Отважное',
    'Переславское',
)

# Streets that are replaced by a fixed "street, house" string.
_STREET_OVERRIDES = {
    'улица Поленова': "Декоративная улица, 19",
    'улица Крейсерская': "улица Понартская, 7",
}

# House-number substitutions applied to addresses on улица Олега Кошевого.
_KOSHEVOGO_HOUSE_OVERRIDES = {"1": "3", "56А": "58"}


def make_parsable_address(row: pd.Series) -> str:
    """Build the comma-separated address string used as the geocoder query.

    Parameters
    ----------
    row : pd.Series
        Must provide 'region', 'okrug', 'city', 'district', 'settlement',
        'microdistrict', 'street' and 'house_number'. Missing components
        are skipped.

    Returns
    -------
    str
        'Россия' followed by the available components joined with ', ',
        after a handful of hard-coded corrections for specific addresses.
    """
    settlement = row['settlement']
    street = row['street']

    parts = ['Россия', row['region'], row['okrug'], row['city'], row['district']]

    if pd.isna(row['city']) or settlement not in _REDUNDANT_SETTLEMENTS:
        parts.append(settlement)

    if settlement not in _SETTLEMENT_ONLY:
        if pd.isna(street):
            # No street: fall back to the microdistrict (house number is not used).
            parts.append(row['microdistrict'])
        elif street in _STREET_OVERRIDES:
            parts.append(_STREET_OVERRIDES[street])
        else:
            parts.extend([street, row['house_number']])

    address = ", ".join(part for part in parts if not pd.isna(part))

    address = (
        address
        .replace("Гурьевский городской округ", "Гурьевский муниципальный округ")
        .replace("Калининград, Ленинградский, Старокаменная улица", "Старокаменная улица")
    )

    last_component = address.rsplit(', ', 1)[-1]

    # Address-specific corrections; only the first matching rule is applied.
    if 'Виталия Мариенко' in address:
        # Swap the last two components for a fixed street and house.
        address = address.rsplit(', ', 2)[0] + ", улица Артиллерийская, 87а"
    elif 'Рижский бульвар' in address:
        if last_component == 'Рижский бульвар':
            # No house number present: append a default one.
            address = address + ", 2"
    elif 'улица Олега Кошевого' in address:
        if last_component in _KOSHEVOGO_HOUSE_OVERRIDES:
            address = (
                address.rsplit(', ', 1)[0]
                + ', '
                + _KOSHEVOGO_HOUSE_OVERRIDES[last_component]
            )
    elif "Центральный, улица Жасминовая" in address:
        address = 'Россия, Калининградская область, Гурьевский муниципальный округ, ДНТ Лотос'
    return address

In [7]:
# Build one geocoder query string per row.
df['parsable_address'] = df.apply(make_parsable_address, axis=1)

In [8]:
# Load the cache of already geocoded addresses. On a first run the file does
# not exist yet (it is only written further down), so start from an empty
# cache instead of failing with FileNotFoundError.
loc_address_path = Path('data/loc_address.csv')
if loc_address_path.exists():
    loc_address = pd.read_csv(loc_address_path)
else:
    loc_address = pd.DataFrame({'parsable_address': pd.Series(dtype='object')})

# A left merge with an indicator marks the addresses that are absent from the cache.
outer = df[['parsable_address']].merge(
    loc_address[['parsable_address']],
    how='left',
    on='parsable_address',
    indicator=True,
)
new_addresses = (
    outer.loc[outer['_merge'] == 'left_only', 'parsable_address']
    .drop_duplicates()
    .tolist()
)

In [9]:
# Geocode every address that is not in the cache yet.
loc_dict = {}
locator = geopy.Nominatim(user_agent="myGeocoder")

# The public Nominatim service allows at most one request per second.
# RateLimiter spaces the calls out, retries transient failures and, with its
# default settings, returns None instead of raising once the retries are
# exhausted, so one bad request no longer discards the results collected so far.
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

for address in tqdm.tqdm(new_addresses):
    # None is stored for addresses that could not be resolved.
    loc_dict[address] = geocode(address)

0it [00:00, ?it/s]


In [10]:
def preprocess_locator_info(row: pd.Series):
    """Flatten a geocoder result into a Series of 'locator_*' fields.

    Parameters
    ----------
    row : pd.Series
        Must contain 'loc': an object exposing a `.raw` mapping, or a falsy
        value (e.g. None) when the lookup found nothing.

    Returns
    -------
    pd.Series
        The entries of `loc.raw` with every key prefixed by 'locator_'.
        For a failed lookup, a single missing 'locator_lat' value, so the
        failure shows up in the same column that successful lookups fill.
    """
    loc = row['loc']
    if not loc:
        # The key used to be "f_locator_lat" (a stray f-string prefix), which
        # created a junk column instead of a missing 'locator_lat'.
        return pd.Series({"locator_lat": None})
    return pd.Series(loc.raw).add_prefix("locator_")

In [11]:
def _first_post_index(display_name):
    """Return the first six-digit component of a ', '-separated display name.

    Returns NaN when the name is missing or has no such component.
    """
    if pd.isna(display_name):
        return np.nan
    for token in display_name.split(', '):
        # Exactly six ASCII digits - the same strings the former
        # million-element lookup set contained.
        if re.fullmatch(r'[0-9]{6}', token):
            return token
    return np.nan


Path("./data").mkdir(exist_ok=True)
if loc_dict:
    # One row per newly geocoded address, with the raw locator fields flattened.
    new_loc_address = pd.Series(loc_dict).reset_index()
    new_loc_address.columns = ['parsable_address', 'loc']
    new_loc_address = new_loc_address.drop('loc', axis=1).join(
        new_loc_address.apply(preprocess_locator_info, axis=1)
    )
    loc_address = pd.concat([loc_address, new_loc_address], axis=0).reset_index(drop=True)

    # Taking the first match also covers names with several six-digit
    # components, which previously raised an "ambiguous truth value" error.
    loc_address['post_index'] = loc_address['locator_display_name'].map(_first_post_index)

    loc_address.to_csv('data/loc_address.csv', index=False)

In [12]:
# Re-read the cache and list the addresses that still have no latitude.
loc_address = pd.read_csv('data/loc_address.csv')
unique_addresses = df[['parsable_address']].drop_duplicates()
cur_adresses = unique_addresses.merge(loc_address, on='parsable_address', how='left')
unresolved = cur_adresses['locator_lat'].isna()
cur_adresses.loc[unresolved, 'parsable_address'].tolist()

[]

In [13]:
# Columns of the geocoding cache that are merged into the main frame.
# The commented-out entries are cache columns that are deliberately left out.
loc_feats = [
    'parsable_address',
    # 'locator_boundingbox',
    'locator_class',
    # 'locator_display_name',
    'locator_importance',
    'locator_lat',
    'locator_lon',
    # 'locator_osm_id',
    'locator_osm_type',
    # 'locator_place_id',
    'locator_type',
    'post_index',
]

In [14]:
# Attach the selected cache columns, exposing the coordinates as 'lat' / 'lon'.
geo_columns = loc_address[loc_feats].rename(
    columns={'locator_lat': 'lat', 'locator_lon': 'lon'}
)
df = df.merge(geo_columns, on='parsable_address', how='left')

In [15]:
# Plotting setup: figures are shown through plotly's 'iframe' renderer.
import plotly.io as pio
import plotly.express as px
pio.renderers.default = 'iframe'

In [None]:
# Map of the listings: marker colour follows the price per metre, marker size the area.
hover_columns = ['new_object', 'price_per_metr', 'area', 'address']
# Rows without a price get a fixed value above the displayed colour range.
marker_color = df['price_per_metr'].fillna(600000)

fig = px.scatter_mapbox(
    df,
    lat="lat",
    lon="lon",
    title="Kaliningrad",
    hover_data=hover_columns,
    color=marker_color,
    color_continuous_scale=px.colors.sequential.Plotly3_r,
    range_color=[60000, 300000],
    size='area',
    zoom=20,
    height=1000,
    width=1000,
)

fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()

#### The results can be seen in 02_geo_prices.html

# 2. The distance to cities

In [17]:
# Russian city name (used in the geocoder query) -> Latin name (used as the key/column name).
cities = {
    "Калининград": "Kaliningrad",
    "Зеленоградск": "Zelenogradsk",
    "Светлогорск": "Svetlogorsk",
    "Гурьевск": "Gurevsk",
    "Балтийск": "Baltiisk",
}

cities_dict = {}

# Throttle the lookups: the public Nominatim service allows one request per second.
geocode_city = RateLimiter(locator.geocode, min_delay_seconds=1)

for city, latin_name in cities.items():
    query = f"Россия, Калининградская область, {city}"
    loc = geocode_city(query)
    if loc is None:
        # geocode returns None when nothing is found; fail with a clear message
        # instead of an AttributeError on `loc.latitude`.
        raise ValueError(f"No geocoding result for {query!r}")
    cities_dict[latin_name] = (loc.latitude, loc.longitude)

In [18]:
# Extra reference point with hard-coded coordinates (presumably the city centre - confirm).
cities_dict['Kaliningrad_center'] = (54.719889, 20.501692)

In [19]:
# Keys of cities_dict, in insertion order; each becomes a column name.
cities_list = list(cities_dict)

In [20]:
# Give every row one column per reference point holding its (lat, lon) tuple.
city_points = pd.DataFrame([cities_dict] * len(df))
df = df.join(city_points)

In [21]:
# Display the reference coordinates for a visual sanity check.
cities_dict

{'Kaliningrad': (54.710128, 20.5105838),
 'Zelenogradsk': (54.9600365, 20.4750188),
 'Svetlogorsk': (54.9417552, 20.1550863),
 'Gurevsk': (54.7745571, 20.6037041),
 'Baltiisk': (54.6437214, 19.8941584),
 'Kaliningrad_center': (54.719889, 20.501692)}

In [22]:
# Overwrite each reference-point column with the geodesic distance (km)
# from the listing to that point.
for city in cities_list:
    df[city] = [
        GD((lat, lon), city_point).km
        for lat, lon, city_point in zip(df['lat'], df['lon'], df[city])
    ]

# 3. The distance to schools

### The schools, parks and other datasets were prepared manually, without automation

In [23]:
# Load the prepared schools table.
clean_schools = pd.read_csv('data/clean_schools.csv')

In [24]:
# Store each school's coordinates as a (lat, lon) tuple.
clean_schools['loc'] = list(zip(clean_schools['lat'], clean_schools['lon']))

In [25]:
def calculate_distances_to_school(row, top=3):
    """Describe the `top` schools nearest to a listing.

    Parameters
    ----------
    row : pd.Series
        A listing with 'lat' and 'lon'.
    top : int, default 3
        Number of nearest schools to report.

    Returns
    -------
    pd.Series
        For each rank n (0 = nearest): 'school_{n}_id', 'school_{n}_forma',
        'school_{n}_type' and 'school_{n}_dist' (geodesic distance in km).

    Reads the module-level `clean_schools` frame, which must have the columns
    'id', 'forma', 'type' and 'loc'.
    """
    origin = (row['lat'], row['lon'])

    # Walk the columns in lockstep instead of `iterrows()`: that avoids building
    # a Series per school for every listing, and it no longer shadows the `row`
    # parameter with the loop variable.
    candidates = [
        # NOTE(review): 'schhol_' looks like a typo for 'school_'; it is kept
        # because the value ends up in the exported data.
        (f'schhol_{school_id}', forma, school_type, GD(origin, school_loc).km)
        for school_id, forma, school_type, school_loc in zip(
            clean_schools['id'],
            clean_schools['forma'],
            clean_schools['type'],
            clean_schools['loc'],
        )
    ]
    candidates.sort(key=lambda school: school[3])

    nearest = {}
    for rank, (school_id, forma, school_type, dist) in enumerate(candidates[:top]):
        nearest[f'school_{rank}_id'] = school_id
        nearest[f'school_{rank}_forma'] = forma
        nearest[f'school_{rank}_type'] = school_type
        nearest[f'school_{rank}_dist'] = dist

    return pd.Series(nearest)

In [26]:
%%time
# Nearest-school features for every listing (one geodesic computation per listing/school pair).
schools_info = df.apply(calculate_distances_to_school, axis=1)

CPU times: total: 13min 5s
Wall time: 13min 7s


In [27]:
# Attach the school features to the listings by index.
df = df.join(schools_info)

# 4. The distance to parks, airports, railway stations

In [38]:
%%time
# One distance column (km) per park, station and airport, named "<kind>_<name>".
for place in ['parks', 'stations', 'airports']:
    df_place = pd.read_csv(f'data/{place}.csv')
    df_place['loc'] = list(zip(df_place['lat'], df_place['lon']))
    for placename, place_loc in zip(df_place['name'], df_place['loc']):
        df[f"{place}_{placename}"] = [
            GD((lat, lon), place_loc).km
            for lat, lon in zip(df['lat'], df['lon'])
        ]

CPU times: total: 1min 16s
Wall time: 1min 16s


In [40]:
# Persist the enriched frame without the index column.
df.to_csv('output/geofeatures_df.csv', index=False)