# 0. Imports, Data Loading, and Initial Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Lift the column display limit so rendered DataFrames are not truncated column-wise.
pd.options.display.max_columns = None

In [4]:
# Read the geofeatures table from the local output/ directory.
df = pd.read_csv(filepath_or_buffer='output/geofeatures_df.csv')

In [5]:
def _keep_last_comma_part(value):
    """Return the text after the last ', ' in ``value``.

    NaN values and values without any comma are returned unchanged.
    """
    if pd.isna(value) or ',' not in value:
        return value
    return value.split(', ')[-1]


df['street'] = df['street'].map(_keep_last_comma_part)

In [6]:
# Read the semicolon-separated export from the local features/ directory.
first = pd.read_csv(
    filepath_or_buffer='features/export-kr1_1-39-20221101.csv',
    sep=';',
)

In [7]:
# Columns of `first` that are selected when its records are looked up.
# 'address' is excluded from this selection.
first_columns = [
    "mun_obr_oktmo",
    "mun_obr",
    "mkd_code",
    "houseguid",
    "commission_year",
    "architectural_monument_status",
    "architectural_monument_category_id",
    "total_sq",
    "overhaul_fee_sq",
    "total_rooms_amount",
    "living_rooms_amount",
    "living_rooms_with_nonresidental_amount",
    "total_rooms_sq",
    "living_rooms_sq",
    "living_rooms_with_nonresidental_sq",
    "total_ppl",
    "number_floors_max",
    "number_floors_min",
    "money_collecting_way",
    "money_collecting_way_date_decision",
    "bank_bik",
    "money_ppl_collected",
    "loan_balance",
    "money_ppl_collected_debts",
    "overhaul_funds_spent_all",
    "overhaul_funds_spent_subsidy",
    "overhaul_fund_spent_other",
    "overhaul_funds_balance",
    "update_date_of_information",
    "money_ppl_collected_date",
    "owners_payment",
    "energy_efficiency",
    "previous_energy_efficiency",
    "energy_audit_date",
    "is_change_energy_efficiency",
    "alarm_document_date",
    "exclude_date_from_program",
    "reason_of_exclude_from_program_id",
    "inclusion_date_to_program",
    "comment",
    "last_update",
    "house_id",
]

In [8]:
# Substring markers used to classify one comma-separated address component.
_STREET_MARKERS = ('ул.', 'пер.', 'б-р', 'наб.', 'пр-кт', 'ш.', 'пл.', 'аллея', 'проезд')
_SETTLEMENT_MARKERS = ('п.', 'пгт.')
_STATION_MARKERS = ('ж/д_ст', 'ст.')


def _contains_any(text, markers):
    """Return True when at least one of ``markers`` occurs inside ``text``."""
    return any(marker in text for marker in markers)


def preprocess_first_dataset(row):
    """Split ``row['address']`` into named address components.

    The address is split on ', ' and every component is classified by the
    first rule that matches, in this order: city (starts with 'г. ', prefix
    removed), street, house number ('д.'), settlement, okrug ('ГО'),
    territory ('тер.'), station. A component matching no rule is stored as
    the house number. A later component of the same kind overwrites an
    earlier one.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key 'address'.

    Returns
    -------
    pandas.Series
        Component name -> text. Empty when the address is missing or is not
        a string, so such rows no longer abort a row-wise ``apply``.
    """
    address = row['address']
    if not isinstance(address, str):
        # NaN / None / non-text addresses carry no components.
        return pd.Series(dtype=object)

    parts = {}
    for component in address.split(', '):
        if component.startswith('г. '):
            parts['city'] = component[3:]
        elif _contains_any(component, _STREET_MARKERS):
            parts['street'] = component
        elif 'д.' in component:
            parts['house_number'] = component
        elif _contains_any(component, _SETTLEMENT_MARKERS):
            parts['settlement'] = component
        elif 'ГО' in component:
            parts['okrug'] = component
        elif 'тер.' in component:
            parts['territory'] = component
        elif _contains_any(component, _STATION_MARKERS):
            parts['station'] = component
        else:
            # Unclassified components are treated as the house number.
            parts['house_number'] = component
    return pd.Series(parts)

In [9]:
# Parse every address row-wise, then attach the resulting component columns to `first`.
address_parts = first.apply(preprocess_first_dataset, axis=1)
first = first.join(address_parts)

# 1. Settlement Preprocessing

In [10]:
def _strip_settlement_decorations(value):
    """Keep the text after the last 'п. ' and before the first ' (' of ``value``.

    NaN values are returned unchanged.
    """
    if pd.isna(value):
        return value
    after_prefix = value.split('п. ')[-1]
    return after_prefix.split(' (')[0]


first['settlement'] = first['settlement'].map(_strip_settlement_decorations)

# 2. Street Preprocessing

In [11]:
def preprocess_df_street(street):
    """Reorder a street name into "<ordinal> <name words> <street type>" form.

    A name is reordered only when it contains an ordinal word (a word that
    includes '-я' or '-й') or a word with an adjective-like two-letter ending
    ('ая', 'ое', 'ий', 'ый', 'яя', 'ой') that is not in the exclusion list.
    Every other name is returned as is.

    Compared with the previous implementation:
    * every name word is kept, in its original order (previously only the
      last one survived, e.g. 'улица Литовский Вал' became 'Вал улица');
    * a name without a recognised street-type word is returned unchanged
      instead of raising ``UnboundLocalError``.

    Parameters
    ----------
    street : str or NaN
        Street name; NaN is passed through.

    Returns
    -------
    str or NaN
        The normalized street name.
    """
    if pd.isna(street):
        return street

    street_names = ('улица', 'переулок', 'бульвар', 'набережная', 'проезд',
                    'площадь', 'аллея', 'шоссе', 'проспект', 'тупик')
    num_endings = ('-я', '-й')
    endings = ('ая', 'ое', 'ий', 'ый', 'яя', 'ой')
    # Words that look like adjectives but must not trigger reordering.
    exclude_endings = ('знаний', 'московское', *street_names)

    def is_ordinal(word):
        return any(ending in word for ending in num_endings)

    def is_adjective(word):
        return word[-2:] in endings and word.lower() not in exclude_endings

    words = street.split()
    if not any(is_ordinal(word) or is_adjective(word) for word in words):
        return street

    # Street-type words are matched case-sensitively and take precedence
    # over the ordinal check, as before.
    type_words = [word for word in words if word in street_names]
    if not type_words:
        # Nothing to move to the end; leave the name untouched.
        return street

    other_words = [word for word in words if word not in street_names]
    ordinals = [word for word in other_words if is_ordinal(word)]
    name_words = [word for word in other_words if not is_ordinal(word)]
    return " ".join([*ordinals, *name_words, *type_words])

def preprocess_first_street(street):
    """Normalize a street name taken from `first`.

    Steps, in order: expand abbreviated street types to full words, pass the
    result through ``preprocess_df_street``, then apply a fixed table of
    whole-name renames. NaN is returned unchanged.
    """
    if pd.isna(street):
        return street

    # Applied sequentially; the 'проспект.' entry cleans up the dot that is
    # left over when 'пр-кт.' has been expanded by the entry before it.
    expansions = (
        ('ул.', 'улица'),
        ('пер.', 'переулок'),
        ('б-р', 'бульвар'),
        ('наб.', 'набережная'),
        ('пл.', 'площадь'),
        ('ш.', 'шоссе'),
        ('пр-кт', 'проспект'),
        ("проспект.", 'проспект'),
        ("аллея.", "аллея"),
    )
    expanded = street
    for short_form, long_form in expansions:
        expanded = expanded.replace(short_form, long_form)

    normalized = preprocess_df_street(expanded)

    # Exact-match renames applied to the fully normalized name.
    renames = {
        "Тенистая аллея": "Тенистая улица",
        "улица 9 Апреля": "улица 9-го Апреля",
        "улица В. Талалихина": "улица Виктора Талалихина",
        "улица Д. Давыдова": "улица Давыдова",
        "улица Капитана Захарова": "улица Захарова",
        "улица Л.Толстого": "улица Льва Толстого",
        "улица младшего лейтенанта Ротко": "улица Младшего лейтенанта Ротко",
        "улица Памяти павших в Афганистане": "улица Памяти Павших в Афганистане",
        "улица Р.Люксембург": "улица Розы Люксембург",
        "улица С.Тюленина": "улица Сергея Тюленина",
    }
    if normalized in renames:
        return renames[normalized]
    return normalized

In [12]:
# Normalize the word order of every street name in `df`.
df_streets_normalized = df['street'].map(preprocess_df_street)
df['street'] = df_streets_normalized

In [13]:
# Address columns kept in the slice; district, microdistrict, complex,
# corpus, partnership and station are left out.
slice_columns = ['city', 'settlement', 'street', 'house_number']
df_slice = df[slice_columns]

In [14]:
# Expand abbreviations and normalize every street name in `first`.
first_streets_normalized = first['street'].map(preprocess_first_street)
first['street'] = first_streets_normalized

In [15]:
# Number of distinct street values in `df_slice`.
len(set(df_slice['street'].unique()))

474

In [16]:
# Street names that are subtracted when `unfound_streets` is computed.
old_unfound_streets = set((
    "1-й Октябрьский проезд",
    "Белорусская улица",
    "Букетная улица",
    "Взводная улица",
    "Жасминовая улица",
    "Иркутская улица",
    "Ключевая улица",
    "Крейсерская улица",
    "Московское",
    "Нефтяная улица",
    "Палубный переулок",
    "Полукольцо улица",
    "Старосаперная улица",
    "Таганрогская улица",
    "Тихорецкий тупик",
    "Украинская улица",
    "Флагманская улица",
    "переулок Ладушкина",
    "переулок Мира",
    "улица Генерала Хохлова",
    "улица Героя России Виталия Мариенко",
    "улица Михаила Светлова",
    "улица Поленова",
    "улица Румянцева",
    "улица Сергея Тюленина",
))

In [17]:
# Streets of `df_slice` that are neither in `first` nor in `old_unfound_streets`, sorted.
df_street_names = set(df_slice['street'].unique())
first_street_names = set(first['street'].unique())
unfound_streets = sorted(df_street_names - first_street_names - old_unfound_streets)

In [18]:
# Show the streets of `df_slice` found neither in `first` nor in `old_unfound_streets`.
unfound_streets

[]

# 3. House Number Preprocessing

In [19]:
def _drop_house_prefix(value):
    """Remove every 'д. ' occurrence from ``value``; NaN is returned unchanged."""
    if pd.isna(value):
        return value
    return value.replace("д. ", "")


first['house_number'] = first['house_number'].map(_drop_house_prefix)

In [20]:
# Number of distinct house-number values in `df_slice`.
distinct_house_numbers = df_slice['house_number'].unique()
len(distinct_house_numbers)

393

In [21]:
# House numbers of `df_slice` that never occur in `first`, sorted.
df_house_numbers = set(df_slice['house_number'].unique())
first_house_numbers = set(first['house_number'].unique())
unfound_house = sorted(df_house_numbers - first_house_numbers)

In [22]:
# How many house numbers of `df_slice` are absent from `first`.
len(unfound_house)

34

# 4. Join

In [23]:
# The ten decimal digits as one-character strings: '0' through '9'.
STR_DIGITS = list(map(str, range(10)))

In [24]:
def filter_number(string):
    """Extract the primary house number from ``string``.

    Only the part before the first '/' is inspected, and the result is the
    first contiguous run of decimal digits found there. Digits that follow a
    non-digit are ignored, so a building suffix does not get glued onto the
    number ('12к2' is 12; concatenating every digit used to give 122).

    Parameters
    ----------
    string : str
        House number text, e.g. '12', '12А', '12/2', '12к2'.

    Returns
    -------
    int or None
        The parsed number, or None when no digit is present before '/'.
    """
    before_slash = string.split('/')[0]
    digits = []
    for char in before_slash:
        if char in '0123456789':
            digits.append(char)
        elif digits:
            # The first digit run has ended; later digits belong to a suffix.
            break
    if not digits:
        return None
    return int("".join(digits))

def filter_number_list(string):
    """Parse every '-' (or '_') separated piece of ``string`` with ``filter_number``.

    Returns the list of parsed numbers in their original order; pieces for
    which ``filter_number`` returns None are skipped.
    """
    pieces = string.replace('_', '-').split('-')
    return [number for number in map(filter_number, pieces) if number is not None]

In [25]:
def _columns_as_lists(records):
    """Return a Series mapping each name in `first_columns` to the list of that column's values in ``records``."""
    return pd.Series({column: records[column].tolist() for column in first_columns})


def _closest_known_house(house_number, known_house_numbers):
    """Pick the entry of ``known_house_numbers`` that best fits ``house_number``.

    Every known entry is read as a numeric range via ``filter_number_list``
    (min..max of its numbers). Ranges are scanned in ascending order of their
    minimum:

    * a range that contains the target wins;
    * otherwise the last range that starts at or below the target wins;
    * when the target lies below every range, the lowest range wins.

    Returns None when the target has no digits or no known entry has any.
    """
    target = filter_number(house_number)
    if target is None:
        return None

    ranges = []
    for candidate in known_house_numbers:
        if pd.isna(candidate):
            continue
        numbers = filter_number_list(candidate)
        if numbers:
            ranges.append((min(numbers), max(numbers), candidate))
    if not ranges:
        return None

    # Stable sort by range start keeps the original order for equal starts.
    ranges.sort(key=lambda item: item[0])

    closest = ranges[0][2]
    for low, high, candidate in ranges:
        if low > target:
            break
        closest = candidate
        if target <= high:
            break
    return closest


def get_first_info(row):
    """Collect the `first` records that correspond to one row of `df`.

    Candidate records are the rows of `first` with the same street whose city
    equals the row's city or whose settlement equals the row's settlement.

    * No candidates: returns None.
    * House number is NaN: all candidates are used.
    * House number occurs among the candidates: only those records are used.
    * Otherwise the closest known house number is used (see
      ``_closest_known_house``). Previously the entry *before* the matching
      range was returned because the computed index was never used.
    * If house numbers cannot be compared numerically (no digits), all
      candidates are used instead of raising.

    Returns
    -------
    pandas.Series or None
        Indexed by `first_columns`; each value is the list of that column's
        values over the selected records.
    """
    city, settlement, street, house_number = row[['city', 'settlement', 'street', 'house_number']]

    same_locality = (first['city'] == city) | (first['settlement'] == settlement)
    street_records = first[same_locality & (first['street'] == street)]
    if street_records.empty:
        return None

    if pd.isna(house_number):
        return _columns_as_lists(street_records)

    known_house_numbers = street_records['house_number'].unique().tolist()
    if house_number not in known_house_numbers:
        house_number = _closest_known_house(house_number, known_house_numbers)
        if house_number is None:
            # Nothing to compare numerically; fall back to the whole street.
            return _columns_as_lists(street_records)

    selected = street_records[street_records['house_number'] == house_number]
    return _columns_as_lists(selected)

In [26]:
# Look up the `first` records for every row of `df`, then attach them as extra columns.
first_info = df.apply(get_first_info, axis=1)
df_first_info = df.join(first_info)

In [27]:
def _share_missing(values):
    """Return the fraction of NaN entries in ``values``."""
    return values.isna().mean()


# Share of rows without an `mkd_code`, per `new_object` value.
df_first_info.groupby('new_object')['mkd_code'].apply(_share_missing)

new_object
False    0.032933
True     0.462146
Name: mkd_code, dtype: float64

In [28]:
# Write the enriched table to the local output/ directory, without the index column.
df_first_info.to_csv(path_or_buf='output/df_house_info_1.csv', index=False)