In [42]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [43]:
# Load the raw feature table and report its (rows, columns) shape.
raw_csv_path = './data/raw/test_x.csv'
df0 = pd.read_csv(raw_csv_path)
print(df0.shape)

(68825, 75)


### Заполняю null values

In [44]:
# Work on a copy so the raw frame `df0` stays untouched and this cell can be re-run.
df3 = df0.copy()
# Forward-fill gaps in the nearest-city population: each missing value takes the
# last non-null value above it in row order. `Series.ffill()` replaces the
# deprecated `fillna(method='ffill')` spelling and produces the same result.
# NOTE(review): a missing value in the very first row has nothing to copy from
# and stays NaN.
df3['osm_city_nearest_population'] = df3['osm_city_nearest_population'].ffill()

### Объединяю категории points

In [45]:
# Point categories whose per-radius count columns are collapsed into one
# weighted score column named after the category.
points_types = [
    'amenity_points', 'building_points', 'catering_points', 'crossing_points',
    'culture_points', 'finance_points', 'healthcare_points', 'historic_points',
    'hotels_points', 'leisure_points', 'offices_points', 'shops_points',
]

df4 = df3.copy()
absorbed_columns = []  # per-radius source columns, removed in one pass after the loop
for point_type in points_types:
    col_0005 = f'osm_{point_type}_in_0.005'
    col_00075 = f'osm_{point_type}_in_0.0075'
    col_001 = f'osm_{point_type}_in_0.01'
    # Smaller radius suffix -> larger weight: 16 / 8 / 4 for 0.005 / 0.0075 / 0.01.
    score = 16 * df3[col_0005] + 8 * df3[col_00075] + 4 * df3[col_001]
    absorbed_columns.extend([col_0005, col_00075, col_001])

    # The 0.001 radius is only folded in (weight 32) when the frame has that column.
    col_0001 = f'osm_{point_type}_in_0.001'
    if col_0001 in df3.columns:
        score = score + 32 * df3[col_0001]
        absorbed_columns.append(col_0001)

    df4[point_type] = score

df4 = df4.drop(columns=absorbed_columns)

### Кодирую realty_type с помощью ohe

In [46]:
# Replace the `realty_type` column with one indicator column per category,
# named `realty_type_<category>`.
df5 = df4.copy()
ohe = OneHotEncoder()
# NOTE(review): the encoder is fitted on this frame only, so the resulting
# columns reflect the categories present here — confirm they match the
# columns produced for the training data.
oheRealtyType = ohe.fit_transform(df5[['realty_type']])
realtyTypeCategories = ['realty_type_' + str(category) for category in ohe.categories_[0]]
# fit_transform returned a sparse matrix; densify it before assigning the new columns.
df5[realtyTypeCategories] = oheRealtyType.toarray()
df5 = df5.drop(columns=['realty_type'])

### Кодирую названия городов

In [47]:
df7 = df5.copy()

# Assign every distinct city an integer code, numbered from 0 in order of
# first appearance in this frame.
# NOTE(review): codes are derived from this frame alone — confirm they agree
# with the city codes used for the training data.
city_mapping = {
    city_name: city_code
    for city_code, city_name in enumerate(pd.unique(df7['city']))
}
df7['city'] = df7['city'].apply(lambda city_name: city_mapping[city_name])

# Sanity check: shape after preprocessing and the fraction of raw rows kept.
print(df7.shape)
print(df7.shape[0] / df0.shape[0])

(68825, 45)
1.0


### Распределяю недвижимость по городам относительно численности населения

In [48]:
# Split the rows into three disjoint groups that together cover every row:
#   1. Moscow and Saint Petersburg,
#   2. other rows whose nearest-city population is at least 1e6,
#   3. everything else.
# Encoded ids of the two cities that get their own group.
msk_spb = [city_mapping['Москва'], city_mapping['Санкт-Петербург']]

in_msk_spb = df7['city'].isin(msk_spb)
in_million_city = df7['osm_city_nearest_population'] >= 1e6

moscow_spb = df7[in_msk_spb]
print(moscow_spb.shape)
population_gt_1M = df7[in_million_city & ~in_msk_spb]
print(population_gt_1M.shape)
# TODO: add more categories for small cities if prediction accuracy turns out
# to be low (use external sources for city population, because
# osm_city_nearest_population is not suitable for small cities).
# NOTE: despite its name, this group holds every remaining row with population
# below 1e6, not below 100K. It is built as the complement of the two groups
# above, so rows with a missing population (where both `>= 1e6` and `< 1e6`
# are False) land here instead of being silently dropped.
population_lt_100K = df7[~in_million_city & ~in_msk_spb]
print(population_lt_100K.shape)
print(moscow_spb.shape[0] + population_gt_1M.shape[0] + population_lt_100K.shape[0])
# The three masks partition the frame, so no row may be lost or duplicated.
assert moscow_spb.shape[0] + population_gt_1M.shape[0] + population_lt_100K.shape[0] == df7.shape[0]

(18569, 45)
(21554, 45)
(28702, 45)
68825


In [49]:
# Write each population group to its own CSV file under ./data/test/.
# `to_csv` is called with its defaults, exactly as before.
for group_frame, group_csv_path in (
    (moscow_spb, './data/test/moscow_spb.csv'),
    (population_gt_1M, './data/test/population_gt_1M.csv'),
    (population_lt_100K, './data/test/population_lt_100K.csv'),
):
    group_frame.to_csv(group_csv_path)