In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor

# Смотрим данные

In [2]:
# Load the training set. low_memory=False makes pandas parse each column in a
# single pass instead of in chunks, which avoids mixed-type inference.
train = pd.read_csv('train.csv', low_memory=False)
# Preview the first rows.
train.head()

Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,...,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,region,total_square,street,date,realty_type,price_type
0,Пермь,,COL_0,57.998207,56.292797,4,19,35,52,0,...,5.762963,5.530612,1964.118519,1960.959184,Пермский край,32.0,S27289,2020-01-05,10,0
1,Шатура,,COL_1,55.574284,39.543835,3,24,37,59,0,...,2.894366,3.527778,1952.321678,1957.222222,Московская область,280.0,S17052,2020-01-05,10,0
2,Ярославль,,COL_2,57.61914,39.850525,1,30,67,128,0,...,6.141414,7.222222,1968.15,1973.37037,Ярославская область,297.4,S16913,2020-01-05,110,0
3,Новокузнецк,,COL_3,53.897083,87.108604,0,0,5,21,0,...,8.581081,9.0,1992.716216,2014.0,Кемеровская область,190.0,S10148,2020-01-05,110,0
4,Москва,,COL_4,55.80259,37.48711,1,23,64,153,0,...,7.263889,5.684211,1963.229167,1960.5,Москва,60.2,S1338,2020-01-05,10,0


In [3]:
# Print dtypes and non-null counts for every column of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279792 entries, 0 to 279791
Data columns (total 77 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   city                                 279792 non-null  object 
 1   floor                                103555 non-null  object 
 2   id                                   279792 non-null  object 
 3   lat                                  279792 non-null  float64
 4   lng                                  279792 non-null  float64
 5   osm_amenity_points_in_0.001          279792 non-null  int64  
 6   osm_amenity_points_in_0.005          279792 non-null  int64  
 7   osm_amenity_points_in_0.0075         279792 non-null  int64  
 8   osm_amenity_points_in_0.01           279792 non-null  int64  
 9   osm_building_points_in_0.001         279792 non-null  int64  
 10  osm_building_points_in_0.005         279792 non-null  int64  
 11  osm_building_

In [4]:
# Show the full list of column names of the training frame.
train.columns

Index(['city', 'floor', 'id', 'lat', 'lng', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
       'osm_city_nearest_name', 'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in

# Избавляемся от NaN

In [5]:
# Count missing values per column, then show the ten columns with the most gaps.
na_per_column = train.isna().sum()
na_per_column.sort_values(ascending=False).head(10)

floor                             176237
reform_mean_floor_count_500        30168
reform_mean_year_building_500      29637
reform_house_population_500        27234
reform_mean_floor_count_1000       16708
reform_mean_year_building_1000     16239
reform_house_population_1000       14596
street                              1606
osm_city_nearest_population           55
price_type                             0
dtype: int64

In [6]:
# Number of distinct cities among the rows where
# 'reform_house_population_1000' is missing.
population_missing = train['reform_house_population_1000'].isna()
train.loc[population_missing, 'city'].nunique()

2738

In [7]:
# Missing nearest-city population values are replaced with 0.
nearest_population = train['osm_city_nearest_population']
train['osm_city_nearest_population'] = nearest_population.fillna(value=0)

In [8]:
# Rows without a street code get the explicit placeholder category 'unknown'.
train.loc[train['street'].isna(), 'street'] = 'unknown'

In [9]:
# The six reform_* columns that contain gaps in the training frame.
reform_cols = [
    'reform_mean_floor_count_500',
    'reform_mean_year_building_500',
    'reform_house_population_500',
    'reform_mean_floor_count_1000',
    'reform_mean_year_building_1000',
    'reform_house_population_1000',
]
# Fill each column's missing values with that column's own mean
# (DataFrame.mean() skips NaN; fillna matches the means to columns by name).
reform_means = train[reform_cols].mean()
train[reform_cols] = train[reform_cols].fillna(reform_means)

In [10]:
# 'floor' is an object (string) column; missing entries are filled with the string '1'.
train = train.assign(floor=train['floor'].fillna('1'))

In [11]:
# Mean of 'price_type'. If the column only holds 0/1 flags (as the previews
# suggest - TODO confirm), this is the share of rows with price_type == 1.
train['price_type'].mean()

0.016058357637101846

# Корректируем цену рынка

In [12]:
# Mean target price for every (city, realty_type, price_type) combination in train,
# flattened back into ordinary columns.
group_keys = ['city', 'realty_type', 'price_type']
price = (
    train.groupby(group_keys)['per_square_meter_price']
    .mean()
    .reset_index()
)
# Cities that have at least one price_type == 1 group. A city appears once per
# matching realty_type, so the list may contain duplicates.
cities = price.loc[price['price_type'] == 1, 'city'].to_list()

In [13]:
# Give the aggregated column a dedicated name so it can be joined back as a feature.
price.columns = ['city', 'realty_type', 'price_type', 'mean__price']
# `t` is a second name for the very same frame (an alias, not a copy).
t = price
t.head(20)

Unnamed: 0,city,realty_type,price_type,mean__price
0,АССР,100,0,108426.150121
1,Абадзехская,110,0,31382.366412
2,Абан,110,0,15443.056763
3,"Абатский район, с. Абатское",100,0,12314.939435
4,Абатское,10,0,11271.84466
5,Абатское,100,0,11204.576043
6,Абатское,110,0,14656.301146
7,Абинск,10,0,31721.905502
8,Абинск,100,0,40192.192649
9,Абинск,110,0,41275.511175


In [14]:
# Attach the group mean price to every training row (left join keeps all rows).
# NOTE(review): the aggregate was computed from these same rows' target, so each
# row's own price contributes to its 'mean__price' - confirm this is acceptable.
train = train.merge(t, on=['city', 'realty_type', 'price_type'], how='left')

In [15]:
# Preview the training frame after the merge; 'mean__price' is now the last column.
train.head()

Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,...,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,region,total_square,street,date,realty_type,price_type,mean__price
0,Пермь,1,COL_0,57.998207,56.292797,4,19,35,52,0,...,5.530612,1964.118519,1960.959184,Пермский край,32.0,S27289,2020-01-05,10,0,50232.383626
1,Шатура,1,COL_1,55.574284,39.543835,3,24,37,59,0,...,3.527778,1952.321678,1957.222222,Московская область,280.0,S17052,2020-01-05,10,0,37431.61054
2,Ярославль,1,COL_2,57.61914,39.850525,1,30,67,128,0,...,7.222222,1968.15,1973.37037,Ярославская область,297.4,S16913,2020-01-05,110,0,57931.325997
3,Новокузнецк,1,COL_3,53.897083,87.108604,0,0,5,21,0,...,9.0,1992.716216,2014.0,Кемеровская область,190.0,S10148,2020-01-05,110,0,39624.258347
4,Москва,1,COL_4,55.80259,37.48711,1,23,64,153,0,...,5.684211,1963.229167,1960.5,Москва,60.2,S1338,2020-01-05,10,0,236765.607747


In [16]:
# 1 for rows whose city is in `cities`, 0 otherwise.
# int64 is spelled out to match the dtype produced by assigning the scalar 0.
in_listed_city = train['city'].isin(cities)
train['two_price_type_cities'] = in_listed_city.astype('int64')

In [17]:
# Show the raw price, the group mean and the city flag side by side.
preview_cols = ['city', 'price_type', 'per_square_meter_price',
                'mean__price', 'two_price_type_cities']
train.loc[:, preview_cols].head()

Unnamed: 0,city,price_type,per_square_meter_price,mean__price,two_price_type_cities
0,Пермь,0,139937.5,50232.383626,1
1,Шатура,0,60410.714286,37431.61054,0
2,Ярославль,0,45164.761264,57931.325997,1
3,Новокузнецк,0,28805.263158,39624.258347,1
4,Москва,0,13222.591362,236765.607747,1


In [18]:
# Rewrite the target for price_type == 0 rows in flagged cities as
#     |p * (1 - 0.85 * |p - mean| / p)|, i.e. |p - 0.85 * |p - mean||.
# For p > mean this is 0.15*p + 0.85*mean (pulled towards the group mean).
# NOTE(review): for p < mean it is |1.85*p - 0.85*mean|, which moves away from the
# mean before the outer abs() folds negatives back - confirm this is intended.
# NOTE(review): the cell overwrites its own input, so re-running it applies the
# adjustment a second time.
price_col = train['per_square_meter_price']
scaled_gap = abs(price_col - train['mean__price']) / price_col * 0.85
adjust_rows = (train['price_type'] == 0) & (train['two_price_type_cities'] == 1)
train.loc[adjust_rows, 'per_square_meter_price'] = abs(price_col * (1 - scaled_gap))

# Колдуем над тестом

In [19]:
# Load the test set with the same parser setting as the training set.
test = pd.read_csv('test.csv', low_memory=False)
# Preview the first rows.
test.head()

Unnamed: 0,city,floor,id,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,osm_building_points_in_0.0075,...,reform_mean_year_building_1000,reform_mean_year_building_500,region,lat,lng,total_square,street,date,realty_type,price_type
0,Курск,1.0,COL_289284,7,55,85,117,0,0,0,...,1966.471591,1966.74026,Курская область,51.709255,36.147908,156.148996,S6983,2020-09-06,100,1
1,Сургут,1.0,COL_289305,8,70,112,140,0,0,0,...,1988.259259,1989.068182,Ханты-Мансийский АО,61.23324,73.462509,190.737943,S29120,2020-09-06,110,1
2,Тюмень,-1.0,COL_289318,3,28,67,122,0,0,0,...,1985.880282,1991.458333,Тюменская область,57.14311,65.554573,457.118051,S23731,2020-09-06,10,1
3,Иркутск,1.0,COL_289354,5,76,139,231,0,0,0,...,1947.073276,1941.657895,Иркутская область,52.28138,104.282975,66.503622,S14207,2020-09-06,100,1
4,Курск,,COL_289399,8,105,189,279,0,0,2,...,1948.764151,1946.689655,Курская область,51.729706,36.194019,23.864915,S20658,2020-09-06,10,1


In [20]:
# Earliest and latest date in the training frame.
train_dates = train['date']
(train_dates.min(), train_dates.max())

('2020-01-05', '2020-08-23')

In [21]:
# Earliest and latest date in the test frame.
test_dates = test['date']
(test_dates.min(), test_dates.max())

('2020-09-06', '2020-12-27')

In [22]:
# Count missing values per column of the test frame and show the top ten.
test_na_per_column = test.isna().sum()
test_na_per_column.sort_values(ascending=False).head(10)

floor                             1301
reform_house_population_500         48
reform_mean_year_building_500       48
reform_mean_floor_count_500         48
reform_mean_year_building_1000      35
reform_mean_floor_count_1000        35
reform_house_population_1000        35
street                               6
osm_culture_points_in_0.0075         0
osm_crossing_points_in_0.001         0
dtype: int64

In [23]:
# Use the same 'unknown' placeholder for missing street codes as in the training frame.
test.loc[test['street'].isna(), 'street'] = 'unknown'

In [24]:
# The six reform_* columns that contain gaps in the test frame.
reform_cols = [
    'reform_house_population_500',
    'reform_mean_year_building_500',
    'reform_mean_floor_count_500',
    'reform_mean_year_building_1000',
    'reform_mean_floor_count_1000',
    'reform_house_population_1000',
]
# Impute with the TRAINING column means rather than the test frame's own means,
# so an imputed value means the same thing at prediction time as it did when the
# model was fitted. mean() skips NaN, and fillna matches the means to columns by
# name, so the column order here does not matter.
test[reform_cols] = test[reform_cols].fillna(train[reform_cols].mean())

In [25]:
# Missing floors are filled with the string '1', as in the training frame.
test = test.assign(floor=test['floor'].fillna('1'))

In [26]:
# Attach the train-derived group mean price to the test rows. With a left join,
# (city, realty_type, price_type) combinations absent from `t` get NaN 'mean__price'.
test = test.merge(t, on=['city', 'realty_type', 'price_type'], how='left')

# Моделим и предиктим

In [27]:
# Target: the (adjusted) price per square metre.
target = train['per_square_meter_price']
# Feature matrix: everything except the target and the helper city flag.
features = train.drop(columns=['per_square_meter_price', 'two_price_type_cities'])

In [28]:
# Columns passed to CatBoost as categorical features.
# NOTE(review): 'id' looks like a per-row identifier (COL_0, COL_1, ...) and every
# test 'date' is later than the last train date, so neither category is likely to
# carry over to the test set - consider dropping them; confirm before changing.
cat_features = ['city', 'floor', 'id', 'osm_city_nearest_name', 'region', 'street', 'date']

In [29]:
# List the feature columns (and their order) the model will be trained on.
features.columns

Index(['city', 'floor', 'id', 'lat', 'lng', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
       'osm_city_nearest_name', 'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in

In [30]:
# Align the test frame with the training feature matrix: same columns, same order.
# Taking the list from `features` replaces a hand-copied list of 77 names that
# had to be kept in sync manually. .copy() yields an independent frame, so adding
# the prediction column later does not trigger SettingWithCopyWarning.
test = test[list(features.columns)].copy()

In [31]:
from catboost import CatBoostRegressor

In [32]:
# CatBoost regressor evaluated with MAPE; progress is logged every 100 iterations.
model = CatBoostRegressor(
    learning_rate=0.015,
    depth=8,
    l2_leaf_reg=5,
    eval_metric='MAPE',
    random_state=42,
    verbose=100,
)

In [33]:
# Fit on the full training frame. No eval_set is passed, so the MAPE in the log
# ("learn:") is measured on the training data only.
model.fit(features, y=target, cat_features=cat_features)

0:	learn: 8.8148684	total: 853ms	remaining: 14m 12s
100:	learn: 5.7830031	total: 1m 5s	remaining: 9m 45s
200:	learn: 4.8218359	total: 2m 14s	remaining: 8m 54s
300:	learn: 4.4889642	total: 3m 26s	remaining: 8m
400:	learn: 4.3580683	total: 4m 39s	remaining: 6m 57s
500:	learn: 4.2942714	total: 5m 49s	remaining: 5m 47s
600:	learn: 4.2446042	total: 6m 58s	remaining: 4m 37s
700:	learn: 4.2119558	total: 8m 9s	remaining: 3m 28s
800:	learn: 4.1813516	total: 9m 17s	remaining: 2m 18s
900:	learn: 4.1535106	total: 10m 25s	remaining: 1m 8s
999:	learn: 4.1302963	total: 11m 31s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x22c1cd26730>

In [34]:
# Predict on exactly the columns the model was trained on, in the same order.
# Selecting them explicitly keeps the cell re-runnable: after the first run `test`
# also holds the prediction column, which must not be passed back to the model.
test['per_square_meter_price'] = model.predict(test[list(features.columns)])

In [35]:
# Pair each feature name with its importance and show the 20 most important.
importance = pd.DataFrame({
    'feature_importance': model.get_feature_importance(),
    'feature_names': features.columns,
})
importance.sort_values(by=['feature_importance'], ascending=False).head(20)

Unnamed: 0,feature_importance,feature_names
76,28.429196,mean__price
71,10.665709,total_square
18,5.669128,osm_city_nearest_name
72,5.412058,street
53,5.388697,osm_subway_closest_dist
32,3.542494,osm_finance_points_in_0.01
1,3.255814,floor
0,2.757316,city
70,2.404436,region
19,2.176846,osm_city_nearest_population


In [36]:
# Same importance table, this time showing the 20 least important features.
importance = pd.DataFrame({
    'feature_importance': model.get_feature_importance(),
    'feature_names': features.columns,
})
importance.sort_values(by=['feature_importance'], ascending=False).tail(20)

Unnamed: 0,feature_importance,feature_names
43,0.177755,osm_leisure_points_in_0.0075
47,0.170754,osm_offices_points_in_0.0075
46,0.17001,osm_offices_points_in_0.005
23,0.161988,osm_crossing_points_in_0.0075
10,0.158165,osm_building_points_in_0.005
51,0.15717,osm_shops_points_in_0.0075
68,0.147364,reform_mean_year_building_1000
48,0.146906,osm_offices_points_in_0.01
73,0.145378,date
36,0.124178,osm_historic_points_in_0.005


# Сабмитим 

In [37]:
# Submission frame: the row id and the predicted price per square metre.
submission_cols = ['id', 'per_square_meter_price']
csv = test.loc[:, submission_cols]
csv.head()

Unnamed: 0,id,per_square_meter_price
0,COL_289284,43581.55826
1,COL_289305,59525.22533
2,COL_289318,42839.21269
3,COL_289354,97404.495198
4,COL_289399,54933.946227


In [38]:
# Write the submission file; index=False keeps the row index out of the CSV.
csv.to_csv('sub15.csv', index=False)