# **Задача обучения (соревнования)** - построить модель определения цены жилой недвижимости на основе представленного набора данных

Проект выполнен в рамках участия в ШИФТ Интенсиве по направлению ML Classic.

Проект подготовлен командой Русичи: Владислав Вольников, [Станислав Кацко](https://t.me/StanislavKatsko), Иван Литвинов.

На первом этапе (ноутбук Русичи_EDA) представлена загрузка исходных данных и выполнен разведочный анализ данных (EDA).

В текущем ноутбуке (Русичи_model) будут представлены процессы предобработки данных, обучения и сабмита.

# Подключаемся к Google Drive и импортируем библиотеки

In [None]:
# Mount Google Drive at /content/gdrive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load data libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
from sklearn.model_selection import train_test_split

# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Data preparation
from sklearn.preprocessing import RobustScaler, StandardScaler
from datetime import datetime
import math

# Modeling
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, LinearRegression

# Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, r2_score, precision_score, recall_score, classification_report, confusion_matrix


from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error
import os

import pandas as pd

In [None]:
# Load the training set; the first CSV column is used as the row index.
df = pd.read_csv('./gdrive/MyDrive/SHIFT_SUMMER/train.csv', index_col=0)
df  # last expression of the cell: displays the frame

Unnamed: 0,full_sq,floor,build_year,num_room,kitch_sq,state,product_type,raion_popul,indust_part,sport_objects_raion,...,cafe_count_1000_price_high,cafe_sum_1500_min_price_avg,green_part_2000,cafe_sum_2000_min_price_avg,mosque_count_3000,prom_part_5000,cafe_sum_5000_min_price_avg,mosque_count_5000,year,price_doc
0,43,24.0,2014.0,1.0,10.0,1.0,1,8.294300,0.007122,0,...,0,6.907755,3.710396,6.725430,0,1.803359,6.616560,0,2014,5533460
1,66,3.0,1979.0,3.0,10.0,3.0,0,11.314231,0.232205,4,...,0,6.626122,2.764431,6.479615,1,2.631169,6.469948,1,2013,9900000
2,55,5.0,1979.0,2.0,6.0,2.0,0,11.718817,0.000170,5,...,0,6.368753,3.511844,6.192138,0,1.363537,6.462171,0,2012,1990000
3,45,7.0,1970.0,2.0,6.0,2.0,0,11.331032,0.428826,4,...,0,6.753601,1.814825,6.551080,0,2.732418,6.458229,0,2014,6500000
4,43,4.0,1968.0,2.0,6.0,2.0,0,11.538769,0.136330,6,...,0,6.636603,2.257588,6.646507,1,2.020222,6.634778,2,2014,8500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21324,56,19.0,1979.0,2.0,1.0,1.0,1,11.655744,0.049637,6,...,0,6.492240,2.711378,6.816188,0,3.297317,6.536155,0,2014,10159562
21325,42,5.0,1977.0,1.0,8.0,3.0,0,11.718817,0.000170,5,...,0,6.502295,3.802208,6.291569,0,1.442202,6.487684,0,2014,5100000
21326,32,3.0,1979.0,2.0,6.0,2.0,0,11.719940,0.000000,4,...,0,6.572814,3.062456,6.583520,1,2.733718,6.573904,2,2011,4800000
21327,43,4.0,1969.0,2.0,5.0,2.0,0,11.331032,0.428826,4,...,0,6.586172,2.408745,6.563982,0,3.034953,6.476311,0,2013,6300000


In [None]:
# Every column name of the training frame (target `price_doc` included), as a plain list.
features = df.columns.to_list()
print(features)

['full_sq', 'floor', 'build_year', 'num_room', 'kitch_sq', 'state', 'product_type', 'raion_popul', 'indust_part', 'sport_objects_raion', 'shopping_centers_raion', 'radiation_raion', 'build_count_block', 'build_count_brick', 'build_count_monolith', 'metro_min_avto', 'school_km', 'green_zone_km', 'industrial_km', 'water_treatment_km', 'cemetery_km', 'incineration_km', 'ID_railroad_station_avto', 'mkad_km', 'ttk_km', 'oil_chemistry_km', 'nuclear_reactor_km', 'power_transmission_line_km', 'market_shop_km', 'fitness_km', 'stadium_km', 'basketball_km', 'detention_facility_km', 'additional_education_km', 'big_church_km', 'mosque_km', 'theater_km', 'exhibition_km', 'catering_km', 'green_part_1000', 'cafe_sum_1000_min_price_avg', 'cafe_count_1000_price_high', 'cafe_sum_1500_min_price_avg', 'green_part_2000', 'cafe_sum_2000_min_price_avg', 'mosque_count_3000', 'prom_part_5000', 'cafe_sum_5000_min_price_avg', 'mosque_count_5000', 'year', 'price_doc']


# Предобработка

In [None]:
def copy_by_feature(df, features):
  """Return a new DataFrame with the requested columns plus ``price_doc``.

  Args:
    df: Source frame; it must contain every name in ``features`` as well as
      the ``price_doc`` column.
    features: Iterable of column names to carry over, in the desired order.

  Returns:
    An independent copy of ``df`` restricted to the columns of ``features``
    (repeated names collapse to their first occurrence), followed by
    ``price_doc`` when it is not already listed. The row index of ``df`` is
    preserved.

  Raises:
    KeyError: If a requested column (or ``price_doc``) is missing from ``df``.
  """
  # dict.fromkeys de-duplicates while keeping first-seen order, which matches
  # what repeated column assignment would produce.
  columns = list(dict.fromkeys([*features, "price_doc"]))
  # One selection + copy instead of inserting the columns one at a time.
  return df[columns].copy()

In [None]:
# Working copy of the training data; all cleaning below is applied to df_clean, not to df.
df_clean = copy_by_feature(df, features)

In [None]:
# Count full_sq values in 60 equal-width bins, listed in bin order (sort=False), to inspect the value range.
df_clean['full_sq'].value_counts(bins=60, sort = False)

full_sq
(-5.327, 88.767]        20281
(88.767, 177.533]        1010
(177.533, 266.3]           30
(266.3, 355.067]            3
(355.067, 443.833]          0
(443.833, 532.6]            1
(532.6, 621.367]            1
(621.367, 710.133]          1
(710.133, 798.9]            1
(798.9, 887.667]            0
(887.667, 976.433]          0
(976.433, 1065.2]           0
(1065.2, 1153.967]          0
(1153.967, 1242.733]        0
(1242.733, 1331.5]          0
(1331.5, 1420.267]          0
(1420.267, 1509.033]        0
(1509.033, 1597.8]          0
(1597.8, 1686.567]          0
(1686.567, 1775.333]        0
(1775.333, 1864.1]          0
(1864.1, 1952.867]          0
(1952.867, 2041.633]        0
(2041.633, 2130.4]          0
(2130.4, 2219.167]          0
(2219.167, 2307.933]        0
(2307.933, 2396.7]          0
(2396.7, 2485.467]          0
(2485.467, 2574.233]        0
(2574.233, 2663.0]          0
(2663.0, 2751.767]          0
(2751.767, 2840.533]        0
(2840.533, 2929.3]          0
(2

In [None]:
# Mark full_sq values above 800 or below 5 as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_clean.loc[df_clean['full_sq'] > 800, 'full_sq'] = np.nan
df_clean.loc[df_clean['full_sq'] < 5, 'full_sq'] = np.nan

In [None]:
# Count kitch_sq values in 60 equal-width bins, listed in bin order (sort=False), to inspect the value range.
df_clean['kitch_sq'].value_counts(bins=60, sort = False)

kitch_sq
(-2.014, 33.55]      21283
(33.55, 67.1]           33
(67.1, 100.65]           7
(100.65, 134.2]          3
(134.2, 167.75]          0
(167.75, 201.3]          0
(201.3, 234.85]          0
(234.85, 268.4]          0
(268.4, 301.95]          0
(301.95, 335.5]          0
(335.5, 369.05]          0
(369.05, 402.6]          0
(402.6, 436.15]          0
(436.15, 469.7]          0
(469.7, 503.25]          0
(503.25, 536.8]          0
(536.8, 570.35]          0
(570.35, 603.9]          0
(603.9, 637.45]          0
(637.45, 671.0]          0
(671.0, 704.55]          0
(704.55, 738.1]          0
(738.1, 771.65]          0
(771.65, 805.2]          0
(805.2, 838.75]          0
(838.75, 872.3]          0
(872.3, 905.85]          0
(905.85, 939.4]          0
(939.4, 972.95]          0
(972.95, 1006.5]         0
(1006.5, 1040.05]        0
(1040.05, 1073.6]        0
(1073.6, 1107.15]        0
(1107.15, 1140.7]        0
(1140.7, 1174.25]        0
(1174.25, 1207.8]        0
(1207.8, 1241.35]  

In [None]:
# Mark kitch_sq values above 50 or below 0 as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_clean.loc[df_clean['kitch_sq'] > 50, 'kitch_sq'] = np.nan
df_clean.loc[df_clean['kitch_sq'] < 0, 'kitch_sq'] = np.nan

In [None]:
# Count build_year values in 60 equal-width bins, listed in bin order (sort=False), to inspect the value range.
df_clean['build_year'].value_counts(bins=60, sort = False)

build_year
(-20052.01, 334200.15]       21328
(334200.15, 668400.3]            0
(668400.3, 1002600.45]           0
(1002600.45, 1336800.6]          0
(1336800.6, 1671000.75]          0
(1671000.75, 2005200.9]          0
(2005200.9, 2339401.05]          0
(2339401.05, 2673601.2]          0
(2673601.2, 3007801.35]          0
(3007801.35, 3342001.5]          0
(3342001.5, 3676201.65]          0
(3676201.65, 4010401.8]          0
(4010401.8, 4344601.95]          0
(4344601.95, 4678802.1]          0
(4678802.1, 5013002.25]          0
(5013002.25, 5347202.4]          0
(5347202.4, 5681402.55]          0
(5681402.55, 6015602.7]          0
(6015602.7, 6349802.85]          0
(6349802.85, 6684003.0]          0
(6684003.0, 7018203.15]          0
(7018203.15, 7352403.3]          0
(7352403.3, 7686603.45]          0
(7686603.45, 8020803.6]          0
(8020803.6, 8355003.75]          0
(8355003.75, 8689203.9]          0
(8689203.9, 9023404.05]          0
(9023404.05, 9357604.2]          0
(9357604.

In [None]:
# Mark build_year values later than 2024 or earlier than 1700 as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_clean.loc[df_clean['build_year'] > 2024, 'build_year'] = np.nan
df_clean.loc[df_clean['build_year'] < 1700, 'build_year'] = np.nan

In [None]:
# Count num_room values in 60 equal-width bins, listed in bin order (sort=False), to inspect the value range.
df_clean['num_room'].value_counts(bins=60, sort = False)

num_room
(-0.02, 0.317]         12
(0.317, 0.633]          0
(0.633, 0.95]           0
(0.95, 1.267]        5300
(1.267, 1.583]          0
(1.583, 1.9]            0
(1.9, 2.217]        12423
(2.217, 2.533]          0
(2.533, 2.85]           0
(2.85, 3.167]        3269
(3.167, 3.483]          0
(3.483, 3.8]            0
(3.8, 4.117]          286
(4.117, 4.433]          0
(4.433, 4.75]           0
(4.75, 5.067]          27
(5.067, 5.383]          0
(5.383, 5.7]            0
(5.7, 6.017]            5
(6.017, 6.333]          0
(6.333, 6.65]           0
(6.65, 6.967]           0
(6.967, 7.283]          0
(7.283, 7.6]            0
(7.6, 7.917]            0
(7.917, 8.233]          3
(8.233, 8.55]           0
(8.55, 8.867]           0
(8.867, 9.183]          0
(9.183, 9.5]            0
(9.5, 9.817]            0
(9.817, 10.133]         2
(10.133, 10.45]         0
(10.45, 10.767]         0
(10.767, 11.083]        0
(11.083, 11.4]          0
(11.4, 11.717]          0
(11.717, 12.033]        0
(12

In [None]:
# Mark num_room values below 1 as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_clean.loc[df_clean['num_room'] < 1, 'num_room'] = np.nan

In [None]:
# Count water_treatment_km values in 60 equal-width bins, listed in bin order (sort=False), to inspect the value range.
df_clean['water_treatment_km'].value_counts(bins=60, sort = False)

water_treatment_km
(-1.2999999999999998, -1.208]       2
(-1.208, -1.122]                    1
(-1.122, -1.036]                    0
(-1.036, -0.95]                     1
(-0.95, -0.864]                   675
(-0.864, -0.778]                    3
(-0.778, -0.692]                    6
(-0.692, -0.607]                   52
(-0.607, -0.521]                    4
(-0.521, -0.435]                    7
(-0.435, -0.349]                    6
(-0.349, -0.263]                    3
(-0.263, -0.177]                   12
(-0.177, -0.0909]                   4
(-0.0909, -0.00493]               177
(-0.00493, 0.081]                 285
(0.081, 0.167]                     23
(0.167, 0.253]                     18
(0.253, 0.339]                    112
(0.339, 0.425]                     39
(0.425, 0.511]                    226
(0.511, 0.597]                    278
(0.597, 0.683]                    114
(0.683, 0.769]                     96
(0.769, 0.855]                     81
(0.855, 0.94]                  

In [None]:
# Replace negative water_treatment_km values with their absolute value; other rows are untouched
# (the right-hand side is aligned on the index, so only the masked rows are written).
# NOTE(review): negatives in the other *_km columns of this notebook are set to NaN instead — confirm the different treatment is intended.
df_clean.loc[df_clean['water_treatment_km'] < 0, 'water_treatment_km'] = df_clean['water_treatment_km'].abs()

In [None]:
# Count state values in 60 equal-width bins, listed in bin order (sort=False), to inspect the value range.
df_clean['state'].value_counts(bins=60, sort = False)

state
(0.967, 1.533]       3380
(1.533, 2.067]      13581
(2.067, 2.6]            0
(2.6, 3.133]         4071
(3.133, 3.667]          0
(3.667, 4.2]          296
(4.2, 4.733]            0
(4.733, 5.267]          0
(5.267, 5.8]            0
(5.8, 6.333]            0
(6.333, 6.867]          0
(6.867, 7.4]            0
(7.4, 7.933]            0
(7.933, 8.467]          0
(8.467, 9.0]            0
(9.0, 9.533]            0
(9.533, 10.067]         0
(10.067, 10.6]          0
(10.6, 11.133]          0
(11.133, 11.667]        0
(11.667, 12.2]          0
(12.2, 12.733]          0
(12.733, 13.267]        0
(13.267, 13.8]          0
(13.8, 14.333]          0
(14.333, 14.867]        0
(14.867, 15.4]          0
(15.4, 15.933]          0
(15.933, 16.467]        0
(16.467, 17.0]          0
(17.0, 17.533]          0
(17.533, 18.067]        0
(18.067, 18.6]          0
(18.6, 19.133]          0
(19.133, 19.667]        0
(19.667, 20.2]          0
(20.2, 20.733]          0
(20.733, 21.267]        0
(21.26

In [None]:
# Mark state values above 5 as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_clean.loc[df_clean['state'] > 5, 'state'] = np.nan

In [None]:
# Count mkad_km values in 60 equal-width bins, listed in bin order (sort=False), to inspect the value range.
df_clean['mkad_km'].value_counts(bins=60, sort = False)

mkad_km
(-4.305000000000001, -4.158]      13
(-4.158, -4.02]                    0
(-4.02, -3.882]                    1
(-3.882, -3.744]                   0
(-3.744, -3.606]                   0
(-3.606, -3.469]                   3
(-3.469, -3.331]                   0
(-3.331, -3.193]                   0
(-3.193, -3.055]                   0
(-3.055, -2.917]                   0
(-2.917, -2.779]                   0
(-2.779, -2.642]                   0
(-2.642, -2.504]                   0
(-2.504, -2.366]                   1
(-2.366, -2.228]                   0
(-2.228, -2.09]                    2
(-2.09, -1.952]                    3
(-1.952, -1.814]                   2
(-1.814, -1.677]                   2
(-1.677, -1.539]                  59
(-1.539, -1.401]                  36
(-1.401, -1.263]                  23
(-1.263, -1.125]                  40
(-1.125, -0.987]                  70
(-0.987, -0.849]                  84
(-0.849, -0.712]                 160
(-0.712, -0.574]              

In [None]:
# Mark negative mkad_km values as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_clean.loc[df_clean['mkad_km'] < 0, 'mkad_km'] = np.nan

In [None]:
# Distance columns whose negative values are blanked out in the following cell.
features_km = [
    'incineration_km',
    'ttk_km',
    'oil_chemistry_km',
    'nuclear_reactor_km',
    'power_transmission_line_km',
    'market_shop_km',
    'fitness_km',
    'stadium_km',
    'basketball_km',
    'detention_facility_km',
    'big_church_km',
    'mosque_km',
    'theater_km',
    'exhibition_km',
    'catering_km',
]

In [None]:
# Mark negative values as missing in every column listed in features_km.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for feature in features_km:
  df_clean.loc[df_clean[feature] < 0, feature] = np.nan

In [None]:
# Per-column 5th and 95th percentiles and the distance between them.
# Despite the names, Q1/Q3 are not quartiles and IQR is the 5%-95% spread.
Q1, Q3 = df_clean.quantile(0.05), df_clean.quantile(0.95)
IQR = Q3.sub(Q1)

In [None]:
# Boolean frame: True where a cell lies more than 1.8 * IQR below Q1 or above Q3 of its column.
# Every column of df_clean is screened, price_doc included; NaN cells compare False and are never flagged.
outliers = df_clean.lt(Q1 - 1.8 * IQR) | df_clean.gt(Q3 + 1.8 * IQR)
print(outliers)

       full_sq  floor  build_year  num_room  kitch_sq  state  product_type  \
0        False  False       False     False     False  False         False   
1        False  False       False     False     False  False         False   
2        False  False       False     False     False  False         False   
3        False  False       False     False     False  False         False   
4        False  False       False     False     False  False         False   
...        ...    ...         ...       ...       ...    ...           ...   
21324    False  False       False     False     False  False         False   
21325    False  False       False     False     False  False         False   
21326    False  False       False     False     False  False         False   
21327    False  False       False     False     False  False         False   
21328    False  False       False     False     False  False         False   

       raion_popul  indust_part  sport_objects_raion  ...  \
0 

In [None]:
# Drop outliers: keep only the rows that have no flagged cell in any column.
df_clean1 = df_clean.loc[~outliers.any(axis="columns")]
print(df_clean1)

       full_sq  floor  build_year  num_room  kitch_sq  state  product_type  \
0         43.0   24.0      2014.0       1.0      10.0    1.0             1   
1         66.0    3.0      1979.0       3.0      10.0    3.0             0   
2         55.0    5.0      1979.0       2.0       6.0    2.0             0   
3         45.0    7.0      1970.0       2.0       6.0    2.0             0   
4         43.0    4.0      1968.0       2.0       6.0    2.0             0   
...        ...    ...         ...       ...       ...    ...           ...   
21324     56.0   19.0      1979.0       2.0       1.0    1.0             1   
21325     42.0    5.0      1977.0       1.0       8.0    3.0             0   
21326     32.0    3.0      1979.0       2.0       6.0    2.0             0   
21327     43.0    4.0      1969.0       2.0       5.0    2.0             0   
21328     32.0    1.0      1979.0       2.0       6.0    2.0             0   

       raion_popul  indust_part  sport_objects_raion  ...  \
0 

# Обучение после удаления выбросов



In [None]:
# Install CatBoost into the notebook environment (needed for the catboost import in the next cell).
pip install catboost



In [None]:
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [None]:
# Separate the target column from the predictors (data with outlier rows removed)
X = df_clean1.drop(columns=['price_doc'])
y = df_clean1['price_doc']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and fit the CatBoost model.
# NOTE(review): the hold-out set is also passed as eval_set with use_best_model=True,
# so the retained iteration is picked on the same rows that are scored below — the
# reported metrics are likely optimistic; consider a separate validation split.
model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6, loss_function='RMSE', verbose=100)
model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, plot=True)

# Predict on the hold-out rows and score the model
y_pred = model.predict(X_test)

# The "_rf" suffix is kept so the variable names stay unchanged; the values describe the CatBoost model.
mse_rf = mean_squared_error(y_test, y_pred)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred)

print(f"CatBoostRegressor - Корень из среднеквадратичной ошибки (RMSE): {rmse_rf:.2f}")
print(f"CatBoostRegressor - Коэффициент детерминации (R^2): {r2_rf:.2f}")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 3675244.7672441	test: 3636320.6978944	best: 3636320.6978944 (0)	total: 19.2ms	remaining: 19.1s
100:	learn: 2067693.0078466	test: 2288220.9831321	best: 2288220.9831321 (100)	total: 1.83s	remaining: 16.3s
200:	learn: 1896272.7324903	test: 2248953.3798597	best: 2248836.7107400 (186)	total: 4.59s	remaining: 18.2s
300:	learn: 1773494.7531558	test: 2236303.0322157	best: 2234977.3861940 (294)	total: 8.69s	remaining: 20.2s
400:	learn: 1678673.2119709	test: 2229520.8603525	best: 2229333.1280337 (399)	total: 12.6s	remaining: 18.9s
500:	learn: 1596768.5233221	test: 2228871.6901724	best: 2226203.4068177 (435)	total: 15.6s	remaining: 15.6s
600:	learn: 1526720.8754890	test: 2228707.9151937	best: 2226203.4068177 (435)	total: 18.7s	remaining: 12.4s
700:	learn: 1469303.3060079	test: 2226967.1982004	best: 2225523.6637890 (687)	total: 22.4s	remaining: 9.54s
800:	learn: 1414316.7271354	test: 2230330.2086748	best: 2225523.6637890 (687)	total: 25.2s	remaining: 6.26s
900:	learn: 1368622.1009504	tes

# Обучение до удаления выбросов

In [None]:
# Separate the target column from the predictors (cleaned data, outlier rows NOT removed).
# This cell rebinds X, y, the split variables and `model`.
X = df_clean.drop(columns=['price_doc'])
y = df_clean['price_doc']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and fit the CatBoost model.
# NOTE(review): the hold-out set is also passed as eval_set with use_best_model=True,
# so the retained iteration is picked on the same rows that are scored below — the
# reported metrics are likely optimistic; consider a separate validation split.
model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6, loss_function='RMSE', verbose=100)
model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True, plot=True)

# Predict on the hold-out rows and score the model
y_pred = model.predict(X_test)

# The "_rf" suffix is kept so the variable names stay unchanged; the values describe the CatBoost model.
mse_rf = mean_squared_error(y_test, y_pred)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred)

print(f"CatBoostRegressor - Корень из среднеквадратичной ошибки (RMSE): {rmse_rf:.2f}")
print(f"CatBoostRegressor - Коэффициент детерминации (R^2): {r2_rf:.2f}")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 4428223.0956771	test: 4642681.6451153	best: 4642681.6451153 (0)	total: 13.2ms	remaining: 13.1s
100:	learn: 2276468.2309663	test: 2591572.9122761	best: 2591572.9122761 (100)	total: 1.83s	remaining: 16.3s
200:	learn: 2064325.8977974	test: 2530735.7929519	best: 2530735.7929519 (200)	total: 4.88s	remaining: 19.4s
300:	learn: 1909746.8070085	test: 2507504.9567474	best: 2506826.3157609 (284)	total: 8.75s	remaining: 20.3s
400:	learn: 1804300.9671911	test: 2503205.5982015	best: 2502935.5048347 (396)	total: 13.2s	remaining: 19.7s
500:	learn: 1719357.3305719	test: 2509028.1284750	best: 2502935.5048347 (396)	total: 17.2s	remaining: 17.1s
600:	learn: 1644722.4110307	test: 2513478.6568726	best: 2502935.5048347 (396)	total: 21.6s	remaining: 14.4s
700:	learn: 1578111.4533364	test: 2514740.2121217	best: 2502935.5048347 (396)	total: 28.1s	remaining: 12s
800:	learn: 1518462.2615028	test: 2518064.6765941	best: 2502935.5048347 (396)	total: 30.4s	remaining: 7.56s
900:	learn: 1466062.3945152	test:

In [None]:
# Load the submission file; the first CSV column is used as the row index.
submission = pd.read_csv('./gdrive/MyDrive/SHIFT_SUMMER/submission.csv', index_col=0)

In [None]:
# Load the test set; the first CSV column is used as the row index.
test_df = pd.read_csv('./gdrive/MyDrive/SHIFT_SUMMER/test.csv', index_col=0)

# Predict with the most recently fitted `model`, selecting the columns of the training matrix X in the same order.
# NOTE(review): test_df is used as loaded — the NaN/abs cleaning applied to df_clean is not repeated here; confirm this is intended.
test_preds = model.predict(test_df[X.columns])
# Predictions are assigned by position; assumes submission rows are in the same order as test_df — TODO confirm.
submission['price_doc'] = test_preds
# Sanity check: number of predictions vs. shape of the submission frame.
print(len(test_preds))
print(submission.shape)
# NOTE(review): this overwrites the same file the `submission` frame was read from.
submission.to_csv('./gdrive/MyDrive/SHIFT_SUMMER/submission.csv')

9142
(9142, 1)


# Выводы

В результате мы получили, что модель, обученная на предобработанных данных до удаления выбросов, показывает лучший результат по сравнению с моделью, использующей данные после удаления выбросов.