In [None]:
!pip install geodist >> none
from geodist import GeoDist

!pip install catboost >> none
from catboost import CatBoostRegressor, Pool

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import pickle

from google.colab import drive
drive.mount('/content/gdrive')

import warnings
warnings.filterwarnings('ignore')

Mounted at /content/gdrive


# Анализ и обработка данных

In [None]:
# Read the listings table from the mounted Google Drive and preview the first rows.
csv_path = '/content/gdrive/MyDrive/data csv/formatted.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,id,coords,rooms_count,year,house_floors,material,floor,flat_area,kitchen_area,balcony,metro,condition,price,url
0,2624808613,"(55.680879, 37.738863)",1,2021.0,13.0,панельный,13.0,36.7,10.3,лоджия,21–30 мин.,косметический,9500000,/moskva/kvartiry/1-k._kvartira_367m_1313et._26...
1,2532916318,"(55.796471, 37.609919)",студия,1971.0,8.0,блочный,5.0,32.0,,,6–10 мин.,дизайнерский,10490000,/moskva/kvartiry/apartamenty-studiya_32m_58et....
2,2587872760,"(55.740624, 37.66645)",2,1984.0,12.0,панельный,4.0,52.0,10.0,лоджия,6–10 мин.,евро,17500000,/moskva/kvartiry/2-k._kvartira_52m_412et._2587...
3,2543206638,"(55.829437, 37.414219)",студия,,4.0,блочный,1.0,21.1,,,16–20 мин.,евро,3258000,/moskva/kvartiry/apartamenty-studiya_211m_14et...
4,2658418970,"(55.805169, 37.540695)",3,,8.0,кирпичный,1.0,68.7,11.0,,6–10 мин.,требует ремонта,16500000,/moskva/kvartiry/3-k._kvartira_687m_18et._2658...


In [None]:
# Drop the `id` and `url` columns.
df = df.drop(columns=['id', 'url'])


def dist(x):
    """Turn a coordinate string into a distance value computed by ``GeoDist``.

    ``x`` is text that contains at least two numbers, for example
    ``"(55.680879, 37.738863)"``. The first two numbers are swapped, packed
    into a tuple and handed to ``GeoDist``; the value returned by
    ``.distance(37.617734, 55.751999)`` is divided by 1000 and rounded to three
    decimal places.

    NOTE(review): the reference point is presumably the centre of Moscow given
    as (longitude, latitude), and the division by 1000 presumably converts
    metres to kilometres -- confirm against the geodist package.
    """
    numbers = [float(token) for token in re.findall(r'-?\d+\.?\d*', x)]
    # Swap the first two parsed values; anything after them keeps its place.
    numbers[0], numbers[1] = numbers[1], numbers[0]
    point = tuple(numbers)
    scaled = GeoDist([point]).distance(37.617734, 55.751999) / 1000
    # Rounding goes through text formatting: exactly three digits after the point.
    return float(format(scaled, '.3f'))

# Replace every coordinate string with the distance computed by `dist`.
# NOTE: the `coords` column is overwritten, so this cell cannot be re-run
# without reloading the data first.
df['coords'] = df['coords'].map(dist)

# Keep only rows with at least 4 non-missing values, then renumber the rows
# from 0 so that the row labels used below are positions in the cleaned frame.
df = df.dropna(thresh=4).reset_index(drop=True)

# A construction year of 0 is treated as missing.
df['year'] = df['year'].replace(0, np.nan)

# Hand-entered year corrections, keyed by row label after the reset above.
# Written with .loc instead of `df.year[label] = ...`: chained assignment goes
# through an intermediate Series and is not guaranteed to reach `df`.
manual_years = {932: 2021.0, 990: 1976.0, 1706: 1956.0}
for row_label, year_value in manual_years.items():
    # Skip labels that do not exist so .loc never appends a new, empty row.
    if row_label in df.index:
        df.loc[row_label, 'year'] = year_value


# Rows with a known construction year, split by wall material. The mean year
# of each group is used to fill the missing years below.
has_year = df['year'].notna()
monolit = df.loc[(df['material'] == 'монолитный') & has_year]
panel = df.loc[(df['material'] == 'панельный') & has_year]
brick = df.loc[(df['material'] == 'кирпичный') & has_year]
monolit_brick = df.loc[(df['material'] == 'монолитно-кирпичный') & has_year]
block = df.loc[(df['material'] == 'блочный') & has_year]

# Group means are computed once, not once per row.
year_mean_by_material = {
    'монолитный': monolit['year'].mean(),
    'панельный': panel['year'].mean(),
    'кирпичный': brick['year'].mean(),
    'блочный': block['year'].mean(),
}
# Any other material (including a missing one) falls back to this mean.
fallback_year = monolit_brick['year'].mean()

missing_year = ~has_year
if missing_year.any():
    # Single .loc write instead of per-row chained assignment, which is not
    # guaranteed to modify `df`.
    df.loc[missing_year, 'year'] = (
        df.loc[missing_year, 'material']
        .map(lambda material: year_mean_by_material.get(material, fallback_year))
        .astype(float)
    )

# Truncates the fractional part of the imputed means.
df['year'] = df['year'].astype(int)


# Ratio of the column means (missing values are skipped by `mean`).
kitchen_coef = df['flat_area'].mean() / df['kitchen_area'].mean()

# Estimate a kitchen area from the flat area, rounded to one decimal and never
# below 5. The estimate stays numeric: the previous `format(...)` call produced
# a string that was written into the numeric column and then compared with 5.
estimated_kitchen = (df['flat_area'] / kitchen_coef).round(1).clip(lower=5)

# Only missing kitchen areas are replaced; existing values are left untouched.
df['kitchen_area'] = df['kitchen_area'].fillna(estimated_kitchen)


# Missing balcony values become 'нет'; missing metro values become '6–10 мин.'.
df['balcony'] = df['balcony'].fillna('нет')
df['metro'] = df['metro'].fillna('6–10 мин.')


# Fill missing `condition` values with a rule based on distance, year and price:
#   coords < 10 and 2000 < year < 2020 and price above the mean -> 'евро'
#   coords < 10 otherwise                                       -> 'косметический'
#   everything else                                             -> 'требует ремонта'
# NOTE(review): the rule reads `price`, which is later used as the model target.
mean_price = df['price'].mean()  # computed once instead of once per row

near_center = df['coords'] < 10
newer_and_pricey = (
    (df['year'] > 2000) & (df['year'] < 2020) & (df['price'] > mean_price)
)
inferred_condition = pd.Series(
    np.select(
        [near_center & newer_and_pricey, near_center],
        ['евро', 'косметический'],
        default='требует ремонта',
    ),
    index=df.index,
)

# Single .loc write; only rows whose condition is missing are touched.
missing_condition = df['condition'].isna()
df.loc[missing_condition, 'condition'] = inferred_condition[missing_condition]

df.sample(7)

Unnamed: 0,coords,rooms_count,year,house_floors,material,floor,flat_area,kitchen_area,balcony,metro,condition,price
2829,8.754,2,1971,14.0,блочный,5.0,38.0,6.0,лоджия,11–15 мин.,евро,12500000
14803,3.045,5,1961,19.0,кирпичный,13.0,205.0,20.0,балкон,6–10 мин.,требует ремонта,105000000
12264,13.069,9,2006,6.0,монолитный,4.0,528.5,20.0,балкон,от 31 мин.,косметический,495000000
8142,10.56,студия,1982,22.0,панельный,1.0,14.8,5.0,нет,11–15 мин.,косметический,2940000
14191,4.276,4,2011,65.0,монолитный,11.0,129.8,19.5,нет,6–10 мин.,евро,128541600
3750,12.537,студия,1957,5.0,монолитно-кирпичный,1.0,24.0,5.0,нет,16–20 мин.,дизайнерский,7190000
881,21.167,студия,1995,17.0,панельный,2.0,19.8,5.0,балкон,6–10 мин.,требует ремонта,4800000


In [None]:
# Correlation matrix of the numeric columns, colour-coded and shown with two
# decimals. Numeric columns are selected explicitly because `corr()` raises on
# string columns in recent pandas, and `Styler.format` replaces the removed
# `Styler.set_precision`.
(
    df.select_dtypes(include='number')
    .corr()
    .style
    .background_gradient(cmap='coolwarm')
    .format('{:.2f}')
)

Unnamed: 0,coords,year,house_floors,floor,flat_area,kitchen_area,price
coords,1.0,0.06,-0.01,-0.12,-0.43,-0.34,-0.45
year,0.06,1.0,0.39,0.33,0.23,0.27,0.19
house_floors,-0.01,0.39,1.0,0.81,0.16,0.18,0.02
floor,-0.12,0.33,0.81,1.0,0.28,0.26,0.13
flat_area,-0.43,0.23,0.16,0.28,1.0,0.65,0.75
kitchen_area,-0.34,0.27,0.18,0.26,0.65,1.0,0.51
price,-0.45,0.19,0.02,0.13,0.75,0.51,1.0


In [None]:
# Numeric feature columns.
num_cols = ['coords', 'year', 'house_floors',
            'floor', 'flat_area', 'kitchen_area']

# Categorical feature columns (passed to CatBoost as `cat_features`).
cat_cols = ['rooms_count', 'material', 'balcony', 'metro', 'condition']

# Regression target.
target_col = df['price']

In [None]:
# Remove the target from the feature frame by name rather than by position:
# a positional `iloc[:, :-1]` drops a real feature if this cell is run twice or
# if the column order changes. `errors='ignore'` makes a re-run a no-op.
df = df.drop(columns=['price'], errors='ignore')

# Обучение

In [None]:
# Fixed seed so the split, and therefore the reported score, is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df, target_col, random_state=42
)

In [None]:
# CatBoost regressor; the columns listed in `cat_cols` are declared categorical.
boosting_model = CatBoostRegressor(cat_features=cat_cols)

In [None]:
# Hyper-parameter grid: every combination of the listed values is tried.
params = dict(
    depth=[6, 7, 8, 9, 10],
    learning_rate=[0.01, 0.02, 0.03, 0.04],
    iterations=[100, 200, 300],
)

# Exhaustive search over `params`; n_jobs=-1 requests all available cores.
grid_model = GridSearchCV(boosting_model, params, n_jobs=-1)

grid_model.fit(X_train, y_train)

In [None]:
# Score the tuned model on the held-out part of the data.
pred = grid_model.predict(X_test)
r2 = r2_score(y_test, pred)
print(f'R2: {r2:.2f}')

R2: 0.85


In [None]:
# Persist the tuned model. `pickle.dump` needs the object to save as well as
# the file; the original call passed only the file handle and raised TypeError.
# NOTE(review): this stores the refit best estimator found by the grid search;
# dump `grid_model` instead if consumers need the whole search object.
filename = 'lct_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(grid_model.best_estimator_, model_file)