In [None]:
import json
import pickle

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

In [75]:
# Load the listings dataset, parsing the "date" column as datetimes.
# The path uses a forward slash so it resolves on Windows as well as on
# POSIX systems; a backslash separator only works on Windows.
data = pd.read_csv("data/all_v2.csv", sep=",", parse_dates=["date"])

In [76]:
# Preview the first five rows of the raw dataset.
data.head(n=5)

Unnamed: 0,price,date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
0,6050000,2018-02-19,20:00:21,59.805808,30.376141,2661,1,8,10,3,82.6,10.8,1
1,8650000,2018-02-27,12:04:54,55.683807,37.297405,81,3,5,24,2,69.1,12.0,1
2,4000000,2018-02-28,15:44:00,56.29525,44.061637,2871,1,5,9,3,66.0,10.0,1
3,1850000,2018-03-01,11:24:52,44.996132,39.074783,2843,4,12,16,2,38.0,5.0,11
4,5450000,2018-03-01,17:42:43,55.918767,37.984642,81,3,13,14,2,60.0,10.0,1


In [77]:
# Dimensions of the raw dataset as (rows, columns).
data.shape

(5477006, 13)

In [78]:
# Count the missing values in every column.
data.isna().sum()

price            0
date             0
time             0
geo_lat          0
geo_lon          0
region           0
building_type    0
level            0
levels           0
rooms            0
area             0
kitchen_area     0
object_type      0
dtype: int64

In [79]:
# Keep only the rows dated 2021, because the full dataset is very large.
df = data.loc[data["date"].dt.year.eq(2021)]

In [80]:
# Preview the first five rows of the 2021 subset.
df.head(n=5)

Unnamed: 0,price,date,time,geo_lat,geo_lon,region,building_type,level,levels,rooms,area,kitchen_area,object_type
4909898,17000000,2021-01-01,00:01:54,55.803096,37.557713,3,4,2,5,3,78.0,8.4,1
4909899,14093440,2021-01-01,00:03:48,55.815317,37.426025,3,2,9,24,2,60.8,21.3,11
4909900,3800000,2021-01-01,00:05:07,54.744539,20.484014,7896,3,14,17,1,43.0,10.0,1
4909901,9428200,2021-01-01,00:06:26,55.880344,37.501306,3,2,26,33,2,59.0,19.3,11
4909902,21339360,2021-01-01,00:06:29,55.812833,37.601419,3,2,24,32,4,116.8,17.1,11


In [81]:
# Columns used as model inputs; everything else in the frame is left out.
cols = [
    "region",
    "building_type",
    "level",
    "levels",
    "rooms",
    "area",
    "kitchen_area",
    "object_type",
]

In [82]:
# Separate the feature matrix from the price target, then preview the features.
X, y = df[cols], df["price"]
X.head(n=5)

Unnamed: 0,region,building_type,level,levels,rooms,area,kitchen_area,object_type
4909898,3,4,2,5,3,78.0,8.4,1
4909899,3,2,9,24,2,60.8,21.3,11
4909900,7896,3,14,17,1,43.0,10.0,1
4909901,3,2,26,33,2,59.0,19.3,11
4909902,3,2,24,32,4,116.8,17.1,11


In [84]:
# Split into training and test sets (80% / 20%); the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [85]:
# Fit the standardiser on the training features only, so the test set plays
# no part in choosing the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [86]:
# Baseline: linear regression trained on the standardised features.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Score the predictions on the held-out test set.
y_pred = model.predict(X_test_scaled)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"MSE на тесте: {mse:.3f}")
print(f"R2 на тесте: {r2:.3f}")
print(f"Score: {model.score(X_test_scaled,y_test):.3f}")


MSE на тесте: 103053190346725.891
R2 на тесте: 0.172
Score: 0.172


In [87]:
# XGBoost regressor with hand-picked hyperparameters, trained on the
# unscaled features.
model = xgb.XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
model.fit(X_train, y_train)

# Predict on the test set and report mean squared error and R2.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"MSE на тесте: {mse:.3f}")
print(f"R2 на тесте: {r2:.3f}")
print(f"Score: {model.score(X_test,y_test):.3f}")

MSE на тесте: 65231812296704.000
R2 на тесте: 0.476
Score: 0.476


In [19]:
# Base estimator whose hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to search exhaustively.
param_grid = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
}

# 3-fold cross-validated search scored by negative mean squared error,
# with n_jobs=-1 to run the fits in parallel.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the best estimator on the held-out test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"MSE на тесте: {mse:.3f}")
print(f"R2 на тесте: {r2:.3f}")
print(f"Score: {best_model.score(X_test,y_test):.3f}")

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best parameters found:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 1.0}
MSE на тесте: 58275177431040.000
R2 на тесте: 0.532
Score: 0.532


In [88]:
# Rebuild the regressor with fixed hyperparameters so the model can be
# trained without running a search first.
best_model = xgb.XGBRegressor(
    random_state=42,
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=7,
    min_child_weight=3,
    n_estimators=200,
    subsample=1.0,
)
best_model.fit(X_train, y_train)

# Predict on the test set and report mean squared error and R2.
y_pred = best_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"MSE на тесте: {mse:.3f}")
print(f"R2 на тесте: {r2:.3f}")
print(f"Score: {best_model.score(X_test,y_test):.3f}")

MSE на тесте: 58275177431040.000
R2 на тесте: 0.532
Score: 0.532


In [89]:
# Check the shape of one test row once it is reshaped into a single-row 2-D array.
X_test.iloc[0].to_numpy().reshape(1, -1).shape

(1, 8)

In [90]:
# Show the first training row as a plain NumPy array.
X_train.iloc[0, :].to_numpy()

array([ 3. ,  1. ,  2. ,  9. ,  2. , 45.3,  6.8,  1. ])

In [40]:
# Random forest regressor with 100 trees and a fixed seed, for comparison.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

# Predict on the test set and report mean squared error and R2.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"MSE на тесте: {mse:.3f}")
print(f"R2 на тесте: {r2:.3f}")
print(f"Score: {model.score(X_test,y_test):.3f}")

MSE на тесте: 139569499615519.969
R2 на тесте: 0.186
Score: 0.186


In [91]:
# Predict the price for one held-out listing and return it as a Python float.
# Selecting with a list (iloc[[1]]) keeps the row as a one-row DataFrame, so
# the column labels travel with the values instead of being dropped by a
# NumPy round-trip that relies purely on positional column order.
best_model.predict(X_test.iloc[[1]])[0].item()

3993094.25

In [92]:
# Serialise the tuned model to disk with pickle.
# `pickle` is imported in the notebook's top import cell rather than here.
with open("russian_home_prices_model.pickle", "wb") as f:
    pickle.dump(best_model, f)

In [93]:
# Save the lower-cased feature names, in the same order as the columns of X,
# to a JSON file. `json` is imported in the notebook's top import cell.
columns = {"data_columns": [col.lower() for col in X.columns]}
with open("columns.json", "w", encoding="utf-8") as f:
    # json.dump streams straight to the file handle; no intermediate string.
    json.dump(columns, f)