In [113]:
# загрузка
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt
import lightgbm as lgb

data = pd.read_csv('flatprices_train.csv',sep=';')
test = pd.read_csv('flatprices_test.csv',sep=';')

data = data.drop_duplicates(subset=["price", "longitude", "latitude", "total_square", "floor_number"])

In [114]:
# создание и удаление признаков
labelEncoder = LabelEncoder()
data['source_encoded'] = labelEncoder.fit_transform(data['source']) # - хуже если их убрать(521к)
test['source_encoded'] = labelEncoder.fit_transform(test['source']) # - хуже если их убрать(521к)
data['age'] = 2023 - data['exploitation_start_year']
test['age'] = 2023 - test['exploitation_start_year']
data = data.drop(['created_at','source'], axis=1)
test = test.drop(['created_at','source'], axis=1)

In [115]:
# предобработка признаков
# работа с выбросами
data.loc[data[data['floor_number'] == 89].index, 'floor_number'] = 6
data.loc[data['floor_number'] < 1, 'floor_number'] = 6
indexes_to_remove = data[data['exploitation_start_year'] < 1900].index
data = data.drop(indexes_to_remove)
data.loc[data['total_square'] > 500, 'total_square'] = data['price'].mean().round(0).astype('int32')
data.loc[data['total_square'] > 500, 'total_square'] = 62.3
data.loc[data['number_of_rooms'] > 6,'number_of_rooms'] = 6
data.loc[data['age'] > 90, 'age'] = data['age'].quantile(0.75)
data.loc[data['floor_number'] < 1, 'floor_number'] = 6
test.loc[test['number_of_rooms'] > 6,'number_of_rooms'] = 2

In [117]:
# убираем выбросы таргета
# Вычисляем границы
lower_bound = data['price'].quantile(0.001)
upper_bound = data['price'].quantile(0.999)

# Заменяем выбросы на среднее значение датафрейма
data.loc[data['price'] < lower_bound, 'price'] = data['price'].mean().round(0).astype('int32')
data.loc[data['price'] > upper_bound, 'price'] = data['price'].mean().round(0).astype('int32')

In [119]:
# preprocessing
all_features = data.drop(['price','id','exploitation_start_year'],axis = 1).columns
target = ['price']

#   Для непрерывных - применяем SimpleImputer, а затем StandardScaler
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("scaler", StandardScaler())
])

# Собираем воедино трансформеры для признаков
data_transformer = ColumnTransformer(transformers=[
    ("numerical", numerical_transformer, all_features) ])

# Создание конвейера препроцессора, который сначала преобразует данные
preprocessor = Pipeline(steps=[("data_transformer", data_transformer)])

In [120]:
# stacking_regression
gradient_params = {'alpha': 0.1,     'learning_rate':  0.2, 'subsample': 0.8,      'loss': 'huber',
                   'max_depth': 7,   'min_samples_leaf': 1,'min_samples_split': 10,'n_estimators': 200 }
forest_params =   {'max_depth': None,'min_samples_leaf': 1,'min_samples_split': 2, 'n_estimators': 200 }


clf1 = RandomForestRegressor(**forest_params)
clf2 = GradientBoostingRegressor(**gradient_params)
clf3 = lgb.LGBMRegressor()

estimators=[("random_forest",     clf1),
            ("gradient_boosting", clf2),
            ("lgbm_regressor",    clf3),]

stacking_regression = StackingRegressor(estimators=estimators, final_estimator = clf2, n_jobs=-1, verbose=True)

In [None]:
# тест модели(fit)
regression_pipeline = Pipeline(
steps=[("preprocessor", preprocessor), 
       ("regression", stacking_regression)]) 

x_train, x_test, y_train, y_test = train_test_split(data[all_features], data[target], test_size=0.2, random_state=42)
regression_pipeline.fit(x_train[all_features], y_train[target])

preds_train = regression_pipeline.predict(x_train[all_features])
preds_test =  regression_pipeline.predict(x_test[all_features])

a,b = round(sqrt(mean_squared_error(y_train[target], preds_train)),0), round(mean_absolute_error(y_train[target], preds_train),0)
c,d = round(sqrt(mean_squared_error(y_test[target],  preds_test)),0),  round(mean_absolute_error(y_test[target],  preds_test),0)

print(f'On train: MSE: {a}, MAE: {b}')
print(f'On test:  MSE: {c}, MAE: {d}')

In [121]:
# финальная модель(fit)
regression_pipeline = Pipeline(
steps=[("preprocessor", preprocessor), 
       ("regression", stacking_regression)]) 

x_train, x_test, y_train = data[all_features], test[all_features], data[target]

regression_pipeline.fit(x_train[all_features],  np.ravel(y_train[target]))

In [122]:
# выгрузка
submission = pd.DataFrame({
    'id': test['id'],
    'price': regression_pipeline.predict(x_test[all_features])
})
submission['price'] = submission['price'].round(0).astype('int32')
submission.to_csv('submission_13_11_6.csv', index=False)
submission.head(20)

Unnamed: 0,id,price
0,574026,2852484
1,589513,6316055
2,578919,3817092
3,589862,4400957
4,576247,3290718
5,596438,4998219
6,575882,7118216
7,589621,4245568
8,589552,4892570
9,587280,5095004
