In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:


import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LinearRegression

from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Use a larger default font for every figure in the notebook.
matplotlib.rcParams['font.size'] = 14

In [None]:
def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):
    """Report R2 for the train and test samples and plot predictions against truth.

    Prints both scores rounded to three decimals, then shows one figure with
    two scatter plots (train on the left, test on the right) where x is the
    predicted value and y is the true value.
    """
    train_score = round(r2(train_true_values, train_pred_values), 3)
    print(f"Train R2:\t{train_score}")
    test_score = round(r2(test_true_values, test_pred_values), 3)
    print(f"Test R2:\t{test_score}")

    fig, axes = plt.subplots(1, 2, figsize=(18, 10))
    panels = (
        (axes[0], train_pred_values, train_true_values, 'Train sample prediction'),
        (axes[1], test_pred_values, test_true_values, 'Test sample prediction'),
    )
    for ax, predicted, actual, title in panels:
        sns.scatterplot(x=predicted, y=actual, ax=ax)
        ax.set_xlabel('Predicted values')
        ax.set_ylabel('True values')
        ax.set_title(title)

    plt.show()

In [None]:
# Relative paths to the competition CSV files read by the cells below.
TRAIN_DATASET_PATH = '../input/real-estate-price-prediction-moscow/train.csv'
TEST_DATASET_PATH = '../input/real-estate-price-prediction-moscow/test.csv'

### 1.<b> Считывание данных

In [None]:
# Load the raw train and test tables; train_df feeds the exploratory cells that follow.
train_df = pd.read_csv(TRAIN_DATASET_PATH)
test_df = pd.read_csv(TEST_DATASET_PATH)

### <b>Square

### Т.к. у нас в площади квартиры были значения, меньшие 18 кв.м. и большие 100 кв.м., то их было решено заменить на минимально адекватные (для меньших) и медианные (для больших). Это позволило снизить разность между мат.ожиданием и медианой на ~ 50%.

### <b> KitchenSquare



### Приступаем сначала к анализу признака KitchenSquare, потому что с помощью него будем восстанавливать пропущенные значения в признаке LifeSquare. Значения, превышающие 0.975 квантиль заменяем на медиану, а значения, меньшие 3 кв.м. заменяем на 3 кв.м.

### <b>HouseFloor, Floor


### Выбросами у признака HouseFloor будем считать значения, равные нулю и будем заполнять их медианой.
### Выбросами у признака Floor будем считать такие значения, этаж квартиры которых больше этажности дома. Эти значения будем заменять на случайное число в диапазоне от 1 до x, где x - кол-во этажей в доме.
### В результате такой обработки мат.ожидание Floor увеличилось на 4%.
### Медиана и мат.ожидание признака HouseFloor сдвинулись вправо.


### <b>HouseYear

### Значения, большие 2020 г. постройки — заменим на 2020.
### Значения, меньшие 1800 г. постройки — заменим на медиану.

### 3.<b>  Обработка пропусков

### <b>LifeSquare

### Пропуски этого признака можно вычислить как разность между Square и KitchenSquare, если для соответствующего NaN есть значения Square и KitchenSquare. Т.к. пропусков в полях Square и KitchenSquare нет, то мы сможем восстановить все поля, но сначала оценим СКО такой замены по имеющимся данным.

In [None]:
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Baseline check on rows with a known LifeSquare: how far is
# Square - KitchenSquare from the recorded living area?
cond = train_df['LifeSquare'].notna()
known_rows = train_df.loc[cond]
pred = known_rows['Square'] - known_rows['KitchenSquare']
rmse = mse(known_rows['LifeSquare'], pred) ** 0.5
print(f'RMSE : {rmse}')

In [None]:
# Mean of the recorded LifeSquare values, for comparison with the RMSE above.
life_square_mean = train_df['LifeSquare'].mean()
print(f"M(X) : {life_square_mean}")

### Среднее значение LifeSquare ~ 37, а СКО при такой оценке LifeSquare больше среднего ~ в 2,5 раза. Попробуем придумать другую замену.
### Посчитаем средний процент разности между Square и LifeSquare + KitchenSquare.

In [None]:
# Share of Square that is neither living area nor kitchen, on rows with a known LifeSquare.
cond = train_df['LifeSquare'].notna()
known_rows = train_df.loc[cond]
accounted_area = known_rows['LifeSquare'] + known_rows['KitchenSquare']
sub_1 = (known_rows['Square'] - accounted_area) / known_rows['Square']
sub_1.describe()

### т.е. в среднем мы ошибаемся ~ на 18%. Почистим данные от выбросов, чтобы посмотреть как изменится средняя ошибка в %.


In [None]:
# Flag and repair LifeSquare outliers on rows where the value is known (cond),
# then recompute the unexplained share of Square as sub_2.
train_df['LifeSquare_outlier'] = 0

# Outlier rules: below 10, or above the 0.975 quantile of the known values.
cond_1 = train_df['LifeSquare'] < 10
cond_2 = train_df['LifeSquare'] > train_df.loc[cond, 'LifeSquare'].quantile(q = .975)

train_df.loc[cond & (cond_1 | cond_2), 'LifeSquare_outlier'] = 1
train_df.loc[cond & cond_1, 'LifeSquare'] = 10
# The median is taken after the small values were raised to 10 on the previous line.
train_df.loc[cond & cond_2, 'LifeSquare'] = train_df['LifeSquare'].median()

sub_2 = (train_df.loc[cond, 'Square'] - (train_df.loc[cond, 'LifeSquare'] + train_df.loc[cond, 'KitchenSquare']))/train_df.loc[cond, 'Square']
sub_2.describe()

### Такие замены привели к увеличению мат.ожидания ~ на 16%, но к значительному уменьшению СКО, что можно воспринимать как то, что такая замена для LifeSquare стала более точной.
### Т.к. разность между Square и (LifeSquare + KitchenSquare) имеет тенденцию сохраняться ~ на уровне 20% от Square, то продолжим её сохранять.

In [None]:
# Fill the missing LifeSquare values with 0.8 * Square - KitchenSquare,
# then compare the three area features on box plots.
missing = ~cond
train_df.loc[missing, 'LifeSquare'] = (
    0.8 * train_df.loc[missing, 'Square'] - train_df.loc[missing, 'KitchenSquare']
)

area_columns = ['LifeSquare', 'Square', 'KitchenSquare']
train_df[area_columns].plot(kind='box', subplots=True, figsize=(15, 7))
plt.subplots_adjust(wspace=0.5)

### <b>Попытка восстановить пропущенные значения признака LifeSquare при помощи моделей ML

<b> Для восстановления пропущенных значений было использовано 4 метода: LinearRegression, RandomForest, SVR, GBR. <br>Сначала данные из обучающего датасета были очищены от выбросов по формулам<br> LifeSquare = 0.8*Square - KitchenSquare (для значений LifeSquare, меньших 10)<br>
и<br>LifeSquare = LifeSquare.median (для значений, превышающих 0.975 квантиль)<br>
Далее данные были разбиты на 3 датасета: X_train, X_valid и LS_nan.
Ниже представлены лучшие результаты среднего значения R2 на кросс-валидации с 3 фолдами для выбранных методов:<br>
LR: 0.6875<br>
RF: 0.7746<br>
SVR: 0.1523<br>
GBR: 0.7808<br>
Кандидатами для восстановления пропущенных значений стали GBR и RF.
Далее я объединил X_train и X_valid для прогнозирования пропусков.<br>
Ниже представлены лучшие результаты среднего значения R2 на кросс-валидации с 3 фолдами для отобранных методов.<br>
GBR: 0.7826<br>
RF: 0.7898<br>
Далее была предсказана целевая переменная Price. Наиболее высокую метрику показала следующая комбинация:<br>
Восстановление пропусков LifeSquare по GBR, предсказание целевой переменной при помощи GBR. Среднее значение R2 на CV = 0.7478<br>
Тем не менее, эта работа не привела к значительному увеличению среднего значения на этапе валидации — выигрыш по сравнению с дефолтной обработкой LifeSquare составил ~ 5%, но это всё-таки лучше.


In [None]:
# Recorded cross-validation results for the Price model, one table per
# LifeSquare imputation method (RF-imputed and GBR-imputed).
_METRIC_LABELS = ['R2_mean', 'D(X)', 'RMSE(X)']


def _results_table(rf_scores, gbr_scores):
    """Build a Metrics / RF / GBR table from two score lists."""
    rows = list(zip(_METRIC_LABELS, rf_scores, gbr_scores))
    return pd.DataFrame(rows, columns=['Metrics', 'RF', 'GBR'])


df_results_NAN_RF = _results_table([0.7458, 0.00045, 0.0212], [0.7475, 0.00017, 0.0129])

df_results_NAN_GBR = _results_table([0.7454, 0.00043, 0.0208], [0.7478, 0.00018, 0.0134])

### <b>Data Preprocessing for LifeSquare

In [None]:
class DataPreprocessing_for_LS:
    """Clean raw features before training the LifeSquare imputation model.

    ``fit`` stores column medians and the 0.975 quantile of KitchenSquare.
    ``transform`` flags outliers in ``<feature>_outlier`` columns, replaces
    them, drops ``Healthcare_1`` and fills remaining gaps with the medians.
    """

    def __init__(self):
        """Create an unfitted preprocessor; the statistics are set by ``fit``."""
        self.medians = None
        self.kitchen_square_quantile = None

    def fit(self, X):
        """Store the statistics of ``X`` that ``transform`` relies on."""
        # numeric_only=True: the frame also carries string columns (the 'A'/'B'
        # flags), on which DataFrame.median raises in pandas >= 2.0.
        self.medians = X.median(numeric_only=True)
        self.kitchen_square_quantile = X['KitchenSquare'].quantile(.975)

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched."""
        X = X.copy()

        # Rooms: 0 becomes 1, six or more becomes the median.
        X['Rooms_outlier'] = 0
        X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1
        X.loc[X['Rooms'] == 0, 'Rooms'] = 1
        X.loc[X['Rooms'] >= 6, 'Rooms'] = self.medians['Rooms']

        # Square: below 18 is raised to 18, above 100 becomes the median.
        X['Square_outlier'] = 0
        X.loc[(X['Square'] < 18.0) | (X['Square'] > 100.0), 'Square_outlier'] = 1
        X.loc[X['Square'] < 18.0, 'Square'] = 18.0
        X.loc[X['Square'] > 100.0, 'Square'] = self.medians['Square']

        # KitchenSquare: above the fitted quantile becomes the median, below 3 becomes 3.
        X['KitchenSquare_outlier'] = 0
        X.loc[(X['KitchenSquare'] > self.kitchen_square_quantile) | (X['KitchenSquare'] < 3), 'KitchenSquare_outlier'] = 1
        X.loc[X['KitchenSquare'] > self.kitchen_square_quantile, 'KitchenSquare'] = self.medians['KitchenSquare']
        X.loc[X['KitchenSquare'] < 3, 'KitchenSquare'] = 3

        # HouseFloor, Floor: flags are computed on the raw values.
        X['HouseFloor_outlier'] = 0
        X['Floor_outlier'] = 0
        X.loc[X['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1
        X.loc[X['Floor'] > X['HouseFloor'], 'Floor_outlier'] = 1

        X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']

        # A flat above the top floor gets a random floor within the house.
        floor_outliers = X.loc[X['Floor'] > X['HouseFloor']].index
        if len(floor_outliers) > 0:
            # int(): HouseFloor may be stored as float, and random.randint
            # rejects float bounds on Python >= 3.12.
            X.loc[floor_outliers, 'Floor'] = X.loc[floor_outliers, 'HouseFloor'].apply(
                lambda x: random.randint(1, int(x))
            )

        # HouseYear: 4968 is treated as a typo for 1968, future years are
        # capped at the current year, years before 1800 become the median.
        current_year = datetime.now().year
        X['HouseYear_outlier'] = 0
        X.loc[(X['HouseYear'] == 4.968000e+03) | (X['HouseYear'] > current_year) | (X['HouseYear'] < 1800), 'HouseYear_outlier'] = 1
        X.loc[X['HouseYear'] == 4.968000e+03, 'HouseYear'] = 1968
        X.loc[X['HouseYear'] > current_year, 'HouseYear'] = current_year
        X.loc[X['HouseYear'] < 1800, 'HouseYear'] = self.medians['HouseYear']

        # Healthcare_1 is dropped entirely.
        if 'Healthcare_1' in X.columns:
            X = X.drop('Healthcare_1', axis=1)

        # Default: any remaining gap takes the fitted column median.
        X = X.fillna(self.medians)

        return X

### <b> Feature Generator for LS

In [None]:
class FeatureGenetator_for_LS():
    """Build extra features for the model that predicts LifeSquare.

    ``fit`` learns district sizes, the floor and year bin maxima and, when a
    target is given, median LifeSquare per (DistrictId, Rooms) and per
    (year_cat, floor_cat). ``transform`` adds those features to a copy of the
    input frame.
    """

    def __init__(self):
        self.DistrictId_counts = None
        self.binary_to_numbers = None
        self.med_life_square_by_district = None
        self.med_life_square_by_district_median = None
        self.med_life_square_by_floor_year = None
        self.med_life_square_by_floor_year_median = None
        self.house_year_max = None
        self.floor_max = None
        self.district_size = None
        self.district_size_median = None

    def fit(self, X, y=None):
        """Learn the statistics from ``X`` and, if ``y`` is given, the target encodings."""
        X = X.copy()

        # Binary features
        self.binary_to_numbers = {'A': 0, 'B': 1}

        # DistrictSize: number of rows per district. rename_axis/reset_index(name=)
        # yields the columns ['DistrictId', 'DistrictSize'] on every pandas
        # version, whereas renaming the default column names breaks on
        # pandas >= 2.0.
        self.district_size = (
            X['DistrictId'].value_counts()
            .rename_axis('DistrictId')
            .reset_index(name='DistrictSize')
        )
        self.district_size_median = self.district_size['DistrictSize'].median()

        # Bin maxima are needed by transform even when no target is supplied.
        self.floor_max = X['Floor'].max()
        self.house_year_max = X['HouseYear'].max()

        # Target encoding
        if y is not None:
            df = X.copy()
            df['LifeSquare'] = y.values

            # District, Rooms
            self.med_life_square_by_district = (
                df.groupby(['DistrictId', 'Rooms'], as_index=False)
                .agg({'LifeSquare': 'median'})
                .rename(columns={'LifeSquare': 'MedLifeSquareByDistrict'})
            )
            self.med_life_square_by_district_median = \
                self.med_life_square_by_district['MedLifeSquareByDistrict'].median()

            # floor, year
            df = self.floor_to_cat(df)
            df = self.year_to_cat(df)

            self.med_life_square_by_floor_year = (
                df.groupby(['year_cat', 'floor_cat'], as_index=False)
                .agg({'LifeSquare': 'median'})
                .rename(columns={'LifeSquare': 'MedLifeSquareByFloorYear'})
            )
            self.med_life_square_by_floor_year_median = \
                self.med_life_square_by_floor_year['MedLifeSquareByFloorYear'].median()

    def transform(self, X):
        """Return a copy of ``X`` with the generated features added.

        The merges produce a fresh 0..n-1 index; row order is preserved.
        """
        # Work on a copy so the caller's frame is not modified and the method
        # can be called again on the same input.
        X = X.copy()

        # Binary features: 'A' -> 0, 'B' -> 1
        for column in ('Ecology_2', 'Ecology_3', 'Shops_2'):
            X[column] = X[column].map(self.binary_to_numbers)

        # DistrictSize, IsDistrictLarge (districts unseen in fit are NaN here, hence 0)
        X = X.merge(self.district_size, on='DistrictId', how='left')
        X['IsDistrictLarge'] = (X['DistrictSize'] > 100).astype(int)

        # More categorical features
        X = self.floor_to_cat(X)  # adds floor_cat
        X = self.year_to_cat(X)   # adds year_cat

        # Target encoding; unseen groups take the median of the group medians.
        if self.med_life_square_by_district is not None:
            X = X.merge(self.med_life_square_by_district, on=['DistrictId', 'Rooms'], how='left')
            X['MedLifeSquareByDistrict'] = X['MedLifeSquareByDistrict'].fillna(
                self.med_life_square_by_district_median)

        if self.med_life_square_by_floor_year is not None:
            X = X.merge(self.med_life_square_by_floor_year, on=['year_cat', 'floor_cat'], how='left')
            X['MedLifeSquareByFloorYear'] = X['MedLifeSquareByFloorYear'].fillna(
                self.med_life_square_by_floor_year_median)

        # Default: unseen districts take the median district size.
        X['DistrictSize'] = X['DistrictSize'].fillna(self.district_size_median)

        return X

    def floor_to_cat(self, X):
        """Add ``floor_cat`` to ``X`` in place: the Floor bin index, -1 if outside all bins."""
        bins = [0, 3, 5, 9, 15, self.floor_max]
        X['floor_cat'] = pd.cut(X['Floor'], bins=bins, labels=False).fillna(-1)
        return X

    def year_to_cat(self, X):
        """Add ``year_cat`` to ``X`` in place: the HouseYear bin index, -1 if outside all bins."""
        bins = [0, 1941, 1945, 1980, 2000, 2010, self.house_year_max]
        X['year_cat'] = pd.cut(X['HouseYear'], bins=bins, labels=False).fillna(-1)
        return X
    
    

### <b> Data Preprocessing

In [None]:
class DataPreprocessing:
    """Clean raw features before training the Price model.

    ``fit`` stores column medians and the 0.975 quantiles of KitchenSquare and
    of the known LifeSquare values. ``transform`` flags outliers in
    ``<feature>_outlier`` columns, replaces them, drops ``Healthcare_1``,
    imputes LifeSquare and fills remaining gaps with the medians.
    """

    def __init__(self):
        """Create an unfitted preprocessor; the statistics are set by ``fit``."""
        self.medians = None
        self.kitchen_square_quantile = None
        self.life_square_quantile = None

    def fit(self, X):
        """Store the statistics of ``X`` that ``transform`` relies on."""
        # numeric_only=True: the frame also carries string columns (the 'A'/'B'
        # flags), on which DataFrame.median raises in pandas >= 2.0.
        self.medians = X.median(numeric_only=True)
        self.kitchen_square_quantile = X['KitchenSquare'].quantile(.975)
        self.life_square_quantile = X.loc[~X['LifeSquare'].isna(), 'LifeSquare'].quantile(.975)

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is left untouched."""
        X = X.copy()

        # Rooms: 0 becomes 1, six or more becomes the median.
        X['Rooms_outlier'] = 0
        X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1
        X.loc[X['Rooms'] == 0, 'Rooms'] = 1
        X.loc[X['Rooms'] >= 6, 'Rooms'] = self.medians['Rooms']

        # Square: below 18 is raised to 18, above 100 becomes the median.
        X['Square_outlier'] = 0
        X.loc[(X['Square'] < 18.0) | (X['Square'] > 100.0), 'Square_outlier'] = 1
        X.loc[X['Square'] < 18.0, 'Square'] = 18.0
        X.loc[X['Square'] > 100.0, 'Square'] = self.medians['Square']

        # KitchenSquare: above the fitted quantile becomes the median, below 3 becomes 3.
        X['KitchenSquare_outlier'] = 0
        X.loc[(X['KitchenSquare'] > self.kitchen_square_quantile) | (X['KitchenSquare'] < 3), 'KitchenSquare_outlier'] = 1
        X.loc[X['KitchenSquare'] > self.kitchen_square_quantile, 'KitchenSquare'] = self.medians['KitchenSquare']
        X.loc[X['KitchenSquare'] < 3, 'KitchenSquare'] = 3

        # HouseFloor, Floor: flags are computed on the raw values.
        X['HouseFloor_outlier'] = 0
        X['Floor_outlier'] = 0
        X.loc[X['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1
        X.loc[X['Floor'] > X['HouseFloor'], 'Floor_outlier'] = 1

        X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']

        # A flat above the top floor gets a random floor within the house.
        floor_outliers = X.loc[X['Floor'] > X['HouseFloor']].index
        if len(floor_outliers) > 0:
            # int(): HouseFloor may be stored as float, and random.randint
            # rejects float bounds on Python >= 3.12.
            X.loc[floor_outliers, 'Floor'] = X.loc[floor_outliers, 'HouseFloor'].apply(
                lambda x: random.randint(1, int(x))
            )

        # HouseYear: 4968 is treated as a typo for 1968, future years are
        # capped at the current year, years before 1800 become the median.
        current_year = datetime.now().year
        X['HouseYear_outlier'] = 0
        X.loc[(X['HouseYear'] == 4.968000e+03) | (X['HouseYear'] > current_year) | (X['HouseYear'] < 1800), 'HouseYear_outlier'] = 1
        X.loc[X['HouseYear'] == 4.968000e+03, 'HouseYear'] = 1968
        X.loc[X['HouseYear'] > current_year, 'HouseYear'] = current_year
        X.loc[X['HouseYear'] < 1800, 'HouseYear'] = self.medians['HouseYear']

        # Healthcare_1 is dropped entirely.
        if 'Healthcare_1' in X.columns:
            X = X.drop('Healthcare_1', axis=1)

        # LifeSquare
        X['LifeSquare_outlier'] = 0
        # Rule-based estimate from the already cleaned Square and KitchenSquare;
        # used both to repair tiny values and to fill gaps.
        life_square_estimate = 0.8 * X['Square'] - X['KitchenSquare']

        known = X['LifeSquare'].notna()
        X.loc[known & ((X['LifeSquare'] < 10) | (X['LifeSquare'] > self.life_square_quantile)), 'LifeSquare_outlier'] = 1

        too_small = known & (X['LifeSquare'] < 10)
        X.loc[too_small, 'LifeSquare'] = life_square_estimate[too_small]
        # Evaluated after the repair above, so a repaired value that still
        # exceeds the quantile is replaced by the median as well.
        too_large = known & (X['LifeSquare'] > self.life_square_quantile)
        X.loc[too_large, 'LifeSquare'] = self.medians['LifeSquare']

        missing = X['LifeSquare'].isna()
        X.loc[missing, 'LifeSquare'] = life_square_estimate[missing]

        # Default: any remaining gap takes the fitted column median.
        X = X.fillna(self.medians)

        return X

### 4. <b>Построение новых признаков

In [None]:
class FeatureGenetator():
    """Build extra features for the model that predicts Price.

    ``fit`` learns district sizes, the floor and year bin maxima and, when a
    target is given, median Price per (DistrictId, Rooms) and per
    (year_cat, floor_cat). ``transform`` adds those features to a copy of the
    input frame.
    """

    def __init__(self):
        self.DistrictId_counts = None
        self.binary_to_numbers = None
        self.med_price_by_district = None
        self.med_price_by_district_median = None
        self.med_price_by_floor_year = None
        self.med_price_by_floor_year_median = None
        self.house_year_max = None
        self.floor_max = None
        self.district_size = None
        self.district_size_median = None

    def fit(self, X, y=None):
        """Learn the statistics from ``X`` and, if ``y`` is given, the target encodings."""
        X = X.copy()

        # Binary features
        self.binary_to_numbers = {'A': 0, 'B': 1}

        # DistrictSize: number of rows per district. rename_axis/reset_index(name=)
        # yields the columns ['DistrictId', 'DistrictSize'] on every pandas
        # version, whereas renaming the default column names breaks on
        # pandas >= 2.0.
        self.district_size = (
            X['DistrictId'].value_counts()
            .rename_axis('DistrictId')
            .reset_index(name='DistrictSize')
        )
        self.district_size_median = self.district_size['DistrictSize'].median()

        # Bin maxima are needed by transform even when no target is supplied.
        self.floor_max = X['Floor'].max()
        self.house_year_max = X['HouseYear'].max()

        # Target encoding
        if y is not None:
            df = X.copy()
            df['Price'] = y.values

            # District, Rooms
            self.med_price_by_district = (
                df.groupby(['DistrictId', 'Rooms'], as_index=False)
                .agg({'Price': 'median'})
                .rename(columns={'Price': 'MedPriceByDistrict'})
            )
            self.med_price_by_district_median = self.med_price_by_district['MedPriceByDistrict'].median()

            # floor, year
            df = self.floor_to_cat(df)
            df = self.year_to_cat(df)

            self.med_price_by_floor_year = (
                df.groupby(['year_cat', 'floor_cat'], as_index=False)
                .agg({'Price': 'median'})
                .rename(columns={'Price': 'MedPriceByFloorYear'})
            )
            self.med_price_by_floor_year_median = self.med_price_by_floor_year['MedPriceByFloorYear'].median()

    def transform(self, X):
        """Return a copy of ``X`` with the generated features added.

        The merges produce a fresh 0..n-1 index; row order is preserved.
        """
        # Work on a copy so the caller's frame is not modified and the method
        # can be called again on the same input.
        X = X.copy()

        # Binary features: 'A' -> 0, 'B' -> 1
        for column in ('Ecology_2', 'Ecology_3', 'Shops_2'):
            X[column] = X[column].map(self.binary_to_numbers)

        # DistrictSize, IsDistrictLarge (districts unseen in fit are NaN here, hence 0)
        X = X.merge(self.district_size, on='DistrictId', how='left')
        X['IsDistrictLarge'] = (X['DistrictSize'] > 100).astype(int)

        # More categorical features
        X = self.floor_to_cat(X)  # adds floor_cat
        X = self.year_to_cat(X)   # adds year_cat

        # Target encoding; unseen groups take the median of the group medians.
        if self.med_price_by_district is not None:
            X = X.merge(self.med_price_by_district, on=['DistrictId', 'Rooms'], how='left')
            X['MedPriceByDistrict'] = X['MedPriceByDistrict'].fillna(self.med_price_by_district_median)

        if self.med_price_by_floor_year is not None:
            X = X.merge(self.med_price_by_floor_year, on=['year_cat', 'floor_cat'], how='left')
            X['MedPriceByFloorYear'] = X['MedPriceByFloorYear'].fillna(self.med_price_by_floor_year_median)

        # Default: unseen districts take the median district size.
        X['DistrictSize'] = X['DistrictSize'].fillna(self.district_size_median)

        return X

    def floor_to_cat(self, X):
        """Add ``floor_cat`` to ``X`` in place: the Floor bin index, -1 if outside all bins."""
        bins = [0, 3, 5, 9, 15, self.floor_max]
        X['floor_cat'] = pd.cut(X['Floor'], bins=bins, labels=False).fillna(-1)
        return X

    def year_to_cat(self, X):
        """Add ``year_cat`` to ``X`` in place: the HouseYear bin index, -1 if outside all bins."""
        bins = [0, 1941, 1945, 1980, 2000, 2010, self.house_year_max]
        X['year_cat'] = pd.cut(X['HouseYear'], bins=bins, labels=False).fillna(-1)
        return X
    
    

### 5.<b> Отбор признаков

<b> Все признаки

In [None]:
# Full list of candidate features. These three names are reassigned by the
# later feature-selection cells before any model uses them.
# NOTE(review): 'Helthcare_2' presumably mirrors the spelling of the dataset column - verify.
feature_names = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear',
                 'Ecology_1', 'Ecology_2', 'Ecology_3', 'Social_1', 'Social_2', 'Social_3',
                 'Helthcare_2', 'Shops_1', 'Shops_2']

new_feature_names = ['Rooms_outlier', 'HouseFloor_outlier', 'HouseYear_outlier', 'DistrictSize',
                     'IsDistrictLarge',  'MedPriceByDistrict', 'MedPriceByFloorYear']

target_name = 'Price'

### <b> Обучение моделей RF/GBR для восстановления NaN у LifeSquare на всем датасете (без валидационного набора)

In [None]:
# Features and target for the auxiliary model that predicts LifeSquare.
# NOTE(review): 'Price' is used here as a predictor of LifeSquare, and the
# imputed LifeSquare is later a feature of the Price model. Presumably the test
# set has no Price column, so this imputer cannot be applied to it - confirm
# that this is intended.
feature_names = ['Square', 'Price', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear',
                 'Ecology_1', 'Social_1', 'Social_2', 'Social_3']

new_feature_names = ['DistrictSize', 'MedLifeSquareByFloorYear', 'MedLifeSquareByDistrict']

target_name = 'LifeSquare'


In [None]:
# Re-read the raw training data and split it by whether LifeSquare is known.
data = pd.read_csv(TRAIN_DATASET_PATH)
cond = data['LifeSquare'].isna()

# Rows to impute: the (empty) target column is removed. drop() returns a new
# frame, so nothing is written into a slice of `data`.
LS_nan = data.loc[cond].drop(columns=target_name)

# Rows with a known LifeSquare; an explicit copy keeps the edits below out of `data`.
LS = data.loc[~cond].copy()

# Outliers in the target: values below 10 are replaced by the rule-based
# estimate, values above the 0.975 quantile by the median.
LS.loc[LS['LifeSquare'] < 10, 'LifeSquare'] = 0.8*LS['Square'] - LS['KitchenSquare']
LS.loc[LS['LifeSquare'] > LS['LifeSquare'].quantile(.975), 'LifeSquare'] = LS['LifeSquare'].median()

X_train = LS.drop(columns=target_name)
y_train = LS[target_name]

# The preprocessor assigns random floors to some rows; a fixed seed makes the
# cell reproducible.
random.seed(21)

preprocessor = DataPreprocessing_for_LS()
preprocessor.fit(X_train)

X_train = preprocessor.transform(X_train)
LS_nan  = preprocessor.transform(LS_nan)

X_train.shape, LS_nan.shape

In [None]:
# Fit the LifeSquare feature generator on the rows with a known target,
# then add the generated features to both frames.
features_gen = FeatureGenetator_for_LS()
features_gen.fit(X_train, y_train)

X_train = features_gen.transform(X_train)
LS_nan  = features_gen.transform(LS_nan)

# Keep only the columns selected for the LifeSquare model.
X_train = X_train[feature_names + new_feature_names]
LS_nan  = LS_nan[feature_names + new_feature_names]

# Number of remaining missing values in each frame.
X_train.isna().sum().sum(), LS_nan.isna().sum().sum()

In [None]:
# Gradient boosting model that predicts LifeSquare. random_state is fixed so
# the imputed values, and everything trained on them, are reproducible.
# A RandomForestRegressor (max_depth=13, n_estimators=200, min_samples_leaf=4,
# random_state=21) was the alternative candidate.
gb_model = GradientBoostingRegressor(learning_rate=0.1, max_depth=5, min_samples_leaf=4, min_samples_split=2,
                                     n_estimators=100, random_state=21)
gb_model.fit(X_train, y_train)

y_train_preds = gb_model.predict(X_train)
y_nan_preds = gb_model.predict(LS_nan)

# No validation split exists at this stage; the dummy lists only fill the
# test-side arguments, so only the train score and plot are meaningful.
evaluate_preds(y_train, y_train_preds, [0, 1, 2], [0, 1, 2])

In [None]:
# Write the model predictions into the rows of `data` where LifeSquare was missing.
#data.loc[cond, 'LifeSquare'] = rf_model.predict(LS_nan)
data.loc[cond, 'LifeSquare'] = gb_model.predict(LS_nan)

### <b> Часть, выбранная для исследования. Важность остальных признаков была < 10**(-2)

In [None]:
# Features and target used by the Price model.
feature_names = ['Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear',
                 'Ecology_1', 'Social_1', 'Social_2', 'Social_3']

new_feature_names = ['DistrictSize', 'MedPriceByFloorYear', 'MedPriceByDistrict']

target_name = 'Price'

In [None]:
# `data` is the training table whose missing LifeSquare values were filled by the model.
train_df = data
test_df = pd.read_csv(TEST_DATASET_PATH)

X = train_df.drop(columns=target_name)
y = train_df[target_name]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=21)

# The preprocessor assigns random floors to some rows; a fixed seed makes the
# three transformed frames reproducible.
random.seed(21)

# Statistics come from the training part only. To train on the whole dataset
# for a final submission, fit and transform X instead of the split.
preprocessor = DataPreprocessing()
preprocessor.fit(X_train)

X_train = preprocessor.transform(X_train)
X_valid = preprocessor.transform(X_valid)
test_df = preprocessor.transform(test_df)

X_train.shape, X_valid.shape, test_df.shape

In [None]:
# Total number of missing values left in each frame after preprocessing.
X_train.isna().sum().sum(), X_valid.isna().sum().sum(), test_df.isna().sum().sum()

In [None]:
# Fit the feature generator on the training part only, then apply it to the
# train, validation and test frames.
features_gen = FeatureGenetator()
features_gen.fit(X_train, y_train)

# Commented alternative below: fit on the whole dataset instead of the split.
#features_gen.fit(X, y)

X_train = features_gen.transform(X_train)
X_valid = features_gen.transform(X_valid)
test_df = features_gen.transform(test_df)

#X = features_gen.transform(X)
#test_df = features_gen.transform(test_df)

# Keep only the columns selected for the Price model.
X_train = X_train[feature_names + new_feature_names]
X_valid = X_valid[feature_names + new_feature_names]
test_df = test_df[feature_names + new_feature_names]

#X = X[feature_names + new_feature_names]
#test_df = test_df[feature_names + new_feature_names]

#X.isna().sum().sum(), test_df.isna().sum().sum()

# Number of remaining missing values in each frame.
X_train.isna().sum().sum(), X_valid.isna().sum().sum(), test_df.isna().sum().sum()

In [None]:
# Shapes after feature selection.
X_train.shape, X_valid.shape, test_df.shape

### 7. <b>Построение модели

<b>Random Forest Regressor

In [None]:

#rf_model = RandomForestRegressor(random_state=21, criterion='mse', n_jobs = 3)
#gb_model = GradientBoostingRegressor(random_state=21, criterion='mse')
#params = {'n_estimators' : [10, 100, 200], 'max_depth' : [el for el in range(1, 20)], 'min_samples_leaf' : [el for el in range(2, 5)]}
#clf = GridSearchCV(gb_model, params, scoring = 'r2')
#clf.fit(X_train, y_train)
#clf.best_params_, clf.best_score_


In [None]:
# Gradient boosting model for Price.
# criterion='squared_error' is the current name of the former 'mse' option,
# which scikit-learn removed in version 1.2.
gb_model = GradientBoostingRegressor(max_depth=5, min_samples_leaf=4, n_estimators=200,
                                     random_state=21, criterion='squared_error')

# To train on the whole dataset for a final submission, fit on (X, y) instead.
gb_model.fit(X_train, y_train)

y_train_preds = gb_model.predict(X_train)
y_test_preds = gb_model.predict(X_valid)  # predictions for the validation part

evaluate_preds(y_train, y_train_preds, y_valid, y_test_preds)

In [None]:
# 3-fold shuffled cross-validation (R2) of the gradient boosting model on the training part.
#cv_score = cross_val_score(rf_model, X_train, y_train, scoring='r2', cv=KFold(n_splits=3, shuffle=True, random_state=21))
cv_score = cross_val_score(gb_model, X_train, y_train, scoring='r2', cv=KFold(n_splits=3, shuffle=True, random_state=21))
#cv_score = cross_val_score(gb_model, X, y, scoring='r2', cv=KFold(n_splits=3, shuffle=True, random_state=21))
cv_score

In [None]:
# Mean R2 over the folds.
cv_score.mean()

### <b>StackingRegressor, VotingRegressor, BaggingRegressor

In [None]:
#Stacking

# Stacking: linear regression and random forest as base models, gradient
# boosting as the final estimator.
lr = LinearRegression()
# rf_model is defined here because no earlier cell creates it; without this
# line the cell fails with NameError on a fresh kernel.
rf_model = RandomForestRegressor(max_depth=19, n_estimators=200, min_samples_leaf=3, random_state=21)
gb = GradientBoostingRegressor(max_depth=5, min_samples_leaf=4, n_estimators=200, random_state=21)

stack = StackingRegressor([('lr', lr), ('rf', rf_model)], final_estimator=gb)
stack.fit(X_train, y_train)

In [None]:
# Score the stacked model on the training and validation parts.
y_train_preds = stack.predict(X_train)
y_test_preds = stack.predict(X_valid)

evaluate_preds(y_train, y_train_preds, y_valid, y_test_preds)



### 8.<b> Прогнозирование на тестовом датасете

In [None]:
# Predict Price for the test set with the gradient boosting model and save
# the result in the sample-submission layout.
predictions = gb_model.predict(test_df)
submit = pd.read_csv(
    '/kaggle/input/real-estate-price-prediction-moscow/sample_submission.csv'
).assign(Price=predictions)
submit.to_csv('gb_v1_submit.csv', index=False)