<a href="https://colab.research.google.com/github/coldbilberry/repo-gui/blob/main/%D0%91%D0%B8%D0%B1%D0%BB%D0%B8%D0%BE%D1%82%D0%B5%D0%BA%D0%B8_Python_%D0%B4%D0%BB%D1%8F_Data_Science_Numpy%2C_Matplotlib%2C_Scikit_learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
matplotlib.rcParams.update({'font.size': 14})

In [None]:
# Directory that holds the project CSV files; both dataset paths are built from it.
DATA_DIR = 'C:/АННА/Data Sience/Библиотеки Python/Проект'
TRAIN_DATASET_PATH = DATA_DIR + '/train.csv'
TEST_DATASET_PATH = DATA_DIR + '/test.csv'

1. Постановка задачи

Цель - оценивать стоимость квартиры

2. Получение данных

Статистические данные о ряде квартир в России. Описание датасета

Id - идентификационный номер квартиры

DistrictId - идентификационный номер района

Rooms - количество комнат

Square - площадь

LifeSquare - жилая площадь

KitchenSquare - площадь кухни

Floor - этаж

HouseFloor - количество этажей в доме

HouseYear - год постройки дома

Ecology_1, Ecology_2, Ecology_3 - экологические показатели местности

Social_1, Social_2, Social_3 - социальные показатели местности

Healthcare_1, Helthcare_2 - показатели местности, связанные с охраной здоровья

Shops_1, Shops_2 - показатели, связанные с наличием магазинов, торговых центров

Price - цена квартиры

3. Анализ данных

In [4]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and print the saving.

    * object / pandas string columns are converted to ``category``;
    * NumPy integer columns are cast to the smallest signed integer type that
      holds their min and max (bounds are inclusive) and is narrower than the
      current dtype;
    * NumPy float columns are cast to ``float32`` when their range fits,
      otherwise to ``float64``;
    * every other dtype (bool, datetime, category, extension dtypes) is left
      untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. Treating bools
        # or datetimes as numbers would silently turn them into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in ('i', 'u'):
            for candidate in int_candidates:
                # Never widen a column: stop once the candidate is no smaller
                # than the dtype the column already has.
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break
                limits = np.iinfo(candidate)
                # Inclusive bounds, so boundary values such as 127 fit int8.
                # For an empty column min/max are NaN, every comparison is
                # False and the column is left unchanged.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            if c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
df = pd.read_csv(TRAIN_DATASET_PATH, sep=',')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

4. Визуализация данных

In [None]:
df.head(4)

In [None]:
df.tail(2)

In [None]:
df.sample(10)

In [None]:
df[df['Rooms'] == 0].sample(8) # 0 комнат у 8-ми квартир

In [None]:
# Min / mean / max of every numeric column over the rows with a positive price.
# Aggregations are named by string rather than passed as NumPy callables, and
# non-numeric columns are excluded because a mean cannot be computed for them.
df.loc[df['Price'] > 0].select_dtypes(include='number').agg(['min', 'mean', 'max'])

In [None]:
df[(df['HouseFloor'] < 5) & (df['Floor'] > 5)].head() # этажность дома меньше, чем этаж квартиры

In [None]:
df.dtypes

In [None]:
df.info(memory_usage='deep')

In [None]:
df.describe()

5. Разбиение данных на train и valid

1 Разделение на target и feature датасеты

In [None]:
df.columns.tolist()

In [None]:
target = 'Price'

In [None]:
feature_names = ['Id','DistrictId','Rooms','Square','LifeSquare','KitchenSquare','Floor','HouseFloor','HouseYear','Ecology_1',
                 'Ecology_2','Ecology_3','Social_1','Social_2','Social_3','Healthcare_1','Helthcare_2','Shops_1','Shops_2']

In [None]:
X = pd.DataFrame(df, columns=feature_names)
X.head()

In [None]:
y = pd.DataFrame(df, columns=['Price'])
y.head()

2 Разделение на тренировочный и валидационный датасеты

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

3 Тестовый датасет

In [None]:
test_df = pd.read_csv(TEST_DATASET_PATH, sep=',')

In [None]:
test_df.head()

6. Переработка данных

1 Обработка нулевых значений и NaN.

1.1 Пропуски

In [None]:
df.isna().sum()

1.1.1 LifeSquare_nan

In [None]:
X_train['LifeSquare_nan'] = 0
X_valid['LifeSquare_nan'] = 0
test_df['LifeSquare_nan'] = 0
X_train

In [None]:
X_train.loc[X_train['LifeSquare'].isna(), 'LifeSquare_nan']
X_valid.loc[X_valid['LifeSquare'].isna(), 'LifeSquare_nan']
test_df.loc[test_df['LifeSquare'].isna(), 'LifeSquare_nan']

In [None]:
X_train.loc[X_train['LifeSquare'].isna(), 'LifeSquare_nan'] = 1
X_valid.loc[X_valid['LifeSquare'].isna(), 'LifeSquare_nan'] = 1
test_df.loc[test_df['LifeSquare'].isna(), 'LifeSquare_nan'] = 1

In [None]:
X_valid[X_valid['LifeSquare_nan'] == 1]

In [None]:
# проверка метода fillna (заполнить пропуски)
median = X_train['LifeSquare'].median()
X_train['LifeSquare'].fillna(median).isna().sum()
X_valid['LifeSquare'].fillna(median).isna().sum()
test_df['LifeSquare'].fillna(median).isna().sum()

In [None]:
X_train['LifeSquare'].isna().sum()

In [None]:
# Fill the gaps with the median of the training split. The median is computed
# in this cell so the result does not depend on a global left by another cell.
life_square_median = X_train['LifeSquare'].median()
X_train['LifeSquare'] = X_train['LifeSquare'].fillna(life_square_median)
X_valid['LifeSquare'] = X_valid['LifeSquare'].fillna(life_square_median)
test_df['LifeSquare'] = test_df['LifeSquare'].fillna(life_square_median)

In [None]:
X_train.isna().sum()

1.1.2 Healthcare_1_nan

In [None]:
X_train['Healthcare_1_nan'] = 0
X_valid['Healthcare_1_nan'] = 0
test_df['Healthcare_1_nan'] = 0
X_train

In [None]:
X_train.loc[X_train['Healthcare_1'].isna(), 'Healthcare_1_nan']
X_valid.loc[X_valid['Healthcare_1'].isna(), 'Healthcare_1_nan']
test_df.loc[test_df['Healthcare_1'].isna(), 'Healthcare_1_nan']

In [None]:
X_train.loc[X_train['Healthcare_1'].isna(), 'Healthcare_1_nan'] = 1
X_valid.loc[X_valid['Healthcare_1'].isna(), 'Healthcare_1_nan'] = 1
test_df.loc[test_df['Healthcare_1'].isna(), 'Healthcare_1_nan'] = 1

In [None]:
X_valid[X_valid['Healthcare_1_nan'] == 1]

In [None]:
# проверка метода fillna (заполнить пропуски)
median = X_train['Healthcare_1'].median()
X_train['Healthcare_1'].fillna(median).isna().sum()
X_valid['Healthcare_1'].fillna(median).isna().sum()
test_df['Healthcare_1'].fillna(median).isna().sum()

In [None]:
X_train['Healthcare_1'].isna().sum()

In [None]:
# Fill the gaps with the median of the training split. The median is computed
# in this cell so the result does not depend on a global left by another cell.
healthcare_median = X_train['Healthcare_1'].median()
X_train['Healthcare_1'] = X_train['Healthcare_1'].fillna(healthcare_median)
X_valid['Healthcare_1'] = X_valid['Healthcare_1'].fillna(healthcare_median)
test_df['Healthcare_1'] = test_df['Healthcare_1'].fillna(healthcare_median)

In [None]:
X_valid.isna().sum()

In [None]:
X_train = X_train.drop('LifeSquare_nan', axis=1)
X_valid = X_valid.drop('LifeSquare_nan', axis=1)
test_df = test_df.drop('LifeSquare_nan', axis=1)

X_train = X_train.drop('Healthcare_1_nan', axis=1)
X_valid = X_valid.drop('Healthcare_1_nan', axis=1)
test_df = test_df.drop('Healthcare_1_nan', axis=1)

1.2 Обработка нулевых значений

1.2.1 Rooms

In [None]:
X_train.loc[X_train['Rooms'] == 0, :]

In [None]:
X_valid.loc[X_valid['Rooms'] == 0, :]

In [None]:
test_df.loc[test_df['Rooms'] == 0, :]

In [None]:
# Replace zero room counts with the training-split median. The median is taken
# once, before any replacement, so all three frames receive the same value.
rooms_median = X_train['Rooms'].median()
X_train['Rooms'] = X_train['Rooms'].replace(0, rooms_median)
X_valid['Rooms'] = X_valid['Rooms'].replace(0, rooms_median)
test_df['Rooms'] = test_df['Rooms'].replace(0, rooms_median)

In [None]:
X_train.loc[X_train['Rooms'] == 0, :]

In [None]:
X_train.loc[X_train['KitchenSquare'] == 0, :]

In [None]:
X_valid.loc[X_valid['KitchenSquare'] == 0, :]

In [None]:
test_df.loc[test_df['KitchenSquare'] == 0, :]

In [None]:
# Replace zero kitchen areas with the training-split median. The median is taken
# once, before any replacement, so all three frames receive the same value.
kitchen_median = X_train['KitchenSquare'].median()
X_train['KitchenSquare'] = X_train['KitchenSquare'].replace(0, kitchen_median)
X_valid['KitchenSquare'] = X_valid['KitchenSquare'].replace(0, kitchen_median)
test_df['KitchenSquare'] = test_df['KitchenSquare'].replace(0, kitchen_median)

In [None]:
test_df.loc[test_df['KitchenSquare'] == 0, :]

1.2.3 HouseFloor

In [None]:
X_train.loc[X_train['HouseFloor'] == 0, :]

In [None]:
X_valid.loc[X_valid['HouseFloor'] == 0, :]

In [None]:
test_df.loc[test_df['HouseFloor'] == 0, :]

In [None]:
# Replace zero storey counts with the training-split median. The median is taken
# once, before any replacement, so all three frames receive the same value.
house_floor_median = X_train['HouseFloor'].median()
X_train['HouseFloor'] = X_train['HouseFloor'].replace(0, house_floor_median)
X_valid['HouseFloor'] = X_valid['HouseFloor'].replace(0, house_floor_median)
test_df['HouseFloor'] = test_df['HouseFloor'].replace(0, house_floor_median)

In [None]:
X_valid.loc[X_valid['HouseFloor'] == 0, :]

6.2. Обработка выбросов

6.2.1 Обработка признака Rooms

In [None]:
# Show the ten rows with the most rooms (ties ordered by living area).
# sort_values returns a new frame, so head() must be chained onto its result.
X_train.sort_values(['Rooms','LifeSquare'], ascending=[False, False]).head(10)

In [None]:
# Show the ten rows with the most rooms (ties ordered by living area).
# sort_values returns a new frame, so head() must be chained onto its result.
X_valid.sort_values(['Rooms','LifeSquare'], ascending=[False, False]).head(10)

In [None]:
# Show the ten rows with the most rooms (ties ordered by living area).
# sort_values returns a new frame, so head() must be chained onto its result.
test_df.sort_values(['Rooms','LifeSquare'], ascending=[False, False]).head(10)

In [None]:
X_train.loc[X_train['Rooms'] > 6, 'Rooms'] = X_train['Rooms'].median()

In [None]:
X_valid.loc[X_valid['Rooms'] > 6, 'Rooms'] = X_train['Rooms'].median()

In [None]:
test_df.loc[test_df['Rooms'] > 6, 'Rooms'] = X_train['Rooms'].median()

6.2.3 Обработка признаков Square и LifeSquare

In [None]:
X_train.loc[X_train['Square'] < X_train['LifeSquare'], :] # Общая площадь меньше, чем жилая

In [None]:
X_valid.loc[X_valid['Square'] < X_valid['LifeSquare'], :] # Общая площадь меньше, чем жилая

In [None]:
test_df.loc[test_df['Square'] < test_df['LifeSquare'], :] # Общая площадь меньше, чем жилая

In [None]:
# обработка в пункте 7.2.1.6

In [None]:
X_train.loc[X_train['Square'] < X_train['LifeSquare'] + X_train['KitchenSquare'], :] # Сумма жилой площади и кухни больше, чем общая площадь

In [None]:
X_valid.loc[X_valid['Square'] < X_valid['LifeSquare'] + X_valid['KitchenSquare'], :] # Сумма жилой площади и кухни больше, чем общая площадь

In [None]:
test_df.loc[test_df['Square'] < test_df['LifeSquare'] + test_df['KitchenSquare'], :] # Сумма жилой площади и кухни больше, чем общая площадь

In [None]:
# обработка в пункте 7.2.1.7

6.2.4 Обработка признака HouseYear

In [None]:
# Build years after 2020 are treated as data errors and replaced by the
# training-split median. The median is taken once, before X_train is modified,
# so all three frames receive the same value.
house_year_median = X_train['HouseYear'].median()
X_train.loc[X_train['HouseYear'] > 2020, 'HouseYear'] = house_year_median
X_valid.loc[X_valid['HouseYear'] > 2020, 'HouseYear'] = house_year_median
test_df.loc[test_df['HouseYear'] > 2020, 'HouseYear'] = house_year_median

6.3 Обработка дробных чисел

In [None]:
X_train.dtypes

In [None]:
X_valid.dtypes

In [None]:
test_df.dtypes

6.3.1 Количество комнат - целое число

In [None]:
X_train['Rooms'] = X_train['Rooms'].astype('int16')
X_train['Rooms'].dtype

In [None]:
X_valid['Rooms'] = X_valid['Rooms'].astype('int16')
X_valid['Rooms'].dtype

In [None]:
test_df['Rooms'] = test_df['Rooms'].astype('int16')
test_df['Rooms'].dtype

6.3.2 Этаж квартиры - целое число

In [None]:
X_train['Floor'] = X_train['Floor'].astype('int16')
X_train['Floor'].dtype

In [None]:
X_valid['Floor'] = X_valid['Floor'].astype('int16')
X_valid['Floor'].dtype

In [None]:
test_df['Floor'] = test_df['Floor'].astype('int16')
test_df['Floor'].dtype

6.3.3 Этажность дома - целое число

In [None]:
X_train['HouseFloor'] = X_train['HouseFloor'].astype('int16')
X_train['HouseFloor'].dtype

In [None]:
X_valid['HouseFloor'] = X_valid['HouseFloor'].astype('int16')
X_valid['HouseFloor'].dtype

In [None]:
test_df['HouseFloor'] = test_df['HouseFloor'].astype('int16')
test_df['HouseFloor'].dtype

6.4 Преобразование качественных переменных в количественные

Качественные признаки: Ecology_2, Ecology_3, Shops_2

6.4.1 Признак Ecology_2

In [None]:
pd.get_dummies(X_train['Ecology_2'], prefix='Eco_2', drop_first=True)

In [None]:
X_train = pd.concat([X_train, pd.get_dummies(X_train['Ecology_2'], prefix='Eco_2', drop_first=True)], axis=1)
X_train.head()

In [None]:
X_train = X_train.drop('Ecology_2', axis=1)

In [None]:
pd.get_dummies(X_valid['Ecology_2'], prefix='Eco_2', drop_first=True)

In [None]:
X_valid = pd.concat([X_valid, pd.get_dummies(X_valid['Ecology_2'], prefix='Eco_2', drop_first=True)], axis=1)
X_valid.head()

In [None]:
X_valid = X_valid.drop('Ecology_2', axis=1)

In [None]:
pd.get_dummies(test_df['Ecology_2'], prefix='Eco_2', drop_first=True)

In [None]:
test_df = pd.concat([test_df, pd.get_dummies(test_df['Ecology_2'], prefix='Eco_2', drop_first=True)], axis=1)
test_df.head()

In [None]:
test_df = test_df.drop('Ecology_2', axis=1)

6.4.2 Признак Ecology_3

In [None]:
pd.get_dummies(X_train['Ecology_3'], prefix='Eco_3', drop_first=True)

In [None]:
X_train = pd.concat([X_train, pd.get_dummies(X_train['Ecology_3'], prefix='Eco_3', drop_first=True)], axis=1)
X_train.head()

In [None]:
X_train = X_train.drop('Ecology_3', axis=1)

In [None]:
pd.get_dummies(X_valid['Ecology_3'], prefix='Eco_3', drop_first=True)

In [None]:
X_valid = pd.concat([X_valid, pd.get_dummies(X_valid['Ecology_3'], prefix='Eco_3', drop_first=True)], axis=1)
X_valid.head()

In [None]:
X_valid = X_valid.drop('Ecology_3', axis=1)

In [None]:
pd.get_dummies(test_df['Ecology_3'], prefix='Eco_3', drop_first=True)

In [None]:
test_df = pd.concat([test_df, pd.get_dummies(test_df['Ecology_3'], prefix='Eco_3', drop_first=True)], axis=1)
test_df.head()

In [None]:
test_df = test_df.drop('Ecology_3', axis=1)

6.4.3 Признак Shops_2

In [None]:
pd.get_dummies(X_train['Shops_2'], prefix='Shops_2', drop_first=True)

In [None]:
X_train = pd.concat([X_train, pd.get_dummies(X_train['Shops_2'], prefix='Shops_2', drop_first=True)], axis=1)
X_train.head()

In [None]:
X_train = X_train.drop('Shops_2', axis=1)

In [None]:
pd.get_dummies(X_valid['Shops_2'], prefix='Shops_2', drop_first=True)

In [None]:
X_valid = pd.concat([X_valid, pd.get_dummies(X_valid['Shops_2'], prefix='Shops_2', drop_first=True)], axis=1)
X_valid.head()

In [None]:
X_valid = X_valid.drop('Shops_2', axis=1)

In [None]:
pd.get_dummies(test_df['Shops_2'], prefix='Shops_2', drop_first=True)

In [None]:
test_df = pd.concat([test_df, pd.get_dummies(test_df['Shops_2'], prefix='Shops_2', drop_first=True)], axis=1)
# Drop the original categorical column, as is done for X_train and X_valid.
# Otherwise test_df keeps an extra column that the model was never trained on.
test_df = test_df.drop('Shops_2', axis=1)
test_df.head()

6.5 Преобразование признака по категориям (Feature discretization)

6.5.1 HouseYear

In [None]:
X_train['HouseYear'].describe()

In [None]:
def age_to_cat(X):
    """Add an ``age_cat`` column that buckets ``HouseYear`` into five groups.

    Categories: 1 for years >= 2015, 2 for [2000, 2015), 3 for [1990, 2000),
    4 for [1950, 1990), 5 for years < 1950. Rows with a missing year keep 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``HouseYear`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object with the ``age_cat`` column added.
    """
    X['age_cat'] = 0

    X.loc[X['HouseYear'] >= 2015, 'age_cat'] = 1
    X.loc[(X['HouseYear'] < 2015) & (X['HouseYear'] >= 2000), 'age_cat'] = 2
    X.loc[(X['HouseYear'] < 2000) & (X['HouseYear'] >= 1990), 'age_cat'] = 3
    X.loc[(X['HouseYear'] < 1990) & (X['HouseYear'] >= 1950), 'age_cat'] = 4
    # "< 1950" rather than "<= 1949": a fractional year (e.g. a median-imputed
    # 1949.5) would otherwise match no bucket and stay at 0.
    X.loc[X['HouseYear'] < 1950, 'age_cat'] = 5

    return X

In [None]:
X_train = age_to_cat(X_train)
X_train.head()

In [None]:
X_valid = age_to_cat(X_valid)
X_valid.head()

In [None]:
test_df = age_to_cat(test_df)
test_df.head()

6.5.2 DistrictId

In [None]:
X_train['DistrictId'].describe()

In [None]:
def dist_to_cat(X):
    """Add a ``dist_cat`` column that buckets ``DistrictId`` into five groups.

    Categories: 1 for ids >= 150, 2 for [75, 150), 3 for [36, 75),
    4 for [19, 36), 5 for ids < 19. Rows with a missing id keep 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``DistrictId`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object with the ``dist_cat`` column added.
    """
    X['dist_cat'] = 0

    # Each upper bound equals the lower bound of the bucket above it, so the
    # ranges are contiguous and ids 149, 74 and 35 no longer fall through to 0.
    X.loc[X['DistrictId'] >= 150, 'dist_cat'] = 1
    X.loc[(X['DistrictId'] < 150) & (X['DistrictId'] >= 75), 'dist_cat'] = 2
    X.loc[(X['DistrictId'] < 75) & (X['DistrictId'] >= 36), 'dist_cat'] = 3
    X.loc[(X['DistrictId'] < 36) & (X['DistrictId'] >= 19), 'dist_cat'] = 4
    X.loc[X['DistrictId'] < 19, 'dist_cat'] = 5

    return X

In [None]:
X_train = dist_to_cat(X_train)
X_train.head()

In [None]:
X_valid = dist_to_cat(X_valid)
X_valid.head()

In [None]:
test_df = dist_to_cat(test_df)
test_df.head()

7. EDA (исследовательский анализ данных). Генерация новых признаков.

7.1 Распределение таргета

In [None]:
plt.figure(figsize = (16, 8))

plt.subplot(121)
y_train['Price'].hist(density=False, bins=20)
plt.ylabel('количество квартир')
plt.xlabel('Price')

plt.subplot(122)
sns.kdeplot(y_train['Price'], shade=True, legend=False)
plt.xlabel('Price')

plt.suptitle('Распределение Цены на train')
plt.show()

In [None]:
plt.figure(figsize = (16, 8))

plt.subplot(121)
y_valid['Price'].hist(density=False, bins=20)
plt.ylabel('количество квартир')
plt.xlabel('Price')

plt.subplot(122)
sns.kdeplot(y_valid['Price'], shade=True, legend=False)
plt.xlabel('Price')

plt.suptitle('Распределение Цены на valid')
plt.show()

In [None]:
target_mean = round(y_train['Price'].mean(), 2)
target_median = y_train['Price'].median()
target_mode = y_train['Price'].mode()[0]
print(f'mean = {target_mean}, median = {target_median}, mode = {target_mode}')

In [None]:
plt.figure(figsize = (16, 8))

sns.distplot(y_train['Price'], bins=50)

# Heights of the vertical marker lines. They get their own name so the target
# frame `y` created earlier in the notebook is not overwritten.
marker_heights = np.linspace(0, 0.000005, 10)
plt.plot([target_mean] * 10, marker_heights, label='mean', linewidth=4)
plt.plot([target_median] * 10, marker_heights, label='median', linewidth=4)
plt.plot([target_mode] * 10, marker_heights, label='mode', linewidth=4)

plt.title('Распределение Цены на train')
plt.legend()
plt.show()

In [None]:
plt.figure(figsize = (16, 8))

sns.distplot(y_valid['Price'], bins=50)

# Heights of the vertical marker lines. They get their own name so the target
# frame `y` created earlier in the notebook is not overwritten.
marker_heights = np.linspace(0, 0.000005, 10)
plt.plot([target_mean] * 10, marker_heights, label='mean', linewidth=4)
plt.plot([target_median] * 10, marker_heights, label='median', linewidth=4)
plt.plot([target_mode] * 10, marker_heights, label='mode', linewidth=4)

plt.title('Распределение Цены на valid')
plt.legend()
plt.show()

In [None]:
target_bins = y_train['Price'] // 10000 * 10000

target_adjusted_mode = target_bins.mode()[0]
target_adjusted_mode

In [None]:
plt.figure(figsize = (16, 8))

sns.distplot(y_train['Price'], bins=50)

# Heights of the vertical marker lines. They get their own name so the target
# frame `y` created earlier in the notebook is not overwritten.
marker_heights = np.linspace(0, 0.000005, 10)
plt.plot([target_mean] * 10, marker_heights, label='mean', linestyle=':', linewidth=4)
plt.plot([target_median] * 10, marker_heights, label='median', linestyle='--', linewidth=4)
plt.plot([target_adjusted_mode] * 10, marker_heights, label='target_adjusted_mode', linestyle='-.', linewidth=4)
print(f'mean = {target_mean}, median = {target_median}, mode = {target_adjusted_mode}')
plt.title('Распределение Цены на train')
# legend/show must be called; the bare names only reference the functions.
plt.legend()
plt.show()

In [None]:
plt.figure(figsize = (16, 8))

sns.distplot(y_valid['Price'], bins=50)

# Heights of the vertical marker lines. They get their own name so the target
# frame `y` created earlier in the notebook is not overwritten.
marker_heights = np.linspace(0, 0.000005, 10)
plt.plot([target_mean] * 10, marker_heights, label='mean', linestyle=':', linewidth=4)
plt.plot([target_median] * 10, marker_heights, label='median', linestyle='--', linewidth=4)
plt.plot([target_adjusted_mode] * 10, marker_heights, label='target_adjusted_mode', linestyle='-.', linewidth=4)
print(f'mean = {target_mean}, median = {target_median}, mode = {target_adjusted_mode}')
plt.title('Распределение Цены на valid')
# legend/show must be called; the bare names only reference the functions.
plt.legend()
plt.show()

7.2 Распределение фичей

7.2.1 Количественные признаки

In [None]:
X_train.hist(figsize=(16,16), bins=20, grid=False);

In [None]:
X_valid.hist(figsize=(16,16), bins=20, grid=False);

In [None]:
test_df.hist(figsize=(16,16), bins=20, grid=False);

7.2.1.1 Обработка признака KitchenSquare

In [None]:
X_train['KitchenSquare'].describe()

In [None]:
X_valid['KitchenSquare'].describe()

In [None]:
test_df['KitchenSquare'].describe()

In [None]:
X_train.loc[X_train['KitchenSquare'] > 50, 'KitchenSquare'].\
hist(figsize=(4,4), bins=200, grid=False);

In [None]:
X_valid.loc[X_valid['KitchenSquare'] > 50, 'KitchenSquare'].\
hist(figsize=(4,4), bins=200, grid=False);

In [None]:
test_df.loc[test_df['KitchenSquare'] > 50, 'KitchenSquare'].\
hist(figsize=(4,4), bins=200, grid=False);

In [None]:
# Kitchen areas above 50 or below 5 are treated as outliers and replaced by the
# training-split median. The median is taken once, before X_train is modified,
# so every replacement in all three frames uses the same value.
kitchen_median = X_train['KitchenSquare'].median()
for frame in (X_train, X_valid, test_df):
    frame.loc[frame['KitchenSquare'] > 50, 'KitchenSquare'] = kitchen_median
for frame in (X_train, X_valid, test_df):
    frame.loc[frame['KitchenSquare'] < 5, 'KitchenSquare'] = kitchen_median

In [None]:
X_train['KitchenSquare'].describe()

In [None]:
X_valid['KitchenSquare'].describe()

In [None]:
test_df['KitchenSquare'].describe()

7.2.1.2 Обработка признака HouseFloor

In [None]:
X_train['HouseFloor'].describe()

In [None]:
X_valid['HouseFloor'].describe()

In [None]:
test_df['HouseFloor'].describe()

In [None]:
X_train.loc[X_train['HouseFloor'] > 90, 'HouseFloor'].\
hist(figsize=(4,4), bins=200, grid=False);

In [None]:
X_valid.loc[X_valid['HouseFloor'] > 90, 'HouseFloor'].\
hist(figsize=(4,4), bins=200, grid=False);

In [None]:
test_df.loc[test_df['HouseFloor'] > 90, 'HouseFloor'].\
hist(figsize=(4,4), bins=200, grid=False);

In [None]:
# Storey counts above 90 are treated as outliers and replaced by the
# training-split median. The median is taken once, before X_train is modified,
# so all three frames receive the same value.
house_floor_median = X_train['HouseFloor'].median()
X_train.loc[X_train['HouseFloor'] > 90, 'HouseFloor'] = house_floor_median
X_valid.loc[X_valid['HouseFloor'] > 90, 'HouseFloor'] = house_floor_median
test_df.loc[test_df['HouseFloor'] > 90, 'HouseFloor'] = house_floor_median

In [None]:
X_train['HouseFloor'].describe()

In [None]:
X_valid['HouseFloor'].describe()

In [None]:
test_df['HouseFloor'].describe()

7.2.1.3 Обработка признака LifeSquare

In [None]:
X_train['LifeSquare'].describe()

In [None]:
X_valid['LifeSquare'].describe()

In [None]:
test_df['LifeSquare'].describe()

In [None]:
X_train.loc[X_train['LifeSquare'] > 500, 'LifeSquare'].\
hist(figsize=(4,4), bins=200, grid=False);

In [None]:
# Living areas above 500 are treated as outliers and replaced by the
# training-split median. The median is taken once, before X_train is modified,
# so all three frames receive the same value.
life_square_median = X_train['LifeSquare'].median()
X_train.loc[X_train['LifeSquare'] > 500, 'LifeSquare'] = life_square_median
X_valid.loc[X_valid['LifeSquare'] > 500, 'LifeSquare'] = life_square_median
test_df.loc[test_df['LifeSquare'] > 500, 'LifeSquare'] = life_square_median

In [None]:
# Living areas below 5 are treated as outliers and replaced by the
# training-split median. The median is taken once, before X_train is modified,
# so all three frames receive the same value.
life_square_median = X_train['LifeSquare'].median()
X_train.loc[X_train['LifeSquare'] < 5, 'LifeSquare'] = life_square_median
X_valid.loc[X_valid['LifeSquare'] < 5, 'LifeSquare'] = life_square_median
test_df.loc[test_df['LifeSquare'] < 5, 'LifeSquare'] = life_square_median

In [None]:
X_train['LifeSquare'].describe()

In [None]:
X_valid['LifeSquare'].describe()

In [None]:
test_df['LifeSquare'].describe()

7.2.1.4 Обработка признака Square

In [None]:
X_train['Square'].describe()

In [None]:
X_valid['Square'].describe()

In [None]:
test_df['Square'].describe()

In [None]:
X_train.loc[X_train['Square'] < 10, 'Square'] = X_train['Square'].median()

In [None]:
X_valid.loc[X_valid['Square'] < 10, 'Square'] = X_train['Square'].median()

In [None]:
test_df.loc[test_df['Square'] < 10, 'Square'] = X_train['Square'].median()

In [None]:
X_train.loc[X_train['Square'] > 500, 'Square'] = X_train['Square'].median()

In [None]:
X_valid.loc[X_valid['Square'] > 500, 'Square'] = X_train['Square'].median()

In [None]:
test_df.loc[test_df['Square'] > 500, 'Square'] = X_train['Square'].median()

7.2.1.5 Отображение признаков на графиках

In [None]:
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally, while the keyword form also works on older releases.
grid = sns.jointplot(x=X_train['Rooms'], y=X_train['Square'], kind='kde')
grid.fig.set_figwidth(8)
grid.fig.set_figheight(8)

# show must be called; the bare name only references the function.
plt.show()

In [None]:
plt.figure(figsize=(6,3))

# The series is passed as x= explicitly so the plot stays horizontal regardless
# of how the installed seaborn interprets a first positional argument.
sns.boxplot(x=X_valid['Square'], whis=3)

plt.xlabel('Square')
plt.show()

7.2.1.6 Обработка признаков Square < LifeSquare

In [None]:
X_train[(X_train['Square'] < X_train['LifeSquare'])].head(1000)

In [None]:
X_valid[(X_valid['Square'] < X_valid['LifeSquare'])].head(1000)

In [None]:
test_df[(test_df['Square'] < test_df['LifeSquare'])].head(1000)

In [None]:
# Where the total area is smaller than the living area, swap the two values.
for frame in (X_train, X_valid, test_df):
    needs_swap = frame['Square'] < frame['LifeSquare']
    # .values strips the column labels so the assignment is positional.
    frame.loc[needs_swap, ['Square', 'LifeSquare']] = frame.loc[needs_swap, ['LifeSquare', 'Square']].values

In [None]:
X_train.loc[7246, :]

7.2.1.7 Обработка признаков Square < (LifeSquare + KitchenSquare)

In [None]:
X_train[(X_train['Square'] < (X_train['LifeSquare'] + X_train['KitchenSquare']))].head(1000)

In [None]:
X_valid[(X_valid['Square'] < (X_valid['LifeSquare'] + X_valid['KitchenSquare']))].head(1000)

In [None]:
test_df[(test_df['Square'] < (test_df['LifeSquare'] + test_df['KitchenSquare']))].head(1000)

In [None]:
X_train.loc[X_train['Square'] < (X_train['LifeSquare'] + X_train['KitchenSquare']), 'Square'] = X_train.loc[X_train['Square'] < (X_train['LifeSquare'] + X_train['KitchenSquare']), ['LifeSquare', 'KitchenSquare']].sum(axis=1)

In [None]:
X_train.loc[7246, :]

In [None]:
X_valid.loc[X_valid['Square'] < (X_valid['LifeSquare'] + X_valid['KitchenSquare']), 'Square'] = X_valid.loc[X_valid['Square'] < (X_valid['LifeSquare'] + X_valid['KitchenSquare']), ['LifeSquare', 'KitchenSquare']].sum(axis=1)

In [None]:
X_valid.loc[8296, :]

In [None]:
test_df.loc[test_df['Square'] < (test_df['LifeSquare'] + test_df['KitchenSquare']), 'Square'] = test_df.loc[test_df['Square'] < (test_df['LifeSquare'] + test_df['KitchenSquare']), ['LifeSquare', 'KitchenSquare']].sum(axis=1)

In [None]:
test_df.loc[468, :]

7.2.1.8 Обработка признаков Floor > HouseFloor

In [None]:
X_train[(X_train['HouseFloor'] < X_train['Floor'])].head(1000)

In [None]:
X_valid[(X_valid['HouseFloor'] < X_valid['Floor'])].head(1000)

In [None]:
test_df[(test_df['HouseFloor'] < test_df['Floor'])].head(1000)

In [None]:
# Where the flat's floor exceeds the number of storeys, swap the two values.
for frame in (X_train, X_valid, test_df):
    needs_swap = frame['HouseFloor'] < frame['Floor']
    # .values strips the column labels so the assignment is positional.
    frame.loc[needs_swap, ['HouseFloor', 'Floor']] = frame.loc[needs_swap, ['Floor', 'HouseFloor']].values

In [None]:
test_df.loc[468, :]

7.2.2 Категориальные признаки

In [None]:
counts = test_df['age_cat'].value_counts()

plt.figure(figsize=(12, 8))
plt.title('age_cat')
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally, while the keyword form also works on older releases.
sns.barplot(x=counts.index, y=counts.values)

plt.show()

In [None]:
counts_ = X_train['dist_cat'].value_counts()

plt.figure(figsize=(12, 8))
plt.title('dist_cat')
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally, while the keyword form also works on older releases.
sns.barplot(x=counts_.index, y=counts_.values)

plt.show()

In [None]:
plt.figure(figsize = (15, 10))

sns.set(font_scale=1.4)

corr_matrix = X_train.corr()
corr_matrix = np.round(corr_matrix, 2)
corr_matrix[np.abs(corr_matrix) < 0.2] = 0

sns.heatmap(corr_matrix, annot=True, linewidths=.5, cmap='coolwarm')

plt.title('Correlation matrix')
plt.show()

7.3 Удаление Id

In [None]:
X_train = X_train.drop('Id', axis=1)
X_valid = X_valid.drop('Id', axis=1)
test_df = test_df.drop('Id', axis=1)

In [None]:
X_train.head(20)

In [None]:
X_valid.head(20)

In [None]:
test_df.head(20)

In [None]:
y_train.head(20)

8. Масштабирование

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
cols_for_scale = ['DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Healthcare_1', 'Helthcare_2', 'Shops_1',
       'Eco_2_B', 'Eco_3_B', 'Shops_2_B', 'age_cat', 'dist_cat']

In [None]:
scaler = RobustScaler()

In [None]:
X_train[cols_for_scale] = scaler.fit_transform(X_train[cols_for_scale])
X_valid[cols_for_scale] = scaler.transform(X_valid[cols_for_scale])
test_df[cols_for_scale] = scaler.transform(test_df[cols_for_scale])

9. Построение модели

9.1 Обучение модели

In [None]:
X_train.isna().sum().sum(), X_valid.isna().sum().sum(), test_df.isna().sum().sum()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# NOTE(review): criterion='mse' was renamed to 'squared_error' in scikit-learn 1.0
# and later removed; switch the value if the installed version rejects it.
gb_model = GradientBoostingRegressor(criterion='mse', max_depth=7, min_samples_leaf=10, random_state=42, n_estimators=100)

# Fit on the 1-D price series rather than the one-column frame: the regressor
# expects a target of shape (n_samples,).
gb_model.fit(X_train, y_train['Price'])

y_pred = gb_model.predict(X_valid)
# Actual and predicted prices side by side for the validation split.
check_test = pd.DataFrame({'y_valid': y_valid['Price'], 'y_pred': y_pred.flatten()}, columns=['y_valid', 'y_pred'])
check_test

9.2 Метрика оценки качества

In [None]:
from sklearn.metrics import r2_score

In [None]:
r2_score(y_valid, y_pred)

10. Формирование файла с прогнозами

In [None]:
test_df.shape

In [None]:
test_df

In [None]:
submit = pd.read_csv('C:/АННА/Data Sience/Библиотеки Python/Проект/Архив/project_data_from_kaggle/sample_submission.csv')
submit.head()

In [None]:
predictions = gb_model.predict(test_df)
predictions

In [None]:
submit['Price'] = predictions
submit.head()

In [None]:
submit.to_csv('C:/АННА/Data Sience/Библиотеки Python/Проект/gb_submission.csv', index=False)