# Exploratory data analysis

In [382]:
from typing import Dict

from scipy.stats import ks_2samp
import pandas as pd
import numpy as np

from plotly import express as px
from plotly import graph_objects as go
from plotly.subplots import make_subplots

import os
import re
from IPython.display import display_markdown

In [3]:
for d in os.listdir('../data/raw'):
    print(d)

.gitkeep
data_description.txt
sample_submission.csv
test.csv
train.csv


In [24]:
train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')

train['_t'] = 'train'
test['_t'] = 'test'

In [25]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,_t
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,208500,train
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,181500,train
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,223500,train
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,140000,train
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,250000,train


In [232]:
def get_descriptions() -> Dict[str, str]:
    descriptions: Dict[str, str] = {}
    feature_name_pattern = re.compile('^(\w+):.*')

    with open('../data/raw/data_description.txt', 'r') as f:        
        lines: list[str] = f.readlines()
        current_feature_name: str = ''
        current_feature_summary: str = ''
        current_feature_desc: str = ''

        for line in lines:
            match = feature_name_pattern.match(line)
            if match:
                tmp: str = match.group(1)
                if current_feature_name != tmp and current_feature_name != '':
                    descriptions[current_feature_name] = { 'summary': current_feature_summary, 'description': current_feature_desc }
                current_feature_name = tmp
                current_feature_desc = line
                current_feature_summary = line
            else:
                current_feature_desc += line
        
        descriptions[current_feature_name] = { 'summary': current_feature_summary, 'description': current_feature_desc }
    return descriptions

descriptions = get_descriptions()

In [75]:
print(descriptions['Alley']['summary'])

Alley: Type of alley access to property



## Features

### Sale price

Рассмотрим распределение цен на недивжимость в обучающей выборке.

In [35]:
px.histogram(train['SalePrice'], nbins=100, title='Sale Price')

In [23]:
px.histogram(train['SalePrice'].apply(lambda x: np.log(x)), nbins=100, title='Sale Price (log)')

Возможно, лучше будет работать predict для `log(target)` и на постпроцессинге делать обратное преобразование.

### Остальные признаки

#### Категориальные признаки

In [263]:
cat_features = train.select_dtypes([np.object]).columns
md: str = f'{len(cat_features)} категориальных признаков (в том числе технический признак `_t`):\n\n'
md += '| № | признак | описание | nunique | # n/a | # not n/a |\n'
md += '|--:|:-------:|:---------|--------:|------:|----------:|\n'
for i, cat_feature in enumerate(cat_features):
    if cat_feature == '_t':
        continue

    md += f'| {i + 1} | **{cat_feature}** | {descriptions[cat_feature]["summary"].split(":")[1].strip()} | {train[cat_feature].nunique()} | {train[train[cat_feature].isna()].shape[0]} | {train[~train[cat_feature].isna()].shape[0]} |\n'

display_markdown(md, raw=True)

44 категориальных признаков (в том числе технический признак `_t`):

| № | признак | описание | nunique | # n/a | # not n/a |
|--:|:-------:|:---------|--------:|------:|----------:|
| 1 | **MSZoning** | Identifies the general zoning classification of the sale. | 5 | 0 | 1460 |
| 2 | **Street** | Type of road access to property | 2 | 0 | 1460 |
| 3 | **Alley** | Type of alley access to property | 2 | 1369 | 91 |
| 4 | **LotShape** | General shape of property | 4 | 0 | 1460 |
| 5 | **LandContour** | Flatness of the property | 4 | 0 | 1460 |
| 6 | **Utilities** | Type of utilities available | 2 | 0 | 1460 |
| 7 | **LotConfig** | Lot configuration | 5 | 0 | 1460 |
| 8 | **LandSlope** | Slope of property | 3 | 0 | 1460 |
| 9 | **Neighborhood** | Physical locations within Ames city limits | 25 | 0 | 1460 |
| 10 | **Condition1** | Proximity to various conditions | 9 | 0 | 1460 |
| 11 | **Condition2** | Proximity to various conditions (if more than one is present) | 8 | 0 | 1460 |
| 12 | **BldgType** | Type of dwelling | 5 | 0 | 1460 |
| 13 | **HouseStyle** | Style of dwelling | 8 | 0 | 1460 |
| 14 | **RoofStyle** | Type of roof | 6 | 0 | 1460 |
| 15 | **RoofMatl** | Roof material | 8 | 0 | 1460 |
| 16 | **Exterior1st** | Exterior covering on house | 15 | 0 | 1460 |
| 17 | **Exterior2nd** | Exterior covering on house (if more than one material) | 16 | 0 | 1460 |
| 18 | **MasVnrType** | Masonry veneer type | 4 | 8 | 1452 |
| 19 | **ExterQual** | Evaluates the quality of the material on the exterior | 4 | 0 | 1460 |
| 20 | **ExterCond** | Evaluates the present condition of the material on the exterior | 5 | 0 | 1460 |
| 21 | **Foundation** | Type of foundation | 6 | 0 | 1460 |
| 22 | **BsmtQual** | Evaluates the height of the basement | 4 | 37 | 1423 |
| 23 | **BsmtCond** | Evaluates the general condition of the basement | 4 | 37 | 1423 |
| 24 | **BsmtExposure** | Refers to walkout or garden level walls | 4 | 38 | 1422 |
| 25 | **BsmtFinType1** | Rating of basement finished area | 6 | 37 | 1423 |
| 26 | **BsmtFinType2** | Rating of basement finished area (if multiple types) | 6 | 38 | 1422 |
| 27 | **Heating** | Type of heating | 6 | 0 | 1460 |
| 28 | **HeatingQC** | Heating quality and condition | 5 | 0 | 1460 |
| 29 | **CentralAir** | Central air conditioning | 2 | 0 | 1460 |
| 30 | **Electrical** | Electrical system | 5 | 1 | 1459 |
| 31 | **KitchenQual** | Kitchen quality | 4 | 0 | 1460 |
| 32 | **Functional** | Home functionality (Assume typical unless deductions are warranted) | 7 | 0 | 1460 |
| 33 | **FireplaceQu** | Fireplace quality | 5 | 690 | 770 |
| 34 | **GarageType** | Garage location | 6 | 81 | 1379 |
| 35 | **GarageFinish** | Interior finish of the garage | 3 | 81 | 1379 |
| 36 | **GarageQual** | Garage quality | 5 | 81 | 1379 |
| 37 | **GarageCond** | Garage condition | 5 | 81 | 1379 |
| 38 | **PavedDrive** | Paved driveway | 3 | 0 | 1460 |
| 39 | **PoolQC** | Pool quality | 3 | 1453 | 7 |
| 40 | **Fence** | Fence quality | 4 | 1179 | 281 |
| 41 | **MiscFeature** | Miscellaneous feature not covered in other categories | 4 | 1406 | 54 |
| 42 | **SaleType** | Type of sale | 9 | 0 | 1460 |
| 43 | **SaleCondition** | Condition of sale | 6 | 0 | 1460 |


Наиболее интересными выглядят признаки `Alley`, `FireplaceQu`, `PoolQC`, `Fence`, `MiscFeature`, т.к. много записей в данных, где эти признаки не определены. Возможно, это наиболее дорогое или наоборот наиболее дешёвое жильё.

In [353]:
def report(feature_name: str) -> None:
    not_na_records = train[~train[feature_name].isna()]
    na_records = train[train[feature_name].isna()]

    alley_not_na_percentile = ''.join([f'    - {x}: {np.percentile(not_na_records["SalePrice"], x)}\n' for x in [5, 25, 50, 75, 95]])
    alley_na_percentile = ''.join([f'    - {x}: {np.percentile(na_records["SalePrice"], x)}\n' for x in [5, 25, 50, 75, 95]])

    display_markdown(f"""
##### `{feature_name}` == n/a
- Записей `{feature_name}` == n/a: {na_records.shape[0]}
- Распределение стоимости жилья (перцентиль):

{alley_na_percentile}

##### `{feature_name}` != n/a
- Записей `{feature_name}` != n/a: {not_na_records.shape[0]}
- Распределение стоимости жилья (перцентиль):

{alley_not_na_percentile}
""", raw=True)

def ks(feature_name: str) -> None:
    result = ks_2samp(train[~train[feature_name].isna()]['SalePrice'], train[train[feature_name].isna()]['SalePrice'])
    return (result.statistic, result.pvalue) 

def hist(feature_name: str, nbins: int=100, log: bool=False) -> None:
    fig = go.Figure()

    na_data = train[train[feature_name].isna()]['SalePrice'].apply(lambda x: np.log(x) if log else x)
    not_na_data = train[~train[feature_name].isna()]['SalePrice'].apply(lambda x: np.log(x) if log else x)

    fig.add_trace(go.Histogram(x=na_data, nbinsx=nbins, name=f'{feature_name} is na'))
    fig.add_trace(go.Histogram(x=not_na_data, nbinsx=nbins, name=f'{feature_name} is not na'))

    fig.update_layout(showlegend=True, height=400, width=1000, title=feature_name)
    fig.show()

##### Alley

In [335]:
report('Alley')


##### `Alley` == n/a
- Записей `Alley` == n/a: 1369
- Распределение стоимости жилья (перцентиль):

    - 5: 89188.4
    - 25: 130500.0
    - 50: 165000.0
    - 75: 217500.0
    - 95: 335000.0


##### `Alley` != n/a
- Записей `Alley` != n/a: 91
- Распределение стоимости жилья (перцентиль):

    - 5: 70000.0
    - 25: 114752.0
    - 50: 140000.0
    - 75: 172200.0
    - 95: 207950.0



Из данных выше сложно делать какие-то выводы. Проведём тест Колмогорова-Смирнова:

In [336]:
stat, _ = ks('Alley')
print(stat)

0.2560383371194182


Значение статистики большое &mdash; можно отвергнуть нулевую гипотезу, что данные имеют одинаковое распределение.

Изобразим данные в разрезе признака на графике:

In [354]:
hist('Alley')

График так же не даёт возможности сделать однозначные выводы.

##### FireplaceQu

In [338]:
report('FireplaceQu')


##### `FireplaceQu` == n/a
- Записей `FireplaceQu` == n/a: 690
- Распределение стоимости жилья (перцентиль):

    - 5: 80000.0
    - 25: 112000.0
    - 50: 135000.0
    - 75: 164375.0
    - 95: 224955.0


##### `FireplaceQu` != n/a
- Записей `FireplaceQu` != n/a: 770
- Распределение стоимости жилья (перцентиль):

    - 5: 119335.0
    - 25: 159000.0
    - 50: 191000.0
    - 75: 257375.0
    - 95: 380550.0



Кажется, что жильё с не-пустым `FireplaceQu` стоит дороже. Проведём тест Колмогорова-Смирнова:

In [339]:
ks('FireplaceQu')

(0.4862224731789949, 8.575157499901099e-80)

Значение статистики большое &mdash; можно отвергнуть нулевую гипотезу, что данные имеют одинаковое распределение.

Изобразим данные в разрезе признака на графике:

In [355]:
hist('FireplaceQu', log=True, nbins=50)
hist('FireplaceQu')

На графике видно, что цены на жильё с `FireplaceQu` выше.

##### Fence

In [350]:
report('Fence')


##### `Fence` == n/a
- Записей `Fence` == n/a: 1179
- Распределение стоимости жилья (перцентиль):

    - 5: 87950.0
    - 25: 132750.0
    - 50: 173000.0
    - 75: 224700.0
    - 95: 339775.0


##### `Fence` != n/a
- Записей `Fence` != n/a: 281
- Распределение стоимости жилья (перцентиль):

    - 5: 89500.0
    - 25: 123000.0
    - 50: 141500.0
    - 75: 166000.0
    - 95: 256000.0



Из данных выше сложно сделать какие-то выводы. Провердём тест Колмогорова-Смирнова:

In [351]:
ks('Fence')

(0.29453454432400944, 1.1102230246251565e-16)

Значение статистики большое &mdash; можно отвергнуть нулевую гипотезу, что данные имеют одинаковое распределение.

Изобразим данные в разрезе признака на графике:

In [356]:
hist('Fence', log=True, nbins=50)
hist('Fence')

Визуально сильных перекосов в данных не заметно.

##### MiscFeature

In [357]:
report('MiscFeature')


##### `MiscFeature` == n/a
- Записей `MiscFeature` == n/a: 1406
- Распределение стоимости жилья (перцентиль):

    - 5: 89500.0
    - 25: 130000.0
    - 50: 164250.0
    - 75: 215000.0
    - 95: 332101.0


##### `MiscFeature` != n/a
- Записей `MiscFeature` != n/a: 54
- Распределение стоимости жилья (перцентиль):

    - 5: 78450.0
    - 25: 120125.0
    - 50: 146000.0
    - 75: 181750.0
    - 95: 259674.99999999994



Данных для не-пустых `MiscFeature` довольно мало, но судя по ним радикальной разницы не заметно. Проведём тест Колмогорова-Смирнова: 

In [359]:
ks('MiscFeature')

(0.1994889626468574, 0.02723654325436331)

Значение статистики большое &mdash; можно отвергнуть нулевую гипотезу, что распределения одинаковые.

Изобразим данные на графике:

In [360]:
hist('MiscFeature', log=True, nbins=50)
hist('MiscFeature')

Данных мало и на графике сложно что-то сказать, но явных различий не видно.

##### PoolQC

`PoolQC` &mdash; качество бассейна на объекте.

Записей с не-пустым `PoolQC` исчезающе мало &mdash; всего 7 штук:

In [373]:
train[~train['PoolQC'].isna()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,_t
197,198,75,RL,174.0,25419,Pave,,Reg,Lvl,AllPub,...,Ex,GdPrv,,0,3,2006,WD,Abnorml,235000,train
810,811,20,RL,78.0,10140,Pave,,Reg,Lvl,AllPub,...,Fa,GdPrv,,0,1,2006,WD,Normal,181000,train
1170,1171,80,RL,76.0,9880,Pave,,Reg,Lvl,AllPub,...,Gd,GdPrv,,0,7,2008,WD,Normal,171000,train
1182,1183,60,RL,160.0,15623,Pave,,IR1,Lvl,AllPub,...,Ex,MnPrv,,0,7,2007,WD,Abnorml,745000,train
1298,1299,60,RL,313.0,63887,Pave,,IR3,Bnk,AllPub,...,Gd,,,0,1,2008,New,Partial,160000,train
1386,1387,60,RL,80.0,16692,Pave,,IR1,Lvl,AllPub,...,Fa,MnPrv,TenC,2000,7,2006,WD,Normal,250000,train
1423,1424,80,RL,,19690,Pave,,IR1,Lvl,AllPub,...,Gd,GdPrv,,0,8,2006,WD,Alloca,274970,train


In [374]:
report('PoolQC')


##### `PoolQC` == n/a
- Записей `PoolQC` == n/a: 1453
- Распределение стоимости жилья (перцентиль):

    - 5: 88000.0
    - 25: 129900.0
    - 50: 162900.0
    - 75: 213500.0
    - 95: 325774.39999999997


##### `PoolQC` != n/a
- Записей `PoolQC` != n/a: 7
- Распределение стоимости жилья (перцентиль):

    - 5: 163300.0
    - 25: 176000.0
    - 50: 235000.0
    - 75: 262485.0
    - 95: 603990.9999999997



In [376]:
ks('PoolQC')

(0.48382656572608396, 0.050548351243565715)

In [378]:
hist('PoolQC', log=True, nbins=50)
hist('PoolQC')

Выглядит так, что дома с бассейнами смещены ближе к правой границе цен.

##### Остальные категориальные признаки

In [423]:
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=cat_features)

x = train[['SalePrice', cat_feature]].groupby(cat_feature).agg(np.median).reset_index()['SalePrice']
y = train[['SalePrice', cat_feature]].groupby(cat_feature).agg(np.median).reset_index()[cat_feature]
# train[['SalePrice', cat_feature]].groupby(cat_feature).agg(np.median)
fig.add_trace(go.Bar(x=x, y=y), row=1, col=1)

# fig.add_trace(go.Histogram(x=na_data, nbinsx=100, name=f'{cat_feature} is na'), row=i%22+1, col=i%2+1)
# fig.add_trace(go.Histogram(x=not_na_data, nbinsx=100, name=f'{cat_feature} is not na'), row=i%22+1, col=i%2+1)

# fig.update_layout(showlegend=False, height=4400)
fig.show()


# train[['SalePrice', cat_feature]].groupby(cat_feature).agg(np.median).reset_index()

In [432]:
fig = make_subplots(
    rows=22, cols=2,
    subplot_titles=cat_features)

col_idx = 1
row_idx = 1

all_data = train.append(test)

for i, cat_feature in enumerate(cat_features):
    print(cat_feature)
    # na_data = train[train[cat_feature].isna()]['SalePrice']
    # not_na_data = train[~train[cat_feature].isna()]['SalePrice']

    data = all_data[[cat_feature, '_t']]

    if col_idx == 3:
        col_idx = 1
        row_idx += 1
    
    fig.add_trace(go.Histogram(x=data[data['_t'] == 'train'][cat_feature], nbinsx=100, name=f'{cat_feature} - train'), row=row_idx, col=col_idx)
    fig.add_trace(go.Histogram(x=data[data['_t'] == 'test'][cat_feature], nbinsx=100, name=f'{cat_feature} - test'), row=row_idx, col=col_idx)
    col_idx += 1

    # fig.add_trace(go.Histogram(x=na_data, nbinsx=100, name=f'{cat_feature} is na'), row=i%22+1, col=i%2+1)
    # fig.add_trace(go.Histogram(x=not_na_data, nbinsx=100, name=f'{cat_feature} is not na'), row=i%22+1, col=i%2+1)

fig.update_layout(showlegend=False, height=4400)
fig.show()

MSZoning
Street
Alley
LotShape
LandContour
Utilities
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrType
ExterQual
ExterCond
Foundation
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Heating
HeatingQC
CentralAir
Electrical
KitchenQual
Functional
FireplaceQu
GarageType
GarageFinish
GarageQual
GarageCond
PavedDrive
PoolQC
Fence
MiscFeature
SaleType
SaleCondition
_t


Визуально тестовый и обучающий набор выглядят похожими.

### Вещественные признаки

In [439]:
num_features = train.select_dtypes([np.int64, np.float64]).columns
md: str = f'{len(num_features)} вещественных признаков:\n\n'
md += '| № | признак | описание | median  |   min   |   max   | # n/a | # not n/a |\n'
md += '|--:|:-------:|:---------|--------:|--------:|--------:|------:|----------:|\n'
for i, num_feature in enumerate(num_features):
    if num_feature == 'Id':
        continue

    desc: str = descriptions[num_feature]["summary"].split(":")[1].strip() if num_feature in descriptions else 'description not provided'
    md += f'| {i + 1} | **{num_feature}** | {desc} | {train[num_feature].median()} | {train[num_feature].min()} | {train[num_feature].max()} | {train[train[num_feature].isna()].shape[0]} | {train[~train[num_feature].isna()].shape[0]} |\n'

display_markdown(md, raw=True)

38 вещественных признаков:

| № | признак | описание | median  |   min   |   max   | # n/a | # not n/a |
|--:|:-------:|:---------|--------:|--------:|--------:|------:|----------:|
| 2 | **MSSubClass** | Identifies the type of dwelling involved in the sale. | 50.0 | 20 | 190 | 0 | 1460 |
| 3 | **LotFrontage** | Linear feet of street connected to property | 69.0 | 21.0 | 313.0 | 259 | 1201 |
| 4 | **LotArea** | Lot size in square feet | 9478.5 | 1300 | 215245 | 0 | 1460 |
| 5 | **OverallQual** | Rates the overall material and finish of the house | 6.0 | 1 | 10 | 0 | 1460 |
| 6 | **OverallCond** | Rates the overall condition of the house | 5.0 | 1 | 9 | 0 | 1460 |
| 7 | **YearBuilt** | Original construction date | 1973.0 | 1872 | 2010 | 0 | 1460 |
| 8 | **YearRemodAdd** | Remodel date (same as construction date if no remodeling or additions) | 1994.0 | 1950 | 2010 | 0 | 1460 |
| 9 | **MasVnrArea** | Masonry veneer area in square feet | 0.0 | 0.0 | 1600.0 | 8 | 1452 |
| 10 | **BsmtFinSF1** | Type 1 finished square feet | 383.5 | 0 | 5644 | 0 | 1460 |
| 11 | **BsmtFinSF2** | Type 2 finished square feet | 0.0 | 0 | 1474 | 0 | 1460 |
| 12 | **BsmtUnfSF** | Unfinished square feet of basement area | 477.5 | 0 | 2336 | 0 | 1460 |
| 13 | **TotalBsmtSF** | Total square feet of basement area | 991.5 | 0 | 6110 | 0 | 1460 |
| 14 | **1stFlrSF** | First Floor square feet | 1087.0 | 334 | 4692 | 0 | 1460 |
| 15 | **2ndFlrSF** | Second floor square feet | 0.0 | 0 | 2065 | 0 | 1460 |
| 16 | **LowQualFinSF** | Low quality finished square feet (all floors) | 0.0 | 0 | 572 | 0 | 1460 |
| 17 | **GrLivArea** | Above grade (ground) living area square feet | 1464.0 | 334 | 5642 | 0 | 1460 |
| 18 | **BsmtFullBath** | Basement full bathrooms | 0.0 | 0 | 3 | 0 | 1460 |
| 19 | **BsmtHalfBath** | Basement half bathrooms | 0.0 | 0 | 2 | 0 | 1460 |
| 20 | **FullBath** | Full bathrooms above grade | 2.0 | 0 | 3 | 0 | 1460 |
| 21 | **HalfBath** | Half baths above grade | 0.0 | 0 | 2 | 0 | 1460 |
| 22 | **BedroomAbvGr** | description not provided | 3.0 | 0 | 8 | 0 | 1460 |
| 23 | **KitchenAbvGr** | description not provided | 1.0 | 0 | 3 | 0 | 1460 |
| 24 | **TotRmsAbvGrd** | Total rooms above grade (does not include bathrooms) | 6.0 | 2 | 14 | 0 | 1460 |
| 25 | **Fireplaces** | Number of fireplaces | 1.0 | 0 | 3 | 0 | 1460 |
| 26 | **GarageYrBlt** | Year garage was built | 1980.0 | 1900.0 | 2010.0 | 81 | 1379 |
| 27 | **GarageCars** | Size of garage in car capacity | 2.0 | 0 | 4 | 0 | 1460 |
| 28 | **GarageArea** | Size of garage in square feet | 480.0 | 0 | 1418 | 0 | 1460 |
| 29 | **WoodDeckSF** | Wood deck area in square feet | 0.0 | 0 | 857 | 0 | 1460 |
| 30 | **OpenPorchSF** | Open porch area in square feet | 25.0 | 0 | 547 | 0 | 1460 |
| 31 | **EnclosedPorch** | Enclosed porch area in square feet | 0.0 | 0 | 552 | 0 | 1460 |
| 32 | **3SsnPorch** | Three season porch area in square feet | 0.0 | 0 | 508 | 0 | 1460 |
| 33 | **ScreenPorch** | Screen porch area in square feet | 0.0 | 0 | 480 | 0 | 1460 |
| 34 | **PoolArea** | Pool area in square feet | 0.0 | 0 | 738 | 0 | 1460 |
| 35 | **MiscVal** | $Value of miscellaneous feature | 0.0 | 0 | 15500 | 0 | 1460 |
| 36 | **MoSold** | Month Sold (MM) | 6.0 | 1 | 12 | 0 | 1460 |
| 37 | **YrSold** | Year Sold (YYYY) | 2008.0 | 2006 | 2010 | 0 | 1460 |
| 38 | **SalePrice** | description not provided | 163000.0 | 34900 | 755000 | 0 | 1460 |


##### Коррелация

In [448]:
num_feature_values = train.select_dtypes([np.int64, np.float64]).drop('Id', axis='columns')
px.imshow(num_feature_values.corr(), text_auto=True, width=1000, height=1000)

In [457]:
corrs = num_feature_values.corrwith(num_feature_values['SalePrice']).sort_values(ascending=False)
top_correlated_features = corrs[1:11]
top_correlated_features

OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
dtype: float64

In [464]:
px.imshow(train[list(top_correlated_features.index) + ['SalePrice']].corr(), text_auto=True)

In [477]:
%load_ext watermark

%watermark -n -u -v -iv -w -p plotly,pandas,scipy,numpy

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
pandas 1.1.5
re     2.2.1
numpy  1.19.5
last updated: Sat May 07 2022 

CPython 3.6.8
IPython 7.16.3

plotly 5.7.0
pandas 1.1.5
scipy 1.5.4
numpy 1.19.5
watermark 2.0.2
