In [5]:
import numpy as np
import pandas as pd

import os

from sklearn.model_selection import train_test_split

In [2]:
# List the working directory in long format, newest entries first, to see which dataset folders are present.
!ls -lt

total 32
drwxr-xr-x@ 5 dashakretiuk  staff  160 26 фев 22:21 [34mreal-estate-price-prediction-moscow[m[m
drwxr-xr-x@ 5 dashakretiuk  staff  160 26 фев 22:20 [34mgb-black-friday-sales[m[m
-rw-r--r--  1 dashakretiuk  staff  555 26 фев 22:14 real_estate.ipynb
-rw-r--r--  1 dashakretiuk  staff  555 26 фев 22:14 black_friday_sales.ipynb
drwxr-xr-x@ 6 dashakretiuk  staff  192 18 фев 19:15 [34mvenv[m[m
-rw-r--r--  1 dashakretiuk  staff   45 18 фев 16:47 requirements.txt
-rw-r--r--  1 dashakretiuk  staff   15 18 фев 16:41 README.md


In [7]:
# Dataset folder (relative to the notebook's working directory) and the names
# of the CSV files that are read from it.
DATASET_PATH = "real-estate-price-prediction-moscow/"
TRAIN_CSV, TEST_CSV = "train.csv", "test.csv"

## Загрузка датасета

**Описание датасета**

* **Id** - идентификационный номер квартиры
* **DistrictId** - идентификационный номер района
* **Rooms** - количество комнат
* **Square** - площадь
* **LifeSquare** - жилая площадь
* **KitchenSquare** - площадь кухни
* **Floor** - этаж
* **HouseFloor** - количество этажей в доме
* **HouseYear** - год постройки дома
* **Ecology_1, Ecology_2, Ecology_3** - экологические показатели местности
* **Social_1, Social_2, Social_3** - социальные показатели местности
* **Healthcare_1, Helthcare_2** - показатели местности, связанные с охраной здоровья (имя столбца `Helthcare_2` записано так в самих данных)
* **Shops_1, Shops_2** - показатели, связанные с наличием магазинов, торговых центров
* **Price** - цена квартиры

In [8]:
# Build the file locations first, then read both comma-separated tables.
train_path = os.path.join(DATASET_PATH, TRAIN_CSV)
test_path = os.path.join(DATASET_PATH, TEST_CSV)

df_train = pd.read_csv(train_path, sep=',')
df_test = pd.read_csv(test_path, sep=',')

# Preview the first ten rows of the training table.
df_train.head(10)

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,11809,27,3.0,115.027311,,10.0,4,10.0,2014,0.075424,B,B,11,3097,0,,0,0,B,305018.871089
1,3013,22,1.0,39.832524,23.169223,8.0,7,8.0,1966,0.118537,B,B,30,6207,1,1183.0,1,0,B,177734.553407
2,8215,1,3.0,78.342215,47.671972,10.0,2,17.0,1988,0.025609,B,B,33,5261,0,240.0,3,1,B,282078.72085
3,2352,1,1.0,40.409907,,1.0,10,22.0,1977,0.007122,B,B,1,264,0,,0,1,B,168106.00763
4,13866,94,2.0,64.285067,38.562517,9.0,16,16.0,1972,0.282798,B,B,33,8667,2,,0,6,B,343995.102962
5,5804,99,3.0,62.528465,47.103833,6.0,9,9.0,1972,0.012339,B,B,35,5776,1,2078.0,2,4,B,161044.944138
6,8864,44,1.0,33.93875,21.77293,9.0,1,12.0,1968,0.232205,B,B,24,4860,1,80.0,0,3,B,142402.739272
7,13072,59,2.0,50.391814,32.893256,8.0,8,17.0,1986,0.019509,B,B,37,7687,11,176.0,5,5,B,326174.175191
8,9951,1,1.0,46.887892,44.628132,1.0,12,20.0,1977,0.007122,B,B,1,264,0,,0,1,B,160400.401732
9,8094,6,3.0,79.867477,75.085125,1.0,3,17.0,2014,0.243205,B,B,5,1564,0,540.0,0,0,B,194756.023945


In [9]:
# Separate features from the target by column NAME rather than by position,
# so the split stays correct even if the column order of the CSV changes.
X, Y = df_train.drop(columns='Price'), df_train['Price']

# Hold out 33% of the rows for validation; shuffling with a fixed seed keeps
# the split reproducible between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    X, Y, test_size=0.33, shuffle=True, random_state=21
)

## Анализ данных

In [10]:
# Summary statistics of the training-split features, used to spot implausible minimum/maximum values.
x_train.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1
count,6700.0,6700.0,6700.0,6700.0,5284.0,6700.0,6700.0,6700.0,6700.0,6700.0,6700.0,6700.0,6700.0,3462.0,6700.0,6700.0
mean,8400.447015,50.159701,1.895821,56.302571,36.221623,6.184179,8.541194,12.587313,4977.358,0.119933,24.778209,5379.07,7.888209,1155.736857,1.310597,4.217761
std,4888.123259,43.501559,0.849056,21.756246,21.314509,24.56837,5.29553,6.6673,244950.1,0.119618,17.629568,4033.819223,23.400692,1026.472476,1.492412,4.766738
min,0.0,0.0,0.0,2.596351,0.370619,0.0,1.0,0.0,1912.0,0.0,0.0,168.0,0.0,30.0,0.0,0.0
25%,4152.25,20.0,1.0,41.941181,22.766272,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0
50%,8389.5,35.0,2.0,52.621164,32.679351,6.0,7.0,13.0,1977.0,0.075779,25.0,5285.0,2.0,920.0,1.0,3.0
75%,12665.5,75.0,2.0,65.765015,44.93355,9.0,12.0,17.0,2001.0,0.195781,36.0,7287.0,5.0,1548.0,2.0,6.0
max,16797.0,209.0,19.0,641.065193,638.163193,1970.0,42.0,99.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0


In [16]:
# Print a frequency table for every column, ordered by value, so that
# implausible entries at either end of the range are easy to spot.
for column in df_train.columns:
    print(f'===== {column} =====')
    counts_by_value = df_train[column].value_counts().sort_index()
    print(counts_by_value.to_frame())
    print()

# OUTLIERS:
# - Rooms: a flat cannot have 0 rooms; 10 and 19 rooms are outliers
# - KitchenSquare: areas of 1970.0 and 2014.0 are most likely impossible - outliers
# - HouseYear: the construction year cannot be later than the current year (2021)
# - HouseFloor: 99 and 117 look like outliers
# - LifeSquare: 7480.592129 looks like an outlier

===== Id =====
       Id
0       1
2       1
5       1
6       1
10      1
...    ..
16793   1
16794   1
16796   1
16797   1
16798   1

[10000 rows x 1 columns]

===== DistrictId =====
     DistrictId
0            48
1           652
2            78
3            82
4            11
..          ...
202           2
205           1
207           1
208           2
209           1

[205 rows x 1 columns]

===== Rooms =====
      Rooms
0.0       8
1.0    3705
2.0    3880
3.0    2235
4.0     150
5.0      18
6.0       1
10.0      2
19.0      1

===== Square =====
            Square
1.136859         1
1.988943         1
2.377248         1
2.596351         1
2.954309         1
...            ...
212.932361       1
275.645284       1
409.425181       1
604.705972       1
641.065193       1

[10000 rows x 1 columns]

===== LifeSquare =====
             LifeSquare
0.370619              1
0.641822              1
0.795539              1
0.873147              1
1.049867              1
...               

In [13]:
# LifeSquare and Healthcare_1 contain null values => these need to be removed.
df_train.isna().sum()

Id                  0
DistrictId          0
Rooms               0
Square              0
LifeSquare       2113
KitchenSquare       0
Floor               0
HouseFloor          0
HouseYear           0
Ecology_1           0
Ecology_2           0
Ecology_3           0
Social_1            0
Social_2            0
Social_3            0
Healthcare_1     4798
Helthcare_2         0
Shops_1             0
Shops_2             0
Price               0
dtype: int64