### Итоговая работа по теме "Библиотеки Python для Data Science: Numpy, Matplotlib, Scikit-learn"

In [266]:
# Подключение библиотек
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [227]:
%matplotlib inline

In [228]:
# Константы
TRAIN_DATASET_PATH = 'train.csv'
TEST_DATASET_PATH = 'test.csv'

In [229]:
# Чтение датасета
df = pd.read_csv(TRAIN_DATASET_PATH)

In [230]:
# Просмор первых 5 записей и заголовков
df.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [231]:
# Размер датасета
df.shape

(10000, 20)

In [232]:
# Смотрим информацию о датасете (количество строк и столбцов, тип данных, наличие пустых полей)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             10000 non-null  int64  
 1   DistrictId     10000 non-null  int64  
 2   Rooms          10000 non-null  float64
 3   Square         10000 non-null  float64
 4   LifeSquare     7887 non-null   float64
 5   KitchenSquare  10000 non-null  float64
 6   Floor          10000 non-null  int64  
 7   HouseFloor     10000 non-null  float64
 8   HouseYear      10000 non-null  int64  
 9   Ecology_1      10000 non-null  float64
 10  Ecology_2      10000 non-null  object 
 11  Ecology_3      10000 non-null  object 
 12  Social_1       10000 non-null  int64  
 13  Social_2       10000 non-null  int64  
 14  Social_3       10000 non-null  int64  
 15  Healthcare_1   5202 non-null   float64
 16  Helthcare_2    10000 non-null  int64  
 17  Shops_1        10000 non-null  int64  
 18  Shops_2

In [233]:
# Выводы:
# LifeSquare, Healthcare_1 - имеют пропуски. Необходимо решить эту проблему.
# Ecology_2, Ecology_3, Shops_2 - категориальные признаки. Проверить уникальность и необходимость их участия в построение модели.

In [234]:
# Заменяем индекс датасета поскольку для построение модели он не несет никакой полезной информации
df.set_index('Id')

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.089040,B,B,33,7976,5,,0,11,B,184966.930730
15053,41,3.0,65.683640,40.049543,8.0,7,9.0,1978,0.000070,B,B,46,10309,1,240.0,1,16,B,300009.450063
4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,32,2.0,50.401785,30.476203,5.0,6,5.0,1968,0.135650,B,B,46,7960,6,350.0,3,11,B,196684.316040
6159,18,1.0,41.521546,20.539216,9.0,13,13.0,2000,0.000000,B,B,30,5562,0,,0,5,A,189050.289571
5123,27,1.0,47.939008,,1.0,12,16.0,2015,0.072158,B,B,2,629,1,,0,0,A,159143.805370
5400,75,2.0,43.602562,33.840147,8.0,1,5.0,1961,0.307467,B,A,30,5048,9,325.0,2,5,B,181595.339808


In [235]:
df.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0
mean,8383.4077,50.4008,1.8905,56.315775,37.199645,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313,214138.857399
std,4859.01902,43.587592,0.839512,21.058732,86.241209,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341,92872.293865
min,0.0,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,4169.5,20.0,1.0,41.774881,22.769832,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0,153872.633942
50%,8394.5,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879
75%,12592.5,75.0,2.0,65.900625,45.128803,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,1548.0,2.0,6.0,249135.462171
max,16798.0,209.0,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


In [236]:
# Выводы:
# Датасет требует проверку на выборосы. 
# Причины:
# У некоторых полей "min" == 0, что нелогично. Например: Rooms, KitchenSquare
# Завышенные значения в "max". Например: Square, LifeSquare, KitchenSquare, DistrictId

##### DistrictId

In [237]:
# Визуально оценим массив уникальных значений DistrictId
district_id = df['DistrictId'].unique()
district_id.sort()
district_id
# Выглядит правдоподобно

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [238]:
# Проверим, что DistrictId == 0 не выброс
df['DistrictId'].value_counts().sort_index()
# Будем считать, что DistrictId == 0 это порядковый номер района в некой БД. 

0       48
1      652
2       78
3       82
4       11
      ... 
202      2
205      1
207      1
208      2
209      1
Name: DistrictId, Length: 205, dtype: int64

##### Rooms

In [239]:
# Визуально оценим количество комнат в квартирах
df['Rooms'].value_counts()

2.0     3880
1.0     3705
3.0     2235
4.0      150
5.0       18
0.0        8
10.0       2
19.0       1
6.0        1
Name: Rooms, dtype: int64

In [240]:
df['Rooms'].mode()

0    2.0
dtype: float64

In [241]:
df.loc[df['Rooms']==0.0, 'Rooms'] = df['Rooms'].mode()[0]

In [242]:
# Видим, что количество квартир с двумя комнатами прибавилось на 8 едениц 
df['Rooms'].value_counts()

2.0     3888
1.0     3705
3.0     2235
4.0      150
5.0       18
10.0       2
19.0       1
6.0        1
Name: Rooms, dtype: int64

In [243]:
df.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,5202.0,10000.0,10000.0,10000.0
mean,8383.4077,50.4008,1.8921,56.315775,37.199645,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1142.90446,1.3195,4.2313,214138.857399
std,4859.01902,43.587592,0.837811,21.058732,86.241209,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,1021.517264,1.493601,4.806341,92872.293865
min,0.0,0.0,1.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,0.0,59174.778028
25%,4169.5,20.0,1.0,41.774881,22.769832,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,350.0,0.0,1.0,153872.633942
50%,8394.5,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,900.0,1.0,3.0,192269.644879
75%,12592.5,75.0,2.0,65.900625,45.128803,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,1548.0,2.0,6.0,249135.462171
max,16798.0,209.0,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,633233.46657


##### Square

In [244]:
square = df['Square'].sort_values()
square.tail(10)

5087    185.906396
7201    186.692602
652     190.857689
8511    198.930182
9910    200.334539
1981    212.932361
1982    275.645284
4690    409.425181
4262    604.705972
6977    641.065193
Name: Square, dtype: float64

In [257]:
# Несколько последних значений сильно опережают остальных
# Расммотрим данные по ним

In [258]:
# df['Price'].corr(df['Square'])

In [259]:
# df.plot(x='Price', y='Square', style='o')

In [260]:
df[df['Square'] > 400]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
4262,28,9,2.0,604.705972,,1.0,17,18.0,1977,0.161532,B,B,25,5648,1,30.0,2,4,B,187717.242538
4690,2307,102,1.0,409.425181,410.639749,10.0,4,4.0,2016,0.238617,B,B,26,3889,6,705.0,3,6,B,90470.43083
6977,11602,30,2.0,641.065193,638.163193,10.0,20,19.0,2019,7.8e-05,B,B,22,6398,141,1046.0,3,23,B,133529.681562


In [262]:
df[df['Square'] < df['LifeSquare']]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
28,8054,23,1.0,42.530043,43.967759,1.0,3,9.0,2014,0.034656,B,B,0,168,0,,0,0,B,95338.198549
44,10521,38,3.0,104.211396,106.340403,0.0,20,0.0,2017,0.060753,B,B,15,2787,2,520.0,0,7,B,435462.048070
52,2301,1,2.0,61.400054,65.224603,0.0,17,22.0,2016,0.007122,B,B,1,264,0,,0,1,B,199215.452229
123,8753,25,3.0,85.952306,89.803753,1.0,4,3.0,2017,0.069753,B,B,53,13670,4,,1,11,B,309688.592681
153,9870,62,1.0,51.831473,53.491301,1.0,5,1.0,2015,0.072158,B,B,2,629,1,,0,0,A,131797.472284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9884,41,30,2.0,59.015896,59.439082,12.0,5,5.0,2016,0.000078,B,B,22,6398,141,1046.0,3,23,B,126281.142781
9889,12918,23,2.0,51.440463,53.134243,51.0,3,17.0,2017,0.005767,B,B,1,388,0,,0,0,B,88150.012510
9895,2737,27,3.0,123.430072,125.806981,123.0,5,10.0,2015,0.017647,B,B,2,469,0,,0,0,B,234194.837047
9902,14001,73,1.0,44.098768,44.267551,1.0,7,24.0,2014,0.042032,B,B,37,6856,84,1940.0,2,5,B,381937.404161


Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
1982,5548,86,5.0,275.645284,233.949309,26.0,12,37.0,2011,0.161976,B,A,31,7010,5,4508.0,3,7,B,455264.882666
4262,28,9,2.0,604.705972,,1.0,17,18.0,1977,0.161532,B,B,25,5648,1,30.0,2,4,B,187717.242538
4690,2307,102,1.0,409.425181,410.639749,10.0,4,4.0,2016,0.238617,B,B,26,3889,6,705.0,3,6,B,90470.43083
6977,11602,30,2.0,641.065193,638.163193,10.0,20,19.0,2019,7.8e-05,B,B,22,6398,141,1046.0,3,23,B,133529.681562


In [252]:
# df['Square'].plot()

In [279]:
X = df.drop(columns=['Price'])
X = pd.get_dummies(X)

In [280]:
y = df['Price']

In [281]:
X, y

(         Id  DistrictId  Rooms     Square  LifeSquare  KitchenSquare  Floor  \
 0     14038          35    2.0  47.981561   29.442751            6.0      7   
 1     15053          41    3.0  65.683640   40.049543            8.0      7   
 2      4765          53    2.0  44.947953   29.197612            0.0      8   
 3      5809          58    2.0  53.352981   52.731512            9.0      8   
 4     10783          99    1.0  39.649192   23.776169            7.0     11   
 ...     ...         ...    ...        ...         ...            ...    ...   
 9995     77          32    2.0  50.401785   30.476203            5.0      6   
 9996   6159          18    1.0  41.521546   20.539216            9.0     13   
 9997   5123          27    1.0  47.939008         NaN            1.0     12   
 9998   5400          75    2.0  43.602562   33.840147            8.0      1   
 9999   6306         128    1.0  38.666645   21.157874            8.0      7   
 
       HouseFloor  HouseYear  Ecology_

In [283]:
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.3, random_state=42)

In [284]:
lr = LinearRegression()

In [285]:
lr.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').