### Итоговая работа по теме "Библиотеки Python для Data Science: Numpy, Matplotlib, Scikit-learn"

In [7]:
# Подключение библиотек
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [8]:
%matplotlib inline

In [9]:
# Константы
TRAIN_DATASET_PATH = 'train.csv'
TEST_DATASET_PATH = 'test.csv'

In [78]:
# Чтение датасета
df = pd.read_csv(TRAIN_DATASET_PATH)

In [77]:
# Просмор первых 5 записей и заголовков
df.head(100)

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.000000,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,856,7,1.0,40.061694,24.430708,6.0,5,9.0,1973,0.130618,B,B,39,10418,9,900.0,1,9,B
96,12791,44,2.0,59.807466,36.616450,8.0,7,14.0,1971,0.122190,B,B,25,4724,2,1308.0,1,3,B
97,12499,6,3.0,81.725331,,0.0,10,17.0,1977,0.243205,B,B,5,1564,0,540.0,0,0,B
98,12249,6,1.0,43.286141,,0.0,12,17.0,1977,0.243205,B,B,5,1564,0,540.0,0,0,B


In [12]:
# Размер датасета
df.shape

(10000, 20)

In [13]:
# Смотрим информацию о датасете (количество строк и столбцов, тип данных, наличие пустых полей)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             10000 non-null  int64  
 1   DistrictId     10000 non-null  int64  
 2   Rooms          10000 non-null  float64
 3   Square         10000 non-null  float64
 4   LifeSquare     7887 non-null   float64
 5   KitchenSquare  10000 non-null  float64
 6   Floor          10000 non-null  int64  
 7   HouseFloor     10000 non-null  float64
 8   HouseYear      10000 non-null  int64  
 9   Ecology_1      10000 non-null  float64
 10  Ecology_2      10000 non-null  object 
 11  Ecology_3      10000 non-null  object 
 12  Social_1       10000 non-null  int64  
 13  Social_2       10000 non-null  int64  
 14  Social_3       10000 non-null  int64  
 15  Healthcare_1   5202 non-null   float64
 16  Helthcare_2    10000 non-null  int64  
 17  Shops_1        10000 non-null  int64  
 18  Shops_2

In [14]:
# Выводы:
# LifeSquare, Healthcare_1 - имеют пропуски. Необходимо решить эту проблему.
# Ecology_2, Ecology_3, Shops_2 - категориальные признаки. Проверить уникальность и необходимость их участия в построение модели.

In [15]:
# Заменяем индекс датасета поскольку для построение модели он не несет никакой полезной информации
df.set_index('Id')

Unnamed: 0_level_0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.089040,B,B,33,7976,5,,0,11,B,184966.930730
15053,41,3.0,65.683640,40.049543,8.0,7,9.0,1978,0.000070,B,B,46,10309,1,240.0,1,16,B,300009.450063
4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,32,2.0,50.401785,30.476203,5.0,6,5.0,1968,0.135650,B,B,46,7960,6,350.0,3,11,B,196684.316040
6159,18,1.0,41.521546,20.539216,9.0,13,13.0,2000,0.000000,B,B,30,5562,0,,0,5,A,189050.289571
5123,27,1.0,47.939008,,1.0,12,16.0,2015,0.072158,B,B,2,629,1,,0,0,A,159143.805370
5400,75,2.0,43.602562,33.840147,8.0,1,5.0,1961,0.307467,B,A,30,5048,9,325.0,2,5,B,181595.339808


In [69]:
df.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Price
count,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,3545.0,6015.0,6015.0,6015.0
mean,8426.699418,59.432585,1.939318,54.606086,33.372851,7.381546,7.504406,12.98537,5318.224,0.124681,30.463175,6498.158271,6.971737,1231.522426,1.610973,4.705569,227693.253236
std,4840.048061,46.375154,0.8566,18.000776,14.243134,3.104363,4.721679,5.728341,258521.8,0.121109,15.962437,3825.392409,17.783932,1090.619423,1.513034,4.277488,94347.358487
min,0.0,0.0,1.0,16.117154,0.795539,0.0,1.0,1.0,1910.0,0.0,0.0,168.0,0.0,30.0,0.0,0.0,59174.778028
25%,4268.0,21.0,1.0,41.058593,22.39453,6.0,4.0,9.0,1970.0,0.033494,22.0,4378.0,1.0,320.0,0.0,1.0,166577.475077
50%,8452.0,51.0,2.0,50.392833,31.557039,8.0,7.0,12.0,1980.0,0.090799,30.0,5889.0,3.0,1015.0,1.0,4.0,203646.504266
75%,12643.5,93.0,3.0,64.052291,42.267723,9.0,10.0,17.0,2001.0,0.194489,39.0,7960.0,6.0,1937.0,3.0,6.0,268557.63814
max,16798.0,208.0,19.0,275.645284,233.949309,58.0,42.0,99.0,20052010.0,0.521867,74.0,19083.0,141.0,4849.0,6.0,23.0,625678.644994


In [17]:
# Выводы:
# Датасет требует проверку на выборосы. 
# Причины:
# У некоторых полей "min" == 0, что нелогично. Например: Rooms, KitchenSquare
# Завышенные значения в "max". Например: Square, LifeSquare, KitchenSquare, DistrictId

##### DistrictId

In [18]:
# Визуально оценим массив уникальных значений DistrictId
district_id = df['DistrictId'].unique()
district_id.sort()
district_id
# Выглядит правдоподобно

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [19]:
# Проверим, что DistrictId == 0 не выброс
df['DistrictId'].value_counts().sort_index()
# Будем считать, что DistrictId == 0 это порядковый номер района в некой БД. 

0       48
1      652
2       78
3       82
4       11
      ... 
202      2
205      1
207      1
208      2
209      1
Name: DistrictId, Length: 205, dtype: int64

##### Rooms

In [36]:
# Визуально оценим количество комнат в квартирах
df['Rooms'].value_counts()

2.0     3880
1.0     3705
3.0     2235
4.0      150
5.0       18
10.0       2
19.0       1
6.0        1
Name: Rooms, dtype: int64

In [21]:
# df['Rooms'].mode()

In [22]:
# df.loc[df['Rooms']==0.0, 'Rooms'] = df['Rooms'].mode()[0]

In [23]:
# Видим, что количество квартир с двумя комнатами прибавилось на 8 едениц 
# df['Rooms'].value_counts()

In [24]:
# df.describe()

In [37]:
df.drop(df[df['Rooms']==0.0].index, inplace=True)

##### Square

In [244]:
square = df['Square'].sort_values()
square.tail(10)

5087    185.906396
7201    186.692602
652     190.857689
8511    198.930182
9910    200.334539
1981    212.932361
1982    275.645284
4690    409.425181
4262    604.705972
6977    641.065193
Name: Square, dtype: float64

In [257]:
# Несколько последних значений сильно опережают остальных
# Расммотрим данные по ним

In [258]:
# df['Price'].corr(df['Square'])

In [259]:
# df.plot(x='Price', y='Square', style='o')

In [88]:
df.drop(df[df['Square'] > 400].index, inplace=True)

In [89]:
df.drop(df[df['Square'] < df['LifeSquare']].index, inplace=True)

In [90]:
df.drop(df[df['LifeSquare'].isnull()].index, inplace=True)

In [91]:
df.drop(df[df['KitchenSquare'] > 1000].index, inplace=True)

In [92]:
df.drop(df[df['KitchenSquare']+df['LifeSquare'] > df['Square']].index, inplace=True)

In [93]:
df.drop(df[df['HouseFloor']==0.0].index, inplace=True)

In [94]:
df.drop(df[df['Floor']>df['HouseFloor']].index, inplace=True)

In [95]:
df.drop(columns=['Healthcare_1'], inplace=True)

In [96]:
df.isnull().sum()

Id               0
DistrictId       0
Rooms            0
Square           0
LifeSquare       0
KitchenSquare    0
Floor            0
HouseFloor       0
HouseYear        0
Ecology_1        0
Ecology_2        0
Ecology_3        0
Social_1         0
Social_2         0
Social_3         0
Helthcare_2      0
Shops_1          0
Shops_2          0
Price            0
dtype: int64

In [73]:
df

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.089040,B,B,33,7976,5,0,11,B,184966.930730
1,15053,41,3.0,65.683640,40.049543,8.0,7,9.0,1978,0.000070,B,B,46,10309,1,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,1,3,B,220925.908524
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2,4,B,150226.531644
5,12915,59,3.0,80.384479,46.683720,12.0,5,17.0,2011,0.309479,B,B,35,7715,4,0,6,B,215898.447742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9992,14333,150,3.0,78.249637,49.385096,10.0,5,16.0,1976,0.300323,B,B,52,10311,6,1,9,B,342418.758888
9993,48,13,3.0,80.631333,48.899083,10.0,11,14.0,1999,0.090799,B,B,74,19083,2,5,15,B,350684.628560
9996,6159,18,1.0,41.521546,20.539216,9.0,13,13.0,2000,0.000000,B,B,30,5562,0,0,5,A,189050.289571
9998,5400,75,2.0,43.602562,33.840147,8.0,1,5.0,1961,0.307467,B,A,30,5048,9,2,5,B,181595.339808


In [74]:
df.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Price
count,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0,6015.0
mean,8426.699418,59.432585,1.939318,54.606086,33.372851,7.381546,7.504406,12.98537,5318.224,0.124681,30.463175,6498.158271,6.971737,1.610973,4.705569,227693.253236
std,4840.048061,46.375154,0.8566,18.000776,14.243134,3.104363,4.721679,5.728341,258521.8,0.121109,15.962437,3825.392409,17.783932,1.513034,4.277488,94347.358487
min,0.0,0.0,1.0,16.117154,0.795539,0.0,1.0,1.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,59174.778028
25%,4268.0,21.0,1.0,41.058593,22.39453,6.0,4.0,9.0,1970.0,0.033494,22.0,4378.0,1.0,0.0,1.0,166577.475077
50%,8452.0,51.0,2.0,50.392833,31.557039,8.0,7.0,12.0,1980.0,0.090799,30.0,5889.0,3.0,1.0,4.0,203646.504266
75%,12643.5,93.0,3.0,64.052291,42.267723,9.0,10.0,17.0,2001.0,0.194489,39.0,7960.0,6.0,3.0,6.0,268557.63814
max,16798.0,208.0,19.0,275.645284,233.949309,58.0,42.0,99.0,20052010.0,0.521867,74.0,19083.0,141.0,6.0,23.0,625678.644994


In [252]:
# df['Square'].plot()

In [97]:
X = df.drop(columns=['Price'])
X = pd.get_dummies(X)

In [98]:
y = df['Price']

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.3, random_state=42)

In [100]:
lr = LinearRegression()

In [101]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [102]:
y_pred = lr.predict(X_test)

In [103]:
r2_score(y_test, y_pred)

0.5431814687655208

# Вывод: Удаление параметров показало себя плохо