## Курсовой проект по предмету "Python для Data Science"
Используя данные из train.csv, построить
модель для предсказания цен на недвижимость (квартиры).
С помощью полученной модели предсказать
цены для квартир из файла test.csv.

Целевая переменная:
Price

Основная метрика:
R2 - коэффициент детерминации (sklearn.metrics.r2_score)

Вспомогательная метрика:
MSE - средняя квадратичная ошибка (sklearn.metrics.mean_squared_error)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

Просмотрим состав Базы данных, заполненность признаков.

In [2]:
data_ = pd.read_csv('train.csv')
data_.shape

(10000, 20)

In [3]:
data_.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Price'],
      dtype='object')

In [4]:
data_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.5+ MB


### Подготовка данных (произведена на всей выборке, т.е. и на train  и на valid)
##### Коррекция данных в столбце Healthcare_1 
По итогам рассмотрения Базы данных оказалось, что столбец Healthcare_1 заполнен на 52%. Так как фактически половина данных по признаку отсутствует и корректный способ восстановления данных непонятен, то столбец лучше убрать и тренировать модель на выборке без него.

In [5]:
data = data_.drop('Healthcare_1', axis=1)

In [6]:
data.shape

(10000, 19)

In [7]:
data.describe()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Price
count,10000.0,10000.0,10000.0,10000.0,7887.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,8383.4077,50.4008,1.8905,56.315775,37.199645,6.2733,8.5267,12.6094,3990.166,0.118858,24.687,5352.1574,8.0392,1.3195,4.2313,214138.857399
std,4859.01902,43.587592,0.839512,21.058732,86.241209,28.560917,5.241148,6.775974,200500.3,0.119025,17.532614,4006.799803,23.831875,1.493601,4.806341,92872.293865
min,0.0,0.0,0.0,1.136859,0.370619,0.0,1.0,0.0,1910.0,0.0,0.0,168.0,0.0,0.0,0.0,59174.778028
25%,4169.5,20.0,1.0,41.774881,22.769832,1.0,4.0,9.0,1974.0,0.017647,6.0,1564.0,0.0,0.0,1.0,153872.633942
50%,8394.5,36.0,2.0,52.51331,32.78126,6.0,7.0,13.0,1977.0,0.075424,25.0,5285.0,2.0,1.0,3.0,192269.644879
75%,12592.5,75.0,2.0,65.900625,45.128803,9.0,12.0,17.0,2001.0,0.195781,36.0,7227.0,5.0,2.0,6.0,249135.462171
max,16798.0,209.0,19.0,641.065193,7480.592129,2014.0,42.0,117.0,20052010.0,0.521867,74.0,19083.0,141.0,6.0,23.0,633233.46657


##### Коррекция Года постройки (HouseYear)
Смотрим здания очень ранних лет постройки (раньше 1900 года) и постройки позднее 2020 года.
Вручную изменяем явно неправильные значения (так как ошибок мало).

In [8]:
data.loc[data['HouseYear']>2020, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Shops_2,Price
1497,10814,109,1.0,37.26507,20.239714,9.0,9,12.0,20052011,0.13633,B,B,30,6141,10,3,6,B,254084.534396
4189,11607,147,2.0,44.791836,28.360393,5.0,4,9.0,4968,0.319809,B,B,25,4756,16,5,8,B,243028.603096


In [9]:
data.loc[data['HouseYear'] == 20052011, 'HouseYear'] = 2008

In [10]:
data.loc[data['HouseYear'] == 4968, 'HouseYear'] = 1968

In [11]:
data.loc[data['HouseYear']<1900, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Shops_2,Price


##### Коррекция Комнат (Rooms)
Смотрим квартиры с большим количеством комнат (больше 5) и с количеством 0 комнат. Исходя из площади оцениваем вероятность того, что количество комнат указано верно. Необходимые изменения делаются вручную из-за малого количества выбросов.

In [12]:
data.Rooms.describe()

count    10000.000000
mean         1.890500
std          0.839512
min          0.000000
25%          1.000000
50%          2.000000
75%          2.000000
max         19.000000
Name: Rooms, dtype: float64

In [13]:
data.loc[data['Rooms'] > 5, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Shops_2,Price
377,5927,57,10.0,59.056975,36.223072,10.0,22,22.0,2002,0.090799,B,B,74,19083,2,5,15,B,317265.323792
1454,8491,1,19.0,42.006046,21.779288,7.0,17,17.0,2014,0.007122,B,B,1,264,0,0,1,B,78364.616704
2170,14003,99,6.0,59.414334,38.702244,6.0,7,9.0,1969,0.033494,B,B,66,10573,1,3,8,B,229661.964416
8849,14865,9,10.0,60.871266,38.420681,10.0,3,2.0,1994,0.161532,B,B,25,5648,1,2,4,B,172329.270863


In [14]:
data.loc[data['Id'].isin([5927, 14003, 14865]), 'Rooms'] = 2 

In [15]:
data.loc[data['Id'] == 8491, 'Rooms'] = 1 

In [16]:
data.loc[data['Rooms'] == 0, :].shape

(8, 19)

In [17]:
data.loc[data['Rooms'] == 0, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Shops_2,Price
1397,12638,27,0.0,138.427694,136.215499,0.0,4,3.0,2016,0.075424,B,B,11,3097,0,0,0,B,268394.744389
1981,7917,27,0.0,212.932361,211.231125,0.0,2,3.0,2008,0.211401,B,B,9,1892,0,0,1,B,302211.260887
2269,7317,27,0.0,41.790881,,0.0,13,0.0,1977,0.211401,B,B,9,1892,0,0,1,B,98129.976788
3911,770,28,0.0,49.483501,,0.0,16,0.0,2015,0.118537,B,B,30,6207,1,1,0,B,217009.338463
4366,456,6,0.0,81.491446,,0.0,4,0.0,1977,0.243205,B,B,5,1564,0,0,0,B,212864.799112
4853,3224,27,0.0,2.377248,0.873147,0.0,1,0.0,1977,0.017647,B,B,2,469,0,0,0,B,126596.941798
6149,3159,88,0.0,38.697117,19.345131,9.0,9,16.0,1982,0.127376,B,B,43,8429,3,3,9,B,158998.110646
8834,9443,27,0.0,87.762616,85.125471,0.0,5,15.0,1977,0.211401,B,B,9,1892,0,0,1,B,219281.918007


In [18]:
data.loc[data['Id'].isin([12638, 7917]), 'Rooms'] = 6

In [19]:
data.loc[data['Id'].isin([7317, 770, 3159, 3224]), 'Rooms'] = 1

In [20]:
data.loc[data['Id'].isin([456, 9443]), 'Rooms'] = 3

In [21]:
data.loc[data['Id'].isin([5927, 14003, 14865, 8491, 12638, 7917, 7317, 770, 3159, 3224, 456, 9443])]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Shops_2,Price
377,5927,57,2.0,59.056975,36.223072,10.0,22,22.0,2002,0.090799,B,B,74,19083,2,5,15,B,317265.323792
1397,12638,27,6.0,138.427694,136.215499,0.0,4,3.0,2016,0.075424,B,B,11,3097,0,0,0,B,268394.744389
1454,8491,1,1.0,42.006046,21.779288,7.0,17,17.0,2014,0.007122,B,B,1,264,0,0,1,B,78364.616704
1981,7917,27,6.0,212.932361,211.231125,0.0,2,3.0,2008,0.211401,B,B,9,1892,0,0,1,B,302211.260887
2170,14003,99,2.0,59.414334,38.702244,6.0,7,9.0,1969,0.033494,B,B,66,10573,1,3,8,B,229661.964416
2269,7317,27,1.0,41.790881,,0.0,13,0.0,1977,0.211401,B,B,9,1892,0,0,1,B,98129.976788
3911,770,28,1.0,49.483501,,0.0,16,0.0,2015,0.118537,B,B,30,6207,1,1,0,B,217009.338463
4366,456,6,3.0,81.491446,,0.0,4,0.0,1977,0.243205,B,B,5,1564,0,0,0,B,212864.799112
4853,3224,27,1.0,2.377248,0.873147,0.0,1,0.0,1977,0.017647,B,B,2,469,0,0,0,B,126596.941798
6149,3159,88,1.0,38.697117,19.345131,9.0,9,16.0,1982,0.127376,B,B,43,8429,3,3,9,B,158998.110646


##### Коррекция Площади (Square)

При размере площади Square менее 15 кв.м. заменяем её на усреднённую площадь (усреднённую по квартирам с таким же количеством комнат).

In [22]:
data.loc[data['Square'] < 15, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Shops_2,Price
212,1748,88,2.0,5.497061,67.628717,1.0,24,22.0,1977,0.127376,B,B,43,8429,3,3,9,B,412511.088764
1316,11526,27,1.0,4.633498,1.969969,1.0,18,1.0,1977,0.011654,B,B,4,915,0,0,0,B,107604.269441
1608,10202,6,1.0,2.596351,4.604943,1.0,3,25.0,2014,0.243205,B,B,5,1564,0,0,0,B,137597.601458
3280,10527,27,1.0,4.380726,40.805837,1.0,10,17.0,2013,0.211401,B,B,9,1892,0,0,1,B,97560.720383
3413,9487,5,1.0,5.129222,5.549458,1.0,1,1.0,1977,0.150818,B,B,16,3433,4,4,5,B,369472.403061
4739,12676,81,3.0,13.784865,15.988889,7.0,4,5.0,1960,0.319809,B,B,25,4756,16,5,8,B,78388.806186
4853,3224,27,1.0,2.377248,0.873147,0.0,1,0.0,1977,0.017647,B,B,2,469,0,0,0,B,126596.941798
4900,4504,27,3.0,4.390331,5.610772,1.0,8,19.0,2016,0.211401,B,B,9,1892,0,0,1,B,161379.067034
6392,14786,1,1.0,1.136859,4.525736,1.0,3,1.0,1977,0.007122,B,B,1,264,0,0,1,B,181434.825589
8030,13265,1,3.0,4.823679,79.767964,0.0,6,17.0,1977,0.007122,B,B,1,264,0,0,1,B,237716.681261


In [23]:
small_square = data.loc[data['Square'] < 15, 'Id'].values

In [24]:
mean_square = data.groupby('Rooms', as_index=False)[['Square']].mean().rename(columns={'Square':'mean_square'})
mean_square

Unnamed: 0,Rooms,mean_square
0,1.0,41.314582
1,2.0,56.790526
2,3.0,76.91014
3,4.0,98.377544
4,5.0,122.614941
5,6.0,175.680028


In [25]:
data = pd.merge(data, mean_square, on='Rooms', how='left')

In [26]:
data.loc[data['Id'].isin(small_square), 'Square'] = data['mean_square']

При размере площади Square более 150 кв.м. при явном несоответствии площади признакам Rooms и Price (неправдоподобно малое количество комнат и/или малая цена) вручную производим замену на усреднённую площадь (усреднение по квартирам с таким же количеством комнат).

In [27]:
data = data[['Id',
 'DistrictId',
 'Rooms',
 'Square',
 'LifeSquare',
 'KitchenSquare',
 'mean_square',
 'Price',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Ecology_2',
 'Ecology_3',
 'Social_1',
 'Social_2',
 'Social_3',
 'Helthcare_2',
 'Shops_1',
 'Shops_2']]

In [28]:
data.loc[data['Square'] > 150, :]

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,mean_square,Price,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Shops_2
652,3718,56,3.0,190.857689,107.376164,19.0,76.91014,338644.075979,7,3.0,2007,0.0,B,B,23,5067,4,2,6,B
1641,11055,151,1.0,174.649522,174.837082,15.0,41.314582,232766.187698,10,12.0,1958,0.236108,B,B,16,3893,27,3,10,B
1812,5602,93,3.0,169.385184,,1.0,76.91014,190949.293998,2,10.0,1977,0.004949,B,B,0,186,0,0,0,B
1981,7917,27,6.0,212.932361,211.231125,0.0,175.680028,302211.260887,2,3.0,2008,0.211401,B,B,9,1892,0,0,1,B
1982,5548,86,5.0,275.645284,233.949309,26.0,122.614941,455264.882666,12,37.0,2011,0.161976,B,A,31,7010,5,3,7,B
2603,5621,23,3.0,163.495333,161.504222,12.0,76.91014,207007.956663,5,3.0,1977,0.014073,B,B,2,475,0,0,0,B
2656,14556,183,4.0,151.15066,98.187482,23.0,98.377544,284884.959104,6,7.0,1912,0.037316,B,B,32,6748,93,4,10,B
3260,16204,23,1.0,160.647588,,1.0,41.314582,217874.997452,3,3.0,1977,0.014073,B,B,2,475,0,0,0,B
3368,6674,54,1.0,168.086347,125.486563,30.0,41.314582,367353.574566,18,28.0,2011,0.006076,B,B,30,5285,0,6,6,B
3651,5300,30,3.0,169.509941,170.713651,1.0,76.91014,200800.673767,2,2.0,2018,7.8e-05,B,B,22,6398,141,3,23,B


In [29]:
# Replace Square with the per-room-count mean (mean_square) for four manually chosen flats.
# NOTE(review): Ids 28, 2307, 11602 and 5376 are not among the rows shown in the
# In [28] output above; presumably that output is truncated - verify the Ids.
data.loc[data['Id'].isin([28, 2307, 11602, 5376]), 'Square'] = data['mean_square']

##### Коррекция ЖилойПлощади (LifeSquare)

Если ЖилаяПлощадь больше Площади (чего быть не может), то значение ЖилойПлощади заменяется на значение Площади.

In [30]:
big_LifeSquare = data.loc[data['Square'] < data['LifeSquare'], 'Id'].values
big_LifeSquare.shape

(474,)

In [31]:
data.loc[data['Id'].isin(big_LifeSquare), 'LifeSquare'] = data['Square']

In [32]:
data['LifeSquare'].describe()

count    7887.000000
mean       35.950190
std        17.649141
min         0.370619
25%        22.769832
50%        32.772431
75%        45.035822
max       233.949309
Name: LifeSquare, dtype: float64

Рассчитано среднее отношение Площади к ЖилойПлощади - mean_LifeSquare_ratio. Если ЖилаяПлощадь меньше Площади, делённой на mean_LifeSquare_ratio, то значение ЖилойПлощади заменяется на значение Площадь/mean_LifeSquare_ratio.

In [34]:
# Mean ratio of total area to living area: sum of Square divided by sum of
# LifeSquare over the rows whose Id is not in small_LifeSquare.
# NOTE(review): in the visible notebook small_LifeSquare is first assigned in
# the following cell (In [35]), so on a fresh top-to-bottom run this line
# raises NameError - confirm the intended cell order.
# NOTE(review): Series.sum() skips NaN, so rows with a missing LifeSquare add
# to the numerator but not to the denominator - verify this is intended.
mean_LifeSquare_ratio = data.loc[~data['Id'].isin(small_LifeSquare), 'Square'].sum() / data.loc[~data['Id'].isin(small_LifeSquare), 'LifeSquare'].sum()
mean_LifeSquare_ratio

1.9296369630607095

In [35]:
small_LifeSquare = data.loc[data['Square'] > 1.9296369630607095 * data['LifeSquare'], 'Id'].values
small_LifeSquare.shape

(1171,)

In [36]:
data.loc[data['Id'].isin(small_LifeSquare), 'LifeSquare'] = data['Square'] / 1.9296369630607095

Если значение ЖилойПлощади отсутствует ('NaN'), то значение ЖилойПлощади принимается равным значению Площадь/mean_LifeSquare_ratio (среднее отношение Площади к ЖилойПлощади - mean_LifeSquare_ratio).

In [37]:
# Collect the Ids of rows whose LifeSquare is missing.
# isna() is the supported NaN test; isin(['NaN']) compares against the string
# 'NaN' and only matches missing values when pandas coerces it to a float.
nan_LifeSquare = data.loc[data['LifeSquare'].isna(), 'Id'].values
nan_LifeSquare.shape

(2113,)

In [38]:
data.loc[data['Id'].isin(nan_LifeSquare), 'LifeSquare'] = data['Square'] / 1.9296369630607095

##### Коррекция ПлощадьКухни (KitchenSquare)

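Если ПлощадьКухни == 0, то значение ПлощадьКухни заменяется на значение Площадь/KitchenSquare_ratio, где KitchenSquare_ratio - среднее отношение Площади к ПлощадиКухни для квартир с таким же количеством комнат. Для квартир с 6 комнатами средняя ПлощадьКухни равна 0, поэтому при расчёте отношения она принимается равной 15 кв.м; полученное отношение используется также для квартир с 5 комнатами.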

In [39]:
nan_KitchenSquare = data.loc[data['KitchenSquare'] == 0, 'Id'].values
nan_KitchenSquare.shape

(697,)

In [40]:
mean_KitchenSquare = data.groupby('Rooms', as_index=False)[['KitchenSquare']].mean().rename(columns={'KitchenSquare':'mean_KitchenSquare'})
mean_KitchenSquare

Unnamed: 0,Rooms,mean_KitchenSquare
0,1.0,6.521294
1,2.0,5.638681
2,3.0,6.795709
3,4.0,8.506667
4,5.0,9.222222
5,6.0,0.0


In [41]:
mean_data_square = pd.merge(mean_square, mean_KitchenSquare, on='Rooms', how='inner')
mean_data_square

Unnamed: 0,Rooms,mean_square,mean_KitchenSquare
0,1.0,41.314582,6.521294
1,2.0,56.790526,5.638681
2,3.0,76.91014,6.795709
3,4.0,98.377544,8.506667
4,5.0,122.614941,9.222222
5,6.0,175.680028,0.0


In [42]:
mean_data_square['KitchenSquare_ratio'] = mean_data_square['mean_square'] / mean_data_square['mean_KitchenSquare']
mean_data_square

Unnamed: 0,Rooms,mean_square,mean_KitchenSquare,KitchenSquare_ratio
0,1.0,41.314582,6.521294,6.335335
1,2.0,56.790526,5.638681,10.071597
2,3.0,76.91014,6.795709,11.317457
3,4.0,98.377544,8.506667,11.564758
4,5.0,122.614941,9.222222,13.295596
5,6.0,175.680028,0.0,inf


In [43]:
# A zero mean kitchen area makes Square / KitchenSquare infinite (the 6-room
# group in the table above). Assume a 15 sq.m kitchen for such groups and
# recompute their ratio.
mean_data_square.loc[mean_data_square['mean_KitchenSquare'] == 0, 'mean_KitchenSquare'] = 15
# np.isinf is the explicit test for infinite values; isin(['inf']) relies on
# pandas coercing the string 'inf' to a float.
mean_data_square.loc[np.isinf(mean_data_square['KitchenSquare_ratio']), 'KitchenSquare_ratio'] = mean_data_square['mean_square'] / mean_data_square['mean_KitchenSquare']

In [44]:
# Reuse the 6-room ratio for the 5-room group.
# NOTE(review): the reason for overriding the 5-room value is not stated in
# the notebook - confirm.
a = mean_data_square.loc[mean_data_square['Rooms'] == 6, 'KitchenSquare_ratio']
# Take the single matching value by position instead of relying on the row
# label happening to be 5.
mean_data_square.loc[mean_data_square['Rooms'] == 5, 'KitchenSquare_ratio'] = a.iloc[0]

Для квартир с 1, 2 и 3 комнатами если ПлощадьКухни == 0, то значение ПлощадьКухни заменяется на значение  Площадь/KitchenSquare_ratio (среднее отношение Площади к ПлощадиКухни - KitchenSquare_ratio).

In [45]:
mean_data_square

Unnamed: 0,Rooms,mean_square,mean_KitchenSquare,KitchenSquare_ratio
0,1.0,41.314582,6.521294,6.335335
1,2.0,56.790526,5.638681,10.071597
2,3.0,76.91014,6.795709,11.317457
3,4.0,98.377544,8.506667,11.564758
4,5.0,122.614941,9.222222,11.712002
5,6.0,175.680028,15.0,11.712002


In [46]:
mean_KitchenSquare_ratio = mean_data_square.drop(['mean_square', 'mean_KitchenSquare'], axis=1)
mean_KitchenSquare_ratio

Unnamed: 0,Rooms,KitchenSquare_ratio
0,1.0,6.335335
1,2.0,10.071597
2,3.0,11.317457
3,4.0,11.564758
4,5.0,11.712002
5,6.0,11.712002


In [47]:
data = pd.merge(data, mean_KitchenSquare_ratio, on='Rooms', how='left')

In [48]:
data.loc[data['Id'].isin(nan_KitchenSquare), 'KitchenSquare'] = data['Square'] / data['KitchenSquare_ratio']

##### Коррекция ЭтажностьДома (HouseFloor)
При ЭтажностиДома менее значения Этажа, на котором расположена Квартира, значение ЭтажностиДома заменяется на значение Этажа.

In [49]:
data.loc[data['HouseFloor'] < data['Floor'], 'HouseFloor'] = data['Floor']

##### Коррекция столбцов Ecology_2, Ecology_3 и Shops_2
В столбцах с переменными Object проверяется их количество. Так как по факту их только по 2 в каждом столбце (А и В), то dummy-переменные не вводятся, используется замена А и В на 0 и 1.

In [50]:
# Encode the two categories as integers: 'A' -> 0, 'B' -> 1.
# map() builds a numeric column in one step; masked .loc writes into an object
# column can leave it object-typed. Any value other than 'A'/'B' becomes NaN.
data['Ecology_2'] = data['Ecology_2'].map({'A': 0, 'B': 1})

In [51]:
# Encode the two categories as integers: 'A' -> 0, 'B' -> 1.
# map() builds a numeric column in one step; masked .loc writes into an object
# column can leave it object-typed. Any value other than 'A'/'B' becomes NaN.
data['Ecology_3'] = data['Ecology_3'].map({'A': 0, 'B': 1})

In [52]:
# Encode the two categories as integers: 'A' -> 0, 'B' -> 1.
# map() builds a numeric column in one step; masked .loc writes into an object
# column can leave it object-typed. Any value other than 'A'/'B' becomes NaN.
data['Shops_2'] = data['Shops_2'].map({'A': 0, 'B': 1})

##### Удаление столбцов, добавленных при очистке данных
В конце очистки убираются вспомогательные столбцы, целевой столбец Price переносится в конец таблицы.

In [53]:
# Remove the helper columns added during cleaning. drop() returns a new
# DataFrame, so the result is assigned back; without the assignment `data`
# keeps both columns.
data = data.drop(['mean_square', 'KitchenSquare_ratio'], axis=1)
data

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Price,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Shops_2
0,14038,35,2.0,47.981561,29.442751,6.000000,184966.930730,7,9.0,1969,8.903972e-02,1,1,33,7976,5,0,11,1
1,15053,41,3.0,65.683640,40.049543,8.000000,300009.450063,7,9.0,1978,6.998930e-05,1,1,46,10309,1,1,16,1
2,4765,53,2.0,44.947953,29.197612,4.462843,220925.908524,8,12.0,1968,4.963726e-02,1,1,34,7759,0,1,3,1
3,5809,58,2.0,53.352981,52.731512,9.000000,175616.227217,8,17.0,1977,4.378852e-01,1,1,23,5735,3,0,5,1
4,10783,99,1.0,39.649192,23.776169,7.000000,150226.531644,11,12.0,1976,1.233889e-02,1,1,35,5776,1,2,4,1
5,12915,59,3.0,80.384479,46.683720,12.000000,215898.447742,5,17.0,2011,3.094791e-01,1,1,35,7715,4,0,6,1
6,14549,154,2.0,62.254114,37.160377,7.000000,296021.204377,3,5.0,1960,4.605564e-01,1,1,20,4386,14,1,5,1
7,11993,74,2.0,80.312926,41.620744,7.974200,221244.156664,14,14.0,1977,7.577876e-02,1,1,6,1437,3,0,2,1
8,5172,1,2.0,64.511437,33.431904,1.000000,229102.795999,9,17.0,1977,7.122317e-03,1,1,1,264,0,0,1,1
9,8649,23,1.0,46.461409,24.077798,8.000000,95380.220993,13,17.0,2014,7.577876e-02,1,1,6,1437,3,0,2,1


In [54]:
data = data[['Id',
 'DistrictId',
 'Rooms',
 'Square',
 'LifeSquare',
 'KitchenSquare',
 'Floor',
 'HouseFloor',
 'HouseYear',
 'Ecology_1',
 'Ecology_2',
 'Ecology_3',
 'Social_1',
 'Social_2',
 'Social_3',
 'Helthcare_2',
 'Shops_1',
 'Shops_2',
 'Price',]]

##### Данные очищены, готовы к моделированию.

##### Разделение и запись данных в файлы X_train, y_train, X_valid, y_valid.

In [55]:
feature_names_ = data.columns
feature_names = feature_names_.drop('Price')

In [56]:
feature_names

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Helthcare_2',
       'Shops_1', 'Shops_2'],
      dtype='object')

In [57]:
X = pd.DataFrame(data, columns = feature_names)

In [58]:
y = pd.DataFrame(data, columns = ['Price'])

In [59]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 18 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       10000 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null int64
Ecology_3        10000 non-null int64
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null int64
dtypes: float64(6), int64(12)
memory usage: 1.8 MB


In [60]:
y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 1 columns):
Price    10000 non-null float64
dtypes: float64(1)
memory usage: 476.2 KB


In [61]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

In [62]:
X_train.to_csv('X_train.csv')
y_train.to_csv('y_train.csv')
X_valid.to_csv('X_valid.csv')
y_valid.to_csv('y_valid.csv')