### Описание

Для создания модели были использованы большинство моделей, которые преподавались на курсе, а именно:

+ 1 Линейная регрессия
+ 2 КНН
+ 3 Случайный лес
+ А так же Экстра Случайный лес

В предварительных "очищенных данных" лучше всего себя показал Случайный лес, с переменным успехом опережавший его Экстра случайный лес второе место КНН и самое последние место у линейной регрессии

Выбор между Экстра случайным лесом и Случайным лесом определился в последний момент, когда возникла теория о том, что алгоритмы чувствительны к людским ошибкам, а значит писать функции, которые будут "исправлять" ошибки бесмысленно, так как при изменение первых, будут появляться новые, а значит - алгоритм подвержен, человеческому фактору, что не есть хорошо. Значит, необходимо найти такое сочетание весов, при котором алгоритм был максимально наполненым

#### Загружаем датафрейм и библиотеки

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [3]:
df = df.drop(['KitchenSquare', 'HouseYear'], axis=1)
df.Ecology_2 = pd.get_dummies(df.Ecology_2).loc[:, 'A']
df.Ecology_3 = pd.get_dummies(df.Ecology_3).loc[:, 'A']
df.Shops_2 = pd.get_dummies(df.Shops_2).loc[:, 'A']
df = df.drop(['Healthcare_1', 'LifeSquare'], axis=1)

In [4]:
df.isna().sum()

Id             0
DistrictId     0
Rooms          0
Square         0
Floor          0
HouseFloor     0
Ecology_1      0
Ecology_2      0
Ecology_3      0
Social_1       0
Social_2       0
Social_3       0
Helthcare_2    0
Shops_1        0
Shops_2        0
Price          0
dtype: int64

In [5]:
df.shape

(10000, 16)

#### Добавляем дополнительные метрики, известные нам

In [6]:
# Средняя цена по районам в зависимости от количества комнат + данные для пустых значений
d = []
for i, v in df.groupby(['DistrictId', 'Rooms']):
    d.append([*i, 
              v.Price.mean(),])
d = pd.DataFrame(d, columns=['DistrictId', 'Rooms', 'Mean_price_d_id'])
df = df.merge(d, on=['DistrictId', 
                     'Rooms'])
mean_d = []
for i, v in df.groupby(['Rooms']):
    mean_d.append([i, 
              v.Price.mean(),])
mean_d = pd.DataFrame(mean_d, columns=['Rooms', 'Mean_price_d_id'])

In [7]:
# Отношение средней площади к средней стоимости по районам + данные для пустых значений
x = []
for i, v in df.groupby(['DistrictId', 'Rooms']):
    x.append([*i, v.Price.mean() / v.Square.mean()])
x = pd.DataFrame(x, columns=['DistrictId', 'Rooms', 'S_mean|P_mean'])
df = df.merge(x, on=['DistrictId', 
                     'Rooms'])
mean_x = []
for i, v in df.groupby(['Rooms']):
    mean_x.append([i, v.Price.mean() / v.Square.mean()])
mean_x = pd.DataFrame(mean_x, columns=['Rooms', 'S_mean|P_mean'])

In [8]:
df.shape

(10000, 18)

In [9]:
# Правим индекс
df.index = df.Id
df = df.drop('Id', axis=1)
df.index.name = None
df.head()

Unnamed: 0,DistrictId,Rooms,Square,Floor,HouseFloor,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Helthcare_2,Shops_1,Shops_2,Price,Mean_price_d_id,S_mean|P_mean
14038,35,2.0,47.981561,7,9.0,0.08904,0,0,33,7976,5,0,11,0,184966.93073,213160.076297,4317.011198
8401,35,2.0,43.325817,7,14.0,0.08904,0,0,33,7976,5,0,11,0,194175.395111,213160.076297,4317.011198
14308,35,2.0,48.249779,3,9.0,0.08904,0,0,33,7976,5,0,11,0,181020.120527,213160.076297,4317.011198
14777,35,2.0,49.986281,9,9.0,0.08904,0,0,33,7976,5,0,11,0,212745.174808,213160.076297,4317.011198
13067,35,2.0,48.58862,12,12.0,0.08904,0,0,33,7976,5,0,11,0,212187.943646,213160.076297,4317.011198


In [10]:
# Загружаем модели
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [11]:
parameters = [{'n_estimators': np.arange(10,30,5),
'max_features': np.arange(3, 16),
'max_depth': np.arange(1, 16)}]

mod = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                  param_grid=parameters,
                  scoring='r2',
                  n_jobs=-1)

X_train, X_test, y_train, y_test = train_test_split(df.drop(['Price'], axis=1), df.Price, random_state=42)

In [12]:
mod.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'n_estimators': array([10, 15, 20, 25]), 'max_features': array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]), 'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=0)

In [13]:
print(mod.best_score_, mod.best_params_, sep='\n')

0.7617100091899136
{'max_depth': 10, 'max_features': 9, 'n_estimators': 25}


In [14]:
mod = RandomForestRegressor(max_depth=10, max_features=9, n_estimators=25, random_state=42)
mod.fit(X_train, y_train)
y_pred = mod.predict(X_test)
r2_score(y_test, y_pred)

0.7600471040265492

Неплохо, но есть способ повысить на 0.1% результат, а именно

In [15]:
data = []
for i in np.arange(3, 17):
    for v in np.arange(1, 17):
        mod = RandomForestRegressor(max_depth=i, max_features=v, n_estimators=25, random_state=42)
        mod.fit(X_train, y_train)
        y_pred = mod.predict(X_test)
        data.append([i, v, r2_score(y_test, y_pred)])
data = pd.DataFrame(data)
data[data.iloc[:, 2] == data.iloc[:, 2].max()]

Unnamed: 0,0,1,2
151,12,8,0.761772


In [16]:
mod = RandomForestRegressor(max_depth=12, max_features=8, n_estimators=25, random_state=42)
mod.fit(X_train, y_train)
y_pred = mod.predict(X_test)
r2_score(y_test, y_pred)

0.761772388040349

### Тоже самое с валидными данными

In [17]:
df = pd.read_csv('test.csv')
df.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


In [18]:
df = df.drop(['KitchenSquare', 'HouseYear'], axis=1)
df.Ecology_2 = pd.get_dummies(df.Ecology_2).loc[:, 'A']
df.Ecology_3 = pd.get_dummies(df.Ecology_3).loc[:, 'A']
df.Shops_2 = pd.get_dummies(df.Shops_2).loc[:, 'A']
df = df.drop(['Healthcare_1', 'LifeSquare'], axis=1)

In [19]:
df.isna().sum()

Id             0
DistrictId     0
Rooms          0
Square         0
Floor          0
HouseFloor     0
Ecology_1      0
Ecology_2      0
Ecology_3      0
Social_1       0
Social_2       0
Social_3       0
Helthcare_2    0
Shops_1        0
Shops_2        0
dtype: int64

In [20]:
df.shape

(5000, 15)

In [21]:
# Так как у нас нет данных по цене то просто мержим)
df = df.merge(d, on=['DistrictId', 
                     'Rooms'], how='left')
df = df.merge(x, on=['DistrictId', 
                     'Rooms'], how='left')

In [22]:
df.shape

(5000, 17)

In [23]:
df.isna().sum()

Id                  0
DistrictId          0
Rooms               0
Square              0
Floor               0
HouseFloor          0
Ecology_1           0
Ecology_2           0
Ecology_3           0
Social_1            0
Social_2            0
Social_3            0
Helthcare_2         0
Shops_1             0
Shops_2             0
Mean_price_d_id    68
S_mean|P_mean      68
dtype: int64

Есть пустые значения - поправляем

In [24]:
df2 = df[df.Mean_price_d_id.isna()]
df = df.dropna()

In [25]:
df2 = df2.merge(mean_d, on='Rooms', how='left')
df2 = df2.merge(mean_x, on='Rooms', how='left')

In [26]:
df2['Mean_price_d_id'] = df2.Mean_price_d_id_y
df2['S_mean|P_mean'] = df2.loc[:, 'S_mean|P_mean_y']

In [27]:
df2 = df2.drop(['Mean_price_d_id_y', 'S_mean|P_mean_y'], axis=1)
df2 = df2.dropna(axis=1)

In [28]:
df2.shape

(68, 15)

In [29]:
df2.isna().sum()

Id             0
DistrictId     0
Rooms          0
Square         0
Floor          0
HouseFloor     0
Ecology_1      0
Ecology_2      0
Ecology_3      0
Social_1       0
Social_2       0
Social_3       0
Helthcare_2    0
Shops_1        0
Shops_2        0
dtype: int64

In [30]:
# Обьеденяем обратно
df = df.append(df2)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [31]:
df = df.fillna(0)

In [32]:
# Правим индекс
df.index = df.Id
df = df.drop('Id', axis=1)
df.index.name = None
df.head()

Unnamed: 0,DistrictId,Ecology_1,Ecology_2,Ecology_3,Floor,Helthcare_2,HouseFloor,Mean_price_d_id,Rooms,S_mean|P_mean,Shops_1,Shops_2,Social_1,Social_2,Social_3,Square
725,58,0.310199,0,0,6,0,14.0,172368.882269,2.0,2872.289445,0,0,11,2748,1,49.882643
15856,74,0.075779,0,0,6,0,1.0,236250.931918,2.0,3242.967075,2,0,6,1437,3,69.263183
15664,47,0.101872,0,0,22,3,22.0,206507.966738,2.0,3830.073951,3,0,23,4583,3,73.046609
14275,27,0.072158,0,0,17,0,17.0,122710.743033,1.0,2878.20207,0,1,2,629,1,47.527111
7633,53,0.049637,0,0,21,1,21.0,195994.138322,1.0,5093.138069,3,0,34,7759,0,40.675627


In [33]:
df.shape

(5000, 16)

In [34]:
# Предсказываем
y_pred = mod.predict(df)

In [35]:
y_pred.shape

(5000,)

In [36]:
df['y_pred'] = y_pred

In [37]:
df.y_pred.to_csv('VNikandrov_predict.csv')

In [43]:
# не нашел как сделать названия колонок, поэтому через нотпад вписал Id, Price
# Проверяем правильность записи данных

df = pd.read_csv('VNikandrov_predict.csv')
print(df.shape)
df[-5:]

(5000, 2)


Unnamed: 0,Id,Price
4995,2196,93304.744257
4996,429,90526.787791
4997,13426,86100.319235
4998,15237,83002.748864
4999,4141,82594.219851
