In [1]:
import numpy as np
import pandas as pd

In [2]:
# Show NumPy floats with 3 decimal places and in fixed-point form
# (suppress=True avoids scientific notation for small values).
np.set_printoptions(precision=3, suppress=True)

#### Замена буквенных признаков на числовые

In [3]:
def prepare_df(df):
    """Fill missing values with 0 and encode the letter-valued columns as integers.

    Every NaN in the frame is replaced by 0. After that the values ``'A'``
    and ``'B'`` in the ``Ecology_2``, ``Ecology_3`` and ``Shops_2`` columns
    are replaced by 1 and 2 respectively; other columns are not remapped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as read from the CSV file.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    letter_codes = {'A': 1, 'B': 2}
    # Same letter -> number mapping for each of the letter-valued columns.
    per_column_mapping = {
        column: letter_codes
        for column in ('Ecology_2', 'Ecology_3', 'Shops_2')
    }
    without_gaps = df.fillna(0)
    return without_gaps.replace(to_replace=per_column_mapping)

#### Разделение данных

In [4]:
def split_ids_X_y(df):
    """Split a training frame into ids, features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains at least the ``Id`` and ``Price`` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(ids, X, y)`` where ``ids`` is a one-column frame with ``Id``,
        ``X`` is ``df`` without the ``Id`` and ``Price`` columns, and
        ``y`` is a one-column frame with ``Price``.

    Raises
    ------
    KeyError
        If ``Id`` or ``Price`` is missing from ``df`` (raised by ``drop``).
    """
    ids = pd.DataFrame(df, columns=['Id'])
    # Use the `columns=` keyword: passing the axis positionally
    # (`drop(labels, 1)`) was deprecated in pandas 1.3 and is rejected
    # with a TypeError from pandas 2.0 on.
    features = df.drop(columns=['Id', 'Price'])
    target = pd.DataFrame(df, columns=['Price'])
    return (ids, features, target)

In [5]:
# Load the training set; the path is relative to the notebook's working directory.
data_train = pd.read_csv("train.csv")
data_train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [6]:
# Fill NaNs with 0 and encode the A/B columns as 1/2 (prepare_df).
# The name data_train is rebound, so the raw frame is no longer available after this cell.
data_train = prepare_df(data_train)
data_train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,2,2,33,7976,5,0.0,0,11,2,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,2,2,46,10309,1,240.0,1,16,2,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,2,2,34,7759,0,229.0,1,3,2,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,2,2,23,5735,3,1084.0,0,5,2,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,2,2,35,5776,1,2078.0,2,4,2,150226.531644


In [7]:
# ids: one-column frame with 'Id'; X_train: all remaining feature columns;
# y_train: one-column frame with 'Price'.
ids, X_train, y_train = split_ids_X_y(data_train)

#### Обучение модели

In [8]:
from sklearn.ensemble import RandomForestRegressor
# Forest of 400 trees with depth capped at 32; random_state is fixed so
# repeated runs build the same forest.
model = RandomForestRegressor(n_estimators=400, max_depth=32, random_state=777)

In [9]:
# y_train is a one-column frame; .values[:, 0] passes the prices as a 1-D array.
model.fit(X_train, y_train.values[:, 0])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=32,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=400,
                      n_jobs=None, oob_score=False, random_state=777, verbose=0,
                      warm_start=False)

#### Проверка модели

In [10]:
# NOTE(review): these predictions are made on the same rows the model was
# fitted on, so they show how well the training data is reproduced, not how
# the model behaves on unseen data.
y_pred = model.predict(X_train)

In [11]:
# Actual and predicted training prices side by side for a quick visual check.
check_pred = pd.DataFrame({'y_train': y_train['Price'], 'y_pred': y_pred.flatten()})
check_pred.head()

Unnamed: 0,y_train,y_pred
0,184966.93073,192046.56368
1,300009.450063,292495.704665
2,220925.908524,217144.105686
3,175616.227217,186614.932681
4,150226.531644,154453.27343


In [12]:
from sklearn.metrics import r2_score

In [13]:
# NOTE(review): y_pred was produced from X_train, so this R^2 is a training
# score and is optimistic; a held-out split or cross-validation is needed to
# estimate performance on new data.
r2 = r2_score(y_train, y_pred)
r2

0.9646596036372534

In [14]:
# One importance value per feature column passed to fit (the columns of X_train);
# presumably in the same order as X_train.columns - confirm before pairing with names.
print(model.feature_importances_)

[0.048 0.079 0.391 0.033 0.026 0.029 0.024 0.038 0.041 0.    0.    0.095
 0.095 0.055 0.02  0.008 0.017 0.001]


#### Предсказания

In [15]:
# Load the test set; the path is relative to the notebook's working directory.
data_test = pd.read_csv("test.csv")
data_test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


In [16]:
def split_ids_X(df):
    """Split a frame without a target column into ids and features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains at least the ``Id`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(ids, X)`` where ``ids`` is a one-column frame with ``Id`` and
        ``X`` is ``df`` without the ``Id`` column.

    Raises
    ------
    KeyError
        If ``Id`` is missing from ``df`` (raised by ``drop``).
    """
    ids = pd.DataFrame(df, columns=['Id'])
    # Use the `columns=` keyword: passing the axis positionally
    # (`drop('Id', 1)`) was deprecated in pandas 1.3 and is rejected
    # with a TypeError from pandas 2.0 on.
    features = df.drop(columns='Id')
    return (ids, features)

In [17]:
# Apply the same preprocessing as for the training set (NaN -> 0, A/B -> 1/2).
data_test = prepare_df(data_test)

In [18]:
# `ids` is rebound here: from this cell on it holds the test-set ids,
# not the training ids assigned earlier.
ids, X_test = split_ids_X(data_test)

In [19]:
# Despite the name, y_test holds the model's predicted prices for the test rows.
y_test = model.predict(X_test)

In [20]:
# Pair every test Id with its predicted price.
predictions = pd.DataFrame({'Id': ids['Id'], 'Price': y_test.flatten()})
predictions.head()

Unnamed: 0,Id,Price
0,725,160834.94506
1,15856,224855.096198
2,5480,200895.235159
3,15664,332490.425537
4,14275,143814.856992


#### Выгрузка в csv

In [21]:
# index=False leaves out the row index, so the file contains only the Id and Price columns.
predictions.to_csv('Demianenko_predictions.csv', index=False)