In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training set; the relative path is resolved against the notebook's working directory.
train = pd.read_csv('train.csv')

In [3]:
# Drop outlier rows: keep flats with 1-9 rooms, a total area below 350 and a
# price within [30000, 600000] (between() is inclusive on both ends).
# Rows with NaN in any of these columns fail the comparison and are dropped too.
keep_rows = (
    train['Rooms'].between(1, 9)
    & (train['Square'] < 350)
    & train['Price'].between(30000, 600000)
)
# .copy() makes `train` an independent frame, so the in-place edits that follow
# operate on real data instead of a slice and cannot raise SettingWithCopyWarning.
train = train.loc[keep_rows, :].copy()

In [4]:
# Total area should not be smaller than living area: wherever it is, raise
# Square to LifeSquare. Rows with a missing LifeSquare fail the comparison and are left alone.
train['Square'] = train['Square'].mask(train['Square'] < train['LifeSquare'], train['LifeSquare'])
# Work on a separate copy from here on; `train` remains the source of the group statistics.
train2 = train.copy()

In [5]:
# Per-(DistrictId, Rooms) mean of the requested column(s)
def mean_col(column, new, df=None):
    """Return the mean of ``column`` for every (DistrictId, Rooms) group.

    Parameters
    ----------
    column : str or list of str
        Column(s) to average. When several are given, only the first one is
        renamed to ``new``.
    new : str
        Name given to the (first) aggregated column in the result.
    df : pandas.DataFrame, optional
        Frame to aggregate. Defaults to the notebook-level ``train``.

    Returns
    -------
    pandas.DataFrame
        ``DistrictId``, ``Rooms`` and the aggregated column(s), one row per group.
    """
    source = train if df is None else df
    # A bare string used to be indexed as column[0] (its first character), so
    # the rename silently did nothing; normalise to a list before using it.
    targets = [column] if isinstance(column, str) else list(column)
    stats = source.groupby(['DistrictId', 'Rooms'])[targets].mean().reset_index()
    return stats.rename(columns={targets[0]: new})

In [6]:
# Per-(DistrictId, Rooms) median of the requested column(s)
def median_col(column, new, df=None):
    """Return the median of ``column`` for every (DistrictId, Rooms) group.

    Parameters
    ----------
    column : str or list of str
        Column(s) to aggregate. When several are given, only the first one is
        renamed to ``new``.
    new : str
        Name given to the (first) aggregated column in the result.
    df : pandas.DataFrame, optional
        Frame to aggregate. Defaults to the notebook-level ``train``.

    Returns
    -------
    pandas.DataFrame
        ``DistrictId``, ``Rooms`` and the aggregated column(s), one row per group.
    """
    source = train if df is None else df
    # A bare string used to be indexed as column[0] (its first character), so
    # the rename silently did nothing; normalise to a list before using it.
    targets = [column] if isinstance(column, str) else list(column)
    stats = source.groupby(['DistrictId', 'Rooms'])[targets].median().reset_index()
    return stats.rename(columns={targets[0]: new})

In [7]:
# Attach a frame of per-(DistrictId, Rooms) statistics to the main frame
def merge_mean(df, mean_df, howw):
    """Join ``mean_df`` onto ``df`` using the DistrictId and Rooms columns.

    ``howw`` is forwarded unchanged as the ``how`` argument of the merge
    (for example 'inner' or 'outer'). Returns the merged DataFrame.
    """
    join_keys = ['DistrictId', 'Rooms']
    merged = df.merge(mean_df, how=howw, on=join_keys)
    return merged

In [8]:
# Per-(DistrictId, Rooms) statistics computed from `train`.
# Despite its name, 'MeanKitchen' holds the group median, not the mean.
mean_kitchen_sq = median_col(['KitchenSquare'], 'MeanKitchen')
mean_square, mean_life_sq, mean_price = (
    mean_col([source_col], stat_name)
    for source_col, stat_name in [
        ('Square', 'MeanSquare'),
        ('LifeSquare', 'MeanLifeSquare'),
        ('Price', 'MeanPrice'),
    ]
)
# Attach every statistic to the working frame, one join per statistic.
for group_stats in (mean_kitchen_sq, mean_square, mean_life_sq, mean_price):
    train2 = merge_mean(train2, group_stats, 'inner')

In [9]:
# Check the row count and per-column null counts after the merges.
train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9966 entries, 0 to 9965
Data columns (total 24 columns):
Id                9966 non-null int64
DistrictId        9966 non-null int64
Rooms             9966 non-null float64
Square            9966 non-null float64
LifeSquare        7860 non-null float64
KitchenSquare     9966 non-null float64
Floor             9966 non-null int64
HouseFloor        9966 non-null float64
HouseYear         9966 non-null int64
Ecology_1         9966 non-null float64
Ecology_2         9966 non-null object
Ecology_3         9966 non-null object
Social_1          9966 non-null int64
Social_2          9966 non-null int64
Social_3          9966 non-null int64
Healthcare_1      5180 non-null float64
Helthcare_2       9966 non-null int64
Shops_1           9966 non-null int64
Shops_2           9966 non-null object
Price             9966 non-null float64
MeanKitchen       9966 non-null float64
MeanSquare        9966 non-null float64
MeanLifeSquare    9964 non-null fl

In [10]:
# Ratio of total area to living area for every row.
train2['S/LS'] = train2['Square'].div(train2['LifeSquare'])
# Typical ratio: the mean over rows whose ratio lies strictly between 1.2 and 2.
sls = train2['S/LS'][train2['S/LS'].gt(1.2) & train2['S/LS'].lt(2)].mean()
# Estimate a missing living area as total area divided by the typical ratio.
train2['LifeSquare'] = train2['LifeSquare'].fillna(train2['Square'].div(sls))

In [11]:
# Groups with no known LifeSquare have no group mean; fall back to the row's own LifeSquare.
train2['MeanLifeSquare'] = train2['MeanLifeSquare'].fillna(train2['LifeSquare'])

In [12]:
# Kitchen areas above 30 or below 3 are replaced by the median of the
# MeanKitchen column (a single scalar for all rows). Missing values are not touched.
kitchen_ok = train2['KitchenSquare'].between(3, 30) | train2['KitchenSquare'].isnull()
train2.loc[~kitchen_ok, 'KitchenSquare'] = train2['MeanKitchen'].median()

In [13]:
# Re-check null counts after the LifeSquare / MeanLifeSquare fills.
train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9966 entries, 0 to 9965
Data columns (total 25 columns):
Id                9966 non-null int64
DistrictId        9966 non-null int64
Rooms             9966 non-null float64
Square            9966 non-null float64
LifeSquare        9966 non-null float64
KitchenSquare     9966 non-null float64
Floor             9966 non-null int64
HouseFloor        9966 non-null float64
HouseYear         9966 non-null int64
Ecology_1         9966 non-null float64
Ecology_2         9966 non-null object
Ecology_3         9966 non-null object
Social_1          9966 non-null int64
Social_2          9966 non-null int64
Social_3          9966 non-null int64
Healthcare_1      5180 non-null float64
Helthcare_2       9966 non-null int64
Shops_1           9966 non-null int64
Shops_2           9966 non-null object
Price             9966 non-null float64
MeanKitchen       9966 non-null float64
MeanSquare        9966 non-null float64
MeanLifeSquare    9966 non-null fl

In [14]:
# Encode the categorical flag columns as integers: 1 where the value is 'A', 0 otherwise.
for flag_col in ('Ecology_2', 'Ecology_3', 'Shops_2'):
    train2[flag_col] = train2[flag_col].eq('A').astype(int)

In [15]:
# List the available columns before choosing the model features.
train2.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Price', 'MeanKitchen',
       'MeanSquare', 'MeanLifeSquare', 'MeanPrice', 'S/LS'],
      dtype='object')

In [16]:
# Model features, in the order they are fed to the estimator.
columns = [
    # raw flat / building attributes
    'Rooms',
    'Square',
    'LifeSquare',
    'KitchenSquare',
    'Floor',
    'HouseFloor',
    'HouseYear',
    # neighbourhood indicators
    'Ecology_1',
    'Social_1',
    'Social_2',
    'Social_3',
    'Helthcare_2',
    'Shops_1',
    # per-(DistrictId, Rooms) statistics
    'MeanKitchen',
    'MeanSquare',
    'MeanLifeSquare',
]

In [17]:
# Feature matrix: every row, restricted to the selected feature columns.
X = train2.loc[:, columns]
# Confirm that no feature still contains nulls.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9966 entries, 0 to 9965
Data columns (total 16 columns):
Rooms             9966 non-null float64
Square            9966 non-null float64
LifeSquare        9966 non-null float64
KitchenSquare     9966 non-null float64
Floor             9966 non-null int64
HouseFloor        9966 non-null float64
HouseYear         9966 non-null int64
Ecology_1         9966 non-null float64
Social_1          9966 non-null int64
Social_2          9966 non-null int64
Social_3          9966 non-null int64
Helthcare_2       9966 non-null int64
Shops_1           9966 non-null int64
MeanKitchen       9966 non-null float64
MeanSquare        9966 non-null float64
MeanLifeSquare    9966 non-null float64
dtypes: float64(9), int64(7)
memory usage: 1.6 MB


In [18]:
# Regression target.
y = train2['Price']

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 30% of the rows for validation; random_state pins the shuffle so the split is reproducible.
data_train, data_valid, price_train, price_valid = train_test_split(X, y, test_size = 0.3, random_state=42)

In [21]:
from sklearn.ensemble import RandomForestRegressor
# Random forest: 1000 trees, depth capped at 16, 11 candidate features per split, fixed seed.
model_forest = RandomForestRegressor(n_estimators=1000, max_features = 11, max_depth = 16, random_state=42)

In [22]:
# Training target as a plain NumPy array (the trailing [:] is a full slice of that array).
price_arr_train = price_train.values[:]

In [23]:
# Fit the forest on the training split only.
model_forest.fit(data_train, price_arr_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=16,
           max_features=11, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=None, oob_score=False,
           random_state=42, verbose=0, warm_start=False)

In [24]:
# In-sample predictions, used to see how closely the model fits the data it was trained on.
pred_train = model_forest.predict(data_train)

In [25]:
from sklearn.metrics import r2_score

In [26]:
# R^2 on the training split.
r2_score(price_train, pred_train)

0.9490878993284346

In [None]:
# Predictions for the held-out validation rows.
pred_forest = model_forest.predict(data_valid)

In [None]:
# R^2 on the validation split; compare with the training-split score to judge overfitting.
r2_score(price_valid, pred_forest)

0.7413338685539304

In [None]:
# Importance of each feature, in the same order as the `columns` list used to build X.
model_forest.feature_importances_

array([0.08220445, 0.31015439, 0.03697555, 0.0274236 , 0.02799704,
       0.02338081, 0.03742123, 0.04082642, 0.07222618, 0.07888375,
       0.05614392, 0.00884203, 0.01622445, 0.05846395, 0.08635789,
       0.03647433])

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for the search. Each range currently holds a single
# value (11 and 16), so the grid contains exactly one candidate.
parameters = [
    dict(
        n_estimators=[1000],
        max_features=np.arange(11, 12),
        max_depth=np.arange(16, 17),
    )
]

In [None]:
# Grid search over `parameters` with 5-fold cross-validation, scored by R^2.
# The base estimator is a fresh forest with the same seed as model_forest.
clf = GridSearchCV(estimator=RandomForestRegressor(random_state=42), 
                   param_grid=parameters,
                   scoring='r2',
                   cv=5)

In [None]:
# Run the search on the training split; the validation split is not seen here.
clf.fit(data_train, price_arr_train)

In [None]:
# Predict the validation split with the search result and report its R^2.
pred_grid = clf.predict(data_valid)
r2_score(price_valid, pred_grid)

In [None]:
# Hyper-parameters of the best-scoring candidate.
clf.best_params_

In [None]:
# Print the cross-validated score of every candidate as "mean (+/- two standard deviations)".
cv_scores = clf.cv_results_
means, stds = cv_scores['mean_test_score'], cv_scores['std_test_score']

for idx, params in enumerate(cv_scores['params']):
    print("%0.3f (+/-%0.03f) for %r" % (means[idx], stds[idx] * 2, params))

In [None]:
# Load the test set (path relative to the working directory) and preview the first 20 rows.
test = pd.read_csv('test.csv')
test.head(20)

In [None]:
# Column dtypes and null counts of the raw test set.
test.info()

In [None]:
# Work on a copy so the raw `test` frame stays untouched.
test2 = test.copy()

In [None]:
# Attach the training-set group statistics to the test rows.
# A left join keeps every test row exactly once and in its original order, and
# leaves the integer columns as integers. The previous outer join also pulled in
# (DistrictId, Rooms) groups that exist only in train as all-NaN rows, which
# upcast the integer columns to float and then had to be filtered out again.
for group_stats in (mean_kitchen_sq, mean_square, mean_life_sq, mean_price):
    test2 = merge_mean(test2, group_stats, 'left')
# Defensive: a row without an Id cannot be submitted.
test2 = test2.loc[test2['Id'].notnull()]

In [None]:
# Check which statistic columns are missing for test rows whose group is absent from train.
test2.info()

In [None]:
# Show the test rows with zero rooms or more than four rooms.
test2.loc[(test2['Rooms'] == 0) | (test2['Rooms'] > 4), :] 

In [None]:
# Overwrite implausible room counts in the test set: 17 becomes 1 and 0 becomes 2.
# NOTE(review): the group statistics were merged on (DistrictId, Rooms) before this
# correction, so these rows keep whatever statistics matched their original Rooms value.
test2['Rooms'] = test2['Rooms'].replace({17.0: 1.0, 0.0: 2.0})

In [None]:
# Estimate missing living area from total area with the ratio `sls` derived from the training data.
# NOTE(review): the KitchenSquare clean-up and the Square/LifeSquare correction applied to
# the training data are not repeated for test2 in the visible cells; confirm this is intended.
test2['LifeSquare'] = test2['LifeSquare'].fillna(test2['Square'] / sls)
test2.info()

In [None]:
# Test rows whose (DistrictId, Rooms) group has no statistic fall back to the
# row's own value for the three area statistics...
test2['MeanLifeSquare'] = test2['MeanLifeSquare'].fillna(test2['LifeSquare'])
test2['MeanSquare'] = test2['MeanSquare'].fillna(test2['Square'])
test2['MeanKitchen'] = test2['MeanKitchen'].fillna(test2['KitchenSquare'])
# ...and to the average of the known MeanPrice values for the price statistic.
test2['MeanPrice'] = test2['MeanPrice'].fillna(test2['MeanPrice'].mean())

In [None]:
# Cast the identifier, year and social-indicator columns to integers in one pass.
test2 = test2.astype({
    'Id': 'int',
    'HouseYear': 'int',
    'Social_1': 'int',
    'Social_2': 'int',
    'Social_3': 'int',
})
test2.info()

In [None]:
# Feature matrix for the test set, using the same feature list as for training.
test_pred = test2[columns]
test2.head()

In [None]:
# Verify that no feature contains nulls before predicting.
test_pred.info()

In [None]:
# Predict prices with model_forest (the directly fitted forest, not the grid-search object `clf`).
test2['Price'] = model_forest.predict(test_pred)

In [None]:
# Write the submission: only Id and the predicted Price, without the index column.
# NOTE(review): the output file name has no .csv extension; confirm that is intended.
test2.to_csv('INeznanov', columns=['Id', 'Price'], index=False)