In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [61]:
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')

In [62]:
# plt.scatter(test_data.Rooms, test_data.Square)

In [63]:
# Функции для очистки и подготовки данных
mean_year = np.round(train_data.loc[train_data['HouseYear'] <= 2020, 'HouseYear'].mean())
mean_healthcare = np.round(train_data["Healthcare_1"].mean())
mean_square_for_max = train_data.loc[(train_data['Rooms'] <= train_data.loc[(train_data['Square'] > 300), 'Rooms'].mean()), 'Square'].mean()
mean_square_for_big_ls = train_data.loc[train_data['LifeSquare'] > 250, 'Square'].mean()
mean_life_squae_for_max = train_data.loc[train_data['Square'] >= mean_square_for_big_ls, 'LifeSquare'].mean()

In [64]:
def clean_year(df, mean_year):
    df.loc[df['HouseYear'] > 2020, 'HouseYear'] = mean_year

In [65]:
def clean_life_square(df, koef_S_LS):
    df.loc[(df['LifeSquare'] < 21) | (df['LifeSquare'].isnull()), 'LifeSquare'] = df['Square']*0.85
    df.loc[df['LifeSquare'] > 250, 'LifeSquare'] = mean_life_squae_for_max

In [66]:
def clean_square(df, mean_square_for_max):
    df.loc[(df['Square'] > 300), 'Square'] = mean_square_for_max

In [67]:
def clean_healthcare_1(df, mean_healthcare):
    df.loc[df['Healthcare_1'].isnull(), 'Healthcare_1'] = mean_healthcare

In [68]:
def clean_rooms(df):
    df.loc[(df['Rooms'] < 1) & (df['LifeSquare'] < 30), 'Rooms'] = 1
    df.loc[(df['Rooms'] < 1) & (df['LifeSquare'] > 30) & (df['LifeSquare'] < 45), 'Rooms'] = 2
    df.loc[(df['Rooms'] < 1) & (df['LifeSquare'] > 45) & (df['LifeSquare'] < 60), 'Rooms'] = 3
    df.loc[(df['Rooms'] < 1) & (df['LifeSquare'] > 60) & (df['LifeSquare'] < 75), 'Rooms'] = 4
    df.loc[(df['Rooms'] < 1) & (df['LifeSquare'] > 70), 'Rooms'] = 6
    df.loc[(df['Rooms'] > 10), 'Rooms'] = 2

In [69]:
def prepare_data(df, mean_year=mean_year, mean_healthcare=mean_healthcare, mean_square_for_max=mean_square_for_max, mean_life_squae_for_max=mean_life_squae_for_max):
    clean_year(df, mean_year)
    clean_life_square(df, mean_life_squae_for_max)
    clean_healthcare_1(df, mean_healthcare)
    clean_rooms(df)
    clean_square(df, mean_square_for_max)

In [70]:
prepare_data(train_data)
prepare_data(test_data)

In [71]:
district_size = train_data['DistrictId'].value_counts().reset_index()\
               .rename(columns={'index':'DistrictId', 'DistrictId':'DistrictSize'})

district_size.head()

Unnamed: 0,DistrictId,DistrictSize
0,27,851
1,1,652
2,23,565
3,6,511
4,9,294


In [72]:
train_data = train_data.merge(district_size, on='DistrictId', how='left')
train_data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,...,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,DistrictSize
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969.0,0.08904,...,B,33,7976,5,1143.0,0,11,B,184966.93073,22
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978.0,7e-05,...,B,46,10309,1,240.0,1,16,B,300009.450063,87
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968.0,0.049637,...,B,34,7759,0,229.0,1,3,B,220925.908524,174
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977.0,0.437885,...,B,23,5735,3,1084.0,0,5,B,175616.227217,179
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976.0,0.012339,...,B,35,5776,1,2078.0,2,4,B,150226.531644,97


In [73]:
district_size = test_data['DistrictId'].value_counts().reset_index()\
               .rename(columns={'index':'DistrictId', 'DistrictId':'DistrictSize'})

district_size.head()

Unnamed: 0,DistrictId,DistrictSize
0,27,391
1,1,344
2,23,264
3,6,257
4,9,132


In [74]:
test_data = test_data.merge(district_size, on='DistrictId', how='left')
test_data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,DistrictSize
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972.0,0.310199,B,B,11,2748,1,1143.0,0,0,B,91
1,15856,74,2.0,69.263183,58.873706,1.0,6,1.0,1977.0,0.075779,B,B,6,1437,3,1143.0,0,2,B,61
2,5480,190,1.0,13.597819,11.558146,12.0,2,5.0,1909.0,0.0,B,B,30,7538,87,4702.0,5,5,B,2
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007.0,0.101872,B,B,23,4583,3,1143.0,3,3,B,49
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017.0,0.072158,B,B,2,629,1,1143.0,0,0,A,391


In [75]:
X = pd.get_dummies(train_data)
X.drop("Price", axis=1, inplace=True)
X.drop("Id", axis=1, inplace=True)
y = train_data.Price

In [59]:
# df_num_features = train_data.select_dtypes(include=['float64','uint8'])  # отбираем количественные признаки
# import seaborn as sns
# sns.pairplot(df_num_features);

# GradientBoostingRegressor

In [29]:
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')
prepare_data(train_data)
prepare_data(test_data)
X = pd.get_dummies(train_data)
X.drop("Price", axis=1, inplace=True)
X.drop("Id", axis=1, inplace=True)
y = train_data.Price

In [76]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.16, random_state=42)
# переобучение и оценка модели
from sklearn.ensemble import GradientBoostingRegressor
final_model = GradientBoostingRegressor(n_estimators=200, max_depth=5, random_state=42
                                       )
# min_samples_split=5, subsample=0.5 , min_samples_leaf=4 

final_model.fit(X_train, y_train)

y_pred_gbr = final_model.predict(X_valid)
y_pred_train_gbr = final_model.predict(X_train)

r2_score(y_valid, y_pred_gbr)

0.7575085594575676

In [77]:
X_train

Unnamed: 0,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Social_1,...,Healthcare_1,Helthcare_2,Shops_1,DistrictSize,Ecology_2_A,Ecology_2_B,Ecology_3_A,Ecology_3_B,Shops_2_A,Shops_2_B
1144,99,2.0,34.573754,23.588537,6.0,5,12.0,1973.0,0.033494,66,...,1322.0,3,8,97,0,1,0,1,0,1
4248,48,1.0,38.478425,32.706662,1.0,25,24.0,2014.0,0.041125,46,...,1143.0,1,10,90,0,1,0,1,0,1
4110,27,2.0,66.787523,64.616662,60.0,14,20.0,2015.0,0.017647,2,...,1143.0,0,0,851,0,1,0,1,0,1
462,92,3.0,77.462932,47.945290,9.0,22,22.0,1995.0,0.460556,20,...,1143.0,1,5,12,0,1,0,1,0,1
2932,6,1.0,43.612945,37.071004,1.0,11,17.0,1977.0,0.243205,5,...,540.0,0,0,511,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,30,2.0,62.219340,62.405338,1.0,4,2.0,2015.0,0.000078,22,...,1046.0,3,23,228,0,1,0,1,0,1
5191,25,3.0,85.790813,72.922191,1.0,9,25.0,2018.0,0.069753,53,...,1143.0,1,11,77,0,1,0,1,0,1
5390,1,3.0,83.225854,70.741976,1.0,13,17.0,1977.0,0.007122,1,...,1143.0,0,1,652,0,1,0,1,0,1
860,94,3.0,51.354934,36.538453,5.0,1,9.0,1973.0,0.127376,43,...,1143.0,3,9,89,0,1,0,1,0,1


In [78]:
# Предсказываем цены для тестовых данных и выгружаем в файл
X_test = pd.get_dummies(test_data)
X_test.drop("Id", axis=1, inplace=True)
test_data["Price"] = final_model.predict(X_test)

In [79]:
# экспорт в файл
test_data.loc[:, ['Id', 'Price']].to_csv('best_gbr_05.csv', index=False)

In [24]:
test_data.Id.value_counts()

1066     1
12995    1
15030    1
6010     1
4795     1
        ..
7485     1
9534     1
5440     1
7489     1
16384    1
Name: Id, Length: 5000, dtype: int64

In [23]:
test_data.Price.value_counts()

203240.753450    10
204003.270010     9
170664.652053     7
140170.883250     7
182880.585984     4
                 ..
294885.235545     1
264920.343061     1
187428.502213     1
261538.110980     1
241032.445330     1
Name: Price, Length: 4901, dtype: int64

0.7621056876187297 - test_size=0.16 - n_estimators=200, max_depth=5, random_state=42 (0.75339)

0.75142

0.75156


In [16]:
# k=1000
# for number in range(200):
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.16, random_state=42)
#     # переобучение и оценка модели
#     from sklearn.ensemble import GradientBoostingRegressor
#     final_model = GradientBoostingRegressor(n_estimators=200, max_depth=5, random_state=42
#                                            )
#     # min_samples_split=5, subsample=0.5 , min_samples_leaf=4 

#     final_model.fit(X_train, y_train)

#     y_pred_gbr = final_model.predict(X_valid)
#     y_pred_train_gbr = final_model.predict(X_train)

#     print('r2: ', r2_score(y_valid, y_pred),', n_estimators: ',k)

In [76]:
# y_train_preds = final_model.predict(X_train)
# evaluate_preds(y_train, y_train_preds)