In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [27]:
# Read both CSV files (test first, then train) into DataFrames.
test_data, train_data = (pd.read_csv(csv_name) for csv_name in ('test.csv', 'train.csv'))

In [28]:
# plt.scatter(test_data.Rooms, test_data.Square)

In [29]:
# Reference statistics for cleaning and preparing the data.
# They are computed from train_data as it is at this point and are later bound
# as the default arguments of prepare_data.

# Rounded mean construction year, ignoring years after 2020.
plausible_years = train_data['HouseYear'][train_data['HouseYear'] <= 2020]
mean_year = np.round(plausible_years.mean())

# Rounded mean of Healthcare_1 (pandas skips missing values in .mean()).
mean_healthcare = np.round(train_data["Healthcare_1"].mean())

# Mean Square over rows whose Rooms does not exceed the mean Rooms of the
# rows with Square > 300.
rooms_of_huge_flats = train_data['Rooms'][train_data['Square'] > 300].mean()
mean_square_for_max = train_data['Square'][train_data['Rooms'] <= rooms_of_huge_flats].mean()

# Mean Square of the rows with LifeSquare > 250, then the mean LifeSquare of
# all rows whose Square is at least that large.
mean_square_for_big_ls = train_data['Square'][train_data['LifeSquare'] > 250].mean()
mean_life_squae_for_max = train_data['LifeSquare'][train_data['Square'] >= mean_square_for_big_ls].mean()

In [30]:
def clean_year(df, mean_year):
    """Overwrite HouseYear values greater than 2020 with ``mean_year``.

    The frame is modified in place; nothing is returned.
    """
    year_too_late = df['HouseYear'].gt(2020)
    df.loc[year_too_late, 'HouseYear'] = mean_year

In [31]:
def clean_life_square(df, koef_S_LS):
    """Repair implausible LifeSquare values in place.

    Parameters
    ----------
    df : DataFrame with 'LifeSquare' and 'Square' columns; modified in place.
    koef_S_LS : value written into LifeSquare where it exceeds 250.
        prepare_data passes its mean_life_squae_for_max argument here.

    Returns None.
    """
    # Missing or very small living areas are estimated as 85% of the total area.
    needs_estimate = (df['LifeSquare'] < 21) | (df['LifeSquare'].isnull())
    df.loc[needs_estimate, 'LifeSquare'] = df['Square'] * 0.85
    # Use the argument the caller supplied rather than a module-level global,
    # so the value passed by prepare_data is actually honoured.
    df.loc[df['LifeSquare'] > 250, 'LifeSquare'] = koef_S_LS

In [32]:
def clean_square(df, mean_square_for_max):
    """Overwrite Square values above 300 with ``mean_square_for_max``.

    The frame is modified in place; nothing is returned.
    """
    oversized = df['Square'].gt(300)
    df.loc[oversized, 'Square'] = mean_square_for_max

In [33]:
def clean_healthcare_1(df, mean_healthcare):
    """Fill missing Healthcare_1 values with ``mean_healthcare``.

    The frame is modified in place; nothing is returned.
    """
    is_missing = df['Healthcare_1'].isna()
    df.loc[is_missing, 'Healthcare_1'] = mean_healthcare

In [34]:
def clean_rooms(df):
    """Repair implausible Rooms values in place.

    Rows with Rooms < 1 get a room count derived from LifeSquare:

        LifeSquare < 30        -> 1
        30 <= LifeSquare < 45  -> 2
        45 <= LifeSquare < 60  -> 3
        60 <= LifeSquare < 75  -> 4
        LifeSquare >= 75       -> 6

    Rows with Rooms > 10 are set to 2. Rows whose LifeSquare is missing match
    no bucket and keep their Rooms value. Returns None.
    """
    # Take the mask once, before any assignment; the buckets are disjoint, so
    # each affected row is written exactly once.
    no_rooms = df['Rooms'] < 1
    life_square = df['LifeSquare']

    # Lower bounds are inclusive so that the exact boundary values 30, 45 and
    # 60 fall into a bucket instead of being left with Rooms < 1.
    df.loc[no_rooms & (life_square < 30), 'Rooms'] = 1
    df.loc[no_rooms & (life_square >= 30) & (life_square < 45), 'Rooms'] = 2
    df.loc[no_rooms & (life_square >= 45) & (life_square < 60), 'Rooms'] = 3
    df.loc[no_rooms & (life_square >= 60) & (life_square < 75), 'Rooms'] = 4
    # NOTE(review): the last bucket assigns 6 rather than 5, as the original
    # thresholds did - confirm that this jump is intended.
    df.loc[no_rooms & (life_square >= 75), 'Rooms'] = 6

    df.loc[df['Rooms'] > 10, 'Rooms'] = 2

In [35]:
def prepare_data(df, mean_year=mean_year, mean_healthcare=mean_healthcare, mean_square_for_max=mean_square_for_max, mean_life_squae_for_max=mean_life_squae_for_max):
    """Run every cleaning step on ``df`` in place, in a fixed order.

    The defaults are the statistics computed earlier in the notebook; they are
    bound when this cell is executed. Returns None.
    """
    # (step, extra positional arguments) pairs; the order is significant
    # because later steps read columns that earlier steps have rewritten.
    cleaning_steps = (
        (clean_year, (mean_year,)),
        (clean_life_square, (mean_life_squae_for_max,)),
        (clean_healthcare_1, (mean_healthcare,)),
        (clean_rooms, ()),
        (clean_square, (mean_square_for_max,)),
    )
    for step, extra_args in cleaning_steps:
        step(df, *extra_args)

In [36]:
# Clean the training frame first, then the test frame (both in place).
for frame in (train_data, test_data):
    prepare_data(frame)

In [37]:
# Feature matrix: one-hot encode the training frame, then remove the target
# and the identifier column. The target vector is the Price column.
X = pd.get_dummies(train_data).drop(columns=["Price", "Id"])
y = train_data["Price"]

In [38]:
# df_num_features = train_data.select_dtypes(include=['float64','uint8'])  # select the numeric features
# import seaborn as sns
# sns.pairplot(df_num_features);

# GradientBoostingRegressor

In [39]:
# Start over from the files on disk, so the in-place cleaning below is applied
# to freshly loaded frames, then rebuild the feature matrix and the target.
# prepare_data still uses the statistics bound as its default arguments.
test_data, train_data = (pd.read_csv(csv_name) for csv_name in ('test.csv', 'train.csv'))
for frame in (train_data, test_data):
    prepare_data(frame)

X = pd.get_dummies(train_data).drop(columns=["Price", "Id"])
y = train_data["Price"]

In [40]:
# Hold out 16% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.16, random_state=42)

# Retrain and evaluate the model.
# GradientBoostingRegressor is imported in the notebook's import cell.
# Further hyperparameters noted here but not passed to the model:
# min_samples_split=5, subsample=0.5, min_samples_leaf=4
final_model = GradientBoostingRegressor(n_estimators=200, max_depth=5, random_state=42)
final_model.fit(X_train, y_train)

y_pred_gbr = final_model.predict(X_valid)
y_pred_train_gbr = final_model.predict(X_train)

# Last expression of the cell: R^2 on the validation part is displayed.
r2_score(y_valid, y_pred_gbr)

0.7577861874526557

In [22]:
# Predict prices for the test data (the export to a file happens in a later cell).
# "Price" is dropped as well (if present): this cell writes a Price column into
# test_data, so on a re-run it would otherwise end up among the features.
X_test = pd.get_dummies(test_data).drop(columns=["Id", "Price"], errors="ignore")
# Train and test are one-hot encoded separately, so their dummy columns may
# differ. Align the test matrix with the training matrix X: same columns, same
# order, and 0 for any dummy column that is absent from the test data.
X_test = X_test.reindex(columns=X.columns, fill_value=0)
test_data["Price"] = final_model.predict(X_test)

In [25]:
# Export the Id and Price columns to a CSV file without the index column.
test_data[['Id', 'Price']].to_csv('best_gbr_05.csv', index=False)

In [24]:
# Frequency of each Id value in the test frame.
test_data['Id'].value_counts()

1066     1
12995    1
15030    1
6010     1
4795     1
        ..
7485     1
9534     1
5440     1
7489     1
16384    1
Name: Id, Length: 5000, dtype: int64

In [23]:
# Frequency of each predicted Price value in the test frame.
test_data['Price'].value_counts()

203240.753450    10
204003.270010     9
170664.652053     7
140170.883250     7
182880.585984     4
                 ..
294885.235545     1
264920.343061     1
187428.502213     1
261538.110980     1
241032.445330     1
Name: Price, Length: 4901, dtype: int64

0.7621056876187297 - test_size=0.16 - n_estimators=200, max_depth=5, random_state=42 (0.75339)

0.75142

0.75156


In [16]:
# k=1000
# for number in range(200):
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.16, random_state=42)
#     # переобучение и оценка модели
#     from sklearn.ensemble import GradientBoostingRegressor
#     final_model = GradientBoostingRegressor(n_estimators=200, max_depth=5, random_state=42
#                                            )
#     # min_samples_split=5, subsample=0.5 , min_samples_leaf=4 

#     final_model.fit(X_train, y_train)

#     y_pred_gbr = final_model.predict(X_valid)
#     y_pred_train_gbr = final_model.predict(X_train)

#     print('r2: ', r2_score(y_valid, y_pred),', n_estimators: ',k)

In [76]:
# y_train_preds = final_model.predict(X_train)
# evaluate_preds(y_train, y_train_preds)