In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import seaborn as sns

%config InlineBackend.figure_format = 'svg'
%matplotlib inline
TRAIN_PATH = './train.csv'
TEST_PATH = './test.csv'
SUBMISSION_PATH = './sample_submission.csv'

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor, plot_tree

In [5]:
tr = pd.read_csv(TRAIN_PATH)
te = pd.read_csv(TEST_PATH)

# 1 - EDA, очистка

In [6]:
def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values, paint=0):
    print("Train R2:\t" + str(round(r2(train_true_values, train_pred_values), 3)))
    print("Test R2:\t" + str(round(r2(test_true_values, test_pred_values), 3)))
    
    if paint:
        plt.figure(figsize=(10,5))

        plt.subplot(121)
        sns.scatterplot(x=train_pred_values, y=train_true_values, size=3)
        plt.xlabel('Predicted values')
        plt.ylabel('True values')
        plt.title('Train sample prediction')

        plt.subplot(122)
        sns.scatterplot(x=test_pred_values, y=test_true_values, size=3)
        plt.xlabel('Predicted values')
        plt.ylabel('True values')
        plt.title('Test sample prediction')

        plt.show()
    
    return round(r2(train_true_values, train_pred_values), 3), round(r2(test_true_values, test_pred_values), 3)

In [7]:
class DataPreprocessing:
    
    def __init__(self):
        self.binary_to_numbers = {'A': 0, 'B': 1}
        pass
    
    def correct_ls_1(self, X):
        df = X
        bls = (df.LifeSquare / 4 > df.Square) & (df.LifeSquare > 150)
        while bls.sum():
            df.loc[bls, 'LifeSquare'] = df.loc[bls, 'LifeSquare'] / 10
            bls = (df.LifeSquare / 4 > df.Square) & (df.LifeSquare > 150)
        return df
    
    def correct_ls_2(self, X):
        df = X
        bsq = (df.LifeSquare > 300)
        df.loc[bsq, ['LifeSquare']] = df.loc[bsq, ['LifeSquare']] / 10
        return df
    
    def correct_sq_1(self, X):
        df = X
        bsq = (df.Square > 300)
        df.loc[bsq, ['Square']] = df.loc[bsq, ['Square']] / 10
        return df
    
    def swap_sq_lsq(self, X):
        df = X
        swap_mask = df.LifeSquare > df.Square
        df.loc[swap_mask, ['Square', 'LifeSquare']] = df.loc[swap_mask, ['LifeSquare', 'Square']].values
        return df
    
    def calc_sq_room_stat(self, X):
        df = X
        df.loc[df.Rooms > 5, 'Rooms'] = 0
        norm_rooms = (df.Rooms > 0)
        sq_room_stat = df[norm_rooms].groupby(by='Rooms')['Square'].mean().values
        return sq_room_stat
    
    def get_mean_sq(self, X):
        df = tr
        min_sq = (df.Square < 12)
        mean = df.Square[~min_sq].mean()
        return mean

    def restore_min_square(self, X, mean_sq):
        df = X
        min_sq = (df.Square < 12)
        df.loc[min_sq, 'Square'] = mean_sq
        return df
    
    def restore_rooms(self, sq):
        stat = self.square_room_stat.copy()
        stat -= sq
        stat = np.abs(stat)
        index = int(np.argmin(stat))
        return index + 1
    
    def get_kitchen_stat(self, X):
        df = X
        norm_ksq = ((df.KitchenSquare >= 4) & (df.KitchenSquare < 35))
        df_nsq = df[norm_ksq]
        kitchen_stat = []
        for i in range(1, 6):
            kitchen_stat_line = []
            kitchen_stat_line.append(df_nsq.loc[df_nsq.Rooms == i, 'KitchenSquare'].quantile(q=0.025))
            kitchen_stat_line.append(df_nsq.loc[df_nsq.Rooms == i, 'KitchenSquare'].mean())
            kitchen_stat_line.append(df_nsq.loc[df_nsq.Rooms == i, 'KitchenSquare'].quantile(q=0.975))
            kitchen_stat.append(kitchen_stat_line)
        return kitchen_stat
    
    def restore_kitchen_size(self, X):
        df = X
        for n_rooms in range(1, 6):
            min_sq, med_sq, max_sq = self.kitchen_stat[n_rooms - 1]
            very_small_kitchen = (df.KitchenSquare < min_sq)
            df.loc[(very_small_kitchen) & (df.Rooms == n_rooms), 'KitchenSquare'] = med_sq
            very_big_kitchen = (df.KitchenSquare > max_sq)
            df.loc[(very_big_kitchen) & (df.Rooms == n_rooms), 'KitchenSquare'] = med_sq
        return df
    
    def get_excess_stat(self, X):
        df = X
        ls_na = df.LifeSquare.isna()
        ls_big = df.LifeSquare > (df.Square - df.KitchenSquare - 2.5)
        ls_small = (df.Square / df.LifeSquare) > 2
        ls_bad = ls_na | ls_big | ls_small
        df_norm_ls = df.loc[~ls_bad]
#         print(df_norm_ls.shape[0])
        ranges = [120, 110, 100, 90, 85, 80, 75, 70, 65, 60, 55, 50, 45, 40, 30, 10]
        excess_stat = []
        mask = df.Square.isna() # 00000000
    #     print(mask.sum())
        for curr_range in ranges:
            excess = []
            excess.append(curr_range)
            curr_mask = df_norm_ls.Square >= curr_range
            curr_mask = curr_mask ^ mask
            mean_ex = (df_norm_ls.loc[curr_mask, 'Square'] / (df_norm_ls.loc[curr_mask, 'Square'] - \
                       (df_norm_ls.loc[curr_mask, 'LifeSquare'] + \
                        df_norm_ls.loc[curr_mask, 'KitchenSquare']))).mean()
            excess.append(mean_ex)
            excess_stat.append(excess)
            mask = mask | curr_mask
#             print(f"{curr_range} {curr_mask.sum()} {mask.sum()}")
        return excess_stat

    def square_to_excess_square(self, sq):
        if sq > self.excess_stat[0][0]:
            return sq / self.excess_stat[0][1]
        i = 0
        while sq > self.excess_stat[i][0]:
            i += 1
        return sq / self.excess_stat[i][1]
    
    def correct_ls_3(self, X):
        df = X
        ls_na = df.LifeSquare.isna()
        ls_big = df.LifeSquare > (df.Square - df.KitchenSquare - 2.5)
        ls_small = (df.Square / df.LifeSquare) > 2
        ls_bad = ls_na | ls_big | ls_small
        df.loc[ls_bad, 'LifeSquare'] = df.loc[ls_bad, 'Square'] - df.loc[ls_bad, 'KitchenSquare'] - \
                                       df.loc[ls_bad, 'Square'].apply(self.square_to_excess_square)
        return df
    
    def correct_house_floor_1(self, X):
        df = X
        df.loc[df.HouseFloor > 50, 'HouseFloor'] = round(df.loc[df.HouseFloor > 50, 'HouseFloor'] / 10)
        return df
    
    def correct_house_year_1(self, X):
        df = X
        bad_hy = (df.HouseYear < 1900) | (df.HouseYear > 2021)
        df.loc[bad_hy, 'HouseYear'] = self.hy_mode
        return df
    
    def get_mode(self, s):
        return s.mode().values[0]
    
    def get_hfloor_stat(self, X):
        df = X
        norm_hfloor = (df.HouseFloor >= df.Floor) & (df.HouseFloor != 0)
        df['hy'] = np.floor(df['HouseYear'] / 10)
        floor_stat_modes = df.loc[norm_hfloor].groupby(by='hy')['HouseFloor'].apply(self.get_mode).to_dict()
        floor_stat_max = df.loc[norm_hfloor].groupby(by='hy')['HouseFloor'].max().to_dict()
        last_val = 4
        for key in range(190, 202):
            val = floor_stat_modes.get(key)
            if val is None:
                floor_stat_modes[key] = last_val
            else:
                last_val = val
        df.drop(columns='hy', axis=1, inplace=True)
        return floor_stat_modes, floor_stat_max
    
    def year_to_floor_modes(self, y):
        return self.floor_stat_modes.get(np.floor(y / 10))
    
    def year_to_floor_max(self, y):
        return self.floor_stat_max.get(np.floor(y / 10))
    
    def correct_house_floor_2(self, X):
        df = X
        bad_hfloor = (df.HouseFloor == 0)
        df.loc[bad_hfloor, 'HouseFloor'] = df.loc[bad_hfloor, 'HouseYear'].apply(self.year_to_floor_modes)
        bad_hfloor = (df.HouseFloor < df.Floor)
        df.loc[bad_hfloor, 'HouseFloor'] = df.loc[bad_hfloor, 'HouseYear'].apply(self.year_to_floor_modes)
        bad_hfloor = (df.HouseFloor < df.Floor)
        df.loc[bad_hfloor, 'HouseFloor'] = df.loc[bad_hfloor, 'HouseYear'].apply(self.year_to_floor_max)
        return df
    
    def correct_floor(self, X):
        df = X
        bad_floor = (df.Floor > df.HouseFloor) | (df.Floor == 0)
        df.loc[bad_floor, 'Floor'] = np.random.randint(1, df.loc[bad_floor, 'HouseFloor'])
        return df
    
    def fit_transform(self, X):
        df = X.copy()
        # 0 - жилая в 4 раза больше общей и больше 150
        df = self.correct_ls_1(df)
        # 1 - общая больше 300
        df = self.correct_sq_1(df)
        # 2 - жилая больше 300
        df = self.correct_ls_2(df)
        # 3 - жилая больше общей
        df = self.swap_sq_lsq(df)
        # 4 - статистика по средней площади на количество комнат
        df.loc[df.Rooms > 5, 'Rooms'] = 0
        self.square_room_stat = self.calc_sq_room_stat(df)
        # 5 - средняя площадь квартиры на рынке
        self.mean_square = self.get_mean_sq(df)
        # 6 - восстановление площади для самых маленьких (<12м)
        df = self.restore_min_square(df, self.mean_square)
        # 7 - восстановление числа комнат
        null_rooms = (df.Rooms == 0)
        df.loc[null_rooms, 'Rooms'] = df.loc[null_rooms, 'Square'].apply(self.restore_rooms)
        # 8 - сбор статистики, восстановление и корректировка размера кухни
        self.kitchen_stat = self.get_kitchen_stat(df)
        df = self.restore_kitchen_size(df)
        # 9 - Статистика лишнего пространства в квартирах на основе площади
        self.excess_stat = self.get_excess_stat(df)
#         print(self.excess_stat)
        df = self.correct_ls_3(df)
        # 10 - Этажи и год постройки
        df = self.correct_house_floor_1(df)
        self.hy_mode = df.HouseYear.mode().values[0]
        df = self.correct_house_year_1(df)
        self.floor_stat_modes, self.floor_stat_max = self.get_hfloor_stat(df)
        df = self.correct_house_floor_2(df)
        df = self.correct_floor(df)
        # 11 - Экология
        df['Ecology_2'] = df['Ecology_2'].replace(self.binary_to_numbers)
        df['Ecology_3'] = df['Ecology_3'].replace(self.binary_to_numbers)
        # 12 - магазины
        df['Shops_2'] = df['Shops_2'].replace(self.binary_to_numbers)
        # 13 - Медицина
#         df.drop('Healthcare_1', axis=1, inplace=True)
        self.hc1_features = ['Ecology_1', 'Ecology_2', 'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 
                        'Healthcare_1', 'Helthcare_2', 'Shops_1', 'Shops_2']
        self.hc1_features_1 = ['Ecology_1', 'Ecology_2', 'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 
                          'Helthcare_2', 'Shops_1', 'Shops_2']
        X = df[self.hc1_features]
        X = X.loc[~X.Healthcare_1.isna()]
        y = X['Healthcare_1']
        X = X[self.hc1_features_1]
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=42)
        X_tr.shape, X_te.shape, y_tr.shape, y_te.shape
        self.hc1_model = RandomForestRegressor(random_state=100, max_depth=7, max_features=4, n_estimators=200)
        self.hc1_model.fit(X_tr, y_tr)
        df.loc[df.Healthcare_1.isna(), 'Healthcare_1'] = \
        self.hc1_model.predict(df.loc[df.Healthcare_1.isna(), self.hc1_features_1])
        return df
    
    def transform(self, X):
        df = X.copy()
        # 0 - жилая в 4 раза больше общей и больше 150
        df = self.correct_ls_1(df)
        # 1 - общая больше 300
        df = self.correct_sq_1(df)
        # 2 - жилая больше 300
        df = self.correct_ls_2(df)
        # 3 - жилая больше общей
        df = self.swap_sq_lsq(df)
        # 4 - статистика по средней площади на количество комнат
        df.loc[df.Rooms > 5, 'Rooms'] = 0
#         self.square_room_stat = self.calc_sq_room_stat(df)

        # 5 - средняя площадь квартиры на рынке
#         self.mean_square = self.get_mean_sq(df)

        # 6 - восстановление площади для самых маленьких (<12м)
        df = self.restore_min_square(df, self.mean_square)
        # 7 - восстановление числа комнат
        null_rooms = (df.Rooms == 0)
        df.loc[null_rooms, 'Rooms'] = df.loc[null_rooms, 'Square'].apply(self.restore_rooms)
        # 8 - сбор статистики, восстановление и корректировка размера кухни
#         self.kitchen_stat = self.get_kitchen_stat(df)
        df = self.restore_kitchen_size(df)
        # 9 - Статистика лишнего пространства в квартирах на основе площади
#         self.excess_stat = self.get_excess_stat(df)
#         print(self.excess_stat)
        df = self.correct_ls_3(df)
        # 10 - Этажи и год постройки
        df = self.correct_house_floor_1(df)
#         self.hy_mode = df.HouseYear.mode().values[0]
        df = self.correct_house_year_1(df)
#         self.floor_stat_modes, self.floor_stat_max = self.get_hfloor_stat(df)
        df = self.correct_house_floor_2(df)
        df = self.correct_floor(df)
        # 11 - Экология
        df['Ecology_2'] = df['Ecology_2'].replace(self.binary_to_numbers)
        df['Ecology_3'] = df['Ecology_3'].replace(self.binary_to_numbers)
        # 12 - магазины
        df['Shops_2'] = df['Shops_2'].replace(self.binary_to_numbers)
        # 13 - Медицина
#         df.drop('Healthcare_1', axis=1, inplace=True)
        df.loc[df.Healthcare_1.isna(), 'Healthcare_1'] = \
        self.hc1_model.predict(df.loc[df.Healthcare_1.isna(), self.hc1_features_1])
        return df


In [8]:
preprocessor = DataPreprocessing()

In [9]:
tr = preprocessor.fit_transform(tr)
te = preprocessor.transform(te)

# 2 - Генерация новых фич

In [10]:
class FeatureGenerator:
    
    def __init__(self):
        pass
    
    def is_unk_district(self, dist_id):
        if dist_id in self.districts_id:
            return 0
        else:
            return 1
    
    def fit_transform(self, df_in):
        df = df_in
        self.distr_rating = df.groupby(by='DistrictId', as_index=False).agg({'Price': 'sum', 'Square': 'sum'})
        self.distr_rating.rename(columns={'Price': 'price_sum', 'Square': 'sqr_sum'}, inplace=True)
        self.distr_rating['sq_meter_price'] = self.distr_rating.price_sum / self.distr_rating.sqr_sum
        df = pd.merge(df, self.distr_rating[['DistrictId', 'sq_meter_price']], on='DistrictId', how='left')
        self.sq_meter_features = ['Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor',\
                     'Ecology_1', 'Ecology_2', 'Ecology_3', 'Social_1', 'Social_2', \
                     'Social_3', 'Healthcare_1','Helthcare_2', 'Shops_1', 'Shops_2']
        X = df[self.sq_meter_features]
        y = df[['sq_meter_price']]
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=42)
        self.model = RandomForestRegressor(random_state=100, max_depth=10, max_features=15, n_estimators=200)
        self.model.fit(X_tr, y_tr)
        self.districts_id = df.DistrictId.unique().tolist()
        
#         df['min_or_max'] = ((df.Floor == 1) | (df.Floor == df.HouseFloor)).astype(np.int32)
        df['min_floor'] = (df.Floor == 1).astype(np.int32)
        df['max_floor'] = (df.Floor == df.HouseFloor).astype(np.int32)
        
        return df
    
    def transform(self, df_in):
        df = df_in
        df = pd.merge(df, self.distr_rating[['DistrictId', 'sq_meter_price']], on='DistrictId', how='left')
        df['unk_district'] = df.DistrictId.apply(self.is_unk_district)
        unk_district = (df.unk_district == 1)
        df.loc[unk_district, 'sq_meter_price'] = self.model.predict(df.loc[unk_district, self.sq_meter_features])
        
        df['min_floor'] = (df.Floor == 1).astype(np.int32)
        df['max_floor'] = (df.Floor == df.HouseFloor).astype(np.int32)
#         df['min_or_max'] = ((df.Floor == 1) | (df.Floor == df.HouseFloor)).astype(np.int32)
        
        return df

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [12]:
feature_generator = FeatureGenerator()

tr = feature_generator.fit_transform(tr)
te = feature_generator.transform(te)

In [13]:
used_features = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
                 'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
                 'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
                 'Helthcare_2', 'Shops_1', 'Shops_2', 'sq_meter_price']

In [14]:
from sklearn.model_selection import train_test_split, cross_val_score

# 3 - Построение и оценка качества моделей

In [15]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score as r2
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import RobustScaler

In [16]:
X = tr[used_features]
y = tr['Price']

In [17]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=42)
X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((7500, 18), (2500, 18), (7500,), (2500,))

In [18]:
te = te[used_features]

In [19]:
X_tr.isna().sum().sum(), X_te.isna().sum().sum(), te.isna().sum().sum()

(0, 0, 0)

In [20]:
# %%time
# # GBR
# gb_model = GradientBoostingRegressor(criterion='mse',
# #                                      max_depth=6,
#                                      min_samples_leaf=60,
#                                      random_state=42,  
# #                                      n_estimators=2250, 
#                                      max_features='sqrt', 
# #                                      learning_rate=0.025,
#                                      loss='huber' 
                                     
#                                     )
# params = {'n_estimators': np.arange(200, 3250, 250),
#           'learning_rate': np.arange(0.005, 0.030, 0.005),
#           'max_depth':[6, ]}

# model = GridSearchCV(gb_model, params, scoring='r2', verbose=1,
#                      cv=KFold(n_splits=4,random_state=128, shuffle=True), n_jobs=10)
# model.fit(X_tr, y_tr)
# print(model.best_params_)

# y_tr_pred = model.predict(X_tr)
# y_te_pred = model.predict(X_te)

# evaluate_preds(y_tr.values.flatten(), y_tr_pred.flatten(), y_te.values.flatten(), y_te_pred.flatten(), 0)

In [21]:
model = GradientBoostingRegressor(criterion='mse',
                                     max_depth=6,
                                     min_samples_leaf=60,
                                     random_state=42,  
                                     n_estimators=3200, 
                                     max_features='sqrt', 
                                     learning_rate=0.01,
                                     loss='huber')
model.fit(X_tr, y_tr)
y_tr_pred = model.predict(X_tr)
y_te_pred = model.predict(X_te)
evaluate_preds(y_tr.values.flatten(), y_tr_pred.flatten(), y_te.values.flatten(), y_te_pred.flatten(), 0)

Train R2:	0.84
Test R2:	0.767


(0.84, 0.767)

In [22]:
# xxx = pd.DataFrame(model.feature_importances_, index = X_tr.columns, columns=['w'] ).\
# sort_values(by="w", ascending=False)
# xxx

In [23]:
predictions = model.predict(te)
submit = pd.read_csv('./sample_submission.csv')
submit['Price'] = predictions
submit.head()
submit.to_csv('rf_submit.csv', index=False)

# Юля, спасибо вам большое за курс! Я начал потихоньку понимать что такое машинное обучение.  Все было понятно и доходчиво. Удачи вам!  Эти знания пригодятся однозначно!:)