In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, KFold, GridSearchCV

# 3. Модели
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler

# 4. Метрики качества
from sklearn.metrics import mean_squared_error, r2_score

In [49]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [61]:
class DataPipeline:
    
    """Подготовка исходных данных"""
    def __init__(self):
        """Параметры класса"""
        self.medians = None
        
    def fit(self, df):
        """Сохранение статистик"""
        
        # Расчет медиан
        self.medians = df[['LifeSquare', "Healthcare_1"]].median()
        
        
    def train_transform(self, df):
        """Трансформация данных"""
        
        # 1. Пропуски
        df[['LifeSquare', 'Healthcare_1']] = df[['LifeSquare', 'Healthcare_1']].fillna(self.medians)
        
        
        # 2. Выбросы (outliers)
        df.loc[df["KitchenSquare"] > 40, "KitchenSquare"] = df["KitchenSquare"].median()
        df.loc[df["HouseFloor"] > 50, "HouseFloor"] = df["HouseFloor"].median()
        df.loc[df['LifeSquare'] < 15, "LifeSquare"] = df["LifeSquare"].median()
        df.loc[df['Square'] < 15, 'Square'] = df['Square'].median()
        df.loc[df['Rooms'] < 1, 'Rooms'] = df['Rooms'].median()
        
        #df.drop(df[(df['LifeSquare'] < 15) & (df['Square'] < 15)].index, inplace=True)
        df.drop(df[df['LifeSquare'] > 1000].index, inplace=True)
        
        Id_col = df['Id']
        df.drop(labels=['Id'], axis=1, inplace=True)
        
        df = pd.concat([df, pd.get_dummies(df['Ecology_2'])], axis=1)
        df = pd.concat([df, pd.get_dummies(df['Ecology_3'])], axis=1)
        df = pd.concat([df, pd.get_dummies(df['Shops_2'])], axis=1)
        df.drop(labels=["Ecology_2", "Ecology_3", "Shops_2"], axis=1, inplace=True)
        
        # 3. Новые фичи (features)
        df['mean_room_square'] = df['LifeSquare'] / df['Rooms']
        
        # 4. Понижение размерности
        pca = PCA(n_components=1, random_state=42)
        df['Social'] = pca.fit_transform(df.loc[:, ['Social_1', 'Social_2']])
        df.drop(labels=['Social_1', 'Social_2'], axis=1, inplace=True)
        
        # 5. Train test split
        y = df.pop('Price')
        X = df
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42) 
        
        scaler = RobustScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        
        
        return X_train, X_test, y_train, y_test, Id_col

    def test_transform(self, df):
        """Трансформация данных"""
        
        # 1. Пропуски
        df[['LifeSquare', 'Healthcare_1']] = df[['LifeSquare', 'Healthcare_1']].fillna(self.medians)
        
        
        # 2. Выбросы (outliers)
        df.loc[df["KitchenSquare"] > 40, "KitchenSquare"] = df["KitchenSquare"].median()
        df.loc[df["HouseFloor"] > 50, "HouseFloor"] = df["HouseFloor"].median()
        df.loc[df['LifeSquare'] < 15, "LifeSquare"] = df["LifeSquare"].median()
        df.loc[df['Square'] < 15, 'Square'] = df['Square'].median()
        df.loc[df['Rooms'] < 1, 'Rooms'] = df['Rooms'].median()
        
        #df.drop(df[(df['LifeSquare'] < 15) & (df['Square'] < 15)].index, inplace=True)
        df.drop(df[df['LifeSquare'] > 1000].index, inplace=True)
        
        Id_col = df['Id']
        df.drop(labels=['Id'], axis=1, inplace=True)
        
        df = pd.concat([df, pd.get_dummies(df['Ecology_2'])], axis=1)
        df = pd.concat([df, pd.get_dummies(df['Ecology_3'])], axis=1)
        df = pd.concat([df, pd.get_dummies(df['Shops_2'])], axis=1)
        df.drop(labels=["Ecology_2", "Ecology_3", "Shops_2"], axis=1, inplace=True)
        
        # 3. Новые фичи (features)
        df['mean_room_square'] = df['LifeSquare'] / df['Rooms']
        
        # 4. Понижение размерности
        pca = PCA(n_components=1, random_state=42)
        df['Social'] = pca.fit_transform(df.loc[:, ['Social_1', 'Social_2']])
        df.drop(labels=['Social_1', 'Social_2'], axis=1, inplace=True)
        
        # 5. Train test split
        X = df
        
        scaler = RobustScaler()
        X_train = scaler.fit_transform(X)        
        
        return X_train, Id_col

In [62]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
test_df

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,4567,44,1.0,36.847630,19.094182,5.0,5,9.0,1970,0.036122,B,B,24,4378,0,1036.0,1,1,B
1,5925,62,1.0,42.493907,42.568133,10.0,7,17.0,2017,0.072158,B,B,2,629,1,,0,0,A
2,960,27,2.0,59.463678,,9.0,19,19.0,1977,0.211401,B,B,9,1892,0,,0,1,B
3,3848,23,3.0,49.646030,33.893825,6.0,2,2.0,1965,0.014073,B,B,2,475,0,,0,0,B
4,746,74,1.0,53.837056,,1.0,8,17.0,1977,0.309479,B,B,35,7715,4,990.0,0,6,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,10379,29,2.0,43.177521,30.339945,5.0,6,5.0,1962,0.069660,B,B,31,6119,4,,1,2,B
4996,16138,38,3.0,93.698122,94.521465,10.0,21,27.0,2018,0.060753,B,B,15,2787,2,520.0,0,7,B
4997,3912,101,1.0,33.656723,19.003259,5.0,2,5.0,1966,0.038693,B,B,28,6533,1,1015.0,2,5,B
4998,5722,10,1.0,38.635155,20.976257,9.0,8,14.0,1970,0.089040,B,B,33,7976,5,,0,11,B


In [63]:
pipe_tr = DataPipeline()
pipe_tr.fit(train_df)
X_train, X_test, y_train, y_test, Id = pipe_tr.train_transform(train_df)
X_train

array([[ 1.09090909,  0.        ,  0.33162249, ...,  0.        ,
        -0.20478423,  1.69642002],
       [-0.16363636, -1.        , -0.90305972, ...,  0.        ,
         1.4733541 , -0.59915109],
       [-0.54545455,  0.        ,  0.5636402 , ...,  0.        ,
         0.54180422, -0.65707225],
       ...,
       [-0.45454545, -1.        , -0.53249903, ...,  0.        ,
         1.03819857,  0.22037023],
       [-0.54545455, -1.        , -0.36972926, ...,  0.        ,
         1.95980559, -0.65707225],
       [ 0.47272727,  0.        ,  0.62586082, ..., -1.        ,
         0.1292021 , -0.82217666]])

In [64]:
grid = GridSearchCV(GradientBoostingRegressor(), {'n_estimators':[50, 100, 200], 
            'max_depth':[5, 7, 20],
            'min_samples_leaf':[5, 10, 20]}, cv=KFold(n_splits=5, random_state=21, shuffle=True), n_jobs=-1)
grid.fit(X_train, y_train)
grid.best_params_

{'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 200}

In [65]:
grid.best_score_

0.7455119153876079

In [66]:
model = GradientBoostingRegressor(n_estimators=200, min_samples_leaf=10, max_depth=5)
model.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=5, min_samples_leaf=10, n_estimators=200)

In [67]:
r2_score(y_test, model.predict(X_test))

0.7534091240788445

In [68]:
pipe = DataPipeline()
pipe.fit(test_df)
X_train, Id = pipe.test_transform(test_df)

In [69]:
result = model.predict(X_train)

In [70]:
test_df['Price'] = result
test_df['Id'] = Id
res = test_df.loc[:, ['Id', 'Price']]

In [71]:
res.to_csv('result7.csv', index=False)