In [1]:
import itertools
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor
from scipy.stats import mannwhitneyu
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [2]:
# Load the dataset; column labels are supplied explicitly through `names`,
# then preview the first three rows.
df = pd.read_csv(
    "data/moscow_estate.csv",
    names=[
        'okrug',
        'metro',
        'route_minutes',
        'total_area',
        'rooms',
        'price',
    ],
)
df.head(n=3)

Unnamed: 0,okrug,metro,route_minutes,total_area,rooms,price
0,СЗАО,Октябрьское Поле,8,51.0,2,16000000
1,ЮАО,ЗИЛ,20,40.7,1,15374104
2,ВАО,Выхино,12,63.0,3,13900000


In [3]:
# Column that is split off as the prediction target.
target = 'price'

# Columns that are one-hot encoded when the feature pipeline is assembled.
cat_cols = [
    'metro',
    'okrug',
]

# Columns that are standard-scaled when the feature pipeline is assembled.
num_cols = [
    'route_minutes',
    'total_area',
    'rooms',
]

In [4]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that indexes its input with a fixed column key.

    Parameters
    ----------
    column
        Key used to index ``X`` in :meth:`transform` (``X[column]``).
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured ``column``; ``y`` is ignored."""
        selected = X[self.column]
        return selected
    
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that keeps exactly one column of the input frame,
    so that further transformations can be applied to it.
    Intended for numeric columns.

    Parameters
    ----------
    key
        Label of the column to keep.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single column ``key`` (list-based selection)."""
        wanted = [self.key]
        return X[wanted]
    

class OHEEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder based on ``pd.get_dummies`` with a column layout fixed at fit time.

    Parameters
    ----------
    key
        Prefix passed to ``pd.get_dummies`` for the generated column names.

    Attributes
    ----------
    columns : list
        Dummy column names observed during :meth:`fit`; empty before fitting.
    """

    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Remember the dummy column names produced for the training data."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result to the columns learned in :meth:`fit`.

        Columns learned in ``fit`` but absent from the encoding of ``X`` are
        added and filled with 0; columns not seen in ``fit`` are dropped; the
        output column order is that of ``self.columns``.
        """
        dummies = pd.get_dummies(X, prefix=self.key)
        # A single reindex does the add-missing / drop-unseen / reorder work at
        # once and avoids inserting columns one by one into the frame.
        return dummies.reindex(columns=self.columns, fill_value=0)
    
class NumericPower(BaseEstimator, TransformerMixin):
    """Expand one numeric column into its powers ``1..p`` plus ``log(x + 1)``.

    Parameters
    ----------
    key : str
        Base name for the output columns; they are named ``key1 .. key<p>``
        followed by ``'log'``.
    p : int, default=2
        Highest power to generate.

    Attributes
    ----------
    columns : list
        Output column names, set in :meth:`fit`; empty before fitting.
    """

    def __init__(self, key, p = 2):
        self.key = key
        self.columns = []
        # Store the parameter unmodified: scikit-learn's get_params/clone
        # require that __init__ keeps constructor arguments as passed.
        self.p = p

    def fit(self, X, y=None):
        """Build the output column names; nothing is learned from ``X``."""
        power_names = [self.key + str(i) for i in range(1, self.p + 1)]
        self.columns = power_names + ['log']
        return self

    def transform(self, X):
        """Return a DataFrame with ``x, x**2, ..., x**p, log(x + 1)``.

        ``X`` is flattened to a single column via ``X.values.reshape(-1, 1)``;
        the result keeps ``X.index`` and uses the names prepared in :meth:`fit`.
        """
        # Cast to float before exponentiation so integer inputs cannot
        # silently overflow when raised to a power.
        base = X.values.reshape(-1, 1).astype(float)
        parts = [base]
        parts.extend(base ** i for i in range(2, self.p + 1))
        parts.append(np.log(base + 1))
        # Pass the flat list of names: wrapping it in another list would
        # create MultiIndex columns.
        return pd.DataFrame(data=np.hstack(parts), index=X.index, columns=self.columns)

In [5]:
# Hold out 30% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[target]),
    df[target],
    test_size=0.3,
    random_state=2,
)
X_train.shape, X_test.shape

((1389, 5), (596, 5))

In [6]:
# Preview the first three training rows.
X_train.head(n=3)

Unnamed: 0,okrug,metro,route_minutes,total_area,rooms
1111,ЮЗАО,Новые Черёмушки,14,50.1,2
188,ВАО,Первомайская,6,35.0,2
407,СЗАО,Терехово,17,66.6,2


In [7]:
# (name, transformer) pairs, one per input column.
final_transformers = []

# Categorical columns: pick the column, then one-hot encode it.
for cat_col in cat_cols:
    cat_transformer = Pipeline(steps=[
        ('selector', FeatureSelector(column=cat_col)),
        ('ohe', OHEEncoder(key=cat_col)),
    ])
    final_transformers.append((cat_col, cat_transformer))

# Numeric columns: pick the column as a one-column frame, then standardize it.
for num_col in num_cols:
    cont_transformer = Pipeline(steps=[
        ('selector', NumberSelector(key=num_col)),
        ('Scale', StandardScaler()),
    ])
    final_transformers.append((num_col, cont_transformer))

In [99]:
# Concatenate the outputs of all per-column transformers into one feature matrix.
feats = FeatureUnion(final_transformers)

# Full model: feature construction followed by a CatBoost regressor.
# NOTE(review): learning_rate=1 combined with 2000 iterations and depth 10 is
# unusually aggressive for gradient boosting -- confirm this is intentional.
pipeline = Pipeline(steps=[
    ('features', feats),
    ('regressor', CatBoostRegressor(
        iterations=2000,
        learning_rate=1,
        max_depth=10,
        random_state=42,
        silent=True,
    )),
])

In [100]:
# Fit on the training split, predict the held-out split, and report the MSE.
preds = pipeline.fit(X_train, y_train).predict(X_test)
mean_squared_error(y_true=y_test, y_pred=preds)

24300938137102.176

In [101]:
# Mean absolute error on the held-out split.
mean_absolute_error(y_true=y_test, y_pred=preds)

2978502.3043330275

In [102]:
# Coefficient of determination on the held-out split.
r2_score(y_true=y_test, y_pred=preds)

0.7610769396666266