In [78]:
import numpy as np
import pandas as pd
import datetime

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics


import warnings
warnings.simplefilter('ignore')

In [87]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    The class provides basic functionality for retrieving
    a subset of columns from the dataset.
    """
    
    features = ['ABS (антиблокировочная система)',
         'AUX/iPod',
         'Bluetooth',
         'CD/MP3 проигрыватель',
         'ESP (система поддержания динамической стабильности)',
         'USB',
         'Автозапуск двигателя',
         'Антипробуксовочная система',
         'Датчик дождя',
         'Иммобилайзер',
         'Камера заднего вида',
         'Климат-контроль',
         'Кондиционер',
         'Контроль мертвых зон на зеркалах',
         'Круиз-контроль',
         'Ксеноновые фары',
         'Легкосплавные диски',
         'Люк',
         'Материал салона',
         'Мультимедийный экран',
         'Обогрев зеркал',
         'Обогрев лобового стекла',
         'Обогрев руля',
         'Обогрев сидений',
         'Панорамная крыша',
         'Парктроники',
         'Подушки безопасности боковые',
         'Подушки безопасности задние',
         'Подушки безопасности передние',
         'Противотуманные фары',
         'Рейлинги на крыше',
         'Светодиодные фары',
         'Сигнализация',
         'Управление мультимедиа с руля',
         'Фаркоп',
         'Цвет салона - темный',
         'Штатная навигация',
         'Электрорегулировка сидений',
         'Электростеклоподъемники задние',
         'Электростеклоподъемники передние']
    
    def __init__(self, feature_names=features):
        """
        Initialize class instance by setting
        a list of columns to retrieve from the dataset.
        """
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        self.feature_names = feature_names
        
    def fit(self, X, y=None):
        """
        Fit FeatureSelector to X, but really do nothing.
        Return self.
        """
        return self
    
    def transform(self, X, y=None):
        """
        Transform X using feature selection. 
        Return column-subset of X.
        """
        return X[self.feature_names]

In [88]:
class ToIntTransformer(BaseEstimator, TransformerMixin):
    """
    Required columns: volume, run, name, year.
    For update data.
    """
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        self.to_int(X, 'Объем')
        self.to_int(X, 'Пробег')
        self.year_to_old(X)
        
        return X.drop(['Название', 'Год'], axis=1).values
        
    def year_to_old(self, X):
        X['Возраст'] = (datetime.datetime.now().year - X['Год']).astype('int8')
        
    def to_int(self, X, column_name):
        X[column_name] = X[column_name].astype('int32')
        

In [89]:
class FeatureGenerator(BaseEstimator, TransformerMixin):
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X
            

In [90]:
class CatTransformer(BaseEstimator, TransformerMixin):
    """
    Required columns: cuzov, fuel, name, region.
    """    
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
#         self.cut(X, ['cuzov', 'fuel'])
        self.fix_names(X)
        return X
    
    def cut(self, X, column_names):
        for col in column_names:
            X[col] = X[col].apply(lambda x: x.split()[0])
        
    def fix_names(self, df):
        value_count = df['Название'].value_counts()
        df['Название'] = df['Название'].apply(lambda x: x if value_count[x] > 100 else 'другая')
        

In [32]:
from_file = 'data/full_vis.csv'

In [33]:
avto = pd.read_csv(from_file, sep=';')

In [74]:
# avto.info()
datetime.datetime.now().year

2020

In [92]:
X = avto.drop(['Цена'], axis=1)
y = avto['Цена'].apply(lambda x: int(str(x).replace(' ', '')))
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.2, random_state=42)

feat_pipeline = Pipeline(
    steps=[
        ('feat_selector', FeatureSelector(FeatureSelector().feature_names)),
        ('feat_generator', FeatureGenerator()),
        ('feat_encoder', OneHotEncoder())
    ]
)

int_pipeline = Pipeline(
    steps=[
        ('int_selector', FeatureSelector(['Объем', 'Название', 'Пробег', 'Год'])),
        ('int_transformer', ToIntTransformer())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ('cat_selector', FeatureSelector(['Тип кузова', 'Тип топлива', 'Название', 'Регион', 'Цвет'])),
        ('cat_transformer', CatTransformer()),
        ('cat_encoder', OneHotEncoder())
    ]
)

no_proc_pipeline = Pipeline(
    steps=[
        ('no_proc_selector', FeatureSelector(['Привод', 'Состояние', 'Трансмиссия'])),
        ('no_proc_imputer', SimpleImputer(strategy='most_frequent')),
        ('no_proc_encoder', OneHotEncoder())
    ]
)

pipeline = FeatureUnion(transformer_list= [
    ('feat', feat_pipeline),
    ('int', int_pipeline),
    ('cat', cat_pipeline),
    ('no_proc', no_proc_pipeline)
])

In [93]:
X_train
pipeline

FeatureUnion(n_jobs=None,
             transformer_list=[('feat',
                                Pipeline(memory=None,
                                         steps=[('feat_selector',
                                                 FeatureSelector(feature_names=['ABS '
                                                                                '(антиблокировочная '
                                                                                'система)',
                                                                                'AUX/iPod',
                                                                                'Bluetooth',
                                                                                'CD/MP3 '
                                                                                'проигрыватель',
                                                                                'ESP '
                                                                          

### Random forest

In [94]:
pip_rf = Pipeline(
    steps=[
        ('preprocessing', pipeline),
        ('model', RandomForestRegressor(
                        max_depth=20, 
                        min_samples_split=5, 
                        min_samples_leaf=1,
                        max_features=None,
                        random_state=42,
                        n_estimators=150,
                        n_jobs=-1
        ))
    ]
)

In [95]:
pip_rf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessing',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('feat',
                                                 Pipeline(memory=None,
                                                          steps=[('feat_selector',
                                                                  FeatureSelector(feature_names=['ABS '
                                                                                                 '(антиблокировочная '
                                                                                                 'система)',
                                                                                                 'AUX/iPod',
                                                                                                 'Bluetooth',
                                                                                                 'CD/MP3 '
                                

In [96]:
y_pred = pip_rf.predict(X_test)
print(metrics.r2_score(y_test, y_pred))

0.9749529656597223


In [98]:
pip_rf['model'].feature_importances_

array([2.02297334e-04, 2.34596509e-04, 3.18072146e-04, 3.14712682e-04,
       2.14585783e-04, 2.17646308e-04, 4.24477041e-04, 4.28281871e-04,
       6.03001037e-04, 6.28843633e-04, 2.42079053e-04, 2.41880613e-04,
       7.96523658e-04, 7.14879737e-04, 2.50982233e-04, 2.62434060e-04,
       1.25706620e-03, 1.23008858e-03, 4.07330722e-04, 4.52697626e-04,
       6.24600011e-04, 6.41634481e-04, 5.96162456e-04, 5.86764480e-04,
       8.33843738e-04, 6.39406500e-04, 9.71102058e-04, 1.07553887e-03,
       5.89301058e-04, 4.90294195e-04, 1.41321179e-03, 1.36204798e-03,
       3.74539627e-04, 3.86424490e-04, 6.97164583e-04, 7.54738528e-04,
       4.45265901e-04, 4.65810487e-04, 4.92767666e-04, 1.19468121e-03,
       2.97758641e-03, 1.89727363e-03, 1.53585272e-03, 1.33453098e-03,
       3.52025926e-04, 4.18606422e-04, 1.07690393e-03, 9.40720792e-04,
       1.76519218e-03, 1.65258712e-03, 3.18040653e-04, 3.43002086e-04,
       3.01002787e-03, 2.43780187e-03, 4.12305388e-04, 3.77323704e-04,
      

### GridSearch

In [99]:
param_grid = [
    {
        'max_depth': [ 8, 10, 15],
        'n_estimators': [100, 150],
        'min_samples_split': [3, 5]
    }
]

pip_rf = Pipeline(
    steps=[
        ('preprocessing', pipeline),
        ('rf', GridSearchCV(RandomForestRegressor(
                        random_state=42,
                        n_jobs=-1
        ), param_grid, scoring='r2', cv=5))
    ]
)

In [None]:
pip_rf.fit(X_train, y_train)

In [None]:
y_true, y_pred = y_test, pip_rf.predict(X_test)
print(metrics.r2_score(y_true, y_pred))

In [None]:
pip_rf[1].best_estimator_

## Validation

In [None]:
param_grid = {"criterion": ["mse", "mae"],
              "min_samples_split": [5, 10, 20],
              "max_depth": [2, 5, 8, 10],
               "min_samples_leaf": [20, 40, 100],
               "max_leaf_nodes": [None, 5, 20, 10],
              }
pip_tree = Pipeline(
    steps=[
        ('preprocessing', pipeline),
        ('tree', GridSearchCV(DecisionTreeRegressor(
                        random_state=42
        ), param_grid, scoring='r2', cv=5))
    ]
)

In [None]:
pip_tree.fit(X_train, y_train)
y_true, y_pred = y_test, pip_rf.predict(X_test)
print(metrics.r2_score(y_true, y_pred))

In [None]:
pip_tree[1].best_estimator_

### cat_boost