In [8]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold, train_test_split
from lightgbm import LGBMRegressor

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [85]:
train = pd.read_csv('data/train.csv.gz', compression='gzip')
train = train.drop(train[train.price == 0].index)

test = pd.read_csv('data/test.csv.gz', compression='gzip')

In [None]:
# кросс-валидация
test['price'] = -1

data = pd.concat([train, test], 0) #, sort=False)
data.reset_index(drop=True, inplace=True)

def cv_mode(CV):
    
    if CV:
    
        cv = KFold(n_splits=5, shuffle=False, random_state=42).split(data.iloc[:train.shape[0]])
    
    else:
        i_tr = data.iloc[:train.shape[0]].index
        i_tst = data.iloc[train.shape[0]:].index
        cv = (i_tr, i_tst)
    
    return cv

def make_prediction( model, X, y, i_tr, i_tst):
    X_train, X_test = X.iloc[i_tr, :], X.iloc[i_tst, :]
    y_train, y_test = np.log1p(y.iloc[i_tr]), y.iloc[i_tst]
    model.fit(X_train, y_train)
    prediction = np.expm1(model.predict(X_test))
    return prediction, MAPE(y_test, prediction)
        
    
def cross_val(model, X, y, CV=True):
    cv = cv_mode(CV)
    scores = []
    for i_tr, i_tst in cv:
        _, score = make_prediction(model, X, y, i_tr, i_tst)
        scores.append(score)
    return np.mean(scores), np.std(scores)


def make_subm(model, X, y, filename, CV=False):
    cv = cv_mode(CV)
    (i_tr, i_tst) = cv
    prediction, _ = make_prediction(model, X, y, i_tr, i_tst)
    subm['price'] = prediction
    subm.to_csv(filename, index=False)
    pass

In [86]:
# Какой признак лучше выбрать для targetencoding?

num_cols = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'square_feet',
            'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
            'minimum_nights', 'latitude', 'longitude']

cat_cols = ['bed_type', 'cancellation_policy', 'host_has_profile_pic', 'host_identity_verified', 
            'host_is_superhost', 'host_response_time', 'is_location_exact', 'neighbourhood_cleansed',
            'property_type', 'require_guest_phone_verification', 'require_guest_profile_picture',
            'room_type', 'zipcode', 'host_id']

train[cat_cols].nunique()

bed_type                                5
cancellation_policy                     6
host_has_profile_pic                    2
host_identity_verified                  2
host_is_superhost                       2
host_response_time                      4
is_location_exact                       2
neighbourhood_cleansed                 33
property_type                          40
require_guest_phone_verification        2
require_guest_profile_picture           2
room_type                               3
zipcode                             23648
host_id                             32597
dtype: int64

In [7]:
# возьмем neighbourhood_cleansed, 'property_type' и zipcode

In [None]:
# Подготовим датасет для экспериментов

In [133]:
# разобьем на обучение и валидацию

X = train[num_cols + ['neighbourhood_cleansed', 'zipcode', 'property_type']]
y = train.price

for col in ['neighbourhood_cleansed', 'zipcode', 'property_type']:
    X[col] = X[col].astype('category').cat.codes

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [89]:
def MAPE(y_true, y_pred):
    return np.mean(np.abs((y_true-y_pred) / (y_true)).replace([-np.inf, np.inf], np.nan).dropna())*100

In [90]:
def get_rate(X_train, X_test):
    model = LGBMRegressor()
    model.fit(X_train, np.log1p(y_train))
    preds = np.expm1(model.predict(X_test))
    return MAPE(y_test, preds)

In [91]:
# Качество без категориальных признаков
get_rate(X_train.drop(['neighbourhood_cleansed', 'zipcode', 'property_type'], axis=1),
         X_test.drop(['neighbourhood_cleansed', 'zipcode', 'property_type'], axis=1)
        )

30.687587710683623

In [134]:
# Качество с категориальными признаками, закодированными LabelEncoding
get_rate(X_train, X_test)

30.510956016192566

In [136]:
# Качество с ['neighbourhood_cleansed', 'property_type'], закодированными OneHotEncoding

from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

get_rate(np.hstack((X_train.drop(['neighbourhood_cleansed', 'property_type'], axis=1), 
                    ohe.fit_transform(X_train[['neighbourhood_cleansed', 'property_type']]))),
         np.hstack((X_test.drop(['neighbourhood_cleansed', 'property_type'], axis=1), 
                    ohe.transform(X_test[['neighbourhood_cleansed', 'property_type']])))
)
        

30.43596338287703

In [None]:
# Чем кодируем? средним по таргету или по логарифму таргета? 

In [144]:
X = train[num_cols + ['neighbourhood_cleansed', 'zipcode', 'property_type']]
y = train.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [145]:
for col in ['neighbourhood_cleansed', 'property_type', 'zipcode']:
    tmp = pd.DataFrame({'feature': X_train[col], 'target': np.log1p(y_train)}).dropna()
    global_mean = y_train.mean()
    mean = tmp.groupby('feature').target.mean()
    X_train[col + '_te'] = X_train[col].map(mean).fillna(global_mean).values
    X_test[col + '_te'] = X_test[col].map(mean).fillna(global_mean).values
    
# Смотрим качество
get_rate(X_train.drop(['neighbourhood_cleansed', 'zipcode','property_type'], axis=1),
         X_test.drop(['neighbourhood_cleansed', 'zipcode','property_type'], axis=1))

37.05249907563195

In [146]:
# Добавим сглаживание

C = 10

for col in ['neighbourhood_cleansed', 'zipcode', 'property_type']:
    tmp = pd.DataFrame({'feature': X_train[col], 'target': np.log1p(y_train)}).dropna()
    global_mean = y_train.mean()
    mean = tmp.groupby('feature').target.mean()
    size = tmp.groupby('feature').target.size()
    encoding = (global_mean * C + mean * size) / (C + size)
    X_train[col + '_te'] = X_train[col].map(encoding).fillna(global_mean).values
    X_test[col + '_te'] = X_test[col].map(encoding).fillna(global_mean).values
    
# качество
get_rate(X_train.drop(['neighbourhood_cleansed', 'zipcode', 'property_type'], axis=1),
         X_test.drop(['neighbourhood_cleansed', 'zipcode', 'property_type'], axis=1))

37.99903566366023

In [147]:
# K-fold

for col in ['neighbourhood_cleansed', 'property_type', 'zipcode']:
    tmp = pd.DataFrame({'feature': X_train[col], 'target': np.log1p(y_train)}).dropna()
    global_mean = y_train.mean()
    tst_mean = tmp.groupby('feature').target.mean()
    tst_size = tmp.groupby('feature').target.size()
    tst_encoding = (global_mean * C + tst_mean * tst_size) / (C + size)
    
    te = pd.Series(index=X_train.index)
    kf = KFold(n_splits=5, shuffle=False, random_state=42)
    for train_idx, test_idx in kf.split(tmp):
        X_tr, X_val = tmp.iloc[train_idx], tmp.iloc[test_idx]
        tmp_mean = X_tr.groupby('feature').target.mean()
        tmp_size = X_tr.groupby('feature').target.size()
        tmp_encoding = (global_mean * C + mean * size) / (C + size)
        te.iloc[test_idx] = X_val['feature'].map(tmp_encoding).values
        
    X_train[col + '_te'] = te
    X_test[col + '_te'] = X_test[col].map(tst_encoding).fillna(global_mean).values
    
# качество
get_rate(X_train.drop(['neighbourhood_cleansed', 'zipcode', 'property_type'], axis=1),
         X_test.drop(['neighbourhood_cleansed', 'zipcode', 'property_type'], axis=1))

30.401137412075702

In [None]:
# А как с кросс-валидацией? Чтобы не раздувать функцию кросс-валидации, все это надо оформить так:

In [148]:
from sklearn.base import BaseEstimator

class MeanEncoding(BaseEstimator):
    """
    Здесь мы делаем самый простой targetencoding - сложный и удобный пойдет в домашнее задание
    """
    
    def __init__(self, feature, C=1.):
        self.C = C
        self.feature = feature
        
    def fit(self, X_train, y_train):
        
        df = pd.DataFrame({'feature': X_train[self.feature], 'target': y_train}).dropna()
        
        self.global_mean = df.target.mean()
        mean = df.groupby('feature').target.mean()
        size = df.groupby('feature').target.size()
        
        self.encoding = (self.global_mean * self.C + mean * size) / (self.C + size)
    
    def transform(self, X_test):
        
        X_test[self.feature] = X_test[self.feature].map(self.encoding).fillna(self.global_mean).values
        
        
        return X_test
    
    def fit_transform(self, X_train, y_train):
        
        df = pd.DataFrame({'feature': X_train[self.feature], 'target': y_train}).dropna()
        
        self.global_mean = df.target.mean()
        mean = df.groupby('feature').target.mean()
        size = df.groupby('feature').target.size()
        self.encoding = (self.global_mean * self.C + mean * size) / (self.C + size)
        
        X_train[self.feature] = X_train[self.feature].map(self.encoding).fillna(self.global_mean).values
        
        return X_train

In [155]:
model = LGBMRegressor()

cross_val(model, X.drop(['neighbourhood_cleansed', 'zipcode', 'property_type'], axis=1), y, CV=True)

(31.273938173670665, 1.4348108443583092)

In [156]:
from sklearn.pipeline import make_pipeline

te = MeanEncoding(feature = 'neighbourhood_cleansed', C=10)
lgb = LGBMRegressor()

model = make_pipeline(te, lgb)

cross_val(model, X.drop(['zipcode', 'property_type'], axis=1), y, CV=True)

(31.215083350832344, 1.4521233440242987)

In [None]:
# Для каких еще методов обработки данных лучше написать вот такой вот отдельный класс?