## Goal: pridict price
note: airbnb deploys dynamic pricing. Here I assume that is not happening


I intentionally to make the pipeline as less manual effort as possible. For text like columns, seg it with ckiptagger.

In [1]:
import os

In [2]:
import numpy as np
import pandas as pd
from ckiptagger import WS
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
ws = WS("../../src/data")



In [3]:
data = pd.read_csv("../../data/raw/listings_detail.csv")

In [4]:
data.shape

(5258, 74)

In [5]:
data.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,178036,https://www.airbnb.com/rooms/178036,20201231072412,2020-12-31,單人床位 Single bunk bed [mixed dorm for 4pax],Hello from Taipei :)<br />We are located at Z...,Local food within 5 mins walk<br />6 mins walk...,https://a0.muscache.com/pictures/1263b356-1513...,851825,https://www.airbnb.com/users/show/851825,...,10.0,10.0,10.0,,t,9,0,1,1,0.71
1,271733,https://www.airbnb.com/rooms/271733,20201231072412,2021-01-01,Taipei Rooftop - Whole apartment!,"Yes! You get the whole apartment. It's nice, i...",A family neighborhood -- very quiet and safe. ...,https://a0.muscache.com/pictures/8824698/31a49...,242033,https://www.airbnb.com/users/show/242033,...,10.0,10.0,10.0,,f,1,1,0,0,0.58
2,289296,https://www.airbnb.com/rooms/289296,20201231072412,2021-01-01,Fabulous studio - Center Taipei-- Monthly only,"Zhongxiao East Road, Daan District<br /><br />...",,https://a0.muscache.com/pictures/57880667/43e7...,1338052,https://www.airbnb.com/users/show/1338052,...,10.0,10.0,9.0,,f,3,3,0,0,0.34
3,289298,https://www.airbnb.com/rooms/289298,20201231072412,2021-01-03,Fabulous Studio in heart of Taipei----Monthly ...,"Elite area in Dazhi, Zhongshan District.<br />...",,https://a0.muscache.com/pictures/cca653f0-6815...,1338052,https://www.airbnb.com/users/show/1338052,...,10.0,10.0,9.0,,f,3,3,0,0,1.96
4,310542,https://www.airbnb.com/rooms/310542,20201231072412,2021-01-01,"TMP Co-Living,long term+monthly rental+Not daily",台北居大不易(夭壽貴)?<br />回家覺得空虛寂寞覺得冷??<br />下班後沒人一起吃飯...,便利交通<br /><br />→ 步行松山車站只要3分鐘／捷運松山站5分鐘／捷運後山埤站6...,https://a0.muscache.com/pictures/06496984-ed90...,1597675,https://www.airbnb.com/users/show/1597675,...,9.0,8.0,9.0,,f,8,0,0,8,0.18


In [6]:
# target is numerical
data['price'] = data['price'].str.replace('[$,]', '').astype(float)

In [7]:
class DropCols(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # drop meaningless cols 
        X = X.drop([
            'neighbourhood_group_cleansed','calendar_updated',
            'calendar_last_scraped','license','bathrooms',
            'id', 'listing_url', 'scrape_id', 'last_scraped'
        ],axis='columns')
        
        # host upload info
        X = X.drop(['host_picture_url','host_thumbnail_url'],axis='columns')
        # host info
        X = X.drop(['host_id','picture_url', 'host_url','host_name','host_about'],axis='columns')

        # house geo
        X = X.drop(['neighbourhood'],axis='columns')
        return X



In [8]:
class Preprocess(BaseEstimator, TransformerMixin):
    # transform all text, some irregular columns
    def __init__(self):
        self.text_cols = ['name', 'description','neighborhood_overview']
#         self.special_col = ['bathrooms_text']
        
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # text
        X['text'] = X[self.text_cols].fillna('').apply(lambda x:" ".join(x),axis='columns')
        remove_pattern_list = [r"<br(\ )+\/>","<br>","<b>","<\/b>",",",'(',")",'/','[',']']
        for pat in remove_pattern_list:
            X['text'] = X['text'].str.replace(pat,"")
        X['text_list'] = ws(X['text'])
        for row in X.loc[X['text_list'].isnull(), 'text_list'].index:
            X.at[row, 'text_list'] = []
        X['text_list'] = X['text_list'].apply(lambda x:" ".join(x))
        X = X.drop(['text']+self.text_cols, axis='columns')
        
        X['shared_bath'] = X['bathrooms_text'].fillna('').str.lower().apply(lambda x: 'shared' in x)
        X['bath_num'] = X['bathrooms_text'].str.extract(r'(\d+(\.\d)?)')[0].astype(float)
        X = X.drop('bathrooms_text',axis='columns')
        return X

In [9]:
class NumTargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        df = pd.DataFrame([X.values, y.values]).T
        df.columns = ['cat','y']
        df['y'] = df['y'].astype(float)
        df['cat'] = df['cat'].fillna('nan')
        self.enc_dict = df.groupby('cat')['y'].mean().to_dict()
        self.ymean = df['y'].mean()
        return self
    def transform(self, X):
        return X.fillna('nan').map(self.enc_dict).fillna(self.ymean)

In [10]:
class CountEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        self.enc_dict = X.value_counts().to_dict()
        return self
    def transform(self, X):
        return X.map(self.enc_dict)

In [11]:
class DenseCountVectorizer(TransformerMixin):

    def fit(self, X, y=None):
        self.enc = CountVectorizer(max_features=20).fit(X)
        self.feature_names = self.enc.get_feature_names()
        return self

    def transform(self, X):
        return self.enc.transform(X).todense()

In [12]:
class FeatureEng(BaseEstimator, TransformerMixin):
    def __init__(self, cat_enc_method='target_enc', cat_list_enc_method='count+pca'):
        self.cat_enc_method = cat_enc_method
        self.cat_list_enc_method = cat_list_enc_method
        
        self.numerical_cols = [
            'host_response_rate',
           'host_acceptance_rate', 'host_total_listings_count',
            'latitude','longitude',
            'accommodates','bedrooms','beds',
            'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
           'maximum_minimum_nights', 'minimum_maximum_nights',
           'maximum_maximum_nights', 'minimum_nights_avg_ntm',
           'maximum_nights_avg_ntm', 'has_availability', 'availability_30',
           'availability_60', 'availability_90', 'availability_365',
           'number_of_reviews', 'number_of_reviews_ltm',
            'number_of_reviews_l30d',
             'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
            'review_scores_communication',
            'review_scores_location','review_scores_value',
            'calculated_host_listings_count',
           'calculated_host_listings_count_entire_homes',
           'calculated_host_listings_count_private_rooms',
           'calculated_host_listings_count_shared_rooms', 'reviews_per_month',
            'bath_num' # preprocessed
        ]
        
        self.categorical_cols = [
            'host_response_time','has_availability',
            'host_is_superhost','host_neighbourhood','host_has_profile_pic','host_identity_verified',
            'neighbourhood_cleansed','property_type','room_type',
            'instant_bookable',
            'shared_bath' # preprocessed
        ]
        self.categorical_list_cols = ['amenities','host_verifications','text_list']
        self.date_cols = ['first_review','last_review','host_since']
        

        
    def fit(self, X, y=None):
        self.cat_list_enc_dict = {}
        if self.cat_list_enc_method == 'count':
            for col in self.categorical_list_cols:
                self.cat_list_enc_dict[col] = Pipeline([
                    ('count_vec', DenseCountVectorizer()),
                ]).fit(X[col])
        elif self.cat_list_enc_method == 'count+cluster':
            for col in self.categorical_list_cols:
                self.cat_list_enc_dict[col] = Pipeline([
                    ('count_vec', DenseCountVectorizer()),
                    ('cluster', KMeans())]
                ).fit(X[col])
        elif self.cat_list_enc_method == 'count+pca':
            for col in self.categorical_list_cols:
                self.cat_list_enc_dict[col] = Pipeline([
                    ('count_vec', DenseCountVectorizer()),
                    ('pca', PCA(n_components=10))
                ]).fit(X[col])
    
        self.cat_enc_dict = {}
        if self.cat_enc_method == 'label':
            for col in self.categorical_cols:
                cat_enc_dict[col] = LabelEncoder().fit(X[col])
        elif self.cat_enc_method == 'one-hot':
            for col in self.categorical_cols:
                self.cat_enc_dict[col] = OneHotEncoder().fit(X[col])
        elif self.cat_enc_method == 'target_enc':
            for col in self.categorical_cols:
                self.cat_enc_dict[col] = NumTargetEncoder().fit(X[col], y)
        elif self.cat_list_enc_method == 'count_enc':
            for col in self.categorical_cols:
                self.cat_enc_dict[col] = CountEncoder().fit(X[col])
        return self
    def transform(self, X):
        for col in self.cat_enc_dict.keys():
            X[col] = self.cat_enc_dict[col].transform(X[col])
        
        
        for col in self.cat_list_enc_dict.keys():
            X[col] = X[col].fillna('')
            if self.cat_list_enc_method == 'count':
                temp = pd.DataFrame(self.cat_list_enc_dict[col].transform(X[col]), columns = [col + "_count_"+ i for i in self.cat_list_enc_dict[col]['count_vec'].feature_names],index=X.index)
                X = pd.concat([X,temp],axis='columns')
            elif self.cat_list_enc_method == 'count+cluster':
                X[col] = self.cat_list_enc_dict[col].predict(X[col])
            elif self.cat_list_enc_method == 'count+pca':
                temp = pd.DataFrame(self.cat_list_enc_dict[col].transform(X[col]), columns = [col + "_pca_"+ str(i) for i in range(self.cat_list_enc_dict[col]['pca'].n_components)],index=X.index)
                X = pd.concat([X,temp],axis='columns')
        X = X.drop(self.cat_list_enc_dict.keys(),axis='columns')
        
        X['host_location_is_local'] = X['host_location'].fillna('').apply(lambda x: any([keyword in x for keyword in ['taipei','Taipei','台北','臺北']]))
        X['host_localtion_is_tw'] = X['host_location_is_local'] | X['host_location'].fillna('').apply(lambda x: any([keyword in x for keyword in ['taiwan','Taiwan','台灣','臺灣']]))
        
        for col in ['host_response_rate','host_acceptance_rate']:
            X[col] = X[col].str.replace("%","").astype(float)
        X = X.drop('host_location', axis='columns')
        for col in self.date_cols:
            X[col+'_duration']= (pd.to_datetime('today') - pd.to_datetime(X[col].fillna(pd.to_datetime('today')))).dt.days
        X = X.drop(self.date_cols, axis='columns')
        return X.fillna(0)

In [13]:
pipe = Pipeline([
    ('DropCols',DropCols()),
    ('preprocess', Preprocess()),
    ('fe', FeatureEng(cat_list_enc_method = 'count+pca'))
])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('price',axis='columns'),
    data['price'], test_size=0.2, random_state=1)

In [15]:

valid_ind = y_train[y_train<3000].index
X_train = X_train.loc[valid_ind]
y_train = y_train.loc[valid_ind]

valid_ind = y_test[y_test<3000].index
X_test2 = X_test.loc[valid_ind]
y_test2 = y_test.loc[valid_ind]

In [16]:
%%time
processed_X_train = pipe.fit_transform(X_train, np.log1p(y_train))
processed_X_test = pipe.transform(X_test)
processed_X_test2 = pipe.transform(X_test2)
processed_X_train.to_csv("../../data/processed/X_train.csv", index=None)
processed_X_test.to_csv("../../data/processed/X_test.csv", index=None)
processed_X_test2.to_csv("../../data/processed/X_test2.csv", index=None)

CPU times: user 1h 9min 4s, sys: 6min 49s, total: 1h 15min 53s
Wall time: 13min 42s


In [17]:
processed_X_train.head()

Unnamed: 0,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,...,text_list_pca_5,text_list_pca_6,text_list_pca_7,text_list_pca_8,text_list_pca_9,host_location_is_local,host_localtion_is_tw,first_review_duration,last_review_duration,host_since_duration
4626,7.141947,100.0,83.0,7.091416,7.230851,18.0,18.0,7.121491,7.082228,7.137842,...,-0.111507,-0.167808,-0.116957,0.025881,-0.208008,False,False,-1,-1,1011
2316,7.003099,100.0,0.0,7.091416,7.230851,1.0,1.0,7.121491,7.13487,7.130972,...,0.421133,-1.263704,-0.842646,0.158299,-0.473464,True,True,987,417,1557
3655,7.165519,70.0,93.0,7.091416,7.156771,37.0,37.0,7.121491,7.13487,7.203456,...,2.209976,0.877267,-0.895491,-0.901787,0.155565,False,False,561,174,856
891,7.165519,92.0,42.0,7.091416,7.156771,58.0,58.0,7.121491,7.13487,7.203456,...,0.075359,-0.797819,-0.778872,-0.562512,-0.494698,True,True,1022,566,1742
273,6.971313,13.0,0.0,7.091416,7.082973,5.0,5.0,7.121491,7.13487,7.066723,...,-1.595278,1.105324,1.305407,-1.598155,1.468366,True,True,2281,455,2322


In [18]:
# processed_X_train = pd.read_csv("../../data/processed/X_train.csv")
# processed_X_test = pd.read_csv("../../data/processed/X_test.csv")

In [19]:
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [20]:
regressor_dict = {
    'xgbr': {'model':XGBRegressor(), 'feat_imp_attr':'feature_importances_','param_grid':{'n_estimators':[30,100,200],'max_depth':[2,4,6], 'n_jobs':[-1]}},
    'rf': {'model':RandomForestRegressor(), 'feat_imp_attr':'feature_importances_', 'param_grid':{'n_estimators':[50,100,200],'max_depth':[2,4,6]}, 'n_jobs':[-1]},
    'elasticnet':{'model':ElasticNet(), 'feat_imp_attr':'coef_','param_grid':{'alpha':[1],'l1_ratio':[0.3, 0.5, 0.7]}},   
}

In [21]:
res = pd.DataFrame(columns=['best_param','feat_imp','train','test','test2'],index=regressor_dict.keys())

In [22]:
%%time
for reg_method in regressor_dict.keys():
    reg = GridSearchCV(regressor_dict[reg_method]['model'], param_grid=regressor_dict[reg_method]['param_grid'],cv=5)
    reg.fit(processed_X_train,np.log1p(y_train))
    res.loc[reg_method,'best_param'] = str(reg.best_params_)
    res.loc[reg_method,'feat_imp'] = str(pd.Series(index=processed_X_train.columns.values,
                                                   data=getattr(reg.best_estimator_,regressor_dict[reg_method]['feat_imp_attr'])).sort_values(ascending=False).iloc[:10]
                                        )
    res.loc[reg_method,'train'] = mean_absolute_error(np.expm1(reg.predict(processed_X_train)),y_train)
    res.loc[reg_method,'test'] = mean_absolute_error(np.expm1(reg.predict(processed_X_test)),y_test)
    res.loc[reg_method,'test2'] = mean_absolute_error(np.expm1(reg.predict(processed_X_test2)),y_test2)
    

CPU times: user 4min 25s, sys: 4.95 s, total: 4min 30s
Wall time: 2min 17s


In [23]:
res

Unnamed: 0,best_param,feat_imp,train,test,test2
xgbr,"{'max_depth': 4, 'n_estimators': 200, 'n_jobs'...",shared_bath ...,109.016,1140.18,320.114
rf,"{'max_depth': 6, 'n_estimators': 100}",shared_bath ...,342.728,1236.24,370.62
elasticnet,"{'alpha': 1, 'l1_ratio': 0.3}",host_acceptance_rate 0.001697\navailabili...,544.899,1475.19,543.731


In [24]:
res.loc['xgbr','feat_imp']

'shared_bath                                     0.419830\nproperty_type                                   0.043067\naccommodates                                    0.031388\nmaximum_minimum_nights                          0.029689\nbedrooms                                        0.025813\ncalculated_host_listings_count_entire_homes     0.024436\nminimum_nights                                  0.016534\ncalculated_host_listings_count_private_rooms    0.015710\nroom_type                                       0.015201\nbath_num                                        0.012644\ndtype: float32'

In [25]:
res.loc['rf','feat_imp']

'shared_bath                                    0.420566\naccommodates                                   0.148737\nproperty_type                                  0.063548\ncalculated_host_listings_count_entire_homes    0.046685\nminimum_minimum_nights                         0.025829\ntext_list_pca_4                                0.024639\nhost_neighbourhood                             0.015856\nminimum_nights                                 0.013569\nnumber_of_reviews_ltm                          0.013455\nmaximum_minimum_nights                         0.012267\ndtype: float64'

In [26]:
res.loc['elasticnet','feat_imp']

'host_acceptance_rate      0.001697\navailability_90           0.000411\nnumber_of_reviews         0.000343\nlast_review_duration      0.000066\nmaximum_nights            0.000065\nfirst_review_duration     0.000006\nhost_neighbourhood        0.000000\nnumber_of_reviews_l30d   -0.000000\nhost_is_superhost         0.000000\nreview_scores_accuracy   -0.000000\ndtype: float64'