In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from xgboost import XGBRegressor

from utils.transformers import *

In [2]:
# Read the raw train / test tables from the working directory.
df_train = pd.read_csv('train.csv')
# The test file gets a placeholder 'target' column (constant 1) so both
# frames present the same columns to the feature pipeline.
df_test = pd.read_csv('test.csv').assign(target=1)

# X intentionally keeps the 'target' column (and is the same object as
# df_train, not a copy): the feature pipeline's 'target_encoding' step is
# configured with target_column='target', and the 'target_col_drop' step
# removes the column before the regressor.
# NOTE(review): rows passed to predict() also carry a 'target' column
# (the real value during cross-validation, the dummy 1 for df_test) --
# confirm the custom target-handling transformers never read it at
# transform time, otherwise validation scores are leaked.
X = df_train
y = df_train[['target']]

# 80/20 hold-out split with a fixed seed. None of the cells shown here
# reference these four frames.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)

In [4]:
# ---------------------------------------------------------------------------
# Column groups used by the feature pipeline.
# Every list below is consumed by exactly one pipeline step, so no list
# object is shared between transformers.
# ---------------------------------------------------------------------------

# Columns removed by the first step ('col_dropper').
DROPPED_COLS = [
    'property_id',
    'property_name',
    'host_id',
    'host_location',
    'host_since',
    'host_nr_listings_total',
    'host_response_time',
    'property_desc',
    'property_last_updated',
    'property_scraped_at',
    'property_zipcode',
    'property_sqfeet',
    'property_neighborhood',
    'property_notes',
    'property_transit',
    'property_access',
    'property_interaction',
    'property_rules',
    'reviews_first',
    'reviews_last',
    'property_amenities',
    'host_about',
    'host_verified',
    'property_bathrooms',
    'property_bed_type',
    'property_summary',
]

# Columns handed to MissingFlagger.
MISSING_FLAG_COLS = ['property_space']

# Inputs for the 'clust_location' step: the two clustering features and
# seven starting centroids (one per cluster), each a pair given in the same
# order as CLUSTER_FEATURE_COLS.
CLUSTER_FEATURE_COLS = ['property_lat', 'property_lon']
CLUSTER_INITIAL_CENTROIDS = np.array([
    [51.24, 4.34], [51.20, 4.41], [51.20, 4.45],
    [50.85, 4.30], [50.85, 4.35], [50.85, 4.38], [50.85, 4.43],
])

# Columns imputed by the 'median_imputer' step.
# NOTE(review): the step name says "median", but the strategy is decided
# inside CustomSimpleImputer -- confirm there.
IMPUTED_COLS = [
    'property_bedrooms',
    'property_beds',
    'host_response_rate',
    'host_nr_listings',
    'reviews_rating',
    'reviews_acc',
    'reviews_cleanliness',
    'reviews_checkin',
    'reviews_communication',
    'reviews_location',
    'reviews_value',
    'reviews_per_month',
]

# Column -> limit mapping passed to CustomTruncator.
TRUNCATION_LIMITS = {
    'property_beds': 7,
    'property_bedrooms': 4,
    'property_max_guests': 7,
    'booking_price_covers': 4,
}

# Feature columns passed to CustomTargetEncoder (encoded against 'target').
TARGET_ENCODED_COLS = [
    'property_beds',
    'property_bedrooms',
    'property_max_guests',
    'booking_price_covers',
    'property_type_new',
    'property_room_type',
    'location_zone_g',
    'booking_cancel_policy',
]

# Columns passed to CustomStandardScaler.
SCALED_COLS = [
    'host_response_rate',
    'host_nr_listings',
    'booking_min_nights',
    'booking_max_nights',
    'booking_availability_30',
    'booking_availability_60',
    'booking_availability_90',
    'booking_availability_365',
    'reviews_num',
    'reviews_rating',
    'reviews_acc',
    'reviews_cleanliness',
    'reviews_checkin',
    'reviews_communication',
    'reviews_location',
    'reviews_value',
    'reviews_per_month',
]

# Columns fed to the 3-component PCA step ('pca_reviews').
REVIEW_PCA_COLS = [
    'reviews_rating',
    'reviews_acc',
    'reviews_cleanliness',
    'reviews_checkin',
    'reviews_communication',
    'reviews_location',
    'reviews_value',
]

# Columns fed to the 1-component PCA step ('pca_bookings').
AVAILABILITY_PCA_COLS = [
    'booking_availability_30',
    'booking_availability_60',
    'booking_availability_90',
    'booking_availability_365',
]

# ---------------------------------------------------------------------------
# Feature pipeline: custom preprocessing steps followed by a LightGBM
# regressor with default hyperparameters. The 'target' column travels with
# the features until 'target_col_drop' removes it, right after the
# 'target_encoding' step that is configured to use it.
# ---------------------------------------------------------------------------
pipe_features = Pipeline(steps=[
    ('col_dropper', ColumnDropper(columns_to_drop=DROPPED_COLS)),
    ('target_handler', TargetHandler(target_col='target')),
    ('missing_flagger', MissingFlagger(columns_to_flag=MISSING_FLAG_COLS)),
    ('extras_handler', ExtrasHandler()),
    ('clust_location', GaussianClusterer(
        n_clusters=7,
        features_cluster=CLUSTER_FEATURE_COLS,
        initial_centroids=CLUSTER_INITIAL_CENTROIDS,
    )),
    ('property_type_handler', PropertyTypeHandler()),
    ('booking_cancel_handler', BookingCancelHandler()),
    ('max_guest_adjuster', MaxGuestsAdjuster()),
    ('median_imputer', CustomSimpleImputer(columns=IMPUTED_COLS)),
    ('truncator', CustomTruncator(cols_and_lims=TRUNCATION_LIMITS)),
    ('target_encoding', CustomTargetEncoder(
        feat_columns=TARGET_ENCODED_COLS,
        target_column='target',
    )),
    ('target_col_drop', ColumnDropper(columns_to_drop=['target'])),
    ('scaler', CustomStandardScaler(columns=SCALED_COLS)),
    ('pca_reviews', PCATransformer(n_components=3, columns=REVIEW_PCA_COLS)),
    ('pca_bookings', PCATransformer(n_components=1, columns=AVAILABILITY_PCA_COLS)),
    ('lgbm', LGBMRegressor()),
])

# Target pipeline: natural log, then standardisation. np.exp is registered
# as the inverse of the log step so predictions can be mapped back to the
# original scale.
pipe_target = Pipeline(steps=[
    ('log_transform', FunctionTransformer(
        func=np.log,
        inverse_func=np.exp,
        check_inverse=True,
    )),
    ('scaler', StandardScaler()),
])

# Final estimator: fits pipe_features on the transformed target and
# inverse-transforms its predictions.
model = TransformedTargetRegressor(
    regressor=pipe_features,
    transformer=pipe_target,
)

In [5]:
# 10-fold cross-validation scored with RMSE. KFold is left at its defaults
# apart from n_splits; n_jobs=-1 runs the folds on all available cores.
# scikit-learn's 'neg_*' scorers return negated errors, so values closer
# to zero are better.
cv2 = KFold(n_splits=10)
scores2 = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=cv2,
    n_jobs=-1,
)
scores2.mean()

-53.00240805689244

In [6]:
# Show the individual per-fold scores behind the mean; the scorer is
# 'neg_root_mean_squared_error', so each value is a negated RMSE.
scores2

array([-49.26943658, -49.31568418, -55.20425978, -62.99025611,
       -57.68803588, -62.36169822, -44.18405258, -50.58799667,
       -46.03110312, -52.39155743])

In [7]:
# 10-fold cross-validation scored with median absolute error, a metric that
# is robust to outliers in the residuals. The scorer is negated
# ('neg_median_absolute_error'), so values closer to zero are better.
#
# The splitter created in this cell (cv3) is the one passed to
# cross_val_score, so the cell does not depend on `cv2` from another cell.
cv3 = KFold(n_splits=10)
scores3 = cross_val_score(
    model,
    X,
    y,
    scoring='neg_median_absolute_error',
    cv=cv3,
    n_jobs=-1,
)
scores3.mean()

-19.91252579010729

In [12]:
# Show the individual per-fold scores behind the mean; the scorer is
# 'neg_median_absolute_error', so each value is a negated median abs. error.
scores3

array([-20.90674972, -19.83168255, -19.54811731, -17.67914896,
       -18.42997239, -19.83295296, -19.02819152, -19.85801545,
       -19.39993509, -19.30818603])

In [9]:
# Refit on the full training table, then predict the test table (which
# carries the dummy 'target' column added when the data was loaded).
pred = model.fit(X, y).predict(df_test)

# Pair each test 'property_id' with its prediction and write the result as
# a headerless, index-free CSV.
pred_df = df_test[['property_id']].assign(pred_price=pred)
pred_df.to_csv('pred_v2.csv', header=False, index=False)