In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [52]:
import sys
sys.path.append('../')

from utils.processing import *
from utils.visualisation import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.compose import TransformedTargetRegressor, make_column_transformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from lightgbm import LGBMRegressor

np.random.seed(0)

In [4]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 80)

In [172]:
# load data set
df = pd.read_csv('../data/train.csv')
df_train, df_validate = train_test_split(df, test_size=0.10)

In [173]:
def drop_cols_for_regression(df):
    drop_columns = [
        'address',
        'title',
        'listing_id',
        'property_name',
        'total_num_units',
        'available_unit_types',
        'property_details_url',
        'elevation',
        'tenure',
        'property_type',
        'floor_level',
        'furnishing',
        'built_year',
#         'gep_pri_sch_within_1km',
#         'gep_pri_sch_within_1km_2km',
#         'gep_pri_sch_outside_2km',
#         'pri_sch_within_500m',
#         'pri_sch_outside_500m',
#         'line_cc',
#         'line_ce',
#         'line_cg',
#         'line_dt',
#         'line_ew',
#         'line_ne',
#         'line_ns',
#         'line_te',
#         'furnishing_partial',
#         'furnishing_unfurnished',
#         'furnishing_unspecified',
    ]
    return df.drop(columns=drop_columns)

def prepare_data_for_regression(df):
    adfs = read_aux_csv('../data')
#     merge_aux = dict((k, adfs[k]) for k in ['primary_schools', 'regions', 'mrt_stations'])
    merge_aux = dict((k, adfs[k]) for k in ['regions'])
    df = join_aux(df, merge_aux)
    df = drop_cols_for_regression(df)

    return df

In [174]:
#df = prepare_data_for_regression(df)
#df.head()
#df_train[df_train.isna().any(axis=1)]

df_train = preprocess(df_train)
df_train = prepare_data_for_regression(df_train)
df_train.head()

Unnamed: 0,num_beds,num_baths,size_sqft,lat,lng,subzone,planning_area,price,property_type_private,property_type_public,tenure_high_year,tenure_low_year,floor_level_ground,floor_level_high,floor_level_low,floor_level_mid,floor_level_penthouse,floor_level_top,furnishing_partial,furnishing_unfurnished,furnishing_unspecified,region
0,4.0,3.0,1184,1.417017,103.845035,lower seletar,yishun,1575000.0,0,0,0,1,0,0,0,0,0,0,0,0,1,n
1,3.0,2.0,1130,1.351787,103.963119,tampines east,tampines,582800.0,0,1,0,0,0,0,0,0,0,0,0,0,1,e
2,3.0,2.0,1410,1.31629,103.840576,moulmein,novena,3439800.0,1,0,1,0,0,0,0,0,0,0,0,1,0,c
3,3.0,2.0,1216,1.398331,103.879066,fernvale,sengkang,1050000.0,0,1,0,1,0,1,0,0,0,0,0,0,1,ne
4,6.0,6.0,4925,1.330792,103.757727,toh tuck,clementi,6016500.0,1,0,1,0,0,0,0,0,0,0,0,0,1,w


In [175]:
X_train = df_train.drop('price', axis=1)
y_train = df_train['price']

In [176]:
numeric_features = ['size_sqft', 'num_beds', 'num_baths']
numeric_transformer = StandardScaler()

categorical_features = ["subzone", "planning_area", "region"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
     (categorical_transformer, categorical_features))

In [180]:
def pipelined_model(model):
    pipe = make_pipeline(preprocessor, model)
    return TransformedTargetRegressor(regressor=pipe, transformer=StandardScaler())

In [181]:
def rmse(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    score = np.sqrt(mse)
    return score
    
def run_models(models, X, y):
    def rmse_scorer():
        return make_scorer(rmse)
    
    model_scores = dict()
    
    for model in models:
        regr = pipelined_model(model["model"])
        grid = GridSearchCV(regr, param_grid=model["params"], scoring=rmse_scorer(), cv=5, n_jobs=-1)
        grid.fit(X, y)
        model_scores[model["model"].__class__.__name__] = [grid.best_estimator_, grid.best_score_]
        print(model["model"].__class__.__name__, grid.best_score_)
        
    return model_scores

In [182]:
%%time

models = [
    {"model": Lasso(), "params": {'regressor__lasso__alpha': [0.02, 0.024, 0.025, 0.026, 0.03]}},
    {"model": Ridge(), "params": {'regressor__ridge__alpha': [200, 230, 250,265, 270, 275, 290, 300, 500, 750, 1000, 2000]}},
#     {"model": LGBMRegressor(boosting_type='gbdt'), "params": {
#         'regressor__lgbmregressor__n_estimators': [20, 30, 40, 50, 60],
#         'regressor__lgbmregressor__max_depth': [5, 6, 9, 12],
#         'regressor__lgbmregressor__learning_rate': [0.1, 0.01, 0.005],
        
#     }},
    {"model": xgb.XGBRegressor(), "params": {
        'regressor__xgbregressor__n_estimators': [5, 10, 15, 20, 30],
        'regressor__xgbregressor__max_depth': [3, 5, 6, 10, 15],
        'regressor__xgbregressor__reg_lambda': [200, 250, 500, 1000],
#         'regressor__xgbregressor__colsample_bytree': [0.5, 0.7, 1],
#         'regressor__xgbregressor__subsample': [0.5, 0.7, 1],
#         'regressor__xgbregressor__eta': [0.01, 0.005],
    }},
#     {"model": GradientBoostingRegressor(), "params": {
#         'regressor__gradientboostingregressor__n_estimators':range(30, 51, 10),
#         'regressor__gradientboostingregressor__max_depth':range(5,11,2),
#         'regressor__gradientboostingregressor__min_samples_split':range(10,51,10)
#     }},
#     {"model": AdaBoostRegressor(), "params": {}},
#     {"model": RandomForestRegressor(), "params": {
#         'regressor__randomforestregressor__n_estimators': [50, 60, 70, 80, 90],
#         'regressor__randomforestregressor__max_depth': [5, 10, 15, 25],
# #         'regressor__randomforestregressor__min_samples_split': [2, 5, 10, 15],
#     }},
]

model_scores = run_models(models, X_train, y_train)

Lasso 1618909.3648647657
Ridge 1594391.672395079
XGBRegressor 1911025.871910093
CPU times: user 1.87 s, sys: 286 ms, total: 2.15 s
Wall time: 14.9 s


In [184]:
model_scores["XGBRegressor"][0]

In [185]:
y_validate = df_validate['price']
X_validate = prepare_data_for_regression(preprocess(df_validate.drop('price', axis=1), is_target=True, num_beds=df_train['num_beds'].median(), num_baths=df_train['num_baths'].median()))

for k, v in model_scores.items():
    regr, scores = v
    predict_validate = regr.predict(X_validate)

    print(k, rmse(y_validate, predict_validate))

Lasso 117266699.06959774
Ridge 115414038.37676308
XGBRegressor 110786036.07053132


In [186]:
# Retrain using best model
fmodel = pipelined_model(xgb.XGBRegressor(n_estimators=5, max_depth=3, reg_lambda=500))

df_ftrain = pd.read_csv('../data/train.csv')
df_ftrain = preprocess(df_ftrain)
df_ftrain = prepare_data_for_regression(df_ftrain)
df_ftrain.head()

X_ftrain = df_ftrain.drop('price', axis=1)
y_ftrain = df_ftrain['price']

fmodel.fit(X_ftrain, y_ftrain)
predict_ftrain = fmodel.predict(X_ftrain)
print(rmse(y_ftrain, predict_ftrain))

1705597.0471448153


In [154]:
df_test = preprocess(pd.read_csv('../data/test.csv'), is_target=True, num_beds=df_train['num_beds'].median(), num_baths=df_train['num_baths'].median())

# further preprocessing
X_test = prepare_data_for_regression(df_test)

y_predict = fmodel.predict(X_test)

X_test['Predicted'] = y_predict

In [155]:
y_predict

array([2358971.2, 2103689.2, 5096663.5, ..., 1740976.6, 1863184.1,
       1740976.6], dtype=float32)

In [156]:
submission = X_test[['Predicted']]
submission.to_csv('submission10.csv', index=True, index_label='id', header=True, columns=['Predicted'])