In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
import sys
sys.path.append('../')

from utils.processing import *
from utils.visualisation import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 80)

In [4]:
# load data set
df = preprocess(pd.read_csv('../data/train.csv'))
adfs = read_aux_csv('../data/')

# further preprocessing
df_orig = join_aux(df, adfs)

In [5]:
drop_columns = [
    'lat', 
    'lng', 
    'built_year', 
    'subzone', 
    'planning_area', 
    'region', 
    'property_type', 
    'elevation', 
    'listing_id', 
    'title', 
    'address', 
    'property_name', 
    'available_unit_types', 
    'total_num_units', 
    'property_details_url', 
    'furnishing', 
    'floor_level',
    'tenure',
    'pri_sch_outside_500m',
    'gep_pri_sch_outside_2km',
    'gep_pri_sch_within_1km_2km',
]
df = df_orig.drop(columns=drop_columns)

In [6]:
df.head()
df[df.isna().any(axis=1)]

Unnamed: 0,num_beds,num_baths,size_sqft,price,property_type_private,property_type_public,tenure_high_year,tenure_low_year,floor_level_ground,floor_level_high,floor_level_low,floor_level_mid,floor_level_penthouse,floor_level_top,furnishing_partial,furnishing_unfurnished,furnishing_unspecified,nearest_mrt_distance_in_km,line_cc,line_ce,line_cg,line_dt,line_ew,line_ne,line_ns,line_te,nearest_pri_sch_distance_in_km,nearest_gep_pri_sch_distance_in_km,gep_pri_sch_within_1km,pri_sch_within_500m,nearest_com_centre_distance_in_km,cc_type_BN,cc_type_CR,cc_type_IEBP,cc_type_IEPB,cc_type_IHL,nearest_mall_distance_in_km,area_size,population,density


In [7]:
# split data set into training and validation

df_train = df.sample(frac = 0.70)
df_validate = df.drop(df_train.index)

X, y = df_train.drop(columns=['price']), df_train['price']
vX, vy = df_validate.drop(columns=['price']), df_validate['price']

In [8]:
rs = [
    KNeighborsRegressor(n_neighbors=5),
    LinearRegression()
]

for r in rs:
    r.fit(X, y)

    print(r)
    
    y_pred = r.predict(X)
    print(np.sqrt(mean_squared_error(y, y_pred)))

    vy_pred = r.predict(vX)
    print(np.sqrt(mean_squared_error(vy, vy_pred)))
    print()

KNeighborsRegressor()
769490.4455338941
984388.2045130444

LinearRegression()
1439824.7062919268
1456830.771904208



In [9]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(2, interaction_only=True)
pX = poly.fit_transform(X)
pvX = poly.fit_transform(vX)

rs = [
    KNeighborsRegressor(n_neighbors=5),
    LinearRegression()
]

for r in rs:
    r.fit(pX, y)

    print(r)
    
    y_pred = r.predict(pX)
    print(np.sqrt(mean_squared_error(y, y_pred)))

    vy_pred = r.predict(pvX)
    print(np.sqrt(mean_squared_error(vy, vy_pred)))
    print()

KNeighborsRegressor()
798536.7853113204
996815.66879076

LinearRegression()
984843.9500983054
1086463.8639780201



In [10]:
# Test out poly features and xgboost on choon boon's experiments

import xgboost as xgb

X_train, y_train = df.drop(columns=['price']), df['price']
poly = PolynomialFeatures(2, interaction_only=True)
poly.fit_transform(X_train)

array([[1.00000000e+00, 3.00000000e+00, 2.00000000e+00, ...,
        5.30409600e+03, 1.51200000e+04, 6.51694413e+08],
       [1.00000000e+00, 2.00000000e+00, 2.00000000e+00, ...,
        1.23669000e+03, 3.15000000e+03, 2.52738156e+07],
       [1.00000000e+00, 2.00000000e+00, 1.00000000e+00, ...,
        6.24796380e+04, 3.24300000e+04, 5.45886484e+08],
       ...,
       [1.00000000e+00, 3.00000000e+00, 2.00000000e+00, ...,
        1.65179940e+04, 1.60900000e+04, 2.52180109e+08],
       [1.00000000e+00, 2.00000000e+00, 2.00000000e+00, ...,
        2.43910590e+04, 9.93000000e+03, 4.01436714e+07],
       [1.00000000e+00, 3.00000000e+00, 2.00000000e+00, ...,
        3.56155300e+04, 2.55400000e+04, 4.67760201e+08]])

In [11]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.compose import TransformedTargetRegressor

def rmse(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    score = np.sqrt(mse)
    return score

def rmse_scorer():
    return make_scorer(rmse)

# encode some more stuff
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

numeric_features = ['size_sqft', 'num_beds', 'num_baths']
numeric_transformer = make_pipeline(StandardScaler())

preprocessor = make_column_transformer((numeric_transformer, numeric_features))

models = []
model_scores = []
for model in [KNeighborsRegressor,
              Lasso,
              Ridge,
              DecisionTreeRegressor,
              GradientBoostingRegressor,
              AdaBoostRegressor,
              RandomForestRegressor,
              LinearRegression,
              xgb.XGBRegressor]:
    candidate = model()
    pipe = make_pipeline(preprocessor, candidate)
    regr = TransformedTargetRegressor(regressor=pipe, transformer=StandardScaler())
    scores = cross_validate(regr, X_train, y_train, cv=5, scoring=rmse_scorer())
    regr.fit(X_train, y_train)
    models.append(regr)
    model_scores.append(scores['test_score'].mean())
    print(candidate.__class__.__name__, scores['test_score'].mean())

KNeighborsRegressor 1484574.4536951212
Lasso 3172131.376098768
Ridge 1773068.2195744957
DecisionTreeRegressor 1468549.5234242794
GradientBoostingRegressor 1452893.5967807132
AdaBoostRegressor 1805186.5535296306
RandomForestRegressor 1301217.4128629859
LinearRegression 1773069.0686500818
XGBRegressor 1287105.5170104816
