In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2 asks autoreload to reload modules before each cell is executed.
%autoreload 2
# NOTE(review): reloading the extension right after `%load_ext` looks
# redundant — confirm whether this line is still needed.
%reload_ext autoreload

In [2]:
import sys
sys.path.append('../')

from utils.processing import *
from utils.visualisation import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.compose import TransformedTargetRegressor, make_column_transformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder

In [3]:
# Notebook display settings: show up to 50 columns and up to 80 characters
# per column value when rendering DataFrames.
pd.options.display.max_columns = 50
pd.options.display.max_colwidth = 80

In [4]:
# Read the raw training listings, then run them through the project's
# `preprocess` helper (made available by the star imports above).
train_raw = pd.read_csv('../data/train.csv')
df = preprocess(train_raw)

# Optional enrichment with the auxiliary data sets (currently disabled):
#adfs = read_aux_csv('../data/')
#df = join_aux(df, adfs)

In [5]:
def drop_cols_for_regression(df):
    """Return a new frame without the columns excluded from the regression.

    The input frame is not modified. Every listed column must be present in
    ``df``; ``DataFrame.drop`` raises ``KeyError`` otherwise.
    """
    # Identifiers and free-text fields.
    identifier_cols = [
        'address',
        'title',
        'listing_id',
        'property_name',
        'property_details_url',
    ]
    # Other listing attributes left out of the model inputs.
    attribute_cols = [
        'total_num_units',
        'available_unit_types',
        'elevation',
        'built_year',
    ]
    # Raw categorical columns (the preprocessed frame also carries
    # `<name>_*` indicator columns for these, which are kept).
    raw_categorical_cols = [
        'tenure',
        'property_type',
        'floor_level',
        'furnishing',
    ]

    unwanted = identifier_cols + attribute_cols + raw_categorical_cols
    return df.drop(labels=unwanted, axis='columns')

In [6]:
# Keep only the modelling columns, then preview the first rows.
# (To inspect rows with missing values instead, run:
#  df_train[df_train.isna().any(axis=1)] )
df_train = df.pipe(drop_cols_for_regression)
df_train.head(5)

Unnamed: 0,num_beds,num_baths,size_sqft,lat,lng,subzone,planning_area,price,property_type_private,property_type_public,tenure_high_year,tenure_low_year,floor_level_ground,floor_level_high,floor_level_low,floor_level_mid,floor_level_penthouse,floor_level_top,furnishing_partial,furnishing_unfurnished,furnishing_unspecified
0,3.0,2.0,1115,1.414399,103.837196,yishun south,yishun,514500.0,0,1,0,0,0,0,0,0,0,0,0,0,1
1,4.0,2.0,1575,1.372597,103.875625,serangoon north,serangoon,995400.0,0,1,0,1,0,0,0,0,0,0,0,0,1
2,4.0,6.0,3070,1.298773,103.895798,mountbatten,marine parade,8485000.0,1,0,1,0,0,0,1,0,0,0,1,0,0
3,3.0,2.0,958,1.312364,103.803271,farrer court,bukit timah,2626000.0,1,0,1,0,0,0,0,0,0,0,1,0,0
4,2.0,1.0,732,1.273959,103.843635,anson,downtown core,1764000.0,1,0,0,1,0,0,0,0,0,0,0,0,1


In [7]:
# Separate the target (`price`) from the feature columns.
y_train = df_train['price']
X_train = df_train.drop(columns='price')

In [8]:
# Column groups handled by the feature preprocessor.
numeric_features = ["size_sqft", "num_beds", "num_baths"]
categorical_features = ["subzone", "planning_area"]

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fitting ignored at transform time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# NOTE(review): make_column_transformer drops every column not listed here
# (remainder="drop" is the default), so lat/lng and the dummy columns
# (property_type_*, tenure_*, floor_level_*, furnishing_*) never reach the
# models — confirm this is intended.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
)

In [9]:
def run_models(models, X, y, cv=5):
    """Cross-validate and fit each candidate regressor on a shared pipeline.

    Each model is wrapped as
    ``preprocessor -> PolynomialFeatures(2) -> model`` (``preprocessor`` is
    the notebook-level column transformer) inside a
    ``TransformedTargetRegressor`` that standardises the target for fitting.
    The wrapper is scored with cross-validation and then fitted on all of
    ``X``/``y``.

    Parameters
    ----------
    models : iterable of estimators
        Unfitted regressors to evaluate.
    X, y : features and target passed to ``cross_validate`` and ``fit``.
    cv : int or CV splitter, default 5
        Forwarded to ``cross_validate``.

    Returns
    -------
    dict
        Maps a model name to ``[fitted_regressor, cv_results]``. The name is
        the estimator's class name; if the same class appears more than once,
        later entries get a ``_2``, ``_3``, ... suffix instead of silently
        overwriting the earlier result.

    Notes
    -----
    ``cv_results['test_score']`` holds plain (positive) RMSE values, so lower
    is better.
    """
    def rmse(y_true, y_pred):
        return np.sqrt(mean_squared_error(y_true, y_pred))

    # The scorer does not depend on the model, so build it once.
    rmse_scorer = make_scorer(rmse)

    model_scores = dict()

    for model in models:
        # Derive a unique key so duplicate estimator classes do not collide.
        base_name = model.__class__.__name__
        name, suffix = base_name, 1
        while name in model_scores:
            suffix += 1
            name = "{}_{}".format(base_name, suffix)

        pipe = make_pipeline(preprocessor, PolynomialFeatures(2), model)
        regr = TransformedTargetRegressor(regressor=pipe, transformer=StandardScaler())
        scores = cross_validate(regr, X, y, cv=cv, scoring=rmse_scorer)
        # Refit on the full data so the stored regressor can be used to predict.
        regr.fit(X, y)
        model_scores[name] = [regr, scores]
        print(name, scores['test_score'].mean())

    return model_scores

In [10]:
# Fixed seed so the stochastic learners give repeatable scores (and hence a
# repeatable "best model") across Restart & Run All. Outputs recorded before
# this seed was introduced will differ after a re-run.
SEED = 42

# Candidate regressors, all evaluated on the same pipeline by `run_models`.
models = [
    KNeighborsRegressor(),
    Lasso(),
    Ridge(),
    DecisionTreeRegressor(random_state=SEED),
    GradientBoostingRegressor(random_state=SEED),
    AdaBoostRegressor(random_state=SEED),
    RandomForestRegressor(max_depth=10, random_state=SEED),
    xgb.XGBRegressor(random_state=SEED),
]

model_scores = run_models(models, X_train, y_train)

KNeighborsRegressor 1010385.5027917067
Lasso 2679027.59366869
Ridge 1055443.4461157492
DecisionTreeRegressor 944463.2907242803
GradientBoostingRegressor 1124872.887463098
AdaBoostRegressor 1740795.863177313
RandomForestRegressor 1141290.1868469215
XGBRegressor 846171.6029833553


In [11]:
import math

# Pick the fitted estimator with the lowest mean cross-validated RMSE,
# printing every candidate's mean score along the way.
best_score, best_model = math.inf, None

for model_name, (fitted, cv_result) in model_scores.items():
    mean_rmse = cv_result['test_score'].mean()

    print(model_name, mean_rmse)

    if mean_rmse < best_score:
        best_score, best_model = mean_rmse, fitted
print(best_model)

KNeighborsRegressor 1010385.5027917067
Lasso 2679027.59366869
Ridge 1055443.4461157492
DecisionTreeRegressor 944463.2907242803
GradientBoostingRegressor 1124872.887463098
AdaBoostRegressor 1740795.863177313
RandomForestRegressor 1141290.1868469215
XGBRegressor 846171.6029833553
TransformedTargetRegressor(regressor=Pipeline(steps=[('columntransformer',
                                                      ColumnTransformer(transformers=[('standardscaler',
                                                                                       StandardScaler(),
                                                                                       ['size_sqft',
                                                                                        'num_beds',
                                                                                        'num_baths']),
                                                                                      ('onehotencoder',
                            

In [12]:
# Load the test listings and preprocess them with the project helper.
test_raw = pd.read_csv('../data/test.csv')
df_test = preprocess(test_raw, is_target=True)

# Auxiliary-data join (currently disabled):
#df_test = join_aux(df_test, adfs)
X_test = drop_cols_for_regression(df_test)

# Number of test rows to be predicted.
print(len(X_test))

# Predict with the selected model and keep the predictions next to the
# features in a `Predicted` column.
y_predict = best_model.predict(X_test)
X_test = X_test.assign(Predicted=y_predict)

6966


In [13]:
# Show the raw predictions (long arrays are abbreviated when displayed).
y_predict

array([1186902. , 1833670.9, 1121824.1, ..., 3020575.5, 1176003.2,
       3671173.2], dtype=float32)

In [14]:
# Write the predictions to CSV: the frame's index becomes the `id` column and
# `Predicted` is the only value column.
submission = X_test.loc[:, ['Predicted']]
submission.to_csv(
    'submission3.csv',
    columns=['Predicted'],
    header=True,
    index=True,
    index_label='id',
)