In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
 
# setting path
sys.path.append('../')

from utils.processing import preprocess

In [3]:
# Read the training listings and pass them straight through the project's preprocess helper.
df = preprocess(pd.read_csv('../data/train.csv'))

In [4]:
# Preview the first rows of the preprocessed training frame.
df.head()

Unnamed: 0,listing_id,title,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,...,tenure_low_year,floor_level_ground,floor_level_high,floor_level_low,floor_level_mid,floor_level_penthouse,floor_level_top,furnishing_partial,furnishing_unfurnished,furnishing_unspecified
0,122881,hdb flat for sale in 866 yishun street 81,sembawang / yishun (d27),866 yishun street 81,property_type_public,,1988.0,3.0,2.0,1115,...,0,0,0,0,0,0,0,0,0,1
1,259374,hdb flat for sale in 506b serangoon north aven...,hougang / punggol / sengkang (d19),hdb-serangoon estate,property_type_public,tenure_low_year,1992.0,4.0,2.0,1575,...,1,0,0,0,0,0,0,0,0,1
2,665422,4 bed condo for sale in meyerhouse,128 meyer road,meyerhouse,property_type_private,tenure_high_year,2022.0,4.0,6.0,3070,...,0,0,0,1,0,0,0,1,0,0
3,857699,3 bed condo for sale in leedon green,26 leedon heights,leedon green,property_type_private,tenure_high_year,2023.0,3.0,2.0,958,...,0,0,0,0,0,0,0,1,0,0
4,216061,2 bed condo for sale in one bernam,1 bernam street,one bernam,property_type_private,tenure_low_year,2026.0,2.0,1.0,732,...,1,0,0,0,0,0,0,0,0,1


In [6]:
# drop some columns not useful for prediction

def drop_cols_for_regression(df):
    """Return a copy of ``df`` without the columns excluded from regression.

    Two groups of columns are removed in a single pass:

    * listing identifiers / free-text / metadata columns that are not useful
      for prediction, and
    * columns that were set aside after exploring the data.

    Previously the result of the first drop was discarded and only the second
    group was actually removed; both groups are now dropped.

    Args:
        df: DataFrame that contains every column listed below.

    Returns:
        A new DataFrame; ``df`` itself is not modified.

    Raises:
        KeyError: if any of the listed columns is missing from ``df``.
    """
    listing_cols = [
        'address',
        'title',
        'listing_id',
        'property_name',
        'total_num_units',
        'available_unit_types',
        'property_details_url',
    ]
    explored_cols = ['elevation', 'tenure', 'property_type', 'floor_level', 'furnishing']
    return df.drop(columns=listing_cols + explored_cols)

# Apply the column-dropping helper to the preprocessed training frame.
df_train = drop_cols_for_regression(df)

In [7]:
# Explore selected columns: how many rows take each value of property_type_private.
df['property_type_private'].value_counts()

1    12431
0     7333
Name: property_type_private, dtype: int64

In [8]:
# Count the missing values in planning_area.
df['planning_area'].isna().sum()

0

In [9]:
# Remove a further set of columns after exploring the data.
explored_cols = ['elevation', 'tenure', 'property_type', 'floor_level', 'furnishing']
df_after_explore = df.drop(columns=explored_cols)

In [10]:
# Set up feature preprocessing: standard-scale the numeric feature columns.
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Numeric columns fed to the models; each is standardised by the pipeline below.
numeric_features = ['size_sqft', 'num_beds', 'num_baths']
numeric_transformer = make_pipeline(StandardScaler())

# No `remainder` is given, so make_column_transformer uses its default ('drop'):
# every column not listed in numeric_features is discarded before modelling.
preprocessor = make_column_transformer((numeric_transformer, numeric_features))

In [11]:
# Separate the regression target (price) from the predictor columns.
y_train = df_train['price']
X_train = df_train.drop(columns='price')

In [12]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.compose import TransformedTargetRegressor

def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

def rmse_scorer():
    """Build a scorer object around ``rmse`` for use in cross-validation.

    ``make_scorer`` is called with its defaults, so the scores reported are the
    plain (positive) RMSE values, where lower is better.
    """
    scorer = make_scorer(rmse)
    return scorer


# Regressor classes to compare; each is instantiated with its default settings.
candidate_classes = (
    KNeighborsRegressor,
    Lasso,
    Ridge,
    DecisionTreeRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
    RandomForestRegressor,
)

models, model_scores = [], []
for model in candidate_classes:
    candidate = model()
    pipe = make_pipeline(preprocessor, candidate)
    # The target is standardised for fitting; predictions are mapped back to
    # the original scale by TransformedTargetRegressor.
    regr = TransformedTargetRegressor(
        regressor=pipe,
        transformer=StandardScaler(),
    )

    # 5-fold cross-validated RMSE, averaged over the folds.
    scores = cross_validate(regr, X_train, y_train, cv=5, scoring=rmse_scorer())
    mean_rmse = scores['test_score'].mean()

    # Fit on the full training set so the stored model can be used for prediction later.
    regr.fit(X_train, y_train)
    models.append(regr)
    model_scores.append(mean_rmse)
    print(model.__name__, mean_rmse)

KNeighborsRegressor 1469484.1061346997
Lasso 3172440.323409148
Ridge 1773034.843367948
DecisionTreeRegressor 1459811.9929123
GradientBoostingRegressor 1466833.3135607387
AdaBoostRegressor 1823367.7798214399
RandomForestRegressor 1314880.224013635


In [29]:
# Load the test listings, preprocess them, and apply the same column-dropping helper
# that was used for the training frame.
# NOTE(review): is_target=True is passed for the test file; confirm against
# preprocess's definition that this is the intended flag.
df_test = preprocess(pd.read_csv('../data/test.csv'), is_target=True)
X_test = drop_cols_for_regression(df_test)
print(len(X_test))

6966


In [30]:
# The model with the lowest mean cross-validated RMSE produces the test predictions.
index_min = np.argmin(model_scores)
best_model = models[index_min]
y_predict = best_model.predict(X_test)

# Store the predictions alongside the test features.
X_test['Predicted'] = y_predict

In [31]:
# Write the predictions to submission.csv, exporting the row index as the 'id' column.
submission = X_test[['Predicted']]
submission.to_csv(
    'submission.csv',
    index=True,
    index_label='id',
    header=True,
    columns=['Predicted'],
)