In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): reloading the extension right after loading it looks redundant — confirm it is needed.
%reload_ext autoreload

In [2]:
import sys
sys.path.append('../')

from utils.processing import *
from utils.visualisation import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.compose import TransformedTargetRegressor, make_column_transformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, MinMaxScaler

# Seed NumPy's global RNG so steps that draw from it are repeatable on a fresh run.
np.random.seed(0)

In [3]:
# Display settings: show up to 50 columns and up to 80 characters per cell.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 80)

In [4]:
# Load the labelled data set and hold out 30% of the rows for validation.
df = pd.read_csv('../data/train.csv')
# An explicit random_state keeps the split identical even when this cell is
# re-run without first re-executing np.random.seed(0); relying on the global
# RNG alone gives a different split on every re-run.
df_train, df_validate = train_test_split(df, test_size=0.30, random_state=0)

# Show one raw training row as a quick look at the available columns.
df_train.iloc[0]

listing_id                                                          224052
title                                         1 bed condo for sale in myra
address                                            9 meyappa chettiar road
property_name                                                         myra
property_type                                                        condo
tenure                                                            freehold
built_year                                                          2024.0
num_beds                                                               1.0
num_baths                                                              1.0
size_sqft                                                              474
floor_level                                                            NaN
furnishing                                                     unspecified
available_unit_types                                         1, 2, 3, 4 br
total_num_units          

In [5]:
def drop_cols_for_regression(df):
    """Return ``df`` without the columns that are excluded from the regression.

    The input frame is left untouched; ``DataFrame.drop`` returns a new frame.
    Every listed column is expected to be present, because ``drop`` is called
    with its default error handling.
    """
    # Original listing fields and location labels.
    listing_fields = (
        'address', 'title', 'listing_id', 'property_name', 'total_num_units',
        'available_unit_types', 'property_details_url', 'elevation', 'tenure',
        'property_type', 'floor_level', 'furnishing', 'built_year',
        'planning_area', 'subzone',
    )
    # Indicator-style columns, grouped by their shared prefix.
    furnishing_flags = tuple(
        f'furnishing_{kind}' for kind in ('partial', 'unfurnished'))
    floor_level_flags = tuple(
        f'floor_level_{level}'
        for level in ('ground', 'high', 'mid', 'low', 'penthouse', 'top'))
    line_flags = tuple(
        f'line_{code}'
        for code in ('cc', 'ce', 'cg', 'dt', 'ew', 'ne', 'ns', 'te'))
    school_flags = (
        'gep_pri_sch_within_1km',
        'gep_pri_sch_within_1km_2km',
        'pri_sch_within_500m',
    )
    cc_type_flags = tuple(
        f'cc_type_{code}' for code in ('CR', 'IEBP', 'IEPB', 'IHL'))

    excluded = [
        *listing_fields,
        *furnishing_flags,
        *floor_level_flags,
        *line_flags,
        *school_flags,
        *cc_type_flags,
        'region',
    ]

    return df.drop(columns=excluded)

def prepare_data_for_regression(df, data_dir='../data'):
    """Join the auxiliary data onto ``df`` and keep only the regression columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame handed to ``join_aux``. Every caller in this notebook passes the
        result of ``preprocess``.
    data_dir : str, default '../data'
        Directory passed to ``read_aux_csv``. The default is the location that
        used to be hard-coded, so existing calls behave as before.

    Returns
    -------
    The frame returned by ``join_aux`` with a 0/1 ``region_c`` column added and
    the columns listed in ``drop_cols_for_regression`` removed.
    """
    adfs = read_aux_csv(data_dir)
    df = join_aux(df, adfs)
    # 'region' itself is dropped below; only the "region equals 'c'" flag is kept.
    df['region_c'] = np.where(df['region'] == 'c', 1, 0)

    return drop_cols_for_regression(df)

In [6]:
# Clean the training split, join the auxiliary data and drop unused columns.
# NOTE(review): df_train is overwritten with its own processed version, so
# running this cell a second time feeds already-processed data back into
# preprocess(); re-run the load/split cell first if this cell must be repeated.
df_train = preprocess(df_train)
df_train = prepare_data_for_regression(df_train)
df_train

Unnamed: 0,num_beds,num_baths,size_sqft,lat,lng,price,property_type_private,property_type_public,tenure_high_year,tenure_low_year,furnishing_unspecified,nearest_mrt_distance_in_km,nearest_pri_sch_distance_in_km,nearest_gep_pri_sch_distance_in_km,gep_pri_sch_outside_2km,pri_sch_outside_500m,nearest_com_centre_distance_in_km,cc_type_BN,nearest_mall_distance_in_km,area_size,population,density,region_c
0,1.0,1.0,474,1.330709,103.868391,1060900.0,1,0,1,0,1,0.131235,0.342137,3.934774,1,0,2.251585,1,0.100566,0.6391,11720,18338.288218,1
1,2.0,2.0,915,1.360919,103.892050,1029900.0,1,0,0,1,1,0.780849,0.366408,2.363387,1,0,0.694122,0,0.339265,1.5155,32320,21326.294952,0
2,4.0,4.0,4717,1.300620,103.892783,8059800.0,1,0,1,0,1,0.973647,0.998307,2.130345,1,1,1.952462,1,1.000584,1.7119,9980,5829.779777,1
3,3.0,1.0,699,1.316655,103.805632,554400.0,0,1,0,1,1,0.212855,0.546077,0.546077,0,1,2.066441,1,1.325490,0.5588,6180,11059.413028,1
4,3.0,3.0,1249,1.315795,103.832096,3738000.0,1,0,1,0,1,0.760161,0.466849,0.466849,0,0,1.340886,1,0.977231,2.0961,9520,4541.768045,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13829,3.0,2.0,979,1.420758,103.838035,514500.0,0,1,0,0,1,0.675683,0.287658,6.676536,1,0,3.438561,0,0.454060,1.3402,42240,31517.683928,0
13830,2.0,2.0,689,1.296394,103.891014,2052600.0,1,0,1,0,1,1.368677,1.442544,2.465268,1,1,2.436429,1,1.402825,1.7119,9980,5829.779777,1
13831,3.0,3.0,1518,1.276125,103.853253,4122000.0,1,0,0,1,0,0.151672,1.486155,2.350847,1,1,1.191965,0,0.530202,0.9793,880,898.601042,1
13832,3.0,3.0,1249,1.314664,103.831084,3934400.0,1,0,1,0,0,0.810220,0.635320,0.635320,0,1,1.492339,1,0.883319,2.0961,9520,4541.768045,1


In [7]:
# Separate the target column from the feature matrix.
y_train = df_train['price']
X_train = df_train.drop(columns=['price'])

In [8]:
def data_pipeline(model=None, polynomial=2):
    """Build the scikit-learn pipeline applied to the regression features.

    The pipeline standardises the numeric columns listed below, passes every
    other column through unchanged, optionally adds a polynomial expansion and
    optionally ends with an estimator.

    Parameters
    ----------
    model : estimator or None, default None
        Final step of the pipeline. When ``None`` the pipeline only transforms.
    polynomial : int, default 2
        Degree handed to ``PolynomialFeatures``. No polynomial step is added
        when the value is 0 or negative.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline assembled by ``make_pipeline``.
    """
    scaled_columns = [
        'num_beds',
        'num_baths',
        'size_sqft',
        'nearest_mrt_distance_in_km',
        'nearest_pri_sch_distance_in_km',
        'nearest_gep_pri_sch_distance_in_km',
        'nearest_com_centre_distance_in_km',
        'nearest_mall_distance_in_km',
        'area_size',
        'population',
        'density',
    ]
    preprocessor = make_column_transformer(
        (StandardScaler(), scaled_columns),
        remainder='passthrough')

    steps = [preprocessor]

    if polynomial > 0:
        steps.append(PolynomialFeatures(polynomial))

    # Identity check: ``!= None`` would dispatch to the object's own ``__ne__``,
    # which is not guaranteed to return a plain bool.
    if model is not None:
        steps.append(model)

    return make_pipeline(*steps)

def pipelined_model(model):
    """Wrap ``model`` so that features and target are both scaled.

    Features go through ``data_pipeline(model)`` (with its default polynomial
    degree); the target is standardised with a ``StandardScaler`` via
    ``TransformedTargetRegressor``.
    """
    feature_pipeline = data_pipeline(model)
    target_scaler = StandardScaler()
    return TransformedTargetRegressor(regressor=feature_pipeline, transformer=target_scaler)

def transform_data(df):
    """Return ``df`` after the scaling-only pipeline, as a labelled DataFrame.

    The pipeline is built without a polynomial step or estimator and is fitted
    on ``df`` itself, so the scaling statistics come from the frame passed in.
    Column names are taken from the fitted pipeline; no index is passed, so the
    result gets a default index.
    """
    scaling_only = data_pipeline(polynomial=0)
    scaling_only.fit(df)

    values = scaling_only.transform(df)
    names = scaling_only.get_feature_names_out()
    return pd.DataFrame(data=values, columns=names)

In [9]:
# Inspect the training features after scaling (scaler fitted on X_train itself).
t_X_train = transform_data(X_train)
t_X_train.head()

Unnamed: 0,standardscaler__num_beds,standardscaler__num_baths,standardscaler__size_sqft,standardscaler__nearest_mrt_distance_in_km,standardscaler__nearest_pri_sch_distance_in_km,standardscaler__nearest_gep_pri_sch_distance_in_km,standardscaler__nearest_com_centre_distance_in_km,standardscaler__nearest_mall_distance_in_km,standardscaler__area_size,standardscaler__population,standardscaler__density,remainder__lat,remainder__lng,remainder__property_type_private,remainder__property_type_public,remainder__tenure_high_year,remainder__tenure_low_year,remainder__furnishing_unspecified,remainder__gep_pri_sch_outside_2km,remainder__pri_sch_outside_500m,remainder__cc_type_BN,remainder__region_c
0,-1.675193,-1.131695,-0.695263,-1.207192,-0.748522,0.165773,0.314224,-1.328525,-0.425303,-0.455532,0.274934,1.330709,103.868391,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
1,-0.878157,-0.432616,-0.436065,-0.013581,-0.702091,-0.396043,-1.155924,-0.868999,-0.090912,0.419935,0.525214,1.360919,103.89205,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
2,0.715915,0.965541,1.79856,0.34067,0.50679,-0.479362,0.03187,0.404127,-0.015976,-0.529479,-0.7728,1.30062,103.892783,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
3,-0.081121,-1.131695,-0.563019,-1.057222,-0.358368,-1.045784,0.13946,1.029614,-0.455942,-0.690973,-0.334757,1.316655,103.805632,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
4,-0.081121,0.266462,-0.239757,-0.051593,-0.509939,-1.07411,-0.545419,0.359171,0.130616,-0.549029,-0.880686,1.315795,103.832096,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0


In [10]:
# Sanity check: list rows that still contain a NaN after transformation
# (the recorded output shows no rows).
t_X_train[t_X_train.isna().any(axis=1)]

Unnamed: 0,standardscaler__num_beds,standardscaler__num_baths,standardscaler__size_sqft,standardscaler__nearest_mrt_distance_in_km,standardscaler__nearest_pri_sch_distance_in_km,standardscaler__nearest_gep_pri_sch_distance_in_km,standardscaler__nearest_com_centre_distance_in_km,standardscaler__nearest_mall_distance_in_km,standardscaler__area_size,standardscaler__population,standardscaler__density,remainder__lat,remainder__lng,remainder__property_type_private,remainder__property_type_public,remainder__tenure_high_year,remainder__tenure_low_year,remainder__furnishing_unspecified,remainder__gep_pri_sch_outside_2km,remainder__pri_sch_outside_500m,remainder__cc_type_BN,remainder__region_c


In [11]:
def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``."""
    return np.sqrt(mean_squared_error(y_true, y_pred))
    
def run_models(models, X, y, cv=10):
    """Cross-validate and then fit every model, printing its mean RMSE.

    Parameters
    ----------
    models : iterable of estimators
        Each one is wrapped with ``pipelined_model`` before evaluation.
    X, y
        Features and target passed to ``cross_validate`` and ``fit``.
    cv : default 10
        Forwarded to ``cross_validate``. The default is the value that used to
        be hard-coded.

    Returns
    -------
    dict
        Maps each model's class name to ``[fitted_regressor, cv_results]``.
        Two models of the same class share a key, so the later one replaces
        the earlier one.
    """
    # The scorer does not depend on the model, so build it once. It uses
    # make_scorer's default sign, so 'test_score' holds plain RMSE values
    # (lower is better).
    rmse_scorer = make_scorer(rmse)

    model_scores = dict()

    for model in models:
        name = model.__class__.__name__
        regr = pipelined_model(model)
        scores = cross_validate(regr, X, y, cv=cv, scoring=rmse_scorer)
        # Fit on all of X, y so the stored regressor can be used for prediction.
        regr.fit(X, y)
        model_scores[name] = [regr, scores]
        print(name, scores['test_score'].mean())

    return model_scores

In [12]:
# Candidate estimators to evaluate; currently only XGBoost with default settings.
models = [
            xgb.XGBRegressor(),
        ]

# Cross-validate and fit each candidate on the training split.
model_scores = run_models(models, X_train, y_train)

XGBRegressor 758334.5291296256


In [13]:
# Build the validation target and features. The price column is removed before
# preprocessing, and the training medians of num_beds / num_baths are handed to
# preprocess() — presumably as fill values for missing entries; confirm in
# utils.processing.
y_validate = df_validate['price']
X_validate = prepare_data_for_regression(preprocess(df_validate.drop('price', axis=1), is_target=True, num_beds=df_train['num_beds'].median(), num_baths=df_train['num_baths'].median()))

# For inspection only: transform_data fits a fresh scaler on the validation
# frame, which is separate from the scaler inside the fitted model pipelines.
t_X_v = transform_data(X_validate)
t_X_v.head()

Unnamed: 0,standardscaler__num_beds,standardscaler__num_baths,standardscaler__size_sqft,standardscaler__nearest_mrt_distance_in_km,standardscaler__nearest_pri_sch_distance_in_km,standardscaler__nearest_gep_pri_sch_distance_in_km,standardscaler__nearest_com_centre_distance_in_km,standardscaler__nearest_mall_distance_in_km,standardscaler__area_size,standardscaler__population,standardscaler__density,remainder__lat,remainder__lng,remainder__property_type_private,remainder__property_type_public,remainder__tenure_high_year,remainder__tenure_low_year,remainder__furnishing_unspecified,remainder__gep_pri_sch_outside_2km,remainder__pri_sch_outside_500m,remainder__cc_type_BN,remainder__region_c
0,0.681451,0.930558,0.009133,-0.485358,0.438822,-0.55377,-0.650078,-0.555592,0.058527,0.106832,-0.174706,1.357815,103.881064,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,-0.866943,-1.102396,-0.070694,-0.081178,0.909779,-0.751929,-0.400462,0.083827,0.091878,-0.667007,-0.972863,1.310429,103.802821,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
2,-0.092746,-0.424745,-0.051139,-0.088027,-0.923569,2.873543,-0.428278,-0.240812,0.282934,3.231257,1.99558,1.446546,103.803916,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
3,2.229846,0.930558,0.065515,0.091244,-0.67912,-0.776326,-0.975115,-1.060358,0.050365,0.427826,0.167178,1.327671,103.841705,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
4,-0.092746,-0.424745,-0.051191,0.828457,-0.810085,1.150104,0.963756,-0.032994,-0.007991,1.617441,1.622717,1.42419,103.844932,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0


In [14]:
# Sanity check: list validation rows that still contain a NaN after
# transformation (the recorded output shows no rows).
t_X_v[t_X_v.isna().any(axis=1)]

Unnamed: 0,standardscaler__num_beds,standardscaler__num_baths,standardscaler__size_sqft,standardscaler__nearest_mrt_distance_in_km,standardscaler__nearest_pri_sch_distance_in_km,standardscaler__nearest_gep_pri_sch_distance_in_km,standardscaler__nearest_com_centre_distance_in_km,standardscaler__nearest_mall_distance_in_km,standardscaler__area_size,standardscaler__population,standardscaler__density,remainder__lat,remainder__lng,remainder__property_type_private,remainder__property_type_public,remainder__tenure_high_year,remainder__tenure_low_year,remainder__furnishing_unspecified,remainder__gep_pri_sch_outside_2km,remainder__pri_sch_outside_500m,remainder__cc_type_BN,remainder__region_c


In [15]:
# Score every fitted model on the held-out validation split.
# X_validate / y_validate are reused from the validation-preparation cell
# above; rebuilding them here only repeated the same preprocessing and CSV
# reads.
# NOTE(review): the recorded validation RMSE (~2.9M) is far above the
# cross-validated RMSE (~0.76M). Worth checking whether
# preprocess(..., is_target=True) treats rows differently from the training path.
for name, (regr, _) in model_scores.items():
    predict_validate = regr.predict(X_validate)

    print(name, rmse(y_validate, predict_validate))

XGBRegressor 2912343.3945819973


In [16]:
# Retrain the chosen model on the complete training file (no hold-out).
fmodel = pipelined_model(xgb.XGBRegressor())

df_ftrain = pd.read_csv('../data/train.csv')
df_ftrain = preprocess(df_ftrain)
df_ftrain = prepare_data_for_regression(df_ftrain)

X_ftrain = df_ftrain.drop('price', axis=1)
y_ftrain = df_ftrain['price']

fmodel.fit(X_ftrain, y_ftrain)
# In-sample error: the model was fitted on these same rows, so this number is
# not an estimate of performance on unseen data.
predict_ftrain = fmodel.predict(X_ftrain)
print(rmse(y_ftrain, predict_ftrain))

265036.0118628543


In [17]:
# Preprocess the test file, passing the num_beds / num_baths medians of the
# full training frame to preprocess() — presumably as fill values; confirm in
# utils.processing.
df_test = preprocess(pd.read_csv('../data/test.csv'), is_target=True, num_beds=df_ftrain['num_beds'].median(), num_baths=df_ftrain['num_baths'].median())

# Join auxiliary data and drop the columns not used by the model.
X_test = prepare_data_for_regression(df_test)

y_predict = fmodel.predict(X_test)

# Store the predictions on X_test; the submission cell reads this column.
# NOTE(review): X_test now carries an extra non-feature column, so rebuild it
# before calling fmodel.predict(X_test) again.
X_test['Predicted'] = y_predict

In [18]:
# Quick look at the raw prediction array.
y_predict

array([1164279.5 , 1513520.4 , 1209036.1 , ..., 3543865.2 ,  505530.84,
       4056774.5 ], dtype=float32)

In [19]:
# Write the predictions to CSV with the frame's index as the 'id' column.
# NOTE(review): this assumes X_test's index matches the row ids expected in
# the submission — confirm preprocess()/join_aux keep the original row order
# and index.
submission = X_test[['Predicted']]
submission.to_csv('submission3.csv', index=True, index_label='id', header=True, columns=['Predicted'])