In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
import sys
sys.path.append('../')

from utils.processing import *
from utils.visualisation import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.compose import TransformedTargetRegressor, make_column_transformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.decomposition import PCA, TruncatedSVD, SparsePCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import DBSCAN

np.random.seed(0)

In [3]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 80)

In [4]:
# load data set
df = pd.read_csv('../data/train.csv')
df_train_orig, df_validate_orig = train_test_split(df, test_size=0.30)

df_train_orig.iloc[0]

df_train_orig = df.copy()

In [5]:
# Utility functions

def prepare_data_for_regression(df, drop_columns=[]):
    adfs = read_aux_csv('../data')
    df = join_aux(df, adfs)
    df = df.drop(columns=drop_columns)

    return df

def rmse(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    score = np.sqrt(mse)
    return score

def rmse_scorer():
    return make_scorer(rmse, greater_is_better=False)

# Constants
initial_drop_columns = [
        'address',
        'title',
        'listing_id',
        'property_name',
        'available_unit_types',
        'property_details_url',
        'elevation',
        'tenure',
        'property_type',
        'floor_level',
        'furnishing',
    ]

In [6]:
# Prepare training data set
df_train = preprocess(df_train_orig.copy())
df_train = prepare_data_for_regression(df_train, drop_columns=initial_drop_columns)
df_train

Unnamed: 0,built_year,num_beds,num_baths,size_sqft,total_num_units,lat,lng,subzone,planning_area,price,property_type_private,property_type_public,tenure_high_year,tenure_low_year,floor_level_ground,floor_level_high,floor_level_low,floor_level_mid,floor_level_penthouse,floor_level_top,furnishing_partial,furnishing_unfurnished,furnishing_unspecified,nearest_mrt_distance_in_km,line_cc,line_ce,line_cg,line_dt,line_ew,line_ne,line_ns,line_te,nearest_pri_sch_distance_in_km,nearest_gep_pri_sch_distance_in_km,gep_pri_sch_within_1km,gep_pri_sch_within_1km_2km,gep_pri_sch_outside_2km,pri_sch_within_500m,pri_sch_outside_500m,nearest_com_centre_distance_in_km,cc_type_BN,cc_type_CR,cc_type_IEBP,cc_type_IHL,nearest_mall_distance_in_km,area_size,population,density,region
0,1988.0,3.0,2.0,1115,116.0,1.414399,103.837196,yishun south,yishun,514500.0,0,1,0,0,0,0,0,0,0,0,0,0,1,0.574204,0,0,0,0,0,0,1,0,0.275582,6.194099,0,0,1,1,0,3.338999,0,1,0,0,0.621334,1.3402,42240,31517.683928,n
1,1992.0,4.0,2.0,1575,,1.372597,103.875625,serangoon north,serangoon,995400.0,0,1,0,1,0,0,0,0,0,0,0,0,1,1.734303,0,0,0,0,0,1,0,0,0.122925,0.122925,1,0,0,1,0,2.401882,1,0,0,0,0.552544,0.6847,15940,23280.268731,ne
2,2022.0,4.0,6.0,3070,56.0,1.298773,103.895798,mountbatten,marine parade,8485000.0,1,0,1,0,0,0,1,0,0,0,1,0,0,1.319766,1,0,0,0,0,0,0,0,0.891475,1.872702,0,1,0,0,1,2.171328,1,0,0,0,0.824208,1.7119,9980,5829.779777,c
3,2023.0,3.0,2.0,958,638.0,1.312364,103.803271,farrer court,bukit timah,2626000.0,1,0,1,0,0,0,0,0,0,0,1,0,0,0.726006,1,0,0,0,0,0,0,0,1.090371,1.090371,0,1,0,0,1,1.605928,1,0,0,0,0.907063,0.5588,6180,11059.413028,c
4,2026.0,2.0,1.0,732,351.0,1.273959,103.843635,anson,downtown core,1764000.0,1,0,0,1,0,0,0,0,0,0,0,0,1,0.371115,0,0,0,0,1,0,0,0,0.464835,2.742534,0,0,1,1,0,1.869574,0,1,0,0,0.430415,0.1032,80,775.193798,c
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19838,2026.0,2.0,2.0,635,605.0,1.385938,103.834466,tagore,ang mo kio,1050000.0,1,0,0,1,0,0,0,0,0,0,0,0,1,0.149941,0,0,0,0,0,0,0,1,0.790370,4.691173,0,0,1,0,1,3.163579,0,0,1,0,2.014133,3.3342,7950,2384.380061,ne
19839,2026.0,2.0,2.0,883,137.0,1.315948,103.857589,lavender,kallang,2087400.0,1,0,1,0,0,0,0,0,0,0,0,0,1,0.443603,0,0,0,0,0,1,0,0,0.660929,2.178475,0,0,1,0,1,1.641446,1,0,0,0,0.535185,0.7569,9690,12802.219580,c
19840,2023.0,4.0,4.0,1378,340.0,1.315961,103.836848,moulmein,novena,4193700.0,1,0,1,0,0,0,0,0,0,0,0,0,1,0.424244,0,0,0,1,0,0,1,0,0.367552,0.367552,1,0,0,1,0,0.856170,1,0,0,0,0.764410,1.3475,9300,6901.669759,c
19841,2017.0,3.0,2.0,1205,402.0,1.440753,103.806671,woodlands east,woodlands,754800.0,0,1,0,1,0,1,0,0,0,0,0,1,0,0.631718,0,0,0,0,0,0,1,0,0.230075,10.658711,0,0,1,1,0,2.165768,0,0,1,0,0.601720,2.5535,98980,38762.482867,n


In [7]:
X_train = df_train.drop('price', axis=1)
y_train = df_train['price']

In [8]:
# Baseline prediction model, use linear regression 
ct = make_column_transformer((OneHotEncoder(handle_unknown='ignore'), ["region", "subzone", "planning_area"]))
baseline = make_pipeline(ct, LinearRegression()).fit(X_train, y_train)
print(rmse(y_train, baseline.predict(X_train)))

2294349.18087949


In [9]:
def data_pipeline(model=None, polynomial=2):
    preprocessor = make_column_transformer(
                    (make_pipeline(KNNImputer(), StandardScaler()), ['num_beds', 'num_baths', 'built_year', 'total_num_units']),
                    (StandardScaler(), ['nearest_mrt_distance_in_km', 'nearest_pri_sch_distance_in_km', 'nearest_gep_pri_sch_distance_in_km', 'nearest_com_centre_distance_in_km', 'nearest_mall_distance_in_km', 'area_size', 'population', 'density']),
                    (OneHotEncoder(sparse=False, handle_unknown='ignore'), ["region", "subzone", "planning_area"]),
                    remainder='passthrough')
    
    pipeline = [preprocessor]
    
    if polynomial > 0:
        pipeline.append(PolynomialFeatures(polynomial))
    
    if model != None:
        pipeline.append(model)
        
    return make_pipeline(*pipeline)

def pipelined_model(model, polynomial=2):
    return TransformedTargetRegressor(regressor=data_pipeline(model, polynomial), transformer=StandardScaler())

def transform_data(df):
    p = data_pipeline(polynomial=0)
    p.fit(df)
    
    return pd.DataFrame(data=p.transform(df), columns=p.get_feature_names_out())

In [10]:
t_X_train = transform_data(X_train)
t_X_train.head()

Unnamed: 0,pipeline__num_beds,pipeline__num_baths,pipeline__built_year,pipeline__total_num_units,standardscaler__nearest_mrt_distance_in_km,standardscaler__nearest_pri_sch_distance_in_km,standardscaler__nearest_gep_pri_sch_distance_in_km,standardscaler__nearest_com_centre_distance_in_km,standardscaler__nearest_mall_distance_in_km,standardscaler__area_size,standardscaler__population,standardscaler__density,onehotencoder__region_c,onehotencoder__region_e,onehotencoder__region_n,onehotencoder__region_ne,onehotencoder__region_w,onehotencoder__subzone_admiralty,onehotencoder__subzone_alexandra hill,onehotencoder__subzone_alexandra north,onehotencoder__subzone_aljunied,onehotencoder__subzone_anak bukit,onehotencoder__subzone_anchorvale,onehotencoder__subzone_ang mo kio town centre,onehotencoder__subzone_anson,...,remainder__floor_level_high,remainder__floor_level_low,remainder__floor_level_mid,remainder__floor_level_penthouse,remainder__floor_level_top,remainder__furnishing_partial,remainder__furnishing_unfurnished,remainder__furnishing_unspecified,remainder__line_cc,remainder__line_ce,remainder__line_cg,remainder__line_dt,remainder__line_ew,remainder__line_ne,remainder__line_ns,remainder__line_te,remainder__gep_pri_sch_within_1km,remainder__gep_pri_sch_within_1km_2km,remainder__gep_pri_sch_outside_2km,remainder__pri_sch_within_500m,remainder__pri_sch_outside_500m,remainder__cc_type_BN,remainder__cc_type_CR,remainder__cc_type_IEBP,remainder__cc_type_IHL
0,-0.075132,-0.428648,-1.435178,-0.773425,-0.388973,-0.878327,0.987085,1.349263,-0.323959,-0.158765,0.837688,1.378484,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.719733,-0.428648,-1.180783,-0.861413,1.751708,-1.172786,-1.192262,0.462318,-0.456667,-0.407354,-0.276501,0.689146,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.719733,2.355255,0.727179,-0.958015,0.986781,0.309661,-0.564151,0.244108,0.067416,-0.017802,-0.528994,-0.771177,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,-0.075132,-0.428648,0.790778,0.832507,-0.10886,0.693308,-0.844982,-0.291021,0.227256,-0.4551,-0.689979,-0.333542,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,-0.869998,-1.124623,0.981574,-0.050448,-0.763724,-0.513281,-0.251911,-0.04149,-0.692271,-0.627881,-0.948403,-1.194164,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [11]:
t_X_train[t_X_train.isna().any(axis=1)]

Unnamed: 0,pipeline__num_beds,pipeline__num_baths,pipeline__built_year,pipeline__total_num_units,standardscaler__nearest_mrt_distance_in_km,standardscaler__nearest_pri_sch_distance_in_km,standardscaler__nearest_gep_pri_sch_distance_in_km,standardscaler__nearest_com_centre_distance_in_km,standardscaler__nearest_mall_distance_in_km,standardscaler__area_size,standardscaler__population,standardscaler__density,onehotencoder__region_c,onehotencoder__region_e,onehotencoder__region_n,onehotencoder__region_ne,onehotencoder__region_w,onehotencoder__subzone_admiralty,onehotencoder__subzone_alexandra hill,onehotencoder__subzone_alexandra north,onehotencoder__subzone_aljunied,onehotencoder__subzone_anak bukit,onehotencoder__subzone_anchorvale,onehotencoder__subzone_ang mo kio town centre,onehotencoder__subzone_anson,...,remainder__floor_level_high,remainder__floor_level_low,remainder__floor_level_mid,remainder__floor_level_penthouse,remainder__floor_level_top,remainder__furnishing_partial,remainder__furnishing_unfurnished,remainder__furnishing_unspecified,remainder__line_cc,remainder__line_ce,remainder__line_cg,remainder__line_dt,remainder__line_ew,remainder__line_ne,remainder__line_ns,remainder__line_te,remainder__gep_pri_sch_within_1km,remainder__gep_pri_sch_within_1km_2km,remainder__gep_pri_sch_outside_2km,remainder__pri_sch_within_500m,remainder__pri_sch_outside_500m,remainder__cc_type_BN,remainder__cc_type_CR,remainder__cc_type_IEBP,remainder__cc_type_IHL


In [12]:
def run_models(models, X, y):    
    model_scores = dict()
    
    for i, model in enumerate(models):
        regr = pipelined_model(model, polynomial=0)
  
        scores = cross_validate(regr, X, y, cv=5, scoring=rmse_scorer())
        regr.fit(X, y)
        model_scores[model.__class__.__name__ + str(i)] = [regr, scores]
        print(model.__class__.__name__, scores['test_score'].mean())
        
    return model_scores

In [20]:
models = [
            LinearRegression(),
            Lasso(),
            Ridge(),
            KNeighborsRegressor(),
            xgb.XGBRegressor(),
        ]


model_scores = run_models(models, X_train, y_train)

LinearRegression -1176790833488.5337
Lasso -1848825.799838389
Ridge -1294217.504905371
KNeighborsRegressor -1212904.3812863408
XGBRegressor -731092.5683477728


In [21]:

def run_selection(models, X, y):
    model_scores = dict()
    
    ct = make_column_transformer(
                        (make_pipeline(KNNImputer(), StandardScaler()), ['num_beds', 'num_baths', 'built_year', 'total_num_units']),
                        (StandardScaler(), ['nearest_mrt_distance_in_km', 'nearest_pri_sch_distance_in_km', 'nearest_gep_pri_sch_distance_in_km', 'nearest_com_centre_distance_in_km', 'nearest_mall_distance_in_km', 'area_size', 'population', 'density']),
                        (OneHotEncoder(sparse=False, handle_unknown='ignore'), ["region", "subzone", "planning_area"]),
                        remainder='passthrough')
    
    for i, model in enumerate(models):
        regr = make_pipeline(ct, LinearDiscriminantAnalysis(), model)
        scores = cross_validate(regr, X, y, cv=5, scoring=rmse_scorer())
        model_scores[model.__class__.__name__ + str(i)] = [regr, scores]
        print(model.__class__.__name__, scores['test_score'].mean())

    return model_scores


In [22]:
run_selection(models, X_train, y_train)

LinearRegression -1295991.0278268515
Lasso -1295990.8986794981
Ridge -1295990.3778261412
KNeighborsRegressor -1094274.3930703097
XGBRegressor -844223.2833563995


{'LinearRegression0': [Pipeline(steps=[('columntransformer',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('pipeline',
                                                    Pipeline(steps=[('knnimputer',
                                                                     KNNImputer()),
                                                                    ('standardscaler',
                                                                     StandardScaler())]),
                                                    ['num_beds', 'num_baths',
                                                     'built_year',
                                                     'total_num_units']),
                                                   ('standardscaler',
                                                    StandardScaler(),
                                                    ['nearest_mrt_distance_in_km',
                           

In [16]:
y_validate = df_validate_orig['price']
X_validate = prepare_data_for_regression(preprocess(df_validate_orig.drop('price', axis=1), is_target=True))

t_X_v = transform_data(X_validate)
t_X_v.head()

      listing_id                                         title  \
0         297997           4 bed condo for sale in kovan jewel   
1         907902       2 bed condo for sale in hyll on holland   
2         235585      hdb flat for sale in 783c woodlands rise   
3         284232   6 bed condo for sale in tan tong meng tower   
4         746242     hdb flat for sale in 452 yishun ring road   
...          ...                                           ...   
6072      217473  6 bed house for sale in thomson hills estate   
6073      993982            3 bed condo for sale in riversails   
6074      533951         3 bed condo for sale in sunstone hill   
6075      115618  hdb flat for sale in 562 ang mo kio avenue 3   
6076      406027          hdb flat for sale in 443d fajar road   

                                                address         property_name  \
0                                         51 kovan road           kovan jewel   
1                                       89 ho

Unnamed: 0,pipeline__num_beds,pipeline__num_baths,pipeline__built_year,pipeline__total_num_units,standardscaler__nearest_mrt_distance_in_km,standardscaler__nearest_pri_sch_distance_in_km,standardscaler__nearest_gep_pri_sch_distance_in_km,standardscaler__nearest_com_centre_distance_in_km,standardscaler__nearest_mall_distance_in_km,standardscaler__area_size,standardscaler__population,standardscaler__density,onehotencoder__region_c,onehotencoder__region_e,onehotencoder__region_n,onehotencoder__region_ne,onehotencoder__region_w,onehotencoder__subzone_admiralty,onehotencoder__subzone_alexandra hill,onehotencoder__subzone_alexandra north,onehotencoder__subzone_aljunied,onehotencoder__subzone_anak bukit,onehotencoder__subzone_anchorvale,onehotencoder__subzone_ang mo kio town centre,onehotencoder__subzone_anson,...,remainder__floor_level_high,remainder__floor_level_low,remainder__floor_level_mid,remainder__floor_level_penthouse,remainder__floor_level_top,remainder__furnishing_partial,remainder__furnishing_unfurnished,remainder__furnishing_unspecified,remainder__line_cc,remainder__line_ce,remainder__line_cg,remainder__line_dt,remainder__line_ew,remainder__line_ne,remainder__line_ns,remainder__line_te,remainder__gep_pri_sch_within_1km,remainder__gep_pri_sch_within_1km_2km,remainder__gep_pri_sch_outside_2km,remainder__pri_sch_within_500m,remainder__pri_sch_outside_500m,remainder__cc_type_BN,remainder__cc_type_CR,remainder__cc_type_IEBP,remainder__cc_type_IHL
0,0.682822,0.916767,0.790116,-1.000407,-0.485358,0.438822,-0.55377,-0.650078,-0.555592,0.058527,0.106832,-0.174706,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0
1,-0.862389,-1.106424,0.917894,-0.143403,-0.081178,0.909779,-0.751929,-0.400462,0.083827,0.091878,-0.667007,-0.972863,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0
2,-0.089783,-0.432027,0.406781,0.160307,-0.088027,-0.923569,2.873543,-0.428278,-0.240812,0.282934,3.231257,1.99558,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0
3,2.228032,0.916767,-2.148782,-0.9974,0.091244,-0.67912,-0.776326,-0.975115,-1.060358,0.050365,0.427826,0.167178,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0
4,-0.089783,-0.432027,0.087336,1.187509,0.828457,-0.810085,1.150104,0.963756,-0.032994,-0.007991,1.617441,1.622717,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0


In [17]:
t_X_v[t_X_v.isna().any(axis=1)]

Unnamed: 0,pipeline__num_beds,pipeline__num_baths,pipeline__built_year,pipeline__total_num_units,standardscaler__nearest_mrt_distance_in_km,standardscaler__nearest_pri_sch_distance_in_km,standardscaler__nearest_gep_pri_sch_distance_in_km,standardscaler__nearest_com_centre_distance_in_km,standardscaler__nearest_mall_distance_in_km,standardscaler__area_size,standardscaler__population,standardscaler__density,onehotencoder__region_c,onehotencoder__region_e,onehotencoder__region_n,onehotencoder__region_ne,onehotencoder__region_w,onehotencoder__subzone_admiralty,onehotencoder__subzone_alexandra hill,onehotencoder__subzone_alexandra north,onehotencoder__subzone_aljunied,onehotencoder__subzone_anak bukit,onehotencoder__subzone_anchorvale,onehotencoder__subzone_ang mo kio town centre,onehotencoder__subzone_anson,...,remainder__floor_level_high,remainder__floor_level_low,remainder__floor_level_mid,remainder__floor_level_penthouse,remainder__floor_level_top,remainder__furnishing_partial,remainder__furnishing_unfurnished,remainder__furnishing_unspecified,remainder__line_cc,remainder__line_ce,remainder__line_cg,remainder__line_dt,remainder__line_ew,remainder__line_ne,remainder__line_ns,remainder__line_te,remainder__gep_pri_sch_within_1km,remainder__gep_pri_sch_within_1km_2km,remainder__gep_pri_sch_outside_2km,remainder__pri_sch_within_500m,remainder__pri_sch_outside_500m,remainder__cc_type_BN,remainder__cc_type_CR,remainder__cc_type_IEBP,remainder__cc_type_IHL
13,0.682822,0.24237,-1.701559,-0.75383,-0.033091,-1.018943,-0.964635,-1.435244,-1.099215,0.947066,4.579263,1.277845,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0
18,-0.089783,-0.432027,-2.021004,-0.829006,-0.794271,-1.007115,-0.336307,0.358689,-0.969809,-0.38643,-0.348541,0.321743,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0
30,0.682822,-0.432027,-1.446002,-0.862083,-0.653282,-0.171769,-0.299944,-1.175963,-0.479551,0.366034,0.723545,-0.052044,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1
36,-0.089783,-0.432027,-1.446002,-0.889147,0.853583,-0.968545,1.178841,0.978947,0.257066,-0.007991,1.617441,1.622717,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0
43,0.682822,-0.432027,-1.63767,-0.747816,0.242711,-0.230742,1.528426,0.275661,1.278262,0.000987,1.331833,1.268955,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6031,-0.862389,-1.106424,-1.573781,-0.600472,-0.249676,-0.67021,-0.365399,1.32745,0.390656,0.524146,2.509231,0.893604,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0
6038,-0.862389,-0.432027,-1.382113,-0.702711,1.478942,-1.023797,1.246803,0.955215,1.271163,-0.007991,1.617441,1.622717,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0
6045,-0.089783,-0.432027,-1.701559,-0.801943,0.317976,-0.641288,-0.772823,-1.550527,-0.972281,0.947066,4.579263,1.277845,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0
6056,-0.089783,-0.432027,-0.551555,-0.810964,-0.197869,-0.957956,2.526058,-0.445086,-0.351597,-0.321879,0.763143,2.411416,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0


In [19]:
y_validate = df_validate_orig['price']
X_validate = prepare_data_for_regression(preprocess(df_validate_orig.drop('price', axis=1), is_target=True))

for k, v in model_scores.items():
    regr, scores = v
    predict_validate = regr.predict(X_validate)
    
    print(k, rmse(y_validate, predict_validate))

LinearRegression0 21532850.61661274
Lasso1 29103457.251779743
Ridge2 21586365.665886156
KNeighborsRegressor3 3144307.243097033
XGBRegressor4 2906085.5093156993


In [None]:
# Retrain using best model
fmodel = pipelined_model(xgb.XGBRegressor())

df_ftrain = pd.read_csv('../data/train.csv')
df_ftrain = preprocess(df_ftrain)
df_ftrain = prepare_data_for_regression(df_ftrain)
df_ftrain.head()

X_ftrain = df_ftrain.drop('price', axis=1)
y_ftrain = df_ftrain['price']

fmodel.fit(X_ftrain, y_ftrain)
predict_ftrain = fmodel.predict(X_ftrain)
print(rmse(y_ftrain, predict_ftrain))

In [None]:
df_test = preprocess(pd.read_csv('../data/test.csv'), is_target=True)

# further preprocessing
X_test = prepare_data_for_regression(df_test)

y_predict = fmodel.predict(X_test)

X_test['Predicted'] = y_predict

In [None]:
y_predict

In [None]:
submission = X_test[['Predicted']]
submission.to_csv('submission3.csv', index=True, index_label='id', header=True, columns=['Predicted'])