# Project 2 - Singapore Housing Data and Kaggle Challenge

## Part 6 - Kaggle Submission

## Step 1: Data Import

Let us start off by importing the data.

In [1]:
import pandas as pd
import numpy as np

from scipy import stats

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline

import pickle

In [2]:
# Importing data.
test = pd.read_csv('../data/test.csv', low_memory = False)
train_clean = pd.read_csv('../data/04_cleaned_df.csv', low_memory = False)


with open('../data/05_final_lr_model.pkl', 'rb') as file:
    lr = pickle.load(file)

## Step 2. Data Cleaning on test.csv

In [3]:
# Slicing ID column first
test_id = test['id']

In [4]:
# Replacing null values with zeroes in the columns 'Mall_Within_500m', 'Mall_Within_1km', 'Mall_Within_2km'.
test.fillna(value = {'Mall_Within_500m': 0, 'Mall_Within_1km': 0, 'Mall_Within_2km': 0}, 
          inplace = True)

mean_value=test['Mall_Nearest_Distance'].mean()
test.fillna(value = {'Mall_Nearest_Distance': mean_value}, 
          inplace = True)


# Replace null values in 'Mall_Nearest_Distance' to 2000 (ie. 2km)
# Note that this is different from train set, but we can't drop rows for test set
test.fillna(value = {'Mall_Nearest_Distance': 2000}, 
          inplace = True)

# Replacing null values with zeroes in the columns 'Hawker_Within_500m', 'Hawker_Within_1km', 'Hawker_Within_2km'.
test.fillna(value = {'Hawker_Within_500m': 0, 'Hawker_Within_1km': 0, 'Hawker_Within_2km': 0}, 
          inplace = True)

# Converting 'Tranc_YearMonth' to datetime
test['Tranc_YearMonth'] = pd.to_datetime(test['Tranc_YearMonth'], format = '%Y-%m')


# Dropping 'residential'
test.drop(columns = 'residential', inplace = True)

# Replacing booleans
test.replace({'commercial': {'Y':1, 'N':0}, 
           'market_hawker': {'Y':1, 'N':0}, 
           'multistorey_carpark': {'Y':1, 'N':0}, 
           'precinct_pavilion': {'Y':1, 'N':0}}, inplace = True)

# Change columns to lowercase
test.columns = test.columns.str.lower()

# Creating new feature
test['age_when_sold'] = test['tranc_year'] - test['lease_commence_date']

# Dropping other columns
test.drop(columns = ['id','tranc_yearmonth', 'tranc_month','tranc_year', 'lease_commence_date',
                   'town', 'flat_type', 'block', 'street_name', 'storey_range', 'flat_model', 
                   'lower', 'upper', 'mid',
                   'address', 'floor_area_sqft', 'hdb_age', 'year_completed', '1room_sold', 
                   '2room_sold','3room_sold', '4room_sold', '5room_sold', 'exec_sold', 
                   'multigen_sold', 'studio_apartment_sold', '1room_rental', '2room_rental', 
                   '3room_rental', 'other_room_rental', 'postal', 'latitude', 'longitude',
                   'mall_within_500m', 'mall_within_1km', 'mall_within_2km', 'hawker_within_500m',
                   'hawker_within_1km', 'hawker_within_2km', 'mrt_name','mrt_latitude', 
                   'mrt_longitude', 'bus_stop_name', 'bus_stop_latitude', 'bus_stop_longitude', 
                   'pri_sch_latitude', 'pri_sch_longitude', 
                   'sec_sch_latitude', 'sec_sch_longitude'], inplace = True)

# Dropping additional columns
test.drop(columns = ['cutoff_point', 'vacancy', 'total_dwelling_units', 'hawker_food_stalls',
                  'bus_stop_nearest_distance', 'max_floor_lvl', 'market_hawker', 
                   'multistorey_carpark', 'precinct_pavilion', 'hawker_market_stalls',
                  'bus_interchange', 'affiliation'], inplace = True)

In [5]:
test.shape

(16737, 15)

## Step 3. Fit-transform on cleaned trained data

In [6]:
# Defining numerical columns and categorical columns

num_columns = ['floor_area_sqm', 'mid_storey', 
                'mall_nearest_distance', 'hawker_nearest_distance',
                'mrt_nearest_distance',
               'pri_sch_nearest_distance', 
               'sec_sch_nearest_dist',
                'age_when_sold']

cat_columns = ['full_flat_type', 'commercial', 
               'planning_area',  'mrt_interchange',  
                'pri_sch_affiliation', 'pri_sch_name', 'sec_sch_name']

In [7]:
X = train_clean.drop(columns= 'resale_price')
y = train_clean['resale_price']

In [8]:
# Train test split on clean trained data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [9]:
# Test data
X_test_kaggle = test

In [10]:
# Instantiating OHE, SS and CT
ohe = OneHotEncoder(min_frequency = 10)
ss = StandardScaler()
ct = make_column_transformer(
    (ohe, cat_columns),
    (ss, num_columns))

In [11]:
ct

In [12]:
# Fit transforming train data, transforming test data
Z_train = ct.fit_transform(X_train)
Z_test = ct.transform(X_test_kaggle)

In [13]:
# Creating prediction from test data
y_test_preds_final = lr.predict(Z_test)

In [14]:
y_test_preds_final

array([367509.87616079, 499175.53739952, 350328.67721479, ...,
       387637.78083723, 520046.9901176 , 379266.50822185])

In [15]:
# Creating a DataFrame from the test id that was sliced earlier
result = pd.DataFrame(test_id)
result

Unnamed: 0,id
0,114982
1,95653
2,40303
3,109506
4,100149
...,...
16732,23347
16733,54003
16734,128921
16735,69352


In [16]:
# Renaming the id column
result.rename(columns = {'id': 'Id'})
# Adding the prediction from test data alongside the id
result['Predicted'] = y_test_preds_final
result

Unnamed: 0,id,Predicted
0,114982,367509.876161
1,95653,499175.537400
2,40303,350328.677215
3,109506,313725.261537
4,100149,427615.851235
...,...,...
16732,23347,349692.298915
16733,54003,506398.780605
16734,128921,387637.780837
16735,69352,520046.990118


In [17]:
# Final upload as csv
result.to_csv('../data/06_final_result.csv', index = False)

## Step 4. Upload to Kaggle

Here is the screenshot:

![](../images/kaggle_score.png)