In [1]:
#import statements

import csv
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Section 1: Data Cleaning and Exploration 
### In this section, we will:
- Import the data
- Clean the data
- Combine the data
- Create and/or generalise relevant variables 
- Find relevant details about the data worth exploring

In [15]:
#Processing the data
flatPrice1990=pd.read_csv('resale-flat-prices-based-on-approval-date-1990-1999.csv')
flatPrice2000=pd.read_csv('resale-flat-prices-based-on-approval-date-2000-feb-2012.csv')
flatPrice2012=pd.read_csv('resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv')
flatPrice2015=pd.read_csv('resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv')
flatPrice2017=pd.read_csv('resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv')

desc1990 = flatPrice1990.isnull().sum()
desc2000 = flatPrice2000.isnull().sum()
desc2012 = flatPrice2012.isnull().sum()
desc2015 = flatPrice2015.isnull().sum()
desc2017 = flatPrice2017.isnull().sum()

flatInfo = pd.concat([desc1990, desc2000, desc2012, desc2015, desc2017], axis=1)
flatInfo.columns = ["1990 to 1999", "2000 to 2012", "2012 to 2014", "2015 to 2016", "2017 onwards" ]
flatInfo

Unnamed: 0,1990 to 1999,2000 to 2012,2012 to 2014,2015 to 2016,2017 onwards
month,0.0,0.0,0.0,0,0
town,0.0,0.0,0.0,0,0
flat_type,0.0,0.0,0.0,0,0
block,0.0,0.0,0.0,0,0
street_name,0.0,0.0,0.0,0,0
storey_range,0.0,0.0,0.0,0,0
floor_area_sqm,0.0,0.0,0.0,0,0
flat_model,0.0,0.0,0.0,0,0
lease_commence_date,0.0,0.0,0.0,0,0
resale_price,0.0,0.0,0.0,0,0


In [31]:
#Attempting to understand what "remaining_lease" refers to
display(flatPrice2015.sample(3))
display(flatPrice2017.sample(3))

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
29942,2016-08,JURONG WEST,EXECUTIVE,861,JURONG WEST ST 81,04 TO 06,146.0,Maisonette,1996,79,580000.0
35207,2016-11,QUEENSTOWN,4 ROOM,56,STRATHMORE AVE,01 TO 03,85.0,Model A,2002,84,530000.0
14451,2015-10,SENGKANG,5 ROOM,260C,SENGKANG EAST WAY,10 TO 12,110.0,Improved,2001,84,508000.0


Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
62892,2019-12,CHOA CHU KANG,5 ROOM,277,CHOA CHU KANG AVE 2,01 TO 03,122.0,Improved,1993,72 years 06 months,420000.0
7648,2017-05,TOA PAYOH,3 ROOM,117,POTONG PASIR AVE 1,04 TO 06,67.0,New Generation,1984,66 years 05 months,282500.0
47113,2019-04,CHOA CHU KANG,4 ROOM,412,CHOA CHU KANG AVE 3,10 TO 12,104.0,Model A,1992,72 years 03 months,330000.0


### **<u>Subsection 1.1: Generalising `remaining_lease` for the entire dataset</u>**
From the above, it seems like "remaining_lease" was a statistic which was introduced starting from the 2015-2016 csv. However, one may also compute an estimation for this variable via taking:

$$ \text{lease\_commence\_date} + 99 \text{ years} - \text{month (or pretty much, the date when the transaction was made)}$$


However, a shortcoming of this method of manual computation is the lack of precision; namely that we may only evaluate the remaining lease to the nearest year. Furthermore, it seems as if the method in which the data was stored differs between the `2015-2016` csv and the `2017 onwards` csvs, with the former storing the data to the nearest year, and the latter storing the data to the nearest month.  

These inconsistencies prove to be a problem, but oh well, what can we do about it. 

As such, noting that most of the data could only compute the `remaining_lease` to the nearest year ($\pm$ 1 year for the data before 2015), we will proceed by using the aforementioned; crude method. Sure, it does not provide a high degree of accuracy, but at least there is consistency at that. 

In [47]:
# Making the collated dataframe and verifying its datatypes
flatPriceCollated = pd.concat([flatPrice1990, flatPrice2000, flatPrice2012, flatPrice2015, flatPrice2017], join = "inner")
display(flatPriceCollated.sample(10))
flatPriceCollated.dtypes

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
60861,2001-09,CLEMENTI,3 ROOM,339,CLEMENTI AVE 5,04 TO 06,82.0,New Generation,1979,180000.0
41754,2001-03,HOUGANG,3 ROOM,210,HOUGANG ST 21,01 TO 03,74.0,Model A,1984,160000.0
269757,2008-09,BEDOK,5 ROOM,98,BEDOK NTH AVE 4,07 TO 09,123.0,Standard,1978,433000.0
79056,2020-09,MARINE PARADE,3 ROOM,34,MARINE CRES,04 TO 06,59.0,Improved,1975,290000.0
265327,1999-07,JURONG WEST,3 ROOM,423,JURONG WEST AVE 1,07 TO 09,74.0,MODEL A,1984,138000.0
285227,1999-12,CHOA CHU KANG,4 ROOM,615,CHOA CHU KANG ST 62,04 TO 06,108.0,MODEL A,1996,270000.0
208557,1998-08,YISHUN,3 ROOM,825,YISHUN ST 81,10 TO 12,73.0,MODEL A,1988,165000.0
79969,1994-10,TAMPINES,EXECUTIVE,942,TAMPINES AVE 5,10 TO 12,146.0,MAISONETTE,1988,438000.0
46847,1993-07,BEDOK,3 ROOM,521,BEDOK NTH AVE 1,10 TO 12,67.0,NEW GENERATION,1979,70000.0
342143,2010-11,YISHUN,5 ROOM,159,YISHUN ST 11,10 TO 12,126.0,Improved,1985,435000.0


month                   object
town                    object
flat_type               object
block                   object
street_name             object
storey_range            object
floor_area_sqm         float64
flat_model              object
lease_commence_date      int64
resale_price           float64
dtype: object

In [49]:
# This is the aforementioned calculation for the remaining lease of a house with respect to when the house was resold. We 
# first implement the creation of the column "remaining_lease_manual"
def calcYearsLeft(row):
  row.remaining_lease_manual = row.lease_commence_date + 99 - int(str(row.month)[:4])
  return row

flatPriceCollated["remaining_lease_manual"] = 0
flatPriceCollated = flatPriceCollated.apply(calcYearsLeft, axis='columns')
display(flatPriceCollated.sample(10))
flatPriceCollated.describe()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease_manual
5288,2012-05,GEYLANG,3 ROOM,31,BALAM RD,01 TO 05,68.0,New Generation,1985,323000.0,72
138608,2004-01,BUKIT PANJANG,4 ROOM,252,BANGKIT RD,10 TO 12,103.0,Model A,1989,190000.0,84
284117,2009-03,PASIR RIS,5 ROOM,188,PASIR RIS ST 12,04 TO 06,122.0,Improved,1993,400000.0,83
27160,2013-06,ANG MO KIO,3 ROOM,586,ANG MO KIO AVE 3,10 TO 12,67.0,New Generation,1979,371000.0,65
59825,2001-08,YISHUN,5 ROOM,331,YISHUN RING RD,01 TO 03,122.0,Improved,1988,276000.0,86
201691,2006-03,BEDOK,5 ROOM,774,BEDOK RESERVOIR VIEW,16 TO 18,115.0,Premium Apartment,2000,334000.0,93
244333,2007-09,TAMPINES,4 ROOM,244,SIMEI ST 5,04 TO 06,105.0,Model A,1997,362000.0,89
41213,2001-03,BUKIT MERAH,3 ROOM,78,INDUS RD,16 TO 18,65.0,Improved,1982,145000.0,80
223707,2006-12,KALLANG/WHAMPOA,3 ROOM,43,BENDEMEER RD,04 TO 06,63.0,Standard,1981,145000.0,74
243058,2007-09,BUKIT PANJANG,5 ROOM,516,JELAPANG RD,10 TO 12,121.0,Improved,1998,288000.0,90


Unnamed: 0,floor_area_sqm,lease_commence_date,resale_price,remaining_lease_manual
count,864616.0,864616.0,864616.0,864616.0
mean,95.678732,1987.590754,302883.5,81.624547
std,25.966473,9.939253,155576.2,10.147514
min,28.0,1966.0,5000.0,44.0
25%,73.0,1980.0,185000.0,75.0
50%,93.0,1986.0,282000.0,83.0
75%,113.0,1995.0,395000.0,90.0
max,307.0,2019.0,1360000.0,101.0


In [59]:
# verifying the validity of my proposed computation

flatPrice2015["remaining_lease_manual"] = 0
flatPrice2015New = flatPrice2015.apply(calcYearsLeft, axis='columns')
flatPrice2015New["remaining_lease_difference"] = abs(flatPrice2015New.remaining_lease_manual - flatPrice2015New.remaining_lease)
display(flatPrice2015New.describe())

def calcYearsDiff2017(row):
  #       72 years 06 months	
  #index: 0123456789
  years = int(str(row.remaining_lease)[:2])
  months = int(str(row.remaining_lease)[9:11]) if (len(row.remaining_lease) > 12) else 0
  row.remaining_lease_difference = abs(row.remaining_lease_manual - ( years + months/12 ) )
  return row

flatPrice2017["remaining_lease_manual"] = 0
flatPrice2017New = flatPrice2017.apply(calcYearsLeft, axis='columns')
flatPrice2017New["remaining_lease_difference"] = 0
flatPrice2017New = flatPrice2017New.apply(calcYearsDiff2017, axis='columns')
display(flatPrice2017New.describe())

Unnamed: 0,floor_area_sqm,lease_commence_date,remaining_lease,resale_price,remaining_lease_manual,remaining_lease_difference
count,37153.0,37153.0,37153.0,37153.0,37153.0,37153.0
mean,97.020386,1990.920195,73.913116,436862.8,74.398756,0.486717
std,24.19836,10.86233,10.885456,135805.2,10.847577,0.49983
min,31.0,1966.0,48.0,190000.0,49.0,0.0
25%,74.0,1984.0,66.0,340000.0,67.0,0.0
50%,96.0,1989.0,72.0,408000.0,72.0,0.0
75%,111.0,2000.0,83.0,495000.0,83.0,1.0
max,280.0,2013.0,97.0,1150000.0,97.0,1.0


Unnamed: 0,floor_area_sqm,lease_commence_date,resale_price,remaining_lease_manual,remaining_lease_difference
count,118413.0,118413.0,118413.0,118413.0,118413.0
mean,97.837401,1995.101568,460645.8,74.897866,0.328707
std,24.120851,13.444761,159298.3,13.335732,0.235065
min,31.0,1966.0,140000.0,44.0,0.0
25%,82.0,1985.0,345000.0,64.0,0.166667
50%,94.0,1996.0,430000.0,75.0,0.25
75%,113.0,2005.0,540000.0,85.0,0.5
max,249.0,2019.0,1360000.0,98.0,1.083333


As we may see from the results, the mean of `remaining_lease_difference` hovers around 0.3 to 0.5, which is explected. This shows that our method of computing the `remaining_lease` has a very low deviation from the ones provided in the government dataset. As such, this variable is valid to use in our analysis of the dataset. 

### **<u>Subsection 1.2: Creating a variable `resale_price_adjusted` to adjust for inflation</u>**
From the above, it seems like "remaining_lease" was a statistic which was introduced starting from the 2015-2016 csv. However, one may also compute an estimation for this variable via taking:

$$ \text{lease\_commence\_date} + 99 \text{ years} - \text{month (or pretty much, the date when the transaction was made)}$$


However, a shortcoming of this method of manual computation is the lack of precision; namely that we may only evaluate the remaining lease to the nearest year. Furthermore, it seems as if the method in which the data was stored differs between the `2015-2016` csv and the `2017 onwards` csvs, with the former storing the data to the nearest year, and the latter storing the data to the nearest month.  

These inconsistencies prove to be a problem, but oh well, what can we do about it. 

As such, noting that most of the data could only compute the `remaining_lease` to the nearest year ($\pm$ 1 year for the data before 2015), we will proceed by using the aforementioned; crude method. Sure, it does not provide a high degree of accuracy, but at least there is consistency at that. 