In [1]:
#import statements

import csv
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Data Cleaning and Exploration 
### In this section, we will:
- Import the data
- Clean the data
- Combine the data
- Find relevant details about the data worth exploring

In [15]:
#Processing the data
flatPrice1990=pd.read_csv('resale-flat-prices-based-on-approval-date-1990-1999.csv')
flatPrice2000=pd.read_csv('resale-flat-prices-based-on-approval-date-2000-feb-2012.csv')
flatPrice2012=pd.read_csv('resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv')
flatPrice2015=pd.read_csv('resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv')
flatPrice2017=pd.read_csv('resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv')

desc1990 = flatPrice1990.isnull().sum()
desc2000 = flatPrice2000.isnull().sum()
desc2012 = flatPrice2012.isnull().sum()
desc2015 = flatPrice2015.isnull().sum()
desc2017 = flatPrice2017.isnull().sum()

flatInfo = pd.concat([desc1990, desc2000, desc2012, desc2015, desc2017], axis=1)
flatInfo.columns = ["1990 to 1999", "2000 to 2012", "2012 to 2014", "2015 to 2016", "2017 onwards" ]
flatInfo

Unnamed: 0,1990 to 1999,2000 to 2012,2012 to 2014,2015 to 2016,2017 onwards
month,0.0,0.0,0.0,0,0
town,0.0,0.0,0.0,0,0
flat_type,0.0,0.0,0.0,0,0
block,0.0,0.0,0.0,0,0
street_name,0.0,0.0,0.0,0,0
storey_range,0.0,0.0,0.0,0,0
floor_area_sqm,0.0,0.0,0.0,0,0
flat_model,0.0,0.0,0.0,0,0
lease_commence_date,0.0,0.0,0.0,0,0
resale_price,0.0,0.0,0.0,0,0


In [20]:
#Attempting to understand what "remaining_lease" refers to
flatPrice2015.sample(5)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
33862,2016-10,TAMPINES,4 ROOM,703,TAMPINES ST 71,04 TO 06,100.0,Model A,1997,79,425000.0
33564,2016-10,PUNGGOL,5 ROOM,638A,PUNGGOL DR,04 TO 06,110.0,Premium Apartment,2005,87,432888.0
1283,2015-02,ANG MO KIO,3 ROOM,302,ANG MO KIO AVE 3,07 TO 09,73.0,New Generation,1978,62,348000.0
17276,2015-12,QUEENSTOWN,3 ROOM,165,STIRLING RD,07 TO 09,60.0,Improved,1970,53,320000.0
2079,2015-02,SENGKANG,4 ROOM,309A,ANCHORVALE RD,16 TO 18,90.0,Premium Apartment,2002,86,420000.0


From the above, it seems like "remaining_lease" was a statistic which was introduced starting from the 2015-2016 csv. Its strange that this statistic was not here for all the csvs, but oh well, what can we do about it. 

As such, we combine the dataframes in 2 ways, namely the data from 1990, and the data from 2015. Then, we may still analyse data in regard to the statistic "remaining_lease", albeit with a smaller dataset. 

In [26]:
flatPriceCollated = pd.concat([flatPrice1990, flatPrice2000, flatPrice2012, flatPrice2015, flatPrice2017], join = "inner")
flatPriceFrom2015 = pd.concat([flatPrice2015, flatPrice2017], join = "inner")
display(flatPriceCollated.sample(10))
display(flatPriceFrom2015.sample(10))

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
17932,2016-01,BISHAN,4 ROOM,228,BISHAN ST 23,04 TO 06,105.0,Model A,1992,580000.0
304366,2009-11,BUKIT PANJANG,4 ROOM,655,SENJA RD,01 TO 03,91.0,Premium Apartment,2001,309000.0
15516,1991-04,ANG MO KIO,3 ROOM,330,ANG MO KIO AVE 1,10 TO 12,82.0,NEW GENERATION,1981,60320.0
31679,2016-09,JURONG WEST,5 ROOM,637,JURONG WEST ST 61,10 TO 12,110.0,Improved,2004,495000.0
45813,1993-06,BUKIT BATOK,EXECUTIVE,526,BT BATOK ST 51,01 TO 03,146.0,MAISONETTE,1986,247500.0
104234,1995-10,PASIR RIS,4 ROOM,112,PASIR RIS ST 11,07 TO 09,113.0,MODEL A,1990,256000.0
89097,2021-01,PUNGGOL,5 ROOM,676B,PUNGGOL DR,13 TO 15,113.0,Premium Apartment,2016,652000.0
74967,2002-01,PASIR RIS,5 ROOM,187,PASIR RIS ST 11,07 TO 09,124.0,Improved,1993,323000.0
268931,2008-08,SERANGOON,EXECUTIVE,535,SERANGOON NTH AVE 4,01 TO 03,152.0,Maisonette,1992,430000.0
251628,1999-05,BEDOK,3 ROOM,527,BEDOK NTH ST 3,01 TO 03,68.0,NEW GENERATION,1979,128500.0


Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
2070,2017-02,TAMPINES,EXECUTIVE,147,TAMPINES AVE 5,04 TO 06,150.0,Maisonette,1985,67 years 02 months,700000.0
8896,2017-06,KALLANG/WHAMPOA,EXECUTIVE,11,JLN BATU,01 TO 03,141.0,Apartment,1986,67 years 07 months,838000.0
76445,2020-08,JURONG WEST,4 ROOM,465,JURONG WEST ST 41,01 TO 03,91.0,New Generation,1984,63 years 03 months,335000.0
50414,2019-05,WOODLANDS,EXECUTIVE,832,WOODLANDS ST 83,10 TO 12,192.0,Apartment,1994,74 years 05 months,755000.0
88,2015-01,BEDOK,3 ROOM,551,BEDOK NTH AVE 1,04 TO 06,67.0,New Generation,1980,64,315000.0
16639,2017-10,TOA PAYOH,3 ROOM,231,LOR 8 TOA PAYOH,07 TO 09,65.0,Improved,1976,58 years 01 month,275000.0
43460,2019-01,WOODLANDS,5 ROOM,587,WOODLANDS DR 16,01 TO 03,111.0,Improved,2001,81 years 07 months,378000.0
42418,2019-01,BUKIT PANJANG,5 ROOM,166,GANGSA RD,16 TO 18,121.0,Improved,1998,78 years 04 months,608000.0
18876,2017-11,YISHUN,4 ROOM,349,YISHUN AVE 11,04 TO 06,104.0,Model A,1988,69 years 09 months,345000.0
81325,2020-10,JURONG WEST,3 ROOM,197B,BOON LAY DR,10 TO 12,68.0,Model A,2015,93 years 07 months,360000.0
