## Singapore Private Property 
This set of python codes serve to download past 5 years' of URA private property transactions from URA API for purpose of providing a basis for further data analysis.

The raw data will undergo basic cleaning and then saved into csv file for archival (because URA will continuously remove data older than 5 years old).

In [1]:
import pandas as pd
import requests
from collections import Counter
import datetime as dt

In [2]:
#token given by URA
ura_access_key = 'd8722f05-25ff-44f6-bb6f-5d728aa4c9b1'

#getting the token for the day
r = requests.get("https://www.ura.gov.sg/uraDataService/insertNewToken.action", headers={"AccessKey":ura_access_key})
token = r.json()['Result']

In [3]:
#accessing the data in 4 batches as required by URA API
for i in range(1,5):
    #requesting data by passing in access key and token, in 4 batches as stated in URA API website
    req = requests.get('https://www.ura.gov.sg/uraDataService/invokeUraDS?service=PMI_Resi_Transaction&batch=' + str(i),
                      headers={"AccessKey":ura_access_key, "Token":token})
    
    df_temp = pd.json_normalize(req.json()['Result'], 'transaction', ['street','x','y','project'],errors='ignore')

    #adding each batch to DataFrame
    if i==1:
        df = df_temp
    else:
        df = df.append(df_temp, ignore_index=True) #ignore index so that append can be done
    
    #printing the progress for monitoring
    print('Batch ' + str(i) + ' completed')

Batch 1 completed
Batch 2 completed
Batch 3 completed
Batch 4 completed


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113032 entries, 0 to 113031
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   area          113032 non-null  object
 1   floorRange    113032 non-null  object
 2   noOfUnits     113032 non-null  object
 3   contractDate  113032 non-null  object
 4   typeOfSale    113032 non-null  object
 5   price         113032 non-null  object
 6   propertyType  113032 non-null  object
 7   district      113032 non-null  object
 8   typeOfArea    113032 non-null  object
 9   tenure        113032 non-null  object
 10  nettPrice     1137 non-null    object
 11  street        113032 non-null  object
 12  x             88177 non-null   object
 13  y             88177 non-null   object
 14  project       113032 non-null  object
dtypes: object(15)
memory usage: 12.9+ MB


In [5]:
#saving raw data to local drive
df.to_csv("./ura_caveats_2015-11 to 2020-10 raw data.csv")

In [22]:
#making copy of df
df1 = df.copy()

#performing basic tidying up

#nett price is final selling price, hence will replace price if nettprice is not null
df1.loc[df1['nettPrice'].notnull(),'price'] = df1.loc[df1['nettPrice'].notnull(),'nettPrice']

#remove nettprice and geo-coordinates
df1 = df1.drop(['nettPrice','x','y'],axis=1)

#convert area to sq feet, and type float
df1.area = df1.area.astype('float') * 10.76

#convert data types of various columns
df1 = df1.astype({'area':'int32','noOfUnits':'int32','floorRange':'category','typeOfSale':'category',
                  'propertyType':'category','district':'category','typeOfArea':'category',
                  'tenure':'str'})

#forcing to numeric before changing to int type
df1.price = pd.to_numeric(df1.price, errors='coerce').astype('int')

In [23]:
#create new column to hold calculated psf price
df1['Unit Price psf'] = (df1['price']/df1['area']).astype('int')

#tidy up contract date to proper pandas datetime format
df1.contractDate = pd.to_datetime(df1.contractDate, format='%m%y').dt.to_period('M')

print(df1.info())
display(df1.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113032 entries, 0 to 113031
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype    
---  ------          --------------   -----    
 0   area            113032 non-null  int32    
 1   floorRange      113032 non-null  category 
 2   noOfUnits       113032 non-null  int32    
 3   contractDate    113032 non-null  period[M]
 4   typeOfSale      113032 non-null  category 
 5   price           113032 non-null  int32    
 6   propertyType    113032 non-null  category 
 7   district        113032 non-null  category 
 8   typeOfArea      113032 non-null  category 
 9   tenure          113032 non-null  object   
 10  street          113032 non-null  object   
 11  project         113032 non-null  object   
 12  Unit Price psf  113032 non-null  int32    
dtypes: category(5), int32(4), object(3), period[M](1)
memory usage: 5.7+ MB
None


Unnamed: 0,area,floorRange,noOfUnits,contractDate,typeOfSale,price,propertyType,district,typeOfArea,tenure,street,project,Unit Price psf
0,5641,-,1,2020-02,3,5500000,Semi-detached,5,Land,Freehold,ZEHNDER ROAD,LANDED HOUSING DEVELOPMENT,975
1,3314,-,1,2018-09,3,5000000,Semi-detached,5,Land,Freehold,ZEHNDER ROAD,LANDED HOUSING DEVELOPMENT,1508
2,3378,-,1,2018-06,3,4750000,Semi-detached,5,Land,Freehold,ZEHNDER ROAD,LANDED HOUSING DEVELOPMENT,1406
3,1714,-,1,2020-03,3,2630000,Terrace,5,Land,Freehold,NEO PEE TECK LANE,LANDED HOUSING DEVELOPMENT,1534
4,2431,01-05,1,2020-09,3,3400000,Condominium,4,Strata,99 yrs lease commencing from 2007,COVE DRIVE,TURQUOISE,1398


In [24]:
#display(df[df['project'].str.contains('CARIBBEAN')].head(20))

In [25]:
print(Counter(df1.floorRange))
print(Counter(df1.typeOfSale))
print(Counter(df1.propertyType))
print(Counter(df1.district))
print(Counter(df1.typeOfArea))

Counter({'01-05': 36302, '06-10': 26890, '11-15': 19277, '-': 11504, '16-20': 8992, '21-25': 4161, '26-30': 2619, '31-35': 1827, '36-40': 875, '41-45': 307, '46-50': 116, '51-55': 62, '56-60': 36, '61-65': 26, '66-70': 21, 'B1-B5': 13, '71-75': 4})
Counter({'3': 57168, '1': 54195, '2': 1669})
Counter({'Condominium': 51076, 'Apartment': 37418, 'Executive Condominium': 13096, 'Terrace': 5419, 'Semi-detached': 3097, 'Detached': 1408, 'Strata Terrace': 1187, 'Strata Semi-detached': 254, 'Strata Detached': 77})
Counter({'19': 17394, '15': 8012, '18': 6937, '23': 6884, '05': 6845, '03': 6584, '14': 6490, '10': 6105, '27': 5800, '09': 5159, '16': 4334, '13': 4065, '20': 3918, '21': 3646, '11': 2961, '28': 2960, '12': 2722, '22': 2553, '17': 1677, '25': 1602, '04': 1338, '08': 1244, '01': 1037, '02': 986, '07': 899, '26': 875, '06': 5})
Counter({'Strata': 103089, 'Land': 9943})


In [31]:
print(Counter(df1.tenure))

Counter({'Freehold': 32006, '99 yrs lease commencing from 2018': 12172, '99 yrs lease commencing from 2014': 8865, '99 yrs lease commencing from 2015': 7423, '99 yrs lease commencing from 2016': 5990, '99 yrs lease commencing from 2013': 5341, '99 yrs lease commencing from 2017': 4789, '99 yrs lease commencing from 2011': 4007, '99 yrs lease commencing from 2012': 3697, '99 yrs lease commencing from 2010': 3214, '99 yrs lease commencing from 2019': 2432, '99 yrs lease commencing from 1997': 2306, '99 yrs lease commencing from 2008': 1743, '99 yrs lease commencing from 1996': 1499, '99 yrs lease commencing from 1995': 1263, '99 yrs lease commencing from 2000': 1162, '99 yrs lease commencing from 2007': 1118, '99 yrs lease commencing from 2006': 959, '99 yrs lease commencing from 1994': 925, '99 yrs lease commencing from 2002': 811, '99 yrs lease commencing from 2001': 794, '99 yrs lease commencing from 1993': 739, '99 yrs lease commencing from 2009': 722, '99 yrs lease commencing from 1

In [33]:
this_year = dt.date.today().year

for i in range(df.shape[0]):
    if (df.loc[i,'tenure'][0:3] == '99 '):
        df.loc[i,'lease left'] = 99 - this_year + int(df.loc[i,'tenure'][-4:])
    elif (df.loc[i,'tenure'][0:3] == '10'):
        df.loc[i,'lease left'] = int(df.loc[i,'tenure'][0:3]) - this_year + int(df.loc[i,'tenure'][-4:])
    else:
        df.loc[i,'lease left'] = 999

ValueError: invalid literal for int() with base 10: 'hold'

In [None]:
df1.to_csv("d:/downloads/ura_caveats_2017-10 to 2020-09")

In [None]:
df = pd.read_csv("d:/downloads/ura_caveats_2017-10 to 2020-09", parse_dates=['contractDate'])

In [None]:
df.info()

In [None]:
df.loc[df.nettPrice.notnull(),'price'] = df.loc[df.nettPrice.notnull(),'nettPrice']
display(df[df.nettPrice.notnull()].sample(10))

In [None]:
display(df.sample(10))