## Singapore Private Property 
This notebook downloads the past five years of URA private property transactions from the URA API, to provide a basis for further data analysis.

The raw data undergoes basic cleaning and is then saved to CSV files for archival, because URA continuously removes data that is more than five years old.

In [1]:
import pandas as pd
import requests
from collections import Counter
import datetime as dt
import re
import matplotlib.pyplot as plt

In [2]:
import os

#access key issued by URA; set the URA_ACCESS_KEY environment variable to override the key below
#NOTE(review): a credential committed in source is readable by anyone who has this file -
#consider rotating this key and removing the literal fallback
ura_access_key = os.environ.get('URA_ACCESS_KEY', 'd8722f05-25ff-44f6-bb6f-5d728aa4c9b1')

#getting the token for the day
r = requests.get("https://www.ura.gov.sg/uraDataService/insertNewToken.action", headers={"AccessKey":ura_access_key})
#stop on an HTTP error here instead of failing later while parsing the response body
r.raise_for_status()
token = r.json()['Result']

In [None]:
#accessing the data in 4 batches as required by URA API
#each batch is kept in a list and combined once at the end: DataFrame.append was
#removed in pandas 2.0, and it re-copied the accumulated data on every batch
batch_frames = []
for i in range(1,5):
    #requesting data by passing in access key and token, in 4 batches as stated in URA API website
    req = requests.get('https://www.ura.gov.sg/uraDataService/invokeUraDS?service=PMI_Resi_Transaction&batch=' + str(i),
                      headers={"AccessKey":ura_access_key, "Token":token})
    #stop on an HTTP error here instead of failing later while parsing the response body
    req.raise_for_status()

    #one row per transaction; street, x, y and project are carried over from the parent record
    #(errors='ignore' leaves a missing parent field as NaN instead of raising)
    df_temp = pd.json_normalize(req.json()['Result'], 'transaction', ['street','x','y','project'],errors='ignore')
    batch_frames.append(df_temp)

    #printing the progress for monitoring
    print('Batch ' + str(i) + ' completed')

#combine all batches; ignore_index gives the result a fresh 0..n-1 index
df = pd.concat(batch_frames, ignore_index=True)

Batch 1 completed
Batch 2 completed
Batch 3 completed


In [None]:
#print a summary of the downloaded DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
#saving raw data to local drive; the download year and month go into the file name
today = dt.date.today()
this_year = today.year
this_month = today.month

df.to_csv(f"ura_caveats downloaded year {this_year} month {this_month} raw data.csv")

In [None]:
#work on a copy so that the raw download held in df is left untouched
df1 = df.copy()

#basic tidying up

#nettPrice, where present, is the final selling price, so it overrides price
has_nett_price = df1['nettPrice'].notnull()
df1.loc[has_nett_price, 'price'] = df1.loc[has_nett_price, 'nettPrice']

#nettPrice and the geo-coordinates are dropped
df1 = df1.drop(columns=['nettPrice', 'x', 'y'])

#area: cast to float and convert to sq feet (factor 10.76)
df1['area'] = df1['area'].astype('float') * 10.76

#price: force to numeric first, then cast to int
#NOTE(review): a value that cannot be parsed is coerced to NaN, and the int cast
#then raises - confirm the source never contains such values
df1['price'] = pd.to_numeric(df1['price'], errors='coerce').astype('int')

#new column holding the calculated psf price (price / area, cast to int)
df1['Unit Price psf'] = (df1['price'] / df1['area']).astype('int')

#contractDate is parsed with format '%m%y' and stored as a monthly pandas period
df1['contractDate'] = pd.to_datetime(df1['contractDate'], format='%m%y').dt.to_period('M')

In [None]:
#count how often each distinct tenure string occurs, to see which formats need standardizing
print(Counter(df1['tenure']))

In [None]:
def _lease_left(tenure, current_year):
    """Return the remaining lease in years for one tenure string.

    The rules are applied in this order:
      * 'NA' or a missing (non-string) value                -> 999
      * two-character term followed by a space, ending in
        'hold' (99 years without a start year)              -> 99
      * string starting with '1' (three-digit term)         -> term - current_year + start year
      * two-character term followed by a space              -> term - current_year + start year
      * anything else                                       -> 999

    The start year is read from the last four characters of the string.

    Raises:
        ValueError: if a term or start year that has to be parsed is not a number.
    """
    if not isinstance(tenure, str) or tenure == 'NA':
        return 999

    #slicing (rather than indexing) keeps strings shorter than 3 characters from raising
    two_digit_term = tenure[2:3] == ' '

    if two_digit_term and tenure.endswith('hold'):
        return 99
    if tenure.startswith('1'):
        return int(tenure[:3]) - current_year + int(tenure[-4:])
    if two_digit_term:
        return int(tenure[:2]) - current_year + int(tenure[-4:])
    return 999


#derive 'lease left' for every row in one pass; mapping over the column does not
#depend on the index labels being 0..n-1, unlike a df1.loc[i, ...] loop over range(n)
#(cast to float64 to keep the dtype the row-by-row .loc assignment produced)
df1['lease left'] = df1['tenure'].map(lambda tenure: _lease_left(tenure, this_year)).astype('float64')

In [None]:
#print the value counts of the remaining descriptive columns to spot unusual entries
for column_name in ('typeOfSale', 'propertyType', 'district', 'typeOfArea'):
    print(Counter(df1[column_name]))

In [None]:
#Splitting into 2 dataframes by propertyType: landed and non-landed property transactions
non_landed_types = ['Condominium', 'Apartment', 'Executive Condominium']
landed_types = ['Terrace', 'Semi-detached', 'Detached',
                'Strata Terrace', 'Strata Semi-detached', 'Strata Detached']

df_non_landed = df1[df1['propertyType'].isin(non_landed_types)]
df_landed = df1[df1['propertyType'].isin(landed_types)]

In [None]:
#floorRange is not relevant for landed property, so the column is dropped
df_landed = df_landed.drop(columns=['floorRange'])

In [None]:
#checking for errors in floorRange column
print(df_non_landed.floorRange.value_counts())

#amend the floorRange errors in the original URA data: 'B1-B5' and '-' are both
#replaced by '01-05'.
#assign() builds a new DataFrame instead of writing into df_non_landed in place;
#df_non_landed was produced by filtering df1, so an in-place .loc write on it
#triggers pandas' SettingWithCopyWarning
df_non_landed = df_non_landed.assign(
    floorRange=df_non_landed['floorRange'].replace({'B1-B5': '01-05', '-': '01-05'})
)

In [None]:
#convert data types of various columns
#dtypes shared by both dataframes; the non-landed one additionally has floorRange
shared_dtypes = {
    'area': 'int32',
    'noOfUnits': 'int32',
    'typeOfSale': 'category',
    'propertyType': 'category',
    'district': 'category',
    'typeOfArea': 'category',
    'tenure': 'str',
}

df_non_landed = df_non_landed.astype({**shared_dtypes, 'floorRange': 'category'})

df_landed = df_landed.astype(shared_dtypes)

In [None]:
#show the transactions covering more than one unit, sorted by number of units,
#to find the en-bloc sales data
multi_unit = df_non_landed.noOfUnits > 1
display(df_non_landed[multi_unit].sort_values('noOfUnits'))

#further splitting the en-bloc non-landed transactions from the non-landed dataframe
#enbloc deals are picked up by dual conditions: deal value of more than $20mil and >1 unit per transaction.
#this is not a perfect method but is able to pick up all enbloc deals while minimizing non-enbloc transactions
#NOTE(review): the two masks below are not complements of each other - rows with
#noOfUnits > 10 and price <= $20mil, or noOfUnits == 1 and price > $20mil, end up
#in neither dataframe. Confirm this is intended.
is_enbloc = multi_unit & (df_non_landed.price > 20000000)
stays_non_landed = (df_non_landed.noOfUnits <= 10) & (df_non_landed.price <= 20000000)

df_enbloc = df_non_landed[is_enbloc]
df_non_landed = df_non_landed[stays_non_landed]

#convert data types of various columns
enbloc_dtypes = {
    'area': 'int32',
    'noOfUnits': 'int32',
    'floorRange': 'category',
    'typeOfSale': 'category',
    'propertyType': 'category',
    'district': 'category',
    'typeOfArea': 'category',
    'tenure': 'str',
}
df_enbloc = df_enbloc.astype(enbloc_dtypes)

In [None]:
#mean psf price per floor range, then the percentage change from each floor range to the next
mean_psf_by_floor = df_non_landed.groupby('floorRange')['Unit Price psf'].mean()
Price_change_floor = mean_psf_by_floor.pct_change() * 100

Price_change_floor.plot.bar()
_ = plt.ylabel("Percent change in price compared to lower floor range")

In [None]:
#difference between the mean psf price of apartments and that of condominiums
is_apartment = df_non_landed.propertyType == 'Apartment'
is_condominium = df_non_landed.propertyType == 'Condominium'

apartment_mean_psf = df_non_landed.loc[is_apartment, 'Unit Price psf'].mean()
condominium_mean_psf = df_non_landed.loc[is_condominium, 'Unit Price psf'].mean()

diff = apartment_mean_psf - condominium_mean_psf
print(diff)

In [None]:
#mean psf price of non-landed transactions per district, in ascending order
non_landed_district_psf = df_non_landed.groupby('district')['Unit Price psf'].mean()
non_landed_district_psf.sort_values()

In [None]:
#mean psf price of landed transactions per district, in ascending order
landed_district_psf = df_landed.groupby('district')['Unit Price psf'].mean()
landed_district_psf.sort_values()

In [None]:
#saving the cleaned data into csv, one file per dataframe;
#the file names differ only in their leading label
file_suffix = f" transactions downloaded and cleaned on year {this_year} month {this_month}.csv"
for label, frame in (("Non-landed", df_non_landed), ("landed", df_landed), ("enbloc", df_enbloc)):
    frame.to_csv(label + file_suffix)

In [None]:
#Analysis of single condo project
condo = 'CARIBBEAN'
#regex=False matches the project name literally, so names containing characters
#such as '(' or '+' are not interpreted as a regular expression;
#na=False leaves out rows whose project name is missing instead of producing a
#NaN mask that cannot be used for boolean indexing
df_condo = df_non_landed[df_non_landed['project'].str.contains(condo, regex=False, na=False)]
#mean psf price per floor range within the selected project
df_condo.groupby('floorRange')['Unit Price psf'].mean()