Google Colab notebooks have an idle timeout of 90 minutes and an absolute timeout of 12 hours.
Colab Pro+ supports continuous code execution for up to 24 hours if you have sufficient compute units.

[Electoral Boundary 2020](https://data.gov.sg/dataset/electoral-boundary_2020)

[Singapore HDB Postal Code Mapper (2018)](https://www.kaggle.com/datasets/mylee2009/singapore-postal-code-mapper)

A list of all HDB postal codes and addresses (latitude, longitude, proper address), extracted from:

Postal Code - HDB Resale Flat Price from data.sg (https://data.gov.sg/dataset/resale-flat-prices)

Proper address/ latitude/longitude from onemap.sg api (https://docs.onemap.sg/)

The Python code used to extract this information can be found here: https://github.com/mylee16/onemap-api

[Master Plan 2019 Subzone Boundary (No Sea)](https://data.gov.sg/dataset/master-plan-2019-subzone-boundary-no-sea)

[Master Plan 2019 Planning Area Boundary](https://data.gov.sg/dataset/master-plan-2019-planning-area-boundary-no-sea)

In [1]:
%%capture
!pip install --quiet geopandas
import geopandas as gpd 
import fiona
import requests
import json
import pandas as pd
import os
import glob
# Download all zipped (compressed) dataset folders into the working directory.
# Each archive is saved under a short "dataN.zip" name; the same dataN labels
# are used in the comments of the unzip/rename steps further down.

# Master Plan 2019 Subzone Boundary (No Sea) -> data4.zip
!wget -O data4.zip https://data.gov.sg/dataset/c754450d-ecbd-4b7d-8dc1-c07ee842c6d1/download
# Master Plan 2019 Planning Area Boundary -> data5.zip (disabled, see the data5 note below)
# !wget -O data5.zip https://data.gov.sg/dataset/40267ab6-7c08-45c4-b777-a3b10e68f1c8/download

# Electoral Boundary 2020 -> data7.zip
!wget -O data7.zip https://data.gov.sg/dataset/6241ae7f-6dfe-4351-8570-611357d1a90e/download

# Unzip every downloaded archive in place.
!unzip data4.zip
# !unzip data5.zip
!unzip data7.zip
# data7: the archive contains a .kmz, which is unzipped as well
# (presumably yielding the doc.kml / .xsl files renamed below -- confirm)
!unzip electoral-boundary-dataset.kmz

# CDC dataset from YY
# gdown is installed here and then called with five file ids.
# NOTE(review): the ids do not reveal the file names; presumably one of them is
# PA_CDC_Boundary_2020.kml, which later cells read -- confirm.
!pip install --upgrade --no-cache-dir gdown

!gdown 1hPxde9qZwt297SsBnkDyb9k4YoYJWDz7
!gdown 127OyOlsGJV5sOX0ej3dINhE8c7gn3_sE
!gdown 1tv32HHtH3A1ZKcj76xvXZavShCIZmjYG
!gdown 1ZLTC3j8gtfiYYih5ffDpP0QbECtOTFrY
!gdown 1Q51nrjV-WkdiJDJSOapSkNg4cE6xTBSI


# Rename the extracted files to shorter, clearer names.
# Pattern: !mv oldnamefile1 newnamefile1


# data4: subzone boundary, KML and GeoJSON variants
!mv master-plan-2019-subzone-boundary-no-sea-kml.kml URA2019-Subzone.kml   
!mv master-plan-2019-subzone-boundary-no-sea-geojson.geojson URA2019-Subzone.geojson   

# data5: planning area boundary (disabled)
# !mv planning-boundary-area.kml URA2019-Plan.kml     #dataset no longer needed

# data7: electoral boundary files
!mv 62C4422C0D5147ED8C28FA94627357DB.xsl electoral2020.xsl
!mv doc.kml electoral2020.kml

# data8: no rename needed, the file is used under its existing name
# PA_CDC_Boundary_2020.kml
# remove unnecessary files
import os

# Extensions of the intermediate artefacts (archives, readme/licence text,
# layer files, logs) that are no longer needed once the datasets are extracted.
junk_suffixes = (".zip", ".txt", ".kmz", ".lyr", ".log")

for entry in os.listdir():
    # Match on the real file suffix rather than on a substring, so a name such
    # as "notes.txt.kml" is kept. A single combined test also guarantees that
    # each file is removed at most once: the previous chain of independent
    # checks called os.remove() twice (and raised FileNotFoundError) for any
    # name containing two of the patterns.
    # Directories are skipped because os.remove() cannot delete them.
    if entry.endswith(junk_suffixes) and os.path.isfile(entry):
        os.remove(entry)

# download 200k addresses/postal codes
# (presumably the 200kpostal_xy.csv file read by the last cell -- the id alone
# does not show the file name; confirm)
!gdown 1AiVKnBjWelL4O7nUCFBRg99Ns3i9kM_n

# download YN RHS files
# all
# (presumably provides RHS_3_regions.kml, which is converted below -- confirm)
!gdown 19NdmAcqDQpf-Oci39DCypAy-gFZR24W-

# converts kml files to json files
# The kml2geojson package provides the `k2g` command used below; each call
# takes a KML file and writes into the current directory (./).
# NOTE(review): in kml2geojson's CLI, `-sf` appears to be the *style filename*
# option, so the .json names given here may be the style files rather than the
# converted output -- confirm against the kml2geojson documentation.
!pip install kml2geojson

!k2g -sf PA_CDC_Boundary.json PA_CDC_Boundary_2020.kml ./
# !k2g -sf URA2019-Plan.json URA2019-Plan.kml ./
!k2g -sf URA2019-Subzone.json URA2019-Subzone.kml ./
# !k2g -sf community-in-bloom-cib.json community-in-bloom-cib.kml ./
!k2g -sf electoral2020.json electoral2020.kml ./

# RHS kml to json
!k2g -sf RHS_3_regions.json RHS_3_regions.kml ./


In [51]:
import warnings
warnings.filterwarnings('ignore')

from IPython.display import HTML, display
import time

def progress(value, max=100):
    '''
    Build a full-width HTML <progress> bar for display in the notebook.

    Input:
        value - current progress; used both as the bar's `value` attribute and
                as the fallback text inside the element
        max   - value at which the bar is full (default 100); the name shadows
                the builtin but is kept because callers may pass it by keyword
    Output:
        IPython.display.HTML object wrapping the <progress> markup
    '''
    # Attributes are separated by whitespace only: the stray comma that used to
    # follow the `max` attribute made the markup malformed.
    return HTML("""
        <progress
            value='{value}'
            max='{max}'
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

def pipeline(df):
    '''
    Enrich one batch of address rows with boundary information.

    Steps, in order:
      1. Convert every (X_ADDR, Y_ADDR) pair to longitude/latitude with the
         OneMap 3414to4326 endpoint (one HTTP request per row).
      2. For every KML file in `masterplan1`, tag each point with the Name of
         the region that contains it (point-in-polygon).
      3. Translate the URA / CDC region names into the attributes parsed from
         the HTML description of each KML placemark.
      4. Keep only the output columns, append them to "Generated.csv" and write
         the rows with an unresolved boundary to "Outliers.csv".

    Input: original 200k df (with all columns). The code reads the columns
           X_ADDR, Y_ADDR, POSTAL_CODE, LAST_UPD_DT, BUILDING_NAME,
           HOUSE_BLK_NO, ROAD_NAME and MULTI_ADDR_IND. The caller's frame is
           not modified.
    Output: transformed 200k df (only required information)

    Side effects: reads the four KML files from the working directory, appends
    to "Generated.csv" and overwrites "Outliers.csv".
    '''
    # Local import: only needed to hand HTML snippets to pandas.read_html.
    import io

    # masterplan for all the data sets (how to use each dataset)
    # masterplan1: KML file -> column that receives the matching region Name
    masterplan1 = {
        "electoral2020.kml" : "ELD_ELECTORAL_DIVISION_NAME",
        "RHS_3_regions.kml" : "MOH_RHS_ZONE_NAME",
        "URA2019-Subzone.kml" : "URA",
        "PA_CDC_Boundary_2020.kml" : "CDC"
    }

    # masterplan2: KML file -> {attribute in the placemark description: key
    # under which its value is stored in masterplan3}
    masterplan2 = {
        "URA2019-Subzone.kml" : {"SUBZONE_N" : "URA_PLANNING_SUBZONE_NAME",
                                "PLN_AREA_N" : "URA_PLANNING_AREA_NAME",
                                "REGION_N" : "URA_"},
        "PA_CDC_Boundary_2020.kml" : {"CDC_NAME" : "CDC Name"}
    }

    files = ["URA2019-Subzone.kml", "PA_CDC_Boundary_2020.kml"]

    # masterplan3: KML file -> {placemark Name -> {stored key -> value}}
    masterplan3 = {}

    # Every KML file is parsed at most once per call, although two of them are
    # needed both for the attribute tables and for the point-in-polygon step.
    kml_cache = {}

    def lookup(file_name, region_name, attribute):
        '''Return one stored attribute of a region, or 'N.A.' when unknown.'''
        try:
            return masterplan3[file_name][region_name][attribute]
        except (KeyError, TypeError):
            # KeyError: unknown region (including NaN for unmatched points);
            # TypeError: unhashable value used as a key.
            return 'N.A.'

    def get_URA_Region(name="kml_1"):
        '''Get URA region using the masterplan'''
        return lookup("URA2019-Subzone.kml", name, "URA_")

    def get_URA_Planning_Area(name="kml_1"):
        '''Get URA planning area using the masterplan'''
        return lookup("URA2019-Subzone.kml", name, "URA_PLANNING_AREA_NAME")

    def get_URA_Subzone(name='kml_1'):
        '''Get URA subzone using the masterplan'''
        return lookup("URA2019-Subzone.kml", name, "URA_PLANNING_SUBZONE_NAME")

    def get_CDC_Name(name="kml_1"):
        '''Get CDC name using the masterplan'''
        return lookup("PA_CDC_Boundary_2020.kml", name, "CDC Name")

    def kml_to_df(file):
        '''Read a KML file into a GeoDataFrame (cached for this call).'''
        if file not in kml_cache:
            try:
                kml_cache[file] = gpd.read_file(file, driver='KML')
            except Exception:
                # The KML driver may be disabled in fiona; enable it and retry
                # once. A failure on the retry propagates to the caller.
                fiona.drvsupport.supported_drivers['KML'] = 'rw'
                kml_cache[file] = gpd.read_file(file, driver='KML')
        return kml_cache[file]

    def parse_description(html_description):
        '''Parse the first HTML table of a placemark description.'''
        # Wrapping the markup in StringIO avoids the deprecated "literal HTML
        # string" input of pandas.read_html.
        return pd.read_html(io.StringIO(html_description))[0]

    def get_attributes_value(table, attribute_name='SUBZONE_N'):
        '''Return the value in the row whose 'Attributes' cell matches.

        Raises IndexError when the table has no such attribute.
        '''
        matches = table[table['Attributes'] == attribute_name]
        return matches.values[0][1]

    for file_name in files:
        file_kml = kml_to_df(file_name)
        renames = masterplan2[file_name]
        file_dict = {}

        for name, description in zip(file_kml["Name"], file_kml["Description"]): # kml1, kml2, ...
            if name in file_dict:
                # Duplicate Name: the first placemark wins.
                continue
            # Parse the description once and read every wanted attribute from it.
            table = parse_description(description)
            file_dict[name] = {new_colname: get_attributes_value(table, colname)
                               for colname, new_colname in renames.items()}
        masterplan3[file_name] = file_dict

    def digit_extend(POSTAL_CODE):
        '''Return the postal code as a zero-padded 6-character string.'''
        # A column containing missing values is read as float (e.g. 18956.0);
        # drop the spurious ".0" before padding.
        if isinstance(POSTAL_CODE, float) and POSTAL_CODE.is_integer():
            POSTAL_CODE = int(POSTAL_CODE)
        # zfill pads as many zeros as needed and always returns a string; the
        # previous code added a single zero and returned 6-digit values
        # unconverted.
        return str(POSTAL_CODE).zfill(6)

    def add_lnglat(df, x_name="X_ADDR", y_name="Y_ADDR"):
        '''
        Converts (X_ADDR, Y_ADDR) to (longitude, latitude) using OneMap API
        Add new columns LONG & LAT to a copy of df and return that copy

        Raises requests.HTTPError on a non-2xx response and KeyError when the
        response body has no 'longitude' / 'latitude' field.
        '''
        # Work on a copy so that a slice passed by the caller is never mutated.
        df = df.copy()
        hdr = {"User-Agent": "pandas"}
        lngs, lats = [], []
        for x, y in zip(df[x_name], df[y_name]):
            # NOTE(review): confirm this endpoint is still being served.
            # The X value is sent unchanged: a stray "4" used to be appended to
            # it, which shifted (or, for integer input, corrupted) the
            # coordinate.
            url = f"https://developers.onemap.sg/commonapi/convert/3414to4326?X={x}&Y={y}"
            # The timeout keeps one stalled request from hanging the whole run.
            resp = requests.get(url, headers=hdr, timeout=30)
            resp.raise_for_status()
            extract = json.loads(resp.content)
            lngs.append(extract['longitude'])
            lats.append(extract['latitude'])
        # Two parallel lists also work for an empty batch.
        df["LONG"] = lngs
        df["LAT"] = lats
        return df

    def get_pip (gdf, regions, new_colname):
        '''
        Point in Polygon
        Input:
            gdf - dataframe with geometry POINT
            regions - dataframe with geometry POLYGON/MULTIPOLYGON
            new_colname - column of gdf that receives the region Name
        Output:
            gdf - same dataframe (modified in place); rows whose point lies
                  within a region carry that region's Name in new_colname
        '''
        # Geometry of the first row for every Name.
        first_geometry = {}
        for region_name, geometry in zip(regions.Name, regions.geometry):
            first_geometry.setdefault(region_name, geometry)

        for region_name in regions.Name:
            # identify those records from gdf that lie within the region polygon
            pip_mask = gdf.within(first_geometry[region_name])
            gdf.loc[pip_mask, new_colname] = region_name
        return gdf

    # add latitude & longitude columns using OneMap API
    df = add_lnglat(df)

    # add geopandas Point as column
    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.LONG, df.LAT))

    for file, plan in masterplan1.items():
        gdf = get_pip(gdf, kml_to_df(file), new_colname=plan)

    # Add URA Region
    gdf["URA_REGION_NAME"] = gdf["URA"].map(get_URA_Region)

    # Add URA Subzone
    gdf["URA_PLANNING_SUBZONE_NAME"] = gdf["URA"].map(get_URA_Subzone)

    # Add URA Planning Area
    gdf["URA_PLANNING_AREA_NAME"] = gdf["URA"].map(get_URA_Planning_Area)

    # Add CDC Name
    gdf["CDC_NAME"] = gdf["CDC"].map(get_CDC_Name)

    # digit add postal codes, padding postcodes to 6-digit string object
    gdf["POSTAL_ADDR"] = gdf["POSTAL_CODE"].map(digit_extend)

    mapping = {
        'East RHS': 'Singapore Health Services',
        'Central RHS': 'National Healthcare Group',
        'West RHS': 'National University Health System'
    }

    # Names missing from `mapping` (and unmatched points) become NaN here.
    gdf['MOH_RHS_ZONE_NAME'] = gdf['MOH_RHS_ZONE_NAME'].map(mapping)
    gdf = gdf.rename(columns={"LAST_UPD_DT": "REFERENCE_DT"})

    # final dropping of redundant columns; .copy() so that the assignment
    # below does not write into a view of the wider frame
    gdf = gdf[['LAT','LONG',
               'BUILDING_NAME',
               'HOUSE_BLK_NO',
               'ROAD_NAME',
               'POSTAL_ADDR',
               'MULTI_ADDR_IND',
               'CDC_NAME',
               'ELD_ELECTORAL_DIVISION_NAME',
               'URA_REGION_NAME',
               'URA_PLANNING_AREA_NAME',
               'URA_PLANNING_SUBZONE_NAME',
               'MOH_RHS_ZONE_NAME',
               'REFERENCE_DT']].copy()
    # convert LAST_UPD_DT datetime into REFERENCE_DT date
    gdf['REFERENCE_DT'] = pd.to_datetime(gdf['REFERENCE_DT']).dt.date

    # save processed file: append, writing the header only when the file is new
    write_header = not os.path.exists("Generated.csv")
    gdf.to_csv("Generated.csv", mode="a", index=True, header=write_header)

    # get a list of outliers
    # The URA columns use the 'N.A.' marker, whereas the electoral and RHS
    # columns are left as NaN when nothing matched, so both forms are tested
    # (comparing with "N.A." alone never reported the latter two).
    boundary_columns = ['ELD_ELECTORAL_DIVISION_NAME',
                        'URA_REGION_NAME',
                        'URA_PLANNING_AREA_NAME',
                        'URA_PLANNING_SUBZONE_NAME',
                        'MOH_RHS_ZONE_NAME']
    unresolved = gdf[boundary_columns].isna() | gdf[boundary_columns].eq("N.A.")
    # NOTE(review): this file is overwritten on every call, so after a
    # multi-batch run it only holds the outliers of the last batch.
    gdf[unresolved.any(axis=1)].to_csv("Outliers.csv")
    return gdf

In [52]:
if __name__ == '__main__':
    # The input is processed in "bulks" of `bulk` rows, each split into
    # batches of `batch` rows; `bulk_series` selects which bulk this run
    # handles (0-based).
    bulk_series = 6

    bulk = 30000
    batch = 300
    # Batch indices [start, end) covered by the selected bulk.
    start = bulk * bulk_series // batch
    end = bulk * (bulk_series + 1) // batch
    csv_file_path = '200kpostal_xy.csv'

    # Start from a clean output file: pipeline() appends to Generated.csv.
    try:
        os.remove('Generated.csv')
    except FileNotFoundError:
        pass
    df = pd.read_csv(csv_file_path, on_bad_lines='skip')

    # Progress is measured in rows of THIS run, so the bar starts empty and
    # ends full (the absolute row offsets used before made it start almost
    # full and never complete).
    total_rows = (end - start) * batch
    out = display(progress(0, total_rows), display_id=True)
    for ii in range(start, end):
        # Stop once the data is exhausted. A strict comparison is needed: with
        # ii*batch == len(df) the slice is empty and there is nothing to do.
        if ii * batch >= len(df):
            break
        pipeline(df[ii*batch:(ii+1)*batch])
        out.update(progress((ii - start + 1) * batch, total_rows))
    # Show a full bar even when the loop stopped early at the end of the data.
    out.update(progress(total_rows, total_rows))