In [13]:
# NEA Link
link = "https://www.nea.gov.sg/dengue-zika/dengue/dengue-clusters"

# Importing Libraries
import requests, pandas as pd, time, glob, os, win32com.client, pythoncom, pickle, json, shapely.speedups, swifter
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from shapely.geometry import Point
from shapely.geometry import shape
from swifter import set_defaults

shapely.speedups.enable()
tqdm.pandas()
set_defaults(
    npartitions=None,
    dask_threshold=1,
    scheduler="processes",
    progress_bar=True,
    progress_bar_desc=None,
    allow_dask_on_strings=True,
    force_parallel=True,
)

In [2]:
# Get all snapshots of NEA's dengue cases via wayback machine from 2020 to 2022
def get_snapshots(link):
    # Use Wayback server to get all snapshots of the link
    wayback = "https://web.archive.org/cdx/search/cdx?url=" + link + "&output=json&fl=timestamp,original&collapse=digest&filter=statuscode:200"
    response = requests.get(wayback)
    print(response)
    data = response.json()
    snapshots = data[1:]
    # Keep only snapshots from 2021 to 2022
    snapshots = [snap for snap in snapshots if snap[0][:4] in ["2021", "2022"]]
    # Keep only timestamps
    snapshots = [snap[0] for snap in snapshots]
    return snapshots

In [3]:
# Get the snapshots
while True:
    try: 
        snapshots = get_snapshots(link)
        break
    except: 
        print("Error")
        time.sleep(1)
        continue
snapshots

<Response [200]>


['20210308151403',
 '20210808172247',
 '20211027051137',
 '20211203081908',
 '20220310082530',
 '20220319234337',
 '20220329152026',
 '20220425143805',
 '20220605020619',
 '20220609053540',
 '20220616104319',
 '20220618163415',
 '20220805162654',
 '20220814023504',
 '20220901011605',
 '20220907192246',
 '20220911051635',
 '20220918172033',
 '20220921205322']

In [None]:
def get_soup(link, timestamp):
    # Get the snapshot
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "*",
    "Connection": "keep-alive"
    }
    snapshot = "https://web.archive.org/web/" + timestamp + "/" + link
    response = requests.get(snapshot, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # print(snapshot)
    return soup

In [None]:
def extract_clusters(soup):
    # In the soup, we want to extract the table with the class "dengue-fixed-table"
    table = soup.find_all("table", class_="dengue-fixed-table")[0]

    # The number of clusters is the number of times data-row-table appears
    # We want to extract the number of clusters
    clusters = len(table.find_all("table", {"data-row-table": True}))
    # print("Number of clusters:", clusters, "\n")
    return clusters, table

def extract_cluster(cluster_no, table, snapshot_no):
    # Let's extract the data from the first cluster
    rows = table.find_all("tr", {"data-row": cluster_no})[-1]
    # print("Cluster No.:", cluster_no)

    # the last 2 td tags with a number is the total number of cases and recent cases
    total = int(rows.find_all("td")[-1].text)
    recent = int(rows.find_all("td")[-2].text)
    # print("Total cases:", total)   
    # print("Recent cases:", recent)

    # remove first 3 and last 2 td tags
    rows = rows.find_all("td")[3:-2]

    # The remaining td tags are sorted as such:
    # Tags containing numbers are the number of cases of each area in the cluster
    # Tags containing text are the names of the areas in the cluster
    # We want to extract the names of the areas and the number of cases
    areas = []
    cases = []
    for i in range(len(rows)):
        if i % 2 == 0:
            areas.append(rows[i].text)
        else:
            cases.append(int(rows[i].text))
    # print("Areas:", areas)
    # print("Cases:", cases)

    # Get the date and month number
    date = snapshots[snapshot_no][:8]
    date = pd.to_datetime(date, format="%Y/%m/%d")
    month = date.month
    # print("Date:", date)
    # print("Month:", month, "\n")

    # Create a dictionary to store the data
    data = {"Number of Cases": cases, "Street Address": areas, "Cluster Number": cluster_no, "Recent Cases in Cluster": recent, "Total Cases in Cluster": total, "Date": date, "Month Number": month}
    df = pd.DataFrame(data)
    df = df.explode("Street Address")
    return df

In [None]:

def extract_snapshot(snapshot):
    # Extract the data from the first snapshot
    soup = get_soup(link, snapshots[snapshot])
    clusters, table = extract_clusters(soup)

    # Create a dataframe to store the data
    df = pd.DataFrame(columns=["Number of Cases", "Street Address", "Cluster Number", "Recent Cases in Cluster", "Total Cases in Cluster", "Date", "Month Number"])

    # from 1 to the number of clusters, keep in mind NEA is stupid sometimes, there may be skipped cluster numbers
    startindex = 1
    endindex = clusters + 1
    while True:
        try:
            df = df.append(extract_cluster(startindex, table, snapshot))
            startindex += 1            
        except:
            if startindex == endindex:
                break
            else:
                startindex += 1
                endindex += 1
                continue           

    return df

In [None]:
# Extract the data from all snapshots
df = pd.DataFrame(columns=["Number of Cases", "Street Address", "Cluster Number", "Recent Cases in Cluster", "Total Cases in Cluster", "Date", "Month Number"])
start = time.time()

# Use concurrent.futures to speed up the process
with ThreadPoolExecutor(max_workers=10) as executor:
    results = [executor.submit(extract_snapshot, snapshot) for snapshot in range(len(snapshots))]

    for i in as_completed(results):
        df = df.append(i.result())

# Change data format to slashes and change street address to lowercase
df["Date"] = df["Date"].dt.strftime("%d/%m/%Y")
df["Street Address"] = df["Street Address"].str.lower()

print("Total Time taken:", format(time.time() - start, ".2f"), "seconds")
df

Total Time taken: 87.91 seconds


Unnamed: 0,Number of Cases,Street Address,Cluster Number,Recent Cases in Cluster,Total Cases in Cluster,Date,Month Number
0,11,goodlink park,1,8,24,08/08/2021,8
1,3,jalan mata ayer,1,8,24,08/08/2021,8
2,5,platina road,1,8,24,08/08/2021,8
3,5,sembawang road,1,8,24,08/08/2021,8
0,1,buangkok green,2,1,4,08/08/2021,8
...,...,...,...,...,...,...,...
1,1,yishun street 21(blk 208),398,1,2,16/06/2022,6
0,1,yishun street 71(blk 729),399,1,2,16/06/2022,6
1,1,yishun street 72(blk 755),399,1,2,16/06/2022,6
0,1,yung kuang road(blk 165b),400,1,2,16/06/2022,6


In [None]:
# Save progress
df.to_csv("dengue2.csv", index=False)

In [None]:
# Based on Singapore street address, we can extract the latitude and longitude of the street address
def get_lat_long(address):
    trash = [[")", ""],["(blk ", " "],["(", " "],["cl0se","close"]]
    for i in trash:
        address = address.replace(i[0], i[1]).strip()

    # Get the latitude and longitude of the address
    req = "https://developers.onemap.sg/commonapi/search?searchVal={address}&returnGeom=Y&getAddrDetails=N&pageNum=1"
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "*",
    "Connection": "keep-alive"
    }
    # Request might time out, so we need to try again
    while True:
        try:
            response = requests.get(req.format(address=address), headers=headers).json()
            break
        except:
            time.sleep(1)
            continue
    
    # If the address is not found, return ""
    try:
        if len(response['results'])>0:
            return response['results'][0]['LATITUDE'], response['results'][0]['LONGITUDE']
        else:
            return "", ""
    except:
        return "", ""


# get only unique street addresses as separate dataframe
df_addresses = df[["Street Address"]].drop_duplicates().reset_index(drop=True)
df_addresses.shape

(5024, 1)

In [None]:
# get the latitude and longitude of the street addresses
# Use threading to speed up the process
start = time.time()
with ThreadPoolExecutor() as executor:
    results = [executor.submit(get_lat_long, address) for address in df_addresses["Street Address"]]
    for i, result in enumerate(as_completed(results)):
        df_addresses.loc[i, "Latitude"], df_addresses.loc[i, "Longitude"] = result.result()

print("Total Time taken:", format(time.time() - start, ".2f"), "seconds")

Total Time taken: 46.57 seconds


In [None]:
# Compare with just normal progress_apply
# start = time.time()
# df_addresses["Latitude"], df_addresses["Longitude"] = zip(*df_addresses["Street Address"].progress_apply(get_lat_long))
# print("Total Time taken:", format(time.time() - start, ".2f"), "seconds")

In [None]:
def retry_get_lat_long(df_addresses):
    # Create separate dataframe for missing latitude and longitude and drop them from the original dataframe
    df_missing = df_addresses[(df_addresses["Latitude"] == "") | (df_addresses["Longitude"] == "")].reset_index(drop=True)
    df_addresses = df_addresses[(df_addresses["Latitude"] != "") & (df_addresses["Longitude"] != "")].reset_index(drop=True)

    # Get the latitude and longitude of the missing street addresses
    with ThreadPoolExecutor() as executor:
        results = [executor.submit(get_lat_long, address) for address in df_missing["Street Address"]]
        for i, result in enumerate(as_completed(results)):
            df_missing.loc[i, "Latitude"], df_missing.loc[i, "Longitude"] = result.result()

    # Append the missing latitude and longitude to the original dataframe
    df_addresses = df_addresses.append(df_missing).reset_index(drop=True)
    
    return df_addresses

In [None]:
# Retry to get the latitude and longitude of the missing street addresses
start = time.time()
while True:
    df_addresses = retry_get_lat_long(df_addresses)
    if df_addresses[(df_addresses["Latitude"] == "") | (df_addresses["Longitude"] == "")].shape[0] > 0:
        print("Missing:", df_addresses[(df_addresses["Latitude"] == "") | (df_addresses["Longitude"] == "")].shape[0])
        continue
    else:
        break
print("Total Time taken:", format(time.time() - start, ".2f"), "seconds")
df_addresses

Missing: 17
Missing: 8
Missing: 5
Missing: 3
Missing: 2
Missing: 1
Total Time taken: 3.30 seconds


Unnamed: 0,Street Address,Latitude,Longitude
0,goodlink park,1.38159451793256,103.885209972539
1,jalan mata ayer,1.42756202366437,103.826112602001
2,platina road,1.3682399094409,103.888940713812
3,sembawang road,1.38168746675049,103.884323802695
4,buangkok green,1.3135755760062,103.907322802774
...,...,...,...
5019,commonweath close(blk 87),1.35952087347427,103.888340028762
5020,woodlands drive 72(blk 797),1.44365656317263,103.803047711815
5021,commonweath close(blk 83),1.35952087347427,103.888340028762
5022,commonweath drive(blk 90),1.35952087347427,103.888340028762


In [None]:
# Merge the latitude and longitude to the original dataframe
df = df.merge(df_addresses, on="Street Address", how="left")
df

Unnamed: 0,Number of Cases,Street Address,Cluster Number,Recent Cases in Cluster,Total Cases in Cluster,Date,Month Number,Latitude,Longitude
0,11,goodlink park,1,8,24,08/08/2021,8,1.38159451793256,103.885209972539
1,3,jalan mata ayer,1,8,24,08/08/2021,8,1.42756202366437,103.826112602001
2,5,platina road,1,8,24,08/08/2021,8,1.3682399094409,103.888940713812
3,5,sembawang road,1,8,24,08/08/2021,8,1.38168746675049,103.884323802695
4,1,buangkok green,2,1,4,08/08/2021,8,1.3135755760062,103.907322802774
...,...,...,...,...,...,...,...,...,...
18964,1,yishun street 21(blk 208),398,1,2,16/06/2022,6,1.27823379159423,103.789180979037
18965,1,yishun street 71(blk 729),399,1,2,16/06/2022,6,1.40347414894291,103.897938437602
18966,1,yishun street 72(blk 755),399,1,2,16/06/2022,6,1.30656471420158,103.898292204928
18967,1,yung kuang road(blk 165b),400,1,2,16/06/2022,6,1.37427380230521,103.770688640795


In [None]:
# Check for missing values in the dataframe
df.isnull().sum()

Number of Cases            0
Street Address             0
Cluster Number             0
Recent Cases in Cluster    0
Total Cases in Cluster     0
Date                       0
Month Number               0
Latitude                   0
Longitude                  0
dtype: int64

In [None]:
# Bring the latitude and longitude to column 3 and 4
df = df[["Number of Cases", "Street Address", "Latitude", "Longitude", "Cluster Number", "Recent Cases in Cluster", "Total Cases in Cluster", "Date", "Month Number"]]
# Save progress
df.to_csv("dengue2.csv", index=False)

In [24]:
# dir_path = os.path.dirname(os.path.realpath(sys.argv[0])) + "\\"
dir_path = os.getcwd() + "\\"

def set_options(xl, option):
    try:
        xl.Visible = option
        xl.ScreenUpdating = option
        xl.DisplayAlerts = option
        xl.EnableEvents = option
    except: 
        pass

def convert(file):
    file = os.path.splitext(file)[0]
    print('Converting {}...'.format(file))
    xl=win32com.client.Dispatch("Excel.Application",pythoncom.CoInitialize())
    set_options(xl, False)
    
    # check if file with .csv exists
    if not os.path.isfile(dir_path + file + '.csv'):
        try:
            wb = xl.Workbooks.Open(Filename= dir_path + file + '.xlsx',ReadOnly=1)
        except:
            wb = xl.Workbooks.Open(Filename= dir_path + file + '.xlsb',ReadOnly=1)  
        xl.ActiveWorkbook == wb
        
        #6 means csv
        wb.SaveAs(Filename= dir_path + file + '.csv', FileFormat='6') 
        set_options(xl, True)
        wb.Close(True)
        xl.Application.Quit()
        wb=xl=None
    
    df = pd.read_csv(dir_path + file + '.csv', low_memory = False, encoding='ISO-8859-1')
    df.to_pickle(dir_path + file + '.pkl')
    
    return file + '.pkl'

def read(i):
    try: 
        pkl = glob.glob(os.path.join('*{0}*.pkl'.format(i)))[0]
    except: 
        pkl = convert(glob.glob(os.path.join('*{0}*'.format(i)))[0])
    df = pd.read_pickle(dir_path + pkl)
    return df

In [25]:
df_1 = read('dengue')
df_2 = read('dengue2')

In [26]:
# Remove duplicate rows from df_2 which are defined by having the same number of cases, cluster number, recent cases in cluster, total cases in cluster, and street address
df_2 = df_2.drop_duplicates(subset=["Number of Cases", "Recent Cases in Cluster", "Total Cases in Cluster", "Street Address"], keep="first").reset_index(drop=True)

# list of unique date
date_list = df_2["Date"].unique()

from datetime import datetime
date_list = sorted(date_list, key=lambda x: datetime.strptime(x, '%d/%m/%Y'))
print(date_list)

new_list = []
for item in date_list:
    if len(new_list) == 0:
        new_list.append(item)
    else:
        if (datetime.strptime(item, '%d/%m/%Y') - datetime.strptime(new_list[-1], '%d/%m/%Y')).days >= 7:
            new_list.append(item) 
print(new_list)

# filter df_2 with better regularity
df_2 = df_2[df_2["Date"].isin(new_list)].reset_index(drop=True)
df_2

['08/03/2021', '08/08/2021', '27/10/2021', '03/12/2021', '10/03/2022', '19/03/2022', '29/03/2022', '25/04/2022', '05/06/2022', '09/06/2022', '16/06/2022', '18/06/2022', '05/08/2022', '14/08/2022', '01/09/2022', '07/09/2022', '11/09/2022', '18/09/2022', '21/09/2022']
['08/03/2021', '08/08/2021', '27/10/2021', '03/12/2021', '10/03/2022', '19/03/2022', '29/03/2022', '25/04/2022', '05/06/2022', '16/06/2022', '05/08/2022', '14/08/2022', '01/09/2022', '11/09/2022', '18/09/2022']


Unnamed: 0,Number of Cases,Street Address,Latitude,Longitude,Cluster Number,Recent Cases in Cluster,Total Cases in Cluster,Date,Month Number
0,11,goodlink park,1.381595,103.885210,1,8,24,08/08/2021,8
1,3,jalan mata ayer,1.427562,103.826113,1,8,24,08/08/2021,8
2,5,platina road,1.368240,103.888941,1,8,24,08/08/2021,8
3,5,sembawang road,1.381687,103.884324,1,8,24,08/08/2021,8
4,1,buangkok green,1.313576,103.907323,2,1,4,08/08/2021,8
...,...,...,...,...,...,...,...,...,...
12000,1,tuas view square,1.350699,103.957516,391,2,2,16/06/2022,6
12001,2,upper serangoon crescent(riversails),1.432722,103.796846,393,2,2,16/06/2022,6
12002,2,west coast way(carabelle),1.396871,103.879860,395,1,2,16/06/2022,6
12003,1,yishun street 71(blk 729),1.403474,103.897938,399,1,2,16/06/2022,6


In [27]:
# Concatenate the two dataframes
df_final = pd.concat([df_1, df_2], ignore_index=True)
df_final

Unnamed: 0,Number of Cases,Street Address,Latitude,Longitude,Cluster Number,Recent Cases in Cluster,Total Cases in Cluster,Date,Month Number
0,4,bishan street 22 (block 232),1.358286,103.845226,1,24,83,3/7/2015,7
1,7,bishan street 22 (block 233),1.358639,103.845259,1,24,83,3/7/2015,7
2,3,bishan street 22 (block 234),1.358390,103.845955,1,24,83,3/7/2015,7
3,1,bishan street 22 (block 235),1.358719,103.846477,1,24,83,3/7/2015,7
4,4,bishan street 22 (block 236),1.359041,103.846849,1,24,83,3/7/2015,7
...,...,...,...,...,...,...,...,...,...
68976,1,tuas view square,1.350699,103.957516,391,2,2,16/06/2022,6
68977,2,upper serangoon crescent(riversails),1.432722,103.796846,393,2,2,16/06/2022,6
68978,2,west coast way(carabelle),1.396871,103.879860,395,1,2,16/06/2022,6
68979,1,yishun street 71(blk 729),1.403474,103.897938,399,1,2,16/06/2022,6


In [28]:
df_lat_long = df_final[["Latitude", "Longitude"]].drop_duplicates().reset_index(drop=True)

In [29]:
# use master-plan-2019-subzone-boundary-no-sea-geojson.geojson to get the subzone
with open("master-plan-2019-subzone-boundary-no-sea-geojson.geojson") as f:
    geojson = json.load(f)

def get_subzone(lat, long):
    point = Point(long, lat)
    for feature in geojson["features"]:
        polygon = shape(feature["geometry"])
        if polygon.contains(point):
            properties = feature["properties"]["Description"]
            soup = BeautifulSoup(properties, "lxml")
            subzone = soup.find("th", text="SUBZONE_N").find_next_sibling("td").text
            planning_area = soup.find("th", text="PLN_AREA_N").find_next_sibling("td").text
            region = soup.find("th", text="REGION_N").find_next_sibling("td").text
            return subzone, planning_area, region        
    return "NA", "NA", "NA"

In [30]:
df_lat_long["Subzone"], df_lat_long["Planning Area"], df_lat_long["Region"] = zip(*df_lat_long.swifter.apply(lambda x: get_subzone(x["Latitude"], x["Longitude"]), axis=1))
df_lat_long 

Dask Apply: 100%|██████████| 16/16 [01:01<00:00,  3.87s/it]


Unnamed: 0,Latitude,Longitude,Subzone,Planning Area,Region
0,1.358286,103.845226,MARYMOUNT,BISHAN,CENTRAL REGION
1,1.358639,103.845259,MARYMOUNT,BISHAN,CENTRAL REGION
2,1.358390,103.845955,MARYMOUNT,BISHAN,CENTRAL REGION
3,1.358719,103.846477,MARYMOUNT,BISHAN,CENTRAL REGION
4,1.359041,103.846849,MARYMOUNT,BISHAN,CENTRAL REGION
...,...,...,...,...,...
14240,1.433678,103.843871,YISHUN EAST,YISHUN,NORTH REGION
14241,1.350699,103.957516,TAMPINES EAST,TAMPINES,EAST REGION
14242,1.396871,103.879860,FERNVALE,SENGKANG,NORTH-EAST REGION
14243,1.403474,103.897938,MATILDA,PUNGGOL,NORTH-EAST REGION


In [118]:
# G_MP19_LAND_USE_PL.kml is used to get the land use of each polygon
# Convert it to geojson via kml2geojson
import kml2geojson

geojson = kml2geojson.main.convert("G_MP19_LAND_USE_PL.kml")
with open("G_MP19_LAND_USE_PL.geojson", "w") as f:
    json.dump(geojson, f)


In [31]:
# use G_MP19_LAND_USE_PL.geojson to get the land use
with open("G_MP19_LAND_USE_PL.geojson") as f:
    geojson = json.load(f)[0]

def get_land_use(lat, long):
    point = Point(long, lat)
    for feature in geojson["features"]:
        polygon = shape(feature["geometry"])
        if polygon.contains(point):
            properties = feature["properties"]["description"]
            soup = BeautifulSoup(properties, "lxml")
            land_use = soup.find("th", text="LU_DESC").find_next_sibling("td").text
            return land_use
    return "NA"

In [32]:
# Function takes forever due to sheer volume of data, Maybe i should have just used Geopandas and Sjoin it
df_lat_long["Land Use"] = df_lat_long.swifter.apply(lambda x: get_land_use(x["Latitude"], x["Longitude"]), axis=1)

Dask Apply: 100%|██████████| 16/16 [2:22:44<00:00, 535.27s/it]   


In [33]:
# Merge the subzone, planning area, and region to the original dataframe
df_final = df_final.merge(df_lat_long, on=["Latitude", "Longitude"], how="left")
df_final

Unnamed: 0,Number of Cases,Street Address,Latitude,Longitude,Cluster Number,Recent Cases in Cluster,Total Cases in Cluster,Date,Month Number,Subzone,Planning Area,Region,Land Use
0,4,bishan street 22 (block 232),1.358286,103.845226,1,24,83,3/7/2015,7,MARYMOUNT,BISHAN,CENTRAL REGION,RESIDENTIAL
1,7,bishan street 22 (block 233),1.358639,103.845259,1,24,83,3/7/2015,7,MARYMOUNT,BISHAN,CENTRAL REGION,RESIDENTIAL
2,3,bishan street 22 (block 234),1.358390,103.845955,1,24,83,3/7/2015,7,MARYMOUNT,BISHAN,CENTRAL REGION,RESIDENTIAL
3,1,bishan street 22 (block 235),1.358719,103.846477,1,24,83,3/7/2015,7,MARYMOUNT,BISHAN,CENTRAL REGION,RESIDENTIAL
4,4,bishan street 22 (block 236),1.359041,103.846849,1,24,83,3/7/2015,7,MARYMOUNT,BISHAN,CENTRAL REGION,RESIDENTIAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...
68976,1,tuas view square,1.350699,103.957516,391,2,2,16/06/2022,6,TAMPINES EAST,TAMPINES,EAST REGION,RESIDENTIAL WITH COMMERCIAL AT 1ST STOREY
68977,2,upper serangoon crescent(riversails),1.432722,103.796846,393,2,2,16/06/2022,6,WOODLANDS SOUTH,WOODLANDS,NORTH REGION,RESIDENTIAL
68978,2,west coast way(carabelle),1.396871,103.879860,395,1,2,16/06/2022,6,FERNVALE,SENGKANG,NORTH-EAST REGION,RESIDENTIAL
68979,1,yishun street 71(blk 729),1.403474,103.897938,399,1,2,16/06/2022,6,MATILDA,PUNGGOL,NORTH-EAST REGION,RESIDENTIAL


In [34]:
# Save
df_final.to_csv("dengue_final.csv", index=False)

In [54]:
# Make a authentication post request to Onemap Singapore API
from dotenv import load_dotenv
load_dotenv()

email = os.getenv("EMAIL")
password = os.getenv("PASSWORD")
token = os.getenv("TOKEN")

def save_token():
    params = {
    "email": email,
    "password": password
    }

    r = requests.post("https://developers.onemap.sg/privateapi/auth/post/getToken", params=params)
    token = r.json()["access_token"]

    with open(".env", "a") as f:
        f.write("TOKEN={0}".format(token))

    return token


# Get list of planning areas
try: 
    r = requests.get("https://developers.onemap.sg/privateapi/popapi/getPlanningareaNames", params={"token": token})
except: 
    token = save_token()
    r = requests.get("https://developers.onemap.sg/privateapi/popapi/getPlanningareaNames", params={"token": token})

planning_areas = [item["pln_area_n"] for item in r.json()]


# save into dataframe
df_population = pd.DataFrame(planning_areas, columns=["Planning Area"])
df_population.head()

Unnamed: 0,Planning Area
0,ANG MO KIO
1,BEDOK
2,BISHAN
3,BOON LAY
4,BUKIT BATOK


In [55]:
# For each planning area, get the population
def get_population(year, planning_area):

    # There is an api limit of 250 per minute
    while True:
        try:
            r = requests.get("https://developers.onemap.sg/privateapi/popapi/getPopulationAgeGroup?token={0}&year={1}&planningArea={2}".format(token, year, planning_area))
            break
        except:
            time.sleep(1)
    # json contains 3 responses, one for each gender type
    # we only need the total population which is usually the second response, but we will check just in case 
    if r.json()[1]["gender"] == "Total":
        return r.json()[1]
    elif r.json()[2]["gender"] == "Total":
        return r.json()[2]
    else:
        return r.json()[0]
    

In [None]:
# Test for row 1
test_columns = get_population(2019, df_population["Planning Area"][0])

# We will have each age category as a column as well as the total population
columns = [item for item in test_columns.keys() if item.startswith("age") or item == "total"]
for column in columns:
    df_population[column] = 0

# Get the json response for each planning area via swifter in column json first
df_population["json_2019"] = df_population.swifter.apply(lambda x: get_population(2019, x["Planning Area"]), axis=1)

# Then we will populate the columns
for column in columns:
    df_population[column] = df_population.swifter.apply(lambda x: x["json_2019"][column], axis=1)


In [57]:
#move json to 3rd dataframe
df_population_all = df_population[["Planning Area", "json_2019"]].copy()
df_population.drop("json_2019", axis=1, inplace=True)

# Save
df_population.to_csv("population_2019.csv", index=False)

In [58]:
# Now we go crazy with all the years except 2019 which we already got
years = [2015, 2016, 2017, 2018, 2020, 2021]

for year in years:
    df_population_all["json_{0}".format(year)] = df_population_all.swifter.apply(lambda x: get_population(year, x["Planning Area"]), axis=1)

# Save
df_population_all = df_population_all.reindex(sorted(df_population_all.columns), axis=1)
df_population_all.to_csv("population_all.csv", index=False)

Dask Apply:   0%|          | 0/48 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/48 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/48 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/48 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/48 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/48 [00:00<?, ?it/s]