In [90]:
# NEA Link
link = "https://www.nea.gov.sg/dengue-zika/dengue/dengue-clusters"

# Importing Libraries
import requests, pandas as pd, time
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
tqdm.pandas()

In [91]:
# Get all snapshots of NEA's dengue cases via wayback machine from 2020 to 2022
def get_snapshots(link):
    # Use Wayback server to get all snapshots of the link
    wayback = "https://web.archive.org/cdx/search/cdx?url=" + link + "&output=json&fl=timestamp,original&collapse=digest&filter=statuscode:200"
    response = requests.get(wayback)
    data = response.json()
    snapshots = data[1:]
    # Keep only snapshots from 2021 to 2022
    snapshots = [snap for snap in snapshots if snap[0][:4] in ["2021", "2022"]]
    # Keep only timestamps
    snapshots = [snap[0] for snap in snapshots]
    return snapshots

In [92]:
# Get the snapshots
snapshots = get_snapshots(link)
snapshots

['20210308151403',
 '20210808172247',
 '20211027051137',
 '20211203081908',
 '20220310082530',
 '20220319234337',
 '20220329152026',
 '20220425143805',
 '20220605020619',
 '20220609053540',
 '20220616104319',
 '20220618163415',
 '20220805162654',
 '20220814023504',
 '20220901011605',
 '20220907192246',
 '20220911051635',
 '20220918172033',
 '20220921205322']

In [93]:
def get_soup(link, timestamp):
    # Get the snapshot
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "*",
    "Connection": "keep-alive"
    }
    snapshot = "https://web.archive.org/web/" + timestamp + "/" + link
    response = requests.get(snapshot, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # print(snapshot)
    return soup

In [94]:
def extract_clusters(soup):
    # In the soup, we want to extract the table with the class "dengue-fixed-table"
    table = soup.find_all("table", class_="dengue-fixed-table")[0]

    # The number of clusters is the number of times data-row-table appears
    # We want to extract the number of clusters
    clusters = len(table.find_all("table", {"data-row-table": True}))
    # print("Number of clusters:", clusters, "\n")
    return clusters, table

def extract_cluster(cluster_no, table, snapshot_no):
    # Let's extract the data from the first cluster
    rows = table.find_all("tr", {"data-row": cluster_no})[-1]
    # print("Cluster No.:", cluster_no)

    # the last 2 td tags with a number is the total number of cases and recent cases
    total = int(rows.find_all("td")[-1].text)
    recent = int(rows.find_all("td")[-2].text)
    # print("Total cases:", total)   
    # print("Recent cases:", recent)

    # remove first 3 and last 2 td tags
    rows = rows.find_all("td")[3:-2]

    # The remaining td tags are sorted as such:
    # Tags containing numbers are the number of cases of each area in the cluster
    # Tags containing text are the names of the areas in the cluster
    # We want to extract the names of the areas and the number of cases
    areas = []
    cases = []
    for i in range(len(rows)):
        if i % 2 == 0:
            areas.append(rows[i].text)
        else:
            cases.append(int(rows[i].text))
    # print("Areas:", areas)
    # print("Cases:", cases)

    # Get the date and month number
    date = snapshots[snapshot_no][:8]
    date = pd.to_datetime(date, format="%Y/%m/%d")
    month = date.month
    # print("Date:", date)
    # print("Month:", month, "\n")

    # Create a dictionary to store the data
    data = {"Number of Cases": cases, "Street Address": areas, "Cluster Number": cluster_no, "Recent Cases in Cluster": recent, "Total Cases in Cluster": total, "Date": date, "Month Number": month}
    df = pd.DataFrame(data)
    df = df.explode("Street Address")
    return df

In [95]:

def extract_snapshot(snapshot):
    # Extract the data from the first snapshot
    soup = get_soup(link, snapshots[snapshot])
    clusters, table = extract_clusters(soup)

    # Create a dataframe to store the data
    df = pd.DataFrame(columns=["Number of Cases", "Street Address", "Cluster Number", "Recent Cases in Cluster", "Total Cases in Cluster", "Date", "Month Number"])

    # from 1 to the number of clusters, keep in mind NEA is stupid sometimes, there may be skipped cluster numbers
    startindex = 1
    endindex = clusters + 1
    while True:
        try:
            df = df.append(extract_cluster(startindex, table, snapshot))
            startindex += 1            
        except:
            if startindex == endindex:
                break
            else:
                startindex += 1
                endindex += 1
                continue           

    return df

In [96]:
# Extract the data from all snapshots
df = pd.DataFrame(columns=["Number of Cases", "Street Address", "Cluster Number", "Recent Cases in Cluster", "Total Cases in Cluster", "Date", "Month Number"])
start = time.time()

# Use concurrent.futures to speed up the process
with ThreadPoolExecutor(max_workers=10) as executor:
    results = [executor.submit(extract_snapshot, snapshot) for snapshot in range(len(snapshots))]

    for i in as_completed(results):
        df = df.append(i.result())

# Change data format to slashes and change street address to lowercase
df["Date"] = df["Date"].dt.strftime("%d/%m/%Y")
df["Street Address"] = df["Street Address"].str.lower()

print("Total Time taken:", format(time.time() - start, ".2f"), "seconds")
df

Total Time taken: 87.91 seconds


Unnamed: 0,Number of Cases,Street Address,Cluster Number,Recent Cases in Cluster,Total Cases in Cluster,Date,Month Number
0,11,goodlink park,1,8,24,08/08/2021,8
1,3,jalan mata ayer,1,8,24,08/08/2021,8
2,5,platina road,1,8,24,08/08/2021,8
3,5,sembawang road,1,8,24,08/08/2021,8
0,1,buangkok green,2,1,4,08/08/2021,8
...,...,...,...,...,...,...,...
1,1,yishun street 21(blk 208),398,1,2,16/06/2022,6
0,1,yishun street 71(blk 729),399,1,2,16/06/2022,6
1,1,yishun street 72(blk 755),399,1,2,16/06/2022,6
0,1,yung kuang road(blk 165b),400,1,2,16/06/2022,6


In [97]:
# Save progress
df.to_csv("dengue2.csv", index=False)

In [98]:
# Based on Singapore street address, we can extract the latitude and longitude of the street address
def get_lat_long(address):
    trash = [[")", ""],["(blk ", " "],["(", " "],["cl0se","close"]]
    for i in trash:
        address = address.replace(i[0], i[1]).strip()

    # Get the latitude and longitude of the address
    req = "https://developers.onemap.sg/commonapi/search?searchVal={address}&returnGeom=Y&getAddrDetails=N&pageNum=1"
    headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "*",
    "Connection": "keep-alive"
    }
    # Request might time out, so we need to try again
    while True:
        try:
            response = requests.get(req.format(address=address), headers=headers).json()
            break
        except:
            time.sleep(1)
            continue
    
    # If the address is not found, return ""
    try:
        if len(response['results'])>0:
            return response['results'][0]['LATITUDE'], response['results'][0]['LONGITUDE']
        else:
            return "", ""
    except:
        return "", ""


# get only unique street addresses as separate dataframe
df_addresses = df[["Street Address"]].drop_duplicates().reset_index(drop=True)
df_addresses.shape

(5024, 1)

In [99]:
# get the latitude and longitude of the street addresses
# Use threading to speed up the process
start = time.time()
with ThreadPoolExecutor() as executor:
    results = [executor.submit(get_lat_long, address) for address in df_addresses["Street Address"]]
    for i, result in enumerate(as_completed(results)):
        df_addresses.loc[i, "Latitude"], df_addresses.loc[i, "Longitude"] = result.result()

print("Total Time taken:", format(time.time() - start, ".2f"), "seconds")

Total Time taken: 46.57 seconds


In [100]:
# Compare with just normal progress_apply
# start = time.time()
# df_addresses["Latitude"], df_addresses["Longitude"] = zip(*df_addresses["Street Address"].progress_apply(get_lat_long))
# print("Total Time taken:", format(time.time() - start, ".2f"), "seconds")

In [101]:
def retry_get_lat_long(df_addresses):
    # Create separate dataframe for missing latitude and longitude and drop them from the original dataframe
    df_missing = df_addresses[(df_addresses["Latitude"] == "") | (df_addresses["Longitude"] == "")].reset_index(drop=True)
    df_addresses = df_addresses[(df_addresses["Latitude"] != "") & (df_addresses["Longitude"] != "")].reset_index(drop=True)

    # Get the latitude and longitude of the missing street addresses
    with ThreadPoolExecutor() as executor:
        results = [executor.submit(get_lat_long, address) for address in df_missing["Street Address"]]
        for i, result in enumerate(as_completed(results)):
            df_missing.loc[i, "Latitude"], df_missing.loc[i, "Longitude"] = result.result()

    # Append the missing latitude and longitude to the original dataframe
    df_addresses = df_addresses.append(df_missing).reset_index(drop=True)
    
    return df_addresses

In [102]:
# Retry to get the latitude and longitude of the missing street addresses
start = time.time()
while True:
    df_addresses = retry_get_lat_long(df_addresses)
    if df_addresses[(df_addresses["Latitude"] == "") | (df_addresses["Longitude"] == "")].shape[0] > 0:
        print("Missing:", df_addresses[(df_addresses["Latitude"] == "") | (df_addresses["Longitude"] == "")].shape[0])
        continue
    else:
        break
print("Total Time taken:", format(time.time() - start, ".2f"), "seconds")
df_addresses

Missing: 17
Missing: 8
Missing: 5
Missing: 3
Missing: 2
Missing: 1
Total Time taken: 3.30 seconds


Unnamed: 0,Street Address,Latitude,Longitude
0,goodlink park,1.38159451793256,103.885209972539
1,jalan mata ayer,1.42756202366437,103.826112602001
2,platina road,1.3682399094409,103.888940713812
3,sembawang road,1.38168746675049,103.884323802695
4,buangkok green,1.3135755760062,103.907322802774
...,...,...,...
5019,commonweath close(blk 87),1.35952087347427,103.888340028762
5020,woodlands drive 72(blk 797),1.44365656317263,103.803047711815
5021,commonweath close(blk 83),1.35952087347427,103.888340028762
5022,commonweath drive(blk 90),1.35952087347427,103.888340028762


In [104]:
# Merge the latitude and longitude to the original dataframe
df = df.merge(df_addresses, on="Street Address", how="left")
df

Unnamed: 0,Number of Cases,Street Address,Cluster Number,Recent Cases in Cluster,Total Cases in Cluster,Date,Month Number,Latitude,Longitude
0,11,goodlink park,1,8,24,08/08/2021,8,1.38159451793256,103.885209972539
1,3,jalan mata ayer,1,8,24,08/08/2021,8,1.42756202366437,103.826112602001
2,5,platina road,1,8,24,08/08/2021,8,1.3682399094409,103.888940713812
3,5,sembawang road,1,8,24,08/08/2021,8,1.38168746675049,103.884323802695
4,1,buangkok green,2,1,4,08/08/2021,8,1.3135755760062,103.907322802774
...,...,...,...,...,...,...,...,...,...
18964,1,yishun street 21(blk 208),398,1,2,16/06/2022,6,1.27823379159423,103.789180979037
18965,1,yishun street 71(blk 729),399,1,2,16/06/2022,6,1.40347414894291,103.897938437602
18966,1,yishun street 72(blk 755),399,1,2,16/06/2022,6,1.30656471420158,103.898292204928
18967,1,yung kuang road(blk 165b),400,1,2,16/06/2022,6,1.37427380230521,103.770688640795


In [105]:
# Check for missing values in the dataframe
df.isnull().sum()

Number of Cases            0
Street Address             0
Cluster Number             0
Recent Cases in Cluster    0
Total Cases in Cluster     0
Date                       0
Month Number               0
Latitude                   0
Longitude                  0
dtype: int64

In [106]:
# Bring the latitude and longitude to column 3 and 4
df = df[["Number of Cases", "Street Address", "Latitude", "Longitude", "Cluster Number", "Recent Cases in Cluster", "Total Cases in Cluster", "Date", "Month Number"]]
# Save progress
df.to_csv("dengue2.csv", index=False)