In [16]:
import numpy as np
import pandas as pd

In [17]:
input_file = "20210311_v0.2-FSI" # no need for file extension, will be used later for file output
df = pd.read_csv("{}.csv".format(input_file))

### Dataset

The dataset used here:

- Excludes all partnerships/sole proprietorships
- Includes only if `Financial and Insurance Activities` are in the Primary Section Description

Seems to have some duplicates (see `unique` vs. `count`), and the dataset shows the same UEN with varying entity names - probably because company has been renamed, can confirm this through www.bizfile.gov.sg.


In [18]:
df.describe()

Unnamed: 0,Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description
count,6389,6389,6389,6389
unique,342,6197,6382,1
top,2020-03-13,202036310W,SEVIORA HOLDINGS PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
freq,60,3,2,6389


In [19]:
df

Unnamed: 0,Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description
0,2020-01-01,202000015R,JULIAN GREY PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
1,2020-01-01,202000015R,JULIAN GREY VENTURES PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
2,2020-01-01,202000017W,PRECIOUS (GLOBAL) PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
3,2020-01-01,202000018H,ALLIED STAR PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
4,2020-01-01,202000024W,LAUNCH I/O PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
...,...,...,...,...
6384,2020-12-17,T20VC0183A,SEAVI ADVENT EQUITY VII FUND VCC,FINANCIAL AND INSURANCE ACTIVITIES
6385,2020-12-23,T20VC0185D,WELLINGTON MANAGEMENT FUNDS (SINGAPORE) VCC,FINANCIAL AND INSURANCE ACTIVITIES
6386,2020-12-23,T20VC0187G,PENCO CAPITAL VCC,FINANCIAL AND INSURANCE ACTIVITIES
6387,2020-12-23,T20VC0190G,RAINMAKING VENTURES (S) VCC,FINANCIAL AND INSURANCE ACTIVITIES


### Part 1

We first begin by crawling for the website addresses from existing data sources.

In [20]:
import requests
import bs4
import re
from urllib.parse import urlparse, parse_qs
import time
import random

In [21]:
url = "https://httpbin.org/ip"

response = requests.get(url)

my_ip = response.json()['origin']


proxies = {}
proxies[0] = {
    "http": "socks4://109.75.35.12:3629",
    "https": "socks4://109.75.35.12:3629"
}
# proxies[1] = {
#     "http": "socks4://1.2.187.126:4145",
#     "https": "socks4://1.2.187.126:4145"
# }


# proxies[2] = {
#     "http": "http://121.230.209.203:3256",
#     "https": "http://121.230.209.203:3256"
# }

proxy_count = len(proxies)

for i in range(proxy_count):
    try:
        response = requests.get(url, proxies=proxies[i])
        ip = response.json()['origin']
        print(ip)
        # if ip is not my_ip:
    except Exception as e:
        print("Proxy {} is down, error={}".format(i, e))

109.75.35.12


In [None]:
# Setup User Agent headers, attempt to imitate a "browser-like" request to the webpage
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
})

# Target URL to scrape
# url = "http://3.0.205.74/search-results?target={\"value\":\"Registration Number\",\"label\":\"Registration Number\",\"searchTarget\":\"registrationNumber\"}&value="
# url2 = "https://sgpgrid.com/search-results?target={\"value\":\"Company Name\",\"label\":\"Company Name\",\"searchTarget\":\"fullName\"}&value="
url = "https://opengovsg.com/corporate/"

output_file = "{}-result.csv".format(input_file)
size = len(df) # Size of dataset

max_tries = 15

# f.write("Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description,Description,Website,Industry\n")

# Finding Description, Website, Industry, hardcode at 5 requests just to test
for i in range(1000, size):
    entity = df.loc[i] # current row record
    entity_uen = entity[1] # entity UEN
    entity_name = entity[2] # entity name
    
    # Accessing the webpage
    print("Searching for id={} {} {}".format(i, entity_uen, entity_name))
    
    for j in range(max_tries):
        try:
            random_stuff = str(random.random()) + "&utm_source=linkedin&utm_medium=unpaid&utm_campaign="
            url_new = url + entity_uen + "?ra=" + random_stuff
            # print(url_new)
            
            
            # pick a random rotating proxy
            id = random.randint(0, proxy_count)
#             req = requests.get(url_new, headers=headers, proxies=proxies[id])
            
            # no proxy
            resp = requests.get(url_new, headers=headers, proxies=proxies[id])
#             print(resp.headers)
            
            soup = bs4.BeautifulSoup(resp.text, "html.parser")
            
            # Now look for elements
            tables = soup.find_all("table", {"class": "table table-striped table-hover table-dl"})
            

            # we know that column 0 is description (sometimes same as SSIC), 3 is website, 5 is SSIC
            
            ssic_table_data = tables[1].find_all("td")
            ssic_table_data_count = len(ssic_table_data)
            
            # Format existing data as CSV
            formatted_csv_row = "{},{},{},{}".format(
                entity[0],
                entity[1],
                entity[2],
                entity[3]
            )
            
            data = []
#             print(ssic_table_data)

            
            for k in range(1, ssic_table_data_count, 2):
                data.append(ssic_table_data[k].get_text().strip())

            # Then append scraped data...
            data_count = len(data)
            for l in range(data_count):
                formatted_csv_row += ",\"{}\"".format(data[l])
#             formatted_csv_row += ",\"{}\",\"{}\",\"{}\"".format(description, website, ssic)

            f = open(output_file, "a")
            f.write(formatted_csv_row + "\n")
            f.close()
            print("    [SUCCESS] Wrote to file... {}".format(data[0]))
    
    
            break # next for loop
        except Exception as e:
            print("    [ERROR] Failed at id={} retry={} error={}, retrying...".format(i, j, e))
            
            if j == max_tries - 1:
                f = open("{}-failed.csv".format(input_file), "a")
                f.write("{},{}\n".format(entity_uen, entity_name))
                f.close()
            continue # retry
    
    
    
    time.sleep(random.randint(1, 8)) # Maybe randomise the waiting time?
    


Searching for id=1000 202006532Z QUASAR VENTURES PTE. LTD.
    [ERROR] Failed at id=1000 retry=0 error=1, retrying...
    [ERROR] Failed at id=1000 retry=1 error=1, retrying...
    [ERROR] Failed at id=1000 retry=2 error=1, retrying...
    [ERROR] Failed at id=1000 retry=3 error=1, retrying...
    [ERROR] Failed at id=1000 retry=4 error=SOCKSHTTPSConnectionPool(host='opengovsg.com', port=443): Max retries exceeded with url: /corporate/202006532Z?ra=0.5470506584742992&utm_source=linkedin&utm_medium=unpaid&utm_campaign= (Caused by NewConnectionError('<urllib3.contrib.socks.SOCKSHTTPSConnection object at 0x000001B084BBA370>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond')), retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1001 202006534C NEW VENTURES HOLDING 

    [ERROR] Failed at id=1033 retry=10 error=SOCKSHTTPSConnectionPool(host='opengovsg.com', port=443): Max retries exceeded with url: /corporate/202006796H?ra=0.38965088557246297&utm_source=linkedin&utm_medium=unpaid&utm_campaign= (Caused by NewConnectionError('<urllib3.contrib.socks.SOCKSHTTPSConnection object at 0x000001B084C75DF0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond')), retrying...
    [ERROR] Failed at id=1033 retry=11 error=1, retrying...
    [ERROR] Failed at id=1033 retry=12 error=list index out of range, retrying...
    [ERROR] Failed at id=1033 retry=13 error=1, retrying...
    [ERROR] Failed at id=1033 retry=14 error=list index out of range, retrying...
Searching for id=1034 202006797Z VISTA INVESTMENT HOLDINGS PTE. LTD.
    [ERROR] Failed at id=1034 retry=0 error=1, retrying.

    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1071 202007131G AVACARA PTE. LTD.
    [ERROR] Failed at id=1071 retry=0 error=1, retrying...
    [ERROR] Failed at id=1071 retry=1 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1072 202007133H GRAND WELL INVESTMENTS PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1073 202007137M PROTOZOA HOLDINGS PTE. LTD.
    [ERROR] Failed at id=1073 retry=0 error=1, retrying...
    [ERROR] Failed at id=1073 retry=1 error=SOCKSHTTPSConnectionPool(host='opengovsg.com', port=443): Max retries exceeded with url: /corporate/202007137M?ra=0.7777940642074449&utm_source=linkedin&utm_medium=unpaid&utm_campaign= (Caused by NewConnectionError('<urllib3.contrib.socks.SOCKSHTTPSConnection object at 0x000001B085609F70>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not prope

Searching for id=1106 202007358W MESO CAPITAL PTE. LTD.
    [ERROR] Failed at id=1106 retry=0 error=1, retrying...
    [ERROR] Failed at id=1106 retry=1 error=1, retrying...
    [ERROR] Failed at id=1106 retry=2 error=1, retrying...
    [SUCCESS] Wrote to file... 66305: VENTURE CAPITAL ACTIVITIES
Searching for id=1107 202007362E LIBERALIZE PTE. LTD.
    [SUCCESS] Wrote to file... 66195: TRANSACTION/PAYMENT PROCESSING SERVICES
Searching for id=1108 202007366H ASVDK HOLDINGS PTE. LTD.
    [ERROR] Failed at id=1108 retry=0 error=1, retrying...
    [ERROR] Failed at id=1108 retry=1 error=1, retrying...
    [ERROR] Failed at id=1108 retry=2 error=1, retrying...
    [ERROR] Failed at id=1108 retry=3 error=SOCKSHTTPSConnectionPool(host='opengovsg.com', port=443): Max retries exceeded with url: /corporate/202007366H?ra=0.35654565159001217&utm_source=linkedin&utm_medium=unpaid&utm_campaign= (Caused by NewConnectionError('<urllib3.contrib.socks.SOCKSHTTPSConnection object at 0x000001B085078250>:

    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1146 202007647W INTERAIR ACQUISITION HOLDINGS PTE. LTD.
    [ERROR] Failed at id=1146 retry=0 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1147 202007649Z EVERTON TRUST PTE. LTD.
    [ERROR] Failed at id=1147 retry=0 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1148 202007652R PROVISE CENTRE SG PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1149 202007668N KUSANAGI MAJOR HOLDINGS PTE. LTD.
    [ERROR] Failed at id=1149 retry=0 error=1, retrying...
    [ERROR] Failed at id=1149 retry=1 error=1, retrying...
    [ERROR] Failed at id=1149 retry=2 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1150 202007675N AI INVEST PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1151 

    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1177 202007848W JP88 HOLDINGS PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1178 202007862W SU LIN INVESTMENT CORPORATION PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1179 202007890N PI CAPITAL PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1180 202007911W P9 CAPITAL PTE. LTD.
    [ERROR] Failed at id=1180 retry=0 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1181 202007914K ETFDAO FOUNDATION PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1182 202007919E NEW SIGHT CAPITAL PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1183 202007922C ABSOLUTE CONFIDENCE INDONESIA PTE. LTD.
    [ERROR] Failed at id=1183 retry=0 error=1, retrying...
    [SUCCESS] W

    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1213 202008044N REF ADVISORS PRIVATE LIMITED
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1214 202008071Z THE COCOA LEAF PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1215 202008074M ROBERN ASSOCIATES (SG) PTE. LTD.
    [ERROR] Failed at id=1215 retry=0 error=1, retrying...
    [SUCCESS] Wrote to file... 66199: OTHER ACTIVITIES AUXILIARY TO FINANCIAL SERVICE ACTIVITIES N.E.C. (EG MORTGAGE ADVISORY FIRMS)
Searching for id=1216 202008083N AR14 HOLDINGS PTE. LTD.
    [ERROR] Failed at id=1216 retry=0 error=1, retrying...
    [ERROR] Failed at id=1216 retry=1 error=1, retrying...
    [ERROR] Failed at id=1216 retry=2 error=1, retrying...
    [ERROR] Failed at id=1216 retry=3 error=1, retrying...
    [ERROR] Failed at id=1216 retry=4 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1217 

    [ERROR] Failed at id=1248 retry=14 error=list index out of range, retrying...
Searching for id=1249 202008299C GALAXY READERS PTE. LTD.
    [ERROR] Failed at id=1249 retry=0 error=1, retrying...
    [ERROR] Failed at id=1249 retry=1 error=1, retrying...
    [ERROR] Failed at id=1249 retry=2 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1250 202008300D SCORPIO ASSET HOLDINGS PTE. LTD.
    [ERROR] Failed at id=1250 retry=0 error=1, retrying...
    [ERROR] Failed at id=1250 retry=1 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1251 202008304G VP-ZACD FUND HOLDINGS PTE. LTD.
    [ERROR] Failed at id=1251 retry=0 error=SOCKSHTTPSConnectionPool(host='opengovsg.com', port=443): Max retries exceeded with url: /corporate/202008304G?ra=0.9527264249059382&utm_source=linkedin&utm_medium=unpaid&utm_campaign= (Caused by NewConnectionError('<urllib3.contrib.socks.SOCKSHTTPSConnection object

    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1280 202008436C RATA PTE. LTD.
    [ERROR] Failed at id=1280 retry=0 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1281 202008437M SIL INTERNATIONAL PTE. LIMITED
    [ERROR] Failed at id=1281 retry=0 error=1, retrying...
    [ERROR] Failed at id=1281 retry=1 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1282 202008438D HAISIGHT HOLDINGS PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1283 202008443C SCVN PRIVATE LIMITED
    [ERROR] Failed at id=1283 retry=0 error=1, retrying...
    [ERROR] Failed at id=1283 retry=1 error=SOCKSHTTPSConnectionPool(host='opengovsg.com', port=443): Max retries exceeded with url: /corporate/202008443C?ra=0.144760365475061&utm_source=linkedin&utm_medium=unpaid&utm_campaign= (Caused by NewConnectionError('<urllib3.contrib.socks.SOCKSHT

    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1315 202008599E DEROYCE HOLDINGS (SINGAPORE) PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1316 202008606M DZT RESEARCH PRIVATE LIMITED
    [ERROR] Failed at id=1316 retry=0 error=1, retrying...
    [ERROR] Failed at id=1316 retry=1 error=1, retrying...
    [ERROR] Failed at id=1316 retry=2 error=1, retrying...
    [SUCCESS] Wrote to file... 66199: OTHER ACTIVITIES AUXILIARY TO FINANCIAL SERVICE ACTIVITIES N.E.C. (EG MORTGAGE ADVISORY FIRMS)
Searching for id=1317 202008608N MAGNETAR CORPORATION PTE. LTD.
    [ERROR] Failed at id=1317 retry=0 error=1, retrying...
    [ERROR] Failed at id=1317 retry=1 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1318 202008615N ROICO PTE. LTD.
    [ERROR] Failed at id=1318 retry=0 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1

    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1343 202008747H HOWX8 PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1344 202008751R MEISHAN STNA DEVELOPMENT PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1345 202008752G BLUE-OX PTE. LTD.
    [ERROR] Failed at id=1345 retry=0 error=1, retrying...
    [ERROR] Failed at id=1345 retry=1 error=1, retrying...
    [SUCCESS] Wrote to file... 66306: SINGLE/MULTIPLE FAMILY OFFICES ACTIVITIES (E.G. MANAGING INVESTMENTS AND TRUSTS FOR A SINGLE OR MULTIPLE FAMILIES)
Searching for id=1346 202008755Z GIANT STEPS PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1347 202008780D SHINELAKE PTE. LTD.
    [SUCCESS] Wrote to file... 64300: TRUSTS, FUNDS AND SIMILAR FINANCIAL ENTITIES (EG COLLECTIVE PORTFOLIO INVESTMENT FUNDS (EXCLUDING THOSE WITH RENTAL INCOME))
Searching for id=1348 202008781N STAROCEAN

    [ERROR] Failed at id=1377 retry=7 error=list index out of range, retrying...
    [ERROR] Failed at id=1377 retry=8 error=list index out of range, retrying...
    [ERROR] Failed at id=1377 retry=9 error=1, retrying...
    [ERROR] Failed at id=1377 retry=10 error=list index out of range, retrying...
    [ERROR] Failed at id=1377 retry=11 error=1, retrying...
    [ERROR] Failed at id=1377 retry=12 error=1, retrying...
    [ERROR] Failed at id=1377 retry=13 error=1, retrying...
    [ERROR] Failed at id=1377 retry=14 error=1, retrying...
Searching for id=1378 202008922W GJ OMNI PTE. LTD.
    [SUCCESS] Wrote to file... 46900: WHOLESALE TRADE OF A VARIETY OF GOODS WITHOUT A DOMINANT PRODUCT
Searching for id=1379 202008927M HERMETIA BIO SCIENCE PTE. LTD.
    [ERROR] Failed at id=1379 retry=0 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1380 202008931Z GOAL INTERNATIONAL INVESTMENT PTE. LTD.
    [ERROR] Failed at id=1380 retry=0 error=1

    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1408 202009159M CLF2 SINGAPORE (32) PTE. LTD.
    [ERROR] Failed at id=1408 retry=0 error=SOCKSHTTPSConnectionPool(host='opengovsg.com', port=443): Max retries exceeded with url: /corporate/202009159M?ra=0.11177558638873009&utm_source=linkedin&utm_medium=unpaid&utm_campaign= (Caused by NewConnectionError('<urllib3.contrib.socks.SOCKSHTTPSConnection object at 0x000001B0850E4730>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond')), retrying...
    [ERROR] Failed at id=1408 retry=1 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1409 202009161W CLF2 SINGAPORE (33) PTE. LTD.
    [ERROR] Failed at id=1409 retry=0 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHE

    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1449 202009421K BTO SKYY BIDCO SINGAPORE HOLDING PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1450 202009422C BTO SKYY SINGAPORE HOLDING PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1451 202009430M IMARATU CAPITAL PTE. LTD.
    [SUCCESS] Wrote to file... 66306: SINGLE/MULTIPLE FAMILY OFFICES ACTIVITIES (E.G. MANAGING INVESTMENTS AND TRUSTS FOR A SINGLE OR MULTIPLE FAMILIES)
Searching for id=1452 202009431D ALAMEDA RESEARCH PTE. LTD.
    [ERROR] Failed at id=1452 retry=0 error=1, retrying...
    [ERROR] Failed at id=1452 retry=1 error=1, retrying...
    [ERROR] Failed at id=1452 retry=2 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1453 202009434R ASTER ASIA HOLDINGS I PTE. LTD.
    [ERROR] Failed at id=1453 retry=0 error=1, retrying...
    [ERROR] Failed at id=1453 ret

    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1487 202009638H FIRST DUTCH INVESTMENT COMPANY PTE. LTD.
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1488 202009647K BLH (INFRA) PTE. LTD.
    [ERROR] Failed at id=1488 retry=0 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1489 202009648C BLH (6) PTE. LTD.
    [ERROR] Failed at id=1489 retry=0 error=1, retrying...
    [ERROR] Failed at id=1489 retry=1 error=1, retrying...
    [ERROR] Failed at id=1489 retry=2 error=1, retrying...
    [ERROR] Failed at id=1489 retry=3 error=1, retrying...
    [SUCCESS] Wrote to file... 64202: OTHER HOLDING COMPANIES
Searching for id=1490 202009684H REEM TAKAFUL HOLDING PTE. LTD.
    [SUCCESS] Wrote to file... 70201: MANAGEMENT CONSULTANCY SERVICES (GENERAL)
Searching for id=1491 202009685Z SINGAPORE ENLIGHTENED POWER PTE. LTD.
    [ERROR] Failed at id=1491 retry=0 error=1, retrying...
   

### Old Codes

The following code attempts to:

- crawl Google Search for the first 10 results
- then it grabs the URLs so that we can crawl them again for the metadata

In [None]:

# f = open(output_file, "w")

# f.write("Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description,Link 1,Link 2,Link 3,Link 4,Link 5,Link 6,Link 7,Link 8,Link 9,Link 10\n")

# for i in range(1, 20):
#     entity = df.loc[i] # current row record
#     entity_name = entity[2] # entity name
#     query = "{} singapore website".format(entity_name)
    
#     print("Searching for id={} \"{}\"".format(i, query))
    
#     # prepare to search "<entity name> singapore website" for more localised search context
#     req = requests.get(url.format(query))
#     soup = bs4.BeautifulSoup(req.text, "html.parser")
#     headers = soup.find_all("h3")
#     links = soup.find_all(href=re.compile(r'\/url\?q=')) # pick top 10 search results & its link
    
#     # file write here
    
#     formatted_csv_row = "{},{},{},{}".format(
#         entity[0],
#         entity[1],
#         entity[2],
#         entity[3]
#     )
    
#     for i in range(10):
#         google_url = links[i].get('href')
#         parsed_url = parse_qs(google_url)
#         formatted_csv_row += ",{}".format(parsed_url['/url?q'][0])
#     f.write(formatted_csv_row + "\n")
# f.close()


