In [1]:
import numpy as np
import pandas as pd

In [2]:
# Base name of the input CSV, without the file extension; it is reused later
# to derive the name of the output file.
input_file = "20210311_v0.2-FSI"
df = pd.read_csv(input_file + ".csv")

### Dataset

The dataset used here:

- Excludes all partnerships/sole proprietorships
- Includes only entities whose Primary Section Description is `Financial and Insurance Activities`

The data appears to contain some duplicates (compare `unique` with `count` in the summary below): the same UEN shows up with different entity names, probably because the company has been renamed. This can be confirmed through www.bizfile.gov.sg.


In [4]:
# Summary statistics per column; comparing `count` with `unique` reveals duplicated values
df.describe()

Unnamed: 0,Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description
count,6389,6389,6389,6389
unique,342,6197,6382,1
top,2020-03-13,202004989E,TARGET MULTI ASSET FUND VCC,FINANCIAL AND INSURANCE ACTIVITIES
freq,60,3,2,6389


In [5]:
# Preview the loaded records (the notebook display elides the middle rows)
df

Unnamed: 0,Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description
0,2020-01-01,202000015R,JULIAN GREY PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
1,2020-01-01,202000015R,JULIAN GREY VENTURES PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
2,2020-01-01,202000017W,PRECIOUS (GLOBAL) PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
3,2020-01-01,202000018H,ALLIED STAR PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
4,2020-01-01,202000024W,LAUNCH I/O PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
...,...,...,...,...
6384,2020-12-17,T20VC0183A,SEAVI ADVENT EQUITY VII FUND VCC,FINANCIAL AND INSURANCE ACTIVITIES
6385,2020-12-23,T20VC0185D,WELLINGTON MANAGEMENT FUNDS (SINGAPORE) VCC,FINANCIAL AND INSURANCE ACTIVITIES
6386,2020-12-23,T20VC0187G,PENCO CAPITAL VCC,FINANCIAL AND INSURANCE ACTIVITIES
6387,2020-12-23,T20VC0190G,RAINMAKING VENTURES (S) VCC,FINANCIAL AND INSURANCE ACTIVITIES


In [7]:
import csv
import re
import time
from urllib.parse import urlparse, parse_qs

import bs4
import requests

# Setup User Agent headers, attempt to imitate a "browser-like" request to the webpage
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
})

# Target URL to scrape; the entity's UEN is appended as the `value` query parameter
url = "https://sgpgrid.com/search-results?target={\"value\":\"Registration Number\",\"label\":\"Registration Number\",\"searchTarget\":\"registrationNumber\"}&value="

output_file = "{}-result.csv".format(input_file)
size = len(df) # Size of dataset

# Rows are appended to output_file without a header line. Column order:
# Entity Registration Date, Entity Profile UEN, Entity Name, Primary Section Description, Description, Website, Industry

# Positions of the fields of interest among the result table cells:
# 0 is description (sometimes same as SSIC), 3 is website, 5 is SSIC
DESCRIPTION_CELL, WEBSITE_CELL, SSIC_CELL = 0, 3, 5

REQUEST_TIMEOUT_SECONDS = 30 # give up instead of hanging forever on a stalled connection
REQUEST_DELAY_SECONDS = 5 # pause between requests; maybe randomise the waiting time?


def fetch_company_info(entity_uen):
    """Look up a UEN on SGPGrid and return its (description, website, ssic) texts.

    Returns None when the results page contains fewer table cells than the
    positions read here (presumably because nothing matched the UEN), so the
    caller can skip the entity instead of failing with an IndexError.

    Raises requests.HTTPError when the server answers with an error status,
    and requests.Timeout when it does not answer within the timeout.
    """
    req = requests.get(url + entity_uen, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    req.raise_for_status()
    soup = bs4.BeautifulSoup(req.text, "html.parser")

    # Now look for the result table cells
    columns = soup.find_all("div", {"class": "rt-td table-cell"})
    if len(columns) <= SSIC_CELL:
        return None

    return (
        columns[DESCRIPTION_CELL].get_text(),
        columns[WEBSITE_CELL].get_text(),
        columns[SSIC_CELL].get_text(),
    )


# Finding Description, Website, Industry; limited to rows 6-9 (4 requests) just to test
for i in range(6, 10):
    entity = df.loc[i] # current row record
    # .iloc makes the positional access explicit; plain integer keys on a
    # label-indexed Series are deprecated in recent pandas releases.
    entity_uen = entity.iloc[1] # entity UEN
    entity_name = entity.iloc[2] # entity name

    print("Searching for id={} {} {}".format(i, entity_uen, entity_name))
    company_info = fetch_company_info(entity_uen)

    if company_info is None:
        print("No company info found for id={} {}, skipping".format(i, entity_uen))
    else:
        description, website, ssic = company_info
        print("Found company info", description, website, ssic)

        # Existing data first, then SGPGrid's data. csv.writer quotes and
        # escapes fields as needed, so commas or quotes inside an entity name
        # or a scraped value can no longer corrupt the row.
        row = list(entity.iloc[:4]) + [description, website, ssic]

        # Re-open in append mode for every row so that results gathered so far
        # are already on disk if a later request fails.
        with open(output_file, "a", newline="", encoding="utf-8") as f:
            csv.writer(f, lineterminator="\n").writerow(row)

    time.sleep(REQUEST_DELAY_SECONDS)
    





Searching for id=6 202000051D EMINENT MANAGEMENT CONSULTANTS PTE. LTD.
Found company info OTHER HOLDING COMPANIES  OTHER HOLDING COMPANIES
Searching for id=7 202000061R SEMITECH HOLDINGS PTE. LTD.
Found company info INVEST SEMICONDUTOR AND MICROELECTRONIC R&D  OTHER HOLDING COMPANIES
Searching for id=8 202000083D DAISYFIELDS HOLDINGS PTE. LTD.
Found company info INVESTMENT HOLDING  OTHER HOLDING COMPANIES
Searching for id=9 202000088W 8X CAPITAL PTE. LTD.
Found company info HEDGE FUND MANAGEMENT  HEDGE FUND MANAGEMENT


### Old Code

The following (commented-out) code attempts to crawl Google Search for the first 10 results for each entity, then writes the links of those results to the output CSV.

In [None]:

# f = open(output_file, "w")

# f.write("Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description,Link 1,Link 2,Link 3,Link 4,Link 5,Link 6,Link 7,Link 8,Link 9,Link 10\n")

# for i in range(1, 20):
#     entity = df.loc[i] # current row record
#     entity_name = entity[2] # entity name
#     query = "{} singapore website".format(entity_name)
    
#     print("Searching for id={} \"{}\"".format(i, query))
    
#     # prepare to search "<entity name> singapore website" for more localised search context
#     req = requests.get(url.format(query))
#     soup = bs4.BeautifulSoup(req.text, "html.parser")
#     headers = soup.find_all("h3")
#     links = soup.find_all(href=re.compile(r'\/url\?q=')) # pick top 10 search results & its link
    
#     # file write here
    
#     formatted_csv_row = "{},{},{},{}".format(
#         entity[0],
#         entity[1],
#         entity[2],
#         entity[3]
#     )
    
#     for i in range(10):
#         google_url = links[i].get('href')
#         parsed_url = parse_qs(google_url)
#         formatted_csv_row += ",{}".format(parsed_url['/url?q'][0])
#     f.write(formatted_csv_row + "\n")
# f.close()


