In [13]:
import numpy as np
import pandas as pd

In [14]:
input_file = "20210311_v0.2-FSI" # no need for file extension, will be used later for file output
df = pd.read_csv("{}.csv".format(input_file))

### Dataset

The dataset used here:

- Excludes all partnerships/sole proprietorships
- Includes only if `Financial and Insurance Activities` are in the Primary Section Description

Seems to have some duplicates (see `unique` vs. `count`), and the dataset shows the same UEN with varying entity names - probably because company has been renamed, can confirm this through www.bizfile.gov.sg.


In [15]:
df.describe()

Unnamed: 0,Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description
count,6389,6389,6389,6389
unique,342,6197,6382,1
top,2020-03-13,202024182H,MRJS INCORPORATED PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
freq,60,3,2,6389


In [16]:
df

Unnamed: 0,Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description
0,2020-01-01,202000015R,JULIAN GREY PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
1,2020-01-01,202000015R,JULIAN GREY VENTURES PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
2,2020-01-01,202000017W,PRECIOUS (GLOBAL) PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
3,2020-01-01,202000018H,ALLIED STAR PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
4,2020-01-01,202000024W,LAUNCH I/O PTE. LTD.,FINANCIAL AND INSURANCE ACTIVITIES
...,...,...,...,...
6384,2020-12-17,T20VC0183A,SEAVI ADVENT EQUITY VII FUND VCC,FINANCIAL AND INSURANCE ACTIVITIES
6385,2020-12-23,T20VC0185D,WELLINGTON MANAGEMENT FUNDS (SINGAPORE) VCC,FINANCIAL AND INSURANCE ACTIVITIES
6386,2020-12-23,T20VC0187G,PENCO CAPITAL VCC,FINANCIAL AND INSURANCE ACTIVITIES
6387,2020-12-23,T20VC0190G,RAINMAKING VENTURES (S) VCC,FINANCIAL AND INSURANCE ACTIVITIES


### Part 1

We first begin by crawling for the website addresses from existing data sources.

In [17]:
import requests
import bs4
import re
from urllib.parse import urlparse, parse_qs
import time
import random

In [21]:
url = "https://httpbin.org/ip"

response = requests.get(url)

my_ip = response.json()['origin']


proxies = {}
proxies[0] = {
    "http": "socks4://1.2.187.126:4145",
    "https": "socks4://1.2.187.126:4145"
}
# proxies[1] = {
#     "http": "socks4://46.225.242.179:4145",
#     "https": "socks4://46.225.242.179:4145"
# }

# proxies[2] = {
#     "http": "http://121.230.209.203:3256",
#     "https": "http://121.230.209.203:3256"
# }

proxy_count = len(proxies)

for i in range(proxy_count):
    try:
        response = requests.get(url, proxies=proxies[i])
        ip = response.json()['origin']
        print(ip)
        # if ip is not my_ip:
    except:
        print("Proxy {} is down.".format(i))



1.2.187.126


In [25]:
# Setup User Agent headers, attempt to imitate a "browser-like" request to the webpage
headers = requests.utils.default_headers()
headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
})

# Target URL to scrape
url = "http://3.0.205.74/search-results?target={\"value\":\"Registration Number\",\"label\":\"Registration Number\",\"searchTarget\":\"registrationNumber\"}&value="
# url2 = "https://sgpgrid.com/search-results?target={\"value\":\"Company Name\",\"label\":\"Company Name\",\"searchTarget\":\"fullName\"}&value="

output_file = "{}-result.csv".format(input_file)
size = len(df) # Size of dataset

max_tries = 10

# f.write("Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description,Description,Website,Industry\n")

# Finding Description, Website, Industry, hardcode at 5 requests just to test
for i in range(225, 300):
    entity = df.loc[i] # current row record
    entity_uen = entity[1] # entity UEN
    entity_name = entity[2] # entity name
    
    # Accessing the webpage
    print("Searching for id={} {} {}".format(i, entity_uen, entity_name))
    
    for j in range(max_tries):
        try:
            random_stuff = str(random.random()) + "&utm_source=linkedin&utm_medium=unpaidsoc&utm_campaign="
            url_new = url + entity_uen + "&ra=" + random_stuff
            # print(url_new)
            
            
            # pick a random rotating proxy
            id = random.randint(0, proxy_count)
            req = requests.get(url_new, headers=headers, proxies=proxies[id])
            
            soup = bs4.BeautifulSoup(req.text, "html.parser")
            
            # Now look for elements
            columns = soup.find_all("div", {"class": "rt-td table-cell"})

            # we know that column 0 is description (sometimes same as SSIC), 3 is website, 5 is SSIC

            # print(soup)

            description = columns[0].get_text()
            website = columns[3].get_text()
            ssic = columns[5].get_text()

            print("Found company info", description, website, ssic)

            # Format existing data as CSV
            formatted_csv_row = "{},{},{},{}".format(
                entity[0],
                entity[1],
                entity[2],
                entity[3]
            )

            # Then append SGPGrid's data...
            formatted_csv_row += ",\"{}\",\"{}\",\"{}\"".format(description, website, ssic)

            f = open(output_file, "a")
            f.write(formatted_csv_row + "\n")
            f.close()
    
    
            break # next for loop
        except Exception as e:
            print("[ERROR] Failed at id={} retry={} error={}, retrying...".format(i, j, e))
            continue # retry
    
    
    
    time.sleep(random.randint(1, 8)) # Maybe randomise the waiting time?
    


Searching for id=225 202001741H ZACD (TRIBE) PTE. LTD.
Found company info TRUSTS, FUNDS AND SIMILAR FINANCIAL ENTITIES (EG COLLECTIVE PORTFOLIO INVESTMENT FUNDS (EXCLUDING THOSE WITH RENTAL INCOME))  TRUSTS, FUNDS AND SIMILAR FINANCIAL ENTITIES (EG COLLECTIVE PORTFOLIO INVESTMENT FUNDS (EXCLUDING THOSE WITH RENTAL INCOME))
Searching for id=226 202001746D CEDRUS PTE. LTD.
[ERROR] Failed at id=226 retry=0 error=SOCKSHTTPConnectionPool(host='3.0.205.74', port=80): Max retries exceeded with url: /search-results?target=%7B%22value%22:%22Registration%20Number%22,%22label%22:%22Registration%20Number%22,%22searchTarget%22:%22registrationNumber%22%7D&value=202001746D&ra=0.5434976294233533&utm_source=linkedin&utm_medium=unpaidsoc&utm_campaign= (Caused by NewConnectionError('<urllib3.contrib.socks.SOCKSConnection object at 0x000002A0A004CCD0>: Failed to establish a new connection: [WinError 10054] An existing connection was forcibly closed by the remote host')), retrying...
[ERROR] Failed at id=2

Found company info MANAGEMENT CONSULTANCY SERVICES (GENERAL)  MANAGEMENT CONSULTANCY SERVICES (GENERAL)
Searching for id=248 202001869R SERENDIPITY CAPITAL FUND MANAGEMENT COMPANY PTE. LTD.
[ERROR] Failed at id=248 retry=0 error=1, retrying...
[ERROR] Failed at id=248 retry=1 error=1, retrying...
[ERROR] Failed at id=248 retry=2 error=SOCKSHTTPConnectionPool(host='3.0.205.74', port=80): Max retries exceeded with url: /search-results?target=%7B%22value%22:%22Registration%20Number%22,%22label%22:%22Registration%20Number%22,%22searchTarget%22:%22registrationNumber%22%7D&value=202001869R&ra=0.7837657013281383&utm_source=linkedin&utm_medium=unpaidsoc&utm_campaign= (Caused by NewConnectionError('<urllib3.contrib.socks.SOCKSConnection object at 0x000002A0A030C1F0>: Failed to establish a new connection: [WinError 10054] An existing connection was forcibly closed by the remote host')), retrying...
[ERROR] Failed at id=248 retry=3 error=SOCKSHTTPConnectionPool(host='3.0.205.74', port=80): Max re

Found company info TRADITIONAL/LONG-ONLY ASSET/PORTFOLIO MANAGEMENT  TRADITIONAL/LONG-ONLY ASSET/PORTFOLIO MANAGEMENT
Searching for id=266 202001956E KEJORA PARTNERS GLOBAL PTE. LTD.
Found company info OTHER HOLDING COMPANIES  OTHER HOLDING COMPANIES
Searching for id=267 202001958G IHANISAN PTE. LTD.
[ERROR] Failed at id=267 retry=0 error=SOCKSHTTPConnectionPool(host='3.0.205.74', port=80): Max retries exceeded with url: /search-results?target=%7B%22value%22:%22Registration%20Number%22,%22label%22:%22Registration%20Number%22,%22searchTarget%22:%22registrationNumber%22%7D&value=202001958G&ra=0.37134134201811086&utm_source=linkedin&utm_medium=unpaidsoc&utm_campaign= (Caused by NewConnectionError('<urllib3.contrib.socks.SOCKSConnection object at 0x000002A0A034CFD0>: Failed to establish a new connection: [WinError 10054] An existing connection was forcibly closed by the remote host')), retrying...
Found company info OTHER HOLDING COMPANIES  OTHER HOLDING COMPANIES
Searching for id=268 2020

Searching for id=290 202002097M RECO TURMERIC PRIVATE LIMITED
[ERROR] Failed at id=290 retry=0 error=1, retrying...
[ERROR] Failed at id=290 retry=1 error=1, retrying...
Found company info INVESTMENT HOLDING COMPANY  OTHER HOLDING COMPANIES
Searching for id=291 202002111Z YES FULL CIRCLE SOLUTIONS HOLDINGS PTE. LTD.
[ERROR] Failed at id=291 retry=0 error=1, retrying...
Found company info HOLDING COMPANY  OTHER HOLDING COMPANIES
Searching for id=292 202002112K CHANG CHENG HOLDINGS PTE. LTD.
Found company info OTHER HOLDING COMPANIES  OTHER HOLDING COMPANIES
Searching for id=293 202002112K DK126 PTE. LTD.
[ERROR] Failed at id=293 retry=0 error=1, retrying...
[ERROR] Failed at id=293 retry=1 error=1, retrying...
Found company info OTHER HOLDING COMPANIES  OTHER HOLDING COMPANIES
Searching for id=294 202002119G HDG INVESTMENT PTE. LTD.
Found company info OTHER HOLDING COMPANIES  OTHER HOLDING COMPANIES
Searching for id=295 202002129Z DILIGENT SHINE PTE. LTD.
[ERROR] Failed at id=295 retry=

### Old Codes

The following code attempts to:

- crawl Google Search for the first 10 results
- then it grabs the URLs so that we can crawl them again for the metadata

In [None]:

# f = open(output_file, "w")

# f.write("Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description,Link 1,Link 2,Link 3,Link 4,Link 5,Link 6,Link 7,Link 8,Link 9,Link 10\n")

# for i in range(1, 20):
#     entity = df.loc[i] # current row record
#     entity_name = entity[2] # entity name
#     query = "{} singapore website".format(entity_name)
    
#     print("Searching for id={} \"{}\"".format(i, query))
    
#     # prepare to search "<entity name> singapore website" for more localised search context
#     req = requests.get(url.format(query))
#     soup = bs4.BeautifulSoup(req.text, "html.parser")
#     headers = soup.find_all("h3")
#     links = soup.find_all(href=re.compile(r'\/url\?q=')) # pick top 10 search results & its link
    
#     # file write here
    
#     formatted_csv_row = "{},{},{},{}".format(
#         entity[0],
#         entity[1],
#         entity[2],
#         entity[3]
#     )
    
#     for i in range(10):
#         google_url = links[i].get('href')
#         parsed_url = parse_qs(google_url)
#         formatted_csv_row += ",{}".format(parsed_url['/url?q'][0])
#     f.write(formatted_csv_row + "\n")
# f.close()


