In [None]:
# pip install -U python-dotenv

In [None]:
# pip install cloudscraper

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Base name of the input CSV, without the extension; it is reused later to
# build the names of the output files.
input_file = "20210311_v0.2"
raw_csv_path = "{}.csv".format(input_file)
df = pd.read_csv(raw_csv_path)

### Dataset

The dataset appears to contain duplicates (compare `unique` with `count` below). The same UEN shows up under different entity names, probably because the company has been renamed; this can be confirmed through www.bizfile.gov.sg.


In [3]:
# Per-column summary; a `unique` value lower than `count` means repeated values.
df.describe()

Unnamed: 0,Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description
count,24245,24245,24245,24245
unique,362,23231,23899,3
top,2020-09-16,202028206K,ASIA PAAS HOLDINGS PTE. LTD.,"PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES"
freq,198,5,3,10638


In [4]:
# Display the DataFrame loaded from the input CSV.
df

Unnamed: 0,Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description
0,2020-01-01,53407676M,GRAVITY FILM,INFORMATION AND COMMUNICATIONS
1,2020-01-01,53407679C,SMARTMOUTH,"PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES"
2,2020-01-01,53407682C,INNOVIC TECHNOLOGY,"PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES"
3,2020-01-01,53407694K,SUPREM9 SOLUTIONS,"PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES"
4,2020-01-01,53407706D,THE LOVERS,"PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES"
...,...,...,...,...
24240,2020-12-17,T20VC0183A,SEAVI ADVENT EQUITY VII FUND VCC,FINANCIAL AND INSURANCE ACTIVITIES
24241,2020-12-23,T20VC0185D,WELLINGTON MANAGEMENT FUNDS (SINGAPORE) VCC,FINANCIAL AND INSURANCE ACTIVITIES
24242,2020-12-23,T20VC0187G,PENCO CAPITAL VCC,FINANCIAL AND INSURANCE ACTIVITIES
24243,2020-12-23,T20VC0190G,RAINMAKING VENTURES (S) VCC,FINANCIAL AND INSURANCE ACTIVITIES


So let's remove the duplicates, keeping only the last row for each UEN that appears more than once...

In [5]:
# Flag every row whose UEN appears again further down, so that only the last
# occurrence of each UEN is left unflagged, then keep the unflagged rows.
is_repeated_uen = df.duplicated(subset=["Entity Profile UEN"], keep="last")
df = df[~is_repeated_uen]
df

Unnamed: 0,Entity Registration Date,Entity Profile UEN,Entity Name,Primary Section Description
0,2020-01-01,53407676M,GRAVITY FILM,INFORMATION AND COMMUNICATIONS
1,2020-01-01,53407679C,SMARTMOUTH,"PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES"
2,2020-01-01,53407682C,INNOVIC TECHNOLOGY,"PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES"
3,2020-01-01,53407694K,SUPREM9 SOLUTIONS,"PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES"
4,2020-01-01,53407706D,THE LOVERS,"PROFESSIONAL, SCIENTIFIC AND TECHNICAL ACTIVITIES"
...,...,...,...,...
24240,2020-12-17,T20VC0183A,SEAVI ADVENT EQUITY VII FUND VCC,FINANCIAL AND INSURANCE ACTIVITIES
24241,2020-12-23,T20VC0185D,WELLINGTON MANAGEMENT FUNDS (SINGAPORE) VCC,FINANCIAL AND INSURANCE ACTIVITIES
24242,2020-12-23,T20VC0187G,PENCO CAPITAL VCC,FINANCIAL AND INSURANCE ACTIVITIES
24243,2020-12-23,T20VC0190G,RAINMAKING VENTURES (S) VCC,FINANCIAL AND INSURANCE ACTIVITIES


In [None]:
# Re-write the data to a new "<input_file>-filtered.csv" file; the DataFrame's
# index is not included.
filtered_output_path = "{}-filtered.csv".format(input_file)
df.to_csv(filtered_output_path, index=False)

### Part 1

We begin by crawling existing data sources for the entities' website addresses.

In [6]:
import csv
import os
import random
import re
import time
import unittest
from urllib.parse import urlparse, parse_qs

import bs4
import requests
from dotenv import load_dotenv

In [7]:
# From here on `input_file` names the filtered CSV (no extension); it is also
# used later to build the names of the output files.
input_file = "20210311_v0.2-filtered"
scrape_input_path = "{}.csv".format(input_file)
df = pd.read_csv(scrape_input_path)

In [8]:
# Load variables from a local .env file, if present, into the environment.
load_dotenv()

# Proxy addresses as a single comma-separated string; None when not configured.
PROXIES = os.environ.get("PROXIES")

if PROXIES is None or len(PROXIES) == 0:
    print("No proxies found, please enter them in CSV format in .env")

In [9]:
# Look up this machine's public IP. `my_ip` is only referenced by the
# commented-out proxy check further down.
url = "https://httpbin.org/ip"

# A timeout keeps the notebook from hanging forever if the service is unreachable.
response = requests.get(url, timeout=30)
my_ip = response.json()['origin']

# Split our proxies into a list, e.g. ["proxy1.com", "proxy2.com"].
# PROXIES is None when the variable is missing from .env, so fall back to an
# empty string instead of crashing on None.split(); blank entries are dropped.
proxies_string = [p.strip() for p in (PROXIES or "").split(",") if p.strip()]

# Proxy id -> mapping in the form `requests` expects for its `proxies` argument.
proxies = {
    i: {"http": p, "https": p}
    for i, p in enumerate(proxies_string)
}

proxy_count = len(proxies)

In [10]:
# Checking if proxies are ok
# for i in range(proxy_count):
#     try:
#         response = requests.get(url, proxies=proxies[i])
#         ip = response.json()['origin']
#         # print(ip)
#         # if ip is not my_ip:
#     except Exception as e:
#         print("Proxy {} is down, error={}".format(i, e))

In [11]:
def get_random_user_agent():
    """Return one browser User-Agent string chosen at random.

    Returns:
        str: a single User-Agent header value taken from the list below.
    """
    # Every entry must end with a comma: Python silently joins adjacent string
    # literals, which would merge two User-Agents into one invalid value.
    ua_strings = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
        "Mozilla/5.0 (iPad; CPU OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (Linux; Android 10; SM-A205U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 10; SM-N960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Mobile Safari/537.36",
    ]

    return random.choice(ua_strings)

In [12]:
def get_entity_url_slug(entity_name):
    """Turn an entity name into a lower-case, hyphen-separated URL slug.

    Every character other than ASCII letters, digits and spaces is removed
    (so "PRO-FIN" becomes "profin"), and the remaining words are joined with
    single hyphens.

    Args:
        entity_name (str): the entity name, e.g. "ES SOLUTIONS (S) PTE. LTD.".

    Returns:
        str: the slug, e.g. "es-solutions-s-pte-ltd"; an empty string when
        nothing usable is left.
    """
    kept_chars = re.sub("[^a-zA-Z0-9 ]", "", entity_name.lower())
    # split() with no argument collapses runs of spaces and ignores spaces at
    # either end, so the slug never starts or ends with a hyphen.
    return "-".join(kept_chars.split())

class TestEntityUrlSlug(unittest.TestCase):
    """Checks get_entity_url_slug against representative entity names."""

    def test_brackets(self):
        self.assertEqual(get_entity_url_slug("POLY GLOBAL COMMERCIAL (AU) HOLDINGS PTE. LTD."), "poly-global-commercial-au-holdings-pte-ltd")

    def test_brackets_2(self):
        self.assertEqual(get_entity_url_slug("ES SOLUTIONS (S) PTE. LTD."), "es-solutions-s-pte-ltd")

    def test_brackets_3(self):
        self.assertEqual(get_entity_url_slug("JEM(S) WATERPLUS PTE. LTD."), "jems-waterplus-pte-ltd")

    def test_special_characters(self):
        self.assertEqual(get_entity_url_slug("G+ PTE. LTD."), "g-pte-ltd")

    def test_period_in_entity_name(self):
        self.assertEqual(get_entity_url_slug("IK.SG PTE. LTD."), "iksg-pte-ltd")

    def test_period_in_entity_name_2(self):
        self.assertEqual(get_entity_url_slug("JIUZHANG TECHNOLOGIES (S.E.A.) PTE. LTD."), "jiuzhang-technologies-sea-pte-ltd")

    def test_dashes_and_ampersands(self):
        self.assertEqual(get_entity_url_slug("PRO-FIN CONSULT & COMPANY PTE. LTD."), "profin-consult-company-pte-ltd")

    def test_pte_limited(self):
        self.assertEqual(get_entity_url_slug("ASIA OPPORTUNITIES (SINGAPORE) PTE. LIMITED"), "asia-opportunities-singapore-pte-limited")

    # A second, identical `test_brackets_2` used to be defined here. A repeated
    # method name replaces the earlier definition, so it added no coverage.


# argv=[''] keeps unittest from reading the notebook kernel's command line, and
# exit=False keeps it from shutting the kernel down once the tests finish.
unittest.main(argv=[''], verbosity=2, exit=False)

test_brackets (__main__.TestEntityUrlSlug) ... ok
test_brackets_2 (__main__.TestEntityUrlSlug) ... ok
test_brackets_3 (__main__.TestEntityUrlSlug) ... ok
test_dashes_and_ampersands (__main__.TestEntityUrlSlug) ... ok
test_period_in_entity_name (__main__.TestEntityUrlSlug) ... ok
test_period_in_entity_name_2 (__main__.TestEntityUrlSlug) ... ok
test_pte_limited (__main__.TestEntityUrlSlug) ... ok
test_special_characters (__main__.TestEntityUrlSlug) ... ok

----------------------------------------------------------------------
Ran 8 tests in 0.013s

OK


<unittest.main.TestProgram at 0x262a2c0fb50>

In [13]:
import cloudscraper

In [None]:
# Scrape SGPGrid's company-details page for every entity in the selected range.
# Each entity is tried up to `max_tries` times through a randomly chosen proxy.
# Successful rows are appended to the result CSV; entities that fail on every
# attempt are appended to the "-failed" CSV.

# Target URL to scrape
url = "https://sgpgrid.com/company-details/"
output_file = "{}-result.csv".format(input_file)
failed_file = "{}-failed.csv".format(input_file)
size = len(df) # Size of dataset

start = 79
end = start+1 # size

max_tries = 15
request_timeout = 30 # seconds; without a timeout a stalled request blocks the loop forever


def append_csv_row(path, values):
    """Append one row to the CSV file at `path`, with every field quoted.

    The csv module escapes quotes inside values, and the `with` block closes
    the file even if the write fails.
    """
    with open(path, "a", newline="", encoding="utf-8") as f:
        csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator="\n").writerow(values)


print("Beginning job with id={} of size={}".format(start, size))


scraper = cloudscraper.create_scraper()

for i in range(start, end):
    entity = df.iloc[i] # current row record
    entity_uen = entity.iloc[1] # entity UEN
    entity_name = entity.iloc[2] # entity name

    entity_url_slug = get_entity_url_slug(entity_name)

    # First four columns of the input row; they start every row written out
    entity_row = [entity.iloc[0], entity.iloc[1], entity.iloc[2], entity.iloc[3]]

    # Accessing the webpage
    print("Searching for id={} {} {}".format(i, entity_uen, entity_name))

    for j in range(max_tries):
        # Pick a random rotating proxy. randrange excludes its upper bound, so
        # the id is always a valid key of `proxies`; with no proxies configured
        # the request goes out directly.
        proxy_id = random.randrange(proxy_count) if proxy_count else None
        request_proxies = proxies[proxy_id] if proxy_id is not None else None

        try:
            random_stuff = "?ra={}&utm_source=facebook&utm_medium=facebook&utm_campaign=".format(str(random.random()))
            url_new = url + entity_url_slug + random_stuff
            print(url_new)

            # Now, we query the target URL using the chosen proxy only
            # (not the whole proxy table)
            resp = scraper.get(url_new, proxies=request_proxies, timeout=request_timeout)

            # Use BeautifulSoup to parse the HTML
            soup = bs4.BeautifulSoup(resp.text, "html.parser")

            # Now look for the specific elements
            columns = soup.find_all("div", {"class": "rt-td table-cell"})

            # NOTE(review): cell positions 0/3/5 are copied from the older,
            # commented-out version of this scraper - confirm against the live
            # page. A page without these cells raises IndexError and is retried.
            description = columns[0].get_text()
            website = columns[3].get_text()
            ssic = columns[5].get_text()

            # Finally we write out to file by appending, with SGPGrid's data
            # added after the entity's own columns
            append_csv_row(output_file, entity_row + [description, website, ssic])

            print("    [SUCCESS] Found company info for id={} on retry={}".format(i, j), description, website, ssic)

            break # next entity
        except Exception as e:
            print("    [ERROR] Failed at id={} retry={} error={} using proxy id={}, retrying...".format(i, j, e, proxy_id))

            # Out of retries: record the entity so it can be re-run later
            if j == max_tries - 1:
                append_csv_row(failed_file, entity_row)

    time.sleep(random.randint(1, 8)) # Randomise the waiting time


Beginning job with id=79 of size=23231
Searching for id=79 53408070K ALPYX
https://sgpgrid.com/company-details/alpyx?ra=0.7510683232325807&utm_source=facebook&utm_medium=facebook&utm_campaign=
    [ERROR] Failed at id=79 retry=0 error=Detected a Cloudflare version 2 Captcha challenge, This feature is not available in the opensource (free) version. using proxy id=12, retrying...
https://sgpgrid.com/company-details/alpyx?ra=0.8674663678430043&utm_source=facebook&utm_medium=facebook&utm_campaign=
    [ERROR] Failed at id=79 retry=1 error=Detected a Cloudflare version 2 Captcha challenge, This feature is not available in the opensource (free) version. using proxy id=10, retrying...
https://sgpgrid.com/company-details/alpyx?ra=0.22067073318082442&utm_source=facebook&utm_medium=facebook&utm_campaign=
    [ERROR] Failed at id=79 retry=2 error=Detected a Cloudflare version 2 Captcha challenge, This feature is not available in the opensource (free) version. using proxy id=5, retrying...
https:/

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
AssertionError
Traceback (most recent call last):
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
TypeError: object of type 'NoneType' has no len()

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
AttributeError: 'TypeError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:


### Old Code

The following code attempts to:

- crawl Google Search for the first 10 results
- then it grabs the URLs so that we can crawl them again for the metadata

In [None]:
# # Target URL to scrape

# url = "https://www.google.com/search?q={}&sourceid=chrome&ie=UTF-8"
# output_file = "{}-result.csv".format(input_file)
# size = len(df) # Size of dataset

# max_tries = 15

# for i in range(0, size):
#     entity = df.loc[i] # current row record
#     entity_uen = entity[1] # entity UEN
#     entity_name = entity[2] # entity name
    
#     # Randomise search terms
#     search_terms = [
#         "{} singapore website".format(entity_name),
#         "{} sg website".format(entity_name)
#     ]
    
#     search_terms_count = len(search_terms)
    
#     # Prepare the CSV to write out
#     formatted_csv_row = "\"{}\",\"{}\",\"{}\",\"{}\"".format(
#         entity[0],
#         entity[1],
#         entity[2],
#         entity[3]
#     )

#     # Accessing the webpage
#     print("Searching for id={} {} {}".format(i, entity_uen, entity_name))
    
#     for j in range(max_tries):
#         try:
#             id = random.randint(0, search_terms_count)
#             search_term = search_terms[id]
#             url_new = url.format(search_term)
            
#             # Setup User Agent headers, attempt to imitate a "browser-like" request to the webpage
#             headers = requests.utils.default_headers()
#             headers.update({
#                 'User-Agent': get_random_user_agent()
#             })

#             # Pick a random rotating proxy
#             id = random.randint(0, proxy_count)
            
#             # Now, we query the target URL using a random proxy
#             resp = requests.get(url_new, headers=headers, proxies=proxies[id])
            
#             # Use BeautifulSoup to parse the HTML
#             soup = bs4.BeautifulSoup(resp.text, "html.parser")
#             print(soup)
            
#             # Now look for the specific elements
#             headers = soup.find_all("h3")
#             links = soup.find_all(href=re.compile(r'\/url\?q=')) # pick top 10 search results & its link
            
#             for i in range(10):
#                 google_url = links[i].get("href").strip()
#                 parsed_url = parse_qs(google_url)
#                 formatted_csv_row += ",{}".format(parsed_url["/url?q"][0])
            
#             # Finally we write out to file by appending
#             f = open(output_file, "a")
#             f.write(formatted_csv_row + "\n")
#             f.close()
            
#             print("    [SUCCESS] Wrote to file for id={} on retry={}".format(i, j))
    
#             break # next for loop
#         except Exception as e:
#             print("    [ERROR] Failed at id={} retry={} error={}, retrying...".format(i, j, e))
            
#             if j == max_tries - 1:
#                 f = open("{}-failed.csv".format(input_file), "a")
#                 f.write(formatted_csv_row)
#                 f.close()
#             continue # retry

#     time.sleep(random.randint(1, 3)) # Randomise the waiting time


In [None]:
# # Target URL to scrape

# url = "http://3.0.205.74/search-results?target={\"value\":\"Registration Number\",\"label\":\"Registration Number\",\"searchTarget\":\"registrationNumber\"}&value="
# output_file = "{}-result.csv".format(input_file)
# size = len(df) # Size of dataset

# start = 70
# print("Beginning job with id={} of size={}".format(start, size))

# max_tries = 15

# for i in range(start, size):
#     entity = df.loc[i] # current row record
#     entity_uen = entity[1] # entity UEN
#     entity_name = entity[2] # entity name
    
#     # Prepare the CSV to write out
#     formatted_csv_row = "\"{}\",\"{}\",\"{}\",\"{}\"".format(
#         entity[0],
#         entity[1],
#         entity[2],
#         entity[3]
#     )

#     # Accessing the webpage
#     print("Searching for id={} {} {}".format(i, entity_uen, entity_name))
    
#     for j in range(max_tries):
#         # Pick a random rotating proxy
#         id = random.randint(0, proxy_count)
        
#         try:
#             random_stuff = "&ra={}&utm_source=facebook&utm_medium=facebook&utm_campaign=".format(str(random.random()))
#             url_new = url + entity_uen + random_stuff

#             # Setup User Agent headers, attempt to imitate a "browser-like" request to the webpage
#             headers = requests.utils.default_headers()
#             headers.update({
#                 "User-Agent": get_random_user_agent()
#             })
            
#             # Now, we query the target URL using a random proxy
#             resp = requests.get(url_new, headers=headers, proxies=proxies[id])
            
#             # Use BeautifulSoup to parse the HTML
#             soup = bs4.BeautifulSoup(resp.text, "html.parser")
#             # print(soup)
            
#             # Now look for the specific elements
#             columns = soup.find_all("div", {"class": "rt-td table-cell"})
            
#             description = columns[0].get_text()
#             website = columns[3].get_text()
#             ssic = columns[5].get_text()
            
#             # Then append SGPGrid's data...
#             formatted_csv_row += ",\"{}\",\"{}\",\"{}\"".format(description, website, ssic)

#             # Finally we write out to file by appending
#             f = open(output_file, "a")
#             f.write(formatted_csv_row + "\n")
#             f.close()
            
#             print("    [SUCCESS] Found company info for id={} on retry={}".format(i, j), description, website, ssic)
    
#             break # next for loop
#         except Exception as e:
#             print("    [ERROR] Failed at id={} retry={} error={} using proxy id={}, retrying...".format(i, j, e, id))
            
#             if j == max_tries - 1:
#                 f = open("{}-failed.csv".format(input_file), "a")
#                 f.write(formatted_csv_row + "\n")
#                 f.close()
#             continue # retry

#     time.sleep(random.randint(1, 8)) # Randomise the waiting time
