In [None]:
# pip install -U python-dotenv

In [1]:
import numpy as np
import pandas as pd

In [None]:
input_file = "20210311_v0.2" # no need for file extension, will be used later for file output
df = pd.read_csv("{}.csv".format(input_file))

### Dataset

The dataset used here seems to have some duplicates (see `unique` vs. `count`), and the dataset shows the same UEN with varying entity names - probably because company has been renamed, can confirm this through www.bizfile.gov.sg.


In [None]:
df.describe()

In [None]:
df

So let's remove the duplicates, and only take the last item of entities with duplicate entries...

In [None]:
df = df.drop_duplicates(subset="Entity Profile UEN", keep="last")
df

In [None]:
df.to_csv("{}-filtered.csv".format(input_file), index=False) # re-write to new file, don't include dataframe's index

### Part 1

We first begin by crawling for the website addresses from existing data sources.

In [2]:
import requests
import bs4
import re
import time
import random
import os
from urllib.parse import urlparse, parse_qs
from dotenv import load_dotenv

In [3]:
input_file = "20210311_v0.2-filtered" # no need for file extension, will be used later for file output
df = pd.read_csv("{}.csv".format(input_file))

In [4]:
load_dotenv()

PROXIES = os.getenv("PROXIES")

if not PROXIES:
    print("No proxies found, please enter them in CSV format in .env")

In [5]:
url = "https://httpbin.org/ip"

response = requests.get(url)
my_ip = response.json()['origin']

proxies = {}

proxies_string = PROXIES.split(",") # split our proxies into array e.g. ["proxy1.com", "proxy2.com"]

for i, p in enumerate(proxies_string):
    proxies[i] = {
        "http": p,
        "https": p
    }

proxy_count = len(proxies)

In [6]:
# Checking if proxies are ok
# for i in range(proxy_count):
#     try:
#         response = requests.get(url, proxies=proxies[i])
#         ip = response.json()['origin']
#         # print(ip)
#         # if ip is not my_ip:
#     except Exception as e:
#         print("Proxy {} is down, error={}".format(i, e))

In [7]:
def get_random_user_agent():
    ua_strings = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
        "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
        "Mozilla/5.0 (iPad; CPU OS 14_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (Linux; Android 10; SM-A205U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 10; SM-N960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Mobile Safari/537.36"
    ]
 
    return random.choice(ua_strings)

In [11]:
# Target URL to scrape

url = "http://3.0.205.74/search-results?target={\"value\":\"Registration Number\",\"label\":\"Registration Number\",\"searchTarget\":\"registrationNumber\"}&value="
output_file = "{}-result.csv".format(input_file)
size = len(df) # Size of dataset

start = 70
print("Beginning job with id={} of size={}".format(start, size))

max_tries = 15

for i in range(start, size):
    entity = df.loc[i] # current row record
    entity_uen = entity[1] # entity UEN
    entity_name = entity[2] # entity name
    
    # Prepare the CSV to write out
    formatted_csv_row = "\"{}\",\"{}\",\"{}\",\"{}\"".format(
        entity[0],
        entity[1],
        entity[2],
        entity[3]
    )

    # Accessing the webpage
    print("Searching for id={} {} {}".format(i, entity_uen, entity_name))
    
    for j in range(max_tries):
        # Pick a random rotating proxy
        id = random.randint(0, proxy_count)
        
        try:
            random_stuff = "&ra={}&utm_source=facebook&utm_medium=facebook&utm_campaign=".format(str(random.random()))
            url_new = url + entity_uen + random_stuff

            # Setup User Agent headers, attempt to imitate a "browser-like" request to the webpage
            headers = requests.utils.default_headers()
            headers.update({
                "User-Agent": get_random_user_agent()
            })
            
            # Now, we query the target URL using a random proxy
            resp = requests.get(url_new, headers=headers, proxies=proxies[id])
            
            # Use BeautifulSoup to parse the HTML
            soup = bs4.BeautifulSoup(resp.text, "html.parser")
            # print(soup)
            
            # Now look for the specific elements
            columns = soup.find_all("div", {"class": "rt-td table-cell"})
            
            description = columns[0].get_text()
            website = columns[3].get_text()
            ssic = columns[5].get_text()
            
            # Then append SGPGrid's data...
            formatted_csv_row += ",\"{}\",\"{}\",\"{}\"".format(description, website, ssic)

            # Finally we write out to file by appending
            f = open(output_file, "a")
            f.write(formatted_csv_row + "\n")
            f.close()
            
            print("    [SUCCESS] Found company info for id={} on retry={}".format(i, j), description, website, ssic)
    
            break # next for loop
        except Exception as e:
            print("    [ERROR] Failed at id={} retry={} error={} using proxy id={}, retrying...".format(i, j, e, id))
            
            if j == max_tries - 1:
                f = open("{}-failed.csv".format(input_file), "a")
                f.write(formatted_csv_row + "\n")
                f.close()
            continue # retry

    time.sleep(random.randint(1, 8)) # Randomise the waiting time


Beginning job with id=70 of size=23231
Searching for id=70 53408030L WOMEN ON TOP MEDIA
    [ERROR] Failed at id=70 retry=0 error=HTTPSConnectionPool(host='sgpgrid.com', port=443): Max retries exceeded with url: /too-many (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 400 Bad Request'))) using proxy id=11, retrying...
    [ERROR] Failed at id=70 retry=1 error=list index out of range using proxy id=8, retrying...
    [ERROR] Failed at id=70 retry=2 error=list index out of range using proxy id=2, retrying...
    [ERROR] Failed at id=70 retry=3 error=list index out of range using proxy id=2, retrying...
    [SUCCESS] Found company info for id=70 on retry=4 PUBLISHING OF BOOKS, BROCHURES, MUSICAL BOOKS AND OTHER PUBLICATIONS  PUBLISHING OF BOOKS, BROCHURES, MUSICAL BOOKS AND OTHER PUBLICATIONS
Searching for id=71 53408031J STAR MUSICAL
    [SUCCESS] Found company info for id=71 on retry=0 MUSIC  SOUND RECORDING PRODUCTION
Searching for id=72 53408034X 

    [ERROR] Failed at id=77 retry=7 error=list index out of range using proxy id=9, retrying...
    [ERROR] Failed at id=77 retry=8 error=list index out of range using proxy id=4, retrying...
    [ERROR] Failed at id=77 retry=9 error=list index out of range using proxy id=8, retrying...
    [ERROR] Failed at id=77 retry=10 error=list index out of range using proxy id=2, retrying...
    [ERROR] Failed at id=77 retry=11 error=list index out of range using proxy id=2, retrying...
    [ERROR] Failed at id=77 retry=12 error=12 using proxy id=12, retrying...
    [ERROR] Failed at id=77 retry=13 error=list index out of range using proxy id=0, retrying...
    [ERROR] Failed at id=77 retry=14 error=list index out of range using proxy id=3, retrying...
Searching for id=78 53408068E EZ WEB DEVELOPMENT
    [ERROR] Failed at id=78 retry=0 error=12 using proxy id=12, retrying...
    [ERROR] Failed at id=78 retry=1 error=list index out of range using proxy id=1, retrying...
    [ERROR] Failed at id=7

KeyboardInterrupt: 

### Old Codes

The following code attempts to:

- crawl Google Search for the first 10 results
- then it grabs the URLs so that we can crawl them again for the metadata

In [None]:
# # Target URL to scrape

# url = "https://www.google.com/search?q={}&sourceid=chrome&ie=UTF-8"
# output_file = "{}-result.csv".format(input_file)
# size = len(df) # Size of dataset

# max_tries = 15

# for i in range(0, size):
#     entity = df.loc[i] # current row record
#     entity_uen = entity[1] # entity UEN
#     entity_name = entity[2] # entity name
    
#     # Randomise search terms
#     search_terms = [
#         "{} singapore website".format(entity_name),
#         "{} sg website".format(entity_name)
#     ]
    
#     search_terms_count = len(search_terms)
    
#     # Prepare the CSV to write out
#     formatted_csv_row = "\"{}\",\"{}\",\"{}\",\"{}\"".format(
#         entity[0],
#         entity[1],
#         entity[2],
#         entity[3]
#     )

#     # Accessing the webpage
#     print("Searching for id={} {} {}".format(i, entity_uen, entity_name))
    
#     for j in range(max_tries):
#         try:
#             id = random.randint(0, search_terms_count)
#             search_term = search_terms[id]
#             url_new = url.format(search_term)
            
#             # Setup User Agent headers, attempt to imitate a "browser-like" request to the webpage
#             headers = requests.utils.default_headers()
#             headers.update({
#                 'User-Agent': get_random_user_agent()
#             })

#             # Pick a random rotating proxy
#             id = random.randint(0, proxy_count)
            
#             # Now, we query the target URL using a random proxy
#             resp = requests.get(url_new, headers=headers, proxies=proxies[id])
            
#             # Use BeautifulSoup to parse the HTML
#             soup = bs4.BeautifulSoup(resp.text, "html.parser")
#             print(soup)
            
#             # Now look for the specific elements
#             headers = soup.find_all("h3")
#             links = soup.find_all(href=re.compile(r'\/url\?q=')) # pick top 10 search results & its link
            
#             for i in range(10):
#                 google_url = links[i].get("href").strip()
#                 parsed_url = parse_qs(google_url)
#                 formatted_csv_row += ",{}".format(parsed_url["/url?q"][0])
            
#             # Finally we write out to file by appending
#             f = open(output_file, "a")
#             f.write(formatted_csv_row + "\n")
#             f.close()
            
#             print("    [SUCCESS] Wrote to file for id={} on retry={}".format(i, j))
    
#             break # next for loop
#         except Exception as e:
#             print("    [ERROR] Failed at id={} retry={} error={}, retrying...".format(i, j, e))
            
#             if j == max_tries - 1:
#                 f = open("{}-failed.csv".format(input_file), "a")
#                 f.write(formatted_csv_row)
#                 f.close()
#             continue # retry

#     time.sleep(random.randint(1, 3)) # Randomise the waiting time
