In [1]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()

import sqlite3

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import json
import random

Define Class objects for Bing Search and Web scraping

In [34]:
class BingSearchAPI():

    def __init__(self) -> None:

        self.verbose = True
        self.n_webpages_to_scrape = 5

        # replace with list of keys once i run out
        self.subscription_key_free  = os.getenv("BING_SEARCH_API_KEY_FREE_1")
        self.subscription_key_paid = os.getenv("BING_SEARCH_API_KEY_PAID_1")
        self.headers = {"Ocp-Apim-Subscription-Key": self.subscription_key_free}             
            
        self.search_url = "https://api.bing.microsoft.com/v7.0/search"

        self.params = {
            "q": "Insert Query here",
            "count": 20,  # Number of search results to return
            "offset": 0,  # The offset for pagination
            "mkt": "en-US",  # Region
            "safesearch": "Moderate"  # Safe search filter
        }

        self.search_results = {}
        self.webpages = {}

    def switch_api_keys(self) -> None:
        if self.headers["Ocp-Apim-Subscription-Key"] == self.subscription_key_free:
            self.headers["Ocp-Apim-Subscription-Key"] = self.subscription_key_paid
        elif self.headers["Ocp-Apim-Subscription-Key"] == self.subscription_key_paid:
            print("ERROR: Both API keys have reached their limit")

    def search(self, query: str) -> dict:
        self.webpages = {}
    
        if self.verbose: print("Conducting bing search for : ", query)
        
        self.params["q"] = query
        response = requests.get(self.search_url, headers=self.headers, params=self.params)

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(f"Request failed: {e}")
            return "Bing Search has failed"
        
        # handle api credit limit by switching to paid api key
        if response.status_code == 403:
            print("API key has reached its limit")
            self.switch_api_keys()
            response = requests.get(self.search_url, headers=self.headers, params=self.params)
        
        self.search_results = response.json()
    
        return self.search_results
    
    def get_top_webpages(self) -> list:
        if self.verbose: print("Getting urls of top webpages")
        
        # get dict of site names and urls
        for result in self.search_results['webPages']['value']:
            # print("Result is ", result)
            # print(result["siteName"])
            if "siteName" in result.keys():
                self.webpages[result["siteName"]] = result["url"]
            elif "name" in result.keys():
                self.webpages[result["name"]] = result["url"]
            else:
                pass

            if len(self.webpages) >= self.n_webpages_to_scrape:
                break
            
        print("debug: length of webpages is ", len(self.webpages))
        return self.webpages
    

In [47]:
class SeleniumExtractionError(Exception):
    """Custom exception for Selenium extraction errors."""
    pass

class WebScraper():
    def __init__(self) -> None:
        # Set up Chrome driver with webdriver manager
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')  # Run headless for no browser window
        self.options.add_argument('--disable-gpu')  # Disable GPU acceleration
        self.options.add_argument('--no-sandbox')  # Required for some Linux environments

        # Automatically download and use ChromeDriver
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=self.options)

        # for using requests
        self.requests_headers =  {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }

    def extract_text_with_selenium(self,url):
        try:
            # set timeout
            self.driver.set_page_load_timeout(5)

            # Open the URL in the browser
            print("Selenium: getting url")
            self.driver.get(url)
            time.sleep(0.1)
 
            # Wait for the page body to be present (max 5 seconds)
            WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
            print("Selenium: found body")
            # Get page source and parse it after it has fully loaded
            page_source = self.driver.page_source
            print("Selenium: found page source")
            soup = BeautifulSoup(page_source, 'html.parser')
            text = soup.get_text(separator='\n')
            return text
        
        except TimeoutException:
            error_message = "Page load exceeded time limit of 5 seconds"
            print(f"An error occurred: {error_message}")
            return "Page contents not loaded in time"

        except Exception as e:
            error_message = str(e)
            print(f"An error occurred: {error_message}")
            if "disconnected: not connected to DevTools" in error_message:
                print(f"Error occurred: {error_message}")
                raise SeleniumExtractionError(f"DevTools disconnection error for URL: {url}")
            else:
                raise SeleniumExtractionError(f"An unexpected error occurred while extracting: {error_message}")
            
        
    def extract_text_with_requests(self,url):
        try:
            # Fetch the content from the URL with headers
            response = requests.get(url, headers=self.requests_headers)
            response.raise_for_status()  # Check if the request was successful

            # Parse text
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text(separator='\n')  # Using '\n' to preserve some structure
            return text
        except Exception as e:
            print(f"Error fetching the URL: {e}")
            return "No content"
        
    def scrape_webpage(self, url: str) -> str:
        # use both methods and return the one that works
        # print("Extracting webpage with selenium")
        text_1 = self.extract_text_with_selenium(url)
        # print("Extracting webpage with requests")
        text_2 = self.extract_text_with_requests(url)

        if len(text_1) > len(text_2):
            return text_1
        else:
            return text_2

Function to clean text

In [48]:
import re
def clean_text(text):
    # Replace multiple newlines with a single newline
    text = re.sub(r'\n+', '\n', text)
    # Remove leading/trailing whitespace from each line
    text = '\n'.join(line.strip() for line in text.splitlines())
    # Remove extra spaces between words
    text = re.sub(r'\s+', ' ', text)
    return text

In [49]:
bing_searcher = BingSearchAPI()
site_scraper = WebScraper()

Test run for two firms, 10 fields

Read in list of firms

In [38]:
open_corporates_database = "../Opencorporates/Equifax Project Data - Sheet1.csv"
firms = pd.read_csv(open_corporates_database)
firms = firms.name.to_list()
print(len(firms))
random.shuffle(firms)

10000


In [39]:
# firms = ["TIN DRUM ASIACAFE', LLC", "ANDREW THOMAS LEE PHOTOGRAPHY, LLC"]

fields = ["Registered_Address", "CEO", "Establishment_Year", "Number_Of_Employees", "Revenue_Size" ,
        "Website", "NAICS_Code", "SIC_Code", "Status"]

# override for testing with single firm value
# firms = ["TIN DRUM ASIACAFE', LLC"]
# fields = ["Registered_Address","CEO"]

Set up sqlite
- This table will store the results of the bing search and web scraping in a json format

In [40]:
conn = sqlite3.connect("firms_search_results.db")
cursor = conn.cursor()

cursor.execute('''
CREATE TABLE IF NOT EXISTS firms_search_results (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year TEXT,
               Number_Of_Employees TEXT,
               Revenue_Size TEXT,
               Website TEXT,
               NAICS_Code TEXT,
               SIC_Code TEXT,
               Status TEXT
               )
               ''')

<sqlite3.Cursor at 0x1245cb4c0>

In [50]:
cursor.execute(''' SELECT id, Firm_Name FROM firms_search_results ''')
print(cursor.fetchall())

[(1, "TIN DRUM ASIACAFE', LLC"), (2, 'ANDREW THOMAS LEE PHOTOGRAPHY, LLC'), (3, '"Q" ASSOCIATES, INC.'), (4, '"N" LIGHTZ ENTERTAINMENT, INC.'), (6, '"R" Mini Storage Inc.'), (7, "(JENNY'S) MUSTANG SALLY'S LLC"), (8, '#1 PRO-MAINTENANCE, LLC'), (9, '1 Deal Away, LLC'), (10, '"About My Fathers Business Transportation" LLC'), (11, '"K" WAY MAINTENANCE COMPANY, INCORPORATED'), (12, '10 PIN, LLC'), (13, '"HIGH PRAISES" FELLOWSHIP'), (14, '"HOT" DOGZ INC.'), (15, '"DIESEL-TECH INC."'), (16, '"OUT" Bail Bonds, Inc.'), (17, '.999BP, LLC')]


In [42]:
# Check for a speicfic firm
cursor.execute(f"SELECT * FROM firms_search_results WHERE id = 13")
a = cursor.fetchall()
a

[(13,
  '"HIGH PRAISES" FELLOWSHIP',
  '{"search_results": {"_type": "SearchResponse", "queryContext": {"originalQuery": "\\"HIGH PRAISES\\" FELLOWSHIP , Establishment Year"}, "webPages": {"webSearchUrl": "https://www.bing.com/search?q=%22HIGH+PRAISES%22+FELLOWSHIP+%2c+Establishment+Year", "totalEstimatedMatches": 113000, "value": [{"id": "https://api.bing.microsoft.com/api/v7/#WebPages.0", "name": "\\"HIGH PRAISES\\" FELLOWSHIP :: Washington (US) - OpenCorporates", "url": "https://opencorporates.com/companies/us_wa/601459585", "isFamilyFriendly": true, "displayUrl": "https://opencorporates.com/companies/us_wa/601459585", "snippet": "2 April 1993 (almost 30 years ago) Company Type WA NONPROFIT CORPORATION Jurisdiction Washington (US) Registered Address. 810 ACADEMY ST; KELSO; 98626; WA; UNITED STATES; Agent Name HIGH PRAISES FELLOWSHIP Agent Address 810 ACADEMY ST, KELSO, WA, 98626-4419, UNITED STATES Directors / Officers. DAWN GREGG, governor; HIGH PRAISES FELLOWSHIP ...", "dateLastCr

In [43]:
b = json.loads(a[0][2])
b['search_results']

{'_type': 'SearchResponse',
 'queryContext': {'originalQuery': '"HIGH PRAISES" FELLOWSHIP , Registered_Address',
  'askUserForLocation': True},
 'webPages': {'webSearchUrl': 'https://www.bing.com/search?q=%22HIGH+PRAISES%22+FELLOWSHIP+%2c+Registered_Address',
  'totalEstimatedMatches': 116000,
  'value': [{'id': 'https://api.bing.microsoft.com/api/v7/#WebPages.0',
    'name': '"HIGH PRAISES" FELLOWSHIP | Washington Companies Directory',
    'url': 'https://www.washingtoncompanysearch.com/companies/high-praises-fellowship/',
    'isFamilyFriendly': True,
    'displayUrl': 'https://www.washingtoncompanysearch.com/companies/high-praises-fellowship',
    'snippet': '"high Praises" Fellowship was registered on 1993-04-02 as a Nonprofit Corporation Regular Corporation type incorporated at 810 ACADEMY ST, KELSO, WA, 98626, UNITED STATES. The company is classified under NAICS code 36, which is for Religious. The agent of the company is High Praises Fellowship.',
    'dateLastCrawled': '2024-06

In [44]:
# Delete smt
# cursor.execute("DELETE FROM firms_search_results WHERE Firm_Name = ?", ('"R" Mini Storage Inc.',))
# conn.commit()

### Loop to construct database

In [51]:
current_firm = ""
current_field = ""

for firm in firms:
    print("----- Debug: Now on firm: ", firm)

    # Check if firm already exists, insert row if it doesnt
    cursor.execute("SELECT id FROM firms_search_results WHERE Firm_Name = ?", (firm,))
    firm_row = cursor.fetchone()
    if firm_row is None:
        print("Inserting new firm:, ", firm)
        cursor.execute("INSERT INTO firms_search_results (Firm_Name) VALUES (?)", (firm,))
        firm_id = cursor.lastrowid  # Get the id of the inserted row
    else:
        # Get the existing firm's id
        print("Found row for firm, ", firm)
        firm_id = firm_row[0]
    

    # Iterate through fields for the given firm
    for field in fields:

        # Check if the field value in the database is NULL
        cursor.execute(f"SELECT {field} FROM firms_search_results WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL", (firm_id,firm,))
        if cursor.fetchone() is not None:
            print(f"Field '{field}' already has data for firm '{firm}', skipping.")
            continue

        query = f"{firm} , {field.replace('_', ' ')}"

        print("---------- Debug: Now on field: ", field)
        

        try:
            # Get search results TODO: Handle search errors
            search_results = bing_searcher.search(query)


            # Get website contents
            urls = bing_searcher.get_top_webpages()
            website_info = {}

            for website_name, website_url in urls.items():
                print(f"Getting Contents of the website of {website_name} with url {website_url}")
                result = site_scraper.scrape_webpage(website_url)
                result = clean_text(result)
                website_info[website_name] = result


            # Convert the combined data into a JSON string
            combined_data = {
                "search_results": search_results,
                "website_info": website_info
            }
            json_combined_data = json.dumps(combined_data)

            # Update cell value in database
            cursor.execute(f"""
                        UPDATE firms_search_results
                        SET {field} = ?
                        WHERE id = ? AND Firm_Name = ?
                        """, (json_combined_data, firm_id, firm))
            
            conn.commit()

        except SeleniumExtractionError as e:
            print(f"Error extracting data for {firm} and {field}: {e}")
            continue

cursor.close()
conn.close()

----- Debug: Now on firm:  .999BP, LLC
Found row for firm,  .999BP, LLC
---------- Debug: Now on field:  Registered_Address
Conducting bing search for :  .999BP, LLC , Registered Address
Getting urls of top webpages
debug: length of webpages is  5
Getting Contents of the website of Secretary of State with url https://www.secstates.com/
Selenium: getting url
An error occurred: Page load exceeded time limit of 5 seconds
Getting Contents of the website of Georgia Corporations Division with url https://ecorp.sos.ga.gov/BusinessSearch
Selenium: getting url
Selenium: found body
Selenium: found page source
Error fetching the URL: 403 Client Error: Forbidden for url: https://ecorp.sos.ga.gov/BusinessSearch
Getting Contents of the website of Bizapedia with url https://www.bizapedia.com/ga/orbe-properties-llc.html
Selenium: getting url
Selenium: found body
Selenium: found page source
Getting Contents of the website of Georgia Secretary of State with url https://sos.ga.gov/corporations-division-g

KeyboardInterrupt: 

In [33]:
bing_searcher.get_top_webpages()

Getting urls of top webpages
debug: length of webpages is  80


{'Dun & Bradstreet': 'https://www.dnb.com/business-directory/company-profiles.out_bail_bonds_inc.d7098063d302c486a11388b086f2c69e.html',
 'OpenCorporates': 'https://opencorporates.com/companies/us_ga/16019105',
 'Corporation Wiki': 'https://www.corporationwiki.com/p/2m1eif/diesel-tech-inc',
 'Facebook': 'https://www.facebook.com/WEREZOUT/',
 'Boat Planet': 'https://boatplanet.com/pro/diesel-tech-inc',
 'SICCODE.com': 'https://siccode.com/business/in-n-out-bail-bonds-1',
 'NAICS Association': 'https://www.naics.com/naics-code-description/?code=812990',
 'Census.gov': 'https://www.census.gov/naics/?input=3130&year=2017',
 'Manta': 'https://www.manta.com/c/mx6fprr/in-out-bail-bonds',
 'Government Contracts': 'https://www.governmentcontracts.us/government-contractors/company-BSG1855914-diesel-tech-inc-Freeport-NY.htm',
 'Dcontrol.com': 'https://dcontrol.com/profile/diesel-tech-inc',
 'Bizapedia': 'https://www.bizapedia.com/tn/all-out-bail-bonds-inc.html',
 'Chamber of Commerce': 'https://w

In [56]:
cursor.close()
conn.close()

Test reading the json

In [None]:
firm_name

In [20]:
firm_name = ' "Q" ASSOCIATES, INC.  '

cursor.execute(''' SELECT * FROM firms_search_results
               WHERE Firm_Name = ?''', (firm_name,))

a = cursor.fetchall()
# address = json.loads(a[0][0])
# ceo = json.loads(a[0][1])

# print(address['website_info'])
# print(ceo['website_info'])

In [None]:
a

Looping through websites

In [None]:
# for a in bing_searcher.search_results['webPages']['value']:
#     print(a.keys())
#     print(a['url'])
#     print(a['displayUrl'])
#     print(a['name'])
