In [1]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()

import sqlite3

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from requests.exceptions import Timeout, RequestException
import time
import json
import re
import random
import threading

Define classes for Bing search and web scraping

In [2]:
class BingSearchAPI():
    """Small client for the Bing Web Search v7 endpoint.

    The client starts with the free subscription key and, when the API answers
    with HTTP 403 (treated here as "quota used up"), switches to the paid key
    and retries the request once.

    Attributes:
        verbose: When True, progress messages are printed.
        n_webpages_to_scrape: Maximum number of entries `get_top_webpages` collects.
        search_results: Parsed JSON of the most recent *successful* search,
            or an empty dict if the most recent search failed.
        webpages: Mapping of site name -> url built by `get_top_webpages`.
        request_timeout: Timeout in seconds passed to `requests.get`.
    """

    def __init__(self) -> None:

        self.verbose = True
        self.n_webpages_to_scrape = 5

        # replace with list of keys once i run out
        self.subscription_key_free = os.getenv("BING_SEARCH_API_KEY_FREE_1")
        self.subscription_key_paid = os.getenv("BING_SEARCH_API_KEY_PAID_1")
        self.headers = {"Ocp-Apim-Subscription-Key": self.subscription_key_free}

        self.search_url = "https://api.bing.microsoft.com/v7.0/search"

        self.params = {
            "q": "Insert Query here",
            "count": 20,  # Number of search results to return
            "offset": 0,  # The offset for pagination
            "mkt": "en-US",  # Region
            "safesearch": "Moderate"  # Safe search filter
        }

        self.search_results = {}
        self.webpages = {}
        self.request_timeout = 5

    def switch_api_keys(self) -> bool:
        """Switch from the free key to the paid key.

        Returns:
            True if the header now carries the paid key, False if no switch was
            possible (the paid key is already in use, or no paid key is configured).
        """
        header_name = "Ocp-Apim-Subscription-Key"
        on_free_key = self.headers[header_name] == self.subscription_key_free
        if on_free_key and self.subscription_key_paid:
            self.headers[header_name] = self.subscription_key_paid
            return True

        print("ERROR: Both API keys have reached their limit")
        return False

    def _send_request(self):
        """Issue one GET against the search endpoint with the current headers and params."""
        return requests.get(
            self.search_url,
            headers=self.headers,
            params=self.params,
            timeout=self.request_timeout,
        )

    def search(self, query: str) -> dict:
        """Run a Bing search for `query`.

        Args:
            query: Free-text search query.

        Returns:
            The parsed JSON response as a dict. On any failure (HTTP error,
            timeout, connection problem, unparsable body) an empty dict is
            returned and `self.search_results` is left empty as well.
        """
        self.webpages = {}
        # Forget the previous response first: otherwise a failed search would
        # leave the old results in place and get_top_webpages() would return
        # URLs that belong to a different query.
        self.search_results = {}

        if self.verbose:
            print("Conducting bing search for : ", query)

        self.params["q"] = query

        try:
            response = self._send_request()

            # handle api credit limit by switching to paid api key.
            # This must happen BEFORE raise_for_status(), which raises on 403.
            if response.status_code == 403:
                print("API key has reached its limit")
                if self.switch_api_keys():
                    response = self._send_request()

            response.raise_for_status()
            self.search_results = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"Request failed: {e}")

        return self.search_results

    def get_top_webpages(self) -> dict:
        """Collect up to `n_webpages_to_scrape` result pages from the last search.

        Returns:
            `self.webpages`, a dict mapping the result's "siteName" (or, if that
            is absent, its "name") to its url. Results that share a key
            overwrite each other; results without a url or without either name
            are skipped.
        """
        if self.verbose:
            print("Getting urls of top webpages")

        # get dict of site names and urls
        if not isinstance(self.search_results, dict) or 'webPages' not in self.search_results:
            print("No webpages found")
            return self.webpages

        for result in self.search_results['webPages'].get('value', []):
            label = result.get("siteName") or result.get("name")
            url = result.get("url")
            if label is None or url is None:
                continue

            self.webpages[label] = url

            if len(self.webpages) >= self.n_webpages_to_scrape:
                break

        if self.verbose:
            print("debug: length of webpages is ", len(self.webpages))
        return self.webpages
        

TODO: Run the Selenium extraction in a thread and impose a maximum timeout of 5 s.

In [3]:
class SeleniumExtractionError(Exception):
    """Signals that extracting a page's text through Selenium failed."""

class WebScraper():
    """Fetch the visible text of a web page with both Selenium and `requests`.

    `scrape_webpage` runs both extraction methods and returns the longer text.
    The Selenium call runs in a worker thread so that a hung browser can be
    abandoned after `selenium_timeout` seconds; in that case the driver is
    torn down and recreated.

    Attributes:
        options: ChromeOptions used for every driver this object creates.
        driver: The current Chrome webdriver.
        page_load_timeout: Seconds allowed for a page load and for the wait on <body>.
        selenium_timeout: Seconds to wait for the Selenium worker thread.
        requests_headers: Headers sent by `extract_text_with_requests`.
        requests_timeout: Timeout in seconds for `requests.get`.
        selenium_result / selenium_error: Outcome of the most recent completed
            Selenium extraction (text, or error message).
    """

    def __init__(self) -> None:
        # Set up Chrome driver with webdriver manager
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')  # Run headless for no browser window
        self.options.add_argument('--disable-gpu')  # Disable GPU acceleration
        # self.options.add_argument('--no-sandbox')  # Required for some Linux environments
        self.options.add_argument('--disable-extensions')
        self.options.add_argument('--disable-plugins')
        self.options.add_argument('--disable-images')  # Prevent loading images to save bandwidth
        self.options.add_argument('--disable-browser-side-navigation')
        self.options.page_load_strategy = 'eager'

        self.page_load_timeout = 5
        self.selenium_timeout = 10

        # Automatically download and use ChromeDriver
        self.driver = self._create_driver()

        # Parameters for requests
        self.requests_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        self.requests_timeout = 5

        self.selenium_result = None
        self.selenium_error = None

    def _create_driver(self):
        """Start a Chrome driver with this object's options and page-load timeout."""
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=self.options)
        # Applied here so that a driver recreated after a timeout gets it too.
        driver.set_page_load_timeout(self.page_load_timeout)
        return driver

    def extract_text_with_selenium(self, url):
        """Load `url` in the browser and return the page text.

        Args:
            url: Address of the page to load.

        Returns:
            The text of the parsed page source (lines separated by newlines),
            or None if the worker timed out or raised.
        """
        # Per-call state: a worker that timed out earlier and finishes late
        # writes into its own dict and cannot overwrite this call's outcome.
        outcome = {"text": None, "error": None}
        # Bind the driver now so an abandoned worker never touches its replacement.
        driver = self.driver
        wait_seconds = self.page_load_timeout

        def selenium_process():
            try:
                print("Selenium DEBUG: getting url")
                driver.get(url)
                WebDriverWait(driver, wait_seconds).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
                print("Selenium DEBUG: found body")
                page_source = driver.page_source
                print("Selenium DEBUG: found page source")
                soup = BeautifulSoup(page_source, 'html.parser')
                outcome["text"] = soup.get_text(separator='\n')
            except Exception as e:
                outcome["text"] = None
                outcome["error"] = str(e)

        self.selenium_result = None
        self.selenium_error = None
        # daemon=True: a permanently hung worker must not keep the interpreter alive.
        thread = threading.Thread(target=selenium_process, daemon=True)
        thread.start()
        thread.join(timeout=self.selenium_timeout)

        if thread.is_alive():
            print(f"Selenium process timed out after {self.selenium_timeout} seconds")
            self.safe_quit_selenium()
            self.driver = self._create_driver()  # Reinitialize the driver
            return None

        self.selenium_result = outcome["text"]
        self.selenium_error = outcome["error"]

        if self.selenium_error is not None:
            print(f"An error occurred: {self.selenium_error}")
            return None

        return self.selenium_result

    def safe_quit_selenium(self):
        """Quit the current driver, then kill leftover chromedriver processes.

        Errors from the driver are printed, not raised.
        """
        try:
            # quit() closes every window and ends the session, so a separate
            # close() call (which could fail and skip quit()) is not needed.
            self.driver.quit()
        except Exception as e:
            print(f"Error during driver quit: {e}")
        finally:
            self.kill_chrome_processes()

    def kill_chrome_processes(self):
        """Kill every process named "chromedriver" visible to psutil.

        Note that this is not limited to processes started by this object.
        """
        try:
            import psutil
        except ImportError:
            print("psutil is not installed; skipping chromedriver process cleanup")
            return

        PROCNAME = "chromedriver"  # or "chrome" depending on your setup
        for proc in psutil.process_iter():
            try:
                # check whether the process name matches
                if proc.name() == PROCNAME:
                    proc.kill()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # The process exited while iterating, or belongs to another user.
                continue

    def extract_text_with_requests(self, url):
        """Download `url` with `requests` and return the page text.

        Args:
            url: Address of the page to fetch.

        Returns:
            The text of the parsed response body, or None on any error
            (timeout, HTTP error status, connection problem, parse failure).
        """
        try:
            # Fetch the content from the URL with headers
            print("Requests DEBUG: getting url")
            response = requests.get(url, headers=self.requests_headers, timeout=self.requests_timeout)
            response.raise_for_status()  # Check if the request was successful
            print("Requests DEBUG: parsing text")
            # Parse text
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup.get_text(separator='\n')  # Using '\n' to preserve some structure

        except Timeout:
            print(f"Request timed out after {self.requests_timeout} seconds for URL: {url}")
            return None
        except RequestException as e:
            print(f"Error fetching the URL {url}: {e}")
            return None
        except Exception as e:
            print(f"Unexpected error occurred while processing {url}: {e}")
            return None

    def scrape_webpage(self, url: str) -> str:
        """Extract the page text with both methods and return the longer result.

        Args:
            url: Address of the page to scrape.

        Returns:
            The longer of the two extracted texts (the `requests` text on a tie),
            the only available one if the other method failed, or None if both failed.
        """
        selenium_text = self.extract_text_with_selenium(url)
        requests_text = self.extract_text_with_requests(url)

        if selenium_text is None:
            return requests_text  # may itself be None when both methods failed
        if requests_text is None:
            return selenium_text
        return selenium_text if len(selenium_text) > len(requests_text) else requests_text

    @staticmethod
    def clean_text(text):
        """Normalise whitespace in `text`.

        Every run of whitespace, newlines included, ends up as a single space,
        and leading/trailing whitespace of each line is removed, so the result
        is a single line.

        Args:
            text: Text to clean, or None.

        Returns:
            The cleaned text, or None if `text` is None.
        """
        if text is None:
            return None
        # Replace multiple newlines with a single newline
        text = re.sub(r'\n+', '\n', text)
        # Remove leading/trailing whitespace from each line
        text = '\n'.join(line.strip() for line in text.splitlines())
        # Collapse any remaining whitespace run (including newlines) to one space
        text = re.sub(r'\s+', ' ', text)
        return text

Test run for two firms, 10 fields

Read in list of firms

In [4]:
# Path to the CSV export that holds one firm per row in its `name` column.
open_corporates_database = "../Opencorporates/Equifax Project Data - Sheet1.csv"

# Keep the raw frame under its own name so `firms` is only ever a list of names.
firms_raw = pd.read_csv(open_corporates_database)

# Rows without a name are dropped: pandas reads them as float NaN, which is
# not a usable search query or Firm_Name value further down.
firms = firms_raw["name"].dropna().to_list()
print(len(firms))

# Shuffle in place so firms are processed in random order (the order is not seeded).
random.shuffle(firms)

10000


In [5]:
# Names of the per-firm attributes to look up; each one is also a column of
# the `firms_search_results` table.
fields = [
    "Registered_Address",
    "CEO",
    "Establishment_Year",
    "Number_Of_Employees",
    "Revenue_Size",
    "Website",
    "NAICS_Code",
    "SIC_Code",
    "Status",
]

# For a quick test run, temporarily override the inputs, e.g.:
# firms = ["TIN DRUM ASIACAFE', LLC", "ANDREW THOMAS LEE PHOTOGRAPHY, LLC"]
# firms = ["TIN DRUM ASIACAFE', LLC"]
# fields = ["Registered_Address","CEO"]

Set up SQLite
- This table stores the results of the Bing search and web scraping in JSON format

In [6]:
# Open (or create) the results database and keep one shared cursor for the
# cells below.
conn = sqlite3.connect("firms_search_results.db")
cursor = conn.cursor()

# One row per firm: an auto-incremented id, the firm name, and one TEXT column
# per looked-up field. Creating the table is a no-op if it already exists.
create_table_sql = '''
CREATE TABLE IF NOT EXISTS firms_search_results (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year TEXT,
               Number_Of_Employees TEXT,
               Revenue_Size TEXT,
               Website TEXT,
               NAICS_Code TEXT,
               SIC_Code TEXT,
               Status TEXT
               )
               '''
cursor.execute(create_table_sql)

<sqlite3.Cursor at 0x11c063ac0>

In [13]:
# Sanity check: number of firm rows currently stored.
row_count = cursor.execute(''' SELECT COUNT(id) FROM firms_search_results ''').fetchall()
print(row_count)

[(41,)]


In [16]:
# Check for a speicfic firm
cursor.execute(f"SELECT * FROM firms_search_results WHERE id = 41")
a = cursor.fetchall()
a

[(41,
  '0xMac LLC',
  '{"search_results": {"_type": "SearchResponse", "queryContext": {"originalQuery": "0xMac LLC , Registered Address"}, "webPages": {"webSearchUrl": "https://www.bing.com/search?q=0xMac+LLC+%2c+Registered+Address", "totalEstimatedMatches": 3340000, "value": [{"id": "https://api.bing.microsoft.com/api/v7/#WebPages.0", "name": "0xMac LLC :: Mississippi (US) - OpenCorporates", "url": "https://opencorporates.com/companies/us_ms/1223272", "isFamilyFriendly": true, "displayUrl": "https://opencorporates.com/companies/us_ms/1223272", "snippet": "Free and open company data on Mississippi (US) company 0xMac LLC (company number 1223272) Learn how to leverage transparent company data at scale. Subscribe to our emails. ... POSEY LAWN SERVICE LLC; Agent Name Jonathan Posey Agent Address 1021 Halton Ct, BRANDON, MS 39047 Directors / Officers. Jonathan Posey, agent; Jonathan Posey, manager ...", "dateLastCrawled": "2023-01-09T07:46:00.0000000Z", "cachedPageUrl": "http://cc.bingj.co

In [9]:
# b = json.loads(a[0][2])
# b['search_results']

In [10]:
# Delete smt
# cursor.execute("DELETE FROM firms_search_results WHERE Firm_Name = ?", ('"R" Mini Storage Inc.',))
# conn.commit()

### Loop to construct database

In [17]:
# Progress markers: after an interruption they show where the loop stopped.
current_firm = ""
current_field = ""
bing_searcher = BingSearchAPI()
site_scraper = WebScraper()

try:
    for firm in firms:
        current_firm = firm
        print("----- Debug: Now on firm: ", firm)

        # Check if firm already exists, insert row if it doesnt
        cursor.execute("SELECT id FROM firms_search_results WHERE Firm_Name = ?", (firm,))
        firm_row = cursor.fetchone()
        if firm_row is None:
            print("Inserting new firm:, ", firm)
            cursor.execute("INSERT INTO firms_search_results (Firm_Name) VALUES (?)", (firm,))
            firm_id = cursor.lastrowid
            # Persist the new row right away so it survives an interruption
            # that happens before the first field is written.
            conn.commit()
        else:
            # Get the existing firm's id
            print("Found row for firm, ", firm)
            firm_id = firm_row[0]

        # Iterate through fields for the given firm
        for field in fields:
            current_field = field

            # Skip fields that already hold data, which makes the loop resumable.
            # `field` is interpolated into the SQL because column names cannot be
            # bound as parameters; it comes from the fixed `fields` list.
            cursor.execute(
                f"SELECT {field} FROM firms_search_results WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL",
                (firm_id, firm,),
            )
            if cursor.fetchone() is not None:
                print(f"Field '{field}' already has data for firm '{firm}', skipping.")
                continue

            query = f"{firm} , {field.replace('_', ' ')}"

            print("---------- Debug: Now on field: ", field)

            try:
                # Get search results
                search_results = bing_searcher.search(query)

                # A failed search yields a non-dict or empty value. Do not store
                # it: the searcher may still hold the previous query's pages, and
                # leaving the column NULL lets the next run retry this field.
                if not isinstance(search_results, dict) or not search_results:
                    print(f"Search failed for {firm} and {field}, leaving field empty for a later retry")
                    continue

                # Get website contents
                urls = bing_searcher.get_top_webpages()
                website_info = {}

                for website_name, website_url in urls.items():
                    print(f"Getting Contents of the website of {website_name} with url {website_url}")
                    website_info[website_name] = site_scraper.scrape_webpage(website_url)

                # Convert the combined data into a JSON string
                combined_data = {
                    "search_results": search_results,
                    "website_info": website_info
                }
                json_combined_data = json.dumps(combined_data)

                # Update cell value in database
                cursor.execute(f"""
                            UPDATE firms_search_results
                            SET {field} = ?
                            WHERE id = ? AND Firm_Name = ?
                            """, (json_combined_data, firm_id, firm))

                conn.commit()

            except (SeleniumExtractionError, RequestException) as e:
                # RequestException also covers timeouts raised by the search request.
                print(f"Error extracting data for {firm} and {field}: {e}")
                continue
finally:
    # Always release the browser started by this cell, including on
    # KeyboardInterrupt, so no headless Chrome is left running.
    try:
        site_scraper.driver.quit()
    except Exception as e:
        print(f"Error during driver quit: {e}")

cursor.close()
conn.close()

----- Debug: Now on firm:  "C.C." TOOLS, INC.
Found row for firm,  "C.C." TOOLS, INC.
---------- Debug: Now on field:  Registered_Address
Conducting bing search for :  "C.C." TOOLS, INC. , Registered Address
Getting urls of top webpages
debug: length of webpages is  5
Getting Contents of the website of Dun & Bradstreet with url https://www.dnb.com/business-directory/company-profiles.c_c_tools_inc.26c49fb75e721ebba1af107ec6c3fc02.html
Selenium DEBUG: getting url
Selenium DEBUG: found body
Selenium DEBUG: found page source
Requests DEBUG: getting url
Requests DEBUG: parsing text
Getting Contents of the website of Buzzfile with url https://www.buzzfile.com/business/C-C-Tools-Inc-480-654-8091
Selenium DEBUG: getting url
Selenium DEBUG: found body
Selenium DEBUG: found page source
Requests DEBUG: getting url
Error fetching the URL https://www.buzzfile.com/business/C-C-Tools-Inc-480-654-8091: 403 Client Error: Forbidden for url: https://www.buzzfile.com/business/C-C-Tools-Inc-480-654-8091
Ge

KeyboardInterrupt: 

In [38]:
# Display the value currently bound to `search_results` (presumably the raw
# Bing response from the last search the loop cell ran; confirm before relying on it).
search_results

{'_type': 'SearchResponse',
 'queryContext': {'originalQuery': '"We Make Dirty Things Clean" Pressure Washing Excellence !!! LLC , SIC Code'},
 'rankingResponse': {}}

In [23]:

# Ad-hoc fetch of a single page with a browser-like User-Agent.
# A timeout is set because requests otherwise waits indefinitely for a response.
response = requests.get("https://markets.businessinsider.com/funds/invesco-endeavor-fund-class-a-us00141t2969", 
                        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        },
                        timeout=5)


In [26]:
# Manual cleanup for runs where the loop cell was interrupted before it could
# close the database itself.
try:
    cursor.close()
except sqlite3.ProgrammingError:
    # The connection was already closed (e.g. the loop cell ran to completion),
    # in which case closing the cursor is refused; nothing is left to release.
    pass
conn.close()