- This notebook uses Selenium and requests to scrape the contents of the top 5 websites found in the Bing web search results for each Firm_Name and field. It reads the search results from "firms_web_search_results.db" and stores the scraped website text in "firms_web_search_website_scrapings.db".
- It will skip over fields that are already populated

Imports

In [42]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()

import sqlite3
import threading

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from requests.exceptions import Timeout, RequestException
import concurrent.futures
import time
import json
import re
from gemini_prompts import *

## Class for webscraping
- Scrapes a website with both requests and Selenium, and returns whichever of the two extracted texts is longer
- *IMPORTANT* Update CHROMEDRIVER_PATH with the path to your chrome driver installation

In [43]:
# Sentinel text returned by WebScraper.scrape_webpage when neither Selenium nor requests produced any text.
no_web_result = "No website scrapings found"
class SeleniumExtractionError(Exception):
    """Raised when extracting a page through Selenium fails for a reason other than a timeout."""

# Path to the chromedriver binary. Set the CHROMEDRIVER_PATH environment variable
# (e.g. in the .env file loaded by load_dotenv()) to override the default below
# instead of editing the notebook on every machine.
CHROMEDRIVER_PATH = os.environ.get('CHROMEDRIVER_PATH', '/opt/homebrew/bin/chromedriver')

class WebScraper():
    """Scrape a web page with both Selenium (headless Chrome) and requests.

    ``scrape_webpage`` tries both back ends and keeps the longer extracted
    text. A new Chrome driver is started for every Selenium extraction and is
    always shut down afterwards, including on timeouts and errors.

    The active driver is stored on ``self.driver``, so a single instance must
    not be used by two threads at the same time.
    """

    # Seconds Chrome may spend loading a page before Selenium raises TimeoutException.
    SELENIUM_PAGE_LOAD_TIMEOUT = 4
    # Seconds to wait for the <body> element to be present after navigation.
    SELENIUM_BODY_WAIT_TIMEOUT = 5

    def __init__(self) -> None:
        """Prepare Chrome options, requests settings and the chromedriver service."""
        self.options = webdriver.ChromeOptions()
        self.options.add_argument('--headless')  # Run headless for no browser window
        self.options.add_argument('--disable-gpu')  # Disable GPU acceleration
        self.options.add_argument('--no-sandbox')  # Required for some Linux environments
        self.options.add_argument('--disable-extensions')
        self.options.add_argument('--disable-plugins')
        self.options.add_argument('--disable-images')  # Prevent loading images to save bandwidth
        self.options.add_argument('--disable-browser-side-navigation')
        self.options.add_argument('--mute-audio')
        self.options.page_load_strategy = 'eager'

        # Parameters for requests
        self.requests_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        self.requests_timeout = 4

        self.n_webpages_to_scrape = 5
        self.webpages = {}
        self.service = Service(executable_path=CHROMEDRIVER_PATH)
        # Driver of the extraction currently in progress (None when idle).
        self.driver = None

    def get_top_webpages(self, web_search_results: dict) -> dict:
        '''
        Get the top n webpages' names and urls from a given Bing search result.

        Args:
            web_search_results: parsed search response; results are read from
                web_search_results['webPages']['value'].

        Returns:
            dict with 'site_name': 'url' pairs (at most self.n_webpages_to_scrape
            entries). The same dict is also stored on self.webpages. Results that
            share a site name overwrite each other, so only the last url per
            name is kept.
        '''
        self.webpages = {}
        print("Getting urls of top webpages")

        if 'webPages' not in web_search_results:
            print("No webpages found")
            return self.webpages

        # Tolerate a missing/empty 'value' list instead of raising KeyError.
        results = (web_search_results['webPages'] or {}).get('value') or []
        for result in results:
            url = result.get("url")
            if not url:
                # A result without a url cannot be scraped; skip it.
                continue
            if "siteName" in result:
                self.webpages[result["siteName"]] = url
            elif "name" in result:
                self.webpages[result["name"]] = url
            if len(self.webpages) >= self.n_webpages_to_scrape:
                break

        return self.webpages

    def extract_text_with_selenium(self, url):
        '''
        Extract the text of the given url with a fresh headless Chrome driver.

        Returns:
            The page text (newline separated) on success, or the literal string
            "Page contents not loaded in time" when Selenium times out.

        Raises:
            SeleniumExtractionError: for any other failure, including failure
                to start the driver.

        The driver is always quit before this method returns or raises.
        '''
        # Make sure no driver from an earlier call is left running.
        self.safe_quit_selenium()
        try:
            self.driver = webdriver.Chrome(service=self.service, options=self.options)
            self.driver.set_page_load_timeout(self.SELENIUM_PAGE_LOAD_TIMEOUT)

            # Open the URL in the browser
            self.driver.get(url)
            time.sleep(0.1)  # this counteracts some automatic blocking

            # Wait for the page body to be present
            WebDriverWait(self.driver, self.SELENIUM_BODY_WAIT_TIMEOUT).until(
                EC.presence_of_element_located((By.TAG_NAME, 'body'))
            )
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            return soup.get_text(separator='\n')

        except TimeoutException:
            error_message = "Page load exceeded time limit of 5 seconds"
            print(f"An error occurred with Selenium:: {error_message}")
            return "Page contents not loaded in time"

        except Exception as e:
            error_message = str(e)
            print(f"An error occurred with Selenium: {error_message}")
            if "disconnected: not connected to DevTools" in error_message:
                print(f"Error occurred: {error_message}")
                raise SeleniumExtractionError(f"DevTools disconnection error for URL: {url}")
            else:
                raise SeleniumExtractionError(f"An unexpected error occurred while extracting: {error_message}")

        finally:
            # Runs on success, timeout and error alike, so Chrome processes never leak.
            self.safe_quit_selenium()

    def safe_quit_selenium(self):
        '''Quit the current driver, if any, without letting quit errors propagate.'''
        driver = getattr(self, "driver", None)
        if driver is None:
            return
        try:
            driver.quit()
        except Exception as e:
            print(f"Error during driver quit: {e}")
        finally:
            # Forget the driver so a second call is a no-op.
            self.driver = None

    def extract_text_with_requests(self, url):
        ''' Extract the url with requests and return the text if successful, None if not'''
        try:
            # Fetch the content from the URL with headers
            response = requests.get(url, headers=self.requests_headers, timeout=self.requests_timeout)
            response.raise_for_status()  # Check if the request was successful
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup.get_text(separator='\n')  # Using '\n' to preserve some structure

        except Timeout:
            print(f"Request timed out after {self.requests_timeout} seconds for URL: {url}")
            return None
        except RequestException as e:
            print(f"Requests Error fetching the URL {url}: {e}")
            return None
        except Exception as e:
            print(f"Requests Unexpected error occurred while processing {url}: {e}")
            return None

    def scrape_webpage(self, url: str) -> str:
        '''
        Scrape the given url using selenium and requests, return the longer text.

        A Selenium failure no longer aborts the scrape: the requests result is
        still used. Returns the cleaned longer text, or the module-level
        no_web_result string when neither back end produced any text.
        '''
        try:
            text_selenium = self.extract_text_with_selenium(url)
        except SeleniumExtractionError as e:
            print(f"Selenium failed for {url}, falling back to requests only: {e}")
            text_selenium = None
        text_requests = self.extract_text_with_requests(url)

        texts = [text for text in [text_selenium, text_requests] if text]
        if not texts:
            return no_web_result

        # Return the text with more content
        longest_text = max(texts, key=len)
        return self.clean_text(longest_text)

    def clean_text(self, text):
        ''' Cleans result text: collapses every run of whitespace into a single space. Returns None for None. '''
        if text is None:
            return None
        # Replace multiple newlines with a single newline
        text = re.sub(r'\n+', '\n', text)
        # Remove leading/trailing whitespace from each line
        text = '\n'.join(line.strip() for line in text.splitlines())
        # Remove extra spaces between words (this also turns the newlines into spaces)
        text = re.sub(r'\s+', ' ', text)
        return text
    
# Thread-pool task wrapper (used for parallel processing)
def scrape_website(scraper_instance, website_name, website_url):
    """Scrape one url with the given scraper and return ``(website_name, scraped_text)``.

    The site name is returned alongside the text so the caller can key the
    result without tracking which future belongs to which site.
    """
    print(f"Getting Contents of the website of {website_name} with url {website_url}")
    scraped_text = scraper_instance.scrape_webpage(website_url)
    return website_name, scraped_text

Connect to sqlite tables

In [44]:
# Source database: web search results (queried by the cells below).
conn_websearch = sqlite3.connect("firms_web_search_results.db")
cursor_websearch = conn_websearch.cursor()

# Target database: scraped website text, one row per firm and one TEXT column per field.
conn_websites = sqlite3.connect("firms_web_search_website_scrapings.db")
cursor_websites = conn_websites.cursor()

# Create the target table on first run; a no-op when it already exists.
create_scrapings_table_sql = '''
CREATE TABLE IF NOT EXISTS firms_web_search_website_scrapings (
               id INTEGER PRIMARY KEY,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year TEXT,
               Number_Of_Employees TEXT,
               Revenue_Size TEXT,
               Website TEXT,
               NAICS_Code TEXT,
               SIC_Code TEXT,
               Status TEXT,
               Dissolvement_Year TEXT,
               Company_Type TEXT,
               Previous_Names TEXT,
               Alternative_Names TEXT,
               Key_Executive_Personnel TEXT
               )
               '''
cursor_websites.execute(create_scrapings_table_sql)
conn_websites.commit()

## Loop to construct database

In [45]:
# Pool of five scraper objects so each parallel scrape task can use its own instance.
scrapers = [WebScraper() for _worker in range(5)]
# The first scraper doubles as the one used to pick URLs out of search results.
site_scraper = scrapers[0]

In [46]:
# Get all firms from the web search results database
cursor_websearch.execute(''' SELECT id, Firm_Name FROM firms_web_search_results ''')
firm_web_search_results = cursor_websearch.fetchall()

Read sample firms

In [47]:
# Load the sample of firms to process and print each row as a dict as a quick sanity check.
# The scraping loop below reads the 'id' and 'name' columns of this frame.
df = pd.read_csv("firms_sample.csv")
for i,row in df.iterrows():
    row = row.to_dict()
    print(row)

{'id': 2929, 'name': '1003 N KRESSON OWNER LLC'}
{'id': 2930, 'name': '10 PORTLAND STREET SOUTH BERWICK LLC'}
{'id': 2931, 'name': '100 Dates, LLC'}


In [48]:
# Main scraping loop: for every sample firm and every field, scrape the top
# websites from the stored search results and save them as a JSON object
# ({site_name: text}) in the target database. Fields that already hold data are
# skipped, so the cell can be re-run to resume an interrupted job.
start_time = time.time()
counter = 0  # number of (firm, field) cells written during this run


for i,row in df.iterrows():
    row = row.to_dict()
    firm_id = row['id']
    firm_name = row['name']
    print("----- Debug: Now on firm: ", firm_name, " -----", firm_id)

    # Check if a row for the firm already exists in the target database, insert row if it doesnt
    cursor_websites.execute("SELECT id FROM firms_web_search_website_scrapings WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
    firm_row = cursor_websites.fetchone()
    if firm_row is None:
        print("Inserting new firm:, ", firm_name)
        cursor_websites.execute("INSERT INTO firms_web_search_website_scrapings (id, Firm_Name) VALUES (?,?)", (firm_id,firm_name,))
        firm_id = cursor_websites.lastrowid
        # Persist the new row right away so it survives a crash before the first field update.
        conn_websites.commit()
    else:
        firm_id = firm_row[0]

    # Now iterate through each field's search results for the given firm.
    # `fields` is expected to come from the gemini_prompts star-import; its
    # entries are interpolated into SQL as column names, so they must be trusted.
    for field in fields:
        print("Debug field is ", field, " for firm ", firm_name , " and id ", firm_id)

        # Check if the field value in the target database is NULL to decide if we need to fill it in
        cursor_websites.execute(f"SELECT {field} FROM firms_web_search_website_scrapings WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL", (firm_id,firm_name,))
        existing_result = cursor_websites.fetchone()
        # If the field already has a value (other than an empty JSON object), skip this iteration
        if existing_result is not None and existing_result[0] != '{}':
            print(f"Field '{field}' already has data for firm '{firm_name}', skipping.")
            continue

        # Get the websearch results
        cursor_websearch.execute(f"SELECT {field} FROM firms_web_search_results WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
        web_search_row = cursor_websearch.fetchone()

        # Nothing to do when no search result was stored for this firm/field
        if web_search_row is None or web_search_row[0] is None:
            continue

        # A malformed stored value must not abort the whole run
        try:
            web_search_result = json.loads(web_search_row[0])
        except json.JSONDecodeError as e:
            print(f" Skipping unparseable web search result for field {field}: {e}")
            continue

        # Skip failed search
        if web_search_result == "Bing Search has failed" or web_search_result is None:
            print(" Skipping failed search, for field ", field)
            continue
        if not isinstance(web_search_result, dict):
            print(f" Skipping unexpected web search result type {type(web_search_result).__name__} for field {field}")
            continue

        try:
            # Get the top webpages from the search results
            webpages = site_scraper.get_top_webpages(web_search_result)
            website_info = {} # initialize dictionary to fill

            # Scrape the websites in parallel, one scraper instance per task.
            # Instances are assigned round-robin; with more webpages than
            # scrapers two tasks would share an instance, which WebScraper
            # does not support, so keep n_webpages_to_scrape <= len(scrapers).
            n_scrapers = len(scrapers)
            with concurrent.futures.ThreadPoolExecutor(max_workers=n_scrapers) as executor:
                future_to_site = {
                    executor.submit(scrape_website, scrapers[idx % n_scrapers], name, url): name
                    for idx, (name, url) in enumerate(webpages.items())
                }

                # Collect results as they complete; a failed site is logged and left out
                for future in concurrent.futures.as_completed(future_to_site):
                    website_name = future_to_site[future]
                    try:
                        name, result = future.result()
                        website_info[name] = result
                    except Exception as e:
                        print(f"Error scraping {website_name}: {e}")

            website_info = json.dumps(website_info)

            # Update cell value in database
            cursor_websites.execute(f"""
                        UPDATE firms_web_search_website_scrapings
                        SET {field} = ?
                        WHERE id = ? AND Firm_Name = ?
                        """, (website_info, firm_id, firm_name))

            conn_websites.commit()
            counter += 1
            print(f"****Successfully updated {field} for {firm_name}), counter at {counter} ***")

        except SeleniumExtractionError as e:
            print(f"Error extracting data for {firm_name} and {field}: {e}")
            continue

print(f"Finished: {counter} fields updated in {time.time() - start_time:.1f} seconds")

----- Debug: Now on firm:  1003 N KRESSON OWNER LLC  ----- 2929
Debug field is  Registered_Address  for firm  1003 N KRESSON OWNER LLC  and id  2929
Field 'Registered_Address' already has data for firm '1003 N KRESSON OWNER LLC', skipping.
Debug field is  CEO  for firm  1003 N KRESSON OWNER LLC  and id  2929
Field 'CEO' already has data for firm '1003 N KRESSON OWNER LLC', skipping.
Debug field is  Establishment_Year  for firm  1003 N KRESSON OWNER LLC  and id  2929
Field 'Establishment_Year' already has data for firm '1003 N KRESSON OWNER LLC', skipping.
Debug field is  Number_Of_Employees  for firm  1003 N KRESSON OWNER LLC  and id  2929
Field 'Number_Of_Employees' already has data for firm '1003 N KRESSON OWNER LLC', skipping.
Debug field is  Revenue_Size  for firm  1003 N KRESSON OWNER LLC  and id  2929
Field 'Revenue_Size' already has data for firm '1003 N KRESSON OWNER LLC', skipping.
Debug field is  Website  for firm  1003 N KRESSON OWNER LLC  and id  2929
Field 'Website' alread

In [49]:
# Firm names of the sample, in CSV order (bracket access avoids clashing with DataFrame attributes)
names = df["name"].tolist()

In [50]:
# Display the sample firm names.
names

['1003 N KRESSON OWNER LLC',
 '10 PORTLAND STREET SOUTH BERWICK LLC',
 '100 Dates, LLC']

In [51]:
# Fetch the scraped rows of the sample firms. One "?" placeholder is generated
# per bound value, so the count is taken from `names` itself.
placeholders = ",".join("?" * len(names))
data = cursor_websites.execute(
    "SELECT * FROM firms_web_search_website_scrapings WHERE Firm_Name IN ({})".format(placeholders),
    names,
).fetchall()

# Column index 2 is Registered_Address in the CREATE TABLE statement of this notebook
# (id, Firm_Name, Registered_Address, ...). Fall back to an empty dict when no
# row matched or the column is still NULL instead of raising.
first_registered_address = data[0][2] if data else None
data_dict = json.loads(first_registered_address) if first_registered_address else {}

In [52]:
# Display the {site_name: scraped_text} mapping parsed in the previous cell.
data_dict

{'NeighborWho': 'Just a moment... www.neighborwho.com Verifying you are human. This may take a few seconds. www.neighborwho.com needs to review the security of your connection before proceeding. Verification successful Waiting for www.neighborwho.com to respond... Enable JavaScript and cookies to continue Ray ID: 8ecce16c2cf7ed83 Performance & security by Cloudflare',
 'Bizapedia': ' Company Search Subscription Service BIZAPEDIA PRO SEARCH SUBSCRIPTION We are unable to process your request at the moment, as your computer or network may be generating automated queries. To ensure continued access, please click the "Verify Human" button below. If you\'re looking for enhanced search functionality, we invite you to consider a Bizapedia Pro Search Subscription, which you can learn more about on this page. Performing verification EXTENSIVE SEARCHES With the Bizapedia Pro Search™ service you will get nearly unlimited searches via our various search forms, with up to 5 times the number of maxim

In [53]:
# Print the scraped text stored under the 'OpenGovUS' site key (raises KeyError if that site was not scraped).
print(data_dict['OpenGovUS'])

 Taylor Northeast Inc · 1003 N Kresson St Baltimore MD 21205-3024 OPEN GOV US Business Money Services Business Registrations Moter Carriers System for Award Management Entities Charities and Non-Profit Organizations SEC EDGAR Entities SBA Paycheck Protection Program (PPP) SBA COVID-19 Economic Injury Disaster Loans (EIDL) SBA Targeted EIDL Advance and Supplemental Targeted Advance SBA Restaurant Revitalization Fund (RRF) Alabama Business Entities Alaska Corporation Registrations Arizona Corporation Registrations Colorado Business Entities Connecticut Business Registrations Delaware Business Licenses Delaware Certified Business Vendors Delaware Professional and Occupational Licenses Florida Corporations Florida Fictitious Name Registrations Florida Business and Professional Licenses Georgia Business Entities Indiana Business Entities Iowa Business Entities Massachusetts Corporations Michigan Professional and Occupational Licenses Nevada Corporation Registrations New Orleans Occupational

In [54]:
# # close all cursors and connections
# cursor_websearch.close()
# cursor_websites.close()
# conn_websearch.close()
# conn_websites.close()

## Analyze lengths of results to determine cutoff point (otherwise LLM costs too much)

In [55]:
# Create an empty list to store the results
data = []
lengths = []

# Get all firms from the database
cursor_websites.execute('SELECT id, Firm_Name FROM firms_web_search_website_scrapings WHERE id > 2000')
firm_records = cursor_websites.fetchall()

for firm_record in firm_records:
    firm_id = firm_record[0]
    firm_name = firm_record[1]
    firm_data = {'id': firm_id, 'Firm_Name': firm_name}
    
    print(f"Processing firm: {firm_name} (ID: {firm_id})")
    
    for field in fields:
        # Fetch the content of the field
        cursor_websites.execute(f'''
            SELECT {field} FROM firms_web_search_website_scrapings
            WHERE id = ? AND Firm_Name = ?
        ''', (firm_id, firm_name))
        
        result = cursor_websites.fetchone()
        if result:
            content = result[0]
            if content:
                try:
                    # Parse the JSON content if necessary
                    content_json = json.loads(content)
                    # Flatten the JSON to a string
                    content_str = json.dumps(content_json)
                    content_length = len(content_str)
                except json.JSONDecodeError:
                    # If content is not JSON, treat it as a string
                    content_length = len(content)
            else:
                content_length = 0
        else:
            content_length = 0
        
        # Add the length to the firm_data dictionary
        firm_data[field] = content_length
        lengths.append(content_length)
    
    # Append the firm_data to the data list
    data.append(firm_data)

# Convert the data list to a pandas DataFrame
df_lengths = pd.DataFrame(data)

Processing firm: "BASILE'S CLEAR BROOK FARM INCORPORATED" (ID: 2001)
Processing firm: "BEDAZZLED" PROFESSIONAL MOBILE DETAILERS LLC (ID: 2002)
Processing firm: "BERT'S KIDS, INC." THE JAMES & ALBERTA TEAGUE WILLIAMS FAMILY FOUNDATION (ID: 2003)
Processing firm: "BIG DADDYS" LLC (ID: 2004)
Processing firm: "BLUE BACK & 4TH", L.L.C. (ID: 2005)
Processing firm: "BUNCH OF GUYS" CLUB, INC., THE (ID: 2006)
Processing firm: "C & N" COMMUNITY & NETWORK L.L.C. (ID: 2007)
Processing firm: "C. COPPOLA, INC." (ID: 2008)
Processing firm: "C. FRANKLIN PROPERTIES, LLC" (ID: 2009)
Processing firm: "CAPE COD PARTNERS, LLC". (ID: 2010)
Processing firm: "CHEERS" BUTLER SERVICES, LLC (ID: 2011)
Processing firm: "CHURCH OF GOD" OF BRIDGEPORT, CONN. (ID: 2012)
Processing firm: "CHURCH OF OUR LADY OF PERPETUAL HELP" (ID: 2013)
Processing firm: "CHURCH OF OUR LADY OF PERPETUAL HELP", WASHINGTON, CONNECTICUT. (ID: 2014)
Processing firm: "CISZEK LLC" (ID: 2015)
Processing firm: "CLEAN, GREEN & PRISTINE", LLC (I

In [56]:
# Preview the per-field content lengths of the first firms.
df_lengths.head()

Unnamed: 0,id,Firm_Name,Registered_Address,CEO,Establishment_Year,Number_Of_Employees,Revenue_Size,Website,NAICS_Code,SIC_Code,Status,Dissolvement_Year,Company_Type,Previous_Names,Alternative_Names,Key_Executive_Personnel
0,2001,"""BASILE'S CLEAR BROOK FARM INCORPORATED""",17752,8435,45254,26751,28729,36721,26880,55326,8679,18299,17607,10140,60656,47907
1,2002,"""BEDAZZLED"" PROFESSIONAL MOBILE DETAILERS LLC",12341,12535,25159,24776,9389,11968,8558,9497,27258,11058,9111,25642,27258,24621
2,2003,"""BERT'S KIDS, INC."" THE JAMES & ALBERTA TEAGUE...",26528,13831,10111,10185,12382,10914,10109,14523,9023,12848,10914,13727,10914,10120
3,2004,"""BIG DADDYS"" LLC",7456,16155,6043,1865,1567,13600,45932,26969,12535,76798,14152,12467,91120,8393
4,2005,"""BLUE BACK & 4TH"", L.L.C.",74340,648595,40950,59005,34819,24712,4756,21013,32990,389439,32130,41919,8134,474683


In [57]:
# Distribution of content lengths over all firms and fields, used to pick a cutoff.
percentiles = [50, 75, 90, 91,92,93,94, 95, 96, 97, 98, 99]
lengths_series = pd.Series(lengths)

# quantile() expects fractions in [0, 1], so convert the percent values first
quantile_levels = list(map(lambda pct: pct / 100, percentiles))
combined_percentiles = lengths_series.quantile(quantile_levels)

print(f"\nPercentiles across all fields:")
print(combined_percentiles)


Percentiles across all fields:
0.50      24032.50
0.75      46155.00
0.90      88127.60
0.91      95146.06
0.92     105340.60
0.93     116400.34
0.94     142091.38
0.95     188747.05
0.96     302549.40
0.97     962496.26
0.98    1294888.34
0.99    5068581.84
dtype: float64


In [60]:
# Mean length of all entries shorter than 1,000,000 characters, divided by 4.
# NOTE(review): the /4 is presumably a characters-per-token estimate for the LLM - confirm.
lengths_series[lengths_series < 1000000].mean()/ 4

10070.988841064882

Estimate cost

In [12]:
# Total length of all entries below the cutoff, divided by 4, expressed in millions and multiplied by 1.25.
# NOTE(review): presumably tokens (~4 chars each) times a price of 1.25 per million tokens - confirm.
# NOTE(review): the hard-coded cutoff 976996.73 does not match the 0.97 quantile printed above
# (962496.26), so it looks copied from an earlier run; consider deriving it from lengths_series.
v = lengths_series[lengths_series < 976996.73 ]
int(v.sum() / 4) /1000000 * 1.25

155.39265125

In [19]:
# Release both databases: each cursor is closed before its connection.
for closeable in (cursor_websearch, conn_websearch, cursor_websites, conn_websites):
    closeable.close()