- This code uses the Bing Web Search API to retrieve search results for "Firm_Name + Field" and stores this in a sqlite database "firms_web_search_results.db" in the table named "firms_web_search_results".

Imports
- You need to have a .env file with BING_SEARCH_API_KEY defined

In [17]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd
import sqlite3
import time
import json
import random

# Load variables from the local .env file into the process environment so that
# os.getenv("BING_SEARCH_API_KEY") can find the key later on.
load_dotenv()

True

Class object to do bing search and catch errors

In [18]:
class BingSearchAPI():
    """Minimal client for the Bing Web Search v7 REST endpoint.

    The subscription key is read from the ``BING_SEARCH_API_KEY`` environment
    variable when the object is created. ``search`` reports every failure by
    printing a message and returning ``None`` instead of raising, so callers
    only need to check the return value.
    """

    # Class-level default. The first ``self.counter += 1`` in ``search`` rebinds
    # the name on the instance, so each instance counts its own requests.
    counter = 0

    def __init__(self) -> None:
        """Set up the endpoint, the auth header and the default query parameters."""
        self.verbose = True

        # replace with list of keys once i run out
        self.subscription_key = os.getenv("BING_SEARCH_API_KEY")
        self.headers = {"Ocp-Apim-Subscription-Key": self.subscription_key}

        self.search_url = "https://api.bing.microsoft.com/v7.0/search"

        # Template reused for every request; only "q" is overwritten per search.
        self.params = {
            "q": "Insert Query here",
            "count": 20,  # Number of search results to return
            "offset": 0,  # The offset for pagination
            "mkt": "en-US",  # Region
            "safesearch": "Moderate",  # Safe search filter
        }

        # Parsed JSON of the most recent successful search.
        self.search_results = {}
        # Reset at the start of every search(); not otherwise used in this class.
        self.webpages = {}
        # Passed as ``timeout`` to requests.get.
        self.request_timeout = 10

    def search(self, query: str) -> "dict | None":
        """Run one web search and return the decoded JSON response.

        Args:
            query: Search string sent as the ``q`` parameter.

        Returns:
            The parsed JSON body, or ``None`` when the request could not be
            completed: a network error or timeout, an HTTP error status
            (403 is reported separately as an exhausted API key), or a
            response body that is not valid JSON.
        """
        self.webpages = {}

        if self.verbose:
            print("Conducting bing search for : ", query)

        self.params["q"] = query

        # Timeouts and connection failures raise before any response exists;
        # report them the same way as HTTP errors instead of crashing the caller.
        try:
            response = requests.get(
                self.search_url,
                headers=self.headers,
                params=self.params,
                timeout=self.request_timeout,
            )
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            return None

        self.counter += 1
        print("search counter at ", self.counter)

        # handle api credit limit by switching to paid api key
        # This check has to run before raise_for_status(), which would
        # otherwise turn the 403 into a generic HTTPError first.
        if response.status_code == 403:
            print("API key has reached its limit")
            return None

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(f"Request failed: {e}")
            return None

        # A 2xx response with a non-JSON body makes .json() raise a ValueError
        # subclass; keep the previous search_results untouched in that case.
        try:
            self.search_results = response.json()
        except ValueError as e:
            print(f"Request failed: {e}")
            return None

        return self.search_results

    

Read in the list of firms from the OpenCorporates CSV file

In [19]:
# Path of the OpenCorporates export, relative to this notebook.
open_corporates_database = "../Opencorporates/Equifax Project Data - Sheet1.csv"

# Load the export and pull the "name" column out as a plain Python list.
df = pd.read_csv(open_corporates_database)
firms = df["name"].tolist()
print(len(firms))

10000


Define the fields we run a web search on for each firm

In [20]:
# One web search is issued per (firm, field) pair. Each name is also used as a
# column name in the firms_web_search_results table.
fields = [
    "Registered_Address",
    "CEO",
    "Establishment_Year",
    "Number_Of_Employees",
    "Revenue_Size",
    "Website",
    "NAICS_Code",
    "SIC_Code",
    "Status",
    "Dissolvement_Year",
    "Company_Type",
    "Previous_Names",
    "Alternative_Names",
    "Key_Executive_Personnel",
]
len(fields)

14

Connect to sqlite table that stores web search results
- Note: Download the latest firms_web_search_results.db from Google drive (In the datasets folder)

In [21]:
# Open (or create) the local SQLite file that stores the raw search results.
conn = sqlite3.connect("firms_web_search_results.db")
cursor = conn.cursor()

# One row per firm, one TEXT column per searched field.
# Every column definition must end with a comma: without one after
# "Status TEXT", SQLite reads "TEXT Dissolvement_Year TEXT" as the declared
# type of Status and never creates a Dissolvement_Year column.
# NOTE: IF NOT EXISTS leaves an already existing table untouched, so a database
# file created with the comma missing has to be recreated or altered by hand.
CREATE_TABLE_SQL = '''
CREATE TABLE IF NOT EXISTS firms_web_search_results (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year TEXT,
               Number_Of_Employees TEXT,
               Revenue_Size TEXT,
               Website TEXT,
               NAICS_Code TEXT,
               SIC_Code TEXT,
               Status TEXT,
               Dissolvement_Year TEXT,
               Company_Type TEXT,
               Previous_Names TEXT,
               Alternative_Names TEXT,
               Key_Executive_Personnel TEXT
               )
               '''

cursor.execute(CREATE_TABLE_SQL)

<sqlite3.Cursor at 0x16768b440>

Check how many firms are present

In [22]:
# Show how many firm rows the table currently holds.
print(cursor.execute(''' SELECT COUNT(id) FROM firms_web_search_results ''').fetchall())

[(2023,)]


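- Can also check how many firms already have a given field filled in (here `Alternative_Names`); add a `Firm_Name = ?` condition to check a specific firm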

In [23]:
# Count the rows whose Alternative_Names cell has already been filled in.
a = cursor.execute(
    "SELECT COUNT(*) FROM firms_web_search_results WHERE Alternative_Names IS NOT NULL"
).fetchall()
a

[(2023,)]

In [24]:
# Single shared search client; its constructor reads BING_SEARCH_API_KEY from the environment.
bing_searcher = BingSearchAPI()

### Loop to construct database

In [25]:
# First 1250 names taken from the OpenCorporates CSV list.
csv_firms = firms[:1250]  # new firms from opencorporates csv

# Names that already have a row in the results table.
existing_firms = [
    row[0]
    for row in cursor.execute("SELECT Firm_Name FROM firms_web_search_results").fetchall()
]

# Union of both sources with duplicates removed (a set does not keep any order).
firms = list(set(csv_firms + existing_firms))

In [26]:
# Number of distinct firm names the search loop will iterate over.
len(firms)

2023

In [27]:
# Fill the results table: for every firm, run one web search per field whose
# cell is still NULL and store the raw JSON response in that cell. Cells that
# already hold data are skipped, so the loop can be re-run to resume.
start_time = time.time()

try:
    for firm_name in firms:
        print("----- Debug: Now on firm: ", firm_name)

        # Check if firm already exists, insert row if it doesnt
        cursor.execute("SELECT id FROM firms_web_search_results WHERE Firm_Name = ?", (firm_name,))
        firm_row = cursor.fetchone()
        if firm_row is None:
            print("Inserting new firm:, ", firm_name)
            cursor.execute("INSERT INTO firms_web_search_results (Firm_Name) VALUES (?)", (firm_name,))
            firm_id = cursor.lastrowid
            # Commit right away so the new row survives even if every search
            # for this firm fails and no later commit happens.
            conn.commit()
        else:
            # Get the existing firm's id
            firm_id = firm_row[0]

        print("Firm ID: ", firm_id)

        # Iterate through fields for the given firm
        for field in fields:

            # Skip fields that already have a stored value. `field` comes from
            # the hard-coded `fields` list, so formatting it into the SQL text
            # is safe; the firm values are bound as parameters.
            cursor.execute(
                f"SELECT {field} FROM firms_web_search_results WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL",
                (firm_id, firm_name),
            )
            if cursor.fetchone() is not None:
                continue

            query = f"{firm_name} , {field.replace('_', ' ')}"

            search_results = bing_searcher.search(query)

            # A failed search abandons the remaining fields of this firm; they
            # stay NULL and are retried the next time the loop runs.
            if search_results is None:
                print("Search results are None")
                break

            json_combined_data = json.dumps(search_results)

            # Update cell value in database
            cursor.execute(f"""
                        UPDATE firms_web_search_results
                        SET {field} = ?
                        WHERE id = ? AND Firm_Name = ?
                        """, (json_combined_data, firm_id, firm_name))

            # Commit per cell so an interrupted run loses at most one result.
            conn.commit()
finally:
    # Always release the database, including on errors and manual interrupts.
    print(f"Elapsed time: {time.time() - start_time:.1f} s")
    cursor.close()
    conn.close()

----- Debug: Now on firm:  "PANIC APPRAISAL INC."
Firm ID:  835
----- Debug: Now on firm:  "ASSOCIATED LOCKSMITHS"
Firm ID:  1799
----- Debug: Now on firm:  1 DEPOT ST. LLC
Firm ID:  627
----- Debug: Now on firm:  "19 THIRTEEN" LLC
Firm ID:  1677
----- Debug: Now on firm:  "Island Style" LLC
Firm ID:  1010
----- Debug: Now on firm:  "A DEEPER CLEAN" RESIDENTIAL CLEANING SERVICES, LLC
Firm ID:  457
----- Debug: Now on firm:  "10" THE RESTAURANT, INC.
Firm ID:  519
----- Debug: Now on firm:  "One Love" Hearing Concepts, LLC
Firm ID:  1059
----- Debug: Now on firm:  "H20'S CONSTRUCTION GROUP, INC"
Firm ID:  64
----- Debug: Now on firm:  !FESTCOFFEEMISSION US INC.
Firm ID:  35
----- Debug: Now on firm:  "CHRISTIAN COMMUNITY SERVICE, INC."
Firm ID:  1904
----- Debug: Now on firm:  !Every1EatZ! LLC
Firm ID:  1805
----- Debug: Now on firm:  1020 M Street, LLC
Firm ID:  1197
----- Debug: Now on firm:  #GOTGLAMMED MOVEMENT
Firm ID:  1473
----- Debug: Now on firm:  101 MACRAE ROAD II, LLC
Firm I

In [37]:
# Manual cleanup for sessions where the loop cell did not get to close the
# database itself. Closing a cursor whose connection is already closed raises
# sqlite3.ProgrammingError, so that case is tolerated; closing the connection
# a second time is harmless.
try:
    cursor.close()
except sqlite3.ProgrammingError:
    # The connection was already closed, so there is nothing left to release.
    pass
conn.close()

In [19]:
# Twenty most frequent company_type labels in the OpenCorporates data.
df["company_type"].value_counts().head(20)

company_type
Domestic Limited Liability Company          1149
Limited Liability Company                    992
Domestic Profit Corporation                  368
Domestic LLC                                 360
Domestic Nonprofit Corporation               315
DOMESTIC LIMITED LIABILITY COMPANY           252
Corporation                                  232
DOMESTIC BUSINESS CORPORATION                229
Domestic Limited Liability Company (LLC)     223
Domestic For-Profit Corporation              184
LLC - Domestic                               160
Domestic Business Corporation                144
Domestic Corporation                         143
Domestic BCA                                 131
Kentucky Corporation                         128
Nonprofit Corporation                        120
CORPORATION                                  118
Limited Liability Company (D)                115
DLLC                                         114
Domestic for Profit                          108
Name: c