- This notebook uses the Bing Web Search API to retrieve search results for the query "Firm_Name + Field" and stores them in the SQLite database "firms_web_search_results.db", in the table named "firms_web_search_results".

Imports
- You need to have a .env file with BING_SEARCH_API_KEY defined

In [1]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd
import sqlite3
import time
import json
import random

# Load variables from a local .env file into the process environment so that
# os.getenv("BING_SEARCH_API_KEY") can find the key later in the notebook.
load_dotenv()

True

Define the fields we run a web search on for each firm

In [2]:
# Initial set of per-firm attributes to search for. Each name is also a column
# of the results table; underscores are turned into spaces when the search
# query is built. This list is replaced by a longer one in a later cell.
fields = [
    "Registered_Address",
    "CEO",
    "Establishment_Year",
    "Number_Of_Employees",
    "Revenue_Size",
    "Website",
    "NAICS_Code",
    "SIC_Code",
    "Status",
]

Class that runs Bing searches and handles request errors

In [3]:
class BingSearchAPI():
    """Small client for the Bing Web Search v7 endpoint.

    The subscription key is read from the ``BING_SEARCH_API_KEY`` environment
    variable when the object is constructed. ``search`` sends one GET request
    per call and returns the decoded JSON body, or ``None`` on any failure.
    """

    # Number of requests that received an HTTP response. ``self.counter += 1``
    # in search() rebinds the name on the instance, so every instance keeps
    # its own count starting from this class-level 0.
    counter = 0

    def __init__(self) -> None:
        """Set up the endpoint URL, auth header and default query parameters."""

        # When True, search() prints the query before sending it.
        self.verbose = True

        # replace with list of keys once i run out
        self.subscription_key = os.getenv("BING_SEARCH_API_KEY")
        self.headers = {"Ocp-Apim-Subscription-Key": self.subscription_key}

        self.search_url = "https://api.bing.microsoft.com/v7.0/search"

        # Reused for every request; only "q" is overwritten by search().
        self.params = {
            "q": "Insert Query here",
            "count": 20,  # Number of search results to return
            "offset": 0,  # The offset for pagination
            "mkt": "en-US",  # Region
            "safesearch": "Moderate"  # Safe search filter
        }

        self.search_results = {}
        self.webpages = {}
        # Passed to requests.get as ``timeout`` (seconds).
        self.request_timeout = 10

    def search(self, query: str) -> "dict | None":
        """Run one web search and return the decoded JSON response.

        Args:
            query: Free-text search string sent as the ``q`` parameter.

        Returns:
            The response body as a dict (also stored in ``self.search_results``),
            or ``None`` if the request could not be sent, the server answered
            with an HTTP error status, or the body was not valid JSON. A
            message describing the failure is printed in each case.
        """
        self.webpages = {}

        if self.verbose: print("Conducting bing search for : ", query)

        self.params["q"] = query

        # Timeouts and connection errors are raised by requests.get itself, so
        # it has to sit inside the try block; otherwise one flaky request
        # aborts the caller's whole loop.
        try:
            response = requests.get(self.search_url, headers=self.headers, params=self.params, timeout=self.request_timeout)
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            return None

        self.counter +=1
        print("search counter at ", self.counter)

        # handle api credit limit by switching to paid api key
        # This check must come before raise_for_status(): that call raises for
        # every 4xx/5xx status, so a 403 would never reach a later check.
        if response.status_code == 403:
            print("API key has reached its limit")
            return None

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(f"Request failed: {e}")
            return None

        # A 2xx answer with a non-JSON body makes .json() raise a ValueError
        # subclass; report it as a failed search instead of crashing.
        try:
            self.search_results = response.json()
        except ValueError as e:
            print(f"Could not decode response as JSON: {e}")
            return None

        return self.search_results
    

Read in the list of firms from the OpenCorporates CSV file

In [4]:
# Location of the OpenCorporates export, relative to this notebook.
open_corporates_database = "../Opencorporates/Equifax Project Data - Sheet1.csv"

# Load the full sheet, then pull the "name" column out as a plain Python list
# of firm names to iterate over later.
df = pd.read_csv(open_corporates_database)
firms = df["name"].tolist()
print(len(firms))

10000


In [5]:
# Frequency of each company_type label; head() keeps the first five rows of
# the counts.
df['company_type'].value_counts().head()

company_type
Domestic Limited Liability Company    1149
Limited Liability Company              992
Domestic Profit Corporation            368
Domestic LLC                           360
Domestic Nonprofit Corporation         315
Name: count, dtype: int64

In [6]:
# Full set of per-firm attributes to search for. Every entry is used both as a
# column name in firms_web_search_results and, with underscores replaced by
# spaces, as the second half of the search query.
fields = [
    "Registered_Address",
    "CEO",
    "Establishment_Year",
    "Number_Of_Employees",
    "Revenue_Size",
    "Website",
    "NAICS_Code",
    "SIC_Code",
    "Status",
    "Dissolvement_Year",
    "Company_Type",
    "Previous_Names",
    "Alternative_Names",
    "Key_Executive_Personnel",
]

Connect to sqlite table that stores web search results
- Note: download the latest firms_web_search_results.db from Google Drive (in the datasets folder) before running this cell

In [7]:
# Open (or create) the local results database.
conn = sqlite3.connect("firms_web_search_results.db")
cursor = conn.cursor()

# One row per firm, one TEXT column per searched field.
# Every column definition must end with a comma: without one after
# "Status TEXT", SQLite reads the following tokens as part of Status's type
# name and silently never creates the Dissolvement_Year column.
cursor.execute('''
CREATE TABLE IF NOT EXISTS firms_web_search_results (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year TEXT,
               Number_Of_Employees TEXT,
               Revenue_Size TEXT,
               Website TEXT,
               NAICS_Code TEXT,
               SIC_Code TEXT,
               Status TEXT,
               Dissolvement_Year TEXT,
               Company_Type TEXT,
               Previous_Names TEXT,
               Alternative_Names TEXT,
               Key_Executive_Personnel TEXT
               )
               ''')

# CREATE TABLE IF NOT EXISTS does nothing when the table is already there, so
# a database built by an earlier version of this cell can still be missing
# some of the later columns. Add whichever of them are absent.
cursor.execute("PRAGMA table_info(firms_web_search_results)")
existing_columns = {row[1] for row in cursor.fetchall()}  # row[1] is the column name
for column_name in ("Dissolvement_Year", "Company_Type", "Previous_Names",
                    "Alternative_Names", "Key_Executive_Personnel"):
    if column_name not in existing_columns:
        # column_name comes from the fixed tuple above, never from user input.
        cursor.execute(f"ALTER TABLE firms_web_search_results ADD COLUMN {column_name} TEXT")

conn.commit()

<sqlite3.Cursor at 0x11ff0c3c0>

Check how many firms are present

In [8]:
# Count the rows currently stored in the results table. fetchall() returns a
# list holding a single one-element tuple with the count.
cursor.execute(''' SELECT COUNT(id) FROM firms_web_search_results ''')
print(cursor.fetchall())

[(1899,)]


- can also check if a specific firm is there

In [9]:
# Fetch one stored row by primary key to inspect what was saved for it.
# The id is passed as a bound parameter rather than formatted into the SQL
# text (the original f-string had no placeholders at all).
cursor.execute("SELECT * FROM firms_web_search_results WHERE id = ?", (1510,))
a = cursor.fetchall()
a

[(1510,
  '$TAY FOCU$ LLC',
  '{"_type": "SearchResponse", "queryContext": {"originalQuery": "$TAY FOCU$ LLC , Registered Address"}, "webPages": {"webSearchUrl": "https://www.bing.com/search?q=%24TAY+FOCU%24+LLC+%2c+Registered+Address", "totalEstimatedMatches": 654000, "value": [{"id": "https://api.bing.microsoft.com/api/v7/#WebPages.0", "name": "$TAY FOCU$ LLC :: Arkansas (US) :: OpenCorporates", "url": "https://opencorporates.com/companies/us_ar/811285219", "isFamilyFriendly": true, "displayUrl": "https://opencorporates.com/companies/us_ar/811285219", "snippet": "Free and open company data on Arkansas (US) company $TAY FOCU$ LLC (company number 811285219)", "dateLastCrawled": "2023-07-21T21:20:00.0000000Z", "cachedPageUrl": "http://cc.bingj.com/cache.aspx?q=%24TAY+FOCU%24+LLC+%2c+Registered+Address&d=5052952179245355&mkt=en-US&setlang=en-US&w=7Pc0KUzyFzb50QTA1cdjwpp6DSMVYY7e", "language": "en", "isNavigational": true, "noCache": false, "siteName": "OpenCorporates"}, {"id": "https://a

In [10]:
# Client instance used by the collection loop; the constructor reads
# BING_SEARCH_API_KEY from the environment.
bing_searcher = BingSearchAPI()

### Loop to construct database

In [16]:
# Limit this run to the first 1250 firms. NOTE: this overwrites `firms`; re-run
# the CSV-loading cell to get the full list back.
firms = firms[:1250]

In [17]:
# Fill the results table: for every firm, run one Bing search per field that
# is still NULL and store the raw JSON response in that field's column.
# Work is committed after every write, so the loop can be interrupted and
# re-run; fields that already hold data are skipped.
start_time = time.time()

try:
    for firm_name in firms:
        print("----- Debug: Now on firm: ", firm_name)

        # Check if firm already exists, insert row if it doesnt
        cursor.execute("SELECT id FROM firms_web_search_results WHERE Firm_Name = ?", (firm_name,))
        firm_row = cursor.fetchone()
        if firm_row is None:
            print("Inserting new firm:, ", firm_name)
            cursor.execute("INSERT INTO firms_web_search_results (Firm_Name) VALUES (?)", (firm_name,))
            firm_id = cursor.lastrowid
            # Persist the new row right away so it is not lost if the first
            # search for this firm fails before any other commit happens.
            conn.commit()
        else:
            # Get the existing firm's id
            firm_id = firm_row[0]

        print("Firm ID: ", firm_id)

        # Iterate through fields for the given firm
        for field in fields:

            # Skip the field when the database already holds a value for it.
            # `field` comes from the fixed `fields` list, so formatting it
            # into the SQL text as a column name is safe here.
            cursor.execute(f"SELECT {field} FROM firms_web_search_results WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL", (firm_id,firm_name,))
            if cursor.fetchone() is not None:
                continue

            query = f"{firm_name} , {field.replace('_', ' ')}"

            search_results = bing_searcher.search(query)

            if search_results is None:
                print("Search results are None")
                # Abandon the remaining fields of this firm only; the outer
                # loop moves on to the next firm.
                break

            json_combined_data = json.dumps(search_results)

            # Update cell value in database
            cursor.execute(f"""
                    UPDATE firms_web_search_results
                    SET {field} = ?
                    WHERE id = ? AND Firm_Name = ?
                    """, (json_combined_data, firm_id, firm_name))

            conn.commit()
finally:
    # Always release the database handles, including when the loop is stopped
    # by an exception or a kernel interrupt.
    cursor.close()
    conn.close()
    print(f"Elapsed: {time.time() - start_time:.1f} s")

----- Debug: Now on firm:  " MROCK LLC "
Firm ID:  897
----- Debug: Now on firm:  " OVER THE TOPP " INC
Firm ID:  898
----- Debug: Now on firm:  "1856" Historical Preservation Society
Firm ID:  899
----- Debug: Now on firm:  "2", Inc.
Firm ID:  900
----- Debug: Now on firm:  "2nd" Chance, Inc.
Firm ID:  901
----- Debug: Now on firm:  "2nd" Chance, Inc.
Firm ID:  901
----- Debug: Now on firm:  "3 PPP Auto Parts & Warehouse Inc.
Firm ID:  902
----- Debug: Now on firm:  "3" MENTORS, INC.
Firm ID:  429
----- Debug: Now on firm:  "3-6-4" Club
Firm ID:  903
----- Debug: Now on firm:  "447, LLC"
Firm ID:  904
----- Debug: Now on firm:  "4D Automation, Inc."
Firm ID:  905
----- Debug: Now on firm:  "7 - 2 - 14, Inc."
Firm ID:  906
----- Debug: Now on firm:  "77" Tire & Oil Co., Inc.
Firm ID:  907
----- Debug: Now on firm:  "78 Skateway" Skating Rink Company, Inc.
Firm ID:  908
----- Debug: Now on firm:  "7K" Logging, Inc.
Firm ID:  909
----- Debug: Now on firm:  "8" Point Hunting Club, Inc.
Fi

In [15]:
# Position of this firm name in `firms`; list.index raises ValueError if the
# name is not present.
firms.index('"D" ENTERPRISES, INC.')

1248

In [37]:
# Release the database handles. The connection cell must be re-run before any
# further queries can be executed.
cursor.close()
conn.close()

In [19]:
# Twenty most frequent company_type labels. The output shows the same legal
# form under several spellings (e.g. "Domestic LLC" and
# "DOMESTIC LIMITED LIABILITY COMPANY"), so these raw counts are per label.
df.company_type.value_counts().head(20)

company_type
Domestic Limited Liability Company          1149
Limited Liability Company                    992
Domestic Profit Corporation                  368
Domestic LLC                                 360
Domestic Nonprofit Corporation               315
DOMESTIC LIMITED LIABILITY COMPANY           252
Corporation                                  232
DOMESTIC BUSINESS CORPORATION                229
Domestic Limited Liability Company (LLC)     223
Domestic For-Profit Corporation              184
LLC - Domestic                               160
Domestic Business Corporation                144
Domestic Corporation                         143
Domestic BCA                                 131
Kentucky Corporation                         128
Nonprofit Corporation                        120
CORPORATION                                  118
Limited Liability Company (D)                115
DLLC                                         114
Domestic for Profit                          108
Name: c