- This notebook uses the Bing Web Search API to retrieve search results for the query "Firm_Name + Field" and stores them in the SQLite database "firms_web_search_results.db", in the table named "firms_web_search_results".

Imports
- You need to have a .env file with BING_SEARCH_API_KEY defined

In [7]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd
import sqlite3
import time
import json
import numpy as np
import random
from gemini_prompts import *
load_dotenv()

True

Class object to do bing search and catch errors

In [8]:
class BingSearchAPI():
    """Small client for the Bing Web Search v7 endpoint.

    The subscription key is read from the ``BING_SEARCH_API_KEY`` environment
    variable. ``search`` never raises for request-level problems: it prints a
    message and returns ``None`` so that a calling loop can decide what to do.
    """

    # Number of responses received. The first ``self.counter += 1`` in
    # ``search`` rebinds this on the instance, so each instance counts from 0.
    counter = 0

    def __init__(self) -> None:
        """Set up the endpoint, auth header and default query parameters."""

        self.verbose = True

        # replace with list of keys once i run out
        self.subscription_key = os.getenv("BING_SEARCH_API_KEY")
        self.headers = {"Ocp-Apim-Subscription-Key": self.subscription_key}

        self.search_url = "https://api.bing.microsoft.com/v7.0/search"

        # Default query parameters; "q" is overwritten on every search() call.
        self.params = {
            "q": "Insert Query here",
            "count": 20,  # Number of search results to return
            "offset": 0,  # The offset for pagination
            "mkt": "en-US",  # Region
            "safesearch": "Moderate"  # Safe search filter
        }

        self.search_results = {}
        self.webpages = {}
        # Passed to requests.get as its ``timeout`` argument.
        self.request_timeout = 10

    def search(self, query: str) -> "dict | None":
        """Run one web search and return the decoded JSON response.

        Args:
            query: The search string; sent as the ``q`` parameter.

        Returns:
            The parsed JSON body as a dict, or ``None`` when the request could
            not be completed (network error or timeout), the API answered with
            an HTTP error status, or the body was not valid JSON. The reason
            is printed in each case.
        """
        self.webpages = {}

        if self.verbose:
            print("Conducting bing search for : ", query)

        self.params["q"] = query
        try:
            response = requests.get(
                self.search_url,
                headers=self.headers,
                params=self.params,
                timeout=self.request_timeout,
            )
        except requests.exceptions.RequestException as e:
            # Timeouts, connection failures, etc. No response was received,
            # so the counter is left untouched.
            print(f"Request failed: {e}")
            return None

        self.counter += 1
        print("search counter at ", self.counter)

        # handle api credit limit by switching to paid api key
        # This check has to come before raise_for_status(): that call raises
        # for every 4xx status, which would make this branch unreachable.
        if response.status_code == 403:
            print("API key has reached its limit")
            return None

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(f"Request failed: {e}")
            return None

        try:
            self.search_results = response.json()
        except ValueError as e:
            # A body that is not JSON (e.g. an HTML error page) is treated
            # like any other failed request.
            print(f"Response was not valid JSON: {e}")
            return None

        return self.search_results

Read in the list of firms from the OpenCorporates CSV file
- ids 2028 onwards are companies founded in 2024

In [9]:
# Alternative input file, kept for reference:
# df = pd.read_csv("../Opencorporates/Equifax Project Data - Sheet1.csv")
df = pd.read_csv("../Opencorporates/Companies2024Only.csv")  # 2024-only companies

# Take rows 1000-1999 of the "name" column, in file order (nothing is shuffled here).
firms = df["name"].iloc[1000:2000].tolist()

In [10]:
# Scratch check: draw 10 distinct positions within `firms`; the array is only displayed, not stored.
np.random.choice(len(firms), size=10, replace=False)

array([567, 505, 407, 249, 149, 543, 422,  65, 185, 961])

In [11]:
# Draw 3 distinct firms at random (no seed is set in the cells shown, so the sample varies between runs).
firms_sample = random.sample(firms, k=3)

In [12]:
# Display the sampled firm names.
firms_sample

['1003 N KRESSON OWNER LLC',
 '10 PORTLAND STREET SOUTH BERWICK LLC',
 '100 Dates, LLC']

Connect to the SQLite table that stores web search results
- Note: Download the latest firms_web_search_results.db from Google drive (In the datasets folder)

In [13]:
# Open (or create) the local SQLite file that stores the raw search responses.
conn = sqlite3.connect("firms_web_search_results.db")
cursor = conn.cursor()

# One row per firm, one TEXT column per searched field.
# Note the comma after `Status TEXT`: without it SQLite parses
# "TEXT Dissolvement_Year TEXT" as the declared type of Status and the
# Dissolvement_Year column is never created.
CREATE_TABLE_SQL = '''
CREATE TABLE IF NOT EXISTS firms_web_search_results (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year TEXT,
               Number_Of_Employees TEXT,
               Revenue_Size TEXT,
               Website TEXT,
               NAICS_Code TEXT,
               SIC_Code TEXT,
               Status TEXT,
               Dissolvement_Year TEXT,
               Company_Type TEXT,
               Previous_Names TEXT, 
               Alternative_Names TEXT, 
               Key_Executive_Personnel TEXT
               )
               '''
cursor.execute(CREATE_TABLE_SQL)

# CREATE TABLE IF NOT EXISTS leaves an existing table untouched, so a table
# that was created without the Dissolvement_Year column is repaired here.
# PRAGMA table_info yields one row per column; index 1 is the column name.
existing_columns = {row[1] for row in cursor.execute("PRAGMA table_info(firms_web_search_results)")}
if "Dissolvement_Year" not in existing_columns:
    cursor.execute("ALTER TABLE firms_web_search_results ADD COLUMN Dissolvement_Year TEXT")

<sqlite3.Cursor at 0x1051199c0>

# Override with examples

Loop to construct the dataset

In [14]:
# Fill in missing field columns for every sampled firm, one Bing search per
# (firm, field) pair. Fields that already hold a value are skipped, so the
# cell can be re-run to resume an interrupted collection.
bing_searcher = BingSearchAPI()
start_time = time.time()

for firm_name in firms_sample:
    print("----- Debug: Now on firm: ", firm_name)

    # Check if firm already exists, insert row if it doesnt
    cursor.execute("SELECT id FROM firms_web_search_results WHERE Firm_Name = ?", (firm_name,))
    firm_row = cursor.fetchone()
    if firm_row is None:
        print("Inserting new firm:, ", firm_name)
        cursor.execute("INSERT INTO firms_web_search_results (Firm_Name) VALUES (?)", (firm_name,))
        firm_id = cursor.lastrowid
        # Commit the new row immediately. Otherwise it is only persisted by a
        # later successful UPDATE and is lost if the first search fails and
        # the connection is closed before anything else commits.
        conn.commit()
    else:
        # Get the existing firm's id
        firm_id = firm_row[0]

    print("Firm ID: ", firm_id)

    # Iterate through fields for the given firm
    # NOTE(review): `fields` is not defined in this notebook; presumably it
    # comes from `from gemini_prompts import *` - confirm. Each entry is
    # interpolated into the SQL text as a column name, so it must be a
    # trusted identifier that matches a table column.
    for field in fields:

        # Skip fields that already hold a stored response for this firm.
        cursor.execute(f"SELECT {field} FROM firms_web_search_results WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL", (firm_id,firm_name,))
        if cursor.fetchone() is not None:
            continue

        # Underscores in the column name become spaces in the search query.
        query = f"{firm_name} , {field.replace('_', ' ')}"

        # Get search results TODO: Handle search errors
        search_results = bing_searcher.search(query)

        if search_results is None:
            # Give up on the remaining fields of this firm only; the outer
            # loop continues with the next firm.
            print("Search results are None")
            break

        # Store the whole response as a JSON string in the field's column.
        json_combined_data = json.dumps(search_results)

        # Update cell value in database
        cursor.execute(f"""
                    UPDATE firms_web_search_results
                    SET {field} = ?
                    WHERE id = ? AND Firm_Name = ?
                    """, (json_combined_data, firm_id, firm_name))

        # Commit per field so completed searches survive an interruption.
        conn.commit()


----- Debug: Now on firm:  1003 N KRESSON OWNER LLC
Inserting new firm:,  1003 N KRESSON OWNER LLC
Firm ID:  2929
Conducting bing search for :  1003 N KRESSON OWNER LLC , Registered Address
search counter at  1
Conducting bing search for :  1003 N KRESSON OWNER LLC , CEO
search counter at  2
Conducting bing search for :  1003 N KRESSON OWNER LLC , Establishment Year
search counter at  3
Conducting bing search for :  1003 N KRESSON OWNER LLC , Number Of Employees
search counter at  4
Conducting bing search for :  1003 N KRESSON OWNER LLC , Revenue Size
search counter at  5
Conducting bing search for :  1003 N KRESSON OWNER LLC , Website
search counter at  6
Conducting bing search for :  1003 N KRESSON OWNER LLC , NAICS Code
search counter at  7
Conducting bing search for :  1003 N KRESSON OWNER LLC , SIC Code
search counter at  8
Conducting bing search for :  1003 N KRESSON OWNER LLC , Status
search counter at  9
Conducting bing search for :  1003 N KRESSON OWNER LLC , Dissolvement Year

In [15]:
# Convert firms_sample to a tuple
data = cursor.execute("SELECT id, firm_name FROM firms_web_search_results WHERE Firm_Name IN ({})".format(
    ",".join(["?"] * len(firms_sample))
), firms_sample).fetchall()
print(data)
df = pd.DataFrame(data, columns=["id", "name"])
df.to_csv("firms_sample.csv", index=False)

[(2929, '1003 N KRESSON OWNER LLC'), (2930, '10 PORTLAND STREET SOUTH BERWICK LLC'), (2931, '100 Dates, LLC')]


In [16]:
# Fetch every column of the sampled firms' rows, again binding one "?" per firm name.
placeholders = ",".join(["?"] * len(firms_sample))
all_columns_query = "SELECT * FROM firms_web_search_results WHERE Firm_Name IN ({})".format(placeholders)
data = cursor.execute(all_columns_query, firms_sample).fetchall()
data

[(2929,
  '1003 N KRESSON OWNER LLC',
  '{"_type": "SearchResponse", "queryContext": {"originalQuery": "1003 N KRESSON OWNER LLC , Registered Address"}, "webPages": {"webSearchUrl": "https://www.bing.com/search?q=1003+N+KRESSON+OWNER+LLC+%2c+Registered+Address", "totalEstimatedMatches": 69800, "value": [{"id": "https://api.bing.microsoft.com/api/v7/#WebPages.0", "name": "1003 N KRESSON OWNER LLC - Bizapedia", "url": "https://www.bizapedia.com/us/1003-n-kresson-owner-llc.html", "isFamilyFriendly": true, "displayUrl": "https://www.bizapedia.com/us/1003-n-kresson-owner-llc.html", "snippet": "There are 2 companies that go by the name of 1003 N Kresson Owner LLC. These companies are located in Baltimore MD, Dover DE, and New York NY. ... DELAWARE DOMESTIC LIMITED-LIABILITY COMPANY: WRITE REVIEW: Address: 555 E. Loockerman Street Suite 320 Dover, DE 19901: Registered Agent: Platinum Filings LLC: Filing Date: July 29, 2024: File Number ...", "dateLastCrawled": "2024-11-27T17:20:00.0000000Z", 

In [17]:
# Decode the stored JSON string in column index 2 of the first returned row
# (Registered_Address, going by the CREATE TABLE column order in this notebook).
first_row = data[0]
data_dict = json.loads(first_row[2])
data_dict

{'_type': 'SearchResponse',
 'queryContext': {'originalQuery': '1003 N KRESSON OWNER LLC , Registered Address'},
 'webPages': {'webSearchUrl': 'https://www.bing.com/search?q=1003+N+KRESSON+OWNER+LLC+%2c+Registered+Address',
  'totalEstimatedMatches': 69800,
  'value': [{'id': 'https://api.bing.microsoft.com/api/v7/#WebPages.0',
    'name': '1003 N KRESSON OWNER LLC - Bizapedia',
    'url': 'https://www.bizapedia.com/us/1003-n-kresson-owner-llc.html',
    'isFamilyFriendly': True,
    'displayUrl': 'https://www.bizapedia.com/us/1003-n-kresson-owner-llc.html',
    'snippet': 'There are 2 companies that go by the name of 1003 N Kresson Owner LLC. These companies are located in Baltimore MD, Dover DE, and New York NY. ... DELAWARE DOMESTIC LIMITED-LIABILITY COMPANY: WRITE REVIEW: Address: 555 E. Loockerman Street Suite 320 Dover, DE 19901: Registered Agent: Platinum Filings LLC: Filing Date: July 29, 2024: File Number ...',
    'dateLastCrawled': '2024-11-27T17:20:00.0000000Z',
    'cached

Show results

In [18]:
# sqlite3 does not commit implicitly on close, so flush any pending writes
# first; otherwise uncommitted changes are discarded.
conn.commit()
cursor.close()
conn.close()

Inspect the Database