- This code uses the Bing Web Search API to retrieve search results for "Firm_Name + Field" and stores this in a sqlite database "firms_web_search_results.db" in the table named "firms_web_search_results".

Imports
- You need to have a .env file with BING_SEARCH_API_KEY defined

In [1]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd
import sqlite3
import time
import json
import random
from gemini_prompts import *
load_dotenv()

True

Class object to do bing search and catch errors

In [2]:
class BingSearchAPI():
    counter = 0

    def __init__(self) -> None:

        self.verbose = True

        # replace with list of keys once i run out
        self.subscription_key = os.getenv("BING_SEARCH_API_KEY")
        self.headers = {"Ocp-Apim-Subscription-Key": self.subscription_key}             
            
        self.search_url = "https://api.bing.microsoft.com/v7.0/search"

        self.params = {
            "q": "Insert Query here",
            "count": 20,  # Number of search results to return
            "offset": 0,  # The offset for pagination
            "mkt": "en-US",  # Region
            "safesearch": "Moderate"  # Safe search filter
        }

        self.search_results = {}
        self.webpages = {}
        self.request_timeout = 10

    def search(self, query: str) -> dict:
        self.webpages = {}
    
        if self.verbose: print("Conducting bing search for : ", query)
        
        self.params["q"] = query
        response = requests.get(self.search_url, headers=self.headers, params=self.params, timeout=self.request_timeout)
        self.counter +=1
        print("search counter at ", self.counter)

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(f"Request failed: {e}")
            return None
        
        # handle api credit limit by switching to paid api key
        if response.status_code == 403:
            print("API key has reached its limit")
            return None
        
        self.search_results = response.json()
    
        return self.search_results

Read in list of firms from the Operncorporates csv file 
- ids 2028 onwards are companies founded in 2024

In [4]:
# df = pd.read_csv("../Opencorporates/Equifax Project Data - Sheet1.csv")
df = pd.read_csv("../Opencorporates/Equifax Project Data - Sheet1.csv") # 2024 only companies
# shuffle order of rows
firms = df["name"].tolist()
firms = firms[:900]

In [5]:
firms

['" MROCK LLC "',
 '" OVER THE TOPP " INC',
 '"1856" Historical Preservation Society',
 '"2", Inc.',
 '"2nd" Chance, Inc.',
 '"2nd" Chance, Inc.',
 '"3 PPP Auto Parts & Warehouse Inc.',
 '"3" MENTORS, INC.',
 '"3-6-4" Club',
 '"447, LLC"',
 '"4D Automation, Inc."',
 '"7 - 2 - 14, Inc."',
 '"77" Tire & Oil Co., Inc.',
 '"78 Skateway" Skating Rink Company, Inc.',
 '"7K" Logging, Inc.',
 '"8" Point Hunting Club, Inc.',
 '"A Shining Star"',
 '"A Suttle Event" L.L.C.',
 '"A to Z, LLC"',
 '"A" Bail Bond Company, Incorporated',
 '"A" Mobile Home Transporters, Inc.',
 '"A" Ms. Mary Taxi Service LLC',
 '"A" Plus Cleaning & Restoration, Inc.',
 '"A" Plus Realty, L.L.C.',
 '"A" Thrift Sales, L.L.C.',
 '"A" xcellence Tutorial, LLC',
 '"ABC" ALL BASES COVERED COMMERCIAL CLEANING AND SANITIZING COMPANY, LLC',
 '"AG-ANALYSIS, INC."',
 '"AIP Corp"',
 '"AMERIPLAN, INC."',
 '"ARC" (Awakening Respect and Compassion for all Sentient Beings)',
 '"Abe & Jim Service" Incorporated',
 '"Aharas Chesed"',
 '"Ala

Connect to sqlite table that stores web search resultsa
- Note: Download the latest firms_web_search_results.db from Google drive (In the datasets folder)

In [6]:
conn = sqlite3.connect("firms_web_search_results.db")
cursor = conn.cursor()

cursor.execute('''
CREATE TABLE IF NOT EXISTS firms_web_search_results (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year TEXT,
               Number_Of_Employees TEXT,
               Revenue_Size TEXT,
               Website TEXT,
               NAICS_Code TEXT,
               SIC_Code TEXT,
               Status TEXT
               Dissolvement_Year TEXT,
               Company_Type TEXT,
               Previous_Names TEXT, 
               Alternative_Names TEXT, 
               Key_Executive_Personnel TEXT
               )
               ''')

<sqlite3.Cursor at 0x12a13f5c0>

# Override with examples

In [7]:
# example_firm = "Starbucks"
# firms = [example_firm]
# Note: ids btw 2023-2028 exclusive are examples

In [None]:
# cursor.execute("DELETE FROM firms_web_search_results WHERE id > 2023 AND id < 2028").fetchall()

[]

Loop to construct the dataset

In [9]:
bing_searcher = BingSearchAPI()
start_time = time.time()

for firm_name in firms:
    print("----- Debug: Now on firm: ", firm_name)

    # Check if firm already exists, insert row if it doesnt
    cursor.execute("SELECT id FROM firms_web_search_results WHERE Firm_Name = ?", (firm_name,))
    firm_row = cursor.fetchone()
    if firm_row is None:
        print("Inserting new firm:, ", firm_name)
        cursor.execute("INSERT INTO firms_web_search_results (Firm_Name) VALUES (?)", (firm_name,))
        firm_id = cursor.lastrowid 
    else:
        # Get the existing firm's id
        # print("Found row for firm, ", firm_name)
        firm_id = firm_row[0]
    
    print("Firm ID: ", firm_id)

    # Iterate through fields for the given firm
    for field in fields:

        # Check if the field value in the database is NULL
        cursor.execute(f"SELECT {field} FROM firms_web_search_results WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL", (firm_id,firm_name,))
        if cursor.fetchone() is not None:
            # print(f"Field '{field}' already has data for firm '{firm_name}', skipping.")
            continue

        query = f"{firm_name} , {field.replace('_', ' ')}"

        # print("---------- Debug: Now on field: ", field)

        # Get search results TODO: Handle search errors
        search_results = bing_searcher.search(query)
        
        if search_results is None:
            print("Search results are None")
            break

        json_combined_data = json.dumps(search_results)

        # Update cell value in database
        cursor.execute(f"""
                    UPDATE firms_web_search_results
                    SET {field} = ?
                    WHERE id = ? AND Firm_Name = ?
                    """, (json_combined_data, firm_id, firm_name))
        
        conn.commit()


----- Debug: Now on firm:  05 COATINGS LLC
Firm ID:  2076
----- Debug: Now on firm:  0X40 LLC
Firm ID:  2077
----- Debug: Now on firm:  10 STRAPS LLC
Firm ID:  2078
----- Debug: Now on firm:  10-4 GRADING CO
Firm ID:  2079
----- Debug: Now on firm:  1002 CASITAS HILL, LLC
Firm ID:  2080
----- Debug: Now on firm:  1003 AZRE HOLDINGS, LLC
Firm ID:  2081
----- Debug: Now on firm:  1010DATA SERVICES LLC
Firm ID:  2082
----- Debug: Now on firm:  1011 WEST NORTH TEMPLE, LLC
Firm ID:  2083
----- Debug: Now on firm:  10125 NORTH RIVERSIDE LANE, LLC
Firm ID:  2084
----- Debug: Now on firm:  10241 ROYAL DRIVE INDUSTRIES, LLC
Firm ID:  2085
----- Debug: Now on firm:  1025 WEST NORTH TEMPLE, LLC
Firm ID:  2086
----- Debug: Now on firm:  1028 CANYON VISTA, LLC
Firm ID:  2087
----- Debug: Now on firm:  1028 DENNER STREET INDUSTRIES, LLC
Firm ID:  2088
----- Debug: Now on firm:  1030A, LLC
Firm ID:  2089
----- Debug: Now on firm:  03MRCTR, LLC
Firm ID:  2090
----- Debug: Now on firm:  08MRCTR, LLC
Fi

In [10]:
cursor.close()
conn.close()

In [31]:
a = [1,2,3]
a[len(a):]

[]

Inspect the Database

In [23]:
cursor.execute(f"SELECT * FROM firms_web_search_results WHERE firm_name = '{example_firm}' ")
a = cursor.fetchall()
a

[(2027,
  'Starbucks',
  '{"_type": "SearchResponse", "queryContext": {"originalQuery": "Starbucks , Registered Address"}, "webPages": {"webSearchUrl": "https://www.bing.com/search?q=Starbucks+%2c+Registered+Address", "totalEstimatedMatches": 1740000, "value": [{"id": "https://api.bing.microsoft.com/api/v7/#WebPages.0", "name": "Starbucks Corporate Office Headquarters", "url": "https://corporateofficeheadquarters.com/starbucks-corporate-office-headquarters-hq/", "thumbnailUrl": "https://www.bing.com/th?id=OIP.xkhYBSS2DwYO9uZ4ZMp_6AHaE7&w=80&h=80&c=1&pid=5.1", "isFamilyFriendly": true, "displayUrl": "https://corporateofficeheadquarters.com/starbucks-corporate-office-headquarters-hq", "snippet": "Starbucks Corporate Office Address: 2401 Utah Ave S, Seattle, WA 98134, USA; Starbucks HQ Phone Number: 1-206-447-1575; Starbucks Main Office: 1-800-STARBUC (1-800-782-7282) Starbucks Customer Care: Accessible via their website or customer service line; Website: starbucks.com.", "dateLastCrawled

In [24]:
example = json.loads(a[0][2])
example

{'_type': 'SearchResponse',
 'queryContext': {'originalQuery': 'Starbucks , Registered Address'},
 'webPages': {'webSearchUrl': 'https://www.bing.com/search?q=Starbucks+%2c+Registered+Address',
  'totalEstimatedMatches': 1740000,
  'value': [{'id': 'https://api.bing.microsoft.com/api/v7/#WebPages.0',
    'name': 'Starbucks Corporate Office Headquarters',
    'url': 'https://corporateofficeheadquarters.com/starbucks-corporate-office-headquarters-hq/',
    'thumbnailUrl': 'https://www.bing.com/th?id=OIP.xkhYBSS2DwYO9uZ4ZMp_6AHaE7&w=80&h=80&c=1&pid=5.1',
    'isFamilyFriendly': True,
    'displayUrl': 'https://corporateofficeheadquarters.com/starbucks-corporate-office-headquarters-hq',
    'snippet': 'Starbucks Corporate Office Address: 2401 Utah Ave S, Seattle, WA 98134, USA; Starbucks HQ Phone Number: 1-206-447-1575; Starbucks Main Office: 1-800-STARBUC (1-800-782-7282) Starbucks Customer Care: Accessible via their website or customer service line; Website: starbucks.com.',
    'dateLas

In [25]:
cursor.close()
conn.close()