### Explanation

- This code uses the Bing Web Search API to retrieve search results for each "Firm_Name + Field" query and stores them in the SQLite database "firms_web_search_results.db", in the table named "firms_web_search_results".

Imports
- You need to have a .env file with BING_SEARCH_API_KEY defined

In [1]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd
import sqlite3
import time
import json
import random
from gemini_prompts import *
load_dotenv()

True

# 1. Define class object to do bing search and catch errors

In [2]:
class BingSearchAPI():
    """Small client around the Bing Web Search v7 endpoint.

    The subscription key is read from the ``BING_SEARCH_API_KEY`` environment
    variable when the object is created. Every failure path of ``search``
    prints a message and returns ``None`` instead of raising, so a long
    scraping loop can decide what to do without wrapping each call.
    """

    # Class-level default. ``self.counter += 1`` in search() rebinds the name
    # on the instance, so each instance keeps its own request count.
    counter = 0

    def __init__(self) -> None:
        """Set up headers, endpoint URL, default query parameters and timeout."""

        self.verbose = True

        # Single key for now; replace with a list of keys once this one runs out.
        self.subscription_key = os.getenv("BING_SEARCH_API_KEY")
        self.headers = {"Ocp-Apim-Subscription-Key": self.subscription_key}

        self.search_url = "https://api.bing.microsoft.com/v7.0/search"

        self.params = {
            "q": "Insert Query here",
            "count": 20,  # Number of search results to return
            "offset": 0,  # The offset for pagination
            "mkt": "en-US",  # Region
            "safesearch": "Moderate"  # Safe search filter
        }

        self.search_results = {}
        self.webpages = {}
        self.request_timeout = 10  # seconds, passed to requests.get

    def search(self, query: str) -> "dict | None":
        """Run one web search and return the decoded JSON response.

        Args:
            query: Search string; it is stored in ``self.params["q"]`` before
                the request is sent.

        Returns:
            The parsed JSON body as a dict (also kept in
            ``self.search_results``), or ``None`` when the request could not
            be sent, the server answered with an error status, or the body
            was not valid JSON.
        """
        self.webpages = {}

        if self.verbose: print("Conducting bing search for : ", query)

        self.params["q"] = query

        # Timeouts and connection errors are raised by requests.get itself,
        # before there is any response object to inspect.
        try:
            response = requests.get(self.search_url, headers=self.headers, params=self.params, timeout=self.request_timeout)
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            return None

        # Only requests that actually received a response are counted.
        self.counter += 1
        print("search counter at ", self.counter)

        # This check has to come BEFORE raise_for_status(): that call raises
        # for every 4xx/5xx status, which would make a later 403 test dead code.
        # NOTE(review): 403 is treated as "quota exhausted" as in the original
        # code -- confirm against the Bing API error-code documentation.
        if response.status_code == 403:
            print("API key has reached its limit")
            return None

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(f"Request failed: {e}")
            return None

        # A 2xx answer with a malformed body should not abort the caller's loop.
        try:
            self.search_results = response.json()
        except ValueError as e:
            print(f"Could not decode response body as JSON: {e}")
            return None

        return self.search_results

#### Read in list of firms obtained from Opencorporates
- Note: Ids 2028 onwards are companies exclusively founded in 2024

In [3]:
# Load the Opencorporates export (2024-only companies) and keep the
# company names as a plain Python list.
companies_csv_path = "../Opencorporates/Companies2024Only.csv"
df = pd.read_csv(companies_csv_path)
firms = df["name"].tolist()

# 2. Connect to sqlite table that stores web search results
- Note: Download the latest firms_web_search_results.db from Google Drive (in the datasets folder), and upload the updated file back there when done.

In [4]:
# Schema of the results table: one row per firm and one TEXT column per
# searched field. The comma after `Status TEXT` matters: without it SQLite
# does not create a separate Dissolvement_Year column.
CREATE_FIRMS_TABLE_SQL = '''
CREATE TABLE IF NOT EXISTS firms_web_search_results (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year TEXT,
               Number_Of_Employees TEXT,
               Revenue_Size TEXT,
               Website TEXT,
               NAICS_Code TEXT,
               SIC_Code TEXT,
               Status TEXT,
               Dissolvement_Year TEXT,
               Company_Type TEXT,
               Previous_Names TEXT, 
               Alternative_Names TEXT, 
               Key_Executive_Personnel TEXT
               )
               '''

# Open (or create) the database file in the current working directory.
conn = sqlite3.connect("firms_web_search_results.db")
cursor = conn.cursor()

# IF NOT EXISTS leaves an already existing table untouched, so a database
# file whose table was created earlier keeps whatever columns it has.
cursor.execute(CREATE_FIRMS_TABLE_SQL)

<sqlite3.Cursor at 0x137026040>

# 3. Execute Main Loop to construct the database
- Firms that already have a row are reused, and fields that already hold a value are skipped, so no search is repeated.

In [5]:
# Small hand-picked list of firm names used for the demo run in place of
# the full list of firms.
firms_sample = ["Paces"]

Loop to construct the dataset

In [6]:
bing_searcher = BingSearchAPI()
start_time = time.time()

# currently overridden to use examples
for firm_name in firms_sample:
    print("----- Debug: Now on firm: ", firm_name)

    # Look the firm up by name and create its row the first time it is seen.
    cursor.execute("SELECT id FROM firms_web_search_results WHERE Firm_Name = ?", (firm_name,))
    firm_row = cursor.fetchone()
    if firm_row is None:
        print("Inserting new firm:, ", firm_name)
        cursor.execute("INSERT INTO firms_web_search_results (Firm_Name) VALUES (?)", (firm_name,))
        firm_id = cursor.lastrowid
        # Commit right away: if the very first search below fails, the loop
        # breaks before reaching the per-field commit, and the new row would
        # otherwise sit in an open transaction and be lost when the
        # connection is closed.
        conn.commit()
    else:
        # Reuse the existing firm's id
        firm_id = firm_row[0]

    print("Firm ID: ", firm_id)

    # NOTE(review): `fields` is not defined in this notebook; presumably it
    # comes from `from gemini_prompts import *` -- confirm. Each entry is
    # interpolated into SQL as a column name, so it must be a trusted
    # identifier that matches a column of the table.
    for field in fields:

        # A returned row means this field already holds a value: skip it so
        # no search is spent on data that is already stored.
        cursor.execute(f"SELECT {field} FROM firms_web_search_results WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL", (firm_id,firm_name,))
        if cursor.fetchone() is not None:
            continue

        # Column name with underscores turned into spaces becomes the search term.
        query = f"{firm_name} , {field.replace('_', ' ')}"

        search_results = bing_searcher.search(query)

        # search() signals every failure with None. Stop working on this
        # firm; the outer loop then moves on to the next firm.
        if search_results is None:
            print("Search results are None")
            break

        json_combined_data = json.dumps(search_results)

        # Store the raw JSON response in the field's column
        cursor.execute(f"""
                    UPDATE firms_web_search_results
                    SET {field} = ?
                    WHERE id = ? AND Firm_Name = ?
                    """, (json_combined_data, firm_id, firm_name))

        # Commit per field so an interrupted run keeps everything fetched so far.
        conn.commit()


----- Debug: Now on firm:  Paces
Inserting new firm:,  Paces
Firm ID:  2945
Conducting bing search for :  Paces , Registered Address
search counter at  1
Conducting bing search for :  Paces , CEO
search counter at  2
Conducting bing search for :  Paces , Establishment Year
search counter at  3
Conducting bing search for :  Paces , Number Of Employees
search counter at  4
Conducting bing search for :  Paces , Revenue Size
search counter at  5
Conducting bing search for :  Paces , Website
search counter at  6
Conducting bing search for :  Paces , NAICS Code
search counter at  7
Conducting bing search for :  Paces , SIC Code
search counter at  8
Conducting bing search for :  Paces , Status
search counter at  9
Conducting bing search for :  Paces , Dissolvement Year
search counter at  10
Conducting bing search for :  Paces , Company Type
search counter at  11
Conducting bing search for :  Paces , Previous Names
search counter at  12
Conducting bing search for :  Paces , Alternative Names
s

#  Display some results for Demo purposes

In [7]:
# Fetch the complete stored rows for every sample firm, using one "?"
# placeholder per name in the IN (...) list.
sample_placeholders = ",".join("?" for _ in firms_sample)
sample_rows_sql = f"SELECT * FROM firms_web_search_results WHERE Firm_Name IN ({sample_placeholders})"
data = cursor.execute(sample_rows_sql, firms_sample).fetchall()
data

[(2945,
  'Paces',
  '{"_type": "SearchResponse", "queryContext": {"originalQuery": "Paces , Registered Address"}, "webPages": {"webSearchUrl": "https://www.bing.com/search?q=Paces+%2c+Registered+Address", "totalEstimatedMatches": 1150000, "value": [{"id": "https://api.bing.microsoft.com/api/v7/#WebPages.0", "name": "PACES OFFICES \\u2013 Palestine Association for Children\'s Encouragement of ...", "url": "https://www.pacescharity.org/regional-office/", "isFamilyFriendly": true, "displayUrl": "https://www.pacescharity.org/regional-office", "snippet": "REGISTERED OFFICE Carter Lemon Camerons LLP 3 rd Floor, 20 King Street London EC2V 8EG. ... Head Office. Phone: +972 22421771 Email: [email protected] Address: Ramallah/Al Bireh AL Fara\\u2019 Building, Baghdad Street. REGISTRATION. PACES is a UK Registered Charity Number 1117085.", "dateLastCrawled": "2024-09-09T16:27:00.0000000Z", "language": "en", "isNavigational": true, "noCache": false}, {"id": "https://api.bing.microsoft.com/api/v7/

In [8]:
# Decode the stored JSON of the first fetched row.
# Index 2 is the registered-address column.
registered_address_json = data[0][2]
data_dict = json.loads(registered_address_json)
data_dict

{'_type': 'SearchResponse',
 'queryContext': {'originalQuery': 'Paces , Registered Address'},
 'webPages': {'webSearchUrl': 'https://www.bing.com/search?q=Paces+%2c+Registered+Address',
  'totalEstimatedMatches': 1150000,
  'value': [{'id': 'https://api.bing.microsoft.com/api/v7/#WebPages.0',
    'name': "PACES OFFICES – Palestine Association for Children's Encouragement of ...",
    'url': 'https://www.pacescharity.org/regional-office/',
    'isFamilyFriendly': True,
    'displayUrl': 'https://www.pacescharity.org/regional-office',
    'snippet': 'REGISTERED OFFICE Carter Lemon Camerons LLP 3 rd Floor, 20 King Street London EC2V 8EG. ... Head Office. Phone: +972 22421771 Email: [email protected] Address: Ramallah/Al Bireh AL Fara’ Building, Baghdad Street. REGISTRATION. PACES is a UK Registered Charity Number 1117085.',
    'dateLastCrawled': '2024-09-09T16:27:00.0000000Z',
    'language': 'en',
    'isNavigational': True,
    'noCache': False},
   {'id': 'https://api.bing.microsoft.c

Save sample firms' names and ids to a separate csv file to continue with the demo quickly

In [9]:
# Fetch only (id, name) for the sample firms and write them to a small CSV.
name_placeholders = ",".join("?" for _ in firms_sample)
data = cursor.execute(
    f"SELECT id, firm_name FROM firms_web_search_results WHERE Firm_Name IN ({name_placeholders})",
    firms_sample,
).fetchall()
print(data)

# Column headers in the CSV are "id" and "name"; the row index is not written.
df = pd.DataFrame(data, columns=["id", "name"])
df.to_csv("firms_sample.csv", index=False)

[(2945, 'Paces')]


In [10]:
# Release the database handles: first the cursor, then the connection.
# Nothing is committed here.
cursor.close()
conn.close()