In [1]:
import requests
from dotenv import load_dotenv
import os
import pandas as pd
import sqlite3
import time
import json
import random

load_dotenv()

True

In [2]:
# Attributes looked up for every firm. Each entry is used as a column name in
# the results table and, with underscores turned into spaces, as the tail of
# the search query.
fields = [
    "Registered_Address",
    "CEO",
    "Establishment_Year",
    "Number_Of_Employees",
    "Revenue_Size",
    "Website",
    "NAICS_Code",
    "SIC_Code",
    "Status",
]

Define the client class for the Bing Search API

In [3]:
class BingSearchAPI():
    """Small client for the Bing Web Search v7 endpoint.

    The subscription key is read from the ``BING_SEARCH_API_KEY`` environment
    variable. ``search`` issues one GET request per call and hands back the
    decoded JSON body, or ``None`` when the request could not be completed.
    """

    def __init__(self) -> None:
        """Set up credentials, the endpoint URL and the default query parameters."""

        self.verbose = True
        self.n_webpages_to_scrape = 5

        # replace with list of keys once i run out
        self.subscription_key = os.getenv("BING_SEARCH_API_KEY")
        self.headers = {"Ocp-Apim-Subscription-Key": self.subscription_key}

        self.search_url = "https://api.bing.microsoft.com/v7.0/search"

        self.params = {
            "q": "Insert Query here",
            "count": 20,  # Number of search results to return
            "offset": 0,  # The offset for pagination
            "mkt": "en-US",  # Region
            "safesearch": "Moderate"  # Safe search filter
        }

        self.search_results = {}
        self.webpages = {}
        self.request_timeout = 10

    def search(self, query: str) -> "dict | None":
        """Run one web search and return the decoded JSON response.

        Args:
            query: Free-text query; stored in ``self.params["q"]`` before the
                request is sent.

        Returns:
            The parsed JSON body (also kept in ``self.search_results``), or
            ``None`` when the request fails, the server answers with an HTTP
            error status, or the body is not valid JSON.
        """
        self.webpages = {}

        if self.verbose: print("Conducting bing search for : ", query)

        self.params["q"] = query

        # Timeouts and connection errors are raised by requests.get itself, so
        # the call has to sit inside the try block to be reported as a failure.
        try:
            response = requests.get(
                self.search_url,
                headers=self.headers,
                params=self.params,
                timeout=self.request_timeout,
            )
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            return None

        # handle api credit limit by switching to paid api key
        # This check must run before raise_for_status(), which raises for every
        # 4xx status and would otherwise make this branch unreachable.
        if response.status_code == 403:
            print("API key has reached its limit")
            return None

        try:
            response.raise_for_status()
            results = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that cannot be decoded as JSON.
            print(f"Request failed: {e}")
            return None

        self.search_results = results

        return self.search_results
    

Read in the list of firms from the OpenCorporates export

In [4]:
# Load the firm names from the OpenCorporates CSV export and visit them in
# random order.
open_corporates_database = "../Opencorporates/Equifax Project Data - Sheet1.csv"
firms = pd.read_csv(open_corporates_database)["name"].to_list()
print(len(firms))
random.shuffle(firms)

10000


In [5]:
# Same attribute list as declared near the top of the notebook, repeated here
# next to the table definition whose columns it mirrors.
fields = [
    "Registered_Address",
    "CEO",
    "Establishment_Year",
    "Number_Of_Employees",
    "Revenue_Size",
    "Website",
    "NAICS_Code",
    "SIC_Code",
    "Status",
]

Connect to sqlite table that stores web search results

In [6]:
# One row per firm; every searchable attribute gets its own TEXT column that
# stays NULL until a search result has been stored for it.
create_table_sql = '''
CREATE TABLE IF NOT EXISTS firms_web_search_results (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year TEXT,
               Number_Of_Employees TEXT,
               Revenue_Size TEXT,
               Website TEXT,
               NAICS_Code TEXT,
               SIC_Code TEXT,
               Status TEXT
               )
               '''

# Open (or create) the SQLite file in the working directory and make sure the
# table exists before any other cell touches it.
conn = sqlite3.connect("firms_web_search_results.db")
cursor = conn.cursor()
cursor.execute(create_table_sql)

<sqlite3.Cursor at 0x12d34be40>

Check how many firms are present

In [12]:
# Number of rows currently stored in the results table.
stored_row_count = cursor.execute(''' SELECT COUNT(id) FROM firms_web_search_results ''').fetchall()
print(stored_row_count)

[(897,)]


Check for a specific firm

In [13]:
# Row to inspect; change this id to look at a different firm.
firm_id_to_inspect = 883

# Bind the id as a parameter instead of baking it into the SQL text (the
# original f-string had no placeholders at all).
cursor.execute("SELECT * FROM firms_web_search_results WHERE id = ?", (firm_id_to_inspect,))

# `a` is read again by a later cell, so the name is kept.
a = cursor.fetchall()
a

[(883,
  '! EXCLAMATION COMPANY',
  '{"_type": "SearchResponse", "queryContext": {"originalQuery": "! EXCLAMATION COMPANY , Registered Address"}, "webPages": {"webSearchUrl": "https://www.bing.com/search?q=!+EXCLAMATION+COMPANY+%2c+Registered+Address", "totalEstimatedMatches": 2200000, "value": [{"id": "https://api.bing.microsoft.com/api/v7/#WebPages.0", "name": "! EXCLAMATION COMPANY :: Illinois (US) - OpenCorporates", "url": "https://opencorporates.com/companies/us_il/CORP_62192437", "isFamilyFriendly": true, "displayUrl": "https://opencorporates.com/companies/us_il/CORP_62192437", "snippet": "Free and open company data on Illinois (US) company ! EXCLAMATION COMPANY (company number CORP_62192437)", "dateLastCrawled": "2024-03-04T08:59:00.0000000Z", "cachedPageUrl": "http://cc.bingj.com/cache.aspx?q=!+EXCLAMATION+COMPANY+%2c+Registered+Address&d=4740759601694564&mkt=en-US&setlang=en-US&w=huQCQT_fF_dr2U09n210Sun0WJkdLBpt", "language": "en", "isNavigational": true, "noCache": false, "si

### Loop to construct database

In [10]:
start_time = time.time()

# Create the search client in this cell: no other cell in the notebook defines
# `bing_searcher`, so a fresh kernel would otherwise fail with a NameError.
bing_searcher = BingSearchAPI()

try:
    for firm in firms:
        print("----- Debug: Now on firm: ", firm)

        # Check if firm already exists, insert row if it doesnt
        cursor.execute("SELECT id FROM firms_web_search_results WHERE Firm_Name = ?", (firm,))
        firm_row = cursor.fetchone()
        if firm_row is None:
            print("Inserting new firm:, ", firm)
            cursor.execute("INSERT INTO firms_web_search_results (Firm_Name) VALUES (?)", (firm,))
            firm_id = cursor.lastrowid
        else:
            # Get the existing firm's id
            print("Found row for firm, ", firm)
            firm_id = firm_row[0]

        # Iterate through fields for the given firm
        for field in fields:

            # Skip fields that already hold data, which makes the run resumable.
            # `field` comes from the fixed `fields` list, never from user input,
            # so interpolating it as a column name is safe here.
            cursor.execute(f"SELECT {field} FROM firms_web_search_results WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL", (firm_id,firm,))
            if cursor.fetchone() is not None:
                print(f"Field '{field}' already has data for firm '{firm}', skipping.")
                continue

            query = f"{firm} , {field.replace('_', ' ')}"

            print("---------- Debug: Now on field: ", field)

            # Get search results TODO: Handle search errors
            search_results = bing_searcher.search(query)

            if search_results is None:
                # Gives up on the remaining fields of this firm only; the outer
                # loop carries on with the next firm.
                print("Search results are None")
                break

            json_combined_data = json.dumps(search_results)

            # Update cell value in database
            cursor.execute(f"""
                    UPDATE firms_web_search_results
                    SET {field} = ?
                    WHERE id = ? AND Firm_Name = ?
                    """, (json_combined_data, firm_id, firm))

            # Commit after every field so an interrupted run keeps what it has
            # fetched so far.
            conn.commit()
finally:
    # Release the database handles even when the run is interrupted or a
    # request raises; every stored search result was committed above.
    cursor.close()
    conn.close()

----- Debug: Now on firm:  "OPERATION WE DO CARE"
Inserting new firm:,  "OPERATION WE DO CARE"
---------- Debug: Now on field:  Registered_Address
Conducting bing search for :  "OPERATION WE DO CARE" , Registered Address
---------- Debug: Now on field:  CEO
Conducting bing search for :  "OPERATION WE DO CARE" , CEO
---------- Debug: Now on field:  Establishment_Year
Conducting bing search for :  "OPERATION WE DO CARE" , Establishment Year
---------- Debug: Now on field:  Number_Of_Employees
Conducting bing search for :  "OPERATION WE DO CARE" , Number Of Employees
---------- Debug: Now on field:  Revenue_Size
Conducting bing search for :  "OPERATION WE DO CARE" , Revenue Size
---------- Debug: Now on field:  Website
Conducting bing search for :  "OPERATION WE DO CARE" , Website
---------- Debug: Now on field:  NAICS_Code
Conducting bing search for :  "OPERATION WE DO CARE" , NAICS Code
---------- Debug: Now on field:  SIC_Code
Conducting bing search for :  "OPERATION WE DO CARE" , SIC 

KeyboardInterrupt: 

In [14]:
# Final cleanup. The loop cell above closes both handles when it finishes, and
# cursor.close() on an already-closed connection raises
# sqlite3.ProgrammingError, so tolerate that case instead of failing the run.
try:
    cursor.close()
except sqlite3.ProgrammingError:
    pass
conn.close()

In [40]:
# Index 2 of a fetched row is the Registered_Address column (after id and
# Firm_Name), which holds the stored search response as a JSON string.
registered_address_json = a[0][2]
b = json.loads(registered_address_json)
b.keys()

dict_keys(['_type', 'queryContext', 'webPages', 'rankingResponse'])

In [42]:
# Show the "webPages" section of the decoded search response.
b["webPages"]

{'webSearchUrl': 'https://www.bing.com/search?q=%24+SAVE+INN%2c+INC.+%2c+Registered+Address',
 'totalEstimatedMatches': 2460000,
 'value': [{'id': 'https://api.bing.microsoft.com/api/v7/#WebPages.0',
   'name': '$ Save Inn, Inc. - AR, KS, MO, and MS - Bizapedia',
   'url': 'https://www.bizapedia.com/us/save-inn-inc.html',
   'isFamilyFriendly': True,
   'displayUrl': 'https://www.bizapedia.com/us/save-inn-inc.html',
   'snippet': 'There are 4 companies that go by the name of $ Save Inn, Inc. in Garden City KS, Jackson MS, Magnolia AR, Prescott AR, and Springfield MO ... Address: 302 Fleming, Suite 2, Garden City, KS 67846: Registered Agent: George P Naab: Filing Date: August 02, 1993: File Number: F00384072: Contact Us About The Company Profile For Save Inn, Inc.',
   'dateLastCrawled': '2024-07-01T03:13:00.0000000Z',
   'cachedPageUrl': 'http://cc.bingj.com/cache.aspx?q=%24+SAVE+INN%2c+INC.+%2c+Registered+Address&d=4714405670172541&mkt=en-US&setlang=en-US&w=3pCAvcBLBxtf3VIkIjils1TPpSa