In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
import google.generativeai as genai
import json
import time
import sqlite3

Define fields except firm name

In [2]:
fields = ["Registered_Address", "CEO", "Establishment_Year", "Number_Of_Employees", "Revenue_Size" ,
        "Website", "NAICS_Code", "SIC_Code", "Status" ]

field_to_query = {
        "Registered_Address": "After looking at the data below, tell me the Registered Adress for the firm named {firm_name}",
        "CEO": "After looking at the data below, tell me the name and surname of the CEO for the firm named {firm_name} ",
        "Establishment_Year": "After looking at the data below, tell me the name and surname of the CEO for the firm named {firm_name}. Your answer should be 4 digits in the format YYYY",
        "Number_Of_Employees": "After looking at the data below, tell me the estimated number of employees for the firm named {firm_name}. Your answer should be an approximate range, like 1-10, 10-100, 100-1000, 1000-10000, 10000+ etc. You can specify other ranges like 200-250 if you wish.",
        "Revenue_Size": "After looking at the data below, tell me the estimated annual revenue in dollars for the firm named {firm_name}. Your answer should be an approximate range, 10000-100000, 1000000-10000000, or 1000000000+ etc. You can specify other ranges like 200-250 if you wish.",
        "Website": "After looking at the data below, tell me the url of the official website of for the firm named {firm_name}. If you can't find a website for the firm, please write 'No website found'",
        "NAICS_Code": "After looking at the data below, tell me the numeric NAICS code and surname of the CEO for the firm named {firm_name}.",
        "SIC_Code":  "After looking at the data below, tell me the numeric SIC code and surname of the CEO for the firm named {firm_name}.",
        "Status": "After looking at the data below, tell me the whether the firm named {firm_name} is Active or Dissolved. Answer with one word.",
}

Model Config

In [3]:
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

model = genai.GenerativeModel(
  model_name="gemini-1.5-flash-8b",
  generation_config= {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
    }
 )

chat_session = model.start_chat(
  history=[
  ]
)

In [4]:
response = chat_session.send_message("Hello there")
print(response.text)

Hello! How can I help you?



In [5]:
def form_prompt(context, query, data):
    prompt = f"""
    Context:
    {context}

    Query:
    {query}

    Relevant Data:
    {data}
    """

    return prompt

In [6]:
general_context = """
You will be assisting me with filling in data fields for a firm database I am building.
I will tell you the name of the firm i am interested in, and the field I want you to fill.
I will give you potentially relevant information I have gained from web search results and the scraped contents of certain websites that I gathered by searching for the firm name and field and the field. 
The text will be in Json format.
You will give your answer by simply stating the value of the field I am interested in. 
Do not form sentences, just give the value of the field.
If you have absolutely no idea about the answer based on the data provided, such as for instance when there is no web results or insufficient results, then answer with 'null' .
"""

### Open firm database

In [10]:
# Table to store the results
conn = sqlite3.connect("firm_database_llm.db")
cursor = conn.cursor()

# cursor.execute('''
# DROP TABLE IF EXISTS firm_properties
#                ''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS firm_properties (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year INT,
               Number_Of_Employees INT,
               Revenue_Size INT,
               Website TEXT,
               NAICS_Code INT,
               SIC_Code INT,
               Status TEXT
               )
               ''')

conn_web_results = sqlite3.connect("firms_web_search_results.db")
cursor_websearch = conn_web_results.cursor()

Loop through firm names in "firm_search_results.db"

In [11]:
cursor_websearch.execute(''' SELECT id, Firm_Name FROM firms_web_search_results ''')
firm_web_search_results = cursor_websearch.fetchall()

for web_search_result in firm_web_search_results:
    firm_id = web_search_result[0]
    firm_name = web_search_result[1]

    # Check if firm already exists, insert row if it doesnt
    cursor.execute("SELECT id FROM firm_properties WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
    firm_row = cursor.fetchone()
    if firm_row is None:
        print("Inserting new firm:, ", firm_name)
        cursor.execute("INSERT INTO firm_properties (Firm_Name) VALUES (?)", (firm_name,))
        firm_id = cursor.lastrowid  
    else:
        # Get the existing firm's id
        print("Found row for firm, ", firm_name)
        firm_id = firm_row[0]
    

    for field in fields:

        # Check if the field value in the database is NULL
        cursor.execute(f"SELECT {field} FROM firm_properties WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL", (firm_id,firm_name,))
        if cursor.fetchone() is not None:
            print(f"Field '{field}' already has data for firm '{firm_name}', skipping.")
            continue
      
        cursor_websearch.execute(f"SELECT {field} FROM firms_web_search_results WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
        web_search_result = cursor_websearch.fetchone()
        if web_search_result is not None:
            web_search_result = json.loads(web_search_result[0]) # TODO check indexing here
        else:
            web_search_result = "No web search data available"
       

        # Update queries with firm name
        updated_queries = {key: value.format(firm_name=firm_name) for key, value in field_to_query.items()}

        prompt = form_prompt(general_context, updated_queries[field], web_search_result)

        success = False
        failure_counter = 0
        while not success and failure_counter < 3:
            try:
                response = chat_session.send_message(prompt).text
                # print("Success for ", firm_name, field)
                success = True
            except Exception as e:
                print("Exception occured: ", e)
                time.sleep(1) 
                failure_counter += 1
                
        print(f"Response for {firm_name} - {field}: ", response)
        # Update database
        cursor.execute(f"INSERT INTO firm_properties (Firm_Name, {field}) VALUES (?, ?)", (firm_name, response))
        conn.commit()

Found row for firm,  "TRACTION POWER SYSTEMS, INC."
Response for "TRACTION POWER SYSTEMS, INC." - Registered_Address:  8550 MOSLEY, Houston, TX, 77075
Response for "TRACTION POWER SYSTEMS, INC." - CEO:  null

Response for "TRACTION POWER SYSTEMS, INC." - Establishment_Year:  null

Response for "TRACTION POWER SYSTEMS, INC." - Number_Of_Employees:  null

Response for "TRACTION POWER SYSTEMS, INC." - Revenue_Size:  null



KeyboardInterrupt: 

In [21]:
web_search_result

{'_type': 'SearchResponse',
 'queryContext': {'originalQuery': '"TRACTION POWER SYSTEMS, INC." , Status'},
 'webPages': {'webSearchUrl': 'https://www.bing.com/search?q=%22TRACTION+POWER+SYSTEMS%2c+INC.%22+%2c+Status',
  'totalEstimatedMatches': 18,
  'value': [{'id': 'https://api.bing.microsoft.com/api/v7/#WebPages.0',
    'name': 'TRACTION POWER SYSTEMS, INC. :: Delaware (US) - OpenCorporates',
    'url': 'https://opencorporates.com/companies/us_de/2297026',
    'isFamilyFriendly': True,
    'displayUrl': 'https://opencorporates.com/companies/us_de/2297026',
    'snippet': 'Free and open company data on Delaware (US) company TRACTION POWER SYSTEMS, INC. (company number 2297026)',
    'dateLastCrawled': '2024-04-18T18:48:00.0000000Z',
    'cachedPageUrl': 'http://cc.bingj.com/cache.aspx?q=%22TRACTION+POWER+SYSTEMS%2c+INC.%22+%2c+Status&d=5014241641054353&mkt=en-US&setlang=en-US&w=2PSOE56M5ljoNtJSrUykNN-VXesK_L__',
    'language': 'en',
    'isNavigational': True,
    'noCache': False,


In [28]:
conn.close()
cursor.close()

ProgrammingError: Cannot operate on a closed database.