In [5]:
import pandas as pd
from dotenv import load_dotenv
import os
import google.generativeai as genai
import json
import time
import sqlite3

Define fields except firm name

In [1]:
fields = ["Registered_Address", "CEO", "Establishment_Year", "Number_Of_Employees", "Revenue_Size" ,
        "Website", "NAICS_Code", "SIC_Code", "Status" ]

field_to_query = {
        "Registered_Address": "After looking at the data below, tell me the Registered Adress for the firm named {firm_name}}",
        "CEO": "After looking at the data below, tell me the name and surname of the CEO for the firm named {firm_name} ",
        "Establishment_Year": "After looking at the data below, tell me the name and surname of the CEO for the firm named {firm_name}. Your answer should be 4 digits in the format YYYY",
        "Number_Of_Employees": "After looking at the data below, tell me the estimated number of employees for the firm named {firm_name}. Your answer should be an approximate range, like 1-10, 10-100, 100-1000, 1000-10000, 10000+ etc. You can specify other ranges like 200-250 if you wish.",
        "Revenue_Size": "After looking at the data below, tell me the estimated annual revenue in dollars for the firm named {firm_name}. Your answer should be an approximate range, 10000-100000, 1000000-10000000, or 1000000000+ etc. You can specify other ranges like 200-250 if you wish.",
        "Website": "After looking at the data below, tell me the url of the official website of for the firm named {firm_name}. If you can't find a website for the firm, please write 'No website found'",
        "NAICS_Code": "After looking at the data below, tell me the numeric NAICS code and surname of the CEO for the firm named {firm_name}.",
        "SIC_Code":  "After looking at the data below, tell me the numeric SIC code and surname of the CEO for the firm named {firm_name}.",
        "Status": "After looking at the data below, tell me the whether the firm named {firm_name} is Active or Dissolved. Answer with one word.",
}

Model Config

In [7]:
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

model = genai.GenerativeModel(
  model_name="gemini-1.5-flash-8b",
  generation_config= {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
    }
 )

chat_session = model.start_chat(
  history=[
  ]
)

In [8]:
response = chat_session.send_message("Hello there")
print(response.text)

Hello! How can I help you today?



In [9]:
def form_prompt(context, query, data):
    prompt = f"""
    Context:
    {context}

    Query:
    {query}

    Relevant Data:
    {data}
    """

    return prompt

In [4]:
general_context = """
You will be assisting me with filling in data fields for a firm database I am building.
I will tell you the name of the firm i am interested in, and the field I want you to fill.
I will give you potentially relevant information I have gained from web search results and the scraped contents of certain websites that I gathered by searching for the firm name and field and the field. 
The text will be in Json format.
You will give your answer by simply stating the value of the field I am interested in. 
Do not form sentences, just give the value of the field.
If you have absolutely no idea about the answer, then answer with 'null' .
"""

### Open firm database

In [3]:
use_stored_web_search_data = True

In [1]:
# Table to store the results
conn = sqlite3.connect("firm_database_llm.db")
cursor = conn.cursor()

cursor.execute('''
CREATE TABLE IF NOT EXISTS firm_properties (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year INT,
               Number_Of_Employees INT,
               Revenue_Size INT,
               Website TEXT,
               NAICS_Code INT,
               SIC_Code INT,
               Status TEXT
               )
               ''')

conn_web_results = sqlite3.connect("firms_search_results.db")
cursor_web_results = conn_web_results.cursor()

NameError: name 'sqlite3' is not defined

Loop through firm names in "firm_search_results.db"

In [None]:
cursor_web_results.execute(''' SELECT id, Firm_Name FROM firms_search_results ''')
firm_web_search_results = cursor_web_results.fetchall()

for web_search_result in firm_web_search_results:
    firm_id = web_search_result[0]
    firm_name = web_search_result[1]

    # Check if firm already exists, insert row if it doesnt
    cursor.execute("SELECT id FROM firms_search_results WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
    firm_row = cursor.fetchone()
    if firm_row is None:
        print("Inserting new firm:, ", firm_name)
        cursor.execute("INSERT INTO firms_search_results (Firm_Name) VALUES (?)", (firm_name,))
        firm_id = cursor.lastrowid  
    else:
        # Get the existing firm's id
        print("Found row for firm, ", firm_name)
        firm_id = firm_row[0]
    

    for field in fields:

        # Check if the field value in the database is NULL
        cursor.execute(f"SELECT {field} FROM firms_search_results WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL", (firm_id,firm_name,))
        if cursor.fetchone() is not None:
            print(f"Field '{field}' already has data for firm '{firm_name}', skipping.")
            continue

        if use_stored_web_search_data:
            cursor_web_results.execute(f''' SELECT {field} FROM firms_search_results WHERE id = ? AND Firm_Name = ?'''(firm_id,firm_name))
            web_search_result = cursor_web_results.fetchone()
            web_search_result = json.loads(web_search_result[0]) # TODO check indexing here
        else:
            raise NotImplementedError("Stored web search data not available and live querying is not yet implemented")


    # Update queries with firm name
    updated_queries = {key: value.format(firm_name=firm_name) for key, value in field_to_query.items()}

    prompt = form_prompt(general_context, updated_queries[field], web_search_result[field])

    success = False
    failure_counter = 0
    while not success and failure_counter < 3:
        try:
            response = chat_session.send_message(prompt).text
            # print("Success for ", firm_name, field)
            success = True
        except Exception as e:
            print("Exception occured: ", e)
            time.sleep(1) 
            failure_counter += 1
            
    # Update database
    cursor.execute(f''' INSERT INTO firm_properties (Firm_Name, {field}) VALUES ('{firm_name}', '{response.text}') ''')
    conn.commit()

In [14]:
conn.close()
cursor.close()

ProgrammingError: Cannot operate on a closed database.