In [62]:
import pandas as pd
from dotenv import load_dotenv
import os
import google.generativeai as genai
import json
import time
import sqlite3
load_dotenv()

True

Define fields except firm name

In [63]:
# Properties requested from Gemini for every firm. Firm_Name is not listed
# because it is the lookup key, not something to predict.
fields = [
    "Registered_Address",
    "CEO",
    "Establishment_Year",
    "Number_Of_Employees",
    "Revenue_Size",
    "Website",
    "NAICS_Code",
    "SIC_Code",
    "Status",
]

This notebook tests how well predictions work when using only Gemini (without the databases).

In [64]:
# GEMINI_API_KEY must be present in the environment (a missing key raises KeyError).
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Sampling settings for the model.
# NOTE: these are currently NOT applied -- the generation_config argument
# in the GenerativeModel call below is commented out.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=40,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

model = genai.GenerativeModel(
    model_name="models/gemini-1.5-pro-002",
    # generation_config=generation_config,
)

# Chat session that starts with an empty history; every later
# chat_session.send_message(...) call in this notebook goes through it.
chat_session = model.start_chat(history=[])

Test responses

In [65]:
# Smoke test: a single grounded request sent directly to the model
# (not through chat_session). The result is kept in `response` but not shown.
response = model.generate_content(
    contents="What is the land area of Spain?",
    tools='google_search_retrieval',
)

In [66]:
# Smoke test: the same grounding tool, this time through chat_session.
# NOTE(review): chat_session is the same object the main loop uses, so this
# exchange presumably stays in its history -- confirm that is intended.
response = chat_session.send_message(
    "How many people died in the recent floods in Valencia?",
    tools='google_search_retrieval',
)
print(response.text)

As of November 8th, 2024, the death toll from the recent floods in Valencia, Spain, has risen to 223, according to Spain's transport minister.  As of November 9, 2024 another source mentioned at least 220 people died as of Saturday, and about 80 people were still missing.  Earlier reports put the number of deaths at 214 as of November 3rd, with 211 of those in the Valencia region, and later at 217 with 89 missing as of November 5th.  The numbers have continued to be adjusted as more information becomes available.



Function to form prompt to give gemini

Make sure Gemini returns a single short answer (or 'No information found') to each query instead of a paragraph

In [67]:
# Instructions appended to every query so that Gemini replies with one short
# answer (or 'No information found') rather than a paragraph.
# The text starts with a newline, and the first sentence ends with a trailing
# space; both are kept so the prompt is unchanged.
answer_format_v1 = "\n" + "\n".join([
    "If you have absolutely no idea on how to answer my query please write 'No information found'. ",
    "Do not leave the answer blank or give me a long explanation on why you couldn't find the answer or an explanation of your answer.",
    "If you find multiple possible answers return your best answer",
    "To reiterate i just want you to give a short answer in the format described above. Do not give me a long textual response.",
])

In [68]:
def form_prompt(query, answer_format=None):
    """Build the full prompt sent to Gemini for one field query.

    Args:
        query: The question for one field, with the firm name already
            substituted in.
        answer_format: Instructions describing how the answer should look.
            When omitted, the notebook-level ``answer_format_v1`` is used,
            which reproduces the original behavior. Passing a value makes it
            possible to try other answer-format versions without redefining
            this function.

    Returns:
        The prompt string: a "Context:" header, the query followed by a
        period, and then the answer-format instructions.
    """
    if answer_format is None:
        # Looked up at call time so that re-running the cell that defines
        # answer_format_v1 takes effect without re-running this cell.
        answer_format = answer_format_v1

    prompt = f"""
    Context:
    {query}.
    {answer_format}
    """

    return prompt

Define the queries we will give Gemini for each field. We need to specify the type of output we expect etc. May need some prompt engineering here

In [69]:
# Query template per field. Each template has a {firm_name} placeholder that
# is filled with str.format before the query is sent. Where a template has
# several sentences, they are written as adjacent string literals:
# first the question, then the expected answer format.
field_to_query = {
    "Registered_Address": (
        "Tell me the Registered Adress for the firm named {firm_name}. "
        "Your answer should consist of the street address, city, state, country, and postal code."
    ),
    "CEO": (
        "Tell me the name and surname of the CEO for the firm named {firm_name}. "
        "Your answer should be in the format 'Name Surname'"
    ),
    "Establishment_Year": (
        "Tell me the establishment year of {firm_name}. "
        "Your answer should be 4 digits in the format YYYY"
    ),
    "Number_Of_Employees": (
        "Tell me the estimated number of employees for the firm named {firm_name}. "
        "Your answer should be an approximate range, like 1-10, 10-100, 100-1000, 1000-10000, 10000+ etc. "
        "You can specify other ranges like 200-250 if you wish."
    ),
    "Revenue_Size": (
        "Tell me the estimated annual revenue in dollars for the firm named {firm_name}. "
        "Your answer should be an approximate range, 10000-100000, 1000000-10000000, or 1000000000+ etc. "
        "You can specify other ranges like 200-250 if you wish."
    ),
    "Website": (
        "Tell me the url of the official website of for the firm named {firm_name}. "
        "If you can't find a website for the firm, please write 'No website found'"
    ),
    "NAICS_Code": "Tell me the numeric NAICS code for the firm named {firm_name}.",
    "SIC_Code": "Tell me the numeric SIC code for the firm named {firm_name}.",
    "Status": (
        "Tell me the whether the firm named {firm_name} is Active or Dissolved. "
        "Answer with one word."
    ),
}

TODO - Check if gemini remembers recent context like below. May be helpful to give a general context first, then ask each query, making sure the context resets with each firm

### Open firm databases

In [70]:
# Results table. Rename it to keep the output of different prompt types,
# Gemini models, etc. in separate tables of the same database file.
table_name = "firm_properties_gemini_with_grounding_v2"

# Database that stores the results
conn = sqlite3.connect("firm_database_gemini_with_grounding.db")
cursor = conn.cursor()

# To start from scratch, uncomment the statement below.
# NOTE: as written it drops the table named firm_properties, not table_name.
# cursor.execute('''
# DROP TABLE IF EXISTS firm_properties
#                ''')

# Schema: an integer id, the firm name, and one nullable column per predicted field.
create_table_sql = f'''
CREATE TABLE IF NOT EXISTS {table_name} (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year INT,
               Number_Of_Employees TEXT,
               Revenue_Size TEXT,
               Website TEXT,
               NAICS_Code INT,
               SIC_Code INT,
               Status TEXT
               )
               '''
cursor.execute(create_table_sql)

<sqlite3.Cursor at 0x30222b5c0>

Main Loop

Get the list of firms from the web search database to be consistent

In [71]:
# Read (id, Firm_Name) pairs from the web-search results database so this
# notebook works on the same list of firms.
conn_web_results = sqlite3.connect("../firms_web_search_results.db")
cursor_websearch = conn_web_results.cursor()
# Cursor.execute returns the cursor itself, so the fetch can be chained.
firm_web_search_results = cursor_websearch.execute(
    ''' SELECT id, Firm_Name FROM firms_web_search_results '''
).fetchall()

In [72]:
# Select a subset for testing.
# N_TEST_FIRMS is the number of firms to keep; set it to None to keep all of
# them (slicing with None leaves the list unchanged).
N_TEST_FIRMS = 10
firm_web_search_results = firm_web_search_results[:N_TEST_FIRMS]
# Preview at most ten rows so that a full run does not dump the whole list.
firm_web_search_results[:10]

[(1, '"TRACTION POWER SYSTEMS, INC."'),
 (2, '"A DAY TO REMEMBER", LLC'),
 (3, '" MALCO INC. -A NEVADA CORP.'),
 (4, ".BISHOP'S GATE RESIDENTIAL MORTGAGE TRUST"),
 (5, '"J.CRESCI FARMS, L.L.C."'),
 (6, '1 MAN ARMY GOURMET FOODS LLC'),
 (7, '10-Code, LLC'),
 (8, "'Dat Hit Da Spot' LLC"),
 (9, '" LESLIE A. FEAST CONSTRUCTION COMPANY, INC."'),
 (10, '"R" ROCKY RANCH, INC.')]

In [73]:
def _ask_gemini(prompt, max_attempts=3, retry_delay=1):
    """Send one prompt through chat_session with Google Search grounding.

    Args:
        prompt: The full prompt text.
        max_attempts: How many times to try before giving up.
        retry_delay: Seconds to sleep after a failed attempt.

    Returns:
        The response text with surrounding whitespace removed, or None when
        every attempt raised an exception.
    """
    for _ in range(max_attempts):
        try:
            return chat_session.send_message(prompt, tools='google_search_retrieval').text.strip()
        except Exception as e:
            print("Exception occured: ", e)
            time.sleep(retry_delay)
    return None


# For every firm: make sure it has exactly one row in the results table, then
# fill each field that is still NULL with Gemini's answer.
for web_search_result in firm_web_search_results:
    firm_id, firm_name = web_search_result

    # Check if firm already exists, insert row if it doesnt
    cursor.execute(f"SELECT id FROM {table_name} WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
    if cursor.fetchone() is None:
        print("Inserting new firm:, ", firm_name)
        # Insert with the web-search id (not an auto-generated one) so that the
        # lookup above finds this row again when the loop is re-run.
        # NOTE: a table filled by the earlier version of this loop (one INSERT
        # per field) may already use this id for another row; in that case
        # sqlite3.IntegrityError is raised and the table should be recreated.
        cursor.execute(f"INSERT INTO {table_name} (id, Firm_Name) VALUES (?, ?)", (firm_id, firm_name))
        conn.commit()
    else:
        print("Found row for firm, ", firm_name)

    # Fill in each field
    for field in fields:
        # Skip fields that already hold a value, so an interrupted run can be resumed
        cursor.execute(f"SELECT {field} FROM {table_name} WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL", (firm_id, firm_name,))
        if cursor.fetchone() is not None:
            print(f"Field '{field}' already has data for firm '{firm_name}', skipping.")
            continue

        # Build the prompt for this field only (no need to format every query)
        prompt = form_prompt(field_to_query[field].format(firm_name=firm_name))

        # Query Gemini; None means every attempt failed
        response = _ask_gemini(prompt)
        if response is None:
            # Leave the field NULL so a later run retries it, instead of
            # storing a stale answer from a previous field or firm.
            print(f"No response for {firm_name} - {field}, leaving the field empty.")
            continue

        print(f"Response for {firm_name} - {field}: ", response)
        # Write the answer into the firm's existing row (an INSERT here would
        # create a new row per field and the skip check above would never match)
        cursor.execute(f"UPDATE {table_name} SET {field} = ? WHERE id = ?", (response, firm_id))
        conn.commit()

Found row for firm,  "TRACTION POWER SYSTEMS, INC."
Response for "TRACTION POWER SYSTEMS, INC." - Registered_Address:  No information found

Response for "TRACTION POWER SYSTEMS, INC." - CEO:  No information found

Response for "TRACTION POWER SYSTEMS, INC." - Establishment_Year:  No information found

Response for "TRACTION POWER SYSTEMS, INC." - Number_Of_Employees:  No information found

Response for "TRACTION POWER SYSTEMS, INC." - Revenue_Size:  No information found

Response for "TRACTION POWER SYSTEMS, INC." - Website:  No information found

Response for "TRACTION POWER SYSTEMS, INC." - NAICS_Code:  No information found

Response for "TRACTION POWER SYSTEMS, INC." - SIC_Code:  No information found

Response for "TRACTION POWER SYSTEMS, INC." - Status:  No information found

Inserting new firm:,  "A DAY TO REMEMBER", LLC
Response for "A DAY TO REMEMBER", LLC - Registered_Address:  88 Lowell Street, Methuen, Massachusetts, USA, 01844

Response for "A DAY TO REMEMBER", LLC - CEO:  

KeyboardInterrupt: 

Versioning Notes
- table_name = firm_properties_gemini_with_grounding
        - prompt only includes field_to_query_v0
- table_name = firm_properties_gemini_with_grounding_v2
        - prompt also includes answer_format_v1
- 

field_to_query_v0 = {
        "Registered_Address": "Tell me the Registered Adress for the firm named {firm_name}. Your answer should consist of the street address, city, state, country, and postal code.",
        "CEO": "Tell me the name and surname of the CEO for the firm named {firm_name}. Your answer should be in the format 'Name Surname'",
        "Establishment_Year": "Tell me the establishment year of {firm_name}. Your answer should be 4 digits in the format YYYY",
        "Number_Of_Employees": "Tell me the estimated number of employees for the firm named {firm_name}. Your answer should be an approximate range, like 1-10, 10-100, 100-1000, 1000-10000, 10000+ etc. You can specify other ranges like 200-250 if you wish.",
        "Revenue_Size": "Tell me the estimated annual revenue in dollars for the firm named {firm_name}. Your answer should be an approximate range, 10000-100000, 1000000-10000000, or 1000000000+ etc. You can specify other ranges like 200-250 if you wish.",
        "Website": "Tell me the url of the official website of for the firm named {firm_name}. If you can't find a website for the firm, please write 'No website found'",
        "NAICS_Code": "Tell me the numeric NAICS code for the firm named {firm_name}.",
        "SIC_Code":  "Tell me the numeric SIC code for the firm named {firm_name}.",
        "Status": "Tell me the whether the firm named {firm_name} is Active or Dissolved. Answer with one word.",
}

answer_format_v1 = """
If you can't find the answer to my query please write 'No information found'. 
Do not leave the answer blank or give me a long explanation on why you couldn't find the answer or why you are unsure.
 If you find multiple possible answers return your best answer"""