In [54]:
import pandas as pd
from dotenv import load_dotenv
import os
import google.generativeai as genai
import json
import time
import sqlite3
load_dotenv()

True

Define fields except firm name

In [55]:
fields = ["Registered_Address", "CEO", "Establishment_Year", "Number_Of_Employees", "Revenue_Size" ,
        "Website", "NAICS_Code", "SIC_Code", "Status", 
        "Dissolvement_Year","Company_Type","Previous_Names", "Alternative_Names", "Key_Executive_Personnel"]

I'm using this to test how getting predictions just using gemini (without the databases) works

In [56]:
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
  model_name="models/gemini-1.5-pro-002",
  # generation_config=generation_config,
)

# chat_session = model.start_chat(
#   history=[
#   ]
# )

Test responses

In [57]:
# response = chat_session.send_message("How many people died in the recent floods in Valencia?", tools='google_search_retrieval')
# response = model.generate_content(contents="What is the land area of Spain?", tools='google_search_retrieval' )
# print(response.text)

Function to form prompt to give gemini

Make sure gemini returns one answer or Null to each query instead of a paragraph

In [58]:
answer_format_v1 = """
If you have absolutely no idea on how to answer a query for a given field you can answer 'No information found' for that field . 
Do not leave the answer blank or give me a long explanation on why you couldn't find the answer or an explanation of your answer.
If you find multiple possible answers return your best answer.
To reiterate i just want you to give a short answer in the format described above. Do not give me a long textual response."""

In [None]:
context_single_answer_v1 = """
I will ask you a series of questions about a company. Please provide the required information in a single answer,
making sure that your answer to each of my queries is separated by the following delimiter '*-*'. 
Simply write the answers in order separated by '*-*', do not repeat the questions or the field names.'
My questions are as follows: """


context = context_single_answer_v1

In [60]:
def form_prompt(query):
    prompt = f"""
    {context}
    {query}.
    {answer_format_v1}
    """
    return prompt

Define the queries we will give Gemini for each field. We need to specify the type of output we expect etc. May need some prompt engineering here

In [61]:
field_to_query_v1 = {
        "Registered_Address": "Tell me the Registered Adress for the firm named {firm_name}. Your answer should consist of the street address, city, state, country, and postal code.",
        "CEO": "Tell me the name and surname of the CEO for the firm named {firm_name}. Your answer should be in the format 'Name Surname'",
        "Establishment_Year": "Tell me the establishment year of {firm_name}. Your answer should be 4 digits in the format YYYY",
        "Number_Of_Employees": "Tell me the estimated number of employees for the firm named {firm_name}. Your answer should be an approximate range, like 1-10, 10-100, 100-1000, 1000-10000, 10000+ etc. You can specify other ranges like 200-250 if you wish.",
        "Revenue_Size": "Tell me the estimated annual revenue in dollars for the firm named {firm_name}. Your answer should be an approximate range, 10000-100000, 1000000-10000000, or 1000000000+ etc. You can specify other ranges like 200-250 if you wish.",
        "Website": "Tell me the url of the official website of for the firm named {firm_name}. If you can't find a website for the firm, please write 'No website found'",
        "NAICS_Code": "Tell me the numeric NAICS code for the firm named {firm_name}.",
        "SIC_Code":  "Tell me the numeric SIC code for the firm named {firm_name}.",
        "Status": "Tell me the whether the firm named {firm_name} is Active or Dissolved. Answer with one word.",
        "Dissolvement_Year": "Tell me the year when the firm named {firm_name} was dissolved. Your answer should be 4 digits in the format YYYY. If the firm has not been dissolved, please respond with 'N/A'.",
        "Company_Type": "Tell me the type of company for the firm named {firm_name} (e.g., Public, Private, Partnership, LLC, etc.).",
        "Previous_Names": "List any previous names that the firm named {firm_name} has had. If there are no previous names, please respond with 'N/A'.",
        "Alternative_Names": "Tell me if there are any alternative or trade names for the firm named {firm_name}. If none, please respond with 'N/A'.",
        "Key_Executive_Personnel": "List the key executive personnel for the firm named {firm_name}, including their names and job titles. If there are no key executive personnel, please respond with 'N/A'."
}


field_to_query = field_to_query_v1

TODO - Check if gemini remembers recent context like below. May be helpful to give a general context first, then ask each query, making sure the context resets with each firm

### Open firm databases

In [62]:
# Table to store the results
conn = sqlite3.connect("firm_database_gemini_with_grounding.db")
cursor = conn.cursor()


In [63]:
# Check existing tables and drop if you want

# Check existing tables
existing_tables = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
print(existing_tables)

[('sqlite_sequence',), ('firm_properties_gemini_with_grounding_v1',), ('firm_properties_gemini_with_grounding_v2',)]


Drop tables if needed

In [64]:
# table_to_drop = "firm_properties_gemini_with_grounding_v2"
# cursor.execute(f"DROP TABLE IF EXISTS {table_to_drop} ")
# conn.commit()

Create table

In [65]:
# You can create different tables for different prompt types, gemini model etc, if you rename the table
table_name = "firm_properties_gemini_with_grounding_v2"

In [66]:
cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {table_name} (
               id INTEGER PRIMARY KEY,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year INT,
               Number_Of_Employees TEXT,
               Revenue_Size TEXT,
               Website TEXT,
               NAICS_Code INT,
               SIC_Code INT,
               Status TEXT,
               Dissolvement_Year INT,
               Company_Type TEXT,
               Previous_Names TEXT, 
               Alternative_Names TEXT, 
               Key_Executive_Personnel TEXT
               )
               ''')

<sqlite3.Cursor at 0x302c538c0>

Check how many rows are in the current table

In [67]:
cursor.execute(f"SELECT COUNT(*) FROM {table_name}").fetchall()

[(307,)]

In [68]:
cursor.execute(f"SELECT  * FROM {table_name}").fetchall()

[(1,
  '"TRACTION POWER SYSTEMS, INC."',
  '925 N. Rengstorff Ave, Mountain View, CA, 94043, USA',
  'No information found',
  'No information found',
  '1-10',
  '100000-1000000',
  'No website found',
  541613,
  8742,
  'Active',
  'N/A',
  'Private',
  'N/A',
  'N/A',
  'No information found\n'),
 (2,
  '"A DAY TO REMEMBER", LLC',
  '6006 Kingsfield West Bloomfield, MI 48322, USA',
  'N/A',
  'N/A',
  '1-10',
  '10000-100000',
  'No website found',
  'N/A',
  'N/A',
  'Active',
  'N/A',
  'LLC',
  'N/A',
  'N/A',
  'N/A*\n'),
 (3,
  '" MALCO INC. -A NEVADA CORP.',
  '1700 Sahara Ave Ste 17, Las Vegas, Nevada, United States, 89102\n',
  '\nM. Malick\n',
  1971,
  '\n10-100\n',
  '\n1000000-10000000\n',
  '\nNo website found\n',
  423490,
  5072,
  '\nActive\n',
  '\nN/A\n',
  '\nPrivate\n',
  '\nN/A\n',
  '\nMALCO Products, SBC\n',
  '\nRich Benninghoff (President and Chief Operating Officer)\n'),
 (4,
  ".BISHOP'S GATE RESIDENTIAL MORTGAGE TRUST",
  '201 Bishopsgate, London, Englan

Check for specific entry

In [69]:
field = 'Registered_Address'
firm_id = 26
cursor.execute(f"SELECT {field} FROM {table_name} WHERE id = ? AND {field} IS NOT NULL", (firm_id,))
print(cursor.fetchone())
if cursor.fetchone() is not None:
    print(f"Field '{field}' already has data for firm , skipping.")

('2201 Coliseum Blvd E, Fort Wayne, IN 46805, USA',)


# Update the database

### Option 1: Get the list of firms from the web search database so as to be consistent

In [70]:
conn_web_results = sqlite3.connect("../firms_web_search_results.db")
cursor_websearch = conn_web_results.cursor()

Select Subset of firms for testing

In [71]:
cursor_websearch.execute(''' SELECT id, Firm_Name FROM firms_web_search_results ORDER BY id ASC''')
firm_web_search_results = cursor_websearch.fetchall()
firm_web_search_results = firm_web_search_results[:1000]
firm_web_search_results

[(1, '"TRACTION POWER SYSTEMS, INC."'),
 (2, '"A DAY TO REMEMBER", LLC'),
 (3, '" MALCO INC. -A NEVADA CORP.'),
 (4, ".BISHOP'S GATE RESIDENTIAL MORTGAGE TRUST"),
 (5, '"J.CRESCI FARMS, L.L.C."'),
 (6, '1 MAN ARMY GOURMET FOODS LLC'),
 (7, '10-Code, LLC'),
 (8, "'Dat Hit Da Spot' LLC"),
 (9, '" LESLIE A. FEAST CONSTRUCTION COMPANY, INC."'),
 (10, '"R" ROCKY RANCH, INC.'),
 (11, '"FLEMING COUNTY KIWANIS CLUB INCORPORATED."'),
 (12, '$6 & $8 FASHIONS, INC.'),
 (13, "'JOSE ALFREDO PEREZ LOPEZ LLC"),
 (14, '"B & S GRAIN, LTD."'),
 (15, '"THE SUMMIT, L.L.C."'),
 (16, '"Foam It" Spray Foam Insulation, LLC'),
 (17, '"C.C.C. INC."'),
 (18, '(N)SQUARED ADVISORY, LLC'),
 (19, '#forabetternevada LLC'),
 (20, '#2 BLACKBOARD PROPERTIES, LLC'),
 (21, '"THE TRADES" GROUP, INC.'),
 (22, '008 PROJECT MANAGEMENT, LLC'),
 (23, '" C" CLUB INC.'),
 (24, '"SENN"SATIONAL CREATIONS, LLC'),
 (25, "'4' WHEELER'S HUNTING CLUB"),
 (26, '"ECHO LODGE NO. 103, OF THE INDEPENDENT ORDER OF ODD FELLOWS"'),
 (27, '1 By 

### Main Loop - Single Query Per Field

Fill database using a single query for each field

In [None]:
single_query_per_field = False
if single_query_per_field:
    context = " "
else:
    context = context_single_answer_v1

In [73]:

if single_query_per_field:
    for web_search_result in firm_web_search_results:
        firm_id = web_search_result[0]
        firm_name = web_search_result[1]

        # Check if firm already exists, insert row if it doesnt

        cursor.execute(f"SELECT id FROM {table_name} WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
        firm_row = cursor.fetchone()
        if firm_row is None:
            print("Inserting new firm:, ", firm_name)
            cursor.execute(f"INSERT INTO {table_name} (Firm_Name) VALUES (?)", (firm_name,))
            firm_id = cursor.lastrowid  
        else:
            # Get the existing firm's id
            print("Found row for firm, ", firm_name)
            firm_id = firm_row[0]
        


        # Update queries with firm name
        updated_queries = {key: value.format(firm_name=firm_name) for key, value in field_to_query.items()}

        # Fill in each field
        for field in fields:
            # Check if the field value in the target database is NULL
            cursor.execute(f"SELECT {field} FROM {table_name} WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL", (firm_id,firm_name,))
            if cursor.fetchone() is not None:
                print(f"Field '{field}' already has data for firm '{firm_name}', skipping.")
                continue
        

            # Query Gemini
            # pass promot ot gemini
            prompt = form_prompt(updated_queries[field])

            # Send the prompt to model, try 3 times to get a response
            success = False
            response = None 
            failure_counter = 0
            while not success and failure_counter < 3:
                try:
                    response = model.generate_content(prompt,tools='google_search_retrieval').text
                    # print("Success for ", firm_name, field)
                    success = True
                except Exception as e:
                    print("Exception occured: ", e)
                    time.sleep(1) 
                    failure_counter += 1
                        
            print(f"Gemini response for {firm_name} - {field}: ", response)
            # Update database
            cursor.execute(f"UPDATE {table_name} SET {field} = ? WHERE Firm_Name = ? AND id = ?",
                (response, firm_name, firm_id))
            conn.commit()
    cursor.close()
    conn.close()
    cursor_websearch.close()
    conn_web_results.close()

### Main Loop with single query for each field

In [74]:
if not single_query_per_field:
    for web_search_result in firm_web_search_results:
        firm_id = web_search_result[0]
        firm_name = web_search_result[1]

        # Check if firm already exists, insert row if it doesnt
        print("Debug -- now on firm id ", firm_id)

        cursor.execute(f"SELECT id FROM {table_name} WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
        firm_row = cursor.fetchone()
        if firm_row is None:
            print("Inserting new firm:, ", firm_name)
            cursor.execute(f"INSERT INTO {table_name} (Firm_Name) VALUES (?)", (firm_name,))
            firm_id = cursor.lastrowid  
        else:
            # Get the existing firm's id
            print("Found row for firm, ", firm_name)
            firm_id = firm_row[0]
        


        # Update queries with firm name
        updated_queries = {key: value.format(firm_name=firm_name) for key, value in field_to_query.items()}

        # Fill in every field at the same time
        
        # Query Gemini
        # pass promot ot gemini
        prompt = form_prompt(updated_queries)

        # Send the prompt to model, try 3 times to get a response
        success = False
        response = None 
        failure_counter = 0
        while not success and failure_counter < 3:
            try:
                response = model.generate_content(prompt,tools='google_search_retrieval').text
                # print("Success for ", firm_name, field)
                success = True
            except Exception as e:
                print("Exception occured: ", e)
                time.sleep(1) 
                failure_counter += 1

        print(f"Gemini response for {firm_name}: ", response)
        answers_per_field = response.split("*-*")
        print(len(answers_per_field))
        if len(answers_per_field) != len(fields):
            print("Error: Number of answers do not match number of fields," , len(answers_per_field), len(fields))
            continue
        for field, answer in zip(fields, answers_per_field):

            # # strip out field name if it is in the answer
            # if field + ":" in answer:
            #     answer = answer.replace(field + ":", "")

            print(f"Gemini response for {firm_name} - {field}: ", answer)
            
            cursor.execute(f"UPDATE {table_name} SET {field} = ? WHERE Firm_Name = ? AND id = ?", (answer, firm_name, firm_id))
            conn.commit()

    cursor.close()
    conn.close()
    cursor_websearch.close()
    conn_web_results.close()

Debug -- now on firm id  1
Found row for firm,  "TRACTION POWER SYSTEMS, INC."
Gemini response for "TRACTION POWER SYSTEMS, INC.":  Abel Smith House, Gunnels Wood Road, Stevenage, Hertfordshire, SG1 2ST, United Kingdom*-*No information found*-*No information found*-*No information found*-*No information found*-*No website found*-*No information found*-*No information found*-*No information found*-*N/A*-*No information found*-*N/A*-*N/A*-*No information found*

14
Gemini response for "TRACTION POWER SYSTEMS, INC." - Registered_Address:  Abel Smith House, Gunnels Wood Road, Stevenage, Hertfordshire, SG1 2ST, United Kingdom
Gemini response for "TRACTION POWER SYSTEMS, INC." - CEO:  No information found
Gemini response for "TRACTION POWER SYSTEMS, INC." - Establishment_Year:  No information found
Gemini response for "TRACTION POWER SYSTEMS, INC." - Number_Of_Employees:  No information found
Gemini response for "TRACTION POWER SYSTEMS, INC." - Revenue_Size:  No information found
Gemini resp

In [53]:
cursor.close()
conn.close()
cursor_websearch.close()
conn_web_results.close()

In [54]:
table_name

'firm_properties_gemini_with_grounding_v2'

TODO:
- strip new line from gemini responses
- problem with it still returning paragraphs
- limit token output, 
    - limit token output when p value is low