In [19]:
import pandas as pd
from dotenv import load_dotenv
import sys
import os
import google.generativeai as genai
import json
import time
import sqlite3
load_dotenv()

# Add the parent directory to sys.pth
import sys
from pathlib import Path
sys.path.insert(0, str(Path().resolve().parent))
%load_ext autoreload
%autoreload 2
from gemini_prompts import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


I'm using this to test how getting predictions just using gemini (without the databases) works

In [20]:
genai.configure(api_key=os.environ["GEMINI_API_KEY_PAID"])

# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
  model_name="models/gemini-1.5-pro-002",
  # generation_config=generation_config,
)

Test responses

Function to form prompt to give gemini

Make sure gemini returns one answer or Null to each query instead of a paragraph

In [21]:
context = context_single_answer_v1
field_to_query = field_to_query_v1
answer_format = answer_format_v1

In [22]:
def form_prompt(query):
    prompt = f"""
    {context}
    {query}
    {answer_format}
    """
    return prompt

TODO - Check if gemini remembers recent context like below. May be helpful to give a general context first, then ask each query, making sure the context resets with each firm

### Open firm databases

In [23]:
# Table to store the results
conn = sqlite3.connect("firm_database_gemini_with_grounding.db")
cursor = conn.cursor()

In [24]:
# Check existing tables
existing_tables = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
print(existing_tables)

[('sqlite_sequence',), ('firm_properties_gemini_with_grounding_v1',), ('firm_properties_gemini_with_grounding_v2',)]


Drop tables if needed

In [25]:
# table_to_drop = "firm_properties_gemini_with_grounding_v2"
# cursor.execute(f"DROP TABLE IF EXISTS {table_to_drop} ")
# conn.commit()

Create table

In [26]:
# You can create different tables for different prompt types, gemini model etc, if you rename the table
table_name = "firm_properties_gemini_with_grounding_v2"

In [27]:
cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {table_name} (
               id INTEGER PRIMARY KEY,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year INT,
               Number_Of_Employees TEXT,
               Revenue_Size TEXT,
               Website TEXT,
               NAICS_Code INT,
               SIC_Code INT,
               Status TEXT,
               Dissolvement_Year INT,
               Company_Type TEXT,
               Previous_Names TEXT, 
               Alternative_Names TEXT, 
               Key_Executive_Personnel TEXT
               )
               ''')

<sqlite3.Cursor at 0x12fe1f040>

Check how many rows are in the current table

In [28]:
cursor.execute(f"SELECT COUNT(*) FROM {table_name}").fetchall()

[(2917,)]

In [29]:
cursor.execute(f"SELECT  * FROM {table_name} WHERE id = 2").fetchall()

[(2,
  '"A DAY TO REMEMBER", LLC',
  '13131 Champion Forest Dr Ste 200, Houston, TX 77069, United States',
  'Claudia',
  'No information found',
  '1-10',
  '100000-1000000',
  'daytoremember.net',
  'No information found',
  'No information found',
  'Active',
  'N/A',
  'LLC',
  'N/A',
  'N/A',
  'Claudia (Owner and Lead Planner), Veronica (Financial and Administrative Aspect)\n')]

Check for specific entry

In [30]:
field = 'Registered_Address'
cursor.execute(f"SELECT COUNT(id) FROM {table_name} WHERE {field} IS NOT NULL")
print(cursor.fetchall())

[(2917,)]


Rows where every field is not null

In [31]:
condition = ' AND '.join([f"{field} IS NOT NULL" for field in fields])

# Execute the query to find rows where all specified fields are not null
cursor.execute(f"SELECT COUNT(*) FROM {table_name} WHERE {condition}")
print(cursor.fetchall())

[(2917,)]


# Update the database

### Option 1: Get the list of firms from the web search database so as to be consistent

In [14]:
conn_web_results = sqlite3.connect("../firms_web_search_results.db")
cursor_websearch = conn_web_results.cursor()

Select Subset of firms for testing

In [15]:
cursor_websearch.execute(''' SELECT id, Firm_Name FROM firms_web_search_results ORDER BY id ASC''')
firm_web_search_results = cursor_websearch.fetchall()
firm_web_search_results

[(1, '"TRACTION POWER SYSTEMS, INC."'),
 (2, '"A DAY TO REMEMBER", LLC'),
 (3, '" MALCO INC. -A NEVADA CORP.'),
 (4, ".BISHOP'S GATE RESIDENTIAL MORTGAGE TRUST"),
 (5, '"J.CRESCI FARMS, L.L.C."'),
 (6, '1 MAN ARMY GOURMET FOODS LLC'),
 (7, '10-Code, LLC'),
 (8, "'Dat Hit Da Spot' LLC"),
 (9, '" LESLIE A. FEAST CONSTRUCTION COMPANY, INC."'),
 (10, '"R" ROCKY RANCH, INC.'),
 (11, '"FLEMING COUNTY KIWANIS CLUB INCORPORATED."'),
 (12, '$6 & $8 FASHIONS, INC.'),
 (13, "'JOSE ALFREDO PEREZ LOPEZ LLC"),
 (14, '"B & S GRAIN, LTD."'),
 (15, '"THE SUMMIT, L.L.C."'),
 (16, '"Foam It" Spray Foam Insulation, LLC'),
 (17, '"C.C.C. INC."'),
 (18, '(N)SQUARED ADVISORY, LLC'),
 (19, '#forabetternevada LLC'),
 (20, '#2 BLACKBOARD PROPERTIES, LLC'),
 (21, '"THE TRADES" GROUP, INC.'),
 (22, '008 PROJECT MANAGEMENT, LLC'),
 (23, '" C" CLUB INC.'),
 (24, '"SENN"SATIONAL CREATIONS, LLC'),
 (25, "'4' WHEELER'S HUNTING CLUB"),
 (26, '"ECHO LODGE NO. 103, OF THE INDEPENDENT ORDER OF ODD FELLOWS"'),
 (27, '1 By 

In [16]:
single_query_per_field = False
if single_query_per_field:
    context = " "
else:
    context = context_single_answer_v1

### Main Loop with single query for each field

In [17]:
failed_firms = []

In [18]:
if not single_query_per_field:
    for web_search_result in firm_web_search_results:
        firm_id = web_search_result[0]
        firm_name = web_search_result[1]

        # Check if firm already exists, insert row if it doesnt
        print("Debug -- now on firm id ", firm_id)

        cursor.execute(f"SELECT id FROM {table_name} WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
        firm_row = cursor.fetchone()
        if firm_row is None:
            print("Inserting new firm:, ", firm_name)
            cursor.execute(f"INSERT INTO {table_name} (Firm_Name) VALUES (?)", (firm_name,))
            firm_id = cursor.lastrowid  
        else:
            # Get the existing firm's id
            print("Found row for firm, ", firm_name , firm_id )
            firm_id = firm_row[0]
            # check if any fields are missing


        # Identify if any field is missing for this firm with a single query
        cursor.execute(f"""
            SELECT id FROM {table_name}
            WHERE id = ? AND Firm_Name = ? AND ({' OR '.join([f"{field} IS NULL" for field in fields])})
        """, (firm_id, firm_name,))

        # Check if any fields are missing
        firm_row_with_missing_fields = cursor.fetchone()
        if firm_row_with_missing_fields is None:
            print(f"All fields are filled for {firm_name}, skipping.")
            continue

        

        # Update queries with firm name
        updated_queries = {key: value.format(firm_name=firm_name) for key, value in field_to_query.items()}

        # Fill in every field at the same time
        
        # Query Gemini
        # pass promot ot gemini
        prompt = form_prompt(updated_queries)

        # Send the prompt to model, try 3 times to get a response
        response = None 
        try:
            response = model.generate_content(prompt,tools='google_search_retrieval').text
            # print("Success for ", firm_name, field)
            success = True
        except Exception as e:
            print("Exception occured: ", e)
            time.sleep(1) 

    
        # print(f"Gemini response for {firm_name}: ", response)
        if not response:
            print(f"Failed to get response for {firm_name}")
            failed_firms.append(firm_name)
            continue
    
        answers_per_field = response.split("*-*")
        answers_per_field = answers_per_field[:len(fields)]
        print(len(answers_per_field))
        if len(answers_per_field) != len(fields):
            print("Error: Number of answers do not match number of fields," , len(answers_per_field), len(fields))
            continue
        for field, answer in zip(fields, answers_per_field):

            # # strip out field name if it is in the answer
            # if field + ":" in answer:
            #     answer = answer.replace(field + ":", "")

            print(f"Gemini response for {firm_name} - {field}: ", answer)
            
            cursor.execute(f"UPDATE {table_name} SET {field} = ? WHERE Firm_Name = ? AND id = ?", (answer, firm_name, firm_id))
            conn.commit()

    cursor.close()
    conn.close()
    cursor_websearch.close()
    conn_web_results.close()

Debug -- now on firm id  1
Found row for firm,  "TRACTION POWER SYSTEMS, INC." 1
All fields are filled for "TRACTION POWER SYSTEMS, INC.", skipping.
Debug -- now on firm id  2
Found row for firm,  "A DAY TO REMEMBER", LLC 2
All fields are filled for "A DAY TO REMEMBER", LLC, skipping.
Debug -- now on firm id  3
Found row for firm,  " MALCO INC. -A NEVADA CORP. 3
All fields are filled for " MALCO INC. -A NEVADA CORP., skipping.
Debug -- now on firm id  4
Found row for firm,  .BISHOP'S GATE RESIDENTIAL MORTGAGE TRUST 4
All fields are filled for .BISHOP'S GATE RESIDENTIAL MORTGAGE TRUST, skipping.
Debug -- now on firm id  5
Found row for firm,  "J.CRESCI FARMS, L.L.C." 5
All fields are filled for "J.CRESCI FARMS, L.L.C.", skipping.
Debug -- now on firm id  6
Found row for firm,  1 MAN ARMY GOURMET FOODS LLC 6
All fields are filled for 1 MAN ARMY GOURMET FOODS LLC, skipping.
Debug -- now on firm id  7
Found row for firm,  10-Code, LLC 7
All fields are filled for 10-Code, LLC, skipping.
Deb

In [54]:
table_name

'firm_properties_gemini_with_grounding_v2'

TODO:
- strip new line from gemini responses
- problem with it still returning paragraphs
- limit token output, 
    - limit token output when p value is low