In [None]:
import pandas as pd
from dotenv import load_dotenv
import os
import google.generativeai as genai
import json
import time
import sqlite3
from gemini_prompts import *

Model Config
- You need a .env file with GEMINI_API_KEY defined

In [100]:
# Load GEMINI_API_KEY from the local .env file into the environment,
# then hand it to the Gemini SDK.
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Generation settings passed to the model below.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=40,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-002",
    generation_config=generation_config,
)

# Start a conversation with an empty history.
chat_session = model.start_chat(history=[])

Check that it works

In [101]:
# Smoke test: send one message through the chat session and show the reply text.
greeting = "Hello there, what is your name "
response = chat_session.send_message(greeting)
print(response.text)

I don't have a name. I'm a large language model, an AI.



Function that forms the prompt sent to Gemini

In [None]:
# Select which prompt versions this run uses.
# The *_v1 names are presumably provided by the gemini_prompts star-import -- verify.
field_to_query = field_to_query_v1
answer_format = answer_format_v1
context = context_single_answer_v1 + context_local_dataset_v1

In [104]:
def form_prompt(query, local_data, prompt_context=None, prompt_answer_format=None):
    """Assemble the text prompt sent to Gemini for one request.

    Args:
        query: The question(s) to ask. It is interpolated into the prompt as
            text, so a single string or a dict of per-field questions both work.
        local_data: Locally stored search/scraping results for the firm,
            interpolated into the prompt the same way.
        prompt_context: Optional preamble. When omitted, the notebook-level
            ``context`` is read at call time.
        prompt_answer_format: Optional answer-format instructions. When
            omitted, the notebook-level ``answer_format`` is used.

    Returns:
        The prompt string: context, query, answer format, then the local data.
    """
    if prompt_context is None:
        prompt_context = context
    if prompt_answer_format is None:
        # Use the configurable `answer_format` rather than the hard-coded
        # `answer_format_v1`, so changing the selected version takes effect.
        prompt_answer_format = answer_format

    prompt = f"""

    {prompt_context}
    {query}
    {prompt_answer_format}

    Here are the results of my search for this firm.
    {local_data}
    """
    return prompt

### Open firm databases

In [109]:
# Table names used throughout the notebook.
# You can create different tables for different prompt types, gemini model etc, if you rename `table_name`.
table_name = "firm_properties_gemini_without_grounding_local_dataset_v1"
web_search_table_name = "firms_web_search_results"
webscraping_table_name = "firms_web_search_website_scrapings"

# One connection per SQLite file: predictions, web-search results, website scrapings.
conn = sqlite3.connect("firm_database_llm.db")
conn_web_results = sqlite3.connect("firms_web_search_results.db")
conn_websites = sqlite3.connect("firms_web_search_website_scrapings.db")

cursor = conn.cursor()
cursor_websearch = conn_web_results.cursor()
cursor_websites = conn_websites.cursor()

# Drop the table if you're starting from scratch
# cursor.execute('''
# DROP TABLE IF EXISTS firm_properties
#                ''')

# Create the predictions table unless it already exists.
cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {table_name} (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year INT,
               Number_Of_Employees INT,
               Revenue_Size INT,
               Website TEXT,
               NAICS_Code INT,
               SIC_Code INT,
               Status TEXT,
               Dissolvement_Year INT,
               Company_Type TEXT,
               Previous_Names TEXT, 
               Alternative_Names TEXT, 
               Key_Executive_Personnel TEXT
               )
               ''')

In [110]:
# Check existing tables in the database
existing_tables = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
print(existing_tables)

[('sqlite_sequence',), ('firm_properties_gemini_without_grounding_local_dataset_v1',)]


In [111]:
# table_to_drop = "firm_properties_gemini_without_grounding_local_dataset_v1"
# cursor.execute(f"DROP TABLE IF EXISTS {table_to_drop} ")
# conn.commit()

Peek at the first entry (the row with id = 1) to check whether the predictions table has been populated

In [112]:
# Fetch the row stored under id = 1; an empty list means no such row exists.
cursor.execute(f''' SELECT * FROM {table_name} WHERE id = 1''')
cursor.fetchall()

[]

Check how many firms have their website scrapings

In [113]:
# Count the rows in the website-scrapings table.
cursor_websites.execute(f''' SELECT COUNT(*) FROM {webscraping_table_name}''')
cursor_websites.fetchall()

[(1315,)]

Get list of firms we'll loop through from the webscraping dataset

In [114]:
# (id, Firm_Name) pairs for every row of the website-scrapings table;
# the processing loops iterate over this list.
firm_list = cursor_websites.execute(
    f''' SELECT id, Firm_Name FROM {webscraping_table_name} '''
).fetchall()

In [115]:
# Mode switch: True sends one Gemini request per field, False sends one
# request per firm that covers every field.
single_query_per_field = False
if single_query_per_field:
    # The per-field mode has no dedicated context prompt yet.
    context = " "
else:
    # Keep the local-dataset part of the context. Resetting `context` to
    # `context_single_answer_v1` alone would silently discard the
    # `context_local_dataset_v1` text selected in the prompt-version cell.
    context = context_single_answer_v1 + context_local_dataset_v1

Single query per field

In [116]:
if single_query_per_field:
    # One Gemini request per (firm, field); each answer is written into the
    # firm's row of the predictions table.
    # NOTE(review): `fields` is not defined in the visible cells -- presumably it
    # comes from the gemini_prompts star-import; verify.
    # TODO: This needs a different prompt to return a single answer
    for firm_data in firm_list:
        # Keep the id from the scraping database for the whole iteration: the same
        # id is used below to look the firm up in the web-search and scraping databases.
        firm_id = firm_data[0]
        firm_name = firm_data[1]

        # Check if firm already exists, insert row if it doesn't
        cursor.execute(f"SELECT id FROM {table_name} WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
        if cursor.fetchone() is None:
            print("Inserting new firm:, ", firm_name)
            try:
                # Insert under the source id rather than adopting cursor.lastrowid,
                # so the predictions row stays aligned with the source databases.
                cursor.execute(f"INSERT INTO {table_name} (id, Firm_Name) VALUES (?, ?)", (firm_id, firm_name,))
            except sqlite3.IntegrityError as e:
                # The id is already taken by a row with a different firm name.
                print(f"Could not insert firm '{firm_name}' with id {firm_id}, skipping: ", e)
                continue
            conn.commit()
        else:
            print("Found row for firm, ", firm_name)

        # Update queries with firm name (identical for every field, so built once per firm)
        updated_queries = {key: value.format(firm_name=firm_name) for key, value in field_to_query.items()}

        for field in fields:
            print("Debug now for ", firm_name, field)

            # 1. Skip the field if the prediction database already holds a value for it
            cursor.execute(f"SELECT {field} FROM {table_name} WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL", (firm_id, firm_name,))
            if cursor.fetchone() is not None:
                print(f"Field '{field}' already has data for firm '{firm_name}', skipping.")
                continue

            # 2.1 Get the web search results for the firm
            cursor_websearch.execute(f"SELECT {field} FROM {web_search_table_name} WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
            web_search_row = cursor_websearch.fetchone()
            # A missing row and a NULL column both mean "no data"; json.loads(None) would raise.
            if web_search_row is not None and web_search_row[0] is not None:
                web_search_result = json.loads(web_search_row[0])
            else:
                web_search_result = "No web search data available"

            # 2.2 Get the website scraping results for the firm
            cursor_websites.execute(f"SELECT {field} FROM {webscraping_table_name} WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
            website_scraping_row = cursor_websites.fetchone()
            if website_scraping_row is not None and website_scraping_row[0] is not None:
                website_scraping_result = json.loads(website_scraping_row[0])
            else:
                website_scraping_result = "No website scraping data available"

            data_to_pass_llm = {
                f"Results of searching the web for {firm_name}": web_search_result,
                f"Scraped contents of top 5 websites for {firm_name}": website_scraping_result
            }

            prompt = form_prompt(updated_queries[field], data_to_pass_llm)
            print('debug: prompt is of length', len(prompt))

            # Single attempt; a failure is logged and the field stays NULL.
            try:
                response = model.generate_content(prompt).text
            except Exception as e:
                print("Gemini Exception: ", e)
                response = None

            print(f"Response for {firm_name} - {field}: ", response)
            if not response:
                # Nothing to store; leaving the field NULL lets a later run retry it.
                continue

            # Write the answer into the firm's existing row. An INSERT here would add
            # a new row per field and the "already has data" check would never see it.
            cursor.execute(f"UPDATE {table_name} SET {field} = ? WHERE Firm_Name = ? AND id = ?", (response, firm_name, firm_id))
            conn.commit()

Multiple fields per query (one Gemini request per firm, covering every field)

In [None]:
if not single_query_per_field:
    # One Gemini request per firm: the prompt carries the question for every
    # field, and the reply is split on "*-*" into one answer per field.
    # NOTE(review): `fields` is not defined in the visible cells -- presumably it
    # comes from the gemini_prompts star-import; verify.
    # Only the first 10 firms are processed; widen the slice for a full run.
    for firm_data in firm_list[:10]:
        # Keep the id from the scraping database for the whole iteration: the same
        # id is used below to look the firm up in the web-search and scraping databases.
        firm_id = firm_data[0]
        firm_name = firm_data[1]

        print("Debug -- now on firm id ", firm_id)

        # Check if firm already exists, insert row if it doesn't
        cursor.execute(f"SELECT id FROM {table_name} WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
        if cursor.fetchone() is None:
            print("Inserting new firm:, ", firm_name)
            try:
                # Insert under the source id rather than adopting cursor.lastrowid,
                # so the predictions row stays aligned with the source databases.
                cursor.execute(f"INSERT INTO {table_name} (id, Firm_Name) VALUES (?, ?)", (firm_id, firm_name,))
            except sqlite3.IntegrityError as e:
                # The id is already taken by a row with a different firm name.
                print(f"Could not insert firm '{firm_name}' with id {firm_id}, skipping: ", e)
                continue
            conn.commit()
        else:
            print("Found row for firm, ", firm_name , firm_id)

        # Identify if any field is missing for this firm with a single query
        missing_field_clause = ' OR '.join(f"{field} IS NULL" for field in fields)
        cursor.execute(f"""
            SELECT id FROM {table_name}
            WHERE id = ? AND Firm_Name = ? AND ({missing_field_clause})
        """, (firm_id, firm_name,))

        # No row back means every field already has a value
        if cursor.fetchone() is None:
            print(f"All fields are filled for {firm_name}, skipping.")
            continue

        # 2. Collect the stored web search results and website scrapings, field by field
        local_data_to_pass_llm = {}
        for field in fields:
            # 2.1 Get the web search results for the firm
            cursor_websearch.execute(f"SELECT {field} FROM {web_search_table_name} WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
            web_search_row = cursor_websearch.fetchone()
            # A missing row and a NULL column both mean "no data"; json.loads(None) would raise.
            if web_search_row is not None and web_search_row[0] is not None:
                web_search_result = json.loads(web_search_row[0])
            else:
                web_search_result = "No web search data available"

            # 2.2 Get the website scraping results for the firm
            cursor_websites.execute(f"SELECT {field} FROM {webscraping_table_name} WHERE id = ? AND Firm_Name = ?", (firm_id, firm_name,))
            website_scraping_row = cursor_websites.fetchone()
            if website_scraping_row is not None and website_scraping_row[0] is not None:
                website_scraping_result = json.loads(website_scraping_row[0])
            else:
                website_scraping_result = "No website scraping data available"

            local_data_to_pass_llm[field] = {
                f"Results of searching the web for {firm_name} + {field} ": web_search_result,
                f"Scraped contents of top 5 websites for {firm_name} + {field}": website_scraping_result
            }

        # Update queries with firm name
        updated_queries = {key: value.format(firm_name=firm_name) for key, value in field_to_query.items()}

        # Form prompt and query gemini
        prompt = form_prompt(updated_queries, local_data_to_pass_llm)

        # Single attempt; a failure is logged and the firm is skipped.
        response = None
        try:
            response = model.generate_content(prompt).text
        except Exception as e:
            print("Gemini Exception occured: ", e)

        print(f"Gemini response for {firm_name}: ", response)

        if not response:
            print("Gemini failed to respond, skipping")
            continue

        # The reply is expected to hold one answer per field, separated by "*-*".
        # Strip the whitespace/newlines around each piece before storing it.
        answers_per_field = [answer.strip() for answer in response.split("*-*")]
        print(len(answers_per_field))
        if len(answers_per_field) != len(fields):
            print("Error: Number of answers do not match number of fields," , len(answers_per_field), len(fields))
            continue

        for field, answer in zip(fields, answers_per_field):
            print(f"Gemini response for {firm_name} - {field}: ", answer)
            cursor.execute(f"UPDATE {table_name} SET {field} = ? WHERE Firm_Name = ? AND id = ?", (answer, firm_name, firm_id))

        # Commit once per firm so all of its answers are stored together.
        conn.commit()

# Release every cursor and connection, including the website-scrapings pair,
# whichever mode ran. Re-run the "Open firm databases" cell before running
# the loops again.
cursor.close()
conn.close()
cursor_websearch.close()
conn_web_results.close()
cursor_websites.close()
conn_websites.close()

Debug -- now on firm id  1
Found row for firm,  "TRACTION POWER SYSTEMS, INC." 1 skipping
All fields are filled for "TRACTION POWER SYSTEMS, INC.", skipping.
Debug -- now on firm id  2
Found row for firm,  "A DAY TO REMEMBER", LLC 2 skipping
All fields are filled for "A DAY TO REMEMBER", LLC, skipping.
Debug -- now on firm id  3
Found row for firm,  " MALCO INC. -A NEVADA CORP. 3 skipping
Gemini Exception occured:  400 Request payload size exceeds the limit: 20971520 bytes. The file size is too large. Please use the File API to upload your files instead. Example: `f = genai.upload_file(path); m.generate_content(['tell me about this file:', f])`
Gemini response for " MALCO INC. -A NEVADA CORP.:  None
Gemini failed to respond, skipping
Debug -- now on firm id  4
Found row for firm,  .BISHOP'S GATE RESIDENTIAL MORTGAGE TRUST 4 skipping
All fields are filled for .BISHOP'S GATE RESIDENTIAL MORTGAGE TRUST, skipping.
Debug -- now on firm id  5
Found row for firm,  "J.CRESCI FARMS, L.L.C." 5 s