### Explanation
- Loop through the firms in the Bing web search and website scraping databases, and generate a Gemini prediction for each firm property

In [None]:
import pandas as pd
from dotenv import load_dotenv
import os
import google.generativeai as genai
import json
import time
import sqlite3
from concurrent.futures import ThreadPoolExecutor, as_completed
%load_ext autoreload
%autoreload 2
from gemini_prompts import *

# 1. Configure Model
- You need a .env file with GEMINI_API_KEY_PAID defined (this is the variable the cell below reads)

In [2]:
# Load the variables from the .env file, then give the SDK the key stored
# under GEMINI_API_KEY_PAID.
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY_PAID"))

# Generation settings attached to the model object created below.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=40,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# To use the pro model instead, set model_name="gemini-1.5-pro-002".
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
)


Check that the model works

In [None]:
# Smoke test: send a trivial prompt and print the reply text to confirm that
# the API key and the model configuration work.
response = model.generate_content("Hello there, what is your name ")
print(response.text)

## 2. Define functions to form prompts

In [4]:
# Select the prompt pieces to use. The *_v1 names are not defined in this
# notebook; presumably they come from `from gemini_prompts import *` (verify).
# Edit gemini_prompts.py to change the prompts.
context =  context_local_dataset_v1 + context_single_field_v1
answer_format = answer_format_v1
# Used further down as a mapping from field name to a query template that
# is formatted with `firm_name`.
field_to_query = field_to_query_v1

In [5]:
def form_prompt(query, local_data):
    """Assemble the full prompt sent to the model for one query.

    The module-level ``context`` and ``answer_format`` are placed around
    ``query``, and ``local_data`` is appended under a short lead-in
    sentence. Every value is interpolated with its default string
    formatting.

    Args:
        query: Question text to ask the model.
        local_data: Search results to show the model alongside the query.

    Returns:
        The assembled prompt string.
    """
    return f"""

    {context}
    {query}
    {answer_format}

    Here are the results of my search for this firm.
    {local_data}
    """

# 3. Connect to the source and target databases

In [6]:
# Target database: receives the LLM predictions written by the main loop.
conn = sqlite3.connect("firm_database_llm.db")
cursor = conn.cursor()

# Drop the table if you're starting from scratch
# cursor.execute('''
# DROP TABLE IF EXISTS firm_properties
#                ''')

# Name of the prediction table used by every query against `conn` below.
table_name = "firm_properties_gemini_without_grounding_local_dataset_v1"
# Rename the table to keep separate result sets for different prompt types,
# Gemini models, etc. The table is only created if it does not exist yet.
cursor.execute(f'''
CREATE TABLE IF NOT EXISTS {table_name} (
               id INTEGER PRIMARY KEY AUTOINCREMENT,
               Firm_Name TEXT NOT NULL,
               Registered_Address TEXT,
               CEO TEXT,
               Establishment_Year INT,
               Number_Of_Employees INT,
               Revenue_Size INT,
               Website TEXT,
               NAICS_Code INT,
               SIC_Code INT,
               Status TEXT,
               Dissolvement_Year INT,
               Company_Type TEXT,
               Previous_Names TEXT, 
               Alternative_Names TEXT, 
               Key_Executive_Personnel TEXT
               )
               ''')


# Source databases: stored web search results and website scrapings.
# NOTE(review): in the visible cells these two cursors are only referenced
# from a commented-out example; process_field opens its own connections.
web_search_table_name = "firms_web_search_results"
webscraping_table_name = "firms_web_search_website_scrapings"
conn_web_results = sqlite3.connect("firms_web_search_results.db")
conn_websites = sqlite3.connect("firms_web_search_website_scrapings.db")
cursor_websearch = conn_web_results.cursor()
cursor_websites = conn_websites.cursor()

In [None]:
# Check existing tables in the database
existing_tables = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
print(existing_tables)

In [8]:
# table_to_drop = "firm_properties_gemini_without_grounding_local_dataset_v1"
# cursor.execute(f"DROP TABLE IF EXISTS {table_to_drop} ")
# conn.commit()

In [9]:
# Upper bound on the length of the JSON-serialised website scrapings for one
# firm/field; process_field leaves longer scrapings out of the prompt.
max_data_length_allowed = 1000000 

### Override with sample firms for demo

In [None]:
# Demo run: take the firms to process from a small sample CSV. The main loop
# reads the `id` and `name` columns of each row.
df = pd.read_csv("firms_sample.csv") 
# For the actual run, replace this with all firms in the Bing search database, e.g.
# cursor_websearch.execute(f"SELECT * FROM {web_search_table_name}").fetchall()
# Print every sample row as a dict to check the input.
for i,row in df.iterrows():
    row = row.to_dict()
    print(row)
    

# 4. Main Loop
- Parallelize with a thread pool (one task per field), then write all fields of a single firm's row once its tasks have finished

In [None]:
def process_field(field, firm_id, firm_name, updated_query, max_data_length_allowed):
    """Predict one field of one firm from the locally stored search data.

    Reads the stored web search result and the stored website scrapings for
    (``firm_id``, ``firm_name``) from the column named ``field`` in the two
    local SQLite databases, builds a prompt with ``form_prompt`` and asks the
    module-level ``model`` for an answer.

    Each database connection is closed as soon as its value has been read,
    including when an error is raised, so failed calls do not leak
    connections and nothing is held open during the model call.

    Args:
        field: Column name to read in both source tables. It is interpolated
            into the SQL text (identifiers cannot be bound as parameters), so
            it must come from a trusted, fixed list of column names.
        firm_id: Value matched against the ``id`` column.
        firm_name: Value matched against the ``Firm_Name`` column.
        updated_query: Query text for this field, passed to ``form_prompt``.
        max_data_length_allowed: Maximum length of the JSON-serialised
            scrapings; longer scrapings are left out of the prompt.

    Returns:
        A ``(field, response)`` tuple, where ``response`` is the ``.text`` of
        the model's reply.

    Raises:
        Errors from sqlite3, ``json.loads`` or the model call are not caught
        here; they propagate to the caller.
    """
    from contextlib import closing

    def _fetch_cell(db_path, table):
        # Return the stored value of `field` for this firm, or None when no
        # row matches. A fresh connection is opened per call because this
        # function runs in worker threads and, by default, a sqlite3
        # connection may only be used in the thread that created it.
        with closing(sqlite3.connect(db_path)) as thread_conn:
            found = thread_conn.execute(
                f"SELECT {field} FROM {table} WHERE id = ? AND Firm_Name = ?",
                (firm_id, firm_name),
            ).fetchone()
        return None if found is None else found[0]

    # Get the web search results
    raw_search = _fetch_cell('firms_web_search_results.db', 'firms_web_search_results')
    if raw_search:
        web_search_result = json.loads(raw_search)
    else:
        web_search_result = "No web search data available"

    # Get the website scraping results
    raw_scrape = _fetch_cell(
        'firms_web_search_website_scrapings.db',
        'firms_web_search_website_scrapings',
    )
    if raw_scrape:
        website_scraping_result = json.loads(raw_scrape)
        # Leave over-long scrapings out of the prompt entirely.
        if len(json.dumps(website_scraping_result)) > max_data_length_allowed:
            print(f"Scraped contents for {firm_name} - {field} is too long. Skipping.")
            website_scraping_result = "No website scraping data available"
    else:
        website_scraping_result = "No website scraping data available"

    data_to_pass_llm = {
        f"Results of searching the web for {firm_name}": web_search_result,
        f"Scraped contents of top websites for {firm_name}": website_scraping_result
    }

    # Generate the prompt
    prompt = form_prompt(updated_query, data_to_pass_llm)

    # Call the LLM to get the response
    response = model.generate_content(prompt).text

    print(f"Response for the firm named: {firm_name}, with id  {firm_id}, for the field {field} is {response} ")

    return field, response


# Main processing loop: one pass per firm in `df`.
for i, row in df.iterrows():
    row = row.to_dict()
    firm_id, firm_name = row['id'], row['name']
    print("\n ---- Debug now for ", firm_name, "id", firm_id)

    # Make sure the prediction table has a row for this firm.
    firm_row = cursor.execute(
        f"SELECT id FROM {table_name} WHERE id = ? AND Firm_Name = ?",
        (firm_id, firm_name),
    ).fetchone()
    if firm_row is not None:
        print("Found row for firm: ", firm_name, firm_id)
    else:
        print("Inserting new firm: ", firm_name)
        cursor.execute(
            f"INSERT INTO {table_name} (id, Firm_Name) VALUES (?, ?)",
            (firm_id, firm_name),
        )
        conn.commit()

    # Fill the firm name into every per-field query template.
    updated_queries = {
        name: template.format(firm_name=firm_name)
        for name, template in field_to_query.items()
    }

    # Keep only the fields whose stored prediction is still NULL.
    # NOTE(review): `fields` is not defined in the visible cells; presumably
    # it comes from `from gemini_prompts import *` (confirm).
    fields_to_process = []
    for field in fields:
        already_filled = cursor.execute(
            f"SELECT {field} FROM {table_name} WHERE id = ? AND Firm_Name = ? AND {field} IS NOT NULL",
            (firm_id, firm_name),
        ).fetchone()
        if already_filled is None:
            fields_to_process.append(field)

    # Nothing left to predict for this firm.
    if not fields_to_process:
        continue

    # Run one process_field call per missing field in a thread pool, since
    # doing them one after another is slow.
    results = {}
    with ThreadPoolExecutor(max_workers=14) as executor:
        futures = {}
        for field in fields_to_process:
            future = executor.submit(
                process_field,
                field,
                firm_id,
                firm_name,
                updated_queries[field],
                max_data_length_allowed,
            )
            futures[future] = field

        # Collect answers as they finish; a failed field is reported and
        # skipped, so it stays NULL in the table.
        for future in as_completed(futures):
            field = futures[future]
            try:
                field, response = future.result()
            except Exception as err:
                print(f"Error in future for field {field}: {err}")
            else:
                if response is not None:
                    results[field] = response

    time.sleep(1)  # avoid hitting the gemini quota per minute

    # Write the collected answers to the database sequentially.
    for field, response in results.items():
        cursor.execute(
            f"UPDATE {table_name} SET {field} = ? WHERE Firm_Name = ? AND id = ?",
            (response, firm_name, firm_id),
        )
        conn.commit()


In [None]:
# Read back the stored predictions for every firm in `df`, matched by name.
names = df.name.tolist()
# One "?" placeholder per row of `df`, so the names are bound as parameters.
placeholders = ",".join(["?"] * len(df))
data = cursor.execute(
    f"SELECT * FROM {table_name} WHERE Firm_Name IN ({placeholders})",
    names,
).fetchall()

In [None]:
# Show the fetched prediction rows as the cell output.
data

In [12]:
# Close every connection opened during database setup. The two source-database
# connections were previously left open.
conn.close()
conn_web_results.close()
conn_websites.close()