In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
import google.generativeai as genai
import json
import time

In [2]:
# Names of the firm database fields, one per line, starting with the firm name.
fields = [
    "Firm_Name",
    "Registered_Address",
    "CEO",
    "Establishment_Year",
    "Number_Of_Employees",
    "Revenue_Size",
    "Website",
    "NAICS_Code",
    "SIC_Code",
    "Status",
]

Model Config

In [3]:
# Read variables from a local .env file into the process environment, then
# hand the value of GEMINI_API_KEY to the SDK.
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Model handle; the generation settings below are attached to the model itself.
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-8b",
    generation_config=dict(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=8192,
        response_mime_type="text/plain",
    ),
)

# Open a chat on that model, starting from an empty message history.
chat_session = model.start_chat(history=[])

In [4]:
# Sanity check: send one free-form question through the chat session and
# print the text of the reply.
response = chat_session.send_message("What gemini model are you")

print(response.text)

I'm currently running on a large language model, and I don't have a specific Gemini model designation.



In [5]:
def form_prompt(context, query, data):
    """Assemble the three-section prompt text sent to the model.

    The result is a single string with the sections ``Context:``, ``Query:``
    and ``Relevant Data:``, each followed by the corresponding argument
    rendered with its default string formatting. Every non-empty line is
    indented by four spaces, and the string begins with a newline.

    Args:
        context: General instructions placed under ``Context:``.
        query: The specific request placed under ``Query:``.
        data: Supporting material placed under ``Relevant Data:``.

    Returns:
        The assembled prompt string.
    """
    pad = "    "
    pieces = [
        "",
        f"{pad}Context:",
        f"{pad}{context}",
        "",
        f"{pad}Query:",
        f"{pad}{query}",
        "",
        f"{pad}Relevant Data:",
        f"{pad}{data}",
        pad,
    ]
    return "\n".join(pieces)

In [6]:
# Instruction text passed as the `context` of every prompt. Adjacent string
# literals are concatenated by Python into one single-line string.
general_context = (
    "You will be assisting me with filling in data fields for a firm database I am building. "
    "I will tell you the name of the firm i am interested in, and the field I want you to fill. "
    "I will give you relevant information from websites or google search results that I gathered by searching for the firm name and field. "
    "You will give your answer by simply stating the value of the field I am interested in. "
    "Do not form sentences, just give the value of the field. "
    "If you have absolutely no idea about the answer, then answer with 'null' ."
)

Loop through all firm/field combinations

Load the Google search results and the lists of firm names and data fields

In [7]:
# Load the saved search results. The context manager closes the file handle
# as soon as parsing finishes, and the explicit encoding keeps decoding
# independent of the platform default.
with open("firm_google_search_results.json", encoding="utf-8") as results_file:
    search_results = json.load(results_file)

# Get Firm names and data fields
df_firms = pd.read_csv('FirmData.csv')

# Every CSV column except the firm name is a field to look up.
data_fields = df_firms.columns.tolist()
data_fields.remove('Firm_Name')

firm_names = df_firms.Firm_Name.tolist()
firm_names

['01K Capital LLC.',
 '1 Act Services, LLC',
 "TIN DRUM ASIACAFE', LLC",
 'Dancing Goats Coffee ARR, LLC',
 'Clickety Clack Vape Gifts LLC',
 'Amin petrol electric llc',
 'CAB CHINA, LLC',
 'E R Enterprise for Freedom LLC',
 'Georgia Tech Savannah, LLC',
 'ANDREW THOMAS LEE PHOTOGRAPHY, LLC']

In [8]:
# Resume from answers saved by an earlier run when the cache file exists;
# otherwise start with an empty cache so a first run does not fail with
# FileNotFoundError. The context manager closes the file handle after parsing.
LLM_CACHE_PATH = "llm_firm_data.json"
if os.path.exists(LLM_CACHE_PATH):
    with open(LLM_CACHE_PATH, encoding="utf-8") as cache_file:
        llm_firm_data = json.load(cache_file)
else:
    llm_firm_data = {}

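Ask the LLM to extract the relevant field based on the search results for the given firm
- Alternate between Gemini 1.5 Flash and 1.0 Pro when the usage limit is hit (note: the loop below does not switch models automatically — it only retries the same model)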

In [9]:
# Upper bound on API attempts per firm/field pair before moving on.
MAX_ATTEMPTS = 10
# Seconds to wait after a failed API call before the next attempt.
RETRY_DELAY_SECONDS = 1

for firm_name in firm_names:
    # setdefault keeps entries loaded from the cache and creates missing ones.
    firm_entry = llm_firm_data.setdefault(firm_name, {})
    for field in data_fields:
        field_entry = firm_entry.setdefault(field, {})

        prompt = form_prompt(
            context=general_context,
            query=f"Fill in the field {field} for the firm {firm_name}",
            data=search_results[firm_name][field])

        # The stored prompt is refreshed on every run, even for cached answers.
        field_entry['prompt'] = prompt

        # check if we already filled the field
        if 'response' in field_entry:
            print("response exists for ", firm_name, field)
            continue

        for _attempt in range(MAX_ATTEMPTS):
            try:
                # Each prompt is self-contained, so send it as an independent
                # request. Routing every prompt through one shared chat session
                # would resend all earlier firms' prompts and answers as
                # history, mixing unrelated firms into the context and growing
                # the request with every call.
                response = model.generate_content(prompt)
                field_entry['response'] = response.text
                print("Success for ", firm_name, field)
                break
            except Exception as e:
                # Best-effort retry: report the error, pause, and try again.
                print("Exception occured: ", e)
                time.sleep(RETRY_DELAY_SECONDS)
        else:
            # All attempts failed: the entry keeps its prompt but has no
            # 'response' key, so a later run will retry this pair.
            print("Giving up on ", firm_name, field, "after", MAX_ATTEMPTS, "failed attempts")

            
        

response exists for  01K Capital LLC. Registered_Address
response exists for  01K Capital LLC. CEO
response exists for  01K Capital LLC. Establishment_Year
response exists for  01K Capital LLC. Number_Of_Employees
response exists for  01K Capital LLC. Revenue_Size
response exists for  01K Capital LLC. Website
response exists for  01K Capital LLC. NAICS_Code
response exists for  01K Capital LLC. SIC_Code
response exists for  01K Capital LLC. Status
response exists for  1 Act Services, LLC Registered_Address
response exists for  1 Act Services, LLC CEO
response exists for  1 Act Services, LLC Establishment_Year
response exists for  1 Act Services, LLC Number_Of_Employees
response exists for  1 Act Services, LLC Revenue_Size
response exists for  1 Act Services, LLC Website
response exists for  1 Act Services, LLC NAICS_Code
response exists for  1 Act Services, LLC SIC_Code
response exists for  1 Act Services, LLC Status
response exists for  TIN DRUM ASIACAFE', LLC Registered_Address
respo

KeyboardInterrupt: 

In [10]:
# Write the collected prompt/response records back to the JSON cache file,
# replacing its previous contents.
with open("llm_firm_data.json", "w") as cache_file:
    json.dump(obj=llm_firm_data, fp=cache_file)

Fill in the DataFrame

In [17]:
# Columns that are entirely empty in the CSV are parsed as float64; cast the
# field columns to object first so string answers can be stored without
# triggering pandas' incompatible-dtype assignment warning/error.
df_firms = df_firms.astype({column: object for column in data_fields})

for firm_name in firm_names:
    firm_rows = df_firms.Firm_Name == firm_name
    firm_answers = llm_firm_data.get(firm_name, {})
    for field in data_fields:
        raw_answer = firm_answers.get(field, {}).get('response')
        if raw_answer is None:
            # No answer was recorded for this pair (e.g. the extraction loop
            # was interrupted or ran out of retries): leave the cell as it is.
            continue

        # Drop all surrounding whitespace, not just newlines, so answers such
        # as "null \n" are still recognised below.
        value = raw_answer.strip()
        # The prompt asks for 'null' when the model has no answer; accept it
        # regardless of letter case or surrounding quote characters.
        if value.strip("'\"").lower() == "null":
            value = None
        df_firms.loc[firm_rows, field] = value

In [19]:
# Export the augmented table to CSV, omitting the row-index column.
df_firms.to_csv(path_or_buf="FirmDataLLMAugmented.csv", index=False)