In [1]:
import sqlite3
import pandas as pd

# Step 1: Load data from the SQLite databases

# Directory holding both SQLite files; edit this single line to point the
# notebook at a different location.
DATA_DIR = '/Users/taeeunkwon/Desktop/MSA/EquifaxPracticum/LLMWithWebData'


def load_table(db_path, table_name):
    """Read every row of one table from a SQLite file into a DataFrame.

    Args:
        db_path: Path of the SQLite database file.
        table_name: Name of the table to read. It is interpolated into the
            SQL text, so only pass trusted, hard-coded names.

    Returns:
        pandas.DataFrame with the result of ``SELECT * FROM <table_name>``.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(f"SELECT * FROM {table_name}", conn)
    finally:
        # Close the connection even when the query raises (e.g. missing table),
        # so a failed run does not leave the database file open.
        conn.close()


# Load the search-results table and the website-scrapings table
results_df = load_table(f"{DATA_DIR}/firms_web_search_results.db", "firms_web_search_results")
scrapings_df = load_table(
    f"{DATA_DIR}/firms_web_search_website_scrapings.db",
    "firms_web_search_website_scrapings",
)

# Step 2: Clean unnecessary columns (e.g., unnamed index columns if present)
results_df = results_df.loc[:, ~results_df.columns.str.contains('^Unnamed')]
scrapings_df = scrapings_df.loc[:, ~scrapings_df.columns.str.contains('^Unnamed')]

# Step 3: Join the dataframes on common columns, using an outer join to keep all entries.
# Columns present in both frames (other than the join keys) get the
# '_results' / '_scrapings' suffix so their origin stays visible.
merged_df = pd.merge(results_df, scrapings_df, on=['id', 'Firm_Name'], how='outer', suffixes=('_results', '_scrapings'))

# Step 4: View the merged data to ensure it's loaded and joined correctly
print("Merged Data Sample:")
print(merged_df.head())



Merged Data Sample:
   id                                  Firm_Name  \
0   1             "TRACTION POWER SYSTEMS, INC."   
1   2                   "A DAY TO REMEMBER", LLC   
2   3               " MALCO INC. -A NEVADA CORP.   
3   4  .BISHOP'S GATE RESIDENTIAL MORTGAGE TRUST   
4   5                   "J.CRESCI FARMS, L.L.C."   

                          Registered_Address_results  \
0  {"_type": "SearchResponse", "queryContext": {"...   
1  {"_type": "SearchResponse", "queryContext": {"...   
2  {"_type": "SearchResponse", "queryContext": {"...   
3  {"_type": "SearchResponse", "queryContext": {"...   
4  {"_type": "SearchResponse", "queryContext": {"...   

                                         CEO_results  \
0  {"_type": "SearchResponse", "queryContext": {"...   
1  {"_type": "SearchResponse", "queryContext": {"...   
2  {"_type": "SearchResponse", "queryContext": {"...   
3  {"_type": "SearchResponse", "queryContext": {"...   
4  {"_type": "SearchResponse", "queryContext": {".

In [2]:
import json

# List of columns that contain JSON-like entries in the merged DataFrame
json_columns = [
    'Registered_Address_results', 'CEO_results', 'Establishment_Year_results',
    'Number_Of_Employees_results', 'Revenue_Size_results', 'Website_results',
    'NAICS_Code_results', 'SIC_Code_results', 'Status_results',
    'Registered_Address_scrapings', 'CEO_scrapings', 'Establishment_Year_scrapings',
    'Number_Of_Employees_scrapings', 'Revenue_Size_scrapings', 'Website_scrapings',
    'NAICS_Code_scrapings', 'SIC_Code_scrapings', 'Status_scrapings'
]


def parse_json_cell(value):
    """Decode a cell that holds a JSON object string; pass anything else through.

    Args:
        value: A single cell value from one of the JSON columns.

    Returns:
        The decoded object when `value` is a string starting with '{' that
        parses as JSON; otherwise `value` unchanged. Strings that start with
        '{' but are not valid JSON are returned as-is, so one bad row does not
        abort the whole cell.
    """
    if not (isinstance(value, str) and value.startswith('{')):
        return value
    try:
        return json.loads(value)
    except ValueError:
        # json.JSONDecodeError is a ValueError; keep the raw text for inspection.
        return value


# Parse JSON-like entries in the specified columns.
# Re-running this cell is harmless: already-decoded values are not strings
# and are passed through untouched.
for col in json_columns:
    merged_df[col] = merged_df[col].apply(parse_json_cell)

# Display a sample of the processed data
print("Sample data after parsing JSON entries:")
print(merged_df.head())

Sample data after parsing JSON entries:
   id                                  Firm_Name  \
0   1             "TRACTION POWER SYSTEMS, INC."   
1   2                   "A DAY TO REMEMBER", LLC   
2   3               " MALCO INC. -A NEVADA CORP.   
3   4  .BISHOP'S GATE RESIDENTIAL MORTGAGE TRUST   
4   5                   "J.CRESCI FARMS, L.L.C."   

                          Registered_Address_results  \
0  {'_type': 'SearchResponse', 'queryContext': {'...   
1  {'_type': 'SearchResponse', 'queryContext': {'...   
2  {'_type': 'SearchResponse', 'queryContext': {'...   
3  {'_type': 'SearchResponse', 'queryContext': {'...   
4  {'_type': 'SearchResponse', 'queryContext': {'...   

                                         CEO_results  \
0  {'_type': 'SearchResponse', 'queryContext': {'...   
1  {'_type': 'SearchResponse', 'queryContext': {'...   
2  {'_type': 'SearchResponse', 'queryContext': {'...   
3  {'_type': 'SearchResponse', 'queryContext': {'...   
4  {'_type': 'SearchResponse',

In [None]:
import sqlite3
import requests
import pandas as pd

# Step 1: Function to get company info from the database
def get_primary_company_info(df, firm_name, primary_state=None):
    """Return the first record in `df` whose firm name matches `firm_name`.

    Names are compared after removing double-quote characters and lower-casing
    both sides. The normalised names live in a temporary Series, so `df` is
    not modified by the lookup.

    Args:
        df: DataFrame with a 'Firm_Name' column and the '*_results' columns
            listed in `fields` below.
        firm_name: Company name to look up.
        primary_state: Not read by this function; kept so existing calls that
            pass it keep working.

    Returns:
        A dict mapping each name in `fields` to the value from the first
        matching row, with missing scalar values replaced by
        "Information not available", or None when no row matches.
        'Firm_Name' holds the normalised (quote-free, lower-case) name.
    """
    firm_name_cleaned = firm_name.replace('"', '').lower()
    # Normalise into a separate Series instead of assigning back to
    # df['Firm_Name'], which would rewrite the caller's data on every lookup.
    normalized_names = df['Firm_Name'].str.replace('"', '', regex=False).str.lower()

    filtered_df = df[normalized_names == firm_name_cleaned]

    if filtered_df.empty:
        return None  # No match found

    # Get the first match or main entity
    fields = [
        'Firm_Name', 'CEO_results', 'Registered_Address_results', 'Establishment_Year_results',
        'Number_Of_Employees_results', 'Revenue_Size_results', 'Website_results',
        'NAICS_Code_results', 'SIC_Code_results', 'Status_results'
    ]
    primary_record = filtered_df[fields].iloc[0].to_dict()
    # Every matching row has this normalised name by construction.
    primary_record['Firm_Name'] = firm_name_cleaned

    # Clean up None or missing values. pd.isnull returns an array for
    # list-like values, which cannot be used in an `if`, so only scalars are
    # tested; dicts and lists are kept as they are.
    for key, value in primary_record.items():
        if pd.api.types.is_scalar(value) and pd.isnull(value):
            primary_record[key] = "Information not available"

    return primary_record



# Step 2: Retrieve company info using the get_primary_company_info function
import os  # only needed here, to read the API key from the environment

company_name = "TRACTION POWER SYSTEMS, INC."
company_info = get_primary_company_info(merged_df, company_name)

if company_info:
    # Format the retrieved data for clarity: one "Field: value" line per field,
    # with the '_results' suffix dropped from the field names.
    context = "\n".join([f"{key.replace('_results', '')}: {value}" for key, value in company_info.items()])
else:
    context = "Information not available for the specified company."

# Step 3: Build the prompt. The per-field instructions refer to "the data
# below", so the retrieved context is appended after them; without it the
# model receives no data at all.
prompt = f"""
        "Registered_Address": "After looking at the data below, tell me the Registered Adress for the firm named {company_name}",
        "CEO": "After looking at the data below, tell me the name and surname of the CEO for the firm named {company_name} ",
        "Establishment_Year": "After looking at the data below, tell me the establishment year for firm named {company_name}. Your answer should be 4 digits in the format YYYY",
        "Number_Of_Employees": "After looking at the data below, tell me the estimated number of employees for the firm named {company_name}. Your answer should be an approximate range, like 1-10, 10-100, 100-1000, 1000-10000, 10000+ etc. You can specify other ranges like 200-250 if you wish.",
        "Revenue_Size": "After looking at the data below, tell me the estimated annual revenue in dollars for the firm named {company_name}. Your answer should be an approximate range, 10000-100000, 1000000-10000000, or 1000000000+ etc. You can specify other ranges like 200-250 if you wish.",
        "Website": "After looking at the data below, tell me the url of the official website of for the firm named {company_name}. If you can't find a website for the firm, please write 'No website found'",
        "NAICS_Code": "After looking at the data below, tell me the numeric NAICS code for the firm named {company_name}.",
        "SIC_Code":  "After looking at the data below, tell me the numeric SIC code for the firm named {company_name}.",
        "Status": "After looking at the data below, tell me the whether the firm named {company_name} is Active or Dissolved. Answer with one word.",

Based on the provided information, please identify the all the fields above for {company_name}.
If there are multiple options for each field, please select the most likely value.
And then show me as a structured format.

Data:
{context}
"""

# Step 4: Set up the API call
# The key is read from the environment so it never lives in the notebook.
# Any key that was previously saved in this file should be treated as
# compromised and rotated.
api_key = os.environ.get("GEMINI_API_KEY")
if not api_key:
    raise RuntimeError("Set the GEMINI_API_KEY environment variable before running this cell.")
api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash-latest:generateContent"
headers = {"Content-Type": "application/json"}
payload = {"contents": [{"parts": [{"text": prompt}]}]}
params = {"key": api_key}

# Step 5: Send the request and print the model's answer
try:
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(api_url, headers=headers, json=payload, params=params, timeout=60)
    response.raise_for_status()
    response_data = response.json()

    # Navigate the response structure to extract the response text.
    # A candidate may come back without 'content' or 'parts', so every level
    # is read defensively instead of indexed directly.
    candidates = response_data.get('candidates') or []
    if candidates:
        parts = (candidates[0].get('content') or {}).get('parts') or []
        gemini_response = parts[0].get("text", "") if parts else ""
        if gemini_response:
            print("Gemini Response:", gemini_response)
        else:
            print("No content received. Empty text in response:", response_data)
    else:
        print("No content received. Full response:", response_data)

except requests.exceptions.RequestException as e:
    # The exception text can include the request URL with its ?key=... query
    # string, so the key is masked before the message is printed.
    print(f"Request failed: {str(e).replace(api_key, '<redacted>')}")

Gemini Response: Here is the information we can gather from the data provided:

* **CEO:**  The data doesn't directly provide a CEO name. While it does mention a "M. Zeller" who was a previous President, we can't assume they are the current CEO. 
* **Establishment Year:** The data mentions a Delaware filing in 2011, a Texas filing in 2000, and a California filing in 1992. The most likely establishment year would be **1992**, as it's the earliest filing.
* **Operational Status:** The data mentions that the Texas company is no longer active. The Ohio company is listed as "Dead". The Delaware company, which was founded in 2011, has a status of "Unknown" on Bizapedia, but OpenCorporates suggests a status of "Inactive". Based on this, the most likely operational status for TRACTION POWER SYSTEMS, INC. is **Inactive or possibly defunct**.

**Overall, the data suggests that TRACTION POWER SYSTEMS, INC. is a company that was likely established in 1992 and is currently inactive or defunct. We c