In [None]:
import pandas as pd
import time
from google import genai
from google.genai.types import Tool, GenerateContentConfig, GoogleSearch
import json
import os

In [None]:
# Read the shuffled company records and flatten the nested JSON into columns
# (nested keys become dotted column names such as "info.company_name").
output = "../../data/shuffled_output.json"
with open(output, encoding="utf-8") as f:
    data = pd.json_normalize(json.load(f))

data.head()

In [None]:
def get_api_key(env_path, key_name):
    """Look up ``key_name`` in a dotenv-style ``NAME=value`` file.

    Args:
        env_path: Path of the file to read.
        key_name: Exact variable name to look for.

    Returns:
        The value with surrounding whitespace (and one pair of matching
        surrounding quotes, if present) removed, or ``None`` when the file
        has no assignment for ``key_name``.
    """
    with open(env_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            # Blank lines and comments can never be an assignment.
            if not line or line.startswith('#'):
                continue

            # Split on the FIRST '=' only, so values that themselves contain
            # '=' (common in base64-like keys) are returned in full.
            name, separator, value = line.partition('=')

            # Require an exact name match; a prefix test would also accept
            # e.g. "<key_name>_OLD=...".
            if not separator or name.strip() != key_name:
                continue

            value = value.strip()
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            return value
    return None

In [None]:
# Key handed to the google-genai client; read from the sibling project's .env file.
api_key = get_api_key(
    env_path="../../company-researcher/.env",
    key_name="GOOGLE_API_KEY",
)

In [None]:
def get_prod_desc_with_search(parent, company_name, api_key):
    """Ask Gemini (with the Google Search tool enabled) for a product description.

    Args:
        parent: Name of the parent company, inserted into the prompt.
        company_name: Name of the subsidiary whose description is requested.
        api_key: API key passed to ``genai.Client``.

    Returns:
        The text of the first candidate, with all of its text parts
        concatenated in order.

    Raises:
        ValueError: If the response has no candidate, no content, or no text.
            Raising (rather than returning ``None``) lets a retrying caller
            treat an empty answer like any other failed attempt.
    """
    client = genai.Client(api_key=api_key)
    model_id = "gemini-2.0-flash"

    google_search_tool = Tool(
        google_search=GoogleSearch()
    )

    prompt = f"""You are a business data assistant. Do not include any preamble or explanation. 
Only output the product_description in clean markdown format.

Retrieve and return the `product_description` of the company named "{company_name}", 
which operates as a subsidiary of "{parent}". The output must start directly with the company's name and its description.

Example:

If the request is for Garanti BBVA, the output should resemble the following:

Garanti BBVA offers a comprehensive portfolio of financial products and services, including:

- Checking and savings accounts  
- Investment products (gold, mutual funds, stocks, derivatives)  
...
"""

    response = client.models.generate_content(
        model=model_id,
        contents=prompt,
        config=GenerateContentConfig(
            tools=[google_search_tool],
            response_modalities=["TEXT"],
        )
    )

    candidates = response.candidates or []
    if not candidates or candidates[0].content is None:
        raise ValueError(f"No content returned for {company_name!r}")

    # The answer may be split across several parts; reading only parts[0]
    # would silently truncate it, so join every part that carries text.
    parts = candidates[0].content.parts or []
    text = "".join(
        part.text for part in parts if getattr(part, "text", None)
    )
    if not text:
        raise ValueError(f"Empty text returned for {company_name!r}")

    return text

In [None]:
def add_company(parent, company_name, product_desc):
    """Build the record stored for one subsidiary.

    Returns a dict with the parent's name under ``"parent_name"`` and a nested
    ``"info"`` dict holding the subsidiary's name and product description.
    """
    record = {"parent_name": parent}
    record["info"] = {
        "company_name": company_name,
        "product_description": product_desc,
    }
    return record

In [None]:
def add_companies(dataframe):
    """Create one placeholder record per subsidiary listed in ``dataframe``.

    Args:
        dataframe: Frame with an ``"info.company_name"`` column (the parent)
            and an ``"info.affiliations.subsidiaries"`` column holding a list
            of subsidiary names per row.

    Returns:
        A list of ``add_company`` records, each with ``product_description``
        set to ``None`` so it can be filled in later.
    """
    companies = []

    for _, row in dataframe.iterrows():
        parent = row["info.company_name"]
        subsidiaries = row["info.affiliations.subsidiaries"]

        # Skip rows whose cell is not a list (e.g. NaN/None for a record that
        # lacks the field): iterating a float raises TypeError, and iterating
        # a bare string would yield one "company" per character.
        if not isinstance(subsidiaries, (list, tuple)):
            continue

        companies.extend(
            add_company(parent, company_name, None)
            for company_name in subsidiaries
        )

    return companies

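## DO NOT RUN THE CELL BELOW — it rebuilds `subsidiary_companies_with_prod_descs.json` with every `product_description` reset to `None`, overwriting any descriptions already fetched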

In [None]:
"""
#!!!! DO NOT RUN THE CELL BELOW !!!!
companies = add_companies(data)

# this is a temporary file 
with open('../../data/subsidiary_companies_with_prod_descs.json', 'w', encoding="utf-8") as f:
    json.dump(companies, f, indent=2, ensure_ascii=False)
"""

In [None]:
def add_prod_descs(companies_path, api_key, max_iters,
                   output_path="../../data/subsidiary_companies_with_prod_descs.json",
                   max_retries=10, batch_size=15, batch_pause=60):
    """Fill in missing product descriptions and persist progress after each one.

    Companies whose ``product_description`` is already set are skipped, so the
    function can be re-run to resume where a previous run stopped.

    Args:
        companies_path: JSON file holding the list of company records.
        api_key: API key forwarded to ``get_prod_desc_with_search``.
        max_iters: Maximum number of descriptions to fetch in this call.
        output_path: File the updated list is written to after every fetch.
        max_retries: Attempts per company before giving up on it.
        batch_size: Pause after every ``batch_size`` successful fetches
            (``0`` disables the pause).
        batch_pause: Length of that pause in seconds.

    Returns:
        None. Results are written to ``output_path``.
    """
    with open(companies_path, 'r', encoding="utf-8") as f:
        companies = json.load(f)

    temp_output_path = output_path + ".tmp"
    iter_amount = 0

    for company in companies:
        info = company["info"]
        if info["product_description"] is not None:
            continue

        if iter_amount >= max_iters:
            break

        parent_name = company["parent_name"]
        company_name = info["company_name"]

        fetched = False
        for attempt in range(1, max_retries + 1):
            try:
                # Fetch at most once per company: if only the save below
                # fails, the retry must not call the API (or count) again.
                if not fetched:
                    info["product_description"] = get_prod_desc_with_search(
                        parent_name, company_name, api_key
                    )
                    iter_amount += 1
                    fetched = True

                # Write to a temp file and swap it in, so an interrupted
                # write cannot leave a truncated output file behind.
                with open(temp_output_path, "w", encoding="utf-8") as f:
                    json.dump(companies, f, indent=2, ensure_ascii=False)
                os.replace(temp_output_path, output_path)

                break
            except Exception as e:
                # Deliberately broad: this is a best-effort batch job, so any
                # failure is reported and retried with a growing delay.
                print(f"Error on {company_name}: {e}")
                time.sleep(attempt)

        # Pause only right after a fetch that completed a batch; otherwise a
        # company that failed every retry would trigger the pause again.
        if fetched and batch_size > 0 and iter_amount % batch_size == 0:
            time.sleep(batch_pause)



In [None]:
# Fetch up to 400 missing descriptions, reading the list from the JSON file.
add_prod_descs(
    companies_path="../../data/subsidiary_companies_with_prod_descs.json",
    api_key=api_key,
    max_iters=400,
)