In [4]:
import dotenv
import os
dotenv.load_dotenv()
import json

from pydantic import BaseModel
from typing import List, Dict, Tuple, Set

from google import genai
from google.genai import types
from google.genai.types import Tool, GenerateContentConfig, GoogleSearch


# One affiliation entry of a researcher; used as the element type of
# Researcher.affiliations.
# NOTE: documented with `#` comments rather than a class docstring so that the
# schema pydantic derives from this model is left exactly as it was.
class Affiliation(BaseModel):
  name: str  # name of the affiliated institution
  title: str  # presumably the researcher's title/position at that institution

# Contact details of a researcher; used as the type of Researcher.contact_info.
# Both fields are required strings (no default, no Optional).
class ContactInfo(BaseModel):
  email: str  # email address
  phone: str  # phone number

# Structured record of one researcher. JSONGenerator.call_gemini passes this
# class as the `response_schema` of its Gemini request, so the field names and
# types below define the shape of the JSON that the model is asked to return.
class Researcher(BaseModel):
  name_in_Chinese: str  # the researcher's name written in Chinese characters
  research_keywords: List[str]  # research keywords/topics
  affiliations: List[Affiliation]  # one entry per affiliation
  contact_info: ContactInfo  # email / phone

## Multi-agent approach

Gemini does not seem to be able to do many tasks at once. For example, it cannot consistently generate a JSON string in text-output mode, and structured JSON output cannot be enabled when tools are used. Some instructions are also not properly followed, such as "do not tell me how you think but just give me the result".

So let's create individual agents for each task. 

* Agent 1: Web search with Gemini's own query rewriting and thinking
* Agent 2: Check the quality of the web search results
* Agent 3: Generate the final JSON output
* Main function: coordinate the work of the three agents. Need to obtain at least three candidates or three consistent "I cannot find" responses.

In [22]:
class InfoGatherer:
  """Agent that searches the web for the Chinese name and affiliations of a researcher.

  The agent sends one large prompt to a Gemini model with the Google Search
  tool enabled and returns the text parts of the response together with an
  estimate of what the call cost.

  Attributes:
    model_id: Gemini model name; must be a key of ``PRICING`` for
      ``calculate_cost`` to work.
    system_prompt: system instruction sent with every request.
    user_prompt: prompt template with ``{strategies}``, ``{output_format}``
      and ``{researcher_info}`` placeholders.
    output_format_markdown / output_format_json: text substituted for
      ``{output_format}``.
    strategies: text substituted for ``{strategies}``.
  """

  # USD prices. "input", "output" and "cache" are per 1M tokens.
  # NOTE(review): "web_search" is divided by 1000 per query in calculate_cost.
  # If 0.035 is already the price of a single grounded request ($35 per 1,000),
  # that division understates the search cost -- confirm against the current
  # Gemini API pricing page before relying on the number.
  PRICING = {
    "gemini-2.5-flash":
      {"input": 0.30, "output": 2.5, "cache": 0.075, "web_search": 0.035},
    "gemini-2.5-pro":
      {"input": 1.25, "output": 10, "cache": 0.31, "web_search": 0.035},
  }

  def __init__(self, model_id: str):
    """Store the model name and build the prompt templates (no network access)."""
    self.system_prompt = "You are a helpful cross-lingual customer discovery agent."

    self.user_prompt = """
    ## Background and overview

    Many researchers who are China-based or have affiliations in China publish papers in English. On those papers, their names are not in Chinese characters. For example, their names are transliterated or romanized, such as in Pinyin or Wade-Giles. The goal here is to find their names in Chinese characters.
    
    ### Challenges
    
    1. Mapping from such transliterated/romanized names to their Chinese names is not a simple machine translation task due to the homophony of Chinese characters and different transliteration/romanization methods. There is no one-to-one mapping. 
    2. Many researchers use names in the English-speaking world that are not a direct transliteration or romanization of their Chinese name. For example, nvidia CEO Jensen Huang's Chinese name is 黃仁勳 whose romanization is Jen-Hsun Huang. As another example, Forrest Bao's Chinese given name has nothing to do with and cannot be transliterated to "Forrest". So do not rule out a name that does not match the transliterated/romanized name.
    3. Reseachers move from one affiliation to another. So do not rule out a researcher that does not match the affiliation provided in the input. Two researchers may be the same person if they share co-authors, especially when the co-authors are from the same institutions at the two occurences of the researcher.
    4. It is possible for a researcher to have multiple affiliations, some in China and some in other countries. S/he can have multiple affliations in China and multiple affliations in other countries. S/he may have multiple affiliations on one paper. So do not rely solely on the affiliation provided in the input.

    ### Strategies

    Please iteratively search the web. First, come up with an initial search query. Then, search the web. After each round, you should analyze the results, then think, and then refine or expand the search query for the next round. Stop when you are confident that you have found the researcher in the Chinese-speaking world or no new results can be found.

    Here are some ideas to build or expand the search queries but please do not limit yourself to these ideas:

    {strategies}

    ### Knowledge about Chinese names that may help you

    1. While there is no one-to-one mapping from transliterated/romanized names to Chinese names, the mapping of last/family Chinese names are more finite than the first/given names. For example, if you see "Huang" or "Wong" at the beginning or the end of someone's name, it is highly likely to be the last/family name and the character is almost certain to be 黄, 汪, 翁, or 王.
    2. In the transliteration/romanization of a Chinese name, the last/family name usually appear as the first word or the last word -- either case is common. A word or token in the middle of a transliterated/romanized Chinese name is mostly unlikely to be a last/family name.
    3. If someone's name in the English-speaking world is in the form of "a Western name + a Chinese name", the Chinese name is most likely the last/family name.

    ### Qualification

    1. You can search in both English and Chinese resources, but the Chinese name of a researcher must appear in Chinese sources (can be part of an English-dominant resources) to support the name and affiliation of the researcher.
    2. It is possible that no information in Chinese can support the name and affiliation of the researcher. In such case, you should say I don't know. Do not use results of researchers of similar names.
    3. There is no misspelling of the name or affiliation provided in the input. 

    ### Output format

    {output_format}    

    Now, lets begin! Below is the information of the researcher to find:
    {researcher_info}
    """

    self.output_format_markdown = """
    
    Output the name and affiliation of the researcher in the following markdown format:
    * Chinese name 1
    * Affiliation 1
      - quote in Chinese source
      - title (if possible)
    * Affiliation 2
      - quote in Chinese source
      - title (if possible)
    ...

    For each name and affiliation, please provide a quote from the Chinese source to support it.   

    """

    # This template is inserted into user_prompt as a *value* of str.format and
    # is never formatted itself, so its braces must be single: doubled braces
    # would reach the model literally as "{{" and "}}".
    self.output_format_json = """

    Output the name and affiliation of the researcher in the following json format:

    {
      "name": "Name of the researcher",
      "affiliations": [
        {
          "name": "Affiliation name",
          "title": "Title of the researcher in this affiliation (if possible)",
          "quote": "Quote from the Chinese source showing this researcher is employed/affiliated with this institution"
        }
      ]
    }
    """

    self.strategies = """
    * Translate the affiliation to Chinese and search for it. 
    * Search using the email address or phone number in Chinese resources especially scientific journals. 
    * Different Chinese names can be transliterated or Romanized to the same name in English. Please try to come up some possible Chinese names, and then search for them. Doing so for the family/last name is easier than the first name. 
    * Prioritize your search sources to Chinese-speaking websites. Also try to search as much as possible from scientific literature sources such as Google Scholar, PubMed, CNKI, etc.
    * A researcher may move from one affiliation to another. But you can link two researchers if they share co-authors, especially when the co-authors are from the same institutions at the two occurences of the researcher. 
    * Leverage the co-author network/circle of the researcher. You may find co-author network from Semantic Scholar, Google Scholar, etc.
    * A research may publish papers under multiple affiliations. Expand your search queries to include other affiliations appear in publications as long as they are related to the research topic. 
    * From all pages you have found, extract all names and affiliations and add them to the search queries. 
    """

    self.model_id = model_id

  def call_gemini(self, prompt):
    """Send ``prompt`` to ``self.model_id`` with the Google Search tool enabled.

    A new ``genai.Client`` is created on every call. The request asks for text
    output, passes ``thinking_budget=-1`` in the thinking config and uses
    ``self.system_prompt`` as the system instruction.

    Returns:
      The response object returned by ``client.models.generate_content``.
    """
    client = genai.Client()

    google_search_tool = Tool(
        google_search = GoogleSearch()
    )

    response = client.models.generate_content(
        model=self.model_id,
        contents=prompt,
        config= {
           "tools": [google_search_tool],
           "response_modalities": ["TEXT"],
           "thinking_config": types.ThinkingConfig(thinking_budget=-1), 
           "system_instruction": self.system_prompt
        }
    )
    return response

  def calculate_cost(self, response):
    """Estimate the USD cost of one response from its usage metadata.

    The estimate is the sum of: prompt tokens at the input price, candidate
    plus thought tokens at the output price, cached tokens at the cache price,
    and one web-search charge per query listed in the grounding metadata of the
    first candidate. Any count the API reports as ``None`` (or any missing
    candidate / grounding metadata) is treated as zero instead of raising.

    Raises:
      KeyError: if ``self.model_id`` has no entry in ``PRICING``.
    """
    pricing_for_model = self.PRICING[self.model_id]

    def count(name):
      # usage_metadata fields are optional; None means "not reported".
      value = getattr(response.usage_metadata, name, None)
      return 0 if value is None else value

    # The model may answer without searching, in which case there is no
    # grounding metadata (or no query list) to count.
    candidates = response.candidates or []
    grounding = candidates[0].grounding_metadata if candidates else None
    web_search_queries = (grounding.web_search_queries if grounding is not None else None) or []

    cost = 0
    cost += pricing_for_model["input"] * count("prompt_token_count") / 1000000
    cost += pricing_for_model["output"] * (count("candidates_token_count") + count("thoughts_token_count")) / 1000000
    cost += pricing_for_model["cache"] * count("cached_content_token_count") / 1000000
    cost += pricing_for_model["web_search"] * len(web_search_queries) / 1000

    return cost

  def one_prompt_approach(self, researcher_info: str, output_format: str = "markdown") -> Tuple[List[str], float]:
    """Run the whole search as a single prompt.

    Args:
      researcher_info: free-text description of the researcher to find.
      output_format: ``"markdown"`` (default) or ``"json"``; selects which
        output-format instructions are inserted into the prompt.

    Returns:
      ``(thoughts, cost)`` where ``thoughts`` is the list of text strings of the
      first candidate's content parts (parts without text are skipped; the list
      is empty if the response has no usable parts) and ``cost`` is the value of
      ``calculate_cost`` for the response.

    Raises:
      ValueError: if ``output_format`` is not one of the supported values.
    """
    output_format_prompts = {
      "json": self.output_format_json,
      "markdown": self.output_format_markdown,
    }
    if output_format not in output_format_prompts:
      raise ValueError(f"Invalid output format: {output_format}")

    prompt = self.user_prompt.format(
      researcher_info=researcher_info,
      strategies=self.strategies,
      output_format=output_format_prompts[output_format],
    )

    response = self.call_gemini(prompt)

    try:
      parts = response.candidates[0].content.parts
      # Non-text parts have text=None, which would break "".join(thoughts).
      thoughts = [part.text for part in parts if part.text is not None]
    except (AttributeError, IndexError, TypeError):
      # candidates / content / parts can be missing or None; keep going with
      # an empty result but do not hide unrelated errors or interrupts.
      print ("Error: in candidate.content.parts. ")
      thoughts = []

    cost = self.calculate_cost(response)

    return thoughts, cost

In [23]:
# Try again on another researcher
# Runs the single-prompt search with JSON output instructions and prints the
# concatenated text parts of the response. This performs a live Gemini call.
# NOTE(review): "NanjLin Tech University" looks like a misspelling of
# "Nanjing Tech University", while the prompt tells the model the input has no
# misspellings -- confirm whether the typo is intentional.
info_gatherer = InfoGatherer(model_id="gemini-2.5-flash")
researcher_info = "Yifeng Wang College of Materials Science and Engineering, NanjLin Tech University, Nanjing, Jiangsu, 210009, China"
thoughts, cost = info_gatherer.one_prompt_approach(researcher_info=researcher_info, output_format="json")
print("".join(thoughts))

I have successfully identified the Chinese name and affiliation of the researcher, Yifeng Wang.

Here is the information in the requested format:

```json
{
  "name": "王一峰",
  "affiliations": [
    {
      "name": "南京工业大学 材料科学与工程学院",
      "title": "教授",
      "quote": "师资队伍总名单. ... 材料物理与化学系. 教授. ... 王一峰 [2]"
    }
  ]
}
```


In [26]:
# Try again on another researcher
# Same single-prompt search with JSON output instructions; the input here gives
# a Western given name and a non-Chinese affiliation. Performs a live Gemini call.
info_gatherer = InfoGatherer(model_id="gemini-2.5-flash")
thoughts, cost = info_gatherer.one_prompt_approach("Professor David Z. Zhu, Department of Civil and Environmental Engineering, University of Alberta, Edmonton, AB, T6G 1H9, Canada", output_format="json")
print("".join(thoughts))

```json
{
  "name": "朱志伟 (David Z. Zhu)",
  "affiliations": [
    {
      "name": "宁波大学 (Ningbo University)",
      "title": "教授, 博士生导师, 加拿大工程院院士 (Professor, Doctoral Supervisor, Fellow of Canadian Academy of Engineering)",
      "quote": "朱志伟博士 Prof. David Z. Zhu 个人简介：宁波大学教授，博士生导师，加拿大工程院院士，原加拿大Alberta大学土木与环境工程系教授"
    },
    {
      "name": "University of Alberta",
      "title": "Professor Emeritus, Department of Civil and Environmental Engineering",
      "quote": "朱志伟博士 Prof. David Z. Zhu 个人简介：… 原加拿大Alberta大学土木与环境工程系教授"
    }
  ]
}
```


In [None]:
# try Gemini Flash with the InfoGatherer
# The input is a tab-separated record (two name tokens followed by a
# comma-separated keyword list) with no affiliation. Prints the concatenated
# response text and the estimated cost returned by one_prompt_approach.
info_gatherer = InfoGatherer(model_id="gemini-2.5-flash")
researcher_info = """gu	aijuan	Photocatalysis,Oxides,Adsorption,Organic reactions,Catalytic activity"""
thoughts, cost = info_gatherer.one_prompt_approach(researcher_info, output_format="json")
print ("".join(thoughts))
print ("Cost: ", cost)

Thoughts:  ```json
{
  "name": "顾爱娟",
  "affiliations": [
    {
      "name": "苏州大学",
      "title": "研究员/发明人",
      "quote": "发明人 顾爱娟...专利权人 苏州大学. 地址215123 江苏省苏州市苏州工业园区仁爱路199号."
    }
  ]
}
```
Cost:  0.0044104999999999995


In [None]:
class JSONGenerator:
  """Agent that turns the free-text findings of the search agent into structured JSON.

  The request uses Gemini's JSON response mode with ``Researcher`` as the
  response schema and no tools.

  Attributes:
    model_id: Gemini model used for the extraction.
    system_prompt: system instruction sent with every request.
    user_prompt: prompt template with ``{query}`` and ``{thoughts}`` placeholders.
  """

  def __init__(self, model_id: str = "gemini-2.5-flash"):
    """Store the model name and build the prompt templates (no network access)."""
    self.model_id = model_id
    self.system_prompt = """You are a helpful assistant that generates a JSON string from a list of thoughts."""
    self.user_prompt = """Given a search query to find a researcher and the web search results and the thought process of the search agent, please extract the name, research keywords, affiliations, and contact info of the researcher that are in Chinese. The web search results may contain errors. If the query contains a pinyin, transliterated, or romanized version of a Chinese name, please only process results in thoughts that contain Chinese names whose pinyin, transliteration, or romanization match the name in the query.

    Here is the query used to find the researcher:
    {query}

    Below are the thoughts from which the JSON string will be extracted:
    {thoughts}
    """

  def call_gemini(self, query: str, thoughts: List[str]) -> Dict:
    """Ask Gemini to extract a ``Researcher``-shaped record from ``thoughts``.

    Args:
      query: the researcher description that was given to the search agent.
      thoughts: text fragments produced by the search agent; empty or ``None``
        entries are ignored.

    Returns:
      The model's JSON answer decoded with ``json.loads`` (a dict following the
      ``Researcher`` schema).

    Raises:
      ValueError: if the response carries no text, or the text is not valid
        JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    client = genai.Client()

    # Join the fragments into plain text; formatting the list itself would put
    # its Python repr (brackets, quotes, escaped newlines) into the prompt.
    thoughts_text = "\n".join(thought for thought in thoughts if thought)
    prompt = self.user_prompt.format(query=query, thoughts=thoughts_text)

    response = client.models.generate_content(
      model=self.model_id,
      contents=prompt,
      config= {
          "system_instruction": self.system_prompt,
          "response_mime_type": "application/json",
          "response_schema": Researcher
      }
    )

    if not response.text:
      raise ValueError("Gemini returned an empty response; cannot parse researcher JSON.")

    researcher = json.loads(response.text)
    return researcher


In [None]:
# Two-agent pipeline: InfoGatherer searches the web and returns free text,
# JSONGenerator then extracts a Researcher-shaped dict from that text.
info_gatherer = InfoGatherer(model_id="gemini-2.5-flash")
json_generator = JSONGenerator()

query = "hua chen	jian	Palladium,Layers,Conjugated polymers,Light,Perovskites"
# output_format must be passed explicitly: one_prompt_approach declares it as a
# required argument. Markdown is used because the JSON is produced by the
# second agent, not by the search agent.
thoughts, cost = info_gatherer.one_prompt_approach(query, output_format="markdown")
r = json_generator.call_gemini(query, thoughts)
print (json.dumps(r, indent=2, ensure_ascii=False))

Error: in candidate.content.parts. 
{
  "name_in_Chinese": "",
  "research_keywords": [],
  "affiliations": [],
  "contact_info": {
    "email": "",
    "phone": ""
  }
}


In [None]:
# try again on another researcher
# Reuses the `info_gatherer` and `json_generator` objects created in the
# preceding cell.
query = "Yifeng Wang College of Materials Science and Engineering, Nanjin Tech University, Nanjing, Jiangsu, 210009, China"
# output_format must be passed explicitly: one_prompt_approach declares it as a
# required argument.
thoughts, cost = info_gatherer.one_prompt_approach(query, output_format="markdown")
r = json_generator.call_gemini(query, thoughts)
print (json.dumps(r, indent=2, ensure_ascii=False))

{
  "name_in_Chinese": "王一峰",
  "research_keywords": [],
  "affiliations": [
    {
      "name": "南京工业大学 材料科学与工程学院",
      "title": "教授, 博士生导师"
    }
  ],
  "contact_info": {
    "email": "yifeng.wang@njtech.edu.cn",
    "phone": ""
  }
}
