In [113]:
import dotenv
import os
dotenv.load_dotenv()
import json

from pydantic import BaseModel
from typing import List, Dict, Tuple, Set

class Affiliation(BaseModel):
  name: str
  title: str

class ContactInfo(BaseModel):
  email: str
  phone: str

class Researcher(BaseModel):
  name_in_Chinese: str
  research_keywords: List[str]
  affiliations: List[Affiliation]
  contact_info: ContactInfo

Known limitations: 
1. Tool use with JSON output is not supported.  `ClientError: 400 INVALID_ARGUMENT. {'error': {'code': 400, 'message': "Tool use with a response mime type: 'application/json' is unsupported", 'status': 'INVALID_ARGUMENT'}}
`

## One-prompt, one-agent approach -- let the agent do the work behind the scene

In [105]:
prompt_template = """
Below is the name, affiliation, contact info of a researcher in English. The researcher likely has an affiliation in China.
Your job is to find the name, affiliation, and contact info of the same researcher in China.

Please first come up with a strategy. Then, iteratively generate web search queries to find the researcher in the Chinese-speaking world. After each search, you should refine or expand the search queries based on the results, including but not limited to extracting all names and affiliations from returned web pages and adding them to the search queries. You can search in both English and Chinese resources but the evidence to support the Chinese name and affiliation of the searcher must be in Chinese. Stop when you are confident that you have found the researcher in the Chinese-speaking world or no new results can be found in the latest 3 searches.

Note that some researchers may work for different institutions in the English-speaking world (e.g. University of ABC in United States) and the Chinese-speaking world (e.g. University of XYZ in China). Also note many researchers use different names, especialy first/given names, in different contextsl, e.g., a Western name in the English-speaking world and a Chinese name in the Chinese-speaking world.

Output in a JSON format that includes the name in Chinese and a list of affiliations and corresponding titles of the researcher in China like this:
{{
  "name_in_Chinese": "张三",
  "affiliations": [
    {{"name": "清华大学", "title": "教授"}}, 
    {{"name": "北京大学", "title": "副教授"}}
  ]
}}

If you cannot find the researcher, just say you don't know. 

Here are some ideas for you to get started:
{ideas}

Here is the information of the researcher to research:
{researcher_info}
"""

ideas = """
* Translate the affiliation to Chinese and search for it. 
* Search using the email address or phone number in Chinese resources especially scientific journals. 
* Different Chinese names can be transliterated or Romanized to the same name in English. Please try to come up some possible Chinese names, and then search for them. Doing so for the family/last name is easier than the first name. 
* Prioritize your search sources to Chinese-speaking websites.
* One way is to obtain all Chinese names of researchers matching the research topic and the affiliation, and then transliterate them to English to find the one that matches the English info of the researcher. 
* A researcher may move from one affiliation to another. But you can link two researchers if they share co-authors, especially when the co-authors are from the same institutions at the two occurences of the researcher. 
* Try to leaverage the co-author network/circle of the researcher. 
* A research may publish papers under multiple affiliations. Expand your search queries to include other affiliations appear in publications as long as they are related to the research topic. 
* From all pages you have found, extract all names and affiliations and add them to the search queries. 
"""


from google import genai
from google.genai import types
from google.genai.types import Tool, GenerateContentConfig, GoogleSearch

def call_gemini(prompt, model_id: str):
    client = genai.Client()

    google_search_tool = Tool(
        google_search = GoogleSearch()
    )

    response = client.models.generate_content(
        model=model_id,
        contents=prompt,
        config= {
           "tools": [google_search_tool],
           "response_modalities": ["TEXT"],
           "thinking_config": types.ThinkingConfig(thinking_budget=-1), 
           "system_instruction": "You are a cross-lingual customer discovery agent", 
          #  "candidateCount": 3, # BUG: Why does Gemini only return 1 candidate?
          #  "response_mime_type": "application/json", # Tool use with JSON output is not supported in Gemini
        }
    )
    return response

def calculate_cost(response, model_id: str):
  web_search_queries = response.candidates[0].grounding_metadata.web_search_queries
  
  pricing = { # per 1M tokens
    "gemini-2.5-flash": 
      {"input": 0.30, "output": 2.5, "cache": 0.075, "web_search": 0.035},
    "gemini-2.5-pro":
      {"input": 1.25, "output": 10, "cache": 0.31, "web_search": 0.035},
  }

  pricing_for_model = pricing[model_id]

  cost = 0
  cost += pricing_for_model["input"] * response.usage_metadata.prompt_token_count / 1000000
  cost += pricing_for_model["output"] * (response.usage_metadata.candidates_token_count + response.usage_metadata.thoughts_token_count) / 1000000
  num_cached_tokens = 0 if response.usage_metadata.cached_content_token_count is None else response.usage_metadata.cached_content_token_count
  cost += pricing_for_model["cache"] * num_cached_tokens / 1000000
  cost += pricing_for_model["web_search"] * len(web_search_queries) / 1000
  
  return cost

def one_prompt_approach(prompt_template: str, ideas: str, researcher_info: str, model_id: str):
  prompt = prompt_template.format(researcher_info=researcher_info, ideas=ideas)
  response = call_gemini(prompt, model_id)

  for candidate in response.candidates:
    print ("One candidate:")
    try:
      for each in candidate.content.parts:
          print(each.text)
    except:
      print ("Error: in candidate.content.parts. See below:")
      print (candidate.content.parts)
    print ( "=================")

  cost = calculate_cost(response, model_id)

  the_other_model_id = {"gemini-2.5-flash": "gemini-2.5-pro", "gemini-2.5-pro": "gemini-2.5-flash"}[model_id]
  print (f"Cost of using {model_id}: ${cost:.6f}")
  print (f"The cost would have been ${calculate_cost(response, the_other_model_id):.6f} if using {the_other_model_id}")
  
  return response

In [90]:
# Try Gemini Flash
researcher_info = """Professor David Z. Zhu, Department of Civil and Environmental Engineering, University of Alberta, Edmonton, AB, T6G 1H9, Canada"""
model_id = "gemini-2.5-flash"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)

One candidate:
{
  "name_in_Chinese": "朱志伟",
  "affiliations": [
    {
      "name": "宁波大学",
      "title": "教授, 博士生导师"
    },
    {
      "name": "宁波大学海洋工程研究院",
      "title": "院长"
    },
    {
      "name": "浙江大学",
      "title": "永谦讲座教授"
    }
  ]
}
Cost of using gemini-2.5-flash: $0.004002
The cost would have been $0.015415 if using gemini-2.5-pro


In [92]:
# Try Gemini Pro
researcher_info = """Professor David Z. Zhu, Department of Civil and Environmental Engineering, University of Alberta, Edmonton, AB, T6G 1H9, Canada"""
model_id = "gemini-2.5-pro"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)

One candidate:
My strategy is to find the Chinese name and affiliation of Professor David Z. Zhu. I will start by searching for his English name along with his affiliation and translating these into Chinese to search on Chinese websites. I will look for his publications, co-authors, and any profiles on Chinese academic platforms. I will iteratively refine my searches based on the information I find, such as potential Chinese names or associated institutions in China. My goal is to find conclusive evidence in Chinese to identify his name and affiliation in the Chinese-speaking world.
{
  "name_in_Chinese": "朱志伟",
  "affiliations": [
    {
      "name": "宁波大学",
      "title": "教授"
    },
    {
      "name": "浙江大学",
      "title": "客座教授"
    },
    {
      "name": "阿尔伯塔大学",
      "title": "名誉教授"
    }
  ]
}
Cost of using gemini-2.5-pro: $0.009459
The cost would have been $0.002487 if using gemini-2.5-flash


In [None]:
# Try Gemini Flash on another random researcher
# This time, only name and research areas
researcher_info = """gu	aijuan	Photocatalysis,Oxides,Adsorption,Organic reactions,Catalytic activity"""
model_id = "gemini-2.5-flash"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)

One candidate:
The initial search provided very promising results.
Specifically, and directly link "Aijuan GU" with "Soochow University, Suzhou (SUDA)" and "College of Materials Science, Chemistry and Chemical Engineering". It also mentions her as a PhD.

Additionally, lists an email `ajgu@suda.edu.cn` which confirms the Soochow University affiliation.
 also lists "Jiangsu Key Laboratory of Advanced Functional Polymer Design and Application, Department of Materials Science and Engineering, College of Chemistry, Chemical Engineering and Materials Science, Soochow University, Suzhou, 215123, P. R. China" with "Aijuan Gu" as an author.

Now I need to confirm the Chinese name. The snippet is very useful: "[GU, Aijuan 顾嫒娟, CAI, Hua 蔡华, LIANG, Guozheng 梁国正, YUAN, Li 袁莉]". This clearly provides the Chinese name "顾嫒娟" for Aijuan Gu.

Therefore, I have:
*   **Name in Chinese:** 顾嫒娟
*   **Affiliation:** Soochow University (苏州大学)
*   **Department/College:** College of Materials Science, Chemistry

In [101]:
# Try again on yet another random researcher
researcher_info = """ding	congcong	Chemistry, Thermal conductivity, Thermoelectrics, Solar cells, Electrical conductivity, Perovskites"""
model_id = "gemini-2.5-flash"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)

One candidate:
Error: in candidate.content.parts. See below:
None
Cost of using gemini-2.5-flash: $0.006298
The cost would have been $0.023965 if using gemini-2.5-pro


In [95]:
# Try again on yet another random researcher
researcher_info = """hua chen	jian	Palladium,Layers,Conjugated polymers,Light,Perovskites
"""
model_id = "gemini-2.5-flash"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)

One candidate:
```json
{
  "name_in_Chinese": "陈建华",
  "affiliations": [
    {
      "name": "云南大学化学科学与工程学院",
      "title": "副研究员"
    },
    {
      "name": "云南大学化学科学与工程学院",
      "title": "硕士生导师"
    }
  ],
  "contact_info": {
    "email": "chenjianhua@ynu.edu.cn",
    "phone": "17773983571"
  }
}
```
Cost of using gemini-2.5-flash: $0.010385
The cost would have been $0.040318 if using gemini-2.5-pro


In [103]:
# Try yet another random researcher
researcher_info = """tao	jiayou				Photonics,Sensors,Layers,Two dimensional materials,Perovskites"""
model_id = "gemini-2.5-flash"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)

One candidate:
The initial search has yielded promising results. Several papers authored by "Tao Jiayou" mention affiliations in China.

From the search results:
*   **Affiliation 1:** School of Physics and Optoelectronics, Xiangtan University, Xiangtan, 411105, People's Republic of China
*   **Affiliation 2:** Key Laboratory of Hunan Province on Information Photonics and Free-Space Optical Communications, School of Physics and Electronic Science, Hunan Institute of Science and Technology, Yueyang 414006, People's Republic of China
*   **Affiliation 3:** School of Mathematics and Physics, Wuhan Institute of Technology, Guanggu 1st Road 206, Wuhan, 430205, China

It seems "Tao Jiayou" has multiple affiliations, which is common for researchers. The Key Laboratory affiliation appears twice, suggesting a stronger connection.

Now, I need to find the Chinese name of "Tao Jiayou" and confirm the affiliations and titles in Chinese.
I will use the found affiliations and the English name to sea

In [104]:
# Try on yet another random researcher
researcher_info = """Yifeng Wang College of Materials Science and Engineering, Nanjin Tech University, Nanjing, Jiangsu, 210009, China"""
model_id = "gemini-2.5-flash"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)


One candidate:
The search results provide strong evidence for the researcher.

Specifically, search result and are faculty lists for the "College of Materials Science and Engineering" (材料科学与工程学院) at "Nanjing Tech University" (南京工业大学). Both list "王一峰" (Wang Yifeng) as a professor (教授) in the "Materials Physics and Chemistry Department" (材料物理与化学系) and "Inorganic Materials and Engineering Department" (无机材料与工程系) respectively.

Even more definitively, search result is a personal page for "王一峰" on the "College of Materials Science and Engineering" website of Nanjing Tech University. It clearly states his name as 王一峰, his title as Professor (教授) and Doctoral Supervisor (博导), and his email as yifeng.wang@njtech.edu.cn. It also confirms his work experience at Nanjing Tech University's College of Materials Science and Engineering since 2012, first as an Associate Professor and then as a Professor from 2013.

Search result from Aisco Network also lists "王一峰" as an Associate Professor (副教授) at "Na

## Multi-agent approach

Gemini does not seem to be able to do many tasks at once. For example, it cannot consistently generate a JSON string in text-output mode -- cannot do JSON output when tools are used. Some instrctions are not properly followed, such as "do not tell me how you think but just give me the result". 

So let's create individual agents for each task. 

* Agent 1: Web search with Gemini's own query rewriting and thinking
* Agent 2: Check the quality of the web search results
* Agent 3: Generate the final JSON output
* Main function: coordinate the work of the three agents. Need to obtain at least three candidates or three consistent "I cannot find" responses.

In [115]:
class InfoGatherer:
  def __init__(self, model_id: str):
    self.system_prompt = "You are a helpful cross-lingual customer discovery agent for the chemical reagent market in China."

    self.user_prompt = """Given the name, affiliation, research keywords (optional), and contact info (optional) of a researcher extracted from scientific papers in English, crawl the web to find the corresponding info of the same researcher in Chinese. The researcher most likely has at least one affiliation in China.

    Please iteratively search the web. First, come up with a search query. Then, search the web. After each round, you should analyze the results, then think, and then refine or expand the search query for the next round. Stop when you are confident that you have found the researcher in the Chinese-speaking world or no new results can be found.

    Some researchers may have multiple affiliations, some in China and some in other countries. So do not rely solely on the affiliation provided in the input.
    
    It is common for a researcher to use a name in the English-speaking world that is not a direct transliteration or romanization of his/her Chinese name. For example, nvidia CEO Jensen Huang's Chinese name is 黃仁勳 and is romanized as Jen-Hsun Huang.

    You can search in both English and Chinese resources, but you must find Chinese sources (can be part of an English-dominant resources) to support the name and affiliation of the researcher.

    In the response, explain your thought process and the results of each round of search. Finally, give the final result.
    If you cannot find the researcher, just say I don't know.     

    Here are some ideas to build or expand the search queries but please do not limit yourself to these ideas:
    {query_building_ideas}

    Here is the information of the researcher to find:
    {researcher_info}
    """

    self.query_building_ideas = """
    * Translate the affiliation to Chinese and search for it. 
    * Search using the email address or phone number in Chinese resources especially scientific journals. 
    * Different Chinese names can be transliterated or Romanized to the same name in English. Please try to come up some possible Chinese names, and then search for them. Doing so for the family/last name is easier than the first name. 
    * Prioritize your search sources to Chinese-speaking websites. Also try to search as much as possible from scientific literature sources such as Google Scholar, PubMed, CNKI, etc.
    * A researcher may move from one affiliation to another. But you can link two researchers if they share co-authors, especially when the co-authors are from the same institutions at the two occurences of the researcher. 
    * Try to leaverage the co-author network/circle of the researcher. 
    * A research may publish papers under multiple affiliations. Expand your search queries to include other affiliations appear in publications as long as they are related to the research topic. 
    * From all pages you have found, extract all names and affiliations and add them to the search queries. 
    """

    self.model_id = model_id

  def call_gemini(self, prompt):
    client = genai.Client()

    google_search_tool = Tool(
        google_search = GoogleSearch()
    )

    response = client.models.generate_content(
        model=self.model_id,
        contents=prompt,
        config= {
           "tools": [google_search_tool],
           "response_modalities": ["TEXT"],
           "thinking_config": types.ThinkingConfig(thinking_budget=-1), 
           "system_instruction": self.system_prompt
        }
    )
    return response

  def calculate_cost(self, response):
    web_search_queries = response.candidates[0].grounding_metadata.web_search_queries
    
    pricing = { # per 1M tokens
      "gemini-2.5-flash": 
        {"input": 0.30, "output": 2.5, "cache": 0.075, "web_search": 0.035},
      "gemini-2.5-pro":
        {"input": 1.25, "output": 10, "cache": 0.31, "web_search": 0.035},
    }

    model_id = self.model_id

    pricing_for_model = pricing[model_id]

    cost = 0
    cost += pricing_for_model["input"] * response.usage_metadata.prompt_token_count / 1000000
    cost += pricing_for_model["output"] * (response.usage_metadata.candidates_token_count + response.usage_metadata.thoughts_token_count) / 1000000
    num_cached_tokens = 0 if response.usage_metadata.cached_content_token_count is None else response.usage_metadata.cached_content_token_count
    cost += pricing_for_model["cache"] * num_cached_tokens / 1000000
    cost += pricing_for_model["web_search"] * len(web_search_queries) / 1000
    
    return cost

  def one_prompt_approach(self, researcher_info: str):
    prompt = self.user_prompt.format(researcher_info=researcher_info, query_building_ideas=self.query_building_ideas)

    response = self.call_gemini(prompt)
 
    try:
      thoughts = [part.text for part in response.candidates[0].content.parts]
    except:
      print ("Error: in candidate.content.parts. ")
      thoughts = []


    cost = calculate_cost(response, model_id)

    the_other_model_id = {"gemini-2.5-flash": "gemini-2.5-pro", "gemini-2.5-pro": "gemini-2.5-flash"}[model_id]
    # print (f"Cost of using {model_id}: ${cost:.6f}")
    # print (f"The cost would have been ${calculate_cost(response, the_other_model_id):.6f} if using {the_other_model_id}")
    
    return thoughts, cost

In [None]:
# try Gemini Flash with the InfoGatherer
info_gatherer = InfoGatherer(model_id="gemini-2.5-flash")
thoughts, cost = info_gatherer.one_prompt_approach("hua chen	jian	Palladium,Layers,Conjugated polymers,Light,Perovskites")

In [117]:
# Try again on another researcher
thoughts, cost = info_gatherer.one_prompt_approach("Yifeng Wang College of Materials Science and Engineering, Nanjin Tech University, Nanjing, Jiangsu, 210009, China")

In [168]:
class JSONGenerator:
  def __init__(self):
    self.system_prompt = """You are a helpful assistant that generates a JSON string from a list of thoughts."""
    self.user_prompt = """Given a search query to find a researcher and the web search results and the thought process of the search agent, please extract the name, research keywords, affiliations, and contact info of the researcher that are in Chinese. The web search results may contain errors. If the query contains a pinyin, transliterated, or romanized version of a Chinese name, please only process results in thoughts that contain Chinese names whose pinyin, transliteration, or romanization match the name in the query.

    Here is the query used to find the researcher:
    {query}

    Below are the thoughts from which the JSON string will be extracted:
    {thoughts}
    """

  def call_gemini(self, query: str, thoughts: List[str]) -> str:
    client = genai.Client()
    prompt = self.user_prompt.format(query=query, thoughts=thoughts)

    response = client.models.generate_content(
      model="gemini-2.5-flash",
      contents=prompt,
      config= {
          # "response_modalities": ["TEXT"],
          # "thinking_config": types.ThinkingConfig(thinking_budget=-1), 
          "system_instruction": self.system_prompt,
          "response_mime_type": "application/json",
          "response_schema": Researcher
      }
    )
    researcher = json.loads(response.text)
    return researcher


In [176]:
info_gatherer = InfoGatherer(model_id="gemini-2.5-flash")
json_generator = JSONGenerator()

query = "hua chen	jian	Palladium,Layers,Conjugated polymers,Light,Perovskites"
thoughts, cost = info_gatherer.one_prompt_approach(query)
r = json_generator.call_gemini(query, thoughts)
print (json.dumps(r, indent=2, ensure_ascii=False))

Error: in candidate.content.parts. 
{
  "name_in_Chinese": "",
  "research_keywords": [],
  "affiliations": [],
  "contact_info": {
    "email": "",
    "phone": ""
  }
}


In [174]:
# try again on another researcher
query = "Yifeng Wang College of Materials Science and Engineering, Nanjin Tech University, Nanjing, Jiangsu, 210009, China"
thoughts, cost = info_gatherer.one_prompt_approach(query)
r = json_generator.call_gemini(query, thoughts)
print (json.dumps(r, indent=2, ensure_ascii=False))

{
  "name_in_Chinese": "王一峰",
  "research_keywords": [],
  "affiliations": [
    {
      "name": "南京工业大学 材料科学与工程学院",
      "title": "教授, 博士生导师"
    }
  ],
  "contact_info": {
    "email": "yifeng.wang@njtech.edu.cn",
    "phone": ""
  }
}
