In [None]:
import dotenv
import os
dotenv.load_dotenv()
import json

from pydantic import BaseModel
from typing import List, Dict, Tuple, Set

from google import genai
from google.genai import types
from google.genai.types import Tool, GenerateContentConfig, GoogleSearch


# One affiliation of a researcher: an institution plus the researcher's title there.
# The field names mirror the "name"/"title" keys of the JSON example in the detailed prompt.
class Affiliation(BaseModel):
  name: str  # institution name, e.g. a university
  title: str  # the researcher's title/position at that institution

# Contact details of a researcher; both fields are required strings.
class ContactInfo(BaseModel):
  email: str  # email address
  phone: str  # phone number, kept as free-form text

# Structured description of a researcher as found in Chinese-language sources.
# NOTE(review): not referenced by the visible cells -- presumably intended as a
# response schema, which Gemini cannot combine with tool use; confirm.
class Researcher(BaseModel):
  name_in_Chinese: str  # the researcher's name written in Chinese characters
  research_keywords: List[str]  # research topics/keywords
  affiliations: List[Affiliation]  # every affiliation, each with its title
  contact_info: ContactInfo  # email and phone

# Known limitations of Gemini: 
1. Tool use with JSON output is not supported. The API fails with: `ClientError: 400 INVALID_ARGUMENT. {'error': {'code': 400, 'message': "Tool use with a response mime type: 'application/json' is unsupported", 'status': 'INVALID_ARGUMENT'}}`

## One-prompt, one-agent approach -- let the agent do the work behind the scenes

In [None]:
def call_gemini(prompt, model_id: str):
    """Ask Gemini to answer `prompt`, with the Google Search tool enabled.

    Args:
        prompt: Content passed as `contents` to the model.
        model_id: Name of the Gemini model to call.

    Returns:
        The response object returned by `generate_content`, unmodified.
    """
    gemini = genai.Client()

    # Options that were tried and left disabled:
    #   "candidateCount": 3 -- Gemini still returned only 1 candidate.
    #   "response_mime_type": "application/json" -- tool use with JSON output
    #   is not supported in Gemini.
    request_config = {
        "tools": [Tool(google_search=GoogleSearch())],
        "response_modalities": ["TEXT"],
        "thinking_config": types.ThinkingConfig(thinking_budget=-1),
        "system_instruction": "You are a cross-lingual customer discovery agent",
    }

    return gemini.models.generate_content(
        model=model_id,
        contents=prompt,
        config=request_config,
    )

def calculate_cost(response, model_id: str):
  """Estimate the USD cost of a single Gemini response.

  The cost is the sum of input, output (answer + thinking), cached-content and
  web-search charges. Any usage counter the API reports as `None` is counted
  as 0, and a response without candidates or grounding metadata is treated as
  having issued no web-search queries.

  Args:
    response: Object exposing `usage_metadata` and `candidates` the way a
      `generate_content` response does.
    model_id: Key into the pricing table ("gemini-2.5-flash" or
      "gemini-2.5-pro").

  Returns:
    The estimated cost in US dollars.

  Raises:
    KeyError: If `model_id` is not in the pricing table.
  """
  pricing = { # per 1M tokens
    "gemini-2.5-flash":
      {"input": 0.30, "output": 2.5, "cache": 0.075, "web_search": 0.035},
    "gemini-2.5-pro":
      {"input": 1.25, "output": 10, "cache": 0.31, "web_search": 0.035},
  }
  pricing_for_model = pricing[model_id]

  usage = response.usage_metadata

  def _tokens(field):
    # The SDK leaves counters as None when nothing was counted.
    return getattr(usage, field, None) or 0

  # Grounding metadata (and its query list) is absent when no search was run.
  candidates = response.candidates or []
  grounding = candidates[0].grounding_metadata if candidates else None
  web_search_queries = (grounding.web_search_queries if grounding is not None else None) or []

  # NOTE(review): cached tokens are charged on top of the full prompt count
  # here; confirm whether prompt_token_count already includes them.
  cost = 0
  cost += pricing_for_model["input"] * _tokens("prompt_token_count") / 1000000
  cost += pricing_for_model["output"] * (_tokens("candidates_token_count") + _tokens("thoughts_token_count")) / 1000000
  cost += pricing_for_model["cache"] * _tokens("cached_content_token_count") / 1000000
  # NOTE(review): the web_search rate is applied per 1000 queries; verify
  # against the current grounding price list.
  cost += pricing_for_model["web_search"] * len(web_search_queries) / 1000

  return cost

### Simple prompt 

A few-sentence, direct-to-result instruction does not work, even when `thinking_budget` is set to `-1`.

In [None]:

def call_gemini(prompt, model_id: str):
    """Query Gemini with Google Search enabled and print every candidate's text.

    Args:
        prompt: Content passed as `contents` to the model.
        model_id: Name of the Gemini model to call.

    Returns:
        The response object returned by `generate_content`, unmodified.
    """
    client = genai.Client()

    google_search_tool = Tool(
        google_search=GoogleSearch()
    )

    response = client.models.generate_content(
        model=model_id,
        contents=prompt,
        config={
            "tools": [google_search_tool],
            "response_modalities": ["TEXT"],
            "thinking_config": types.ThinkingConfig(thinking_budget=-1),
            "system_instruction": "You are a cross-lingual customer discovery agent",
        },
    )

    # Print each candidate inside the loop (the body used to sit outside it, so
    # only the last candidate was shown and an empty list raised NameError).
    for candidate in response.candidates or []:
        print("One candidate:")
        content = candidate.content
        parts = None if content is None else content.parts
        if parts is None:
            # A candidate can come back without any parts; report it instead
            # of failing on iteration.
            print("Error: in candidate.content.parts. See below:")
            print(parts)
        else:
            for part in parts:
                print(part.text)
        print("=================")

    return response

# Minimal, direct-to-result prompt. The placeholder is spelled `researcher_info`
# so that it matches the keyword used everywhere else in this notebook.
prompt_template = """Given the name and other information (such as affiliation, email, research keywords, etc.) of a researcher in English, find his/her Chinese name on the web. The Chinese name must be backed by a source that is in Chinese. Please quote the source in your response. 

Below is the researcher's information:
{researcher_info}
"""

researcher_info = """David Z. Zhu, Department of Civil and Environmental Engineering, University of Alberta, Edmonton, AB, T6G 1H9, Canada"""
model_id = "gemini-2.5-flash"
response = call_gemini(prompt_template.format(researcher_info=researcher_info), model_id)
    

One candidate:
Based on the search results, the Chinese name for David Z. Zhu is 朱兆忠.

This is supported by:

*   A page mentioning "Zhu Zhiwei" with an education history matching David Z. Zhu's, including his PhD from the University of British Columbia and MSc/BSc from Shanghai Jiao Tong University. This page lists his working experience as Professor at Zhejiang University, China, and Professor at University of Alberta, Canada. It also shows a list of publications with "Zhu, D.Z." as an author. This suggests that "Zhu Zhiwei" could be the full Chinese name, with "Zhu" being the surname and "Zhiwei" being the given name, and "D.Z." referring to "Zhiwei". However, it's important to cross-reference to confirm the exact matching.
*   The University of Alberta directory page for David Zhu, PhD, PEng, FCAE, states he is a Fellow of the Canadian Academy of Engineering (加拿大工程院院士), which includes Chinese characters. This confirms his connection to Chinese institutions and context.
*   Several 

In [None]:
# try on a researcher 


### One-prompt but a detailed prompt

This approach generally works when the affiliation info of the researcher is provided. When the affiliation info is missing and only the research keywords are provided, it sometimes still works, but with a very low success rate. 

In [40]:
# Detailed prompt for the one-prompt approach. It is filled in with
# str.format(), which expects the `ideas` and `researcher_info` fields; the
# doubled braces ({{ and }}) are escapes that render as literal JSON braces.
prompt_template = """
Many researchers who have affiliations in China publish papers in English. However, mapping from their English names to their Chinese names is not straightforward due to the homophony of Chinese characters and different transliteration/romanization methods. Given the name and other information (such as affiliation, research keywords, etc.) of a researcher in English, find the Chinese name, affiliation, and contact info (if possible) of the same researcher in China. Quote the evidence, which must be from Chinese-speaking source, to support your claim. 

Please first come up with a strategy. Then, iteratively generate web search queries to find the researcher in the Chinese-speaking world. After each search, you should refine or expand the search queries based on the results, including but not limited to extracting all names and affiliations from returned web pages and adding them to the search queries. You can search in both English and Chinese resources but the evidence to support the Chinese name and affiliation of the searcher must be in Chinese. Stop when you are confident that you have found the researcher in the Chinese-speaking world or no new results can be found in the latest 3 searches.

Note that some researchers may work for different institutions in the English-speaking world (e.g. University of ABC in United States) and the Chinese-speaking world (e.g. University of XYZ in China). A researcher may even have many affiliations in either world. 

Many researchers use different names, especially first/given names, in different contexts, e.g., a Western name in the English-speaking world and a Chinese name in the Chinese-speaking world. Most people uses Pinyin in mainland China. So please reject any results whose Chinese characters cannot match the Pinyin of the researcher's name.

Output in a JSON format that includes the name in Chinese and a list of affiliations and corresponding titles of the researcher in China like this:
{{
  "name_in_Chinese": "张三",
  "affiliations": [
    {{"name": "清华大学", "title": "教授", "quote": "张三今日加盟清华大学"}}, 
    {{"name": "北京大学", "title": "副教授", "quote": "北京大学副教授张三今日拿到诺贝尔奖"}}
  ]
}}

If you cannot find the researcher, just say you don't know. 

Here are some ideas for you to get started:
{ideas}

Here is the information of the researcher to research:
{researcher_info}
"""

# Bullet list of search heuristics substituted into the `{ideas}` field of
# prompt_template by one_prompt_approach.
ideas = """
* Translate the affiliation to Chinese and search for it. 
* Search using the email address or phone number in Chinese resources especially scientific journals. 
* Different Chinese names can be transliterated or Romanized to the same name in English. Please try to come up some possible Chinese names, and then search for them. Doing so for the family/last name is easier than the first name. 
* Prioritize your search sources to Chinese-speaking websites.
* One way is to obtain all Chinese names of researchers matching the research topic and the affiliation, and then transliterate them to English to find the one that matches the English info of the researcher. 
* A researcher may move from one affiliation to another. But you can link two researchers if they share co-authors, especially when the co-authors are from the same institutions at the two occurences of the researcher. 
* Try to leaverage the co-author network/circle of the researcher. 
* A research may publish papers under multiple affiliations. Expand your search queries to include other affiliations appear in publications as long as they are related to the research topic. 
* From all pages you have found, extract all names and affiliations and add them to the search queries. 
"""

def call_gemini(prompt, model_id: str):
    """Ask Gemini to answer `prompt` with Google Search and a fixed thinking budget.

    Args:
        prompt: Content passed as `contents` to the model.
        model_id: Name of the Gemini model to call.

    Returns:
        The response object returned by `generate_content`, unmodified.
    """
    gemini = genai.Client()

    # Options that were tried and left disabled:
    #   "candidateCount": 3 -- Gemini still returned only 1 candidate.
    #   "response_mime_type": "application/json" -- tool use with JSON output
    #   is not supported in Gemini.
    request_config = {
        "tools": [Tool(google_search=GoogleSearch())],
        "response_modalities": ["TEXT"],
        "thinking_config": types.ThinkingConfig(thinking_budget=9064 * 2),
        "system_instruction": "You are a cross-lingual customer discovery agent",
    }

    return gemini.models.generate_content(
        model=model_id,
        contents=prompt,
        config=request_config,
    )

def calculate_cost(response, model_id: str):
  """Estimate the USD cost of a single Gemini response.

  The cost is the sum of input, output (answer + thinking), cached-content and
  web-search charges. Any usage counter the API reports as `None` is counted
  as 0, and a response without candidates or grounding metadata is treated as
  having issued no web-search queries.

  Args:
    response: Object exposing `usage_metadata` and `candidates` the way a
      `generate_content` response does.
    model_id: Key into the pricing table ("gemini-2.5-flash" or
      "gemini-2.5-pro").

  Returns:
    The estimated cost in US dollars.

  Raises:
    KeyError: If `model_id` is not in the pricing table.
  """
  pricing = { # per 1M tokens
    "gemini-2.5-flash":
      {"input": 0.30, "output": 2.5, "cache": 0.075, "web_search": 0.035},
    "gemini-2.5-pro":
      {"input": 1.25, "output": 10, "cache": 0.31, "web_search": 0.035},
  }
  pricing_for_model = pricing[model_id]

  usage = response.usage_metadata

  def _tokens(field):
    # The SDK leaves counters as None when nothing was counted.
    return getattr(usage, field, None) or 0

  # Grounding metadata (and its query list) is absent when no search was run.
  candidates = response.candidates or []
  grounding = candidates[0].grounding_metadata if candidates else None
  web_search_queries = (grounding.web_search_queries if grounding is not None else None) or []

  # NOTE(review): cached tokens are charged on top of the full prompt count
  # here; confirm whether prompt_token_count already includes them.
  cost = 0
  cost += pricing_for_model["input"] * _tokens("prompt_token_count") / 1000000
  cost += pricing_for_model["output"] * (_tokens("candidates_token_count") + _tokens("thoughts_token_count")) / 1000000
  cost += pricing_for_model["cache"] * _tokens("cached_content_token_count") / 1000000
  # NOTE(review): the web_search rate is applied per 1000 queries; verify
  # against the current grounding price list.
  cost += pricing_for_model["web_search"] * len(web_search_queries) / 1000

  return cost

def one_prompt_approach(prompt_template: str, ideas: str, researcher_info: str, model_id: str):
  """Run the single detailed prompt against Gemini and report text and cost.

  The template is filled in with `researcher_info` and `ideas`, sent through
  `call_gemini`, every candidate's text parts are printed, and the cost is
  printed for `model_id` together with what the same usage would have cost on
  the other supported model.

  Args:
    prompt_template: Format string with `researcher_info` and `ideas` fields.
    ideas: Search heuristics substituted into the template.
    researcher_info: English description of the researcher to look up.
    model_id: "gemini-2.5-flash" or "gemini-2.5-pro".

  Returns:
    The response object returned by `call_gemini`.

  Raises:
    KeyError: If `model_id` is not one of the two supported models.
  """
  prompt = prompt_template.format(researcher_info=researcher_info, ideas=ideas)
  response = call_gemini(prompt, model_id)

  for candidate in response.candidates or []:
    print("One candidate:")
    content = candidate.content
    parts = None if content is None else content.parts
    if parts is None:
      # A candidate can come back without any parts; report it explicitly
      # rather than relying on a catch-all exception handler.
      print("Error: in candidate.content.parts. See below:")
      print(parts)
    else:
      for part in parts:
        print(part.text)
    print("=================")

  cost = calculate_cost(response, model_id)

  # Price the same token usage on the sibling model for comparison.
  the_other_model_id = {"gemini-2.5-flash": "gemini-2.5-pro", "gemini-2.5-pro": "gemini-2.5-flash"}[model_id]
  print(f"Cost of using {model_id}: ${cost:.6f}")
  print(f"The cost would have been ${calculate_cost(response, the_other_model_id):.6f} if using {the_other_model_id}")

  return response

In [31]:
# Try Gemini Flash: run the detailed one-prompt approach on a researcher
# described by title, name, department and university address.
researcher_info = """Professor David Z. Zhu, Department of Civil and Environmental Engineering, University of Alberta, Edmonton, AB, T6G 1H9, Canada"""
model_id = "gemini-2.5-flash"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)

One candidate:
The initial search provided crucial information.

From the University of Alberta directory, it states that David Z. Zhu received his M.Sc. and B.Sc. from Shanghai Jiao Tong University, China, in 1989 and 1986, respectively. It also mentions he was a "Guest Professor, Zhejiang University, China, 2006".

More importantly, a seminar abstract and IAHR profile explicitly state: "Dr. David Zhu is a Professor at Ningbo University in China. He is a Professor Emeritus in the Department of Civil and Environmental Engineering at the University of Alberta, Canada where he was a faculty member for 25 years." This is also confirmed in an IAHR Water Monograph listing. Another source, apise.org, also confirms his affiliation with Ningbo University.

The Wikipedia entry for "David Zhu" refers to a Chinese racing driver, which is clearly not the same person. The entry for "Song-Chun Zhu" is also a different person. "Zhenduo Zhu" from Tsinghua University is also a different researcher.

Th

In [32]:
# Try Gemini Pro: same researcher description as the Flash run, so the two
# models can be compared on identical input.
researcher_info = """Professor David Z. Zhu, Department of Civil and Environmental Engineering, University of Alberta, Edmonton, AB, T6G 1H9, Canada"""
model_id = "gemini-2.5-pro"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)

One candidate:
```json
{
  "name_in_Chinese": "朱志伟",
  "affiliations": [
    {
      "name": "宁波大学",
      "title": "教授、海洋工程研究院院长",
      "quote": "朱志伟教授是加拿大阿尔伯塔大学土木与环境工程学系讲座教授, 加拿大工程院院士，加拿大基金委讲席教授，现任宁波大学海洋工程研究院院长。"
    },
    {
      "name": "浙江大学",
      "title": "客座教授",
      "quote": "Guest Professor, Zhejiang University, China,2006. [2]"
    },
    {
      "name": "南京水利科学研究院",
      "title": "访问学者",
      "quote": "加拿大Alberta大学朱志伟教授来我院进行学术访问。"
    },
    {
      "name": "中国水利水电科学研究院",
      "title": "访问学者",
      "quote": "应水力学所邀请，加拿大阿尔伯塔大学土木及环境工程系朱志伟教授于2010年7月26日来水力学所交流访问，并做了题为“环境水力学：研究与应用”的学术报告。"
    }
  ]
}
```
Cost of using gemini-2.5-pro: $0.015960
The cost would have been $0.004137 if using gemini-2.5-flash


In [37]:
# Try Gemini Flash on another random researcher.
# This time the input has only the name and research areas (no affiliation);
# the fields are tab-separated and the keywords comma-separated.
researcher_info = """gu	aijuan	Photocatalysis,Oxides,Adsorption,Organic reactions,Catalytic activity"""
model_id = "gemini-2.5-flash"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)

One candidate:
Here's my strategy to find the Chinese name, affiliation, and contact information for the researcher "Gu Aijuan" specializing in "Photocatalysis, Oxides, Adsorption, Organic reactions, Catalytic activity":

1.  **Identify potential Chinese characters for "Gu Aijuan" based on common Pinyin surnames and given names.** Common surnames for 'Gu' include 顾 (Gù), 谷 (Gǔ), and 辜 (Gū). Common given names for 'Aijuan' include 爱娟 (Àijuān) or 艾娟 (Àijuān). I will prioritize 顾爱娟, 谷爱娟, and 辜爱娟.
2.  **Translate the research keywords into Chinese:**
    *   Photocatalysis: 光催化 (guāngcuīhuà)
    *   Oxides: 氧化物 (yǎnghuàwù)
    *   Adsorption: 吸附 (xīfù)
    *   Organic reactions: 有机反应 (yǒujī fǎnyìng)
    *   Catalytic activity: 催化活性 (cuīhuà huóxìng)
3.  **Initiate searches using combinations of English name, potential Chinese names, and translated research keywords.** I will focus on academic databases, university websites, and scientific news in China.
4.  **Analyze search results:**
    *

In [34]:
# Try again on yet another random researcher: name plus research keywords
# only, no affiliation.
researcher_info = """ding	congcong	Chemistry, Thermal conductivity, Thermoelectrics, Solar cells, Electrical conductivity, Perovskites"""
model_id = "gemini-2.5-flash"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)

One candidate:
Error: in candidate.content.parts. See below:
None
Cost of using gemini-2.5-flash: $0.015808
The cost would have been $0.060646 if using gemini-2.5-pro


In [27]:
# Try again on yet another random researcher: name plus research keywords
# only. Note that the input string ends with a newline.
researcher_info = """hua chen	jian	Palladium,Layers,Conjugated polymers,Light,Perovskites
"""
model_id = "gemini-2.5-flash"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)

One candidate:
The initial search yielded promising results.

Specifically, result directly links "Hua Chen" to "School of Optoelectronic Science and Engineering & Collaborative Innovation Center of Suzhou Nano Science and Technology, Soochow University, Suzhou 215006, China" with research on Perovskite Light-Emitting Diodes. This aligns perfectly with the research keywords provided.

The article "Sufficient Hole Injection for High-Performance Blue Perovskite Light-Emitting Diodes" published in ACS Photonics (2024-12-03) lists "Hua Chen" as an author with the affiliation: "School of Optoelectronic Science and Engineering & Collaborative Innovation Center of Suzhou Nano Science and Technology, Soochow University, Suzhou 215006, China."

Another co-author, Ran Chen, shares the same affiliation. The paper also lists email addresses for Zhenwei Ren (zhwren@suda.edu.cn) and Yu Chen (chenyu_ny@suda.edu.cn) from the same institution, which could be useful for finding more information if neede

In [41]:
# Try yet another random researcher: name plus research keywords only; the
# run of tabs between them is part of the input as given.
researcher_info = """tao	jiayou				Photonics,Sensors,Layers,Two dimensional materials,Perovskites"""
model_id = "gemini-2.5-flash"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)

One candidate:
The initial search yielded promising results for "Tao Jiayou".

From the search results, I found several papers authored by "Tao Jiayou" with relevant keywords and affiliations in China:

*   **Affiliation 1:** School of Physics and Optoelectronics, Xiangtan University, Xiangtan, 411105, People's Republic of China.
*   **Affiliation 2:** Key Laboratory of Hunan Province on Information Photonics and Free-Space Optical Communications, School of Physics and Electronic Science, Hunan Institute of Science and Technology, Yueyang 414006, People's Republic of China.
*   **Affiliation 3:** School of Mathematics and Physics, Wuhan Institute of Technology, Guanggu 1st Road 206, Wuhan, 430205, China.

The research topics align perfectly with the provided keywords (Photonics, Sensors, Two-dimensional materials, Perovskites). For instance, mentions "Journal of Nanoelectronics and Optoelectronics" and topics like "Polypyrrole-Coated Manganese Dioxide Nanowires", while discusses "Broad

In [36]:
# Try on yet another random researcher, this time with a full affiliation
# (college, university, city, province, postal code, country) after the name.
researcher_info = """Yifeng Wang College of Materials Science and Engineering, Nanjin Tech University, Nanjing, Jiangsu, 210009, China"""
model_id = "gemini-2.5-flash"
response = one_prompt_approach(prompt_template, ideas, researcher_info, model_id)


One candidate:
{
  "name_in_Chinese": "王一峰",
  "affiliations": [
    {
      "name": "南京工业大学材料科学与工程学院",
      "title": "教授，博导",
      "quote": "2013.07-至今，南京工业大学材料科学与工程学院，教授"
    }
  ]
}
Cost of using gemini-2.5-flash: $0.003007
The cost would have been $0.011544 if using gemini-2.5-pro
