In [None]:
# Shell escape: interactive gcloud login for Application Default Credentials.
# NOTE(review): the client in this notebook is constructed with an explicit `api_key`,
# so this login step may be unnecessary — confirm before keeping it.
!gcloud auth application-default login

# Google GenAI API

In [1]:
import os
from pathlib import Path

from google import genai

In [25]:
# Take the API key from the environment so the secret never has to be typed into
# (and saved with) the notebook. GEMINI_API_KEY is preferred, GOOGLE_API_KEY is the
# fallback; when neither is set the value is the empty string, as before.
api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY", "")

# Model identifier used by every generate_content request in this notebook.
model_name = "gemini-2.5-pro-exp-03-25"

In [7]:
# Build the Gen AI client, authenticating with the `api_key` value set earlier.
client = genai.Client(api_key=api_key)

In [83]:
# Shared sampling settings for the requests below: temperature 0 and top-k 1.
# `top_k` is the documented snake_case field name (consistent with `temperature`);
# the previous `topK` spelling relied on the SDK's camelCase alias.
generate_config = genai.types.GenerateContentConfig(
    temperature=0,
    top_k=1,
)

In [85]:
# Smoke test: send a plain-text prompt with the shared config and show the reply.
response = client.models.generate_content(
    config=generate_config,
    contents="How are you doing today?",
    model=model_name,
)

print(response.text)

Thanks for asking! As an AI, I don't have feelings or days in the human sense, but I'm functioning well and ready to help with any questions or tasks you have.

How about you? How's your day going?


## Getting the file

In [27]:
# Relative location of the paper's PDF.
file_path = Path("data", "raw", "pdf", "hanagasi2016.pdf")

In [28]:
# Upload the PDF through the client's files API; the returned handle is passed
# alongside the prompt in the requests below.
file = client.files.upload(file=file_path)

## PubMed ID

In [86]:
# Prompt asking for the paper's title, author, year and PubMed ID.
prompt = (
    "Please give the name of the paper, its author and year. "
    "Also find the PubMed ID of the paper - use the search tool to get this value, "
    "do not rely on your memory for that."
)

In [87]:
# The prompt tells the model to look the PubMed ID up with a search tool, but the
# shared `generate_config` declares no tools, so no search is available to the request.
# Build a per-request config that keeps the same sampling settings and adds
# Google Search grounding.
search_config = genai.types.GenerateContentConfig(
    temperature=generate_config.temperature,
    top_k=generate_config.top_k,
    tools=[genai.types.Tool(google_search=genai.types.GoogleSearch())],
)

# Send the uploaded PDF together with the prompt.
response = client.models.generate_content(
    model=model_name,
    contents=[file, prompt],
    config=search_config,
)

In [88]:
# Show the text of the current `response` object.
print(response.text)

'Okay, here is the information based on the provided article:\n\n*   **Paper Name:** A novel homozygous DJ1 mutation causes parkinsonism and ALS in a Turkish family\n*   **Author (First):** Hasmet A. Hanagasi\n*   **Year:** 2016\n*   **PubMed ID:** 26971994'

## Number of cases

In [89]:
# Prompt asking whether the paper reports individual cases and, if so, how many.
prompt = " ".join([
    "Does the paper mention any individual cases?",
    "In this context, a case is a single human subject who is affected by a disease.",
    "If there are such cases, please give how many there were.",
])

In [90]:
# Ask the case-count question about the uploaded PDF; the answer is kept under its
# own name because the extraction prompt later embeds `response_counts.text`.
response_counts = client.models.generate_content(
    config=generate_config,
    contents=[file, prompt],
    model=model_name,
)

In [91]:
# Show the text of the case-count answer.
print(response_counts.text)

Yes, the paper mentions individual cases.

It describes a family where three individuals were affected:
1.  The index case (Individual IV-4), a 38-year-old man.
2.  His affected sister (Individual IV-5), a 38-year-old woman.
3.  His other affected sister (Individual IV-11), a 27-year-old woman.

Therefore, there are **3** individual cases described in the paper.


## Extraction

In [109]:
# Extraction prompt: starts with the text of the earlier case-count answer, then lists
# the per-case fields to extract and asks for a Python-dictionary-formatted reply.
# NOTE(review): "Zygocity" is presumably meant to be "Zygosity"; it is left unchanged
# here because the model echoes the field names back as dictionary keys.
prompt = (
    f"Your previous response: \n{response_counts.text} \n\n"
    "For each individual case in the paper, extract the following information about each case (if it is available):\n"
    " - Age at examination (call it AAE) \n"
    " - Age at disease onset (call it AAO) \n"
    " - Sex \n"
    " - Type of mutation - nucleotide change \n"
    " - Type of mutation - protein (amino acid) change \n"
    " - Copy number variation (call the field CNV) \n"
    " - Single point variation nucleotide change (possible values: Insertion, Deletion, Substitution; call the field SNV) \n"
    " - Single nucleotide variation impact on protein (possible values: Frameshift, Nonsense, Missense, Silent) \n"
    " - Zygocity \n"
    "Please give your answer as a parsable python dictionary where the key is the case number; put None if there is no data; add "
    "explanations as inline comments (use # sign to start a comment)"
)

In [110]:
# Show the fully assembled prompt, including the embedded earlier answer.
print(prompt)

Your previous response: 
Yes, the paper mentions individual cases.

It describes a family where three individuals were affected:
1.  The index case (Individual IV-4), a 38-year-old man.
2.  His affected sister (Individual IV-5), a 38-year-old woman.
3.  His other affected sister (Individual IV-11), a 27-year-old woman.

Therefore, there are **3** individual cases described in the paper. 

For each individual case in the paper, extract the following information about each case (if it is available):
 - Age at examination (call it AAE) 
 - Age at disease onset (call it AAO) 
 - Sex 
 - Type of mutation - nucleotide change 
 - Type of mutation - protein (amino acid) change 
 - Copy number variation (call the field CNV) 
 - Single point variation nucleotide change (possible values: Insertion, Deletion, Substitution; call the field SNV) 
 - Single nucleotide variation impact on protein (possible values: Frameshift, Nonsense, Missense, Silent) 
 - Zygocity 
Please give your answer as a parsab

In [111]:
# Run the extraction prompt against the uploaded PDF with the shared config.
response = client.models.generate_content(
    config=generate_config,
    contents=[file, prompt],
    model=model_name,
)

In [112]:
# Show the text of the extraction answer.
print(response.text)

```python
{
    "Case 1": { # Corresponds to individual IV-4 in the paper
        "AAE": 37, # Age at last examination mentioned in the detailed description (Section 3.2)
        "AAO": 24, # Age at onset of left hand tremor (Section 3.2)
        "Sex": "Male", # Described as "man" (Section 3.2)
        "Type of mutation - nucleotide change": "c.133C>T", # Identified in section 3.5 for all affected individuals
        "Type of mutation - protein (amino acid) change": "p.Q45X", # Identified in section 3.5 for all affected individuals (Q=Glutamine, X=Stop codon)
        "CNV": None, # The mutation is a point mutation, not a Copy Number Variation
        "SNV": "Substitution", # A single nucleotide C is replaced by T
        "Single nucleotide variation impact on protein": "Nonsense", # Results in a premature stop codon (X)
        "Zygocity": "Homozygous" # Stated in section 3.5 and Figure 1 legend (M/M)
    },
    "Case 2": { # Corresponds to individual IV-5 in the paper
        "AAE": 

In [101]:
# Per-case records keyed by pedigree identifier.
# NOTE(review): presumably transcribed from an earlier model response — confirm.
# All three cases carry the same variant fields; only AAE, AAO and Sex differ.
tst = {
    case_id: {
        "AAE": age_at_exam,    # Age at (last) neurological examination
        "AAO": age_at_onset,   # Age at disease onset
        "Sex": sex,
        # Same variant for all affected members (Section 3.5)
        "Type of mutation - nucleotide change": "c.133C>T",
        # X denotes a stop codon
        "Type of mutation - protein (amino acid) change": "p.Q45X",
        # Point mutation; not described as a copy number variation
        "CNV": None,
        # Genetic findings (Section 3.5) and pedigree figure (Fig 1A)
        "Zygocity": "Homozygous",
    }
    for case_id, age_at_exam, age_at_onset, sex in [
        ("case_IV-4", 37, 24, "Male"),     # Section 3.2; described as "man"
        ("case_IV-5", 38, 35, "Female"),   # Section 3.3; described as "house wife"
        ("case_IV-11", 27, 23, "Female"),  # Section 3.4; onset 1 year after depression onset at age 22
    ]
}

In [102]:
import pandas as pd

In [105]:
# Tabulate the records: cases become columns first, then the frame is transposed
# so that each case is a row and each field a column.
pd.DataFrame(tst).transpose()

Unnamed: 0,AAE,AAO,Sex,Type of mutation - nucleotide change,Type of mutation - protein (amino acid) change,CNV,Zygocity
case_IV-4,37,24,Male,c.133C>T,p.Q45X,,Homozygous
case_IV-5,38,35,Female,c.133C>T,p.Q45X,,Homozygous
case_IV-11,27,23,Female,c.133C>T,p.Q45X,,Homozygous
