In [None]:
# Shell escape: runs the gcloud CLI's interactive "application-default login" flow.
# NOTE(review): the GenAI client in this notebook is constructed with an explicit
# api_key, so it is unclear whether these credentials are needed here - confirm.
!gcloud auth application-default login

# Google GenAI API

In [None]:
import ast
import os
from pathlib import Path

import pandas as pd
from google import genai
from tqdm.auto import tqdm

# Defines

## Google API

In [None]:
# The key is read from the GOOGLE_API_KEY environment variable so that it never
# has to be typed into (and accidentally committed with) the notebook. When the
# variable is not set the value is an empty string, i.e. the previous placeholder.
api_key = os.environ.get("GOOGLE_API_KEY", "")
# Identifier of the model that every request in this notebook is sent to.
model_name = "gemini-2.5-pro-exp-03-25"

In [None]:
# Shared GenAI client, authenticated with the API key defined above; the test
# request, the file uploads and both extraction helpers all go through it.
client = genai.Client(api_key=api_key)

In [None]:
# Generation settings shared by every request in this notebook.
# temperature=0 together with top_k=1 asks for the least random decoding the API
# offers, so repeated runs over the same paper should give (near-)identical answers.
# The field is spelled `top_k`, the name documented by the Python SDK, rather than
# the camelCase alias `topK`.
generate_config = genai.types.GenerateContentConfig(
    temperature=0,
    top_k=1,
)

In [None]:
# Smoke test: send one trivial prompt with the shared client, model name and
# generation config, and print the text of the reply.
response = client.models.generate_content(
    model=model_name,
    contents="How are you doing today?",
    config=generate_config
)

print(response.text)

## Files

In [None]:
# Input tree: raw files live under data/raw, the papers in its pdf/ subfolder.
# The values are kept as plain strings, exactly as os.path.join would build them.
raw_data_dir = str(Path("data") / "raw")
pdf_data_dir = str(Path(raw_data_dir) / "pdf")

# Output tree: created up front, including missing parents; an already existing
# directory is not an error.
output_data_dir = str(Path("data") / "processed")
Path(output_data_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# All PDFs directly inside the PDF input directory, as Path objects.
# The list is sorted because glob yields entries in whatever order the file
# system returns them; without sorting, a slice such as the `[:5]` trial run in
# the processing loop could pick different papers on different machines.
pdf_file_list = sorted(Path(pdf_data_dir).glob("*.pdf"))

# Methods

In [None]:
def parse_str_to_dict(input_str):
    """Parse a model reply that contains a Python dict literal.

    The reply may be the bare literal, or the literal wrapped in a Markdown code
    fence (with or without a language tag), optionally surrounded by prose.
    Inline ``#`` comments inside the literal are accepted.

    Args:
        input_str: Text of the model reply.

    Returns:
        The parsed dict with top-level *string* keys lower-cased. Non-string
        keys (for example integer case numbers) are kept unchanged. Returns
        ``None`` when the input is not a string, cannot be parsed, or does not
        evaluate to a dict.
    """
    import re  # local import so this cell does not depend on the imports cell

    try:
        # Prefer the content of the first fenced block, if there is one; this
        # also copes with explanatory text before or after the fence.
        fenced = re.search(
            r"```[A-Za-z0-9_+-]*[ \t]*\r?\n(.*?)```", input_str, flags=re.DOTALL
        )
        if fenced is not None:
            literal_src = fenced.group(1)
        else:
            # No well-formed fence: fall back to stripping fence markers.
            literal_src = input_str.replace("```python\n", "").replace("```", "")

        # literal_eval only evaluates literals, so model output is never executed.
        parsed = ast.literal_eval(literal_src.strip())

        # Lower-case string keys only: calling .lower() on an integer key would
        # raise and the whole (otherwise valid) reply would be discarded.
        out = {
            (k.lower() if isinstance(k, str) else k): v
            for k, v in parsed.items()
        }
    except Exception:
        # Deliberate best-effort: any unparsable reply is reported as None.
        out = None
    return out

In [None]:
def get_paper_metadata(file):
    """Ask the model for the bibliographic metadata of one paper.

    Args:
        file: The paper to analyse; it is passed to the model as the first
            element of the request contents (the processing loop passes the
            object returned by the file upload).

    Returns:
        The raw response object of the request. The prompt asks for a Python
        dict literal with the keys ``authors``, ``title``, ``year`` and ``pmid``.
    """
    # Wording that introduces the list of requested fields.
    intro = (
        "Extract the following information the academic paper provided to you in a file (if it is available); "
        "value in square brackets is the name of the key you should use (e.g. ' - Authors [authors]' means "
        "put information about authors under the key called 'authors'):\n"
    )

    # One bullet per requested field: "<description> [<key>]".
    requested_fields = (
        "Authors [authors]",
        "Title [title]",
        "Year [year]",
        "PubMed ID [pmid] (search for it in www.ncbi.nlm.nih.gov)",
    )
    bullet_lines = "".join(f" - {field} \n" for field in requested_fields)

    # Instructions on the shape of the answer.
    answer_format = (
        "Please give your answer as a parsable python dictionary; put None if there is no data; "
        "add explanations as inline comments (use # sign to start a comment)"
    )

    request_text = intro + bullet_lines + answer_format

    return client.models.generate_content(
        model=model_name,
        contents=[file, request_text],
        config=generate_config
    )

In [None]:
def get_paper_cases(file):
    """Ask the model for the individual patient cases described in one paper.

    Two requests are sent. The first asks whether the paper reports individual
    cases and how many; the text of that reply is then quoted at the top of the
    second request, which asks for the per-case details.

    Args:
        file: The paper to analyse; it is passed to the model as the first
            element of the contents of both requests.

    Returns:
        The raw response object of the second request. The prompt asks for a
        Python dict literal keyed by case number.
    """
    def _ask(request_text):
        # Both requests share the same file, model and generation settings.
        return client.models.generate_content(
            model=model_name,
            contents=[file, request_text],
            config=generate_config
        )

    # Step 1: how many individual cases does the paper describe?
    count_question = (
        "Does the paper mention any individual cases? In this context, a case is a single human subject who is affected "
        "by a disease. If there are such cases, please give how many there were."
    )
    count_reply = _ask(count_question)

    # Step 2: per-case extraction, with the step-1 answer quoted up front.
    case_fields = (
        "Age at examination [aae]",
        "Age at disease onset [aao]",
        "Sex [sex]",
        "Type of mutation - nucleotide change [mutation_nucleotide_change]",
        "Type of mutation - protein (amino acid) change [mutation_protein_change]",
        "Copy number variation [cnv]",
        "Single point variation nucleotide change (possible values: Insertion, Deletion, Substitution) [snv]",
        "Single nucleotide variation impact on protein (possible values: Frameshift, Nonsense, Missense, Silent) [snv_protein]",
        "Zygocity [zygocity]",
    )
    request_parts = [
        f"Your previous response: \n{count_reply.text} \n\n",
        "For each individual case in the paper, extract the following information about each case (if it is available); ",
        "value in square brackets is the name of the key you should use (e.g. ' - Authors [authors]' means ",
        "put information about authors under the key called 'authors'):\n",
    ]
    request_parts.extend(f" - {field} \n" for field in case_fields)
    request_parts.append(
        "Please give your answer as a parsable python dictionary where the key is the case number; put None if there is no data; add "
    )
    request_parts.append(
        "explanations as inline comments (use # sign to start a comment)"
    )

    return _ask("".join(request_parts))

# Analysis

In [None]:
# Result stores filled by the processing loop, both keyed by the PDF's file stem:
# parsed paper metadata and parsed per-case data (None when a paper failed).
metadata_dict, cases_dict = dict(), dict()

In [None]:
# Trial run: only the first five PDFs are processed. To process every paper,
# iterate over the whole list instead:
# file_iter = tqdm(pdf_file_list)
file_iter = tqdm(pdf_file_list[:5])

for cur_file in file_iter:
    file_iter.set_description(f'Processing {cur_file.name}')

    # Each paper gets its own output folder that keeps the raw model replies.
    cur_output_dir = os.path.join(output_data_dir, cur_file.stem)
    os.makedirs(cur_output_dir, exist_ok=True)

    # Uploading file
    file_iter.set_postfix_str('Uploading file')
    try:
        file = client.files.upload(
            file=cur_file,
        )
    except Exception as e:
        # A failed upload must not abort the whole batch: record the paper as
        # failed in both result stores (same convention as the stages below)
        # and move on to the next file.
        print(f'Exception when uploading {cur_file.stem}: \n{e}')
        metadata_dict[cur_file.stem] = None
        cases_dict[cur_file.stem] = None
        continue

    # Getting paper's metadata
    try:
        file_iter.set_postfix_str("Getting paper's metadata")
        response_metadata = get_paper_metadata(file)
        # Explicit UTF-8: with the platform default encoding, non-ASCII
        # characters in the reply could make the write fail and the paper
        # would be recorded as failed although the request succeeded.
        Path(os.path.join(cur_output_dir, 'metadata.txt')).write_text(
            response_metadata.text, encoding='utf-8'
        )
        metadata_dict[cur_file.stem] = parse_str_to_dict(response_metadata.text)
    except Exception as e:
        print(f'Exception when getting metadata from {cur_file.stem}: \n{e}')
        metadata_dict[cur_file.stem] = None

    # Getting cases
    try:
        file_iter.set_postfix_str("Getting paper's cases")
        response_cases = get_paper_cases(file)
        Path(os.path.join(cur_output_dir, 'cases.txt')).write_text(
            response_cases.text, encoding='utf-8'
        )
        cases_dict[cur_file.stem] = parse_str_to_dict(response_cases.text)
    except Exception as e:
        print(f'Exception when getting cases from {cur_file.stem}: \n{e}')
        cases_dict[cur_file.stem] = None

## DataFrames

In [None]:
# Metadata table: one row per paper, indexed by file stem.
# Papers whose extraction or parsing failed are stored as None in metadata_dict,
# and DataFrame.from_dict cannot digest those entries. Only real dicts are passed
# in; reindex then restores every paper in processing order, so failed papers
# show up as all-NaN rows instead of crashing the cell or silently vanishing.
df_metadata = pd.DataFrame.from_dict(
    {stem: meta for stem, meta in metadata_dict.items() if isinstance(meta, dict)},
    orient='index',
).reindex(list(metadata_dict))

In [None]:
# Persist the metadata table as metadata.csv inside the output directory.
df_metadata.to_csv(os.path.join(output_data_dir, "metadata.csv"))

In [None]:
# One DataFrame per paper (one row per case), tagged with the paper's file stem.
# Papers whose extraction or parsing failed are stored as None in cases_dict;
# DataFrame.from_dict would raise on them, so they are skipped, as are papers
# for which an empty dict was returned (nothing to tabulate).
df_cases_list = []
for k, v in cases_dict.items():
    if not isinstance(v, dict) or not v:
        continue
    df_cur = pd.DataFrame.from_dict(v, orient='index')
    df_cur['file'] = k
    df_cases_list.append(df_cur)

In [None]:
# Stack the per-paper case tables and persist them as cases.csv inside the
# output directory. pd.concat raises on an empty list, so the export is skipped
# (with a message) when no paper produced any cases.
if df_cases_list:
    pd.concat(df_cases_list).to_csv(
        os.path.join(output_data_dir, "cases.csv")
    )
else:
    print("No cases were extracted; cases.csv was not written.")