In [None]:
# Load Libraries
from google import genai
from google.genai import types
import pathlib
import httpx
import time
import random
import os
import pandas as pd

In [None]:
## Configuration -- edit these values for your own setup

# Publication year to process. The three paths below are derived from it; if your
# files are not organised by year, set pdf_path, file_pref and output_path directly.
year = '2015'
# Gemini model used for the extraction
model = "gemini-2.5-flash"
# Text file that holds the Google Gemini API key
api_path = "API_Keys/google_gemini_key.txt"
# Master spreadsheet (a CSV, despite the variable name) listing the articles
pdf_path = f'../data/{year}/{year}_articles.csv'
# Directory prefix under which the article PDFs are stored
file_pref = f"../data/{year}/Afil/"
# CSV file the extracted statements are written to
output_path = f'{year}_das.csv'

In [None]:
# Get file names from the master spreadsheet.
# The spreadsheet includes these columns:
# Afil: 0 or 1 depending on whether the article has an author affiliated with CSHL. Sometimes we gather collaborator articles.
# Publication type: the type of article, such as Journal article, Book, Chapter, Conference, Interview, Preprint, etc.
# DOI: the DOI of the publication.
# doi_file: the file name of the PDF, which is based on the DOI with the '/' changed to '.' for file loading in Python.

df = pd.read_csv(pdf_path) # Load the master spreadsheet; despite its name, pdf_path points to a CSV file.

In [None]:
# Keep only the rows flagged as affiliated (Afil == 1).
df = df.loc[df['Afil'].eq(1)]
# From those, collect the PDF file names of the journal articles only.
files = df.loc[df['Publication type'].eq('Journal article'), 'doi_file']

In [None]:
# Load the API key for Google Gemini from a local text file.
# This is an important step: the key is what allows you to access the Gemini API.
# Keys should never be shared or hard-coded into programs.
# You can get a key by having a Google account and clicking on "Get API key" here: https://aistudio.google.com/  (top right of the page)

with open(api_path, "r", encoding="utf-8") as key_file:
    # Text editors usually end the file with a newline; strip it (and any other
    # surrounding whitespace) so it does not become part of the key.
    google_api = key_file.read().strip()

# Fail here with a clear message rather than later with an opaque API error.
if not google_api:
    raise ValueError(f"No API key found in {api_path!r}: the file is empty.")

In [None]:
## Mini test to see that the AI works

# client = genai.Client(api_key=google_api)

# response = client.models.generate_content(
#     model="gemini-2.0-flash",
#     contents="Explain how AI works in a few words",
# )

# print(response.text)

In [None]:
# Create the Gemini client and define the helper that sends one PDF to the model.
# Briefly, the prompt asks the model to find and return just the data availability statement (DAS).

client = genai.Client(api_key=google_api)
def google_chat(file_name):
    """Send a single PDF to the configured Gemini model and return its reply text.

    Parameters
    ----------
    file_name : str or path-like
        Location of the PDF file to read and send.

    Returns
    -------
    The ``text`` attribute of the model response. The prompt instructs the model
    to return the statement verbatim, or the sentence
    "There is no data availability statement." when none is found.
    """
    prompt = """
    You are a helpful assistant. Search the PDF document for the *Data Availability Statement*, *Data Sharing Statement*, or any section titled *Data and Code Availability*.
    
    Instructions:
    - Extract the **entire** statement **verbatim**.
    - Do **not** use any text found in a 'Key Resources Table' or similarly labeled summary table.
    - If no such statement exists in the document, respond exactly with: "There is no data availability statement."
    """

    # Read the PDF from disk and wrap its raw bytes as a request part.
    pdf_part = types.Part.from_bytes(
        data=pathlib.Path(file_name).read_bytes(),
        mime_type='application/pdf',
    )

    # `model` is the notebook-level setting; the PDF part goes first, then the prompt.
    response = client.models.generate_content(
        model=model,
        contents=[pdf_part, prompt],
    )
    return response.text

In [None]:
# The server is sometimes very busy, so it can take several tries to get the LLM to respond.
# multi_try retries the request (five times by default), waiting longer after each failure
# to increase the chance of a response.

def multi_try(file_name, max_attempts=5):
    """Call ``google_chat`` on one PDF, retrying with exponential backoff.

    Parameters
    ----------
    file_name : str or path-like
        Location of the PDF, passed straight through to ``google_chat``.
    max_attempts : int, optional
        Maximum number of calls to make before giving up (default 5).

    Returns
    -------
    Whatever ``google_chat`` returns on the first successful attempt.

    Raises
    ------
    FileNotFoundError
        Immediately, if the PDF does not exist; retrying cannot fix that.
    RuntimeError
        If every attempt failed. The last underlying error is chained as the
        cause, so the real reason is visible in the traceback.
    """
    last_error = None
    for attempt in range(max_attempts):
        try:
            return google_chat(file_name)  # Success: hand back the DAS right away.
        except FileNotFoundError:
            # A missing file will not appear on retry; do not waste time backing off.
            raise
        except Exception as e:
            last_error = e
            print(f"Attempt {attempt+1} failed: {e}")
            # Back off before the next try, but do not sleep after the final attempt.
            if attempt < max_attempts - 1:
                wait = 2 ** attempt + random.random()  # exponential backoff plus jitter
                time.sleep(wait)

    raise RuntimeError(
        f"No response for {file_name} after {max_attempts} attempts"
    ) from last_error


In [None]:
# Batch-run the extraction over every selected PDF.
# Depending on the model you choose, only a certain number of requests per day is allowed.
# At the time this notebook was written, gemini-2.5-flash allowed 250 requests per day (RPD).


file_list = [file_pref + f"{filename}" for filename in files]  # create full path to find the pdf.  You might have to change this for your needs.
file_name = []  # Files that were processed successfully
results = []    # The DAS returned for each successful file (same order as file_name)
fail = []       # Files that could not be processed
loc_start = 0   # Running position in file_list
loc = []        # Positions of the successes
fail_loc = []   # Positions of the failures


for file in file_list:
    try:
        result = multi_try(file)  # Tries several times to get an answer from the LLM.
    except Exception as e:
        # Catch ordinary errors only (e.g. missing file, no response from the LLM) so that
        # interrupting the kernel still stops the run instead of being counted as a failure.
        print(loc_start, 'FAILURE!', file, e)
        fail.append(file)
        fail_loc.append(loc_start)
        loc_start += 1
    else:
        results.append(result)
        file_name.append(file)
        loc.append(loc_start)
        loc_start += 1
        time.sleep(6)             # The current model only allows 10 requests per minute.  The wait makes sure we stick to this.
        print(loc_start, file)

In [None]:
# Write the extracted data availability statements to the output CSV.
file_path = output_path

# Two columns: the PDF path and the statement extracted from it.
df2 = pd.DataFrame(
    {
        'file': file_name,
        'das': results,
    }
)
# Neither the index nor the column names are written to the file.
df2.to_csv(file_path, index=False, header=False)

In [None]:
print(len(fail))  # number of files for which the batch loop recorded a failure

In [None]:
fail # paths of the files that failed in the batch loop

In [None]:
fail_loc  # positions within file_list of the files that failed