In [None]:
# Load Libraries
from google import genai
from google.genai import types
import pathlib
import httpx
import time
import random
import os
import pandas as pd

In [None]:
## Configuration: edit these values to match your own setup

# Year of the data to process.  It is only used to build das_path and output_path
# below; if your files are not organised by year, set those two paths directly.
year = '2015'

# Gemini model name
model = "gemini-2.5-flash"

# Text file that contains the Google Gemini API key
api_path = "API_Keys/google_gemini_key.txt"

# Input CSV with the file names and data availability statements (DAS)
das_path = f'../data/DAS/{year}_das.csv'

# CSV file the classifications are written to
output_path = f'{year}_classification.csv'

In [None]:
# Get file names and DAS's
# The CSV has no header line: its first line is already a (file name, DAS) record.
# Read every line as data and name the two columns directly.  Reading with the
# default header handling and re-inserting df.columns as a row would keep any
# renaming pandas applies to header cells (e.g. "Unnamed: 1" for an empty cell,
# or a ".1" suffix for a duplicated value) inside the data.
# NOTE(review): assumes the file has exactly two columns, as the previous
# two-name column rename did.
df = pd.read_csv(das_path, header=None, names=['file_name', 'DAS'])

In [None]:
# Get api key for google gemini
# This is an important step.  The key allows you to access the gemini api.
# Keys should never be shared or hard coded into programs.
# You can get a key by having a google account and clicking on Get API key here: https://aistudio.google.com/  (top right of the page)

with open(api_path, "r", encoding="utf-8") as file:
    # Strip surrounding whitespace: a key file normally ends with a newline,
    # and that newline is not part of the key.
    google_api = file.read().strip()

In [None]:
# A function with a prompt for the LLM.  Briefly the prompt looks at the DAS and categorizes it into 7 categories.

# Create the Gemini client once; google_chat() reuses it for every request.
# The key is the text read from api_path into `google_api`.
client = genai.Client(api_key=google_api)
def google_chat(data_statement: str, model_name=None):
    """Ask the Gemini model to classify one data availability statement (DAS).

    The statement is inserted into the classification prompt in place of the
    ``{context}`` placeholder and the filled prompt is sent through the
    module-level ``client``.

    Args:
        data_statement: Text of the data availability statement.
        model_name: Optional model to query.  When omitted, the module-level
            ``model`` setting is used.

    Returns:
        The ``text`` attribute of the model response.  The prompt asks for the
        matching category number(s), separated by commas.

    Raises:
        Whatever ``client.models.generate_content`` raises; errors are not
        handled here so that the caller can retry.
    """

    prompt = """
    You are a classification assistant. Analyze the following data availability statement and determine which category or categories it fits best from the list below. 
    
    Output only the corresponding number(s), separated by commas if more than one applies. Do not include any explanation or extra text.
    
    Categories:
    
    1. Data is included in the main text or supporting data is explicitly stated to be in the manuscript (e.g., “data are included in the article,” “data supporting findings are in the text,” “source data are provided”).
    
    2. Data are available upon request or by contacting the corresponding author (e.g., “data available from authors upon request,” “available from corresponding author”).
    
    3. Data are deposited in a public repository or database (e.g., GEO, Dryad, Figshare, OSF) with a link, DOI, or accession number.
    
    4. Data are available in the supplementary or supporting materials (e.g., “see Supplementary Information”).
    
    5. No data availability statement is included.
    
    6. Data sharing is not applicable (e.g., article type is a review, commentary, editorial, or no original data was generated).

    7. Previously published data was used for this work. OR Source data for the paper was collected from a database. OR Mention of publicly used datasets.

    Here are examples:

    1. All study data are included in the main text.
    2. The data that support the findings of this study are available from the corresponding author upon reasonable request.
    2. Any data pertaining to this manuscript will be made available upon request.
    3. Metabolomics, lipidomics, mass spectrometry, and RNA sequencing data have been deposited at National Metabolomics Data Repository (NMDR), ProteomeXchange Consortium, and GEO repository and are publicly available as of the date of this publication using the accession number provided in the key resources table. An Excel file includingsource data has been deposited in Mendeley Data and is publicly avail-able under the following site: DOI: 10.17632/nrpm8ydybr.1. This paper does not report original code.
    4. See Supplement 2.
    4. The data that supports the findings of this study are available in the supplementary material of this article.
    5. There is no data availability statement.
    6. There are no data in this manuscript to share.
    6. Data sharing is not applicable to this article as no new data were created or analyzed in this study.
    1,2  Data supporting the findings are in the main text. Other data is available upon request.
    1,2,3 Sequence data from this article can be found in GEO The data supporting the findings and claims of this study is mentioned in the main text and is available with the corresponding author.
    1,2,3,4 All data are available in the main text and the supplemental information or at public databases. Sequence data were deposited in Genome Sequence Archive. Any additional information required to reanalyze the data reported in this study is available from the lead contact upon request.
    1,3 Cell line scDNA-seq (GSE270567) and scRNA-seq (GSE270568) are available through GEO. Patient scDNA-seq is available upon reasonable request. Image data is available upon reasonable request for cell lines and patients. If interested in using the High Definition Single Cell Assay please contact CSI-Cancer.
    1,4 All data relevant to the study are included in the article or uploaded as online supplemental information. Data will be made available following acceptance.
    1,2,4 All data are available in the main text and supplementary materials. All seed stocks generated in the study are freely available to the research community upon request.
    1,3,4 All data generated or analyzed during this study are included in this published article, its supplementary information files, and publicly available repositories.
    2,3 RNA-seq data have been deposited at GEO and are publicly available as of the date of publication. Accession number is listed in the key resources table. This paper does not report original code. Any additional information required to reanalyze the data reported in this paper is available from the lead contact upon request.
    2,4 The original contributions presented in the study are included in the article/Supplementary material, further inquiries can be directed to the corresponding author.
    2,3,4 The generated cryo-EM maps and PDB codes associated with different structures have been deposited in the EMDB and PDB databases, with the details mentioned in the key resources table. Raw data associated with the gel images and quantitative kinetics assays in Figures 1, 2, 5,6, 7, S1, S5, and S7 were deposited in Mendeley and are publicly avail-able at https://doi.org/10.17632/gzpdjrr8hj.1. This paper does not report original code. Any additional information required to reanalyze the data reported in this paper is available from the lead contact upon request.
    3,4 Raw data and information for CRISPR-generated alleles, all quantifications, and exact P values (one-way ANOVA and Tukey test) are in Supplementary Table S6. The raw Sanger sequence traces for edited sequences are in Supplementary Data Files.The tomato and groundcherry BioProject accession numbers are PRJNA491365, PRJNA704671, and PRJNA862958.
    7 The source data of this paper are collected in the following database record: biostudies:S-SCDT-10_1038-S44319-024-00304-5.
    7 The cancer dependency and expression datasets were obtained online at https://depmap.org/portal/download/ (DepMap Public 21Q4).
    7 Publicly used databases in study: NCBI Homologene, 11/22/2019, https://www.ncbi.nlm.nih.gov/homologene, GENCODE mm10 (v16), https://www.gencodegenes.org, JASPAR 2020 database, http://jaspar.genereg.net.
    7 This paper analyzes existing, publicly available data.
    2,3,7 This paper analyzes existing, publicly available data. The accession numbers for the datasets are listed in the key resources table. The full meta-marker lists for the BICCN cell types and optimal number of markers have been depositied on FigShare and are publicly available as of the date of publication. DOIs are listed in the key resources table. All original code has been deposited at Github at https://github.com/gillislab/MetaMarkers and is publicly available as of the date of publication. Any additional information required to reanalyze the data reported in this paper is available from the lead contact upon request.
  
    Data availability statement:
    {context}
    
    """
    # Put the statement where the prompt expects it.  str.replace is used instead
    # of str.format so that any braces later added to the prompt text are not
    # interpreted as format fields.
    filled_prompt = prompt.replace("{context}", data_statement)

    response = client.models.generate_content(
        # Honour the configured model instead of a hard-coded name.
        model=model_name or model,
        contents=filled_prompt,
    )
    # print(response.text)
    return response.text

In [None]:
# The server is sometimes very busy so sometimes you have to try multiple times to get the LLM to respond.
# Here it will try five times to get the model to respond.  Each time it will wait longer to increase a chance of a response.

def multi_try(data_statement, max_attempts=5):
    """Call google_chat(), retrying with exponential backoff when it raises.

    After the k-th failed attempt (counting from 0) the function waits
    ``2 ** k`` seconds plus a random fraction of a second before trying again.
    No wait follows the final attempt because there is nothing left to retry.

    Args:
        data_statement: Statement passed unchanged to google_chat().
        max_attempts: Maximum number of calls to make (default 5).

    Returns:
        The value returned by the first successful google_chat() call.

    Raises:
        RuntimeError: If every attempt failed.  The last underlying error is
            attached as the cause.
    """
    last_error = None
    for attempt in range(max_attempts):
        try:
            return google_chat(data_statement)
        except Exception as e:
            last_error = e
            print(f"Attempt {attempt+1} failed: {e}")
            if attempt < max_attempts - 1:
                wait = 2 ** attempt + random.random()
                time.sleep(wait)

    # Fail explicitly instead of falling through to an unbound `result`.
    raise RuntimeError(
        f"No response from the LLM after {max_attempts} attempts"
    ) from last_error

In [None]:
# This sets up all of our functions to batch run multiple files.
# Depending what model you choose it will only allow a certain number of requests a day.
# The gemini-2.5-flash only allows 250 requests per day (RPD).

classification = []  # LLM output for every DAS, in row order (None where the request failed)
loc = []             # positions of the statements that were classified
loc_start = 0        # position of the statement currently being processed
fail = []            # statements that could not be classified
fail_loc = []        # positions of the failed statements

for das in df['DAS']:
    try:
        result = multi_try(das) # This will try multiple times to access the LLM. If it is busy, this does not count as a request as part of your requests per day.
    except Exception as e:
        # Catch Exception rather than using a bare except, so that interrupting
        # the run (KeyboardInterrupt) actually stops the loop.
        print('FAILURE!', loc_start, e)
        fail.append(das)
        fail_loc.append(loc_start)
        classification.append(None)  # keeps classification aligned with the rows of df
    else:
        classification.append(result)
        loc.append(loc_start)
        print(loc_start + 1, result)
        time.sleep(6)             # For the current model you are only allowed 10 request per minute.  The wait makes sure we stick to this.
    loc_start += 1

In [None]:
# The batch loop appends exactly one entry to `classification` per statement
# (None where the request failed), so the list lines up with the rows of df.
df['classification'] = classification # put the classification into dataframe

In [None]:
# Write the dataframe, including the new classification column, to output_path.
# to_csv is called with its defaults, so the row index is written as the first column.
df.to_csv(output_path) # save to csv

In [None]:
# Number of statements for which the batch loop recorded a failure
len(fail) # how many failed

In [None]:
# Zero-based positions, in df['DAS'] order, of the statements that failed
fail_loc # location of failed DASs.