In [None]:
import os
import pandas as pd
import requests
import json
import re
from PyPDF2 import PdfReader


def get_configuration_constants():
    """Return the runtime configuration used by the extraction pipeline.

    The API key is read from the ``GEMINI_API_KEY`` environment variable so
    that no credential is stored in the notebook source or its history.

    Returns:
        dict: ``API_KEY`` (str), ``MAX_CHARS`` (int, maximum number of paper
        characters placed in one prompt) and ``COLUMNS`` (list of output
        column names: six metadata fields followed by ``E_1`` .. ``E_15``).

    Raises:
        RuntimeError: if ``GEMINI_API_KEY`` is unset or empty.
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        # Fail early with an actionable message instead of sending
        # unauthenticated requests for every PDF.
        raise RuntimeError(
            "GEMINI_API_KEY environment variable is not set; "
            "export it before running the extraction."
        )

    metadata_columns = [
        "title", "first_author", "current", "capacity", "electrolyte_volume", "li_thickness"
    ]
    electrolyte_columns = [f"E_{i+1}" for i in range(15)]

    return {
        'API_KEY': api_key,
        'MAX_CHARS': 30000,
        'COLUMNS': metadata_columns + electrolyte_columns,
    }


def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        str: The text of each page in order, each followed by a newline.
        A page that yields no text contributes only its newline.
    """
    with open(file_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # Pages must be read while the file is still open. ``or ""`` guards
        # against a page whose extraction yields None, which would otherwise
        # raise TypeError on concatenation.
        page_texts = [(page.extract_text() or "") for page in reader.pages]
    # Single join instead of repeated string concatenation.
    return "".join(f"{page_text}\n" for page_text in page_texts)


def create_gemini_prompt(text):
    """Build the JSON request body asking the model to extract paper data.

    Args:
        text: Paper text that is embedded verbatim after the ``Text:`` marker
            of the instruction block.

    Returns:
        dict: A payload with a single ``contents`` entry holding one text
        part (instructions, the paper text and the expected JSON answer
        layout) and a ``generationConfig`` with the sampling settings.
    """
    instructions = f"""

Please extract the following information from the lithium battery-related scientific paper. 
If information is not found, return "N/A".
1. Paper title (title)
2. First author name (first_author)
3. Current density (current) in mA/cm² for coulombic efficiency measurements.
4. Capacity (capacity) in mAh/cm² for coulombic efficiency measurements. Be careful to distinguish it from other capacity values.
5. Electrolyte ammount (electrolyte_volume) in μL
6. Thickness of lithium foil (li_thickness) in micrometers for coulombic efficiency measurements. Be careful to distinguish it from other thickness values.
7. Electrolyte formulations (electrolytes) - return as array

RULES:
- Combine two pdf files with a and b suffixes (e.g., ref_1a.pdf and ref_1b.pdf) to extract the information
- Focus on the introduction, experimental section, and results/discussion sections of the paper and its supporting information.
- For current, capacity, electrolyte_volume, li_thickness columns: return only numbers, no units
- List ALL unique formulations
- An unique formulation MUST CONTAIN at least one Li-salt and one solvent
- The li-salt can be LiPF6, LiFSI, LiTFSI, LiBOB, LiDFOB, LiBF4, LiClO4, LiFNFSI, LiTFSI, LiNFSI, etc
- The solvent can be EC, DMC, EMC, DME, TTE, Acetonitrile, etc
- Sort by order of appearance in paper
- Maximum 15 electrolyte formulations in a single paper

USE THE FOLLOWING FORMAT FOR ELECTROLYTE FORMULATIONS:
- use Li-salt name first, then solvent(s)
- use M for molarity (e.g., 1M, 2M)
- use m for molality (e.g., 1m, 2m)
- use + to separate Li-salt and solvent
- use + to separate additives and Li-salt/solvent
- use space to separate M and lithium salt
- Use v/v for volume ratio
- use wt/wt for weight ratio
- Use mol/mol for molar ratio
- Use wt% for weight percentage (e.g., 2wt%)
- Use () to indicate the ratio of solvents
- Use () to indicate the molar ratio of individual solvent if needed
- Use () to indicate the molar ratio of li-salt if needed
- Put () after the last solvent name

Text:
{text}

Return in exact JSON format:
{{  
    "title": "paper title",
    "first_author": "first author name",
    "current": "value",
    "capacity": "value",
    "electrolyte_volume": "value",
    "li_thickness": "value",
    "electrolytes": ["formulation 1", "formulation 2", ...]
}}
"""
    # Sampling settings forwarded unchanged with every request.
    sampling_options = {
        "temperature": 0.2,
        "topP": 0.95,
        "topK": 30,
    }
    message = {"parts": [{"text": instructions}]}
    return {
        "contents": [message],
        "generationConfig": sampling_options,
    }


def send_gemini_request(api_key, prompt, timeout=120):
    """POST a prompt to the Gemini ``generateContent`` endpoint.

    Args:
        api_key: API key used to authenticate the request.
        prompt: JSON-serialisable request body.
        timeout: Seconds to wait for the server before ``requests`` aborts
            the call. Without a timeout a stalled connection would block
            the whole batch indefinitely.

    Returns:
        The response object when the status code is 200, otherwise ``None``
        (the status and body are printed for diagnosis).

    Raises:
        Network-level exceptions from ``requests`` (connection failure,
        timeout) propagate to the caller.
    """
    api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
    # The key travels in a header rather than the query string so it does not
    # end up in logged or echoed URLs.
    headers = {
        'Content-Type': 'application/json',
        'x-goog-api-key': api_key,
    }
    response = requests.post(api_url, headers=headers, json=prompt, timeout=timeout)

    if response.status_code != 200:
        print(f"API Error: {response.status_code} - {response.text}")
        return None

    return response


def parse_gemini_response(response):
    """Extract the JSON object embedded in a model response.

    Args:
        response: Object exposing ``json()`` (the decoded API payload) and
            ``text`` (the raw body, used only for diagnostics).

    Returns:
        The decoded JSON object found in the first candidate's text, or
        ``None`` when the payload has an unexpected shape or contains no
        decodable JSON object.
    """
    try:
        response_json = response.json()
        content = response_json["candidates"][0]["content"]["parts"][0]["text"]

        start = content.find("{")
        if start == -1:
            print(f"JSON not found in response: {content}")
            return None

        # Decode exactly one JSON value starting at the first "{". Unlike a
        # greedy first-"{"-to-last-"}" match, this tolerates trailing prose
        # or code fences that themselves contain braces.
        result, _ = json.JSONDecoder().raw_decode(content, start)
        return result

    # ValueError covers json.JSONDecodeError; TypeError/AttributeError cover
    # payloads where an expected container or the text itself is null.
    except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
        print(f"Response parsing error: {str(e)}")
        print(f"Full response: {response.text}")
        return None


def format_extracted_info(result, max_electrolytes=15):
    """Flatten a parsed model answer into one row of output columns.

    Args:
        result: Dict decoded from the model answer.
        max_electrolytes: Number of ``E_<n>`` columns to emit; formulations
            beyond this count are dropped.

    Returns:
        dict: The six metadata fields followed by ``E_1`` ..
        ``E_<max_electrolytes>``. Missing or null values are ``"N/A"``.
    """
    electrolytes = result.get("electrolytes")
    if electrolytes is None or electrolytes == "N/A":
        # A missing key, JSON null, or the "N/A" sentinel all mean "none found".
        electrolytes = []
    elif not isinstance(electrolytes, list):
        # A single formulation returned as a bare value instead of an array.
        electrolytes = [electrolytes]

    scalar_fields = (
        "title", "first_author", "current", "capacity", "electrolyte_volume", "li_thickness"
    )
    extracted_info = {}
    for field in scalar_fields:
        value = result.get(field)
        extracted_info[field] = "N/A" if value is None else value

    for i in range(max_electrolytes):
        value = electrolytes[i] if i < len(electrolytes) else None
        extracted_info[f"E_{i+1}"] = "N/A" if value is None else value

    return extracted_info


def extract_experiment_info_gemini(text, api_key, max_chars):
    """Run one paper's text through the prompt, request, parse and format steps.

    Args:
        text: Paper text; anything beyond ``max_chars`` characters is cut
            and replaced by a truncation marker.
        api_key: API key forwarded to the request helper.
        max_chars: Maximum number of characters of ``text`` to send.

    Returns:
        The formatted row dict, or ``None`` if the request or the parsing
        step produced nothing.
    """
    if len(text) <= max_chars:
        clipped = text
    else:
        clipped = text[:max_chars] + "... [TEXT TRUNCATED]"

    response = send_gemini_request(api_key, create_gemini_prompt(clipped))
    if response is None:
        return None

    parsed = parse_gemini_response(response)
    return None if parsed is None else format_extracted_info(parsed)


def extract_info_from_pdf_in_directory(directory_path, columns, api_key, max_chars):
    """Process every PDF in a directory and collect the extracted rows.

    Files are handled in sorted name order so repeated runs produce rows in
    the same order. A failure on one file is reported and skipped; the
    remaining files are still processed.

    Args:
        directory_path: Directory that is scanned (non-recursively) for PDFs.
        columns: Column names of the returned frame.
        api_key: API key forwarded to the extraction helper.
        max_chars: Maximum number of text characters sent per paper.

    Returns:
        pandas.DataFrame: One row per successfully extracted PDF, restricted
        to ``columns``; an empty frame with those columns when nothing was
        extracted.
    """
    # NOTE(review): the prompt asks the model to combine files with `a`/`b`
    # suffixes, but each PDF is sent on its own here - confirm whether paired
    # files should be merged before the request.
    pdf_files = sorted(
        f for f in os.listdir(directory_path) if f.lower().endswith(".pdf")
    )
    total_pdfs = len(pdf_files)

    print(f"Found {total_pdfs} PDF files to process...")
    print("=" * 50)

    # Rows are accumulated in a list and turned into a frame once, instead of
    # re-copying a growing DataFrame with pd.concat on every iteration.
    records = []
    for idx, filename in enumerate(pdf_files, 1):
        file_path = os.path.join(directory_path, filename)
        print(f"[{idx}/{total_pdfs}] Processing: {filename}", end="")

        try:
            text = extract_text_from_pdf(file_path)
            extracted_info = extract_experiment_info_gemini(text, api_key, max_chars)
        except Exception as e:
            # Deliberate best-effort: one unreadable PDF or failed request
            # must not abort the whole batch.
            print(f" - ✗ Error: {str(e)}")
            continue

        if extracted_info:
            records.append(extracted_info)
            print(f" - ✓ Data extracted")
        else:
            print(f" - No data extracted")

    print("=" * 50)
    return pd.DataFrame(records, columns=columns)


def main():
    """Extract data from the PDFs in ``./paper`` and save it as a CSV file.

    The input directory is the ``paper`` folder under the current working
    directory. When at least one record was extracted, the result is written
    to ``extracted_data.csv`` without the index column; otherwise only a
    message is printed.
    """
    settings = get_configuration_constants()
    source_dir = os.path.join(os.getcwd(), "paper")
    csv_path = "extracted_data.csv"

    records = extract_info_from_pdf_in_directory(
        source_dir,
        settings['COLUMNS'],
        settings['API_KEY'],
        settings['MAX_CHARS'],
    )

    # Nothing to persist: report and stop.
    if records.empty:
        print("No data extracted")
        return

    print(f"Processing completed! {len(records)} records extracted")
    records.to_csv(csv_path, index=False)
    print(f"Data saved to: {csv_path}")


# Entry point: run the pipeline when this code is executed directly rather
# than imported as a module.
if __name__ == "__main__":
    main()

Found 16 PDF files to process...
[1/16] Processing: ref_1a.pdfAPI Error: 429 - {
  "error": {
    "code": 429,
    "message": "You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.",
    "status": "RESOURCE_EXHAUSTED",
    "details": [
      {
        "@type": "type.googleapis.com/google.rpc.QuotaFailure",
        "violations": [
          {
            "quotaMetric": "generativelanguage.googleapis.com/generate_content_free_tier_requests",
            "quotaId": "GenerateRequestsPerDayPerProjectPerModel-FreeTier",
            "quotaDimensions": {
              "location": "global",
              "model": "gemini-2.0-flash"
            },
            "quotaValue": "200"
          }
        ]
      },
      {
        "@type": "type.googleapis.com/google.rpc.Help",
        "links": [
          {
            "description": "Learn more about Gemini API quotas",
            

In [None]:
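# REDACTED: three personal API keys (labelled Thinh, Minh, M) were pasted here in plain text.
# Credentials must not be stored in the notebook. Revoke the exposed keys and supply
# replacements through an environment variable or a secrets manager.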