In [1]:
import os
import pandas as pd
import requests
import json
import re
from PyPDF2 import PdfReader


def get_configuration_constants():
    return {
        'API_KEY': "AIzaSyCJvV-nlMzV36NRsGAmJCX_UICsEjAmYKI",
        'MAX_CHARS': 30000,
        'COLUMNS': [
            "title", "first_author", "current", "capacity", "electrolyte_volume", 
            "li_thickness", "temperature"
        ] + [f"electrolyte_{i+1}" for i in range(10)]
    }


def extract_text_from_pdf(file_path):
    with open(file_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        text = ""
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text


def create_gemini_prompt(text):
    return {
        "contents": [{
            "parts": [{
                "text": f"""
Please extract the following information from the lithium battery research paper. 
If information is not found, return "N/A".
1. Paper title (title)
2. First author name (first_author)
3. Current density (current) in mA/cm²
4. Capacity (capacity) in mAh/cm²
5. Electrolyte volume (electrolyte_volume) in μL
6. Li thickness (li_thickness) in micrometers
7. Temperature (temperature) in degrees Celsius. If it is done either at room temperature or 298K, that means 25°C
8. Electrolyte components (electrolytes) - return as array

RULES:
- List ALL electrolytes 
- For current, capacity, electrolyte_volume, li_thickness, temperature columns: return only numbers, no units
- Return only chemical names (no concentration/ratio)
- Sort by order of appearance in paper
- Maximum 10 main components

Text:
{text}

Return in exact JSON format:
{{  
    "title": "paper title",
    "first_author": "first author name",
    "current": "value",
    "capacity": "value",
    "electrolyte_volume": "value",
    "li_thickness": "value",
    "temperature": "value",
    "electrolytes": ["component 1", "component 2", ...]
}}
"""
            }]
        }]
    }


def send_gemini_request(api_key, prompt):
    api_url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={api_key}"
    headers = {'Content-Type': 'application/json'}
    response = requests.post(api_url, headers=headers, json=prompt)
    
    if response.status_code != 200:
        print(f"API Error: {response.status_code} - {response.text}")
        return None
    
    return response


def parse_gemini_response(response):
    try:
        response_json = response.json()
        content = response_json["candidates"][0]["content"]["parts"][0]["text"]
        
        json_match = re.search(r'\{[\s\S]*\}', content)
        if not json_match:
            print(f"JSON not found in response: {content}")
            return None
            
        result = json.loads(json_match.group())
        return result
        
    except (KeyError, IndexError, json.JSONDecodeError) as e:
        print(f"Response parsing error: {str(e)}")
        print(f"Full response: {response.text}")
        return None


def format_extracted_info(result):
    electrolytes = result.get("electrolytes", [])
    if not isinstance(electrolytes, list):
        electrolytes = [electrolytes] if electrolytes != "N/A" else []
    
    extracted_info = {
        "title": result.get("title", "N/A"),
        "first_author": result.get("first_author", "N/A"),
        "current": result.get("current", "N/A"),
        "capacity": result.get("capacity", "N/A"),
        "electrolyte_volume": result.get("electrolyte_volume", "N/A"),
        "li_thickness": result.get("li_thickness", "N/A"),
        "temperature": result.get("temperature", "N/A")
    }
    
    for i in range(10):
        key = f"electrolyte_{i+1}"
        extracted_info[key] = electrolytes[i] if i < len(electrolytes) else "N/A"
    
    return extracted_info


def extract_experiment_info_gemini(text, api_key, max_chars):
    if len(text) > max_chars:
        text = text[:max_chars] + "... [TEXT TRUNCATED]"
    
    prompt = create_gemini_prompt(text)
    response = send_gemini_request(api_key, prompt)
    
    if response is None:
        return None
    
    result = parse_gemini_response(response)
    if result is None:
        return None
    
    return format_extracted_info(result)


def extract_info_from_pdf_in_directory(directory_path, columns, api_key, max_chars):
    extracted_data = pd.DataFrame(columns=columns)
    pdf_files = [f for f in os.listdir(directory_path) if f.endswith(".pdf")]
    total_pdfs = len(pdf_files)
    
    print(f"Found {total_pdfs} PDF files to process...")
    print("=" * 50)
    
    for idx, filename in enumerate(pdf_files, 1):
        file_path = os.path.join(directory_path, filename)
        print(f"[{idx}/{total_pdfs}] Processing: {filename}", end="")
        
        try:
            text = extract_text_from_pdf(file_path)
            extracted_info = extract_experiment_info_gemini(text, api_key, max_chars)
            
            if extracted_info:
                extracted_data = pd.concat([
                    extracted_data, 
                    pd.DataFrame([extracted_info])
                ], ignore_index=True)
                print(f" - ✓ Data extracted")
            else:
                print(f" - No data extracted")
        
        except Exception as e:
            print(f" - ✗ Error: {str(e)}")
    
    print("=" * 50)
    return extracted_data


def main():
    paper_name = "paper"
    output_file = "extracted_data.csv"
    
    config = get_configuration_constants()
    directory_path = os.path.join(os.getcwd(), paper_name)
    
    data = extract_info_from_pdf_in_directory(
        directory_path, 
        config['COLUMNS'], 
        config['API_KEY'], 
        config['MAX_CHARS']
    )
    
    if not data.empty:
        print(f"Processing completed! {len(data)} records extracted")
        print("First 5 rows of extracted data:")
        print(data.head())
        data.to_csv(output_file, index=False)
        print(f"Data saved to: {output_file}")
    else:
        print("No data extracted")


if __name__ == "__main__":
    main()

Found 14 PDF files to process...
[1/14] Processing: ref_1a.pdf - ✓ Data extracted
[2/14] Processing: ref_1b.pdf - ✓ Data extracted
[3/14] Processing: ref_2a.pdfResponse parsing error: Invalid control character at: line 16 column 36 (char 628)
Full response: {
  "candidates": [
    {
      "content": {
        "parts": [
          {
            "text": "```json\n{\n    \"title\": \"Enhanced performance of lithium metal batteries via cyclic fluorinated ether based electrolytes\",\n    \"first_author\": \"Hafiz Ahmad Ishfaq\",\n    \"current\": \"0.5\",\n    \"capacity\": \"0.5\",\n    \"electrolyte_volume\": \"20\",\n    \"li_thickness\": \"110\",\n    \"temperature\": \"25\",\n    \"electrolytes\": [\n        \"2,2-bis(trifluoromethyl)-1,3-dioxolane\",\n        \"1,2-dimethoxyethane\",\n        \"lithium bis(fluorosulfonyl)imide\",\n        \"1,2-(1,1,2,2-tetrafluoroethyl) ether\",\n        \"1,1,2,2-tetrafluoroethyl-2,2,3,3-tetrafluoropropylether\",\n        \"bis(2,2,2-trifluoroethoxy