In [1]:
import os
import pandas as pd
import requests
import json
import re
from PyPDF2 import PdfReader


def get_configuration_constants():
    """Build the configuration dictionary used by the extraction pipeline.

    API keys are read from the ``GEMINI_API_KEY`` and ``OPENAI_API_KEY``
    environment variables so that no credential is stored in the notebook.
    A variable that is not set yields an empty string.

    Returns:
        dict: with the keys
            ``GEMINI_API_KEY`` (str), ``OPENAI_API_KEY`` (str),
            ``MAX_CHARS`` (int) - character limit applied to the paper text,
            ``COLUMNS`` (list[str]) - eight metadata column names followed
            by ``E_1`` ... ``E_15``.
    """
    metadata_columns = [
        "title", "first_author", "current_1", "capacity_1",
        "current_2", "capacity_2", "electrolyte_volume", "li_thickness",
    ]
    electrolyte_columns = [f"E_{i+1}" for i in range(15)]

    return {
        # Secrets come from the environment, never from source control.
        'GEMINI_API_KEY': os.environ.get("GEMINI_API_KEY", ""),
        'OPENAI_API_KEY': os.environ.get("OPENAI_API_KEY", ""),
        'MAX_CHARS': 30000,
        'COLUMNS': metadata_columns + electrolyte_columns,
    }


def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF file.

    Args:
        file_path: path of the PDF file to open.

    Returns:
        str: the text of each page, in page order, each followed by a
        newline. An empty string when the document has no pages.
    """
    with open(file_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        # A page may yield no text object (e.g. an image-only page with some
        # PyPDF2 versions); treat that as empty text instead of failing on
        # `None + "\n"`.
        page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
    return "".join(page_texts)


def find_si_file(main_filename, si_directory):
    """Locate the supporting-information (SI) PDF that matches a main paper.

    Main papers are expected to be named ``XXXa_Y.pdf`` and their SI file
    ``XXXb_Y.pdf``, where ``XXX`` can be any name.

    Args:
        main_filename: file name of the main paper (e.g. ``"Lia_1.pdf"``).
        si_directory: directory that holds the SI PDFs.

    Returns:
        str | None: the SI file name (not a full path) if a matching PDF
        exists in ``si_directory``; ``None`` if the directory is missing,
        the main name contains no ``a_`` marker, or no candidate exists.
    """
    # isdir rather than exists: a regular file at this path would make
    # os.listdir raise NotADirectoryError.
    if not os.path.isdir(si_directory):
        return None

    main_basename = os.path.splitext(main_filename)[0]
    if 'a_' not in main_basename:
        return None

    si_files = {f for f in os.listdir(si_directory) if f.endswith(".pdf")}

    # First candidate: every 'a_' replaced (the historical behaviour).
    candidates = [main_basename.replace('a_', 'b_')]
    # Second candidate: only the last 'a_' replaced, so that a name such as
    # 'data_a_2' maps to 'data_b_2' instead of 'datb_b_2'.
    head, _, tail = main_basename.rpartition('a_')
    last_only = f"{head}b_{tail}"
    if last_only not in candidates:
        candidates.append(last_only)

    for si_basename in candidates:
        si_candidate = f"{si_basename}.pdf"
        if si_candidate in si_files:
            return si_candidate

    return None


def extract_combined_text_from_files(main_file_path, si_file_path=None):
    """Concatenate the text of a main paper and, optionally, of its SI file.

    Every document that can be read contributes its text followed by a blank
    line. A document that fails to read is reported on stdout and skipped, so
    the function itself does not raise for unreadable PDFs.

    Args:
        main_file_path: path of the main paper PDF.
        si_file_path: path of the supporting-information PDF; ignored when it
            is falsy or when the path does not exist.

    Returns:
        str: main-paper text followed by SI text, each terminated by two
        newlines; an empty string when nothing could be read.
    """
    sections = []

    # Main paper
    try:
        sections.append(extract_text_from_pdf(main_file_path) + "\n\n")
    except Exception as err:
        print(f"Error reading main paper: {err!s}")

    # Supporting information, only when a readable path was supplied
    has_si = bool(si_file_path) and os.path.exists(si_file_path)
    if has_si:
        try:
            sections.append(extract_text_from_pdf(si_file_path) + "\n\n")
        except Exception as err:
            print(f"Error reading SI: {err!s}")

    return "".join(sections)


def create_extraction_prompt(text):
    """Build the LLM prompt that asks for battery-experiment metadata.

    The prompt consists of fixed extraction instructions, the supplied paper
    text, and a description of the JSON object the model must return.

    Args:
        text: combined text of the main paper and supporting information;
            inserted verbatim between the instructions and the JSON template.

    Returns:
        str: the complete prompt.
    """
    # Fixed instructions that precede the paper text.
    instructions = """
Please extract the following information from the lithium battery-related scientific paper.
This text contains BOTH the main paper AND supporting information combined.
Search through ALL the text to find the required information.

If information is not found anywhere in the combined text, return "N/A".

1. Paper title (title)
2. First author name (first_author)
3. Current density 1 (current_1) in mA/cm² for coulombic efficiency measurements.
4. Capacity 1 (capacity_1) in mAh/cm² for coulombic efficiency measurements
5. Current density 2 (current_2) in mA/cm² for coulombic efficiency measurements- second current density value if available.
6. Capacity 2 (capacity_2) in mAh/cm² for coulombic efficiency measurements - second capacity value if available.
7. Electrolyte amount (electrolyte_volume) in μL
8. Thickness of lithium foil (li_thickness) in micrometers for coulombic efficiency measurements.
9. Electrolyte formulations (electrolytes) - return as array

RULES:
- Search through the ENTIRE combined text (main paper + SI) for all information
- For current_1, capacity_1, current_2, capacity_2, electrolyte_volume, li_thickness columns: return only numbers, no units
- List ALL unique formulations
- A unique formulation MUST contain at least one Li-salt and one solvent
- Maximum 15 electrolyte formulations

USE THE FOLLOWING FORMAT FOR ELECTROLYTE FORMULATIONS:
- use Li-salt name first, then solvent(s)
- use M for molarity (e.g., 1M, 2M)
- use + to separate Li-salt and solvent
- use + to separate additives
- Use v/v for volume ratio
- use wt% for weight percentage
- Use mol/mol for molar ratio
- Use () to indicate ratios

Combined Text (Main Paper + Supporting Information):
"""

    # JSON template that follows the paper text. Written line by line so the
    # two spaces after the opening brace are explicit and survive editing.
    answer_lines = [
        "",
        "Return in exact JSON format:",
        "{  ",
        '    "title": "paper title",',
        '    "first_author": "first author name",',
        '    "current_1": "value",',
        '    "capacity_1": "value",',
        '    "current_2": "value",',
        '    "capacity_2": "value",',
        '    "electrolyte_volume": "value",',
        '    "li_thickness": "value",',
        '    "electrolytes": ["formulation 1", "formulation 2", ...]',
        "}",
        "",
    ]
    response_format = "\n".join(answer_lines)

    return instructions + f"{text}" + "\n" + response_format


def send_request(api_key, text, model_type, timeout=120):
    """Send the extraction prompt for ``text`` to the selected LLM API.

    Args:
        api_key: credential for the chosen provider.
        text: paper text that is embedded into the extraction prompt.
        model_type: ``"gemini"`` or ``"openai"``.
        timeout: seconds to wait for the HTTP exchange before giving up.

    Returns:
        The ``requests`` response object when the API answers with HTTP 200;
        ``None`` for an unknown ``model_type``, a network failure, or a
        non-200 status (a diagnostic line is printed in each case).
    """
    if model_type == "gemini":
        payload = {
            "contents": [{
                "parts": [{
                    "text": create_extraction_prompt(text)
                }]
            }],
            "generationConfig": {
                "temperature": 0.1,
                "topP": 0.95,
                "topK": 39,
            },
        }
        api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
        # The key travels in a header, not in the URL query string, so it
        # cannot leak through URLs embedded in error messages or logs.
        headers = {
            'Content-Type': 'application/json',
            'x-goog-api-key': api_key,
        }

    elif model_type == "openai":
        payload = {
            "model": "gpt-4o-mini",
            "messages": [
                {
                    "role": "user",
                    "content": create_extraction_prompt(text)
                }
            ],
            "temperature": 0.1,
            "top_p": 0.95
        }
        api_url = "https://api.openai.com/v1/chat/completions"
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {api_key}'
        }

    else:
        # Without this branch the request variables would be unbound below.
        print(f"Unknown model: {model_type}")
        return None

    try:
        # A timeout keeps a stalled connection from blocking the whole batch.
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        print(f"{model_type.upper()} request failed: {e}")
        return None

    if response.status_code != 200:
        print(f"{model_type.upper()} API Error: {response.status_code} - {response.text}")
        return None

    return response


def parse_response(response, model_type):
    """Extract the JSON object embedded in an LLM API response.

    Args:
        response: HTTP response exposing ``json()`` and ``text``.
        model_type: ``"gemini"`` or ``"openai"``; selects where the generated
            text is located inside the response payload.

    Returns:
        The decoded JSON object found in the generated text, or ``None`` when
        the model type is unknown, the payload has an unexpected shape, or no
        valid JSON object can be decoded (a diagnostic is printed).
    """
    try:
        response_json = response.json()

        if model_type == "gemini":
            content = response_json["candidates"][0]["content"]["parts"][0]["text"]
        elif model_type == "openai":
            content = response_json["choices"][0]["message"]["content"]
        else:
            print(f"Unknown model: {model_type}")
            return None

        # The generated text can be null; a regex search on None would raise.
        if not isinstance(content, str):
            print(f"JSON not found in {model_type} response: {content}")
            return None

        json_match = re.search(r'\{[\s\S]*\}', content)
        if not json_match:
            print(f"JSON not found in {model_type} response: {content}")
            return None

        try:
            return json.loads(json_match.group())
        except json.JSONDecodeError as greedy_error:
            # The greedy match runs from the first '{' to the last '}', so
            # braces in commentary after the object break it. Fall back to
            # decoding the first complete JSON object in the text.
            decoder = json.JSONDecoder()
            start = content.find('{')
            while start != -1:
                try:
                    result, _ = decoder.raw_decode(content, start)
                    return result
                except json.JSONDecodeError:
                    start = content.find('{', start + 1)
            raise greedy_error

    # ValueError covers json.JSONDecodeError as well as a non-JSON HTTP body;
    # TypeError covers payloads whose nested fields are null.
    except (KeyError, IndexError, TypeError, ValueError) as e:
        print(f"{model_type} response parsing error: {str(e)}")
        print(f"Full response: {response.text}")
        return None


def format_extracted_info(result, max_electrolytes=15):
    """Flatten the model's JSON answer into one row of the output table.

    Args:
        result: dict decoded from the model's answer. The keys read are
            ``title``, ``first_author``, ``current_1``, ``capacity_1``,
            ``current_2``, ``capacity_2``, ``electrolyte_volume``,
            ``li_thickness`` and ``electrolytes``.
        max_electrolytes: number of ``E_<n>`` columns to emit; additional
            formulations are dropped.

    Returns:
        dict: the eight scalar fields followed by ``E_1`` ...
        ``E_<max_electrolytes>``. Any value that is missing or null is
        reported as ``"N/A"``.
    """
    scalar_fields = (
        "title", "first_author", "current_1", "capacity_1",
        "current_2", "capacity_2", "electrolyte_volume", "li_thickness",
    )

    def _or_na(value):
        # JSON null decodes to None; the table uses "N/A" for unknown values.
        return "N/A" if value is None else value

    electrolytes = result.get("electrolytes")
    if electrolytes is None or electrolytes == "N/A":
        electrolytes = []
    elif not isinstance(electrolytes, list):
        # A single formulation returned as a bare value instead of an array.
        electrolytes = [electrolytes]

    extracted_info = {field: _or_na(result.get(field)) for field in scalar_fields}

    for i in range(max_electrolytes):
        key = f"E_{i+1}"
        extracted_info[key] = _or_na(electrolytes[i]) if i < len(electrolytes) else "N/A"

    return extracted_info


def extract_experiment_info(text, config):
    """Run one paper's text through the configured LLM and format the answer.

    Args:
        text: combined paper text. Text longer than ``config['MAX_CHARS']``
            is cut to that length and a truncation marker is appended.
        config: dict providing ``MAX_CHARS``, the API key of the selected
            provider (``GEMINI_API_KEY`` or ``OPENAI_API_KEY``) and,
            optionally, ``USE_MODEL`` (defaults to ``'gemini'``;
            case-insensitive).

    Returns:
        dict | None: the formatted row, or ``None`` when the model name is
        unknown, the request fails, or the answer cannot be parsed.
    """
    model_name = config.get('USE_MODEL', 'gemini').lower()

    # Cap the amount of text sent to the model.
    text_length = len(text)
    limit = config['MAX_CHARS']
    if text_length > limit:
        text = text[:limit] + "... [TEXT TRUNCATED]"

    # Config entry that holds the credential for each supported model.
    key_fields = {'gemini': 'GEMINI_API_KEY', 'openai': 'OPENAI_API_KEY'}
    if model_name not in key_fields:
        print(f"Unknown model: {model_name}")
        return None
    api_key = config[key_fields[model_name]]

    response = send_request(api_key, text, model_name)
    if response is None:
        return None

    parsed = parse_response(response, model_name)
    return None if parsed is None else format_extracted_info(parsed)


def extract_info_from_pdf_in_directory(directory_path, si_directory_path, columns, config):
    """Extract experiment data from every PDF in a directory.

    For each ``.pdf`` file in ``directory_path`` the matching SI file is
    looked up in ``si_directory_path``, both texts are combined and sent to
    the configured model. Progress is printed for every file. A file that
    raises or yields no data is reported and skipped.

    Args:
        directory_path: directory containing the main paper PDFs.
        si_directory_path: directory containing the SI PDFs.
        columns: column names of the resulting table, in order.
        config: configuration dict forwarded to ``extract_experiment_info``.

    Returns:
        pandas.DataFrame: one row per successfully processed paper, with
        ``columns`` first and any additional extracted keys after them.
        An empty frame with ``columns`` when nothing was extracted.
    """
    # Sorted so that row order does not depend on the arbitrary order in
    # which os.listdir returns directory entries.
    pdf_files = sorted(f for f in os.listdir(directory_path) if f.endswith(".pdf"))
    total_pdfs = len(pdf_files)

    print(f"Found {total_pdfs} PDF files to process...")
    print("=" * 50)

    # Rows are collected in a list and turned into a frame once at the end;
    # concatenating inside the loop copies the whole table on every paper.
    records = []

    for idx, filename in enumerate(pdf_files, 1):
        file_path = os.path.join(directory_path, filename)
        si_filename = find_si_file(filename, si_directory_path)
        si_file_path = os.path.join(si_directory_path, si_filename) if si_filename else None

        print(f"[{idx}/{total_pdfs}] Processing: {filename}", end="")
        if si_filename:
            print(f" + {si_filename}", end="")

        try:
            text = extract_combined_text_from_files(file_path, si_file_path)
            extracted_info = extract_experiment_info(text, config)

            if extracted_info:
                records.append(extracted_info)
                print(f" - ✓ Data extracted")
            else:
                print(f" - No data extracted")

        # Best effort: one failing paper must not abort the whole batch.
        except Exception as e:
            print(f" - ✗ Error: {str(e)}")

    print("=" * 50)

    if not records:
        return pd.DataFrame(columns=columns)

    extracted_data = pd.DataFrame(records)
    # Requested columns first, then any extra keys the records carry.
    ordered_columns = list(columns) + [c for c in extracted_data.columns if c not in columns]
    return extracted_data.reindex(columns=ordered_columns)


def run_extraction(paper_name, si_paper_name, output_file, use_model):
    """Process a folder of papers and write the extracted table to CSV.

    Args:
        paper_name: directory with the main paper PDFs, resolved against the
            current working directory.
        si_paper_name: directory with the SI PDFs, resolved the same way; a
            missing directory only produces a warning.
        output_file: CSV file name; the model name is inserted before the
            extension (``"1.csv"`` becomes ``"1_gemini.csv"``).
        use_model: model identifier stored in the config as ``USE_MODEL``.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    config = get_configuration_constants()
    config['USE_MODEL'] = use_model

    # Tag the output file name with the model that produced it
    stem, extension = os.path.splitext(output_file)
    output_file_with_model = f"{stem}_{use_model}{extension}"

    directory_path, si_directory_path = (
        os.path.join(os.getcwd(), folder) for folder in (paper_name, si_paper_name)
    )

    if not os.path.exists(directory_path):
        print(f"Error: Main paper directory '{paper_name}' not found")
        return

    if not os.path.exists(si_directory_path):
        print(f"Warning: Supporting information directory '{si_paper_name}' not found")

    data = extract_info_from_pdf_in_directory(
        directory_path, si_directory_path, config['COLUMNS'], config
    )

    if data.empty:
        print("No data extracted")
        return

    print(f"Processing completed! {len(data)} records extracted")
    data.to_csv(output_file_with_model, index=False)
    print(f"Data saved to: {output_file_with_model}")


def main():
    """Run the extraction with this notebook's default folders and model."""
    settings = {
        "paper_name": "paper",
        "si_paper_name": "paper_SI",
        "output_file": "1.csv",
        "use_model": "gemini",  # Options: "gemini" or "openai"
    }
    run_extraction(**settings)


# Start the pipeline only when this code runs as the top-level program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Found 8 PDF files to process...
[1/8] Processing: Lia_1.pdf + Lib_1.pdf - ✓ Data extracted
[2/8] Processing: Lia_2.pdf + Lib_2.pdf - ✓ Data extracted
[3/8] Processing: Lia_3.pdf + Lib_3.pdf - ✓ Data extracted
[4/8] Processing: Lia_4.pdf + Lib_4.pdf - ✓ Data extracted
[5/8] Processing: Lia_5.pdf + Lib_5.pdf - ✓ Data extracted
[6/8] Processing: Lia_6.pdf + Lib_6.pdf - ✓ Data extracted
[7/8] Processing: Lia_7.pdf + Lib_7.pdf - ✓ Data extracted
[8/8] Processing: Lia_8.pdf + Lib_8.pdf - ✓ Data extracted
Processing completed! 8 records extracted
Data saved to: 1_gemini.csv


In [None]:
# SECURITY: personal Gemini API keys must not be stored in the notebook.
# Each user supplies their own key through an environment variable; any key
# that was ever saved here in plain text should be revoked and regenerated.
Thinh = os.environ.get("GEMINI_API_KEY_THINH", "")
Minh = os.environ.get("GEMINI_API_KEY_MINH", "")
M = os.environ.get("GEMINI_API_KEY_M", "")