In [20]:
import os
import pandas as pd
import requests
import json
import re
from PyPDF2 import PdfReader

# The Gemini API key is read from the environment so the secret never lives in
# the notebook source or its saved outputs. Never paste a literal key here: any
# key that has ever been committed or shared must be revoked and rotated.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not API_KEY:
    print("Warning: GEMINI_API_KEY is not set; Gemini API requests will be rejected.")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={API_KEY}"
# Folder holding the PDF papers. Set PAPER_DIR to point at another location
# without editing the notebook; the default is the original local path.
paper_path = os.environ.get("PAPER_DIR", "D:/STUDY/CODE/Paper_retrieval/paper")

In [21]:
# Read every PDF paper in a directory and extract its experiment information
def extract_info_from_pdf_in_directory(directory_path):
    """Build a table of experiment information from the PDFs in a directory.

    The text of each PDF is read with ``PdfReader`` and passed to
    ``extract_experiment_info_gemini``; every truthy result becomes one row.
    Files are visited in sorted name order so the row order is reproducible,
    and the ``.pdf`` suffix is matched case-insensitively. A file that raises
    is reported on stdout and skipped, so one bad paper does not abort the run.

    Args:
        directory_path: Directory to scan (its immediate entries only).

    Returns:
        pandas.DataFrame with the columns ``title``, ``first_authors``,
        ``current``, ``capacity``, ``electrolyte_volume``, ``li_thickness``,
        ``temperature`` and ``electrolyte_1`` .. ``electrolyte_10``. The frame
        is empty (but keeps these columns) when nothing was extracted.
    """
    columns = [
        "title", "first_authors",
        "current", "capacity", "electrolyte_volume",
        "li_thickness", "temperature"
    ] + [f"electrolyte_{i+1}" for i in range(10)]

    # Collect plain records and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies it on every iteration.
    records = []

    for filename in sorted(os.listdir(directory_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(directory_path, filename)
        print(f"Processing: {filename}")

        try:
            with open(file_path, "rb") as pdf_file:
                reader = PdfReader(pdf_file)
                # Guard against extract_text() returning None for a page with
                # no extractable text, which would otherwise raise TypeError
                # and discard the whole paper.
                text = "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )

            extracted_info = extract_experiment_info_gemini(text)

            if extracted_info:
                records.append(extracted_info)

        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            print(f"Error processing {filename}: {str(e)}")

    return pd.DataFrame(records, columns=columns)

def extract_experiment_info_gemini(text, max_chars=30000, timeout=120):
    """Ask the Gemini API to extract experiment parameters from paper text.

    The text is truncated to ``max_chars`` characters (with a marker appended),
    embedded in a prompt, and POSTed to ``API_URL``. The first ``{`` .. last
    ``}`` span of the model's reply is parsed as JSON and flattened into a
    record.

    Args:
        text: Full text of the paper.
        max_chars: Maximum number of characters of ``text`` sent to the model.
        timeout: Seconds passed to ``requests.post`` as its ``timeout`` so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        dict with the keys ``title``, ``first_authors``, ``current``,
        ``capacity``, ``electrolyte_volume``, ``li_thickness``,
        ``temperature`` and ``electrolyte_1`` .. ``electrolyte_10`` (missing
        values are ``"N/A"``), or ``None`` when the request fails, the API
        answers with a non-200 status, or the reply cannot be parsed.
    """
    if len(text) > max_chars:
        text = text[:max_chars] + "... [TEXT TRUNCATED]"

    # Send the request to the Gemini API
    headers = {'Content-Type': 'application/json'}
    try:
        response = requests.post(
            API_URL,
            headers=headers,
            json=_build_gemini_prompt(text),
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Report only the exception type: the exception message can embed the
        # request URL, and that URL carries the API key as a query parameter.
        print(f"API request failed: {type(e).__name__}")
        return None

    if response.status_code != 200:
        print(f"API Error: {response.status_code} - {response.text}")
        return None

    try:
        response_json = response.json()
        content = response_json["candidates"][0]["content"]["parts"][0]["text"]

        # Take everything from the first "{" to the last "}" so that Markdown
        # fences or extra chatter around the JSON object are ignored.
        json_match = re.search(r'\{[\s\S]*\}', content)
        if not json_match:
            print(f"JSON not found in response: {content}")
            return None

        result = json.loads(json_match.group())
        return _record_from_result(result)

    except (KeyError, IndexError, TypeError, ValueError) as e:
        # ValueError covers json.JSONDecodeError (and other JSON decoder
        # errors); TypeError covers null/ill-typed nodes in the reply.
        print(f"Response parsing error: {str(e)}")
        print(f"Full response: {response.text}")
        return None


def _build_gemini_prompt(text):
    """Return the generateContent request payload whose prompt embeds ``text``."""
    # Prompt supporting multiple electrolytes
    return {
        "contents": [{
            "parts": [{
                "text": f"""
Hãy trích xuất các thông tin sau từ paper nghiên cứu về pin lithium. 
Nếu không tìm thấy thông tin, hãy trả về "N/A".
1. Tên bài báo (title)
2 Tên tác giả chính (first authors)
3. Dòng điện (current) tính theo mA/cm²
4. Dung lượng (capacity) tính theo mAh/cm²
5. Thể tích dung dịch điện phân (electrolyte volume) tính theo μL
6. Độ dày Li (Li thickness) tính theo micromet
7. Nhiệt độ (temperature) tính theo độ C
8. Các thành phần dung dịch điện phân (electrolytes) - trả về dưới dạng mảng

QUY TẮC:
- Liệt kê TẤT CẢ electrolytes 
- Các cột current, capacity, electrolyte_volume, li_thickness, temperature chỉ trả về số, không kèm đơn vị
- Chỉ trả về tên hóa học (không kèm nồng độ/tỷ lệ)
- Sắp xếp theo thứ tự xuất hiện trong bài
- Giới hạn tối đa 10 thành phần chính

Văn bản:
{text}

Trả lời theo định dạng JSON chính xác:
{{  
    "title": "tên bài báo",
    "first_authors": "tên tác giả chính",
    "current": "giá trị",
    "capacity": "giá trị",
    "electrolyte_volume": "giá trị",
    "li_thickness": "giá trị",
    "temperature": "giá trị",
    "electrolytes": ["thành phần 1", "thành phần 2", ...]
}}
"""
            }]
        }]
    }


def _record_from_result(result):
    """Flatten the parsed JSON object into the fixed-width record dict."""
    electrolytes = result.get("electrolytes")
    if electrolytes is None or electrolytes == "N/A":
        # A missing key, JSON null and the "N/A" sentinel all mean "none".
        electrolytes = []
    elif not isinstance(electrolytes, list):
        # A single bare value is treated as a one-element list.
        electrolytes = [electrolytes]

    scalar_keys = (
        "title", "first_authors", "current", "capacity",
        "electrolyte_volume", "li_thickness", "temperature",
    )
    extracted_info = {}
    for key in scalar_keys:
        value = result.get(key)
        # JSON null is normalised to the same "N/A" marker as a missing key.
        extracted_info[key] = "N/A" if value is None else value

    # Always emit exactly ten electrolyte slots, padding with "N/A".
    for i in range(10):
        key = f"electrolyte_{i+1}"
        extracted_info[key] = electrolytes[i] if i < len(electrolytes) else "N/A"

    return extracted_info


# Run the extraction over the papers found under paper_path
data = extract_info_from_pdf_in_directory(paper_path)

# Display the results and save them to CSV
if data.empty:
    print("No data extracted")
else:
    print("\nExtracted Data:", data, sep="\n")
    data.to_csv("extracted_data.csv", index=False)
    print("Data saved to extracted_data.csv")

Processing: 2507.02334v1.pdf
Processing: mmc1.pdf
Processing: ref_1a.pdf
Processing: ref_1b.pdf
Processing: ref_2a.pdf
Processing: ref_3a.pdf
Processing: ref_4a.pdf
Processing: ref_4b.pdf
Processing: ref_5a.pdf
Processing: ref_5b.pdf
Processing: ref_6a.pdf
Processing: ref_6b.pdf
Processing: ref_7a.pdf
Processing: ref_7b.pdf
Processing: ref_8a.pdf
Processing: ref_8b.pdf

Extracted Data:
                                                title       first_authors  \
0   High-ThroughputNEBforLi-IonConductorDiscoveryv...       Jingchen Lian   
1   High-Efﬁciency Lithium Metal Batteries with Fi...          Shuru Chen   
2   Lithium Difluorophosphate-Based Dual-Salt Low ...           Hao Zheng   
3   Lithium Difluorophosphate-Based Dual-Salt Low ...           Hao Zheng   
4   Enhanced performance of lithium metal batterie...  Hafiz Ahmad Ishfaq   
5   Enhancing Cycling Stability of Lithium Metal B...     Thanh-Nhan Tran   
6   An All-Fluorinated Ester Electrolyte for Stabl...       John Holoube

In [22]:
# Bare expression: the notebook renders whatever `data` is currently bound to
# in the kernel as this cell's output.
data

Unnamed: 0,title,first_authors,current,capacity,electrolyte_volume,li_thickness,temperature,electrolyte_1,electrolyte_2,electrolyte_3,electrolyte_4,electrolyte_5,electrolyte_6,electrolyte_7,electrolyte_8,electrolyte_9,electrolyte_10
0,High-ThroughputNEBforLi-IonConductorDiscoveryv...,Jingchen Lian,,,,,800.0,Li1+xAlxTi2−x(PO4)3,LiMgPO4,LiTiPO5,Li0.5Mg0.5Al0.5PO4,Li0.5TiPO4.5F0.5,Li2MnO3,LiTi2(PO4)3,,,
1,High-Efﬁciency Lithium Metal Batteries with Fi...,Shuru Chen,0.5,5.0,,,25.0,LiFSI,TEP,BTFE,LiPF6,,,,,,
2,Lithium Difluorophosphate-Based Dual-Salt Low ...,Hao Zheng,2.0,,,,25.0,LiDFP,LiBOB,LiFSI,LiTFSI,LiPF6,CO2,SO2,CsPF6,RbF,NaPF6
3,Lithium Difluorophosphate-Based Dual-Salt Low ...,Hao Zheng,1.0,1.5,,100.0,,LiPF6,LiDFP,LiBOB,LiFSI,LiTFSI,,,,,
4,Enhanced performance of lithium metal batterie...,Hafiz Ahmad Ishfaq,0.5,0.5,20.0,110.0,80.0,"2,2-bis(trifluoromethyl)-1,3-dioxolane","1,2-dimethoxyethane",Lithium bis(fluorosulfonyl)imide,"1,2-(1,1,2,2-tetrafluoroethyl) ether","1,1,2,2-tetrafluoroethyl-2,2,3,3-tetrafluoropr...","bis(2,2,2-trifluoroethoxy) methane","2-(2,2,2-trifluoroethoxy) -4-(trifluoromethyl)...","4-(trifluoromethyl) -1,3-dioxolane","2,2-dimethoxy-4-(trifluoromethyl) -1,3-dioxolane","1,2-bis(2,2,2-trifluoroethoxy) ethane"
5,Enhancing Cycling Stability of Lithium Metal B...,Thanh-Nhan Tran,0.5,4.0,,50.0,,LiFSI,DME,BTFEE,TTE,,,,,,
6,An All-Fluorinated Ester Electrolyte for Stabl...,John Holoubek,0.5,161.0,,,-40.0,LiPF6,"methyl 3,3,3-tri fluoropionate",fluoroethylene carbonate,methyl propionate,ethylene carbonate,diethyl carbonate,,,,
7,An All-Fluorinated Ester Electrolyte for Stabl...,John Holoubek,,0.84,,,23.0,Fluoroethylene carbonate,"Methyl 3,3,3-trifuoloropropionate",LiPF6,Methyl propionate,Ethylene carbonate,Diethyl carbonate,,,,
8,Role of inner solvation sheath within salt –so...,Xiaodi Ren,,,,,,LiFSI,DMC,TMS,TEP,DME,TTE,,,,
9,Role of Inner Solvation Sheath within Salt -So...,Xiaodi Ren,0.5,1.0,,,25.0,LiFSI,DMC,TTE,TMS,TEP,DME,,,,
