In [94]:
import getpass
import os
from typing import Optional, List, Dict

import pandas as pd
import pdf2doi
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import tool
from pdfminer.high_level import extract_text
from typing_extensions import Annotated, TypedDict

In [85]:
# Folder that holds the files to process.
# NOTE(review): `dir` shadows the built-in dir(); the name is kept because the
# file-listing cell below reads it.
dir = './SSWLAB_논문'

In [86]:
# Names of the PDF files inside the papers folder.
# - Only *.pdf entries are kept, so stray items (.DS_Store, .ipynb_checkpoints,
#   notes, ...) are never uploaded for OCR further down.
# - os.listdir returns entries in arbitrary order; sorting makes the processing
#   order, and therefore the row order of the final table, reproducible.
sswlab_files = sorted(
    file for file in os.listdir(dir) if file.lower().endswith('.pdf')
)

In [116]:
class PaperInfo(TypedDict):
    """Structured summary of a single research paper."""

    # Schema handed to `with_structured_output`. Every field uses the form
    #   Annotated[<type>, <default>, <description>]
    # where `...` as the default marks the field as required.
    # Optional list fields default to None (with an Optional[...] type) rather
    # than to a placeholder string, so the declared default matches the
    # declared type and a column never mixes lists with strings.
    # NOTE(review): the class docstring is kept to one line because LangChain
    # forwards it to the model as the schema description.

    # --- Identification -----------------------------------------------------
    title: Annotated[str, ..., 'Title of the research paper']
    author: Annotated[str, ..., 'Authors of the paper in (Name) et al. format']
    id: Annotated[str, ..., 'DOI or arXiv ID']

    # --- Resources ----------------------------------------------------------
    data: Annotated[List[str], ..., 'Data used']
    code_link: Annotated[str, 'None provided', 'Link to code (Github, etc)']
    packages_used: Annotated[Optional[List[str]], None, 'Packages or libraries used.']

    # --- What the paper does ------------------------------------------------
    task: Annotated[str, ..., 'Task performed in paper (e.g. flare prediction, CME detection, etc.)']
    abstract: Annotated[str, ..., 'Summary of abstract']
    models: Annotated[List[str], ..., 'Model architecture(s) used e.g. CNN, ResNet, pix2pix, GAN, etc.']
    hybrid_model: Annotated[bool, ..., 'Whether hybrid model architectures were used.']
    multimodal: Annotated[Optional[List[str]], None, 'Type of multimodal models used, if any.']
    baselines: Annotated[Optional[List[str]], None, 'Baseline models or papers used.']
    preprocessing: Annotated[List[str], ..., 'Preprocessing steps taken']
    citations: Annotated[List[str], ..., 'Citations']
    approach_used: Annotated[List[str], ..., 'Approach(es) used']
    methodology: Annotated[str, ..., 'Summary of methodology']

    # --- Evaluation ---------------------------------------------------------
    metrics: Annotated[List[Dict[str, str]], ..., 'Metrics used for evaluation and the scores obtained']
    limitations: Annotated[List[str], ..., 'Limitations of the research']
    reproducibility: Annotated[float, ..., 'From 0-5, based on the completeness of the method, code and data, how reproducible is the paper?']
    key_points: Annotated[Dict[str, str], ..., 'Key strengths and weaknesses']
    reuse_potential: Annotated[Optional[List[str]], None, 'Notes on potential for reuse (if applicable)']

In [117]:
# Read the local .env file *before* the chat model is built. In top-to-bottom
# order the Mistral client cell (which also calls load_dotenv) comes later, so
# on a fresh "Restart & Run All" the model would otherwise be constructed
# before MISTRAL_API_KEY has been loaded (unless it is already exported in the
# shell). load_dotenv() does not override variables that are already set, so
# the second call further down is harmless.
load_dotenv()
llm = init_chat_model('mistral-large-latest', model_provider='mistralai')

In [118]:
# Wrap the chat model so that its replies follow the PaperInfo schema.
structured_llm = llm.with_structured_output(PaperInfo)

In [8]:
from mistralai import Mistral
from dotenv import load_dotenv
import datauri

In [9]:
# Load variables from a local .env file into the process environment.
load_dotenv()
# Direct Mistral SDK client, used by the upload/OCR helpers below.
# os.environ[...] raises KeyError when MISTRAL_API_KEY is not set.
client = Mistral(api_key=os.environ['MISTRAL_API_KEY'])

In [None]:
def upload_pdf(file, directory='./SSWLAB_논문'):
    """Upload one PDF to Mistral for OCR and return a signed URL to it.

    Args:
        file: File name of the PDF (not a full path). It is also sent as the
            name of the uploaded file.
        directory: Folder that contains ``file``. Defaults to the papers
            folder used by this notebook, so existing one-argument calls keep
            working.

    Returns:
        The ``url`` attribute of the signed-URL response for the uploaded file.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    # Close the handle as soon as the upload has finished so that no file
    # descriptor is leaked per uploaded paper.
    with open(os.path.join(directory, file), 'rb') as pdf_file:
        pdf = client.files.upload(
            file={
                'file_name': file,
                'content': pdf_file,
            },
            purpose='ocr',
        )

    signed_url = client.files.get_signed_url(file_id=pdf.id)
    return signed_url.url

In [11]:
def extract_pdf(file):
    """Run Mistral OCR over one PDF and return the raw OCR response.

    The file is uploaded through ``upload_pdf`` and the resulting signed URL
    is passed to the 'mistral-ocr-latest' model as a ``document_url`` source.

    Args:
        file: File name forwarded unchanged to ``upload_pdf``.

    Returns:
        Whatever ``client.ocr.process`` returns, unmodified.
    """
    source_document = {
        'type': 'document_url',
        'document_url': upload_pdf(file),
    }
    return client.ocr.process(
        model='mistral-ocr-latest',
        document=source_document,
        include_image_base64=True,
    )

In [12]:
# OCR every collected file and store its per-page markdown next to the file name.
papers = []
for file in sswlab_files:
    pdf = extract_pdf(file)
    # One markdown string per page, in the order the OCR response lists them.
    content = [page.markdown for page in pdf.pages]
    papers.append(dict(file_name=file, content=content))

In [13]:
# Collapse each paper's list of page texts into one newline-separated string.
# The isinstance guard makes this cell safe to re-run: calling '\n'.join on an
# already joined string would put a newline between every single character.
for pdf in papers:
    content = pdf['content']
    if not isinstance(content, str):
        content = '\n'.join(content)
    pdf['content'] = content

In [24]:
# Two-message chat prompt: a fixed system instruction followed by a human
# message whose {paper} placeholder is filled in when the prompt is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        ('system', "Extract information from the following research paper."),
        ('human', '{paper}')    
    ]
)

In [119]:
# Pipeline: the prompt's output is piped into the structured-output model.
fin_llm = prompt | structured_llm

In [None]:
# Run the extraction pipeline once per paper, keeping the results in input order.
data = [fin_llm.invoke({'paper': paper['content']}) for paper in papers]

In [None]:
# Tabulate the extraction results.
# Presumably one row per paper and one column per PaperInfo field, assuming
# each result is a dict — verify against the pipeline output.
df = pd.DataFrame(data)

In [None]:
# Show the resulting table as the cell output.
df

[{'title': 'Generation of Modern Satellite Data from Galileo Sunspot Drawings in 1612 by Deep Learning',
  'author': 'Lee et al.',
  'id': '10.3847/1538-4357/abd701',
  'data': ['Mount Wilson Observatory (MWO) sunspot drawings (2011-2015)',
   'Solar Dynamics Observatory (SDO)/Helioseismic and Magnetic Imager (HMI) line-of-sight magnetograms (2011-2015)',
   'SDO/Atmospheric Imaging Assembly (AIA) seven wavelength images (94, 131, 171, 193, 211, 304, and 335 Å) (2011-2015)',
   'Galileo sunspot drawings (1612 June 2 to July 8)'],
  'code_link': 'None provided',
  'packages_used': ['NumPy', 'Keras', 'TensorFlow', 'SunPy'],
  'task': 'Generation of modern satellite-like solar magnetograms and EUV images from historical sunspot drawings',
  'abstract': "This study presents a deep learning-based approach to generate modern satellite-like solar magnetograms and EUV images from historical sunspot drawings. The authors use a conditional generative adversarial network (cGAN) model, specificall