In [1]:
from langchain.chat_models import init_chat_model

from pdfminer.high_level import extract_text
from typing import Optional, List, Dict
from typing_extensions import Annotated, TypedDict
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.tools import tool

import pandas as pd
import getpass
import pdf2doi
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder whose entries are listed and sent to OCR in the cells below
# (relative to the notebook's working directory).
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest
# of the session.
dir = './SSWLAB_논문'

In [3]:
# Collect the names of all entries directly inside `dir`.
# os.listdir() returns names in arbitrary order, so sort them to keep the
# processing order (and the row order of the final results) reproducible
# across runs and machines.
sswlab_files = sorted(os.listdir(dir))

In [4]:
class PaperInfo(TypedDict):
    """Structured summary of a single research paper."""

    # Each field uses the Annotated[type, default, description] layout that
    # LangChain's with_structured_output() reads from a TypedDict schema:
    # `...` marks a required field, any other default makes the field optional.
    # List-valued optional fields are declared Optional[...] with a None
    # default so the advertised default always agrees with the declared type
    # (a string default such as 'N/A' on a List[str] field does not).
    title: Annotated[str, ..., 'Title of the research paper']
    author: Annotated[str, ..., 'Authors of the paper in (Name) et al. format']
    id: Annotated[str, ..., 'DOI or arXiv ID']
    data: Annotated[List[str], ..., 'Data used']
    code_link: Annotated[str, 'None provided', 'Link to code (Github, etc)']
    packages_used: Annotated[Optional[List[str]], None, 'Packages or libraries used.']
    task: Annotated[str, ..., 'Task performed in paper (e.g. flare prediction, CME detection, etc.)']
    abstract: Annotated[str, ..., 'Summary of abstract']
    models: Annotated[List[str], ..., 'Model architecture(s) used e.g. CNN, ResNet, pix2pix, GAN, etc.']
    hybrid_model: Annotated[bool, ..., 'Whether hybrid model architectures were used.']
    multimodal: Annotated[Optional[List[str]], None, 'Type of multimodal models used, if any.']
    baselines: Annotated[Optional[List[str]], None, 'Baseline models or papers used.']
    preprocessing: Annotated[List[str], ..., 'Preprocessing steps taken']
    citations: Annotated[List[str], ..., 'Citations']
    approach_used: Annotated[List[str], ..., 'Approach(es) used']
    methodology: Annotated[str, ..., 'Summary of methodology']
    metrics: Annotated[List[Dict[str, str]], ..., 'Metrics used for evaluation and the scores obtained']
    limitations: Annotated[List[str], ..., 'Limitations of the research']
    reproducibility: Annotated[float, ..., 'From 0-5, based on the completeness of the method, code and data, how reproducible is the paper?']
    key_points: Annotated[Dict[str, str], ..., 'Key strengths and weaknesses']
    reuse_potential: Annotated[Optional[List[str]], None, 'Notes on potential for reuse (if applicable)']

In [5]:
# Instantiate the chat model used for extraction: 'mistral-large-latest'
# served through the 'mistralai' provider.
# NOTE(review): load_dotenv() is only called in a later cell — confirm the
# provider's API key is already available here on a fresh kernel.
llm = init_chat_model('mistral-large-latest', model_provider='mistralai')

In [6]:
# Wrap the chat model so that its replies are returned in the PaperInfo schema.
structured_llm = llm.with_structured_output(PaperInfo)

In [13]:
# Read listup.csv and turn every row into a {'file_name', 'content'} record.
# NOTE(review): `papers` is assigned again by the OCR loop further down, so
# only one of the two cells is meant to feed the extraction step per run.
papers_df = pd.read_csv('listup.csv')
papers = [
    {'file_name': name, 'content': text}
    for name, text in zip(papers_df['file_name'], papers_df['content'])
]

In [8]:
from mistralai import Mistral
from dotenv import load_dotenv
import datauri

In [9]:
# Load variables from a local .env file into the process environment, then
# build the Mistral client from MISTRAL_API_KEY.
# The os.environ[...] lookup raises KeyError if the variable is not set.
load_dotenv()
client = Mistral(api_key=os.environ['MISTRAL_API_KEY'])

In [None]:
def upload_pdf(file):
    """Upload one file from the paper folder to Mistral and return its signed URL.

    Args:
        file: File name (not a full path) located inside './SSWLAB_논문'.

    Returns:
        The `url` attribute of the signed-URL object returned for the upload.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # Open inside a context manager so the handle is closed as soon as the
    # upload call returns, instead of leaking one open descriptor per paper.
    with open(f'./SSWLAB_논문/{file}', 'rb') as pdf_file:
        pdf = client.files.upload(
            file={
                'file_name': file,
                'content': pdf_file
            },
            purpose='ocr'
        )

    signed_url = client.files.get_signed_url(file_id=pdf.id)
    return signed_url.url

In [11]:
def extract_pdf(file):
    """Run OCR on one paper and return the raw OCR response.

    The file is uploaded through upload_pdf(), and the URL it returns is
    handed to the 'mistral-ocr-latest' model as a 'document_url' document,
    with base64-encoded images requested in the response.

    Args:
        file: File name accepted by upload_pdf().

    Returns:
        The object returned by client.ocr.process().
    """
    document = {
        'type': 'document_url',
        'document_url': upload_pdf(file),
    }
    return client.ocr.process(
        model='mistral-ocr-latest',
        document=document,
        include_image_base64=True,
    )

In [12]:
# OCR every listed file and store its per-page markdown next to the file name.
# The explicit loop is kept so that papers processed before a failing API
# call remain available in `papers`.
papers = []
for file in sswlab_files:
    pdf = extract_pdf(file)
    content = [page.markdown for page in pdf.pages]
    papers.append({'file_name': file, 'content': content})

In [13]:
# Merge each paper's per-page markdown into one newline-separated string.
# The type guard keeps this cell idempotent: calling '\n'.join() on content
# that is already a string (cell re-run, or `papers` loaded from listup.csv)
# would insert a newline between every single character.
for pdf in papers:
    if not isinstance(pdf['content'], str):
        pdf['content'] = '\n'.join(pdf['content'])

In [14]:
# Two-message prompt: a fixed system instruction followed by a human message
# that consists solely of the {paper} template variable.
prompt = ChatPromptTemplate.from_messages(
    [
        ('system', "Extract information from the following research paper."),
        ('human', '{paper}')    # filled from the 'paper' key passed to invoke()
    ]
)

In [15]:
# Compose the chain: the prompt feeds into the structured-output model.
fin_llm = prompt | structured_llm

In [16]:
# Run the extraction chain once per paper, collecting results in input order.
# Results gathered before a failing call stay in `data`.
data = []
for paper in papers:
    paper_info = fin_llm.invoke({'paper': paper['content']})
    data.append(paper_info)

In [17]:
# Build a table from the list of extraction results in `data`.
df = pd.DataFrame(data)

In [18]:
# Display the extraction results table.
df

Unnamed: 0,title,author,id,data,code_link,packages_used,task,abstract,models,hybrid_model,...,baselines,preprocessing,citations,approach_used,methodology,metrics,limitations,reproducibility,key_points,reuse_potential
0,Generation of Modern Satellite Data from Galil...,Lee et al.,DOI: 10.3847/1538-4357/abd198,[Mount Wilson Observatory (MWO) sunspot drawin...,None provided,"[NumPy, Keras, TensorFlow, SunPy]",Generating modern satellite-like magnetograms ...,The study focuses on generating solar magnetog...,[Pix2Pix (cGAN-based model)],False,...,[None explicitly mentioned],[Alignment of sunspot drawings with SDO magnet...,"[Vaquero (2007), Arlt & Vaquero (2020), Hoyt &...",[Image-to-image translation using conditional ...,"The study uses a pix2pix model, which is a typ...",[{'Correlation Coefficient (CC)': 'Average CC ...,[The model does not successfully generate acti...,3,{'strengths': ['Innovative approach to bridgin...,[The methodology can be applied to other histo...
1,Visual Explanation of a Deep Learning Solar Fl...,Yi et al.,10.3847/2041-8213/abe94a,[Solar and Heliospheric Observatory/Michelson ...,None provided,"[PyTorch, NumPy, Matplotlib]",Solar flare prediction,This study presents a visual explanation of a ...,[Convolutional Neural Network (CNN) with dense...,False,...,[Event statistics flare forecasting (Wheatland...,"[Resizing magnetograms to 512x512 resolution, ...","[Priest & Forbes 2002, Shibata & Magara 2011, ...",[Deep learning-based flare prediction using CN...,The study uses a CNN-based model to predict so...,"[{'TSS': '0.65'}, {'ACC': '0.83'}, {'HSS': '0....",[The model's performance may vary with differe...,4,{'strengths': ['The model achieves a high TSS ...,[The methodology can be extended to other spac...
2,Improved AI-generated Solar Farside Magnetogra...,Jeong et al.,10.3847/2041-8213/ac91c1,[STEREO (Solar Terrestrial Relations Observato...,https://github.com/JeongHyunJin/Pix2PixCC,"[PyTorch, NumPy, Matplotlib, SciPy, Astropy, S...",Solar farside magnetogram generation,The study presents an improved artificial inte...,[Pix2PixCC (modified Pix2Pix with correlation ...,False,...,"[Pix2Pix (Isola et al. 2017), Pix2PixHD (Wang ...",[Alignment of images to ensure the same rotati...,"[Kim et al. (2019, KPL19), Jeong et al. (2020,...",[Deep learning-based image-to-image translatio...,"The study introduces the Pix2PixCC model, an e...",[{'metric': 'Pixel-to-pixel Correlation Coeffi...,[Physical quantities based on pixel-to-pixel d...,4,{'strengths': ['Improved AI-generated magnetog...,[The AI-generated magnetograms can be used for...
3,Construction of global IGS-3D electron density...,Ji et al.,10.1016/j.jastp.2024.106370,[International Global Navigation Satellite Sys...,,,Global 3-D electron density modeling,The study presents a deep learning-based metho...,[Multi-Layer Perceptron (MLP)],False,...,,[Normalization of input data to a range of [-1...,"[Bilitza (2001), Hajj and Romans (1998), Yue e...","[Deep learning-based inversion method, Trainin...",The study employs a deep learning approach usi...,[{'Root Mean Square Error (RMSE) between IGS a...,[The IRI model is not designed to account for ...,4,{'strengths': ['The IGS-3D Ne model improves t...,[The model can be adapted for real-time ionosp...
4,Generation of He I 1083 nm Images from SDO AIA...,Son et al.,10.3847/1538-4357/ac11ae,"[SDO/AIA 19.3 nm and 30.4 nm images, NSO/SOLIS...",None provided,"[SunPy, TensorFlow/PyTorch (implied by deep le...",Generation of He I 1083 nm images from SDO/AIA...,This study presents a deep learning-based meth...,[pix2pixHD (conditional Generative Adversarial...,False,...,"[pix2pix (Isola et al., 2017)]",[Selection of high-quality SDO/AIA images (qua...,"[Isola et al. (2017), Wang et al. (2018), Good...","[Image-to-image translation using pix2pixHD, T...","The study employs the pix2pixHD model, a condi...","[{'CC (Model I)': '0.83'}, {'RMSE (Model I)': ...",[The study acknowledges that the generated ima...,4,{'strengths': ['Successful generation of He I ...,[The methodology can be extended to generate o...
5,Can we properly determine differential emissio...,Youn et al.,arXiv:2501.00001,"[SDO/AIA datasets (2011-2021), Solar Orbiter/E...",https://github.com/ianan/demreg/,"[PyTorch, Astropy, aiapy, SunPy, NumPy]",Differential Emission Measure (DEM) determinat...,The study investigates whether differential em...,[Pix2PixCC],False,...,[Pix2PixHD],[Logarithm base 2 transformation and normaliza...,"[Schmelz et al. (2011), Hannah & Kontar (2012)...","[Image-to-image translation using Pix2PixCC, R...",The study uses a deep learning model (Pix2PixC...,[{'Pearson CC (Correlation Coefficient)': '0.8...,[Differences in response functions between AIA...,4,{'strengths': ['Successful generation of synth...,[The methodology can be applied to other space...
6,Fast Reconstruction of 3D Density Distribution...,Rahman et al.,10.3847/1538-4357/acc8e1,[Photospheric solar magnetic fields from MAS s...,None provided,"[SunPy, PyTorch, NumPy]",3D coronal electron density distribution recon...,This study presents the first attempt to gener...,[pix2pixHD],False,...,[MAS simulation model],[Data alignment and resizing to 182 × 96 × 151...,"[Gómez (2018), Vourlidas et al. (2000), de Pat...","[Image-to-image translation using pix2pixHD, C...","The study uses a pix2pixHD model, an advanced ...",[{'Pearson's Correlation Coefficient (CC)': 'A...,[The accuracy of AI-generated results may be a...,4,{'strengths': ['Significant reduction in compu...,
7,Solar farside magnetograms from deep learning ...,Kim et al.,10.1038/s41550-019-0717-0,"[SDO/AIA 304-Å images, SDO/HMI line-of-sight m...",https://github.com/tykimos/SolarMagGAN,"[NumPy, Keras]",Generation of farside solar magnetograms from ...,The paper presents a deep learning model based...,[Conditional Generative Adversarial Networks (...,False,...,[],"[Calibration of images, Rotation and centering...","[Solanki et al. (2006), Lindsey & Braun (2000)...","[Image-to-image translation using cGANs, Train...",The study uses a cGAN-based deep learning mode...,[{'Correlation Coefficient (CC) for Total Unsi...,[The model may not accurately generate the til...,4,{'strengths': ['The model successfully generat...,[The model can be adapted for other astronomic...
8,Three-day Forecasting of Solar Wind Speed Usin...,Son et al.,10.3847/1538-4357/ace04e,"[SDO/AIA 211 and 193 Å images, OMNIWeb solar w...",None provided,"[SolarSoft library, TensorFlow/PyTorch (implie...",Solar wind speed prediction,The study presents a deep-learning model to fo...,[Convolutional Neural Network (CNN) with Incep...,True,...,[Wang-Sheely-Arge-ENLIL (WSA-ENLIL) model],"[Rotating, centering, exposure compensation, a...","[Upendran et al. (2020), Sun et al. (2021), Ra...","[Deep learning-based forecasting, Multimodal d...",The study uses a deep-learning model that inte...,"[{'RMSE (6-hour prediction)': '37.4 km/s'}, {'...",[The model cannot predict solar wind speed enh...,4,{'strengths': ['The model successfully predict...,[The model can be adapted for near-real-time f...
9,Generation of High-resolution Solar Pseudo-mag...,Shin et al.,10.3847/2041-8213/ab95ac,[Ca II K 393.3 nm images from the Precision So...,https://github.com/NoelShin/Ca2Mag,"[PyTorch, PIL, NumPy, scikit-image]",Generation of high-resolution pseudo-magnetogr...,The paper presents a deep learning model based...,[pix2pixHD],False,...,[Comparison with baseline pixel-to-pixel calib...,[Alignment of Ca II K images with SDO/HMI imag...,"[LeCun & Bengio (1998), Goodfellow et al. (201...",[Deep learning-based image-to-image translatio...,The methodology involves using a pix2pixHD mod...,[{'metric': 'Total Unsigned Magnetic Flux (TUM...,[The model struggles to assign proper polariti...,4,{'strengths': ['High correlation between AI-ge...,[The model can be applied to historical Ca II ...
