In [1]:
from langchain.chat_models import init_chat_model

from pdfminer.high_level import extract_text
from typing import Optional, List, Dict
from typing_extensions import Annotated, TypedDict
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.tools import tool

import pandas as pd
import getpass
import pdf2doi
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder that holds the paper PDFs; it is listed below to build the file list.
# NOTE(review): this name shadows the built-in `dir()` for the rest of the
# kernel session. Renaming it would also require updating the cell that calls
# `os.listdir(dir)`.
dir = './SSWLAB_논문'

In [3]:
# Names of every entry in the papers folder.
# os.listdir() returns entries in arbitrary order, so sort them: the order of
# `sswlab_files` drives the order of the OCR calls and of the final table rows,
# and should be the same on every run and every machine.
sswlab_files = sorted(os.listdir(dir))

In [4]:
class PaperInfo(TypedDict):
    """Structured information extracted from a single research paper."""

    # Field layout is Annotated[type, default, description]; `...` as the
    # default marks the field as required. Defaults must match the declared
    # type, so list-typed optional fields default to a one-element list rather
    # than a bare string.
    title: Annotated[str, ..., 'Title of the research paper']
    author: Annotated[str, ..., 'Authors of the paper in (Name) et al. format']
    id: Annotated[str, ..., 'DOI or arXiv ID']
    data: Annotated[List[str], ..., 'Data used']
    code_link: Annotated[str, 'None provided', 'Link to code (Github, etc)']
    packages_used: Annotated[List[str], ['None provided'], 'Packages or libraries used.']
    task: Annotated[str, ..., 'Task performed in paper (e.g. flare prediction, CME detection, etc.)']
    abstract: Annotated[str, ..., 'Summary of abstract']
    models: Annotated[List[str], ..., 'Model architecture(s) used e.g. CNN, ResNet, pix2pix, GAN, etc.']
    hybrid_model: Annotated[bool, ..., 'Whether hybrid model architectures were used.']
    multimodal: Annotated[List[str], ['N/A'], 'Type of multimodal models used, if any.']
    baselines: Annotated[List[str], ['N/A'], 'Baseline models or papers used.']
    preprocessing: Annotated[List[str], ..., 'Preprocessing steps taken']
    citations: Annotated[List[str], ..., 'Citations']
    approach_used: Annotated[List[str], ..., 'Approach(es) used']
    methodology: Annotated[str, ..., 'Summary of methodology']
    # Each list item is a small mapping of metric name -> reported score.
    metrics: Annotated[List[Dict[str, str]], ..., 'Metrics used for evaluation and the scores obtained']
    limitations: Annotated[List[str], ..., 'Limitations of the research']
    reproducibility: Annotated[float, ..., 'From 0-5, based on the completeness of the method, code and data, how reproducible is the paper?']
    # Maps a category (e.g. strengths / weaknesses) to a list of points; the
    # extracted rows hold lists here, so the value type is a list of strings.
    key_points: Annotated[Dict[str, List[str]], ..., 'Key strengths and weaknesses']
    reuse_potential: Annotated[List[str], ['N/A'], 'Notes on potential for reuse (if applicable)']

In [5]:
# Chat model used for the extraction step (Mistral provider).
llm = init_chat_model(
    model='mistral-large-latest',
    model_provider='mistralai',
)

In [6]:
# Wrap the chat model so that its responses are shaped by the PaperInfo schema.
structured_llm = llm.with_structured_output(PaperInfo)

In [13]:
# Load previously saved paper text from disk and rebuild the `papers` list
# (one dict per paper holding its file name and its text content).
papers_df = pd.read_csv('listup.csv')
papers = [
    {'file_name': name, 'content': text}
    for name, text in zip(papers_df['file_name'], papers_df['content'])
]

In [8]:
from mistralai import Mistral
from dotenv import load_dotenv
import datauri

In [9]:
# load_dotenv() is expected to copy variables from a local .env file into the
# process environment before the key is read.
load_dotenv()
# The key comes from the environment rather than the notebook itself; this
# lookup raises KeyError if MISTRAL_API_KEY is not set.
client = Mistral(api_key=os.environ['MISTRAL_API_KEY'])

In [None]:
def upload_pdf(file, directory='./SSWLAB_논문'):
    """Upload one PDF through the Mistral client and return a signed URL for it.

    Args:
        file: Name of the PDF inside ``directory``; also sent as the upload's
            ``file_name``.
        directory: Folder that contains the PDF. Defaults to the papers folder
            this notebook works on, so existing one-argument calls behave as
            before.

    Returns:
        The ``url`` attribute of the signed-URL response for the uploaded file.

    Raises:
        OSError: If the PDF cannot be opened for reading.
    """
    pdf_path = os.path.join(directory, file)

    # The context manager closes the handle once the upload call has returned
    # (or raised), instead of leaving one open file per uploaded paper.
    with open(pdf_path, 'rb') as pdf_handle:
        pdf = client.files.upload(
            file={
                'file_name': file,
                'content': pdf_handle
            },
            purpose='ocr'
        )

    signed_url = client.files.get_signed_url(file_id=pdf.id)
    return signed_url.url

In [11]:
def extract_pdf(file):
    """Upload a PDF and run the Mistral OCR model on it.

    Args:
        file: File name of the PDF, passed straight to ``upload_pdf``.

    Returns:
        The response object from ``client.ocr.process``; callers in this
        notebook read ``.pages`` and each page's ``.markdown`` from it.
    """
    # The OCR endpoint is pointed at the signed URL of the freshly uploaded file.
    document_spec = {
        'type': 'document_url',
        'document_url': upload_pdf(file),
    }
    return client.ocr.process(
        model='mistral-ocr-latest',
        document=document_spec,
        include_image_base64=True,
    )

In [12]:
# OCR every listed file and keep its text as a list of per-page markdown strings.
papers = []
for file_name in sswlab_files:
    ocr_result = extract_pdf(file_name)
    page_markdown = [page.markdown for page in ocr_result.pages]
    papers.append({'file_name': file_name, 'content': page_markdown})

In [13]:
# Collapse each paper's per-page markdown into one newline-separated string.
# The isinstance guard makes this cell safe to re-run, and safe to run on
# `papers` loaded from the CSV (where content is already a string): joining a
# string would otherwise insert a newline between every character.
for pdf in papers:
    if not isinstance(pdf['content'], str):
        pdf['content'] = '\n'.join(pdf['content'])

In [14]:
# Two-message prompt: a fixed instruction, then the paper text supplied via
# the `paper` template variable.
system_message = ('system', "Extract information from the following research paper.")
human_message = ('human', '{paper}')
prompt = ChatPromptTemplate.from_messages([system_message, human_message])

In [15]:
# Extraction chain: fill the prompt with the paper text, then pass it to the
# schema-constrained model.
fin_llm = prompt | structured_llm

In [None]:
# Run the extraction chain once per paper. Results are appended one at a time,
# so whatever was extracted before a failure remains available in `data`.
data = []
for entry in papers:
    extracted = fin_llm.invoke({'paper': entry['content']})
    data.append(extracted)

In [None]:
# One row per extracted paper.
# Presumably each item in `data` is a dict keyed by the PaperInfo field names,
# which become the columns; TODO confirm against the chain's actual output.
df = pd.DataFrame(data)

In [None]:
# Show the assembled table as this cell's output.
df

Unnamed: 0,title,author,id,data,code_link,packages_used,task,abstract,models,hybrid_model,...,baselines,preprocessing,citations,approach_used,methodology,metrics,limitations,reproducibility,key_points,reuse_potential
0,Generation of Modern Satellite Data from Galil...,Lee et al.,DOI: 10.3847/1538-4357/abd498,[Mount Wilson Observatory (MWO) sunspot drawin...,None provided,"[NumPy, Keras, TensorFlow, SunPy]",Image-to-image translation of historical sunsp...,The paper presents a deep learning model based...,[pix2pix (conditional GAN)],False,...,[N/A],[Alignment of sunspot drawings with SDO images...,"[Vaquero (2007), Arlt & Vaquero (2020), Hoyt &...",[Image-to-image translation using pix2pix (cGA...,"The study employs a pix2pix model, a type of c...",[{'Correlation Coefficient (CC)': '0.82 for ma...,[The model primarily reproduces active regions...,4,{'strengths': ['Innovative application of deep...,[The methodology can be extended to other hist...
1,Visual Explanation of a Deep Learning Solar Fl...,Yi et al.,10.3847/2041-8213/abe94a,[Full-disk magnetograms from SOHO/MDI and SDO/...,None provided,"[PyTorch, NumPy, Matplotlib]",Solar flare prediction,The study presents a deep learning-based solar...,[Convolutional Neural Network (CNN) with dense...,False,...,[Event statistics flare forecasting (Wheatland...,"[Resizing magnetograms to 512x512 resolution, ...","[Priest & Forbes 2002, Shibata & Magara 2011, ...","[Guided backpropagation, Gradient-weighted Cla...",The study uses a CNN-based model to predict so...,"[{'TSS': '0.65'}, {'HSS': '0.65'}, {'ACC': '0....",[The model's performance may be affected by th...,3,{'strengths': ['The model achieves a high TSS ...,[The methodology can be extended to other sola...
2,Improved AI-generated Solar Farside Magnetogra...,Jeong et al.,10.3847/2041-8213/ac911d,"[STEREO/EUVI EUV observations (304, 193, 171 Å...",https://github.com/JeongHyunJin/Pix2PixCC,"[PyTorch, NumPy, Matplotlib, SciPy, Astropy, S...",Generation of solar farside magnetograms,The research paper introduces an improved deep...,[Pix2PixCC (modified Pix2Pix with correlation ...,False,...,,[Alignment of images to have the same rotation...,"[Kim et al. (2019), Jeong et al. (2020), Isola...",[Deep learning-based image-to-image translatio...,The Pix2PixCC model is trained using a combina...,[{'Pixel-to-Pixel CC (Full Disk)': '0.88 (8×8 ...,[Physical quantities based on pixel-to-pixel d...,4,{'strengths': ['Improved accuracy of AI-genera...,[The released AI-generated solar farside magne...
3,Construction of global IGS-3D electron density...,Ji et al.,10.1016/j.jastp.2024.106370,[International Global Navigation Satellite Sys...,None provided,[PyTorch],Global 3-D electron density modeling from TEC ...,This study presents a deep learning-based meth...,[Multi-Layer Perceptron (MLP)],False,...,[International Reference Ionosphere (IRI) model],"[Normalization of input features (DOY, UT, LT,...","[Bilitza, D. (2001). International reference i...",[Inversion method to derive electron density p...,The study employs a two-step approach: \n1. **...,[{'RMSE (Jicamarca)': '0.37 log(m^-3) for IGS-...,"[The IRI-2016 model, used for training, is not...",3,{'strengths': ['The IGS-3D Ne model demonstrat...,[The methodology can be adapted to other ionos...
4,Generation of He I 1083 nm Images from SDO AIA...,Son et al.,10.3847/1538-4357/ac19d5,"[SDO/AIA 19.3 nm and 30.4 nm images, NSO/SOLIS...",None provided,"[SunPy, AIAPrep]",Generation of He I 1083 nm images from SDO/AIA...,The study presents a novel deep learning metho...,"[pix2pixHD, cGAN]",False,...,[N/A],"[Quality filtering of SDO/AIA images, Level 1....","[Isola et al. (2017), Wang et al. (2018), Shin...","[Image-to-image translation, High-resolution i...","The study uses the pix2pixHD model, a deep lea...","[{'CC (Model I)': '0.83'}, {'RMSE (Model I)': ...",[Limb darkening in He I 1083 nm images not ful...,3,{'strengths': ['Successful generation of high-...,[The methodology can be applied to other image...
5,Can we properly determine differential emissio...,Youn et al.,arXiv:2501.00001,"[SDO/AIA (2011-2021), Solar Orbiter/EUI/FSI (2...",https://github.com/ianan/demreg/,"[PyTorch, Astropy, aiapy, SunPy, NumPy]",Differential Emission Measure (DEM) determinat...,The study explores the feasibility of using de...,[Pix2PixCC],False,...,[Pix2PixHD],[Data calibration to 2011 January 01 median va...,"[Müller et al. (2020), Rochus et al. (2020), P...","[Image-to-image translation using Pix2PixCC, R...",The study involves training Pix2PixCC models o...,"[{'Pearson CC': '94 Å: 0.87 ± 0.07, 131 Å: 0.9...",[Intercalibration discrepancies between SDO/AI...,4,{'strengths': ['Successful generation of synth...,[The methodology can be adapted for other spac...
6,Fast Reconstruction of 3D Density Distribution...,Rahman et al.,10.3847/1538-4357/acc4af,[MAS thermodynamic model data (photospheric ma...,None provided,"[SunPy, PyTorch, NumPy]",3D coronal electron density distribution recon...,This study introduces a deep learning-based ap...,[pix2pixHD (a variant of Generative Adversaria...,False,...,[MAS (MHD Algorithm outside a Sphere) simulati...,[Data alignment to a fixed size of 182 × 96 × ...,"[Isola, P., Zhu, J. Y., Zhou, T., & Efros, A. ...","[Image-to-image translation using pix2pixHD, C...",The methodology involves using the pix2pixHD m...,[{'Pearson's Correlation Coefficient (CC)': 'A...,"[The study relies on MAS simulation data, whic...",4,{'Strengths': ['Significant reduction in compu...,[The methodology can be extended to generate o...
7,Solar farside magnetograms from deep learning ...,Kim et al.,10.1038/s41550-019-0714-9,"[SDO/AIA 304-Å images, SDO/HMI line-of-sight m...",https://github.com/tykimos/SolarMagGAN,"[NumPy, Keras]",Generation of farside solar magnetograms from ...,The paper presents a deep learning model based...,[Conditional Generative Adversarial Networks (...,False,...,,"[Calibrating, rotating, and centering images t...","[Solanki et al. (2006), Lindsey & Braun (2000)...","[Image-to-image translation using cGANs, Train...",The methodology involves using a cGAN-based de...,[{'Correlation Coefficient (CC) for Total Unsi...,[The model may not accurately generate the til...,4,{'strengths': ['High correlation between AI-ge...,
8,Three-day Forecasting of Solar Wind Speed Usin...,Son et al.,10.3847/1538-4357/ace006,[Solar Dynamics Observatory/Atmospheric Imagin...,None provided,"[SolarSoft library, TensorFlow/Keras (implied ...",Solar wind speed prediction,The study presents a deep-learning model to fo...,[Convolutional Neural Network (CNN) with Incep...,True,...,[Wang-Sheely-Arge-ENLIL (WSA-ENLIL) model],"[Rotating, centering, exposure compensation, a...","[Upendran et al. (2020), Sun et al. (2021), Ra...","[Deep learning with multimodal data fusion, Se...",The study uses a deep-learning model comprisin...,"[{'RMSE (6-hour prediction)': '37.4 km/s'}, {'...",[The model cannot predict solar wind speed enh...,4,{'strengths': ['Effective use of multimodal da...,[The model can be adapted for near-real-time s...
9,Generation of High-resolution Solar Pseudo-mag...,Shin et al.,10.3847/2041-8213/ab96a9,[Ca II K 393.3 nm images from the Precision So...,https://github.com/NoelShin/Ca2Mag,"[PyTorch, scikit-image, PIL, NumPy]",Generation of high-resolution pseudo-magnetogr...,The paper presents a deep learning model based...,"[pix2pixHD, Conditional Generative Adversarial...",False,...,"[Pixel-to-pixel calibration curve, Comparison ...",[Alignment of Ca II K images with SDO/HMI imag...,"[LeCun & Bengio (1998), Goodfellow et al. (201...",[Deep learning-based image translation using p...,"The study uses a pix2pixHD model, a variant of...",[{'Total Unsigned Magnetic Flux Correlation Co...,[Struggles to accurately assign magnetic polar...,4,{'strengths': ['High correlation of total unsi...,[The model can be applied to historical Ca II ...
