In [None]:
from langchain.chat_models import init_chat_model
from pdfminer.high_level import extract_text
from typing import Optional, List, Dict
from typing_extensions import Annotated, TypedDict
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.tools import tool
from langchain_core.messages import BaseMessage, SystemMessage, HumanMessage, ToolMessage
#from langchain_community.llms import IpexLLM

import torch

import pandas as pd
import getpass
import pdf2doi
import os

In [23]:
from mistralai import Mistral
from dotenv import load_dotenv
import datauri

In [202]:
class PaperInfo(TypedDict):
    title: Annotated[str, ..., 'Title of the research paper']
    author: Annotated[str, ..., 'Authors of the paper in (Name) et al. format']
    id: Annotated[str, ..., 'DOI or arXiv ID']
    data: Annotated[List[str], ..., 'Data used']
    code_link: Annotated[str, 'None provided', 'Link to code (Github, etc)']
    packages_used: Annotated[List[str], 'None Provided', 'Packages or libraries used.']
    task: Annotated[str, ..., 'Task performed in paper (e.g. flare prediction, CME detection, etc.)']
    field: Annotated[str, ...,'What branch of Space Science/Solar Physics e.g. solar flares, CME, solar wind, etc.']
    abstract: Annotated[str, ..., 'Summary of abstract']
    models: Annotated[List[str], ..., 'Model architecture(s) used e.g. CNN, ResNet, pix2pix, GAN, etc.']
    hybrid_model: Annotated[bool, ..., 'Whether hybrid model architectures were used.']
    multimodal: Annotated[List[str], 'N/A', 'Type of multimodal models used, if any.']
    baselines: Annotated[List[str], 'N/A', 'Baseline models or papers used, along with citations in the format: baseline (author, year).']
    preprocessing: Annotated[List[str], ..., 'Preprocessing steps taken']
    citations: Annotated[List[str], ..., 'Citations']
    approach_used: Annotated[List[str], ..., 'Approach(es) used']
    methodology: Annotated[str, ..., 'Summary of methodology']
    metrics: Annotated[List[Dict[str, float]], ..., 'Metrics used for evaluation and the scores obtained']
    limitations: Annotated[List[str], ..., 'Limitations of the research']
    strengths: Annotated[str, ..., 'Key strengths']
    weaknesses: Annotated[str, ..., 'Key weaknesses']
    reproducibility: Annotated[float, ..., 'From 0-5, based on the completeness of the method, code and data, how reproducible is the paper?']
    impact: Annotated[float, ..., 'From 0-10, how impactful is this paper based on current trends?']
    reuse_potential: Annotated[List[str], ..., 'Notes on potential for reuse ("None" if no potential)']

In [204]:
papers_df = pd.read_csv('../listup.csv')
papers_df['file_name'] = list(map(lambda name: name.split('.pdf')[0].split(' - ')[1], papers_df['file_name']))
papers = []

for file, content in zip(papers_df['file_name'], papers_df['content']):
    papers.append({'file_name': file, 'content': content})

In [207]:
llm = init_chat_model('mistral-large-latest', model_provider='mistralai')
structured_llm = llm.with_structured_output(PaperInfo)

In [212]:
embeddings = MistralAIEmbeddings(
    model='mistral-embed',
    api_key=os.environ['MISTRAL_API_KEY']
)



In [213]:
load_dotenv()
client = Mistral(api_key=os.environ['MISTRAL_API_KEY'])

In [216]:
def upload_pdf(file):
    pdf = client.files.upload(
        file={
            'file_name': file,
            'content': open(f'{file}', 'rb')
        },
        purpose='ocr'
    )

    signed_url = client.files.get_signed_url(file_id=pdf.id)
    return signed_url.url

In [215]:
def extract_pdf(file):
    ocr_response = client.ocr.process(
        model='mistral-ocr-latest',
        document={
            'type': 'document_url',
            'document_url': upload_pdf(file)
        },
        include_image_base64=True
    )

    return ocr_response

In [29]:
prompt = ChatPromptTemplate.from_messages(
    [
        ('system', "Extract information from the following research paper."),
        ('human', '{paper}')    
    ]
)

In [30]:
process_chain = prompt | structured_llm

In [31]:
data = []
for paper in papers:
    data.append(process_chain.invoke({'paper': paper['content']}))

In [32]:
for i in range(len(data)):
    data[i]['metrics_list'] = data[i]['metrics'].copy()
    data[i]['metrics'] = []
    data[i]['scores'] = []
    for j in range(len(data[i]['metrics_list'])):
        for key, value in data[i]['metrics_list'][j].items():
            data[i]['metrics'].append(key)
            data[i]['scores'].append(value)

In [33]:
data

[{'title': 'Generation of Modern Satellite Data from Galileo Sunspot Drawings in 1612 by Deep Learning',
  'author': 'Lee et al.',
  'id': '10.3847/1538-4357/abd498',
  'data': ['Mount Wilson Observatory (MWO) sunspot drawings (2011-2015)',
   'SDO/HMI line-of-sight magnetograms (2011-2015)',
   'SDO/AIA seven wavelength images (94, 131, 171, 193, 211, 304, and 335 Å) (2011-2015)',
   'Galileo sunspot drawings (1612 June 2 to July 8)'],
  'code_link': 'None provided',
  'packages_used': ['NumPy', 'Keras', 'TensorFlow', 'SunPy'],
  'task': 'Generation of modern satellite-like magnetograms and EUV images from historical sunspot drawings',
  'field': 'Solar Physics',
  'keywords': 'deep learning, generative adversarial networks (GANs), pix2pix, sunspot drawings, historical data, solar magnetograms, EUV images, solar activity, image-to-image translation, solar cycle',
  'abstract': 'This study uses deep learning to generate modern satellite-like magnetograms and EUV images from historical 

In [35]:
from langchain_tavily import TavilySearch

In [174]:
search_tool = TavilySearch(
    max_results=15,
)

In [40]:
df = pd.DataFrame(data)

In [64]:
df.head()

Unnamed: 0,title,author,id,data,code_link,packages_used,task,field,keywords,abstract,...,approach_used,methodology,metrics,limitations,reproducibility,strengths,weaknesses,metrics_list,scores,reuse_potential
0,Generation of Modern Satellite Data from Galil...,Lee et al.,10.3847/1538-4357/abd498,[Mount Wilson Observatory (MWO) sunspot drawin...,None provided,"[NumPy, Keras, TensorFlow, SunPy]",Generation of modern satellite-like magnetogra...,Solar Physics,"deep learning, generative adversarial networks...",This study uses deep learning to generate mode...,...,[Image-to-image translation using pix2pix (cGA...,"The study employs a pix2pix model, a type of c...",[Correlation Coefficient (CC) for TUMF (Magnet...,[The model struggles to reproduce detailed str...,4,The study demonstrates a novel application of ...,The model's reliance on sunspot drawings limit...,[{'Correlation Coefficient (CC) for TUMF (Magn...,"[0.82, 0.74, 0.75, 0.56, 0.65, 0.67, 0.7, 0.76...",
1,Visual Explanation of a Deep Learning Solar Fl...,Yi et al.,10.3847/2041-8213/abeb70,[Solar and Heliospheric Observatory (SOHO)/Mic...,None provided,"[PyTorch, NumPy, Matplotlib]",Solar flare prediction,Solar flares,"deep learning, solar flare prediction, CNN, Gr...",This study presents a visual explanation of a ...,...,"[CNN-based flare prediction, Attribution metho...",The study involves training a CNN model on ful...,"[True Skill Statistics (TSS), Accuracy (ACC), ...",[The model's performance may be affected by th...,3,The study provides a novel approach to interpr...,The reliance on JPG format magnetograms may in...,"[{'True Skill Statistics (TSS)': 0.65}, {'Accu...","[0.65, 0.83, 0.65, 0.61]",
2,Improved AI-generated Solar Farside Magnetogra...,Jeong et al.,10.3847/2041-8213/ac9116,"[SDO/AIA EUV images (304, 193, 171 Å), SDO/HMI...",https://github.com/JeongHyunJin/Pix2PixCC,"[PyTorch, NumPy, Matplotlib, SciPy, Astropy, S...",Solar farside magnetogram generation,Solar Physics,"solar magnetograms, deep learning, Pix2Pix, GA...",The paper presents an improved deep-learning m...,...,[Deep learning-based image-to-image translatio...,"The study introduces the Pix2PixCC model, an e...","[pixel-to-pixel CC (full disk), pixel-to-pixel...",[Physical quantities based on pixel-to-pixel d...,4,The Pix2PixCC model significantly improves the...,The model's reliance on historical frontside d...,"[{'pixel-to-pixel CC (full disk)': 0.88}, {'pi...","[0.88, 0.91, 0.7, 0.99, 0.94, 0.94, 0.9, 0.94,...",
3,Construction of global IGS-3D electron density...,Ji et al.,10.1016/j.jastp.2024.106370,[International Global Navigation Satellite Sys...,None provided,[PyTorch],Global 3-D ionospheric electron density distri...,Ionospheric Physics,"deep learning, ionospheric electron density pr...",This study presents a deep learning-based meth...,...,[Inversion method to derive electron density p...,The study employs a deep learning approach usi...,"[RMSE (log m^-3) - Jicamarca, RMSE (log m^-3) ...",[The IRI model's limitations at low latitudes ...,3,The IGS-3D Ne model demonstrates improved accu...,The model's performance is constrained by the ...,"[{'RMSE (log m^-3) - Jicamarca': 0.37}, {'RMSE...","[0.37, 0.22, 0.34]",
4,Generation of He I 1083 nm Images from SDO AIA...,(Son) et al.,10.3847/1538-4357/ac11ae,"[SDO/AIA 19.3 nm images, SDO/AIA 30.4 nm image...",None provided,"[Sunpy, AIAPrep]",Generation of He I 1083 nm images from SDO/AIA...,Solar Chromosphere and Transition Region,"deep learning, image-to-image translation, pix...",The study presents a deep learning method (pix...,...,"[Image-to-image translation using pix2pixHD, C...","The study uses the pix2pixHD model, a deep lea...","[CC (Model I), RMSE (Model I), CC (Model II), ...",[Differences in filament absorption mechanisms...,4,The study successfully demonstrates the use of...,"The study has some limitations, including diff...","[{'CC (Model I)': 0.83}, {'RMSE (Model I)': 11...","[0.83, 11.28, 0.86, 10.15, 0.88, 9.49]",


In [73]:
ids = list(df['id'])
tasks = list(df['task'])
fields = list(df['field'])
tasks_set = set(tasks)
fields_set = set(fields)

In [175]:
query_list = list(map(lambda field: f"{field} deep learning artificial intelligence machine learning cnn\n\nExclude papers with the following DOI/arXiv IDS: {list(df['id'][df['field'] == field])}.", fields_set))

In [181]:
query_results = list(map(lambda query: search_tool.invoke({'query': query, 'include_domains': ['scholar.google.com', 'arxiv.org']}), query_list))

In [182]:
query_results

[{'query': "Solar Corona and Inner Heliosphere deep learning artificial intelligence machine learning cnn\n\nExclude papers with the following DOI/arXiv IDS: ['10.3847/1538-4357/acc722'].",
  'follow_up_questions': None,
  'answer': None,
  'images': [],
  'results': [{'url': 'https://arxiv.org/abs/1902.09673',
    'title': 'Predicting the Structure of the Solar Corona and Inner Heliosphere ...',
    'content': "To aid in mission planning, and in anticipation of the unprecedented measurements to be returned, in late October, we developed a three-dimensional magnetohydrodynamic (MHD) solution for the solar corona and inner heliosphere, driven by the then available observations of the Sun's photospheric magnetic field.",
    'score': 0.3964893,
    'raw_content': None}],
  'response_time': 11.56,
  'request_id': '003a4bd8-a3e8-423d-b711-733c359f6247'},
 {'query': "Solar Physics deep learning artificial intelligence machine learning cnn\n\nExclude papers with the following DOI/arXiv IDS: 

In [184]:
urls = []
for query in query_results:
    if type(query) != str:
        for result in query['results']:
            urls.append(result['url'])

In [185]:
urls = list(filter(lambda url: url != 'https://scholar.google.com/', urls))
urls = list(filter(lambda url: 'abs' in url or 'html' in url or 'pdf' in url, urls))

In [186]:
urls = list(map(lambda url: url.replace('abs', 'pdf'), urls))
urls = list(map(lambda url: url.replace('html', 'pdf'), urls))
urls

['https://arxiv.org/pdf/1902.09673',
 'https://arxiv.org/pdf/2306.15308',
 'https://ar5iv.lpdf.arxiv.org/pdf/2306.15308',
 'https://arxiv.org/pdf/2306.15308',
 'https://arxiv.org/pdf/2204.03710',
 'https://arxiv.org/pdf/2508.16543',
 'https://arxiv.org/pdf/1909.05459',
 'https://arxiv.org/pdf/2108.09114',
 'https://arxiv.org/pdf/1912.02934',
 'https://arxiv.org/pdf/2505.03385',
 'https://arxiv.org/pdf/2409.04850v1',
 'https://arxiv.org/pdf/2407.04283v1',
 'https://arxiv.org/pdf/2409.04850',
 'https://arxiv.org/pdf/2408.12476',
 'https://arxiv.org/pdf/2411.08843',
 'https://arxiv.org/pdf/2309.04558',
 'https://arxiv.org/pdf/2405.02545v1',
 'https://arxiv.org/pdf/2410.10841v1',
 'https://arxiv.org/pdf/2501.14684v1',
 'https://arxiv.org/pdf/2508.06892v1',
 'https://arxiv.org/pdf/2508.14107v1',
 'https://arxiv.org/pdf/2203.01184',
 'https://arxiv.org/pdf/2304.01234',
 'https://arxiv.org/pdf/2507.10893v1',
 'https://arxiv.org/pdf/2407.08259v1',
 'https://arxiv.org/pdf/2501.15731',
 'https:/

In [187]:
import requests

In [188]:
dirs = '../queried_papers'

os.makedirs(dirs, exist_ok=True)

In [193]:
def download_pdf(url):
    filename = url.split('/')[-1] + '.pdf'
    file_dir = os.path.join(dirs, filename)
    response = requests.get(url)

    if response.status_code == 200 and not os.path.exists(file_dir):
        with open(file_dir, 'wb') as f:
            f.write(response.content)

In [196]:
indices = []
for i, url in enumerate(urls):
    try:
        download_pdf(url)
    except:
        indices.append(i)

In [197]:
indices

[2]

In [200]:
download_pdf(urls[2].replace('ar5iv.lpdf.', ''))

In [220]:
queried_papers_list = []

for file in os.listdir(dirs):
    queried_papers_list.append(os.path.join(dirs, file))

In [222]:
queried_papers = []
indices = []
for i, file in enumerate(queried_papers_list):
    try:
        pdf = extract_pdf(file)

        content = []

        for page in pdf.pages:
            content.append(page.markdown)

        papers.append({
            'file_name': file,
            'content': content
        })
    except: indices.append(i)

In [250]:
titles = list(map(lambda paper: paper['file_name'], papers))

In [251]:
papers_df = pd.DataFrame(papers)

In [258]:
papers_df.head()

Unnamed: 0,file_name,content
0,Generation of Modern Satellite Data from Galil...,# Generation of Modern Satellite Data from Gal...
1,Visual Explanation of a Deep Learning Solar Fl...,# Visual Explanation of a Deep Learning Solar ...
2,Improved AI-generated Solar Farside Magnetogra...,# Improved AI-generated Solar Farside Magnetog...
3,Construction of global IGS-3D electron density...,# Construction of global IGS-3D electron densi...
4,Generation of He i 1083 nm Images from SDO AIA...,# Generation of He I 1083 nm Images from SDO A...


In [259]:
papers_df = papers_df.drop_duplicates(subset='file_name')

In [261]:
papers_df.to_csv('listup_v1.csv', index=False)

In [297]:
papers_df = pd.read_csv('../listup_v1.csv')

In [None]:
papers_df

143

In [272]:
paper92 = extract_pdf('../papers/2202.08776.pdf')

In [273]:
content = []

for page in paper92.pages:
    content.append(page.markdown)

In [287]:
missing_paper = {
            'file_name': '2202.08776.pdf',
            'content': '\n'.join(content)
        }

In [298]:
papers_df = pd.concat([papers_df, pd.DataFrame(missing_paper, index=[0])])

In [302]:
papers_df = papers_df.reset_index()
papers_df = papers_df.drop(columns='index')

In [304]:
papers_df.to_csv('../listup_v1.csv', index=False)