In [1]:
from pdfminer.high_level import extract_text
from typing import Optional, List, Dict
from typing_extensions import Annotated, TypedDict
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.tools import tool
from langchain_core.messages import BaseMessage, SystemMessage, HumanMessage, ToolMessage
from langchain_mistralai import MistralAIEmbeddings, ChatMistralAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.llms import IpexLLM
from langchain_chroma import Chroma

import torch

import pandas as pd
import getpass
import pdf2doi
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Export SYCL_CACHE_PERSISTENT=1 into this process's environment; this runs
# before the model-loading cell further down.
# NOTE(review): presumably turns on the on-disk SYCL kernel cache used by the
# Intel/IPEX backend — confirm against the IPEX-LLM setup docs.
os.environ.update({"SYCL_CACHE_PERSISTENT": "1"})

In [3]:
from mistralai import Mistral
from dotenv import load_dotenv
import datauri

In [4]:
# Schema of the structured record extracted for each research paper; it is the
# type handed to `llm.with_structured_output(...)` later in this notebook.
#
# Every field is written as Annotated[<type>, <second slot>, <description>].
# NOTE(review): this looks like LangChain's TypedDict convention, where the
# second slot is the default value and `...` marks the field as required —
# confirm against the LangChain structured-output docs.
# NOTE(review): `packages_used`, `multimodal`, `baselines` and
# `reuse_potential` are typed List[str] but carry a plain string
# ('None Provided' / 'N/A') in the second slot; verify that a string default
# is what is intended for list-valued fields.
class PaperInfo(TypedDict):
    # --- Bibliographic identification ---
    title: Annotated[str, ..., 'Title of the research paper']
    author: Annotated[str, ..., 'Authors of the paper in (Name) et al. format']
    id: Annotated[str, ..., 'DOI or arXiv ID']
    # --- Data, code and tooling ---
    data: Annotated[List[str], ..., 'Data used']
    code_link: Annotated[str, 'None provided', 'Link to code (Github, etc)']
    packages_used: Annotated[List[str], 'None Provided', 'Packages or libraries used.']
    # --- Scope of the paper ---
    task: Annotated[str, ..., 'Task performed in paper (e.g. flare prediction, CME detection, etc.)']
    field: Annotated[str, ...,'What branch of Space Science/Solar Physics e.g. solar flares, CME, solar wind, etc.']
    abstract: Annotated[str, ..., 'Summary of abstract']
    # --- Modelling details ---
    models: Annotated[List[str], ..., 'Model architecture(s) used e.g. CNN, ResNet, pix2pix, GAN, etc.']
    hybrid_model: Annotated[bool, ..., 'Whether hybrid model architectures were used.']
    multimodal: Annotated[List[str], 'N/A', 'Type of multimodal models used, if any.']
    baselines: Annotated[List[str], 'N/A', 'Baseline models or papers used, along with citations in the format: baseline (author, year).']
    preprocessing: Annotated[List[str], ..., 'Preprocessing steps taken']
    citations: Annotated[List[str], ..., 'Citations']
    approach_used: Annotated[List[str], ..., 'Approach(es) used']
    methodology: Annotated[str, ..., 'Summary of methodology']
    # --- Evaluation and critical assessment ---
    # Each list entry maps a metric name (str) to a numeric score (float).
    metrics: Annotated[List[Dict[str, float]], ..., 'Metrics used for evaluation and the scores obtained']
    limitations: Annotated[List[str], ..., 'Limitations of the research']
    reproducibility: Annotated[float, ..., 'From 0-5, based on the completeness of the method, code and data, how reproducible is the paper?']
    strengths: Annotated[str, ..., 'Key strengths']
    weaknesses: Annotated[str, ..., 'Key weaknesses']
    reuse_potential: Annotated[List[str], 'N/A', 'Notes on potential for reuse (if applicable)']

In [5]:
# Load variables from a local .env file (if one exists) into the process
# environment so the key never has to be hardcoded in the notebook.
load_dotenv()

# Read the Mistral key from the environment; when it is missing or empty, ask
# for it interactively instead of failing with a bare KeyError.
api_key = os.environ.get('MISTRAL_API_KEY') or getpass.getpass('Mistral API key: ')

# Values pasted into .env files or shells often carry stray whitespace or
# wrapping quotes (single or double); remove them before the key is used.
api_key = api_key.strip().strip("'\"").strip()
if not api_key:
    raise ValueError(
        'MISTRAL_API_KEY is empty; set it in the environment or in a .env file.'
    )

In [6]:
# Load the paper index; this cell reads its `file_name` and `content` columns.
papers_df = pd.read_csv('../listup.csv')

# Reduce every file name to its second ' - '-separated segment, ignoring
# everything from the first '.pdf' onwards.
# NOTE(review): a name that contains no ' - ' separator raises IndexError
# here, exactly as before — confirm all rows follow that naming pattern.
papers_df['file_name'] = [
    raw_name.split('.pdf')[0].split(' - ')[1]
    for raw_name in papers_df['file_name']
]

# One {'file_name': ..., 'content': ...} record per row, in row order. Built
# in a single call so no loop variables are left behind in the notebook
# namespace.
papers = papers_df[['file_name', 'content']].to_dict(orient='records')

In [None]:
# Load the 'mistralai/Mistral-7B-v0.1' checkpoint through the IpexLLM wrapper,
# forwarding temperature=0 via model_kwargs.
llm = IpexLLM.from_model_id(
    model_id='mistralai/Mistral-7B-v0.1',
    model_kwargs=dict(temperature=0),
)

# Wrap the model so its answers are shaped like the PaperInfo schema.
# NOTE(review): IpexLLM comes from langchain_community.llms (a text-completion
# wrapper, not a chat model) — confirm it actually implements
# with_structured_output; the langchain_core base implementation is expected
# to raise NotImplementedError unless a subclass overrides it.
structured_llm = llm.with_structured_output(PaperInfo)

W0918 20:51:29.039000 37396 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [None]:
# Embedding client for the 'mistral-embed' model, authenticated with the
# Mistral key held in `api_key`.
embeddings = MistralAIEmbeddings(api_key=api_key, model='mistral-embed')

In [None]:
# Splitter used to cut each paper into chunks: chunk_size=1000 with an
# overlap of 200 between neighbouring chunks.
# NOTE(review): sizes are presumably counted in characters (the splitter's
# default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

In [None]:
# Directory (relative to the working directory) passed to Chroma as its
# persist_directory further down.
persist_directory = "papers_data"

# exist_ok=True makes the creation idempotent on re-runs and removes the
# check-then-create race of a separate os.path.exists() test. If the path
# exists but is not a directory, this raises FileExistsError instead of
# silently continuing.
os.makedirs(persist_directory, exist_ok=True)
    

In [None]:
# Open the persistent 'research_papers' collection once. The client does not
# depend on the paper being processed, so building it inside the loop only
# repeated the same setup for every paper (and left `vectorstore` undefined
# when `papers` was empty).
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
    collection_name='research_papers'
)

# NOTE(review): no explicit ids are passed to add_texts, so re-running this
# cell presumably stores the same chunks a second time — confirm, and clear
# the collection or supply stable ids if re-runs must be idempotent.
for paper in papers:
    split_paper = text_splitter.split_text(paper['content'])

    # Nothing to embed for this paper: skip it rather than sending an empty
    # batch to the embedding model and the vector store.
    if not split_paper:
        continue

    # Tag every chunk with the paper it came from so retrieved chunks can be
    # traced back to (and filtered by) their source paper.
    vectorstore.add_texts(
        split_paper,
        metadatas=[{'file_name': paper['file_name']} for _ in split_paper],
    )