In [2]:
import pdb
import os
from langchain.chains import create_retrieval_chain
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.chains import StuffDocumentsChain, RetrievalQA
from langchain.chains.llm import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
from langchain.schema import Document
from tqdm import tqdm
import tempfile
import os
import re

In [4]:
# Method 1: Use the API and directly give the papers to the retriever within the notebook
def search_arxiv(topic, max_results=5):
    base_url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": f"all:{topic}",
        "start": 1,
        "max_results": max_results
    }
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "xml")
        papers = soup.find_all("entry")
        paper_details = []
        for paper in papers:
            download_url = paper.id.text.replace("abs", "pdf")
            paper_details.append({
                "title": paper.title.text.strip(),
                "authors": [author.find("name").text for author in paper.find_all("author")],
                "pub_date": paper.published.text,
                "summary": paper.summary.text.strip(),
                "download_url": download_url
            })
        return paper_details
    else:
        print(f"Error: Unable to fetch arXiv data (Status code {response.status_code})")
        return []


In [6]:
def download_and_extract_text(pdf_url):
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Download failed: {e}")
        return ""

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(response.content)
        tmp_file_path = tmp_file.name

    try:
        doc = fitz.open(tmp_file_path)
        text = "".join(page.get_text() for page in doc)
        doc.close()
    except Exception as e:
        print(f"Error extracting text from {pdf_url}: {e}")
        text = ""
    finally:
        os.remove(tmp_file_path)

    return text


def clean_text(text):
    # Remove null bytes and control characters except newline/tab
    return re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]", "", text)
def prepare_documents(paper_list):
    documents = []
    for paper in tqdm(paper_list, desc="Processing papers"):
        pdf_url = paper["download_url"]
        raw_text = download_and_extract_text(pdf_url)
        if raw_text.strip():
            cleaned_text = clean_text(raw_text)
            metadata = {
                "title": clean_text(paper["title"]),
                "authors": [clean_text(a) for a in paper["authors"]],
                "published": clean_text(paper["pub_date"]),
                "source": pdf_url
            }
            doc = Document(page_content=cleaned_text, metadata=metadata)
            documents.append(doc)
    return documents    


In [8]:
topic = "Arctic Amplification"  # change topic here
papers = search_arxiv(topic, max_results=3)
documents = prepare_documents(papers)
    

Processing papers: 100%|██████████████████████████| 3/3 [00:01<00:00,  2.16it/s]


In [9]:
documents

[Document(metadata={'title': 'Sea-level and summer season orbital insolation as drivers of Arctic\n  sea-ice', 'authors': ['Claude Hillaire-Marcel', 'Anne de Vernal', 'Michel Crucifix'], 'published': '2021-02-03T13:52:57Z', 'source': 'http://arxiv.org/pdf/2102.02067v2'}, page_content='Sea-level and summer season orbital insolation as drivers of Arctic sea-ice  \n \nClaude Hillaire-Marcel1, Anne de Vernal1, Michel Crucifix2 \n1. Geotop-UQAM, Montreal, Canada \n2. UC-Louvain, Louvain-la-Neuve, Belgium \n \nAbstract \n \nThe sea-ice cover of the Arctic Ocean is an important element of the climate and ocean system in \nthe Northern Hemisphere as it impacts albedo, atmospheric pressure regimes, CO2-exchange at the \nocean/atmosphere interface as well as the North Atlantic freshwater budget and thermohaline \ncirculation [1]. Due to global warming, the Arctic sea-ice cover is presently evolving at an \nunprecedented rate towards full melt during the summer season, driving the so-called "Arct

In [12]:
#Method 2: Loading the research papers from a local directory
def load_documents():
    root_folder = "/Users/harinivaranasi/Desktop/Research"
    
    loader = DirectoryLoader(
        path=root_folder,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
        loader_kwargs={"extract_images": True},
        recursive=True   
    )
    
    documents = loader.load()
    breakpoint()
    return documents


In [14]:
def create_retriever(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " "]
    )
    texts = text_splitter.split_documents(documents)
    
    embeddings = HuggingFaceEmbeddings(
        model_name='BAAI/bge-small-en-v1.5',
        model_kwargs={"device": "cpu"}
    )
    
    db = FAISS.from_documents(texts, embeddings)
    db.save_local("faiss_index")
    
    return db.as_retriever(search_type='mmr', search_kwargs={"k": 15})

def build_retrieval_qa_chain_v2(retriever):
    system_prompt = (
        "You are a precise research assistant. Your task is to extract information "
        "from the scientific research papers. Do not speculate or guess.\n\n"
        "If the user asks for datasets, list only those datasets that are mentioned in the research papers. "
        "If the user asks about specific details such as time periods, variables, relationships, regions, or links, "
        "respond only with the requested detail.\n\n"
        "If the answer is not found in the context, respond only with: 'Not available'.\n\n"
        "Context: {context}"
    )

    model = OllamaLLM(
    model='olafgeibig/nous-hermes-2-mistral:7B-DPO-Q5_K_M',
    temperature=0.2,
    max_tokens=2024,
    top_p=0.9
    )



    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}")
        ]
    )

    stuff_chain = create_stuff_documents_chain(model, prompt)

    retrieval_chain = create_retrieval_chain(
        retriever,
        stuff_chain
    )

    return retrieval_chain



def load_faiss_index():
    embeddings = HuggingFaceEmbeddings(
        model_name='BAAI/bge-small-en-v1.5',
        model_kwargs={"device": "cpu"}
    )
    
    return FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True).as_retriever(
        search_type='mmr', search_kwargs={"k": 15}
    )

def normalize_query(query: str) -> str:
    q = query.lower()
    if "dataset" in q or "data" in q:
        return query
    return query

In [16]:
def main():
    # Load or create the retriever
    #retriever = load_faiss_index() if os.path.exists("faiss_index") else create_retriever(load_documents())
    if os.path.exists("faiss_index"):
        os.system("rm -r faiss_index")  # Remove the previous FAISS index

    #Method 1: from arxiv api
    retriever = create_retriever(documents)
    #Method 2: from locxal directory
    # retriever = create_retriever(load_documents())
    # Build the QA and summarization chains
    qa_chain = build_retrieval_qa_chain_v2(retriever)
    #model = OllamaLLM(model='mistral', temperature=0.3, max_tokens=1024, top_p=0.85)
    while True:
        user_query = normalize_query(input("Type your query here (or type 'exit' to quit): "))
        if user_query.lower() == 'exit':
            print("Exiting the chatbot. Goodbye!")
            break
            
        else:        
             try:
                    response = qa_chain.invoke({"input": user_query})
                    # Print the answer
                    if 'answer' in response:
                        print("Response:", response['answer'])
                    '''
                    # Print source documents
                    if 'context' in response:
                        print("\nSource Documents:")
                        for doc in response['context']:
                            print(f" - Source: {doc.metadata['source']}, Page: {doc.metadata.get('page', 'N/A')}")
                    '''              
             except Exception as e:
                print("An error occurred while processing your query:", str(e))
        
if __name__ == "__main__":
    main()


  embeddings = HuggingFaceEmbeddings(


Type your query here (or type 'exit' to quit):  what are the datasets used in the research?


Response: The datasets used in the research include:
1. NASA Goddard Institute for Space Studies’ “GISTEMP” (Schmidt et al., 2016)
2. The globally-extended version of the Met Office Hadley Centre's HadCRUT4 temperature dataset by Cowtan and Way (2014), Had4krig_v2.
3. European Centre for Medium-Range Weather Forecasts (ECMWF) 20th Century atmosphere-only reanalysis, ERA20C (Poli et al., 2016).
4. ECMWF's interim reanalysis, ERAint (Dee et al., 2011).
5. Japanese Meteorological Agency's JRA55 (Kobayashi et al., 2015).
6. NASA's Global Modeling and Assimilation Office's MERRA2 (Gelaro et al., 2017).
7. National Centre for Atmospheric Research's CFSR (Saha et al., 2014).
8. 20th Century reanalysis version 2C, 20CRv2c (Compo et al., 2011).


Type your query here (or type 'exit' to quit):  what are the sources for these datasets used?


Response: The sources for the mentioned datasets are as follows:

- ERA20C and ERAint data can be obtained from https://www.ecmwf.int/en/forecasts/datasets/browse-reanalysis-datasets.
- Had4Krig dataset is available at http://www.hadleycentre.metoffice.gov.uk/hadcru/data/current/HadCRUT4/.
- 20CRv2c dataset can be found at the NOAA/OAR/ESRL PSD, Boulder, Colorado, USA website: http://www.esrl.noaa.gov/psd/.
- JRA55 data was produced by the Japanese Meteorological Agency and is available at https://jra.kishou.go.jp/JRA-55/.
- MERRA2 data can be downloaded from http://disc.gsfc.nasa.gov/merra-2/.
- CFSR data was created by NOAA and is available at http://cfs.ncep.noaa.gov/cfsr/.
- The work was financially supported by the Research Council of Norway through the EuropeWeather project (no. 231322 / F20).


Type your query here (or type 'exit' to quit):  exit


Exiting the chatbot. Goodbye!
