In [1]:
import os
import sys
import pypdf

from langchain.llms import Ollama
from langchain_chroma import Chroma
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

In [2]:
def ingest_multiple_pdfs(folder_path):
    # List all PDF files in the folder
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]

    all_chunks = []  # To store all chunks from all PDFs
    total_pages = 0  # To track total number of pages processed

    for pdf_file in pdf_files:
        file_path = os.path.join(folder_path, pdf_file)
        print(f"Processing: {pdf_file}")
        
        # Load and split the PDF
        loader = PyPDFLoader(file_path)
        pages = loader.load_and_split()
        total_pages += len(pages)

        # Split the pages by char
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1024,
            chunk_overlap=100,
            length_function=len,
            add_start_index=True,
        )

        chunks = text_splitter.split_documents(pages)
        all_chunks.extend(chunks)  # Add chunks from this PDF to the total
        print(f"Split {len(pages)} pages into {len(chunks)} chunks from {pdf_file}.")

    print(f"Processed {total_pages} pages into {len(all_chunks)} chunks across all PDFs.")

    # Create embeddings
    embedding = FastEmbedEmbeddings()

    # Create and persist vector store
    #vector_store = Chroma.from_documents(
    Chroma.from_documents(
        documents=all_chunks,
        embedding=embedding,
        persist_directory="./sql_chroma_db"
    )
    print("Vector store created and persisted.")

In [3]:
#Run this only once to create the vector database 
folder_path = "D:\\Research\\PhD_thesis\\16th Semester work\\Papers_given_by_Sir"
ingest_multiple_pdfs(folder_path)

Processing: A classical nucleation theory description of active colloid assembly Supplementary Materials.pdf
Split 8 pages into 26 chunks from A classical nucleation theory description of active colloid assembly Supplementary Materials.pdf.
Processing: Aggregation-fragmentation and individual dynamics of active clusters.pdf
Split 16 pages into 57 chunks from Aggregation-fragmentation and individual dynamics of active clusters.pdf.
Processing: Classical Nucleation Theory Description of Active Colloid Assembly.pdf
Split 12 pages into 40 chunks from Classical Nucleation Theory Description of Active Colloid Assembly.pdf.
Processing: Clustering and heterogeneous dynamics in a kinetic Monte Carlo model of self-propelled hard disks.pdf
Split 31 pages into 113 chunks from Clustering and heterogeneous dynamics in a kinetic Monte Carlo model of self-propelled hard disks.pdf.
Processing: Interrupted Motility Induced Phase Separation in Aligning Active Colloids.pdf
Split 11 pages into 33 chunks fr

In [None]:
#Create an access token from Hugging face and use the same as the read and write token below
from huggingface_hub import login
access_token_read = "hf_nMedcrxscgRruLcCMGPcUHbYyqcGxdcuzasEDy"
access_token_write = "hf_nMedcrxscgRruLcCMGPcUHbYyqcGxdcuzasEDy"
login(token = access_token_read)

In [5]:
def rag_chain():
    model = Ollama(model="llama3", base_url="http://localhost:11434/",
                    temperature = 0.0)
    
    prompt = PromptTemplate.from_template(
        """
        <s> [Instructions] You are a friendly assistant. Answer the question based only on the following context. 
        If you don't know the answer, then reply, No Context availabel for this question {input}. [/Instructions] </s> 
        [Instructions] Question: {input} 
        Context: {context} 
        Answer: [/Instructions]
        """
    )
    #Load vector store
    embedding = FastEmbedEmbeddings()
    vector_store = Chroma(persist_directory="./sql_chroma_db", embedding_function=embedding)

    #Create chain
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": 3,
            "score_threshold": 0.5,
        },
    )

    document_chain = create_stuff_documents_chain(model, prompt)
    chain = create_retrieval_chain(retriever, document_chain)

    return chain

In [6]:
#Test
chain = rag_chain()

  model = Ollama(model="llama3", base_url="http://localhost:11434/",


In [7]:
result = chain.invoke({"input": "What are the gaps in active particle dynamics?"})
print(result["answer"])

Based on the provided context, I can answer your question. The gaps in active particle dynamics are:

* Understanding the kinetics of evolution toward the steady state in active systems.
* Disentangling the relative role played by self-propulsion and noise in the physics of active Brownian particles.

These gaps were mentioned in the text as areas where significant progress has been made, but more work is needed to fully understand the dynamics of active particle systems.


In [8]:
#Creating a function for the query search
def ask(query: str):
    #create chain
    chain = rag_chain()
    #invoke chain
    result = chain.invoke({"input": query})
    #print results with source
    print(result["answer"])
    for doc in result["context"]:
        print("Source: ", doc.metadata["source"])

In [12]:
ask("What are the gaps in active particle dynamics?")

Based on the provided context, I can identify some gaps in active particle dynamics:

1. **Understanding the kinetics of evolution towards steady states**: While significant progress has been made in understanding stationary distributions, the kinetics of evolution towards these states remain poorly understood.
2. **Disentangling the relative role of self-propulsion and noise**: The introduction of η as an additional control parameter allows us to disentangle the relative role of self-propulsion and noise in active Brownian particles, but it is unclear how this will impact our understanding of the dynamics.

These gaps highlight areas where further research is needed to better understand the behavior of active particle systems.
Source:  D:\Research\PhD_thesis\16th Semester work\Papers_given_by_Sir\Clustering and heterogeneous dynamics in a kinetic Monte Carlo model of self-propelled hard disks.pdf
Source:  D:\Research\PhD_thesis\16th Semester work\Papers_given_by_Sir\Aggregation-fragme

In [14]:
ask("What is kinetic monte carlo simulations?")

Kinetic Monte Carlo simulations are a type of computational method used to analyze the kinetics of statistical models, including off-lattice complex fluids. In this context, kinetic Monte Carlo simulations involve updating the positions of particles sequentially and repeating an elementary trial step that consists of:

1. Choosing a particle at random
2. Drawing a random displacement from a chosen distribution
3. Accepting or rejecting the move based on whether it creates overlap between particles

The simulation can be perturbed by introducing noise through small amplitude Monte Carlo moves, allowing for particle displacements that are not uniquely controlled by the direction of the displacement. The strength of this noise is quantified by the quantity η, which compares the relative size of the Monte Carlo moves to the persistent moves.

In this specific context, kinetic Monte Carlo simulations are used to study the dynamics of self-propelled hard disks, where activity is controlled b

In [15]:
ask("What are the different phases observed in self-propelled hard disks?")

According to the provided context, the different phases observed in self-propelled hard disks are:

1. Fluid phase
2. Clustered phase
3. Heterogeneous dynamics phase

These phases are described in the paper "Clustering and heterogeneous dynamics in a kinetic Monte Carlo model of self-propelled hard disks" by Demian Levis and Ludovic Berthier, published in Physical Review E in 2014.
Source:  D:\Research\PhD_thesis\16th Semester work\Papers_given_by_Sir\Clustering and heterogeneous dynamics in a kinetic Monte Carlo model of self-propelled hard disks.pdf
Source:  D:\Research\PhD_thesis\16th Semester work\Papers_given_by_Sir\Clustering and heterogeneous dynamics in a kinetic Monte Carlo model of self-propelled hard disks.pdf
Source:  D:\Research\PhD_thesis\16th Semester work\Papers_given_by_Sir\Clustering and heterogeneous dynamics in a kinetic Monte Carlo model of self-propelled hard disks.pdf


In [17]:
ask("What is cluster size distribution?")

What is cluster size distribution?

According to the given context, the cluster mass distribution P(n) is defined as the normalized histogram obtained by measuring the number of clusters containing n particles. In other words, it's a measure of how many clusters have a certain number of particles (n).
Source:  D:\Research\PhD_thesis\16th Semester work\Papers_given_by_Sir\Clustering and heterogeneous dynamics in a kinetic Monte Carlo model of self-propelled hard disks.pdf
Source:  D:\Research\PhD_thesis\16th Semester work\Papers_given_by_Sir\Aggregation-fragmentation and individual dynamics of active clusters.pdf
Source:  D:\Research\PhD_thesis\16th Semester work\Papers_given_by_Sir\A classical nucleation theory description of active colloid assembly Supplementary Materials.pdf
