In [33]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    CSVLoader,
    TextLoader,
    UnstructuredWordDocumentLoader
)
from langchain_core.documents import Document
import os

def load_documents_from_files(file_paths):
    """Load every supported file in `file_paths` and return all resulting documents.

    The loader is picked from the file extension, compared case-insensitively:
    `.pdf` -> PyPDFLoader, `.csv` -> CSVLoader, `.docx`/`.doc` ->
    UnstructuredWordDocumentLoader, `.txt` -> TextLoader. Any other extension
    (including none at all) is reported on stdout and skipped.

    Args:
        file_paths: Iterable of path strings.

    Returns:
        A single flat list with the documents of all loaded files, in input order.
    """

    def _loader_for(path, suffix):
        # Returns a ready loader for a known extension, or None when unsupported.
        if suffix == ".pdf":
            return PyPDFLoader(path)
        if suffix == ".csv":
            return CSVLoader(path)
        if suffix in (".docx", ".doc"):
            return UnstructuredWordDocumentLoader(path)
        if suffix == ".txt":
            return TextLoader(path)
        return None

    collected = []
    for path in file_paths:
        _, suffix = os.path.splitext(path)
        loader = _loader_for(path, suffix.lower())
        if loader is None:
            print(f"Unsupported file type: {path}")
            continue

        loaded = loader.load()
        collected.extend(loaded)
        print(f"Loaded {len(loaded)} document(s) from {path}")

    return collected


In [34]:
# Use the %pip magic rather than a `!pip` shell escape: %pip installs into the
# environment of the running kernel, while `!pip` runs whichever pip is first on PATH.
%pip install pypdf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [35]:
from tkinter import Tk, filedialog

def select_files():
    """Show a file-picker dialog and return the selected paths.

    Returns:
        list: The chosen file paths; empty when the dialog is cancelled.
    """
    # Keep a handle on the root window so it can be destroyed afterwards.
    # The previous `Tk().withdraw()` created a root that was never released,
    # leaking one hidden window per call.
    root = Tk()
    root.withdraw()  # hide the empty root window; only the dialog should show
    try:
        file_paths = filedialog.askopenfilenames(
            parent=root,
            title="Select files",
            filetypes=[
                ("Documents", "*.pdf *.csv *.docx *.doc *.txt"),
                ("All Files", "*.*")
            ]
        )
    finally:
        root.destroy()
    return list(file_paths)

# Ask the user to pick files, then load everything that was selected.
# `file_paths` and `docs` are notebook-level names; `docs` is read by later cells.
file_paths = select_files()
docs = load_documents_from_files(file_paths)


Loaded 11 document(s) from /home/vaishnavi/Desktop/SmartDOC/attention.pdf


In [36]:
# Preview the first 500 characters of the first loaded document.
# Requires at least one loaded document; indexing an empty `docs` list fails.
docs[0].page_content[:500]

'Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser ∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent o'

In [37]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_documents(docs, chunk_size=1000, chunk_overlap=200):
    """Split `docs` into chunks using a RecursiveCharacterTextSplitter.

    Args:
        docs: Documents to split; handed unchanged to `split_documents`.
        chunk_size: Forwarded to the splitter as `chunk_size`.
        chunk_overlap: Forwarded to the splitter as `chunk_overlap`.

    Returns:
        Whatever the splitter's `split_documents` returns for `docs`.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

# Chunk all loaded documents with the default size/overlap (1000 / 200),
# then display the text of the first chunk as a sanity check.
all_chunks=chunk_documents(docs)
all_chunks[0].page_content 

'Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser ∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in quality while being more parallelizable and requiring signiﬁcantl

In [38]:
# Use the %pip magic rather than `!pip` shell escapes so the packages land in the
# running kernel's environment; one command lets pip resolve all three together.
%pip install langchain_huggingface sentence_transformers langchain_chroma

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [39]:
import shutil

# Delete the previously persisted Chroma store so a rebuild starts clean
# (e.g. after switching embedding model, which changes the vector dimension).
# The path must match the `persist_directory` the store is created with, which
# defaults to "chromaDB" in create_chroma_vectorstore. The old "chroma_db" path
# never matched, so the existing store was left in place on every re-run.
shutil.rmtree("chromaDB", ignore_errors=True)


In [42]:
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

def create_chroma_vectorstore(docs, persist_directory="chromaDB", model_name="BAAI/bge-base-en-v1.5"):
    """Build a Chroma vector store from `docs` using HuggingFace embeddings.

    Args:
        docs: Documents passed to `Chroma.from_documents` as `documents`.
        persist_directory: Directory handed to Chroma as `persist_directory`.
        model_name: Model name given to `HuggingFaceEmbeddings`.

    Returns:
        The object returned by `Chroma.from_documents`.
    """
    embedder = HuggingFaceEmbeddings(model_name=model_name)
    store_options = {
        "documents": docs,
        "embedding": embedder,
        "persist_directory": persist_directory,
    }
    return Chroma.from_documents(**store_options)

# Build the vector store from all chunks with the function's defaults
# (persist directory "chromaDB", embedding model "BAAI/bge-base-en-v1.5").
vectorstore = create_chroma_vectorstore(all_chunks)


In [43]:
def ask_question(vectorstore, query, k=3):
    """Print the top `k` scored matches for `query` from `vectorstore`.

    For each hit, prints its 1-based rank, the score formatted to four
    decimals, and the first 500 characters of the document text.

    Args:
        vectorstore: Object providing `similarity_search_with_score(query, k=...)`
            that yields `(document, score)` pairs.
        query: Search text.
        k: Number of matches requested from the store.

    Returns:
        None; output goes to stdout only.
    """
    # NOTE(review): whether a lower or higher score means "closer" depends on the
    # store's metric. Confirm before interpreting the printed values.
    scored_hits = vectorstore.similarity_search_with_score(query, k=k)
    for rank, (hit, score) in enumerate(scored_hits, start=1):
        preview = hit.page_content[:500]
        print(f"\nMatch {rank} (Score: {score:.4f}):\n{preview}\n")

# Smoke-test retrieval: print the scored matches (default k=3) for a sample question.
ask_question(vectorstore, "What is attention?")


Match 1 (Score: 0.7138):
around each of the sub-layers, followed by layer normalization. We also modify the self-attention
sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This
masking, combined with fact that the output embeddings are offset by one position, ensures that the
predictions for position ican depend only on the known outputs at positions less than i.
3.2 Attention
An attention function can be described as mapping a query and a set of key-value pairs to an output,
w


Match 2 (Score: 0.7694):
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser ∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗‡
illia.polosukhin@gmail.com
Abstract
The dominant sequenc

In [46]:
def get_relevant_chunks(retriever, query, k=3):
    """Return the documents `retriever` finds for `query`, capped at `k`.

    Args:
        retriever: Object exposing `invoke(query)` or, failing that, the older
            `get_relevant_documents(query)`.
        query: Question text, passed to the retriever unchanged.
        k: Maximum number of documents to return. The retriever's own
            configuration decides how many are fetched, so `k` can only shrink
            the result. Pass `None` to disable the cap.

    Returns:
        The retrieved documents in the retriever's order, at most `k` of them.
    """
    # Prefer `invoke`; `get_relevant_documents` is the deprecated entry point and
    # is only used for retrievers that do not offer `invoke`.
    fetch = getattr(retriever, "invoke", None) or retriever.get_relevant_documents
    results = fetch(query)

    # `k` used to be accepted but silently ignored; it now limits the result.
    if k is None:
        return results
    return results[:max(k, 0)]

# Alternative kept for reference (plain retriever without a search_type):
#   vectorstore.as_retriever(search_kwargs={"k":3})
# Active retriever uses search_type "mmr" with k=3 and fetch_k=20.
# NOTE(review): presumably returns 3 results picked from 20 fetched candidates.
# Confirm against the as_retriever documentation.
retriever=vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k":3,"fetch_k":20}
)

query="What is attention?"
relevant_docs=get_relevant_chunks(retriever,query)

# Print up to the first 500 characters of each retrieved chunk, numbered from 1.
for i,doc in enumerate(relevant_docs):
  print(f"\nmatch {i+1}:\n{doc.page_content[:500]}\n")


match 1:
around each of the sub-layers, followed by layer normalization. We also modify the self-attention
sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This
masking, combined with fact that the output embeddings are offset by one position, ensures that the
predictions for position ican depend only on the known outputs at positions less than i.
3.2 Attention
An attention function can be described as mapping a query and a set of key-value pairs to an output,
w


match 2:
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser ∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based 

In [None]:
from langchain_community.llms import Ollama

# LLM handle used by generate_answer below; "tinyllama" is the model name given to Ollama.
# NOTE(review): the recorded cell output echoes this line as the source of a warning.
# Presumably a deprecation notice for this Ollama class; confirm and migrate if so.
llm = Ollama(model="tinyllama")

def generate_answer(query, retrieved_docs, llm):
    """Ask `llm` to answer `query` using the text of `retrieved_docs` as context.

    Args:
        query: The user's question, inserted into the prompt.
        retrieved_docs: Iterable of objects with a `page_content` string.
        llm: Object exposing `invoke(prompt)`.

    Returns:
        Whatever `llm.invoke` returns for the assembled prompt.
    """
    # Page contents are separated by one blank line inside the context section.
    context_block = "\n\n".join(chunk.page_content for chunk in retrieved_docs)

    # The literal's leading newline and four-space indentation are part of the
    # prompt text that is sent to the model.
    filled_prompt = f"""
    You are a helpful assistant. Use the context below to answer the question accurately.

    Context:
    {context_block}

    Question: {query}
    Answer:"""

    response = llm.invoke(filled_prompt)
    return response

query = "What is attention?"
# Retrieve context for this query and pass those documents to the LLM.
# The cell used to compute `retrieved_docs` but then pass `relevant_docs`, a
# leftover from an earlier cell, so changing `query` here silently answered from
# stale context. `invoke` replaces the deprecated `get_relevant_documents`.
retrieved_docs = retriever.invoke(query)

answer = generate_answer(query, retrieved_docs, llm)
print("SmartDOC says:\n", answer)


  llm = Ollama(model="tinyllama")


SmartDOC says:
 According to the given material, "Attention Is All You Need" (AIAM) is a new simple network architecture that combines the attention mechanism with Transformer, which includes encoder and decoder networks. The proposed model is based on a single-precision floating-point capacity of each GPU and measures changes in performance using beam search as described in the previous section. The AIAM uses various components like different heads for the attention, varying amounts of computation, and a single-head approach to attenion, but the overall performance drops off with too many heads.
