In [1]:
from dotenv import load_dotenv
load_dotenv()

from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma


In [28]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader('aiayn.pdf')

document = loader.load()

In [3]:
document

[Document(metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20251101175209', 'source': 'aiayn.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1'}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser ∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with r

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n## ", "\n### ",     # section headings (if markdown exists)
        "\n\n",                # paragraph
        "\n",                  # line
        ". ",                  # sentence
        " ",                   # word
        ""                     # fallback
    ],
    chunk_size=1200,      # 1000–2000 recommended for research papers
    chunk_overlap=150,    # 10–15% overlap for context retention
    length_function=len
)



In [5]:
chunks = splitter.split_documents(document)
chunks

[Document(metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20251101175209', 'source': 'aiayn.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1'}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser ∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with r

In [6]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

embedding_model = OpenAIEmbeddings()

db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,      # Pass the embedding model directly
    persist_directory="chroma_db_aiayn"
)

print("✅ Stored in ChromaDB using OpenAI embeddings!")


✅ Stored in ChromaDB using OpenAI embeddings!


In [7]:
db.get(include=['embeddings','documents', 'metadatas'])

{'ids': ['fd03ecc3-f2c6-4bf4-ab6a-b2db3fc34cab',
  'eb7630a4-baf7-47ce-b31a-edee108ea782',
  'c6274f47-fa75-437c-99df-234b17905137',
  '9ef208b2-d822-4601-a10b-5fd1ec1e0f2e',
  'e5cdd330-1b6e-4585-acee-4764be5d4667',
  '0443cd19-1895-4484-94a6-20a21a1c63f8',
  'a1aaa530-ebbf-4b4c-a390-e67da0fa1ee3',
  'a0322820-d39f-4314-a970-f4748f36bb0e',
  '2f9581b2-30ca-4b82-a61f-3d51f7901bc7',
  '136e05d4-a256-4ea8-b903-c8f9dfcfe533',
  'cb5ceaf2-c01a-47e2-be5c-58efd9793a5b',
  '86d9ff9f-f66a-40d0-a228-6fdd8d51f6e0',
  '1136d6df-585b-4f95-8995-7e77c154e3d9',
  '00f1b26e-c21f-4617-a62b-fcc1b458ca91',
  '1e475da9-599d-4f62-b3d6-f2a3ebcae25d',
  'a4f90b24-db2f-43e3-8ee0-40267c3a1ae5',
  'c89f49c6-1001-4690-b736-e12c024875f3',
  '8a1fb997-db1b-4193-bdfb-69d1506c85a5',
  'b552159c-5a65-494f-adb3-565ae2493884',
  '537f35f0-0398-452a-a268-6ef6f541d32e',
  '48bcbc51-a514-422d-93ea-306838c45f8d',
  '573ae179-186a-43d4-a8bd-79aecdf667a2',
  'ac4aaa4d-c384-4811-8732-9625224e92e2',
  '84b89645-daeb-4479-acfb-

In [8]:
db.similarity_search('what is encode?', k=2)

[Document(metadata={'source': 'aiayn.pdf', 'page_label': '5', 'total_pages': 11, 'producer': 'PDFium', 'page': 4, 'creationdate': 'D:20251101175209', 'creator': 'PDFium'}, page_content='from layer to layer. Another way of describing this is as two convolutions with kernel size 1.\nThe dimensionality of input and output is dmodel = 512, and the inner-layer has dimensionality\ndff = 2048.\n3.4 Embeddings and Softmax\nSimilarly to other sequence transduction models, we use learned embeddings to convert the input\ntokens and output tokens to vectors of dimension dmodel. We also use the usual learned linear transfor-\nmation and softmax function to convert the decoder output to predicted next-token probabilities. In\nour model, we share the same weight matrix between the two embedding layers and the pre-softmax\nlinear transformation, similar to [24]. In the embedding layers, we multiply those weights by √dmodel.\n3.5 Positional Encoding\nSince our model contains no recurrence and no convol

In [9]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})
retriever

VectorStoreRetriever(tags=['Chroma', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001BB73C938F0>, search_kwargs={'k': 4})

In [10]:
retriever.invoke('What is attention')

[Document(metadata={'creator': 'PDFium', 'total_pages': 11, 'page': 1, 'source': 'aiayn.pdf', 'producer': 'PDFium', 'page_label': '2', 'creationdate': 'D:20251101175209'}, page_content='reduced to a constant number of operations, albeit at the cost of reduced effective resolution due\nto averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as\ndescribed in section 3.2.\nSelf-attention, sometimes called intra-attention is an attention mechanism relating different positions\nof a single sequence in order to compute a representation of the sequence. Self-attention has been\nused successfully in a variety of tasks including reading comprehension, abstractive summarization,\ntextual entailment and learning task-independent sentence representations [4, 22, 23, 19].\nEnd-to-end memory networks are based on a recurrent attention mechanism instead of sequence-\naligned recurrence and have been shown to perform well on simple-language question answering and\n

In [29]:
llm = ChatOpenAI()

In [30]:
from langchain_core.prompts import PromptTemplate
prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [35]:
question          = "summarize the key concepts of the paper"
retrieved_docs    = retriever.invoke(question)

In [36]:
retrieved_docs

[Document(metadata={'producer': 'PDFium', 'source': 'aiayn.pdf', 'page': 0, 'creator': 'PDFium', 'creationdate': 'D:20251101175209', 'page_label': '1', 'total_pages': 11}, page_content='has been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head\nattention and the parameter-free position representation and became the other person involved in nearly every\ndetail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and\ntensor2tensor. Llion also experimented with novel model variants, was responsible for our initial codebase, and\nefﬁcient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and\nimplementing tensor2tensor, replacing our earlier codebase, greatly improving results and massively accelerating\nour research.\n†Work performed while at Google Brain.\n‡Work performed while at Google Research.\n31st Conference on Neural Information P

In [33]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

'[21] Minh-Thang Luong, Hieu Pham, and Christopher D Manning. Effective approaches to attention-\nbased neural machine translation. arXiv preprint arXiv:1508.04025, 2015.\n[22] Ankur Parikh, Oscar Täckström, Dipanjan Das, and Jakob Uszkoreit. A decomposable attention\nmodel. In Empirical Methods in Natural Language Processing, 2016.\n[23] Romain Paulus, Caiming Xiong, and Richard Socher. A deep reinforced model for abstractive\nsummarization. arXiv preprint arXiv:1705.04304, 2017.\n[24] Oﬁr Press and Lior Wolf. Using the output embedding to improve language models. arXiv\npreprint arXiv:1608.05859, 2016.\n[25] Rico Sennrich, Barry Haddow, and Alexandra Birch. Neural machine translation of rare words\nwith subword units. arXiv preprint arXiv:1508.07909, 2015.\n[26] Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton,\nand Jeff Dean. Outrageously large neural networks: The sparsely-gated mixture-of-experts\nlayer. arXiv preprint arXiv:1701.06538, 2017

In [34]:
final_prompt = prompt.invoke({"context": context_text, "question": question})
final_prompt

StringPromptValue(text="\n      You are a helpful assistant.\n      Answer ONLY from the provided transcript context.\n      If the context is insufficient, just say you don't know.\n\n      [21] Minh-Thang Luong, Hieu Pham, and Christopher D Manning. Effective approaches to attention-\nbased neural machine translation. arXiv preprint arXiv:1508.04025, 2015.\n[22] Ankur Parikh, Oscar Täckström, Dipanjan Das, and Jakob Uszkoreit. A decomposable attention\nmodel. In Empirical Methods in Natural Language Processing, 2016.\n[23] Romain Paulus, Caiming Xiong, and Richard Socher. A deep reinforced model for abstractive\nsummarization. arXiv preprint arXiv:1705.04304, 2017.\n[24] Oﬁr Press and Lior Wolf. Using the output embedding to improve language models. arXiv\npreprint arXiv:1608.05859, 2016.\n[25] Rico Sennrich, Barry Haddow, and Alexandra Birch. Neural machine translation of rare words\nwith subword units. arXiv preprint arXiv:1508.07909, 2015.\n[26] Noam Shazeer, Azalia Mirhoseini, Kr

In [20]:
answer = llm.invoke(final_prompt)
print(answer.content)

Yes, the topic of attention is discussed in the context of a new network architecture called the Transformer, which is based solely on attention mechanisms. It dispenses with recurrence and convolutions entirely. The experiments conducted on two machine translation tasks show that these models are superior in quality, more parallelizable, and require significantly less time to train. The model achieved a BLEU score of 28.4 on the WMT 2014 English-to-German translation task, improving over existing best results.


In [21]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser


In [22]:
def format_docs(retrieved_docs):
  context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [23]:
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

In [24]:
parallel_chain.invoke('who is ashish')

{'context': 'has been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head\nattention and the parameter-free position representation and became the other person involved in nearly every\ndetail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and\ntensor2tensor. Llion also experimented with novel model variants, was responsible for our initial codebase, and\nefﬁcient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and\nimplementing tensor2tensor, replacing our earlier codebase, greatly improving results and massively accelerating\nour research.\n†Work performed while at Google Brain.\n‡Work performed while at Google Research.\n31st Conference on Neural Information Processing Systems (NIPS 2017), Long Beach, CA, USA.\n\nhas been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head\natte

In [25]:
parser = StrOutputParser()

In [26]:
main_chain = parallel_chain | prompt | llm | parser

In [27]:
main_chain.invoke('Can you summarize the pdf')

'The document discusses the training regime for models used in machine translation, specifically focusing on the WMT 2014 English-German and English-French datasets. It details the training data, which includes 4.5 million sentence pairs for English-German and 36 million for English-French, with specific encoding methods and vocabulary sizes. The models were trained on a machine with 8 NVIDIA P100 GPUs, with each training step taking about 0.4 seconds. The results show significant improvements in translation performance, achieving a new state-of-the-art BLEU score of 41.0 for English-French translation after 3.5 days of training. The introduction highlights the transition from recurrent neural networks to self-attention mechanisms in sequence modeling and machine translation.'