In [1]:
import pandas as pd
import numpy as np

from dotenv import load_dotenv
import os
import pprint

from langchain_groq import ChatGroq
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [2]:
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")


In [3]:

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large"
)

In [4]:
model=ChatGroq(model="llama-3.1-8b-instant")

In [5]:
model.invoke("What is the best programming langauge for data science?").content

"The best programming language for data science is often a matter of personal preference, the specific task at hand, and the ecosystem you're working in. However, here are some popular choices:\n\n1. **Python**: Python is the most widely used language in data science, and for good reason. It has a vast number of libraries and tools, including:\n\t* NumPy and Pandas for data manipulation and analysis\n\t* Scikit-learn for machine learning\n\t* Matplotlib and Seaborn for data visualization\n\t* Scipy for scientific computing\n\t* TensorFlow and PyTorch for deep learning\n2. **R**: R is a popular choice for statistical computing and data visualization. It has a wide range of libraries, including:\n\t* dplyr for data manipulation\n\t* tidyr for data transformation\n\t* ggplot2 for data visualization\n\t* caret for machine learning\n3. **Julia**: Julia is a new language that's gaining popularity in the data science community. It's designed to be faster than Python and R, with a focus on hig

# Import data

In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=150,
    length_function=len
)

In [7]:
file_path = "../datasets/9781836200079-LLM_ENGINEERS_HANDBOOK.pdf"
loader = PyPDFLoader(file_path)

In [8]:
document = loader.load()
document[0]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Windows)', 'creationdate': '2024-10-25T15:13:27+05:30', 'moddate': '2024-10-28T12:09:01+05:30', 'trapped': '/False', 'source': '../datasets/9781836200079-LLM_ENGINEERS_HANDBOOK.pdf', 'total_pages': 523, 'page': 0, 'page_label': 'Cover'}, page_content='')

In [9]:
len(document)

523

In [10]:
docs = text_splitter.split_documents(document)

In [11]:
len(docs)

2579

In [12]:
docs[0].metadata

{'producer': 'Adobe PDF Library 17.0',
 'creator': 'Adobe InDesign 19.5 (Windows)',
 'creationdate': '2024-10-25T15:13:27+05:30',
 'moddate': '2024-10-28T12:09:01+05:30',
 'trapped': '/False',
 'source': '../datasets/9781836200079-LLM_ENGINEERS_HANDBOOK.pdf',
 'total_pages': 523,
 'page': 1,
 'page_label': 'FM - 1'}

In [13]:
pprint.pprint(docs[20].page_content)

('at providing concrete examples and guidance on how to optimize inference '
 'pipelines and deploy \n'
 'LLMs effectively. This makes the book a valuable resource for both '
 'researchers and practitioners.\n'
 'This book is highly recommended for anyone interested in learning about LLMs '
 'and their practical \n'
 'applications. By providing a comprehensive overview of the tools, '
 'techniques, and best practices \n'
 'involved in LLM development, the authors have created a valuable resource '
 'that will undoubtedly')


In [14]:
embeddings.embed_documents(docs[0].page_content)[0]

[-0.031815316528081894,
 -0.01097050216048956,
 -0.004820633679628372,
 0.02408158965408802,
 0.0051572578959167,
 -0.010021048597991467,
 0.02501378022134304,
 0.04004966840147972,
 -0.0226142518222332,
 0.03725309669971466,
 -0.011350283399224281,
 -0.013421817682683468,
 -0.037736453115940094,
 0.008959387429058552,
 0.009088858030736446,
 0.03217783570289612,
 -0.03526787459850311,
 0.019040854647755623,
 -0.039221055805683136,
 0.021837426349520683,
 0.045988067984580994,
 -0.0376674048602581,
 -0.017728883773088455,
 0.028155608102679253,
 0.01017641369253397,
 -0.005657878704369068,
 0.017866985872387886,
 -0.0026498378720134497,
 0.006473545450717211,
 0.0005842375103384256,
 0.04681668058037758,
 0.058693479746580124,
 0.0021610853727906942,
 -0.003931600134819746,
 -0.03790908306837082,
 -0.017590781673789024,
 0.06798085570335388,
 -0.007487734314054251,
 -0.018971804529428482,
 -0.0017187263583764434,
 -0.009477270767092705,
 -0.021388594061136246,
 -0.04495229944586754,
 0

In [15]:
vectorstore = FAISS.from_documents(docs, embeddings)

# Create chain and pipeline

In [75]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

In [64]:
document_text = retriever.invoke("What are the top open source LLMs?")

In [65]:
pprint.pprint(document_text)

[Document(id='3cd758ad-4d69-4f08-b8b1-2366e453125f', metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Windows)', 'creationdate': '2024-10-25T15:13:27+05:30', 'moddate': '2024-10-28T12:09:01+05:30', 'trapped': '/False', 'source': '../datasets/9781836200079-LLM_ENGINEERS_HANDBOOK.pdf', 'total_pages': 523, 'page': 75, 'page_label': '47'}, page_content='We used Opik, an open-source tool made by Comet, as our prompt monitoring tool because it fol-\nlows Comet’s philosophy of simplicity and ease of use, which is currently relatively rare in the LLM \nlandscape. Other options offering similar features are Langfuse (open source, https://langfuse.\ncom), Galileo (not open source, rungalileo.io), and LangSmith (not open source, https://www.\nlangchain.com/langsmith), but we found their solutions more cumbersome to use and imple -'),
 Document(id='603728ef-240a-4dd7-aae9-2c54d613f819', metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 

In [72]:
ft_query = vectorstore.similarity_search("What is LLM fine-tuning?", k=10)

In [73]:
pprint.pprint(ft_query[0].page_content)

('Old information\n'
 'Any LLM is trained or fine-tuned on a subset of the total world knowledge '
 'dataset. This is due \n'
 'to three main issues:\n'
 '• Private data: You cannot train your model on data you don’t own or have '
 'the right to use.\n'
 '• New data: New data is generated every second. Thus, you would have to '
 'constantly train \n'
 'your LLM to keep up.\n'
 '• Costs: Training or fine-tuning an LLM is an extremely costly operation. '
 'Hence, it is not \n'
 'feasible to do it on an hourly or daily basis.')


In [76]:
# Question: user question
# Context: based on question, retrive information from the vector database

prompt_template = """
    Answer the question based on the context provided below.
    If the context does not contain sufficient information, response with:
    "I do not have enough information about this."

    Context: {context}

    Question: {question}

    Answer:"""



In [77]:
prompt = PromptTemplate(
    template = prompt_template,
    input_variables=["context", "question"]
)

In [38]:
prompt

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\n    Answer the question based on the context provided below.\n    If the context does not contain sufficient information, response with:\n    "I do not have enough information about this."\n\n    Context: {context}\n\n    Question: {question}\n\n    Answer:')

In [59]:
parser = StrOutputParser()

In [78]:
def format_docs(docs):
    return "\n\n.join([doc.page_content for doc in docs])"

In [79]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [80]:
rag_chain.invoke("What is fine-tuning an LLM?")

'I do not have enough information about this.'