In [67]:
import pandas as pd
import numpy as np

from dotenv import load_dotenv
import os
import pprint

from langchain_groq import ChatGroq
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [43]:
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")


In [44]:

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large"
)

In [45]:
model=ChatGroq(model="llama-3.1-8b-instant")

In [5]:
model.invoke("What is the best programming langauge for data science?").content

'There is no single "best" programming language for data science, as it often depends on the specific task, the type of data, and personal preference. However, some languages are more popular and widely used in the data science community than others.\n\n**Top programming languages for data science:**\n\n1. **Python**: Python is the most popular language for data science, and for good reason. It has a vast array of libraries and tools, including:\n\t* NumPy and pandas for data manipulation and analysis\n\t* Matplotlib and Seaborn for data visualization\n\t* Scikit-learn for machine learning\n\t* TensorFlow and Keras for deep learning\n\t* Pandas, NumPy, and SciPy for data analysis\n2. **R**: R is another popular language for data science, particularly in academia and research. It\'s known for its:\n\t* Data manipulation and analysis libraries (e.g., dplyr, tidyr)\n\t* Data visualization libraries (e.g., ggplot2, Shiny)\n\t* Machine learning libraries (e.g., caret, dplyr)\n3. **Julia**: 

# Import data

In [17]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=150,
    length_function=len
)

In [18]:
file_path = "../datasets/9781836200079-LLM_ENGINEERS_HANDBOOK.pdf"
loader = PyPDFLoader(file_path)

In [19]:
document = loader.load()
document[0]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Windows)', 'creationdate': '2024-10-25T15:13:27+05:30', 'moddate': '2024-10-28T12:09:01+05:30', 'trapped': '/False', 'source': '../datasets/9781836200079-LLM_ENGINEERS_HANDBOOK.pdf', 'total_pages': 523, 'page': 0, 'page_label': 'Cover'}, page_content='')

In [20]:
len(document)

523

In [21]:
docs = text_splitter.split_documents(document)

In [22]:
len(docs)

2579

In [25]:
docs[0].metadata

{'producer': 'Adobe PDF Library 17.0',
 'creator': 'Adobe InDesign 19.5 (Windows)',
 'creationdate': '2024-10-25T15:13:27+05:30',
 'moddate': '2024-10-28T12:09:01+05:30',
 'trapped': '/False',
 'source': '../datasets/9781836200079-LLM_ENGINEERS_HANDBOOK.pdf',
 'total_pages': 523,
 'page': 1,
 'page_label': 'FM - 1'}

In [40]:
pprint.pprint(docs[20].page_content)

('at providing concrete examples and guidance on how to optimize inference '
 'pipelines and deploy \n'
 'LLMs effectively. This makes the book a valuable resource for both '
 'researchers and practitioners.\n'
 'This book is highly recommended for anyone interested in learning about LLMs '
 'and their practical \n'
 'applications. By providing a comprehensive overview of the tools, '
 'techniques, and best practices \n'
 'involved in LLM development, the authors have created a valuable resource '
 'that will undoubtedly')


In [48]:
embeddings.embed_documents(docs[0].page_content)[0]

[-0.031868573278188705,
 -0.010945110581815243,
 -0.004794959910213947,
 0.024099960923194885,
 0.005110020283609629,
 -0.010021509602665901,
 0.02496313862502575,
 0.040016982704401016,
 -0.022580765187740326,
 0.03732386603951454,
 -0.011368068866431713,
 -0.01340517122298479,
 -0.03773818910121918,
 0.008899376727640629,
 0.009072012268006802,
 0.032162051647901535,
 -0.03526949882507324,
 0.01904173009097576,
 -0.03918833285570145,
 0.021821167320013046,
 0.04595565423369408,
 -0.03770366311073303,
 -0.01772969961166382,
 0.028122374787926674,
 0.01016824971884489,
 -0.005714245606213808,
 0.017867807298898697,
 -0.002606800990179181,
 0.006456579547375441,
 0.0006344366120174527,
 0.04681883379817009,
 0.058730706572532654,
 0.0021493160165846348,
 -0.00399004528298974,
 -0.0379108265042305,
 -0.01759159006178379,
 0.06798398494720459,
 -0.007531237788498402,
 -0.019007204100489616,
 -0.0016885941149666905,
 -0.009486338123679161,
 -0.02140684239566326,
 -0.04498889297246933,
 0.0

In [49]:
vectorstore = FAISS.from_documents(docs, embeddings)

# Create chain and pipeline

In [50]:
retriever = vectorstore.as_retriever()

In [51]:
document_text = retriever.invoke("What are the top open source LLMs?")

In [57]:
pprint.pprint(document_text)

[Document(id='a03a2d42-ceff-45ff-8d17-903ff320c16c', metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 (Windows)', 'creationdate': '2024-10-25T15:13:27+05:30', 'moddate': '2024-10-28T12:09:01+05:30', 'trapped': '/False', 'source': '../datasets/9781836200079-LLM_ENGINEERS_HANDBOOK.pdf', 'total_pages': 523, 'page': 75, 'page_label': '47'}, page_content='We used Opik, an open-source tool made by Comet, as our prompt monitoring tool because it fol-\nlows Comet’s philosophy of simplicity and ease of use, which is currently relatively rare in the LLM \nlandscape. Other options offering similar features are Langfuse (open source, https://langfuse.\ncom), Galileo (not open source, rungalileo.io), and LangSmith (not open source, https://www.\nlangchain.com/langsmith), but we found their solutions more cumbersome to use and imple -'),
 Document(id='407e41d5-21c0-4cc5-9292-7594ab38d0cf', metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.5 

In [58]:
ft_query = vectorstore.similarity_search("What is LLM fine-tuning?")

In [62]:
pprint.pprint(ft_query[0].page_content)

('Old information\n'
 'Any LLM is trained or fine-tuned on a subset of the total world knowledge '
 'dataset. This is due \n'
 'to three main issues:\n'
 '• Private data: You cannot train your model on data you don’t own or have '
 'the right to use.\n'
 '• New data: New data is generated every second. Thus, you would have to '
 'constantly train \n'
 'your LLM to keep up.\n'
 '• Costs: Training or fine-tuning an LLM is an extremely costly operation. '
 'Hence, it is not \n'
 'feasible to do it on an hourly or daily basis.')


In [63]:
# Question: user question
# Context: based on question, retrive information from the vector database

prompt_template = """
    Answer the question based on the context provided below.
    If the context does not contain sufficient information, response with:
    "I do not have enough information about this."

    Context: {context}

    Question: {question}

    Answer: """



In [65]:
prompt = PromptTemplate(
    template = prompt_template,
    input_variables=["context", "question"]
)

In [68]:
parser = StrOutputParser()

In [71]:
def format_docs(docs):
    return "\n\n.join([doc.page_content for doc in docs])"

In [None]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt 
    | model 
    | parser
)

In [70]:
rag_chain.invoke("Tell me about developing LLM applications.")

TypeError: Expected mapping type as input to PromptTemplate. Received <class 'str'>.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT 