#### Downloading the required packages

In [None]:
# pypdf -> reading PDF files
# transformers -> open-source transformer models from Hugging Face
# einops -> readable tensor rearrange/reduce operations; required by some Hugging Face model implementations
# accelerate -> device placement and efficient loading of large models
# langchain -> wrapper to combine multiple components (used here for the embedding model)
# bitsandbytes -> quantizes the model so it takes less memory with little loss in quality
# sentence_transformers -> creates the embeddings
# llama-index -> platform for building RAG applications on top of various LLMs

In [None]:
! pip install -q pypdf transformers einops accelerate langchain bitsandbytes sentence_transformers llama-index

In [2]:
# importing the modules
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts.prompts import SimpleInputPrompt
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding
import torch
# hugging face token
from huggingface_hub import notebook_login

In [None]:
# Log in to Hugging Face; this opens an interactive token prompt in the notebook.
# Needed because meta-llama/Llama-2-7b-chat-hf (loaded below) is a gated model.
notebook_login()

In [3]:
# Load every file found under the 'in_data' folder into llama-index Document objects
documents = SimpleDirectoryReader(input_dir='in_data').load_data()

In [4]:
# Sanity check: show the first two loaded Document objects (text plus file metadata)
documents[:2]

[Document(id_='484046d3-d445-43c6-ae45-e84e6ef5358f', embedding=None, metadata={'page_label': 'Cover', 'file_name': 'Data Science from Scratch ( PDFDrive ).pdf', 'file_path': '/kaggle/input/ds-pdfs/Data Science from Scratch ( PDFDrive ).pdf', 'file_type': 'application/pdf', 'file_size': 6216449, 'creation_date': '2024-01-31', 'last_modified_date': '2024-01-31', 'last_accessed_date': '2024-01-31'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text="DATA/DATA SCIENCEData Science from Scratch\nISBN: 978-1-491-90142-7US $39.99  CAN $45.99“\tJoel\ttakes\t you\ton\ta\t\njourney \tfrom\tbeing \t\ndata-curious \tto\tgetting \ta\t\nthorough \tunderstanding \t\nof\tthe\tbread-and-butter \t\nalgorithms \tthat\tevery \tdata\t\nscientist \tshould \tknow.”\n—Rohit Sivapr

In [5]:
# System prompt that steers the model towards accurate, context-grounded answers,
# wrapped in Llama-2's chat markers.
# llama-index emits the system prompt ahead of the query text, so the opening
# "[INST] <<SYS>>" lives here and the closing "[/INST]" lives in the query wrapper.
# The leading <s> (BOS) token is not written out; the tokenizer is expected to add it.
Base_prompt = '''[INST] <<SYS>>
You are a Q&A assistant. Your goal is to answer questions \
accurately based on the instructions and provided context.
<</SYS>>

'''
# Llama-2 chat instruction format. The previous "<|USER|>...<|ASSISTANT|>" markers
# are StableLM's format and carry no special meaning for a Llama-2 model.
query_wrapper_prompt = SimpleInputPrompt("{query_str} [/INST]")

In [None]:
# Load the Llama-2 7B chat model and its tokenizer from the Hugging Face Hub
base_model = HuggingFaceLLM(
    # --- which checkpoint to load ---
    model_name="meta-llama/Llama-2-7b-chat-hf",
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    # --- how to load it ---
    device_map='auto',  # place weights automatically; spills to CPU RAM if the GPU is too small
    model_kwargs={
        "torch_dtype": torch.float16,  # half precision for the parts that are not quantized
        "load_in_8bit": True,          # 8-bit weights (bitsandbytes) to cut memory use
    },
    # --- prompting ---
    system_prompt=Base_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    # --- generation ---
    context_window=4096,  # context window of the model
    max_new_tokens=256,   # upper bound on the length of each response
    generate_kwargs={
        'do_sample': True,   # sample instead of greedy decoding
        'temperature': 0.3,  # low temperature keeps answers focused
    },
)

In [None]:
# Embedding model: an open-source sentence-transformers checkpoint loaded through
# LangChain, then wrapped so llama-index can use it.
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
embed_model = LangchainEmbedding(hf_embeddings)

In [11]:
# Bundle the LLM, the embedding model and the chunking setting into a single
# ServiceContext (comparable to a chain in LangChain) for the steps that follow.
service_context = ServiceContext.from_defaults(
    llm=base_model,           # open-source LLM loaded from Hugging Face
    embed_model=embed_model,  # open-source embedding model from Hugging Face
    chunk_size=1024,          # documents are split into chunks of size 1024
)

In [None]:
# Embed the loaded documents and keep the resulting vector index in memory as `index`
index = VectorStoreIndex.from_documents(
    documents=documents,
    service_context=service_context,
)

In [13]:
index # Displays the object's repr: its class and memory address, not the stored vectors

<llama_index.indices.vector_store.base.VectorStoreIndex at 0x7d6a78a6f970>

In [None]:
# Save the index to disk so the embeddings do not have to be recomputed later.
# Every index type exposes storage_context.persist(), which writes all of its data
# to the directory given.
persist_dir = "vector_disk"  # output directory; also read back when reloading the index
index.storage_context.persist(persist_dir=persist_dir)

In [None]:
# If the index has already been built and persisted, load it straight from disk
# instead of re-reading the documents and re-embedding them.

from llama_index import StorageContext, load_index_from_storage

# Rebuild the storage context from the persisted files.
# StorageContext.from_defaults() has no service_context parameter, so passing
# one here raises a TypeError.
storage_context = StorageContext.from_defaults(persist_dir=persist_dir)

# Load the index with the same service_context (LLM + embedding model) that
# built it; without it llama-index falls back to its default models.
index = load_index_from_storage(storage_context,
                                service_context=service_context)

In [14]:
# Build a query engine on top of the index. For each question it retrieves the stored
# chunks most similar to the query and has the LLM write an answer from them.
query_engine = index.as_query_engine()

In [None]:
# Smoke test: ask one question that the indexed PDFs should be able to answer
response = query_engine.query("How to handle missing data?")

In [19]:
print(response) # print the answer text generated by the LLM


Missing data can be handled in several ways, depending on the nature of the data and the goals of the analysis. Here are some common methods for handling missing data:

1. **Dropna**: This method removes rows or columns with missing data, based on a specified threshold. This method is useful when the data is clean and the missing data is not due to errors in data collection.
2. **Fillna**: This method fills in missing data with a specified value or using an interpolation method such as 'ffill' or 'bfill'. This method is useful when the data is clean and the missing data is due to a lack of data.
3. **Isnull**: This method returns a boolean value indicating which values are missing/NA. This method is useful when the data is clean and the missing data is due to errors in data collection.
4. **Notnull**: This method negates the result of isnull. This method is useful when the data is clean and the missing data is due to errors in data collection.

It is important to note that the choice 