# RAG Pipeline

In [None]:
%%capture
# Environment setup: installs every third-party package the notebook imports.
# `%pip` is used instead of `!pip` so the packages are installed into the
# environment of the running kernel rather than whatever `pip` is on the shell PATH.
# NOTE(review): nothing here is version-pinned (transformers is even taken from the
# main branch), so results may differ between runs — consider pinning versions.

# Model loading / quantization stack
%pip install -q git+https://github.com/huggingface/transformers
%pip install -q datasets loralib sentencepiece
%pip install -q bitsandbytes accelerate
%pip install -q langchain
%pip install einops
%pip install faiss-gpu
%pip install --upgrade --quiet langchain-community chromadb bs4 qdrant-client
%pip install langchainhub

# Document loaders
%pip install --upgrade --quiet wikipedia
%pip install --upgrade --quiet arxiv
%pip install --upgrade --quiet pymupdf

%pip install xmltodict

# Cohere integration and evaluation metrics
%pip install cohere
%pip install -U langchain-cohere
%pip install evaluate
%pip install bert_score

# Upgrades / embedding models
%pip install --upgrade tensorflow
%pip install -U accelerate
%pip install sentence_transformers


In [None]:
import torch
import os
import bs4
import json
import numpy as np
import time
import locale


from pprint import pprint
from transformers import AutoTokenizer , AutoModelForCausalLM
from transformers import pipeline, BitsAndBytesConfig

from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import Qdrant
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.utils.math import cosine_similarity

from langchain_community.document_loaders import ArxivLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.document_loaders import OnlinePDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import PubMedLoader


In [None]:
def _utf8_preferred_encoding(do_setlocale=True):
    """Replacement for ``locale.getpreferredencoding`` that always reports UTF-8.

    ``do_setlocale`` mirrors the standard-library signature and is ignored; it is
    accepted so that callers using ``locale.getpreferredencoding(False)`` do not
    fail with a TypeError, which a zero-argument replacement would cause.
    """
    return "UTF-8"


# Override the process-wide preferred encoding so every later lookup yields UTF-8.
# NOTE(review): presumably a workaround for hosted runtimes whose locale is not
# UTF-8 — confirm it is still needed in the target environment.
locale.getpreferredencoding = _utf8_preferred_encoding

In [4]:
def LLM_pipeline(llm_name="microsoft/Phi-3-mini-4k-instruct", temperature=0.5, top_p=0.95, max_length=4096):
    """Build a 4-bit quantized Hugging Face ``text-generation`` pipeline.

    Args:
        llm_name: Model identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.
        temperature: Sampling temperature forwarded to the pipeline.
        top_p: Nucleus-sampling threshold forwarded to the pipeline.
        max_length: ``max_length`` value forwarded to the pipeline.

    Returns:
        The object returned by ``pipeline("text-generation", ...)``; sampling is
        enabled and a repetition penalty of 1.2 is applied.
    """
    # The CPU-offload switch documented for BitsAndBytesConfig is
    # `llm_int8_enable_fp32_cpu_offload`. The `llm_int4_...` spelling used before
    # is not a documented option, so the offload was never actually requested.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        llm_int8_enable_fp32_cpu_offload=True,
    )
    llm_model = AutoModelForCausalLM.from_pretrained(
        llm_name,
        torch_dtype=torch.float32,
        device_map='auto',  # let the library decide where each module is placed
        quantization_config=quantization_config,
        trust_remote_code=True,  # executes model code shipped with the checkpoint
    )

    llm_tokenizer = AutoTokenizer.from_pretrained(llm_name)
    llm_pipeline = pipeline(
        "text-generation",
        model=llm_model,
        tokenizer=llm_tokenizer,
        max_length=max_length,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        repetition_penalty=1.2,
    )

    # TODO: add an explicit EOS token to the generation settings.
    return llm_pipeline

In [2]:
def format_docs(docs):
    """Concatenate the ``page_content`` of every document in ``docs``.

    Consecutive documents are separated by one blank line; an empty iterable
    yields an empty string.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

In [5]:
def text_spliiter(Docs, Chunk_size = 200, Overlap = 20):
    """Split documents into overlapping chunks and number the chunks.

    Args:
        Docs: Documents handed to ``RecursiveCharacterTextSplitter.split_documents``.
        Chunk_size: ``chunk_size`` given to the splitter.
        Overlap: ``chunk_overlap`` given to the splitter. It should not exceed
            ``Chunk_size``; the splitter is expected to reject such a combination.

    Returns:
        The list of chunks produced by the splitter, where each chunk's
        ``metadata['split_id']`` holds its index in that list.

    Note:
        The defaults were previously ``Chunk_size=20, Overlap=200``, i.e. an
        overlap larger than the chunk itself, so a call relying on the defaults
        could not succeed. They are now the other way round (200 / 20).
    """
    # The local is deliberately not called `text_splitter`: the earlier code mixed
    # it up with this function's own name and called `.split_documents` on the
    # function object instead of on the splitter instance.
    splitter = RecursiveCharacterTextSplitter(chunk_size=Chunk_size, chunk_overlap=Overlap)
    splits = splitter.split_documents(Docs)

    # Record each chunk's position so it can be traced back after retrieval.
    for idx, split in enumerate(splits):
        split.metadata['split_id'] = idx

    return splits

In [None]:
def PDFloader(directory_path, all_pages=False):
    """Load the PDF files found directly inside ``directory_path``.

    Args:
        directory_path: Directory whose entries are scanned; only names ending
            in ``.pdf`` are loaded.
        all_pages: When False (the default, matching the previous behaviour)
            only the first document returned by the loader is kept for each
            file. When True every document returned by the loader is kept.

    Returns:
        A flat list of the loaded documents, ordered by file name.
    """
    pdf_data = []
    # Sorted so the result order does not depend on the OS directory listing order.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".pdf"):
            continue
        file_path = os.path.join(directory_path, filename)
        # The imported loader class is `PyMuPDFLoader`; the earlier spelling
        # `PyMubPDFLoader` was an undefined name.
        data = PyMuPDFLoader(file_path).load()
        if not data:
            # Nothing was extracted from this file; skip it rather than index into an empty list.
            continue
        if all_pages:
            pdf_data.extend(data)
        else:
            pdf_data.append(data[0])
    return pdf_data


In [6]:
def HTMLLoader(directory_path):
    """Placeholder for an HTML loader; nothing is loaded yet.

    Args:
        directory_path: Currently unused.

    Returns:
        Always ``None``.
    """
    # TODO: implement HTML loading. `None` must be capitalised — the lowercase
    # `none` used before is an undefined name and raised NameError on every call.
    return None

In [None]:
def VectorDB(Splits, Embeddings, Collection_name, TOP_K):
    """Index document chunks in an in-memory Qdrant collection and build a retriever.

    Args:
        Splits: Documents passed to ``Qdrant.from_documents``.
        Embeddings: Embedding model passed to ``Qdrant.from_documents``.
        Collection_name: Name of the collection; it is recreated on each call
            (``force_recreate=True``).
        TOP_K: Value used for ``k`` in the retriever's ``search_kwargs``.

    Returns:
        A ``(retriever, qdrant_vectorstore)`` tuple.
    """
    # Documents and embeddings are passed positionally. The earlier call placed a
    # positional argument after a keyword argument, which is a SyntaxError, and
    # used `splits=` as the keyword for the documents.
    qdrant_vectorstore = Qdrant.from_documents(
        Splits,
        Embeddings,
        location=":memory:",  # Local mode with in-memory storage only
        collection_name=Collection_name,
        force_recreate=True,
    )
    retriever = qdrant_vectorstore.as_retriever(search_kwargs={"k": TOP_K})

    return retriever, qdrant_vectorstore


In [None]:
def Add_documents_to_VectorDB(Splits, Qdrant_vectorstore):
    """Add ``Splits`` to an existing vector store.

    Calls ``Qdrant_vectorstore.add_documents(documents=Splits)``. Whatever that
    call returns is discarded, so this function returns ``None``.
    """
    add_to_store = Qdrant_vectorstore.add_documents
    add_to_store(documents=Splits)