In [None]:
!pip install langchain
!pip install langchain_community
!pip install langchain-huggingface==0.1.2
!pip install langchain-pinecone
!pip install unstructured
!pip install "unstructured[pdf]"
!pip install boto3

In [81]:
# Import DirectoryLoader from langchain_community (installed above and already
# used elsewhere in this notebook) rather than the legacy
# `langchain.document_loaders` path.
from langchain_community.document_loaders import DirectoryLoader

# Load every file under /content/ that matches the glob "**/*.pdf".
loader = DirectoryLoader('/content/', glob="**/*.pdf")
data = loader.load()

In [None]:
# Report how many documents the loader returned in `data`.
print(f"You have {len(data)} documents")

In [None]:
# Guard the data[0] lookup: with an empty `data` the cell would otherwise fail
# with a bare IndexError that gives no hint about the cause.
if not data:
    raise ValueError("No documents were loaded; check the loader directory and glob pattern")

# Show the length of the first document's text.
print(f"Document 1 contains {len(data[0].page_content)} characters")

In [84]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks.
# NOTE(review): the sizes are presumably counted in characters (the splitter's
# default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=250,
)
chunks = text_splitter.split_documents(data)

In [None]:
# Preview only the first few chunks; displaying the whole list floods the cell
# output and bloats the saved notebook.
chunks[:5]

In [None]:
# Report how many chunks the text splitter produced.
print(f"You have {len(chunks)} chunks")

In [None]:
# Guard the chunks[0] lookup: if splitting produced nothing, fail with a clear
# message instead of a bare IndexError.
if not chunks:
    raise ValueError("No chunks were produced; check that the loaded documents contain text")

# Show the length of the first chunk's text.
print(f"The first chunk is {len(chunks[0].page_content)} characters long")

In [None]:
import os
from getpass import getpass

from langchain_pinecone import PineconeVectorStore
# Use the langchain_huggingface package (installed above and imported later in
# this notebook) instead of the deprecated langchain_community embeddings class.
from langchain_huggingface import HuggingFaceEmbeddings

# Load the sentence-embedding model (an embedding model, not an LLM).
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Never hardcode the Pinecone API key in the notebook. Read it from the
# environment and prompt for it (without echoing) only when it is not set.
# The key that was previously committed in this cell must be treated as leaked
# and revoked in the Pinecone console.
if not os.environ.get('PINECONE_API_KEY'):
    os.environ['PINECONE_API_KEY'] = getpass("Pinecone API key: ")

# Embed the chunks and upload them to the 'tutorial' index.
# NOTE(review): presumably the index must already exist with a dimension that
# matches the embedding model, and re-running this cell uploads the chunks
# again — confirm both.
PineconeVectorStore.from_documents(chunks, embeddings, index_name='tutorial')

In [88]:
# Handle to the 'tutorial' index, using the same embedding model for queries.
vectorstore = PineconeVectorStore(
    embedding=embeddings,
    index_name='tutorial',
)

In [89]:
# ChatPromptTemplate is imported from langchain_core, consistent with the other
# imports in this cell, instead of the legacy `langchain.prompts` path.
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

# Prompt with two placeholders: {context} receives the retrieved text and
# {question} receives the user's question.
template = '''You are a Human Resource Manager for your Organization.
Use this context to reply to the Question:
{context}

Question: {question}'''

prompt = ChatPromptTemplate.from_template(template)

In [90]:
import os
from getpass import getpass

# Ask for the Hugging Face token only when none is configured. Assigning ""
# unconditionally would wipe out a token already present in the environment,
# and typing the token into the notebook would leak it when the file is shared.
if not os.environ.get("HUGGING_FACE_HUB_TOKEN"):
    os.environ["HUGGING_FACE_HUB_TOKEN"] = getpass("Hugging Face token: ")

In [None]:
import os

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint,ChatHuggingFace

# Take the token from the environment instead of a literal in the notebook.
# An empty string is treated as "not set" so that a blank placeholder is never
# sent as a credential.
hf_token = (
    os.environ.get("HUGGING_FACE_HUB_TOKEN")
    or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    or os.environ.get("HF_TOKEN")
)

# Hosted text-generation endpoint for the Phi-3 mini instruct model.
# The credential is passed as `huggingfacehub_api_token`; `token` is not a
# field of HuggingFaceEndpoint.
llm = HuggingFaceEndpoint(
    repo_id="microsoft/Phi-3-mini-4k-instruct",
    task="text-generation",
    max_new_tokens=512,
    do_sample=False,
    repetition_penalty=1.03,
    huggingfacehub_api_token=hf_token or None,
)

# Chat-model wrapper around the endpoint.
chat = ChatHuggingFace(llm=llm, verbose=True)

In [92]:
# Merge retrieved documents into one context string
def format_docs(docs):
    """Join the `page_content` of every item in `docs`, separated by blank lines.

    Args:
        docs: Iterable of objects exposing a `page_content` string attribute.

    Returns:
        A single string with the contents joined by "\n\n" (empty when `docs`
        yields nothing).
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# RAG chain: retrieve documents for the question, merge them into the prompt's
# {context}, pass the question through unchanged as {question}, then generate
# and parse the answer to a plain string.
# The chat prompt is routed through `chat` (the ChatHuggingFace wrapper built
# around `llm`), which was previously created but never used, instead of the
# raw text-generation endpoint.
# NOTE(review): this presumably lets the instruct model receive properly
# chat-formatted messages — confirm the answers improve as expected.
rag_chain = (
    {"context": vectorstore.as_retriever() | format_docs, "question": RunnablePassthrough()}
    | prompt
    | chat
    | StrOutputParser()
)

In [95]:
# Sample question sent through the RAG chain; the chain's output is kept in `response`.
question = 'How many days of leave can I carry forward?'
response = rag_chain.invoke(question)

In [None]:
# Show the value of `response` as the cell output.
response