In [1]:
from dotenv import load_dotenv
# Load settings from a local .env file before anything else runs.
# NOTE(review): presumably this supplies the Hugging Face Hub API token needed by the
# HuggingFaceHub LLM created later in the notebook — confirm the expected variable name.
load_dotenv()

True

In [2]:
import os
import torch
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import  HuggingFaceInstructEmbeddings,HuggingFaceHubEmbeddings,HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

class Create_embeddings_from_pdf_files:
    """Build, persist and reload FAISS vector indexes for a folder of PDF files.

    One FAISS index is saved per PDF, named after the PDF file without its
    extension, so PDFs that already have an index are skipped on later runs.
    """

    # Embedding model shared by all instances; created once by the first __init__ call.
    HFIembeddings=None

    def __init__(self) -> None:
        """Create the shared embedding model if it has not been created yet.

        The model runs on "cuda:0" when ``torch.cuda.is_available()`` is true,
        otherwise on the CPU.
        """
        DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
        if Create_embeddings_from_pdf_files.HFIembeddings is None:
            Create_embeddings_from_pdf_files.HFIembeddings = HuggingFaceInstructEmbeddings(model_name="thenlper/gte-small",cache_folder="./Models/", model_kwargs={"device": DEVICE})

    @staticmethod
    def _index_name(path: str) -> str:
        """Return the file name of ``path`` with only its final extension removed."""
        # splitext keeps interior dots ("act.v2.pdf" -> "act.v2"); the previous
        # split(".")[0] collapsed such names to "act" and made distinct files collide.
        return os.path.splitext(os.path.basename(path))[0]

    @staticmethod
    def _paths_with_suffix(folder_path: str, suffix: str) -> list:
        """Return the paths of the entries in ``folder_path`` whose name ends with ``suffix``."""
        # Sorted so that processing and merge order do not depend on os.listdir ordering.
        return [
            os.path.join(folder_path, file_name)
            for file_name in sorted(os.listdir(folder_path))
            if file_name.endswith(suffix)
        ]

    def split_docs(self,pdf_file_path,chunk_size=1000,chunk_overlap=20):
        """Load one PDF and split it into chunks.

        Args:
            pdf_file_path: Path of the PDF file to load.
            chunk_size: ``chunk_size`` given to RecursiveCharacterTextSplitter.
            chunk_overlap: ``chunk_overlap`` given to RecursiveCharacterTextSplitter.

        Returns:
            The documents produced by ``split_documents``.
        """
        # NOTE(review): load_and_split() appears to apply the loader's own default
        # splitter before the splitter configured here runs — confirm whether plain
        # load() was intended.
        pages = PyPDFLoader(pdf_file_path).load_and_split()
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return splitter.split_documents(pages)

    def create_vectores_and_store_locally(self,vector_store_folder_name:str,folder_path:str):
        """Create and save a FAISS index for every PDF that does not have one yet.

        Args:
            vector_store_folder_name: Folder holding the saved indexes; created
                if it does not exist.
            folder_path: Folder scanned for files ending in ".pdf".
        """
        # A missing store folder just means "nothing indexed yet"; without this the
        # first run failed in os.listdir before any index could be written.
        os.makedirs(vector_store_folder_name, exist_ok=True)
        existing_indexes = {
            self._index_name(index_path)
            for index_path in self._paths_with_suffix(vector_store_folder_name, ".faiss")
        }
        for file in self._paths_with_suffix(folder_path, ".pdf"):
            index_name = self._index_name(file)
            if index_name not in existing_indexes:
                FAISS.from_documents(self.split_docs(file),embedding=Create_embeddings_from_pdf_files.HFIembeddings).save_local(vector_store_folder_name,index_name)
                print("vectore file created for: ",file)
            else:
                print("vectore file already exist for ",file)

    def load_vectore_stores(self,folder_path:str,current_vector_store:"FAISS | None"=None):
        """Load every ".faiss" index in ``folder_path`` and merge them into one store.

        Args:
            folder_path: Folder containing the saved indexes.
            current_vector_store: Optional existing store; when given, the loaded
                indexes are merged into it in place.

        Returns:
            The merged store, or ``current_vector_store`` unchanged (possibly
            ``None``) when the folder contains no ".faiss" files.
        """
        for index_path in self._paths_with_suffix(folder_path, ".faiss"):
            # Load from the folder that was listed; this used to be hard-coded to
            # "FAISS_db" regardless of the folder_path argument.
            loaded_store = FAISS.load_local(folder_path,Create_embeddings_from_pdf_files.HFIembeddings,self._index_name(index_path))
            if current_vector_store is None:
                current_vector_store = loaded_store
            else:
                current_vector_store.merge_from(loaded_store)
        return current_vector_store
                
   

In [3]:
# Create the helper, then build an index in FAISS_db for each PDF in
# ./DataSourceFiles that does not have one yet.
vector_db = Create_embeddings_from_pdf_files()
vector_db.create_vectores_and_store_locally(
    vector_store_folder_name="FAISS_db",
    folder_path="./DataSourceFiles",
)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512
vectore file already exist for  ./DataSourceFiles\IPC_186045.pdf
vectore file already exist for  ./DataSourceFiles\special_marriage_act.pdf
vectore file already exist for  ./DataSourceFiles\THE_CODE_OF_CIVIL_PROCEDURE_1908.pdf


In [4]:
# Load the saved indexes from FAISS_db and merge them into a single store for retrieval.
vector_store = vector_db.load_vectore_stores(folder_path="FAISS_db")

In [5]:
# from langchain.vectorstores import Chroma
# # from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
# # from langchain.embeddings import  HuggingFaceInstructEmbeddings,HuggingFaceHubEmbeddings,HuggingFaceEmbeddings

# from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
# from sentence_transformers import SentenceTransformer
# # model = SentenceTransformer(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",cache_folder="./Models/")
# sentence_transformer_ef = SentenceTransformerEmbeddingFunction(model_name="D:\Files\LLM\Project\Models\sentence-transformers_all-MiniLM-L6-v2")
# # # HFIembeddings = HuggingFaceHubEmbeddings(repo_id="sentence-transformers/all-mpnet-base-v2")
# # # # embedding_function = SentenceTransformerEmbeddings(model_name="gte-small") #,cache_folder="./Models/")
# # print((sentence_transformer_ef))
# vectorstore = Chroma.from_documents(documents=pages,embedding=sentence_transformer_ef,collection_name="law_collection")

In [6]:
# class QA_on_vctors_of_pdf:
    
from langchain.prompts import PromptTemplate
# Prompt for the question-answering chain: a list of behavioural rules for the
# assistant, followed by two placeholders, {context} and {question}, that are
# filled in when the prompt is formatted.
# NOTE(review): the rule text is sent to the model verbatim, so wording changes
# here change the model's behaviour.
template = """
    - You're a helpful AI assistant assigned to assist individuals seeking legal advice within the framework of Indian laws and the constitution.
    - Your role is to guide users through legal processes and provide information in a lawful manner.
    - Use the given text to answer the question in atleast 1000 words, give all accurate information.
    - Answer questions step by step, highlighting relevant sections of Indian laws and the constitution, use bulleting to display more pretty and readable answer.
    - Refrain from responding to queries that may not contribute to legal affairs, and provide accurate and relevant information without distortion.
    - deny to give answers if its not available into provided text.
{context}

Question: {question} 
Answer:
"""

# input_variables lists the two placeholders used in the template above.
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [7]:

from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
# Hosted Mixtral model that writes the answers.
mixtral_llm = HuggingFaceHub(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    model_kwargs={"temperature":0.8, "max_length":4096,"max_new_tokens":4096},
)

# Similarity search over the merged vector store, 4 results per question.
law_retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs={ 'k': 4 },
)

# Retrieval QA chain combining the LLM, the retriever and the custom prompt.
chain = RetrievalQA.from_chain_type(
    llm=mixtral_llm,
    chain_type="stuff",
    retriever=law_retriever,
    # return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},  # add "verbose": True here to trace the chain
)

In [8]:
# Alternative questions tried during development; uncomment exactly one line to
# replace the active question below. (Two of these used to be live assignments
# that were immediately overwritten.)
# q = "what is the punishment for robbery ?"
# q = "who is judge ?"
# q = "one person has commited defamation, what should be punishment for it?"
# q = "rioting has happened in neighbouring society. who will be responsible and what is the punishment for it"
# q = "what is punishment for robbery and also murder ?"
# q = "what are the rights of tenant and landlord ?"
# q = "Liability of person for whose benefit riot is committed"
# q = "person is selling adulterated drugs which is harmful to the health of people. person has also sold drug to the children below age of 15 what will be punishment for it?"
# q="explain all laws related to merriage"
# q+=" and what is a household work?"

# Active question.
# NOTE(review): this one is off-topic, presumably to check that the prompt's
# refusal rule works — confirm.
q="start with python programming language"

# The chain only reads the "query" key. The former "early_stopping", "min_length"
# and "max_tokens" entries were never forwarded to the LLM, so they are dropped;
# generation limits belong in the LLM's model_kwargs.
response = chain({"query": q})

# Write the question as the heading and the answer as the body of a .docx in ./tmp/.
from write_in_file import generate_docx_with_bullets
generate_docx_with_bullets(heading=q,main_paragraph=response["result"],output_folder="./tmp/")

Document './tmp/2024-02-15_12_48_07.docx' generated successfully.
