In [314]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import HuggingFaceHub
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
from langchain import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
import torch
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.schema import Document

In [315]:
class SentenceTransformerEmbeddings:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        embeddings = self.model.encode(texts)
        return embeddings.tolist()

    def embed_query(self, text):
        embedding = self.model.encode(text)
        return embedding.tolist()

    def __call__(self, text):
        if isinstance(text, str):
            return self.embed_query(text)
        elif isinstance(text, list):
            return self.embed_documents(text)
        else:
            raise ValueError("Input must be a string or a list of strings")


In [316]:
def load_and_extract_text(file_paths):
    docs=[]
    for file_path in file_paths:
        loader = UnstructuredFileLoader(file_path)
        elements = loader.load()
        docs.append(elements)
    return docs

In [321]:
# Define the split_docs function
def split_docs(docs, chunk_size=200, chunk_overlap=50):
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap,separator="\n\n")
    split_docs = []
    for doc in docs:
        inside=[]
        for chunk in doc:
            split=splitter.split_text(chunk.page_content)
            print(split)
            inside.append(split)
        split_docs.append(inside)    # inside=Document(page_content=split)
            # split_docs.append(inside)
        
    return split_docs

In [318]:
# Define the directory containing the files
files_dir = "files/"
file_paths = [os.path.join(files_dir, file) for file in os.listdir(files_dir)]
            
docs = load_and_extract_text(file_paths)



In [319]:
docs

[[Document(metadata={'source': 'files/Health_66.pdf'}, page_content='18. Maternity - Code Excl 18\n\ni. Medical treatment expenses traceable to childbirth (including complicated deliveries and caesarean sections incurred during hospitalization) except ectopic pregnancy;\n\nii. Expenses towards miscarriage (unless due to an accident) and lawful medical\n\ntermination of pregnancy during the policy period. SPECIFIC EXCLUSIONS 19. Circumcision (unless necessary for treatment of a disease not excluded under this policy or necessitated due to an accident), Preputioplasty, Frenuloplasty, Preputial Dilatation and Removal of SMEGMA - Code Excl 19 Congenital External Condition / Defects / Anomalies Convalescence, general debility, run-down condition, Nutritional deﬁciency states Code Excl 21 Intentional self-injury\n\n20. 21.\n\nCode Excl 20\n\n22. 23. Injury/disease caused by or arising from or attributable to war, invasion, act of foreign\n\nCode Excl 22\n\n24.\n\nenemy, warlike operations (w

In [322]:
split_doc = split_docs(docs)

Created a chunk of size 460, which is longer than the specified 200
Created a chunk of size 387, which is longer than the specified 200
Created a chunk of size 237, which is longer than the specified 200
Created a chunk of size 667, which is longer than the specified 200
Created a chunk of size 225, which is longer than the specified 200
Created a chunk of size 207, which is longer than the specified 200
Created a chunk of size 238, which is longer than the specified 200
Created a chunk of size 226, which is longer than the specified 200
Created a chunk of size 310, which is longer than the specified 200
Created a chunk of size 262, which is longer than the specified 200
Created a chunk of size 275, which is longer than the specified 200
Created a chunk of size 385, which is longer than the specified 200
Created a chunk of size 383, which is longer than the specified 200
Created a chunk of size 549, which is longer than the specified 200
Created a chunk of size 1520, which is longer th

['18. Maternity - Code Excl 18\n\ni. Medical treatment expenses traceable to childbirth (including complicated deliveries and caesarean sections incurred during hospitalization) except ectopic pregnancy;', 'ii. Expenses towards miscarriage (unless due to an accident) and lawful medical', 'termination of pregnancy during the policy period. SPECIFIC EXCLUSIONS 19. Circumcision (unless necessary for treatment of a disease not excluded under this policy or necessitated due to an accident), Preputioplasty, Frenuloplasty, Preputial Dilatation and Removal of SMEGMA - Code Excl 19 Congenital External Condition / Defects / Anomalies Convalescence, general debility, run-down condition, Nutritional deﬁciency states Code Excl 21 Intentional self-injury', '20. 21.\n\nCode Excl 20\n\n22. 23. Injury/disease caused by or arising from or attributable to war, invasion, act of foreign\n\nCode Excl 22\n\n24.', 'Code Excl 22\n\n24.\n\nenemy, warlike operations (whether war be declared or not) - Code Excl 2

Created a chunk of size 253, which is longer than the specified 200
Created a chunk of size 666, which is longer than the specified 200
Created a chunk of size 717, which is longer than the specified 200
Created a chunk of size 801, which is longer than the specified 200
Created a chunk of size 293, which is longer than the specified 200
Created a chunk of size 293, which is longer than the specified 200
Created a chunk of size 273, which is longer than the specified 200
Created a chunk of size 468, which is longer than the specified 200
Created a chunk of size 247, which is longer than the specified 200
Created a chunk of size 322, which is longer than the specified 200
Created a chunk of size 339, which is longer than the specified 200
Created a chunk of size 374, which is longer than the specified 200
Created a chunk of size 422, which is longer than the specified 200
Created a chunk of size 225, which is longer than the specified 200
Created a chunk of size 464, which is longer tha

['m m 0 0 . 0 1 2', 'Migration (Applicable only for Section 2 and Section 4) The insured person will have the option to migrate the policy to other health insurance products/plans offered by the company by applying for migration of the Policy at least 30 days before the policy renewal date as per IRDAI guidelines on Migration. lf such person is presently covered and has been continuously covered without any lapses under any health insurance product/plan offered by the company, the insured person will get the accrued continuity benefits in waiting periods as per IRDAI guidelines on migration. For Detailed Guidelines on migration, kindly refer the link https://www.irdai.gov.in/ADMINCMS/cms/frmGuidelines_Layout.aspx?page=PageNo3987', 'Portability (Applicable only for Section 2 and Section 4) The insured person will have the option to port the policy to other insurers by applying to such insurer to port the entire policy along with all the members of the family, if any, at least 45 days be

In [324]:
for i in split_doc:
    print(i)

[['18. Maternity - Code Excl 18\n\ni. Medical treatment expenses traceable to childbirth (including complicated deliveries and caesarean sections incurred during hospitalization) except ectopic pregnancy;', 'ii. Expenses towards miscarriage (unless due to an accident) and lawful medical', 'termination of pregnancy during the policy period. SPECIFIC EXCLUSIONS 19. Circumcision (unless necessary for treatment of a disease not excluded under this policy or necessitated due to an accident), Preputioplasty, Frenuloplasty, Preputial Dilatation and Removal of SMEGMA - Code Excl 19 Congenital External Condition / Defects / Anomalies Convalescence, general debility, run-down condition, Nutritional deﬁciency states Code Excl 21 Intentional self-injury', '20. 21.\n\nCode Excl 20\n\n22. 23. Injury/disease caused by or arising from or attributable to war, invasion, act of foreign\n\nCode Excl 22\n\n24.', 'Code Excl 22\n\n24.\n\nenemy, warlike operations (whether war be declared or not) - Code Excl 

In [323]:
split_doc

[[['18. Maternity - Code Excl 18\n\ni. Medical treatment expenses traceable to childbirth (including complicated deliveries and caesarean sections incurred during hospitalization) except ectopic pregnancy;',
   'ii. Expenses towards miscarriage (unless due to an accident) and lawful medical',
   'termination of pregnancy during the policy period. SPECIFIC EXCLUSIONS 19. Circumcision (unless necessary for treatment of a disease not excluded under this policy or necessitated due to an accident), Preputioplasty, Frenuloplasty, Preputial Dilatation and Removal of SMEGMA - Code Excl 19 Congenital External Condition / Defects / Anomalies Convalescence, general debility, run-down condition, Nutritional deﬁciency states Code Excl 21 Intentional self-injury',
   '20. 21.\n\nCode Excl 20\n\n22. 23. Injury/disease caused by or arising from or attributable to war, invasion, act of foreign\n\nCode Excl 22\n\n24.',
   'Code Excl 22\n\n24.\n\nenemy, warlike operations (whether war be declared or not)

In [302]:
for i in split_doc:
    print(i.page_content)

['18. Maternity - Code Excl 18\n\ni. Medical treatment expenses traceable to childbirth (including complicated deliveries and caesarean sections incurred during hospitalization) except ectopic pregnancy;', 'ii. Expenses towards miscarriage (unless due to an accident) and lawful medical', 'termination of pregnancy during the policy period. SPECIFIC EXCLUSIONS 19. Circumcision (unless necessary for treatment of a disease not excluded under this policy or necessitated due to an accident), Preputioplasty, Frenuloplasty, Preputial Dilatation and Removal of SMEGMA - Code Excl 19 Congenital External Condition / Defects / Anomalies Convalescence, general debility, run-down condition, Nutritional deﬁciency states Code Excl 21 Intentional self-injury', '20. 21.\n\nCode Excl 20\n\n22. 23. Injury/disease caused by or arising from or attributable to war, invasion, act of foreign\n\nCode Excl 22\n\n24.', 'Code Excl 22\n\n24.\n\nenemy, warlike operations (whether war be declared or not) - Code Excl 2

In [328]:
embeddings = SentenceTransformerEmbeddings()



In [333]:
# Flatten the nested list and ensure all items are strings
flattened_doc = [str(item) for sublist in split_doc for item in sublist]

# Create Document objects from the flattened list
documents = [Document(page_content=content) for content in flattened_doc]


In [334]:
documents

[Document(page_content='[\'18. Maternity - Code Excl 18\\n\\ni. Medical treatment expenses traceable to childbirth (including complicated deliveries and caesarean sections incurred during hospitalization) except ectopic pregnancy;\', \'ii. Expenses towards miscarriage (unless due to an accident) and lawful medical\', \'termination of pregnancy during the policy period. SPECIFIC EXCLUSIONS 19. Circumcision (unless necessary for treatment of a disease not excluded under this policy or necessitated due to an accident), Preputioplasty, Frenuloplasty, Preputial Dilatation and Removal of SMEGMA - Code Excl 19 Congenital External Condition / Defects / Anomalies Convalescence, general debility, run-down condition, Nutritional deﬁciency states Code Excl 21 Intentional self-injury\', \'20. 21.\\n\\nCode Excl 20\\n\\n22. 23. Injury/disease caused by or arising from or attributable to war, invasion, act of foreign\\n\\nCode Excl 22\\n\\n24.\', \'Code Excl 22\\n\\n24.\\n\\nenemy, warlike operations

In [335]:
docsearch = FAISS.from_documents(documents,embeddings)
docsearch.save_local("faiss_index")

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [375]:
task = "text-generation"
repo_id="google-bert/bert-base-uncased"
# Define model kwargs
model_kwargs = {
    "temperature": 0.5,
        "top_k": 100,
        "max_new_tokens": 30000,
        "num_return_sequences": 1,
        "do_sample": True
}

# Instantiate the HuggingFaceHub client
llm = HuggingFaceHub(
    repo_id=repo_id,
    task=task,
    model_kwargs=model_kwargs,
    huggingfacehub_api_token="hhf_uYPOYJLkkheLxZXbeNBXIKidzwgTRTBCju"
)

In [365]:
prompt_template = PromptTemplate(
    template="""
        Your task is to provide answers from the context provided
        Don't use external knowledge

        Context: {context}

        Question: {question}

        Answer:
    """,
    input_variables=['context', 'question']
)

In [366]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 50})

In [367]:
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt_template
    | llm
    | StrOutputParser(type='json')
)

In [368]:
def invoke(user_input):
    result = rag_chain.invoke(user_input)
    return result.strip()


In [369]:
user_query = input("Ask me anything: ")
result = invoke(user_query)
print(result)

HfHubHTTPError: 500 Server Error: Internal Server Error for url: https://api-inference.huggingface.co/models/EleutherAI/gpt-neo-2.7B (Request ID: 2y_XypJ_VE8yCGThZM2ke)

In [259]:
loader = UnstructuredFileLoader("new.txt")
elements = loader.load()

In [260]:
elements

[Document(metadata={'source': 'new.txt'}, page_content="6.What is meant by python numbers?\n\nNumber data types store numeric values. Number objects are created when\n\nyou assign a value to them.\n\nPython supports four different numerical types :\n\nint (signed integers)\n\nlong (long integers, they can also be represented in octal and\n\nhexadecimal)\n\nfloat (floating point real values)\n\ncomplex (complex numbers)\n\n7.What are python strings?\n\nStrings in Python are identified as a contiguous set of characters represented in\n\nthe quotation marks. Python allows for either pairs of single or double quotes. Subsets of\n\nstrings can be taken using the slice operator ([ ] and [:] ) with indexes starting at 0 in the\n\nbeginning of the string and working their way from -1 at the end.\n\nThe plus (+) sign is the string concatenation operator and the asterisk (*) is the\n\nrepetition operator.\n\nstr = 'Hello World!'\n\nprint str[0] # Prints first character of\n\nthe string o/p:\n\nH

In [262]:
# Assuming elements is a list of Document objects
for document in elements:
    page_content = document.page_content
    print(page_content)


6.What is meant by python numbers?

Number data types store numeric values. Number objects are created when

you assign a value to them.

Python supports four different numerical types :

int (signed integers)

long (long integers, they can also be represented in octal and

hexadecimal)

float (floating point real values)

complex (complex numbers)

7.What are python strings?

Strings in Python are identified as a contiguous set of characters represented in

the quotation marks. Python allows for either pairs of single or double quotes. Subsets of

strings can be taken using the slice operator ([ ] and [:] ) with indexes starting at 0 in the

beginning of the string and working their way from -1 at the end.

The plus (+) sign is the string concatenation operator and the asterisk (*) is the

repetition operator.

str = 'Hello World!'

print str[0] # Prints first character of

the string o/p:

H
