In [None]:
%pip install --quiet boto3==1.28.57 botocore==1.31.57 langchain==0.0.305

In [None]:
%pip install --quiet faiss-cpu pypdf pinecone-client apache-beam datasets tiktoken

In [3]:
import boto3
import json
import os

In [6]:
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock
from langchain.vectorstores import Chroma, Pinecone
import pinecone
from tqdm.autonotebook import tqdm
from langchain.embeddings.openai import OpenAIEmbeddings
import numpy as np

In [7]:
# Bedrock runtime client pinned to us-east-1; the LLM and embedding wrappers
# created later in the notebook both reuse this one client.
bedrock_runtime = boto3.client("bedrock-runtime", region_name="us-east-1")

In [8]:
# bedrock = boto3.client(
#     service_name = "bedrock",
#     region_name = "us-east-1"
# )

In [None]:
#bedrock.list_foundation_models()

In [9]:
# Bedrock model id used for text generation. The AI21 id is kept as a
# commented-out alternative.
# modelId = "ai21.j2-ultra-v1"
modelId = "anthropic.claude-v2"

# MIME types, presumably intended for direct invoke_model calls; they are not
# referenced by the visible cells (TODO confirm they are still needed).
contentType = accept = "application/json"

In [10]:
from urllib.request import urlretrieve

# Download the source PDFs (ITR-1 and ITR-2 instruction files, going by the
# URL names) into ./data so the directory loader can pick them up.
os.makedirs("data", exist_ok=True)
files = [
    "https://incometaxindia.gov.in/Supporting%20Files/ITR2021/Instructions_ITR1_AY2021_22.pdf",
    "https://incometaxindia.gov.in/Supporting%20Files/ITR2021/Instructions_ITR2_AY2021_22.pdf"
]
for url in files:
    # Local file name is the last path segment of the URL.
    file_path = os.path.join("data", url.rpartition("/")[2])
    if os.path.exists(file_path):
        # Already fetched by an earlier run; skip the network round-trip so
        # Restart & Run All stays cheap.
        continue
    # Download under a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated PDF that a later run would skip.
    partial_path = file_path + ".part"
    urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)

In [11]:
import numpy as np
from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Load every PDF found under ./data/.
loader = PyPDFDirectoryLoader("./data/")
documents = loader.load()

# Split the loaded documents recursively into chunks of at most 2000
# characters, with no overlap between consecutive chunks.
splitter_options = dict(chunk_size=2000, chunk_overlap=0)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
docs = text_splitter.split_documents(documents)

In [12]:
# Pinecone credentials are read from the environment. setdefault keeps any
# value already exported in the shell; the placeholders are only used when
# nothing is set, so real credentials are never overwritten by this cell and
# never need to be typed into the notebook.
os.environ.setdefault("PINECONE_API_KEY", "<YOUR_PINECONE_API_KEY>")
os.environ.setdefault("PINECONE_API_ENV", "<YOUR_PINECONE_ENV>")

In [14]:
# Initialise the Pinecone client from the two environment variables
# PINECONE_API_KEY and PINECONE_API_ENV.
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pinecone_environment = os.getenv('PINECONE_API_ENV')
pinecone.init(api_key=pinecone_api_key, environment=pinecone_environment)

# Name of the Pinecone index used by the rest of the notebook.
index_name = "itrsearchdx"

In [15]:
# Text-generation model (id taken from modelId), served through the shared
# Bedrock runtime client.
llm = Bedrock(client=bedrock_runtime, model_id=modelId)

# Embedding wrapper on the same client. No model id is passed, so the class
# default applies.
bedrock_embeddings = BedrockEmbeddings(client=bedrock_runtime)

In [16]:
import time  # used by the readiness poll below; not imported by the visible import cells

# Start from a clean slate: drop any existing index with this name, then
# recreate it. NOTE: this is destructive, so re-running the cell discards
# previously stored vectors.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# dimension must equal the length of the vectors produced by bedrock_embeddings
# (assumed to be 1536 for the default Bedrock embedding model - TODO confirm).
pinecone.create_index(name=index_name, dimension=1536, metric="dotproduct")
# wait for index to finish initialization
while not pinecone.describe_index(index_name).status["ready"]:
    time.sleep(1)

In [17]:
# Embed the text of every chunk with the Bedrock embedding model and store the
# vectors in the Pinecone index. Only page_content is passed along, so the
# chunks' metadata is not stored.
chunk_texts = [chunk.page_content for chunk in docs]
docsearch = Pinecone.from_texts(chunk_texts, bedrock_embeddings, index_name=index_name)

In [19]:
# First question: retrieve the indexed chunks most similar to it.
query = "Who is eligible to use this return form?"

# NOTE: this rebinds `docs`; from here on it holds only the retrieved hits,
# not the full list of chunks produced by the splitter.
docs = docsearch.similarity_search(query=query)

In [20]:
# `chain` is not defined by any earlier visible cell, so this call fails with
# NameError on a fresh kernel. Build a question-answering chain here: the
# "stuff" type places all retrieved documents into a single prompt for the LLM.
# NOTE(review): assumes the original chain used the default prompt - confirm.
from langchain.chains.question_answering import load_qa_chain

chain = load_qa_chain(llm, chain_type="stuff")
chain.run(input_documents = docs, question = query)

" Based on the context provided, ITR-1 (AY 2021-22) is only for resident individuals having income up to Rs 50 lakh and who do not have income from business/profession. ITR-2 (AY 2021-22) does not have any eligibility criteria specified in the provided instructions, so I don't have enough information to determine who is eligible to use that form. The context does not specify who can use ITR-2."

In [23]:
# Second question: same retrieval step, replacing the previous query and hits.
query = "What is 80TTA?"

# `docs` is rebound again to the chunks retrieved for this new query.
docs = docsearch.similarity_search(query=query)

In [24]:
# Answer the current query from the documents just retrieved for it.
chain.run(
    question=query,
    input_documents=docs,
)

' 80TTA is a deduction that can be claimed in respect of interest income from savings accounts. The key points about deduction under section 80TTA are:\n\n- It is available only to individuals and HUFs. Non-individuals like companies, firms, etc cannot claim this deduction. \n\n- The maximum deduction available under 80TTA is Rs 10,000 in a financial year.\n\n- It can be claimed only against interest earned from savings accounts with banks, cooperative banks and post offices. Interest income from fixed deposits, NSCs, etc is not eligible for deduction under 80TTA.\n\n- Senior citizens cannot claim deduction under 80TTA. \n\nSo in summary, 80TTA provides a deduction on interest income earned from savings accounts up to Rs 10,000 per year. It is available only for individuals/HUFs and not senior citizens.'