In [19]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.output_parsers import StrOutputParser
import os
import csv
from dotenv import load_dotenv

In [3]:
# Load the loan application PDF from the local documents/ folder.
loader = PyPDFLoader(
    "documents/PERSONAL LOAN APPLICATION FORM.pdf"
)
document = loader.load()

In [4]:
# Break the loaded document into small, slightly overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=20,
)
splits = splitter.split_documents(document)

In [5]:
# load embedding model
# `encode_kwargs` has to exist before the model is constructed and must be
# passed to it; previously it was created afterwards and never used, so the
# normalize_embeddings setting had no effect.
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    encode_kwargs=encode_kwargs,
)

  from tqdm.autonotebook import tqdm, trange


In [6]:
# Build a FAISS index from the document chunks using the embedding model,
# then persist it locally under 'vectorstore.db'.
vectorstore = FAISS.from_documents(
    splits,
    embeddings,
)
vectorstore.save_local('vectorstore.db')

In [7]:
# Expose the vector store as a retriever, using its default search settings.
retriever = vectorstore.as_retriever()

In [10]:
# load the llm model
# Read variables from a local .env file into the process environment first.
# Without this call the lookup below only succeeds when the variable was
# exported by hand before the kernel started.
load_dotenv()

# Keep the original lower-case name, but also accept the conventional
# upper-case spelling so either form of the variable works.
groq_api_key = os.getenv('groq_api_key') or os.getenv('GROQ_API_KEY')
if not groq_api_key:
    # Fail here with an actionable message instead of a later client error.
    raise RuntimeError(
        "Groq API key not found: set 'groq_api_key' (or 'GROQ_API_KEY') "
        "in the environment or in a .env file."
    )

llm = ChatGroq(model='Gemma2-9b-It', groq_api_key=groq_api_key)

In [14]:
# Build the extraction prompt.
# The template lists the loan fields the model should look for and has two
# placeholders: {context} (expected to receive the retrieved document text
# from the documents chain) and {input} (the question passed to .invoke()).
# NOTE(review): the template does not prescribe an output format, so the
# model's answer is free-form text rather than structured data.
prompt = ChatPromptTemplate.from_template('''
You are tasked with extracting key information related to loan applications from documents for automated loan processing. The document contains data in various formats, and your goal is to locate and extract the following fields accurately:

Loan ID
Number of Dependents
Education
Gender
Marital Status
Self-Employed (Yes/No)
Annual Income
Loan Amount
Loan Term (in months)
CIBIL Score
Residential Assets
Commercial Assets
Luxury Assets
Bank Assets
                        
<context>
{context}
</context>

Question: {input}               
''')

In [21]:
# Assemble the pipeline: a documents chain built from the LLM and prompt,
# wrapped in a retrieval chain that uses the retriever. The string output
# parser is created here for use on the answer afterwards.
doc_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, doc_chain)
output_parser = StrOutputParser()

In [23]:
# Run the retrieval chain on the extraction request and display the answer
# text as the cell output.
response = retrieval_chain.invoke(
    {"input": "Extract the relevant fields for loan processing"}
)
answer = output_parser.parse(response['answer'])
answer

"Unfortunately, the provided text snippet does not contain any information related to the loan application fields you listed (Loan ID, Number of Dependents, Education, Gender, Marital Status, Self-Employed, Annual Income, Loan Amount, Loan Term, CIBIL Score, Residential Assets, Commercial Assets, Luxury Assets, Bank Assets). \n\nThe text seems to be part of a document related to identification or registration, mentioning GSTIN, Driving License No., and Passport No.  \n\nTo extract the loan application fields, you would need the actual content of the loan application document. \n\n\nLet me know if you have the complete loan application document, and I'll do my best to help you extract the required information. \n"

In [None]:

def save_to_csv(data, filename='loan_data.csv'):
    """Write a single record to a CSV file as a header row plus one data row.

    Args:
        data: Mapping of column name to value; its keys become the CSV header
            and its values the single data row.
        filename: Destination path. The file is opened in write mode, so any
            existing content is replaced.

    The file is always written as UTF-8 so that non-ASCII values (for example
    currency symbols or accented names) do not depend on the platform's
    default encoding.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(data.keys()))
        writer.writeheader()
        writer.writerow(data)

def process_loan_document(document_text, filename='loan_data.csv'):
    """Extract loan fields from document text and save them as a one-row CSV.

    Args:
        document_text: Text handed to `extract_loan_data`.
        filename: Destination CSV path, forwarded to `save_to_csv`. Defaults
            to 'loan_data.csv', matching the previous hard-coded behaviour.

    NOTE(review): `extract_loan_data` is not defined in the visible notebook.
    It must exist before this function is called, otherwise a NameError is
    raised; it is presumably expected to return a dict of field name to
    value, since the result is passed to `save_to_csv` — confirm.
    """
    extracted_data = extract_loan_data(document_text)
    save_to_csv(extracted_data, filename)
    # Report the path actually written instead of a hard-coded file name.
    print(f"Data has been extracted and saved to {filename}")
    