<a href="https://colab.research.google.com/github/darshita27-cmd/RAG-Q-A-chatbot/blob/main/mymodel_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd # for data manipulation, data cleaning
from langchain.document_loaders import DataFrameLoader # this loads the dataset in the format for processing in NLP
from langchain.text_splitter import RecursiveCharacterTextSplitter # this splits the text into smaller chunks based on criteria( such as word cound, paragraph boundaries ) for further NLP tasks
from langchain.vectorstores import Chroma # for vector embedding. chroma is to implement the vector embedding
from langchain.embeddings import HuggingFaceEmbeddings # for numerical represention in vector in graphs
from langchain.llms import HuggingFacePipeline # for text summerization,generation,translation
from langchain.chains import RetrievalQA # for retival of answers and questions. generates answers based on data
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline # automodelforcasualllm is to generate text, autotokenizer coverts text to format models can understand
import torch # provides tools for tensor computing
def load_dataset(file_path):
    df = pd.read_csv(file_path)
    df = df.fillna('Unknown')  # Replace NaN with 'Unknown'
    df['text'] = df.apply(lambda row: f"Loan ID: {row['Loan_ID']}, Gender: {row['Gender']}, "f"Married: {row['Married']}, Dependents: {row['Dependents']}, "f"Education: {row['Education']}, Self_Employed: {row['Self_Employed']}, "f"Applicant Income: {row['ApplicantIncome']}, "f"Coapplicant Income: {row['CoapplicantIncome']}, "f"Loan Amount: {row['LoanAmount']}, "f"Loan Term: {row['Loan_Amount_Term']}, "f"Credit History: {row['Credit_History']}, ""Property Area: {row['Property_Area']}, "f"Loan Status: {row['Loan_Status']}", axis=1)
    return df
def prepare_documents(df):
    loader = DataFrameLoader(df, page_content_column="text")
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50) # distributing the the dataset with 500 characters and chuck_oberlap is.. for ex if 0 to 500 are in first chunk that 450 to 950 will be in the next chunk. helps in sequence
    docs = text_splitter.split_documents(documents)
    return docs
def setup_retriever(docs):
    embeddings = HuggingFaceEmbeddings(model_name="distilbert-base-uncased")
    vectorstore = Chroma.from_documents(docs, embeddings)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    return retriever
def setup_llm():
    model_name = "distilgpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=100)
    llm = HuggingFacePipeline(pipeline=pipe)
    return llm
def create_rag_chain(retriever, llm):
    qa_chain = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff",retriever=retriever,return_source_documents=True)
    return qa_chain
def answer_query(query, qa_chain):
    result = qa_chain({"query": query})
    answer = result["result"]
    sources = [doc.page_content for doc in result["source_documents"]]
    return answer, sources
def chatbot(file_path):
    df = load_dataset(file_path)
    docs = prepare_documents(df)
    retriever = setup_retriever(docs)
    llm = setup_llm()
    qa_chain = create_rag_chain(retriever, llm)
    print("\nChatbot is ready! Type your question (or 'quit' to exit):")
    while True:
        query = input("> ")
        if query.lower() == 'quit':
            print("Exiting chatbot...")
            break
        print("Processing...")
        answer, sources = answer_query(query, qa_chain)
        print("\nAnswer:", answer)
        print("\nSources:")
        for i, source in enumerate(sources, 1):
            print(f"{i}. {source}")
        print("\nAsk another question (or type 'quit' to exit):")
if __name__ == "__main__":
    # Update this path after uploading the CSV to Colab
    file_path = "/content/Training Dataset.csv"
    chatbot(file_path)


Device set to use cpu



Chatbot is ready! Type your question (or 'quit' to exit):
> How many loans were approved for self-employed applicants?


  result = qa_chain({"query": query})
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Processing...

Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Loan ID: LP001245, Gender: Male, Married: Yes, Dependents: 2, Education: Not Graduate, Self_Employed: Yes, Applicant Income: 1875, Coapplicant Income: 1875.0, Loan Amount: 97.0, Loan Term: 360.0, Credit History: 1.0, Property Area: {row['Property_Area']}, Loan Status: Y

Loan ID: LP001245, Gender: Male, Married: Yes, Dependents: 2, Education: Not Graduate, Self_Employed: Yes, Applicant Income: 1875, Coapplicant Income: 1875.0, Loan Amount: 97.0, Loan Term: 360.0, Credit History: 1.0, Property Area: {row['Property_Area']}, Loan Status: Y

Loan ID: LP001245, Gender: Male, Married: Yes, Dependents: 2, Education: Not Graduate, Self_Employed: Yes, Applicant Income: 1875, Coapplicant Income: 1875.0, Loan Amount: 97.0, Loan Term: 360.0, Credit History: 1.0, Property Area: {row['Property_Area']}, Loan Status: 