# Step 0
## Setup & Installation

In [4]:
# extracting information from PDF files
from langchain_community.document_loaders import PyPDFLoader

# splitting & embedding
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

# vector db
from langchain.vectorstores import FAISS

# filesystem access (os) and regular expressions (re)
import os
import re


## Step 1: 
### Extract text from textbooks

In [6]:

pdf_folder_path = "content/"

# Build one PyPDFLoader per PDF found in the folder.
# os.listdir() returns entries in arbitrary order, so sort them: the order of
# the loaders decides the order of the pages (and therefore of the chunks built
# from them), and it should be the same on every run and platform.
loaders = []
for file in sorted(os.listdir(pdf_folder_path)):
    # Compare case-insensitively so files such as "BOOK.PDF" are not skipped.
    if file.lower().endswith(".pdf"):
        print("Reading file: ", file)
        loaders.append(PyPDFLoader(os.path.join(pdf_folder_path, file)))

def load_pdf(loaders):
    """Load every loader and gather the results in one flat list.

    Args:
        loaders: iterable of loader objects exposing a ``file_path`` attribute
            and a ``load()`` method that returns a list of documents.

    Returns:
        A single list containing the documents of all loaders, in loader order.
    """
    pages = []
    for pdf_loader in loaders:
        # Progress message so long conversions are visible in the cell output.
        print("Converting file: ", pdf_loader.file_path)
        pages += pdf_loader.load()
    return pages

def convert_to_text(documents, min_length=20, separator=""):
    """Concatenate the text of all sufficiently long pages.

    Args:
        documents: iterable of objects exposing a ``page_content`` string.
        min_length: a page is kept only when its text is strictly longer than
            this many characters (default 20, the previously hard-coded value).
        separator: string placed between consecutive kept pages. The default
            ``""`` reproduces the original behaviour of joining pages directly.

    Returns:
        One string made of the kept pages' text, in input order. An empty
        string when no page qualifies.
    """
    # str.join builds the result in one pass instead of repeated `+=` copies.
    return separator.join(
        document.page_content
        for document in documents
        if len(document.page_content) > min_length
    )
    


# Load every page of every PDF, then flatten the pages into one string.
full_documents = load_pdf(loaders)
print("The total page of the textbooks are : ", len(full_documents))
full_text = convert_to_text(full_documents)
# len() of a str counts characters, not words, so the label says so.
print("The total number of characters are : ", len(full_text))




Reading file:  5G_mobile_communication_concepts.pdf
Reading file:  5G_wireless.pdf
Converting file:  content/5G_mobile_communication_concepts.pdf
Converting file:  content/5G_wireless.pdf
The total page of the textbooks are :  939
The total number of words are :  2415197


# Step 2:
## Preprocessing, clean text

In [7]:
def remove_extra_spaces(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        text: the string to normalise.

    Returns:
        ``text`` with each whitespace run (spaces, tabs, newlines, ...)
        replaced by a single space and no leading or trailing whitespace.
    """
    # Newlines are whitespace too, so this single substitution also flattens
    # blank lines; no separate blank-line pass is needed before it.
    return re.sub(r'\s+', ' ', text).strip()

def clean_text(text):
    """Normalise the whitespace of each line while keeping the line structure.

    The text is split on "\\n", every line is passed through
    ``remove_extra_spaces`` and the lines are joined with "\\n" again, so the
    number of lines is unchanged.

    Args:
        text: the raw text to clean.

    Returns:
        The cleaned text.
    """
    return "\n".join(remove_extra_spaces(raw_line) for raw_line in text.split("\n"))

clean_texted_text = clean_text(full_text)
# len() of a str counts characters, not words, so the label says so.
print("The total number of characters after cleaning are : ", len(clean_texted_text))

# Persist the cleaned corpus so later steps can reload it from disk.
file_name = "cleaned_text.txt"
with open(file_name, 'w', encoding='utf-8') as f:
    f.write(clean_texted_text)


    
    

The total number of words after cleaning are :  2405960


## Step 3:
### Splitting texts into chunks

In [10]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Reload the cleaned corpus written to disk by the preprocessing step.
with open('cleaned_text.txt', 'r', encoding='utf-8') as f:
    clean_texted_text = f.read()

# Cut the corpus into overlapping chunks (512 characters, 100 shared with the
# neighbouring chunk) so each piece is small enough to embed and retrieve.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
chunks = text_splitter.split_text(clean_texted_text)
print("The total number of chunks are : ", len(chunks))

# Embed every chunk and index the vectors with FAISS.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_db = FAISS.from_texts(texts=chunks, embedding=embeddings)

# Store the index on disk under ./faiss_index.
vector_db.save_local("faiss_index")
print("The vector db is saved in the faiss_index folder")


The total number of chunks are :  6061
The vector db is saved in the faiss_index folder


## Step 4:
### Query Processing & Retrieval

In [11]:
def retrive_context(query, k=3, score_threshold=0.8):
    """Look up the chunks in the global ``vector_db`` that best match ``query``.

    Args:
        query: the user's question.
        k: maximum number of results requested from the vector store.
        score_threshold: forwarded unchanged to ``similarity_search``.
            NOTE(review): with a distance-based FAISS index this presumably
            acts as a maximum distance rather than a minimum similarity --
            confirm against the vector store documentation.

    Returns:
        Whatever ``vector_db.similarity_search`` returns for these arguments.
    """
    return vector_db.similarity_search(query, k=k, score_threshold=score_threshold)



## Step 5:
### Generating an answer with the LLM

In [13]:


from langchain_ollama.llms import OllamaLLM

# Shared LLM handle used by the answer functions below; targets the Ollama
# model named 'llama3'.
# NOTE(review): presumably needs a reachable Ollama server with that model
# already pulled -- confirm before running.
llm = OllamaLLM(model='llama3')

def answer_question(question, context, llm):
    """Ask the LLM to answer ``question`` using only the retrieved ``context``.

    Args:
        question: the user's question, inserted verbatim into the prompt.
        context: iterable of objects exposing a ``page_content`` string.
        llm: object exposing ``invoke(prompt)`` that returns a string.

    Returns:
        The model's reply with surrounding whitespace removed, or the fixed
        string "Error processing the request." when the call (or the
        post-processing of its result) raises.
    """
    # Merge the retrieved passages into one newline-separated block.
    formatted_context = "\n".join(doc.page_content for doc in context)

    # Instruction prompt sent verbatim to the model.
    prompt = f"""
    You are an expert research assistant specializing in answering questions about research papers.

    Task: Answer the question based on the provided context, with detail explaination and reasoning.

    Instructions:
    * Be concise and accurate.
    * If the context does not contain the answer, say EXACTLY "I cannot answer confidently"
    * If the question is unrelated to the context, say EXACTLY "NA"
    * If the question asks for a yes/no answer, provide it and explain your reasoning shortly.

    Context:
    {formatted_context}

    Question:
    {question}

    Answer:
    """

    # Best effort: any failure is logged and turned into a fixed message so
    # the caller always receives a string.
    try:
        raw_reply = llm.invoke(prompt)
        return raw_reply.strip()
    except Exception as exc:
        print(f"Error during LLM call: {exc}")
        return "Error processing the request."

def response(query, k=10, score_threshold=0.8):
    """Retrieve context for ``query`` and have the LLM answer from it.

    Args:
        query: the user's question.
        k: forwarded to ``retrive_context``.
        score_threshold: forwarded to ``retrive_context``.

    Returns:
        The string returned by ``answer_question``, or
        "No relevant context found." when retrieval yields nothing.
    """
    retrieved_context = retrive_context(query, k=k, score_threshold=score_threshold)
    if retrieved_context:
        # Answer the question using the shared module-level LLM.
        return answer_question(query, retrieved_context, llm)
    return "No relevant context found."


# while True:
#     query = input("Enter your question:")
#     if query.upper() == ("QUIT") or query.upper() == ("EXIT"):
#         print("Exiting the program")
#         break
#     if query == "":
#         print("Please enter a valid question")
#         continue
#     answer = response(query, k=10, score_threshold=0.8)
#     print("The answer is : ", answer)
#     print("\n", "type 'quit' or 'exit' to stop the program")

## Step 6:
#### Creating a UI for ease of use

In [None]:
import tkinter as tk
from tkinter import scrolledtext
from threading import Thread


def generate_and_display(user_input):
    """Produce an answer for ``user_input`` and schedule it for display.

    Started by ``send_message`` in a separate thread so the window stays
    responsive while the answer is being generated.

    Args:
        user_input: the question typed by the user.
    """
    try:
        reply = generate_answer(user_input)
    except Exception as exc:
        # Without this, a failure (e.g. during retrieval) would end the thread
        # and the user would never get a reply. Log it and show the same
        # message answer_question uses for LLM failures.
        print(f"Error while generating answer: {exc}")
        reply = "Error processing the request."

    # Hand the widget update to the Tk event loop via after() rather than
    # touching the widgets from this thread.
    root.after(0, display_response, reply)

def display_response(response):
    """Append the assistant's reply to the chat log and scroll to the end.

    Args:
        response: the text to show after the "AI: " prefix.
    """
    bot_line = f"AI: {response}\n\n"
    # Unlock the text widget only for the duration of the insert.
    chat_log.config(state=tk.NORMAL)
    chat_log.insert(tk.END, bot_line, "bot")
    chat_log.config(state=tk.DISABLED)
    chat_log.see(tk.END)

def generate_answer(query, k=10, score_threshold=0.8):
    """Return the chatbot's answer to ``query``.

    Thin wrapper around ``response`` so the retrieval-then-answer logic lives
    in one place instead of being duplicated for the UI.

    Args:
        query: the user's question.
        k: forwarded to ``response``.
        score_threshold: forwarded to ``response``.

    Returns:
        The string returned by ``response`` for these arguments.
    """
    return response(query, k=k, score_threshold=score_threshold)


def _append_to_chat_log(text, tag):
    """Insert ``text`` at the end of the chat log using the style ``tag``."""
    # Unlock the text widget only for the duration of the insert.
    chat_log.config(state=tk.NORMAL)
    chat_log.insert(tk.END, text, tag)
    chat_log.config(state=tk.DISABLED)
    chat_log.see(tk.END)


def send_message():
    """Handle the Send button / Return key.

    Blank input is ignored. "exit" or "quit" (any case, surrounding whitespace
    ignored) shows a goodbye message and closes the window after three
    seconds. Anything else is echoed to the chat log and answered in a
    background thread.
    """
    # Strip once so "quit " is recognised as a command and the echoed text has
    # no stray whitespace.
    user_input = entry.get().strip()
    if not user_input:
        return

    if user_input.lower() in ("exit", "quit"):
        _append_to_chat_log(f"You: {user_input}\n", "user")
        _append_to_chat_log("AI: Goodbye! 👋\n\n", "bot")
        root.after(3000, root.destroy)  # delay to let user see the message
        return

    _append_to_chat_log("You: " + user_input + "\n", "user")
    entry.delete(0, tk.END)

    # daemon=True: a request still in flight must not keep the process alive
    # once the window has been closed.
    Thread(target=generate_and_display, args=(user_input,), daemon=True).start()

# UI setup
root = tk.Tk()
root.title("RAG Chatbot")

# Frame for scrollbar and chat log
frame = tk.Frame(root)
frame.pack(padx=10, pady=10, fill=tk.BOTH, expand=True)

scrollbar = tk.Scrollbar(frame)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

chat_log = tk.Text(
    frame, wrap=tk.WORD, state=tk.DISABLED, width=60, height=20, yscrollcommand=scrollbar.set
)
chat_log.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
scrollbar.config(command=chat_log.yview)

# Define tag styles
chat_log.tag_config("user", foreground="blue", font=("Arial", 11, "bold"))
chat_log.tag_config("bot", foreground="green", font=("Arial", 11))

entry = tk.Entry(root, width=50)
entry.pack(side=tk.LEFT, padx=(10, 0), pady=(0, 10))
entry.bind("<Return>", lambda event: send_message())

send_button = tk.Button(root, text="Send", command=send_message)
send_button.pack(side=tk.RIGHT, padx=(0, 10), pady=(0, 10))

root.mainloop()
