# PDF Manual Reader
This is a simple chatbot that can look up a given PDF and answer your questions about it.

### Download and install packages

In [None]:
!pip install PyPDF2 langchain openai python-dotenv
!pip install typing-extensions --upgrade

### Import libraries

In [None]:
from PyPDF2 import PdfReader # for extracting text from pdf
from langchain.text_splitter import CharacterTextSplitter # for splitting text in smaller snippets
import os # for reading environment variables
from dotenv import load_dotenv # for loading environment variables
from openai import OpenAI # openai api
import json # for storing snippets and embeddings in json
from numpy import dot # for matching user questions with snippets.
load_dotenv()

### Set Parameters

In [None]:
EXTRACTED_TEXT_FILE_PATH = "pdf.txt" # file that receives the text extracted from the pdf
EXTRACTED_JSON_PATH = "extracted.json" # file that stores the snippets and their embeddings
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') # read from the environment; add your openai key in a .env file
EMBEDDING_MODEL = "text-embedding-ada-002" # embedding model used for snippets and questions
GPT_MODEL = "gpt-3.5-turbo" # gpt model used. alternatively you can use gpt-4 or other models.
CHUNK_SIZE = 1000 # maximum size of a snippet, measured in characters
CHUNK_OVERLAP = 200 # number of characters shared between consecutive snippets
CONFIDENCE_SCORE = 0.75 # minimum similarity score for a snippet to be used as a search result. [0,1] preferred: 0.75 or greater

### Extract text from pdf
The extracted text is then saved to `pdf.txt` (the value of `EXTRACTED_TEXT_FILE_PATH`).

In [None]:
def extract_text_from_pdf(file_path: str) -> None:
    """Extract the text of every page of a PDF and save it to a text file.

    The text of each page is followed by a newline (for readability), and
    the combined text is written, UTF-8 encoded, to
    ``EXTRACTED_TEXT_FILE_PATH``.

    Args:
        file_path: Path of the PDF file to read.
    """
    reader = PdfReader(file_path)

    # Assemble the text with one join instead of repeated ``+=``.
    # ``or ""`` keeps a page that yields no text from breaking the
    # concatenation; such a page contributes only its trailing newline.
    pdf_text = "".join(
        (page.extract_text() or "") + "\n" for page in reader.pages
    )

    # Write the content to the text file
    with open(EXTRACTED_TEXT_FILE_PATH, "w", encoding="utf-8") as file:
        file.write(pdf_text)

### Turn text into embeddings
Allows the computer to perform mathematical operations and analysis

In [None]:
def create_embeddings(file_path: str) -> None:
    """Split a text file into snippets, embed them, and store both as JSON.

    The text is split on newlines into snippets of at most ``CHUNK_SIZE``
    characters with ``CHUNK_OVERLAP`` characters of overlap. The snippets
    are embedded with ``EMBEDDING_MODEL`` and written, together with their
    embeddings, to ``EXTRACTED_JSON_PATH``.

    This function calls the OpenAI embeddings API, which is billed.

    Args:
        file_path: Path of the UTF-8 text file to embed.

    Raises:
        ValueError: If the file yields no snippets, so nothing can be
            embedded.
    """
    # Initialize a CharacterTextSplitter with specified settings
    text_splitter = CharacterTextSplitter(separator="\n",
                                         chunk_size=CHUNK_SIZE,
                                         chunk_overlap=CHUNK_OVERLAP,
                                         length_function=len)

    # Read the content of the file specified by file_path
    with open(file_path, "r", encoding="utf-8") as file:
        file_text = file.read()

    # Split the text into snippets using the specified settings
    snippets = text_splitter.split_text(file_text)
    if not snippets:
        # Fail early with a clear message instead of sending an empty request.
        raise ValueError(f"No text found in {file_path!r}; nothing to embed.")

    client = OpenAI(api_key=OPENAI_API_KEY)

    # The embeddings endpoint limits how much input one request may carry,
    # so the snippets are sent in batches. Embeddings are collected in
    # snippet order so that embedding_list[i] belongs to snippets[i].
    batch_size = 256
    embedding_list = []
    for start in range(0, len(snippets), batch_size):
        batch = snippets[start:start + batch_size]
        response = client.embeddings.create(input=batch, model=EMBEDDING_MODEL)
        embedding_list.extend(item.embedding for item in response.data)

    # Create a JSON object containing embeddings and snippets
    embedding_json = {
        'embeddings': embedding_list,
        'snippets': snippets
    }

    # Write the formatted JSON to the file specified by EXTRACTED_JSON_PATH
    with open(EXTRACTED_JSON_PATH, 'w', encoding="utf-8") as file:
        json.dump(embedding_json, file, indent=4)

### Read Embedding JSON file
Reads `extracted.json` and prepares the embeddings and snippets for the advisor bot.

In [None]:
def get_embeddings():
    """Load the stored embeddings and snippets from ``EXTRACTED_JSON_PATH``.

    Returns:
        A pair ``(embeddings, snippets)`` taken from the ``'embeddings'``
        and ``'snippets'`` keys of the JSON file.
    """
    # Read with the same explicit encoding the file is written with, so the
    # result does not depend on the platform's default encoding.
    with open(EXTRACTED_JSON_PATH, 'r', encoding="utf-8") as file:
        # Load the JSON data into a Python dictionary
        embedding_json = json.load(file)

    # Return the embeddings and snippets from the loaded JSON
    return embedding_json['embeddings'], embedding_json['snippets']

### Create Embedding from User's Question
Output of this function is used to find the right embedding.

In [None]:
def user_question_embedding_creator(question):
    """Return the embedding of ``question`` produced by ``EMBEDDING_MODEL``.

    A new OpenAI client is created for the call, which sends the question
    to the embeddings API. Any exception raised by the client propagates
    to the caller.
    """
    api_client = OpenAI(api_key=OPENAI_API_KEY)

    # Ask the embeddings endpoint to embed the question
    api_reply = api_client.embeddings.create(
        input=question,
        model=EMBEDDING_MODEL,
    )

    # The embedding of the question is taken from the first returned item
    first_item = api_reply.data[0]
    return first_item.embedding

### Answer user's question
The main event

In [None]:
def answer_users_question(user_question):
    """Answer a question using the most relevant snippets of the PDF.

    The question is embedded and scored against every stored embedding.
    Snippets scoring above ``CONFIDENCE_SCORE`` (at most the best five) are
    passed as search results to ``GPT_MODEL``, whose reply is returned.

    Reads the module-level ``embeddings`` and ``snippets``, which must be
    loaded before this function is called.

    Args:
        user_question: The question text entered by the user.

    Returns:
        The chatbot's answer, or an error message string if an OpenAI API
        call raised an exception.
    """
    try:
        # Create an embedding for the user's question
        user_question_embedding = user_question_embedding_creator(user_question)
    except Exception as e:
        # Handle any exception that occurred while using Embedding API.
        return f"An error occurred while creating embedding: {str(e)}"

    # Score every document embedding by its dot product with the question.
    # NOTE(review): this equals cosine similarity only for unit-length
    # vectors - presumably true for the embedding model in use; confirm.
    similarities = [dot(user_question_embedding, embedding)
                    for embedding in embeddings]

    # Pair snippets with their scores and sort them, best match first
    sorted_snippets = sorted(zip(snippets, similarities),
                             key=lambda pair: pair[1],
                             reverse=True)

    # Keep snippets above the confidence score, limited to the top 5 results
    formatted_top_results = [snippet for snippet, score in sorted_snippets
                             if score > CONFIDENCE_SCORE][:5]

    if not formatted_top_results:
        # With no search results the system prompt below already fixes the
        # reply, so return it directly and skip the billed chat request.
        return "Sorry! I can't help you."

    # System message that tells the model how to use the search results.
    chatbot_system = """You are provided with SEARCH RESULTS from a pdf. You need to generate answer to the user's question based on the given SEARCH RESULTS. SEARCH RESULTS as a python list. SEARCH RESULTS and USER's QUESTION are delimited by ``` \n If there is no information available, or question is irrelevent respond with - "Sorry! I can't help you." """

    # Create the prompt using results and user's question.
    prompt = f"""\
    SEARCH RESULTS:
    ```
    {formatted_top_results}
    ```
    USER'S QUESTION:
    ```
    {user_question}
    ```
    
    """

    # Prepare the chat conversation and use GPT model for generating a response
    messages = [{'role':'system', 'content':chatbot_system},
                {'role':'user', 'content':prompt}]

    try:
        client = OpenAI(api_key=OPENAI_API_KEY)
        completion = client.chat.completions.create(model=GPT_MODEL,
                                             messages=messages,
                                             temperature=0,
                                             stream=False)
    except Exception as e:
        # Handle exception while communicating with ChatCompletion API
        return f"An error occurred with chatbot: {str(e)}"

    # Return the chatbot response.
    return completion.choices[0].message.content

---

## Start Here
After setting up the environment, specify the path to the PDF below and run all the cells.

In [None]:
# Path of the PDF to read; change this to point at your own file.
PDF_FILE_PATH = "2019Forester.pdf"

**Converting pdf to text file.**
> ⚠️ **ONLY RUN ONCE.**
> You only need to extract text once.

Add `#` to the beginning of this function call to comment it out after extracting text from the PDF.

In [None]:
# extract_text_from_pdf(PDF_FILE_PATH)

**Creating embeddings from text file.**

> ⚠️ **ONLY RUN ONCE PER PDF**
> This is a BILLED FUNCTION.

The `create_embeddings` function is billed: it uses OpenAI's API to create embeddings, so run it only when required. Comment it out after creating embeddings from the PDF.

In [None]:
# create_embeddings(EXTRACTED_TEXT_FILE_PATH)

**Prepare Embeddings**

This reads the embeddings from the json file and stores them for chatbot.

In [None]:
# Load the stored embeddings and snippets into module-level names;
# answer_users_question reads `embeddings` and `snippets` directly.
embeddings, snippets = get_embeddings()

## Chatbot

**To exit leave user input blank and hit enter.**

In [31]:
# Chat loop: keeps answering questions until the user submits a blank line.
while True:

    # Prompt the user to input a question. Surrounding whitespace is removed
    # so that a line holding only spaces also counts as blank instead of
    # being sent to the (billed) OpenAI API.
    user_question = input("").strip()

    # Echo the question so it appears in the transcript
    print("👤USER: " + user_question)

    # Print a separator for readability
    print("----------------------")

    # A blank question ends the conversation
    if not user_question:
        break

    # Otherwise generate a response
    print("🤖 ADVISOR BOT:")

    # Call the function to generate an answer based on the user's question
    # and print the bot's response
    print(answer_users_question(user_question=user_question))

    # Print a separator for readability
    print("----------------------")

👤USER: How do you drain and refill the coolant?
----------------------
🤖 ADVISOR BOT:
To drain and refill the coolant, follow these steps:

1. Make sure the engine is completely shut off and cooled down.
2. Locate the coolant reservoir tank on the outside of the vehicle while the engine is cool.
3. Check the coolant level on the reservoir tank. If it is close to or lower than the "LOW" level mark, add coolant up to the "FULL" level mark. If the reservoir tank is empty, remove the radiator cap and refill coolant up to just below the filler neck.
4. After refilling the reserve tank and the radiator, reinstall the cap and check that the rubber gaskets inside the radiator cap are in the proper position.
5. Be careful not to spill engine coolant when adding it. If coolant touches the exhaust pipe, it may cause a bad smell, smoke, and/or a fire. If engine coolant gets on the exhaust pipe, be sure to wipe it off.
6. If the coolant level in the reserve tank is below the "LOW" mark, add coolant