# Set up the environment in Google Colab to run the code

In [1]:
# Install required libraries
# openai is pinned to 0.28 because later cells call the openai.Embedding.create /
# openai.ChatCompletion.create interface; the other packages are installed unpinned.
!pip install openai==0.28 langchain faiss-cpu sentence-transformers tiktoken pdfplumber


Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading openai-0.28.0-py3-none-any.whl (76 kB)


In [2]:
# Import libraries
import os
import random

import faiss
import numpy as np
import openai
import pandas as pd
import pdfplumber
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Initialize OpenAI API key
# Prefer the OPENAI_API_KEY environment variable so the real key never has to be
# saved inside the notebook; the placeholder is only a fallback and must be
# replaced if the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")  # Replace with your API key


  from tqdm.autonotebook import tqdm, trange


In [3]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file; passed unchanged to ``pdfplumber.open``.

    Returns:
        str: The page texts joined with no separator between pages. A page
        whose ``extract_text()`` result is ``None`` or empty contributes
        nothing, so the result is ``""`` when no page yields text.
    """
    with pdfplumber.open(file_path) as pdf:
        # Depending on the pdfplumber version, extract_text() may return None for
        # a page without extractable text; `or ""` keeps the join from raising
        # TypeError. join() also avoids repeated string re-allocation.
        return "".join(page.extract_text() or "" for page in pdf.pages)

# Check that the PDF was uploaded successfully. You can also replace the document with any PDF that you have — MAKE SURE THE FILE PATH IS CORRECT!

In [6]:

# Pull the text out of the uploaded PDF
document_text = extract_text_from_pdf('/ORD_Use_Agreement.pdf')

# An empty result is reported as a failure; otherwise show a short preview
if not document_text:
    print("Failed to extract text from the PDF. Please check the file path and try again.")
else:
    print("PDF successfully uploaded and text extracted!")
    # Only the first 1000 characters are shown, to verify the content
    print("\nSample Extracted Text (First 1000 characters):\n")
    print(document_text[:1000])


PDF successfully uploaded and text extracted!

Sample Extracted Text (First 1000 characters):

CHICAGO- O'HARE INTERNATIONAL AIRPORT
*************************************
AMENDED AND RESTATED
AIRPORT USE AGREEMENT
AND TERMINAL FACILITIES LEASE
*************************************
(As Amended through 2001 – Unofficial Version)
J54154-2 C:\Documents and Settings\OM00022\Local
Settings\Temp\XPgrpwise\ohareuseagreementamendedandrestatedunoff
icialversion.wpdTABLE OF CONTENTS
Page
ARTICLE I DEFINITIONS . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1
Section 1.01 - Definitions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1
Section 1.02 - Interpretation . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 17
Section 1.03 - Incorporation of Exhibits . . . . . . . . . . . . . . . . . . . . . . 18
ARTICLE II TERM . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 19
Section

# Text splitting function for optimization

In [7]:
def split_text_by_tokens(text, token_limit):
    """Cut ``text`` into consecutive pieces of at most ``token_limit`` tokens.

    The text is encoded with tiktoken's ``cl100k_base`` encoding, the token
    list is sliced into back-to-back windows of ``token_limit`` tokens (the
    last window may be shorter), and each window is decoded back to a string.

    Args:
        text: The string to split.
        token_limit: Window size in tokens; used as the step of ``range``.

    Returns:
        list[str]: The decoded windows in document order; empty when the
        text encodes to no tokens.
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    token_ids = encoding.encode(text)
    window_starts = range(0, len(token_ids), token_limit)
    return [
        encoding.decode(token_ids[start:start + token_limit])
        for start in window_starts
    ]
# Split the extracted document into chunks of at most 512 tokens; doc_chunk is
# reused by the embedding and retrieval cells further down.
doc_chunk = split_text_by_tokens(document_text, token_limit=512)

# Test that the text split works: if it does, this prints part of one text chunk from the document. You can change the index number to look through different text chunks of the file as well.

In [9]:
# Index of the chunk to preview; change it to look at a different part of the document.
sample_chunk_index = 25

print("Number of Chunks Created:", len(doc_chunk))
if 0 <= sample_chunk_index < len(doc_chunk):
    # The label names the chunk that is actually printed (the old text said "first chunk").
    print(f"\nSample Chunk (First 100 characters of chunk {sample_chunk_index}):\n")
    print(doc_chunk[sample_chunk_index][:100])  # Print the first 100 characters of the selected chunk
else:
    # Short documents may produce fewer chunks than the chosen index; report it instead of raising IndexError.
    print(f"\nChunk index {sample_chunk_index} is out of range for {len(doc_chunk)} chunks.")

Number of Chunks Created: 137

Sample Chunk (First 100 characters of the first chunk):

 principal amount of $8,000,000.
(65) "1959 Terminal Lease Agreement" means the lease, if any, of te


## DO NOT RUN THIS! Create the embeddings (this is the first approach mentioned in the paper, but because of the price of OpenAI tokens we are going to use another model; this code only shows how to create embeddings with OpenAI)

In [10]:
# Reference only: the whole cell is a single string literal, so none of the code
# inside it is executed. It sketches the OpenAI embedding approach; the notebook
# generates embeddings with a sentence-transformers model instead.
'''
# expensive
# Initialize OpenAI API key
openai.api_key = ""  # Replace with your OpenAI API key

# Function to generate embeddings for text chunks using the new API interface
def get_embeddings(texts):
    response = openai.Embedding.create(
        model="text-embedding-ada-002",  # Use the correct embedding model
        input=texts
    )
    # Extract and return the embeddings
    return [embedding["embedding"] for embedding in response["data"]]

# Example: Generate embeddings for your text chunks
embeddings = get_embeddings(doc_chunk)
'''


'\n# expensive\n# Initialize OpenAI API key\nopenai.api_key = ""  # Replace with your OpenAI API key\n\n# Function to generate embeddings for text chunks using the new API interface\ndef get_embeddings(texts):\n    response = openai.Embedding.create(\n        model="text-embedding-ada-002",  # Use the correct embedding model\n        input=texts\n    )\n    # Extract and return the embeddings\n    return [embedding["embedding"] for embedding in response["data"]]\n\n# Example: Generate embeddings for your text chunks\nembeddings = get_embeddings(doc_chunk)\n'

# As mentioned earlier, here we choose OPEN-SOURCE embeddings since they're free!

In [11]:
# Load a pre-trained sentence-transformers model; the same `model` object is
# reused later to embed user queries, so queries and chunks share one vector space.
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can choose another model if needed

# Generate one embedding per text chunk (row order follows doc_chunk order)
# NOTE(review): the all-MiniLM-L6-v2 model card states that input longer than
# 256 word pieces is truncated, while doc_chunk was built with token_limit=512 —
# the tail of each chunk may not be reflected in its embedding; confirm.
embeddings = model.encode(doc_chunk)

print("Generated embeddings using free model successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generated embeddings using free model successfully!


# Create a vector database with the embeddings, and keep the original chunks for the keyword-matching method

In [12]:
# Initialize the FAISS index for vector-based retrieval
dimension = len(embeddings[0])  # Length of the first embedding vector; used as the index dimensionality
index = faiss.IndexFlatL2(dimension)  # Using L2 distance for similarity search

# Convert embeddings to a float32 NumPy array and add to the index.
# Vectors are added in doc_chunk order, so a position returned by a search can be
# used directly as an index into doc_chunk.
# Re-running this cell creates a new index object, so vectors are not added twice.
index.add(np.array(embeddings, dtype=np.float32))
print("Embeddings added to FAISS index successfully!")

Embeddings added to FAISS index successfully!


# Vectorize user query (Vectorize user's questions)

In [13]:
def vectorize_query(query):
    """Embed a user query as a single-row float32 matrix.

    The query is encoded with the module-level ``model`` (the same one used
    for the document chunks) and returned with a leading axis of length 1,
    the form in which it is later handed to ``index.search``.

    Args:
        query: The user's question as a string.

    Returns:
        numpy.ndarray: float32 array holding the query embedding as its
        only row.
    """
    encoded_batch = model.encode([query])
    query_vector = encoded_batch[0]
    # Prepend a batch axis instead of wrapping the vector in a list
    return np.array(query_vector, dtype=np.float32)[np.newaxis, ...]


In [14]:
def retrieve_top_k_chunks(query_embedding, top_k=3):
    """Return the document chunks whose embeddings are nearest to the query.

    Args:
        query_embedding: Query vector(s) passed unchanged to ``index.search``;
            only the results for the first query row are used.
        top_k: Number of neighbours requested from the FAISS index.

    Returns:
        list[str]: Entries of ``doc_chunk`` in the order returned by the
        search. The list is shorter than ``top_k`` when the index holds
        fewer than ``top_k`` vectors.
    """
    _, indices = index.search(query_embedding, top_k)  # distances are not needed here
    # FAISS fills missing result slots with -1. Indexing doc_chunk with -1 would
    # silently return the LAST chunk, so negative labels are dropped.
    return [doc_chunk[int(i)] for i in indices[0] if i >= 0]

# Combine the top-k related text chunks from the document and give ChatGPT a new prompt (the new prompt that ChatGPT receives includes the original question and the top-k related text chunks from the document)

In [15]:
def format_context_for_chatgpt(user_query, retrieved_chunks):
    """Build the prompt text sent to ChatGPT from a question and its context.

    Args:
        user_query: The user's question.
        retrieved_chunks: Iterable of text chunks; they are joined with
            newlines to form the context section.

    Returns:
        str: ``Question``, ``Context`` and an empty ``Answer:`` section,
        separated from each other by blank lines.
    """
    merged_context = "\n".join(retrieved_chunks)
    sections = (
        f"Question: {user_query}",
        f"Context:\n{merged_context}",
        "Answer:",
    )
    return "\n\n".join(sections)

# ***Make sure to have your OpenAI API key here! Each call costs around $0.005, so make sure you have purchased credit in your account.***

In [16]:
# Configure the OpenAI API key needed by the ChatGPT call below.
# Important! To successfully run this part of the code you need an OpenAI API key,
# and there must be purchased tokens in your account.
# The key is taken from the OPENAI_API_KEY environment variable when it is set;
# otherwise the key configured earlier in the notebook is kept. (Assigning ""
# here used to wipe out that earlier key.)
openai.api_key = os.environ.get("OPENAI_API_KEY") or openai.api_key
def get_chatgpt_response(prompt):
    """Send ``prompt`` to the chat model and return its reply text.

    A two-message conversation (a fixed system message plus the prompt as
    the user message) is submitted through ``openai.ChatCompletion.create``.

    Args:
        prompt: Full prompt text to send as the user message.

    Returns:
        str: Content of the first returned choice with surrounding
        whitespace stripped.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    request_options = {
        "model": "gpt-3.5-turbo",
        "messages": conversation,
        "max_tokens": 300,   # upper bound on the reply length; adjust as needed
        "temperature": 0.7,  # adjust for more or less creative answers
    }
    completion = openai.ChatCompletion.create(**request_options)
    first_choice = completion['choices'][0]
    reply_text = first_choice['message']['content']
    return reply_text.strip()

# Run the RAG pipeline with ChatGPT; you can change the question as well

In [17]:
def rag_chatbot(user_query, top_k=5):
    """Answer ``user_query`` with the full retrieve-then-generate workflow.

    The query is embedded, the ``top_k`` nearest document chunks are
    retrieved, question and chunks are combined into one prompt, and that
    prompt is sent to ChatGPT.

    Args:
        user_query: The user's question.
        top_k: Number of chunks to retrieve as context.

    Returns:
        The value returned by ``get_chatgpt_response`` for the built prompt.
    """
    # Embed the question, then look up its nearest chunks
    context_chunks = retrieve_top_k_chunks(vectorize_query(user_query), top_k)
    # Build the prompt from question + context and ask the model
    return get_chatgpt_response(format_context_for_chatgpt(user_query, context_chunks))

# Example usage
# NOTE: running this cell goes through get_chatgpt_response, i.e. it sends a
# request to the OpenAI API, so a valid API key must be configured first.
user_question = "What are the key concepts discussed in the document?"
answer = rag_chatbot(user_question)  # top_k is left at its default of 5
print(f"Answer from ChatGPT:\n{answer}")


Answer from ChatGPT:
The key concepts discussed in the document include definitions and interpretations of terms used in the agreement, the term of the agreement, grant of rights related to the use of airport facilities and exclusive premises, qualifications for operation in the State of Illinois, obligations of the city, rules and regulations compliance, indemnity, insurance, condemnation, governmental functions, airport development plan, construction of capital projects, and specific sections related to financial matters such as issuance of obligations and approval processes. The document also covers various exhibits that are incorporated into the agreement, outlining details of specific areas within the airport and related responsibilities.
