<a href="https://colab.research.google.com/github/etuckerman/SOCOTEC/blob/main/SOCOTEC_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU found!")


GPU: NVIDIA A100-SXM4-40GB


In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [3]:
import torch

# Enable mixed precision for faster computations on A100
torch.set_default_dtype(torch.float16)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True


In [4]:
%%capture
!pip install llama_parse huggingface_hub langchain chromadb nest_asyncio langchain-community unstructured langchain-huggingface gradio

In [5]:
!nvidia-smi


Wed Jan  8 23:01:07 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0              41W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# RAG PIPELINE

# Loading and Preprocessing

In [6]:
import nest_asyncio
from llama_parse import LlamaParse

# Apply nest_asyncio to handle the event loop
nest_asyncio.apply()

### BASIC PARSING
# # Initialize the LlamaParse parser with optimized parsing instructions
# parser = LlamaParse(
#     api_key="llx-ZTieolOu9t8Ks9FvurLVGbBujjpap5s63nI0PHXsv4EV4szb",
#     result_type="markdown",  # Retain markdown format for structured output
#     language="en",  # Set to English since the IBC is in English
#     verbose=True,  # Enable detailed logs to monitor parsing performance
#     is_formatting_instruction=True,  # Preserve formatting for context retrieval
#     parsing_instruction="""
#         Extract the following key elements from the document:
#         1. Chapter titles and their numbers.
#         2. Section headings and subheadings with their corresponding numbers.
#         3. Key definitions and terms listed in the document.
#         4. Detailed descriptions of occupancy classifications, fire-resistance requirements, and structural design criteria.
#         5. All tables and their captions, including their associated data.
#         6. Any reference codes, figures, or diagrams mentioned in the text.
#         Format the extracted data in a structured and readable manner, preserving markdown styling for clarity (e.g., **bold** headings, bullet points for lists, etc.).
#     """
# )

### OPTIMISED PARSING TEST [currently costs 30$ so i cancelled it]
# Initialize the LlamaParse parser with optimized parameters
parser = LlamaParse(
    api_key="llx-ZTieolOu9t8Ks9FvurLVGbBujjpap5s63nI0PHXsv4EV4szb",
    is_remote=False,  # Processing locally for faster iterations
    verbose=True,  # Keep verbose for detailed logs
    show_progress=True,  # Show progress for better tracking
    language="en",  # Document language is English
    split_by_page=True,  # Process document page by page for modularity
    result_type="markdown",  # Export as markdown for better structuring
    max_timeout=3000,  # Increase timeout for processing large documents
    num_workers=6,  # Utilize 6 workers for concurrent processing
    parsing_instruction=(
        "Extract all critical information, including definitions, tables, figures, and important text "
        "relevant to occupancy classifications, construction types, fire-resistance requirements, "
        "design loads, and any other regulations. Focus on sections that may aid in answering queries."
    ),
    structured_output=False,  # Output as plain markdown, structured parsing is unnecessary here
    annotate_links=True,  # Annotate links for better context during retrieval
    auto_mode=True,  # Enable auto mode to trigger optimizations for certain elements
    auto_mode_trigger_on_table_in_page=True,  # Prioritize tables (highly structured info)
    auto_mode_trigger_on_image_in_page=True,  # Include charts/diagrams for completeness
    disable_ocr=False,  # Allow OCR for text in non-standard formats
    extract_charts=True,  # Include chart data in the parsed output
    extract_layout=False,  # Skip layout info, focusing purely on content
    premium_mode=True,  # Enable premium processing for improved accuracy
    page_separator="\n\n---\n\n",  # Separate pages clearly for retrieval
    max_pages=None,  # Process the entire document
    continuous_mode=False,  # Avoid continuous mode; keep pages distinct
)


# Parse the syllabus document
parsed_documents = parser.load_data("/content/IBC.pdf")

# Save the parsed results to a markdown or any preferred format
with open('IBC.md', 'w') as f:
    for doc in parsed_documents:
        f.write(doc.text + '\n')


KeyboardInterrupt: 

In [6]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


# Embedding and Vector Store setup

In [10]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


In [12]:
torch.cuda.empty_cache()


In [14]:
!nvidia-smi


Wed Jan  8 23:05:19 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0              46W / 400W |  35723MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [8]:
from transformers import AutoTokenizer, AutoModel
import faiss
import torch
import numpy as np
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline

# Step 1: Load the parsed markdown document
loader = UnstructuredMarkdownLoader("IBC.md")
docs = loader.load()

# Step 2: Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
texts = text_splitter.split_documents(docs)

# Step 3: Load the GTE model for embeddings with trust_remote_code=True
embedding_model_name = "Alibaba-NLP/gte-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name, trust_remote_code=True)

# Define the device for GPU or CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the appropriate device
embedding_model = embedding_model.to(device)

# Adjust the embedding generation code
def get_embeddings(texts, model, tokenizer):
    """
    Generate embeddings for a list of texts using the GTE model on GPU.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        # Mean pooling to get fixed-size embeddings
        embeddings = outputs.last_hidden_state.mean(dim=1)

    return embeddings.cpu().numpy()  # Move the result back to CPU for further processing

# Generate embeddings for the document chunks
corpus_embeddings = get_embeddings([doc.page_content for doc in texts], embedding_model, tokenizer)

# Step 4: Initialize a FAISS vector store
embedding_dim = corpus_embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(embedding_dim)  # L2 distance for similarity search

# Add embeddings to the vector store
index.add(corpus_embeddings)

def retrieve_documents(query, k=2):
    """
    Retrieve the top-k most similar documents for a given query.
    """
    query_embedding = get_embeddings([query], embedding_model, tokenizer)
    distances, indices = index.search(query_embedding, k)
    results = [texts[i].page_content for i in indices[0]]
    return results

# Step 5: Load the Qwen Model for Text Generation
qwen_pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-7B",
    tokenizer="Qwen/Qwen2.5-7B",
    device=0  # Use GPU
)
qwen_llm = HuggingFacePipeline(pipeline=qwen_pipe)

# Step 6: Define the Prompt Template
prompt = PromptTemplate(
    input_variables=["context", "query"],
    template=(
        "You are Qwen, created by Alibaba Cloud. You are a helpful assistant. "
        "You have extensive knowledge of the IBC 2018 International Building Code. "
        "Answer the following query based on your knowledge of the IBC, as if you are already familiar with the content. "
        "Do not mention or reference any specific document or context. Just provide a direct and concise answer. "
        "Query: {query}\n"
        "Response:"
    ),
)

# Example Workflow
query = "What are the requirements for fire exits in commercial buildings?"
retrieved_docs = retrieve_documents(query)
context = " ".join(retrieved_docs)

# Construct the input for the Qwen model
formatted_input = prompt.format(context=context, query=query)
response = qwen_llm(formatted_input)

# Output the response
print("Response:", response)


configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 8.86 GiB. GPU 0 has a total capacity of 39.56 GiB of which 4.68 GiB is free. Process 21679 has 34.88 GiB memory in use. Of the allocated memory 34.37 GiB is allocated by PyTorch, and 16.58 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

When processing such a substantial document for a Retrieval-Augmented Generation (RAG) system, it's crucial to optimize the text chunking and embedding process to balance performance and accuracy.

Optimizing Text Chunking and Embedding:

Text Chunking:

Chunk Size: Given the document's length, consider setting the chunk_size to 1500 characters. This size is manageable for most language models and ensures that each chunk contains sufficient context.
Overlap: Maintain an overlap of 100 characters (chunk_overlap=100). This overlap helps preserve context between chunks, which is beneficial for understanding references across sections.
Embeddings:

Model Selection: The all-MiniLM-L6-v2 model is efficient and effective for generating embeddings. It's a good choice for balancing performance and computational efficiency.
Vector Store: Utilize Chroma as the vector store. It's optimized for handling large datasets and supports efficient similarity searches.

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Load the parsed markdown document
loader = UnstructuredMarkdownLoader("IBC.md")
docs = loader.load()

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
texts = text_splitter.split_documents(docs)


In [None]:

# Create embeddings and vector store
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(texts, embeddings)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2})


# MODEL SETUP

In [None]:
# Step 3: Load the Qwen Model
from transformers import pipeline
from langchain_huggingface import HuggingFacePipeline

qwen_pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-7B",
    tokenizer="Qwen/Qwen2.5-7B",
    device=0  # Use GPU
)
qwen_llm = HuggingFacePipeline(pipeline=qwen_pipe)

## Refine Prompt Template

In [None]:
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["context", "query"],
    template=(
        "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
        "You have extensive knowledge of the IBC 2018 International Building Code."
        "Answer the following query based on your knowledge of the IBC, as if you are already familiar with the content."
        "Do not mention or reference any specific document or context. Just provide a direct and concise answer."
        "Query: {query}\n"
        "Response:"
    ),
)


## Setup RetrivalQA Chain

In [None]:

# # Step 6: Test the RAG System
# query_1 = "What is the purpose of Appendix B: Board of Appeals?"
# response_1 = qa_chain.invoke({"query": query_1})
# print(f"Answer 1: {response_1}")


In [None]:

# query_2 = "Explain the key concepts discussed in the document?"
# response_2 = qa_chain.invoke({"query": query_2})
# print(f"Answer 2: {response_2}")


In [None]:
# # Example IBC-specific questions
# queries = [
#     "What is the purpose of Appendix B: Board of Appeals?",
#     "What are the occupancy classifications defined in Chapter 3?",
#     "How does the IBC define mixed-use occupancies?",
#     "What are the fire-resistance requirements for Type I construction?",
#     "What are the minimum design loads for buildings and structures?"
# ]


In [None]:
# # Loop through and retrieve answers
# for query in queries:
#     response = qa_chain.invoke({"query": query})
#     print(f"Query: {query}\nAnswer: {response}\n")


In [None]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_llm(llm=qwen_llm, retriever=retriever)


##Gradio implementation




In [None]:
import gradio as gr
import torch
from langchain.chains import RetrievalQA

# Define the function that will display the results in a more readable format
def query_rag_system(query):
    # Use the qa_chain.invoke to get the response for the query
    response = qa_chain.invoke({"query": query})
    # Return the response in a user-friendly format (you can format it as needed)
    return response.get('result', "No result found")

# Create a Gradio interface
interface = gr.Interface(
    fn=query_rag_system,  # This is the function that will be called to generate the output
    inputs=gr.Textbox(label="Enter your query"),  # The input for the user query
    outputs=gr.Textbox(label="RAG System Answer", lines=20),  # The output for displaying the result
    live=True,  # Optional: Allows for live updates as the user types
    title="RAG Query Interface",  # Title for the interface
    description="Enter a query related to the IBC 2018 International Building Code, and the system will provide an answer based on the context."
)

# Launch the interface
interface.launch(debug=True, share=True)
