<a href="https://colab.research.google.com/github/etuckerman/SOCOTEC/blob/main/SOCOTEC_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU found!")


GPU: NVIDIA A100-SXM4-40GB


In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [3]:
import torch

# Enable mixed precision for faster computations on A100
torch.set_default_dtype(torch.float16)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True


In [4]:
%%capture
!pip install llama_parse huggingface_hub langchain chromadb nest_asyncio langchain-community unstructured langchain-huggingface gradio

In [5]:
!nvidia-smi


Wed Jan  8 23:09:05 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              45W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# RAG PIPELINE

# Loading and Preprocessing

In [6]:
import nest_asyncio
from llama_parse import LlamaParse

# Apply nest_asyncio to handle the event loop
nest_asyncio.apply()

### BASIC PARSING
# # Initialize the LlamaParse parser with optimized parsing instructions
# parser = LlamaParse(
#     api_key="llx-ZTieolOu9t8Ks9FvurLVGbBujjpap5s63nI0PHXsv4EV4szb",
#     result_type="markdown",  # Retain markdown format for structured output
#     language="en",  # Set to English since the IBC is in English
#     verbose=True,  # Enable detailed logs to monitor parsing performance
#     is_formatting_instruction=True,  # Preserve formatting for context retrieval
#     parsing_instruction="""
#         Extract the following key elements from the document:
#         1. Chapter titles and their numbers.
#         2. Section headings and subheadings with their corresponding numbers.
#         3. Key definitions and terms listed in the document.
#         4. Detailed descriptions of occupancy classifications, fire-resistance requirements, and structural design criteria.
#         5. All tables and their captions, including their associated data.
#         6. Any reference codes, figures, or diagrams mentioned in the text.
#         Format the extracted data in a structured and readable manner, preserving markdown styling for clarity (e.g., **bold** headings, bullet points for lists, etc.).
#     """
# )

### OPTIMISED PARSING TEST [currently costs 30$ so i cancelled it]
# Initialize the LlamaParse parser with optimized parameters
parser = LlamaParse(
    api_key="llx-ZTieolOu9t8Ks9FvurLVGbBujjpap5s63nI0PHXsv4EV4szb",
    is_remote=False,  # Processing locally for faster iterations
    verbose=True,  # Keep verbose for detailed logs
    show_progress=True,  # Show progress for better tracking
    language="en",  # Document language is English
    split_by_page=True,  # Process document page by page for modularity
    result_type="markdown",  # Export as markdown for better structuring
    max_timeout=3000,  # Increase timeout for processing large documents
    num_workers=6,  # Utilize 6 workers for concurrent processing
    parsing_instruction=(
        "Extract all critical information, including definitions, tables, figures, and important text "
        "relevant to occupancy classifications, construction types, fire-resistance requirements, "
        "design loads, and any other regulations. Focus on sections that may aid in answering queries."
    ),
    structured_output=False,  # Output as plain markdown, structured parsing is unnecessary here
    annotate_links=True,  # Annotate links for better context during retrieval
    auto_mode=True,  # Enable auto mode to trigger optimizations for certain elements
    auto_mode_trigger_on_table_in_page=True,  # Prioritize tables (highly structured info)
    auto_mode_trigger_on_image_in_page=True,  # Include charts/diagrams for completeness
    disable_ocr=False,  # Allow OCR for text in non-standard formats
    extract_charts=True,  # Include chart data in the parsed output
    extract_layout=False,  # Skip layout info, focusing purely on content
    premium_mode=True,  # Enable premium processing for improved accuracy
    page_separator="\n\n---\n\n",  # Separate pages clearly for retrieval
    max_pages=None,  # Process the entire document
    continuous_mode=False,  # Avoid continuous mode; keep pages distinct
)


# Parse the syllabus document
parsed_documents = parser.load_data("/content/IBC.pdf")

# Save the parsed results to a markdown or any preferred format
with open('IBC.md', 'w') as f:
    for doc in parsed_documents:
        f.write(doc.text + '\n')


KeyboardInterrupt: 

In [6]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


# Embedding and Vector Store setup

In [10]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


In [12]:
torch.cuda.empty_cache()


In [14]:
!nvidia-smi


Wed Jan  8 23:05:19 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0              46W / 400W |  35723MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [7]:
from transformers import AutoTokenizer, AutoModel
import faiss
import torch
import numpy as np
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline

# Step 1: Load the parsed markdown document
loader = UnstructuredMarkdownLoader("IBC.md")
docs = loader.load()

# Step 2: Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
texts = text_splitter.split_documents(docs)

# Step 3: Load the BGE model for embeddings
model_name = "BAAI/bge-m3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)

# Move the model to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = embedding_model.to(device)

# Adjust the embedding generation code
def get_embeddings(texts, model, tokenizer):
    """
    Generate embeddings for a list of texts using the BGE model on GPU.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        # Mean pooling to get fixed-size embeddings
        embeddings = outputs.last_hidden_state.mean(dim=1)

    return embeddings.cpu().numpy()  # Move the result back to CPU for further processing


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

TypeError: in method 'IndexFlatCodes_add', argument 3 of type 'float const *'

In [8]:

# Generate embeddings for the document chunks
corpus_embeddings = get_embeddings([doc.page_content for doc in texts], embedding_model, tokenizer)

# Step 4: Initialize a FAISS vector store
embedding_dim = corpus_embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(embedding_dim)  # L2 distance for similarity search

# Convert embeddings to np.float32 and ensure contiguous memory layout
corpus_embeddings = corpus_embeddings.astype(np.float32)
#This line ensures the embeddings are in the correct data type.
corpus_embeddings = np.ascontiguousarray(corpus_embeddings)
#This line ensures the embeddings are in a contiguous memory layout.


# Add embeddings to the vector store
index.add(corpus_embeddings)

def retrieve_documents(query, k=2):
    """
    Retrieve the top-k most similar documents for a given query.
    """
    query_embedding = get_embeddings([query], embedding_model, tokenizer)
    distances, indices = index.search(query_embedding, k)
    results = [texts[i].page_content for i in indices[0]]
    return results

# Step 5: Load the Qwen Model for Text Generation
qwen_pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-7B",
    tokenizer="Qwen/Qwen2.5-7B",
    device=0  # Use GPU
)
qwen_llm = HuggingFacePipeline(pipeline=qwen_pipe)


config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Device set to use cuda:0


TypeError: in method 'IndexFlat_search', argument 3 of type 'float const *'

In [29]:

# Step 6: Define the Prompt Template
prompt = PromptTemplate(
    input_variables=["context", "query"],
    template=(
        "You are Qwen, created by Alibaba Cloud. You are a helpful assistant. "
        "You have extensive knowledge of the IBC 2018 International Building Code. "
        "Answer the following query based on your knowledge of the IBC, as if you are already familiar with the content. "
        "Do not mention or reference any specific document or context. Just provide a direct and concise answer. "
        "Query: {query}\n"
        "Response:"
    ),
)


In [11]:
# def retrieve_documents(query, k=2):
#     """
#     Retrieve the top-k most similar documents for a given query.
#     """
#     query_embedding = get_embeddings([query], embedding_model, tokenizer)
#     # Convert query_embedding to numpy.float32 before searching
#     query_embedding = query_embedding.astype(np.float32)
#     #This line ensures the embeddings are in the correct data type.
#     query_embedding = np.ascontiguousarray(query_embedding)
#     #This line ensures the embeddings are in a contiguous memory layout.
#     distances, indices = index.search(query_embedding, k)
#     results = [texts[i].page_content for i in indices[0]]
#     return results

In [34]:
def retrieve_documents(query, k=2, device="cuda"):
    """
    Retrieve the top-k most similar documents for a given query.
    """
    # Move the model to the correct device (GPU or CPU)
    embedding_model.to(device)

    # Tokenize the query
    query_embedding = tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        # Generate embeddings from the model
        query_embedding = embedding_model(**query_embedding).last_hidden_state.mean(dim=1)  # Get embeddings

    query_embedding = query_embedding.cpu().numpy().astype(np.float32)  # Ensure the embeddings are on CPU before passing to FAISS

    # Perform similarity search in the FAISS index
    distances, indices = index.search(query_embedding, k)
    results = [texts[i] for i in indices[0]]  # Retrieve the corresponding documents
    return results

# Define the function that will process the query
def query_rag_system(query, device="cuda"):
    # Retrieve the documents relevant to the query
    retrieved_docs = retrieve_documents(query, device=device)

    # Extract the text content from Document objects and concatenate them to form the context
    context = " ".join([doc.page_content for doc in retrieved_docs])

    # Debugging: print the context to check it's being retrieved
    print(f"Context for query '{query}':\n{context}\n")

    # Generate the formatted input for the model using the prompt template
    formatted_input = prompt.format(context=context, query=query)

    # Get the model's response (assuming qwen_llm is set up correctly)
    model_response = qwen_llm(formatted_input)

    return model_response


# Define the prompt template
prompt = PromptTemplate(
    input_variables=["context", "query"],
    template=(
        "You are Qwen, created by Alibaba Cloud. You are a helpful assistant. "
        "You have extensive knowledge of the IBC 2018 International Building Code. "
        "Answer the following query based on your knowledge of the IBC, as if you are already familiar with the content. "
        "Do not mention or reference any specific document or context. Just provide a direct and concise answer. "
        "Query: {query}\n"
        "Response:"
    ),
)


# Create Gradio interface
interface = gr.Interface(
    fn=query_rag_system,  # Function to handle user input and query the RAG system
    inputs=gr.Textbox(label="Enter your query", placeholder="Ask a question..."),  # User input field
    outputs=gr.Textbox(label="RAG System Answer", lines=20),  # Output the response
    live=True,  # Optional: update in real-time
    title="RAG Query Interface",
    description="Enter a question, and the system will provide an answer based on the retrieved context."
)

# Launch the Gradio interface
interface.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://95b2a2d3e4c9cc8350.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Context for query 'What are the IBC requirements for fall protection systems on roofs with slopes steeper
than 2:12?':
IA IB IIA IIB IIIA IIIB IV VA VB B B B Cc B Cc B B Cc

For SI: 1 foot = 304.8 mm, 1 square foot = 0.0929 m². a. Unless otherwise required in accordance with the International Wildland-Urban Interface Code or due to the location of the building within a fire district in accordance with Appendix D. b. Nonclassified roof coverings shall be permitted on buildings of Group R-3 and Group U occupancies, where there is a minimum fire-separation distance of 6 feet measured from the leading edge of the roof. c. Buildings that are not more than two stories above grade plane and having not more than 6,000 square feet of projected roof area and where there is a minimum 10-foot fire-separation distance from the leading edge of the roof to a lot line on all sides of the building, except for street fronts or public ways, shall be permitted to have roofs of No. 1 cedar or redwood shake



In [30]:
# import gradio as gr
# import torch
# from langchain.chains import RetrievalQA

# def retrieve_documents(query, k=2):
#     """
#     Retrieve the top-k most similar documents for a given query.
#     """
#     query_embedding = get_embeddings([query], embedding_model, tokenizer)
#     # Convert query_embedding to numpy.float32 before searching
#     query_embedding = query_embedding.astype(np.float32)
#     #This line ensures the embeddings are in the correct data type.
#     query_embedding = np.ascontiguousarray(query_embedding)
#     #This line ensures the embeddings are in a contiguous memory layout.
#     distances, indices = index.search(query_embedding, k)
#     results = [texts[i].page_content for i in indices[0]]
#     return results

# # Define the function that will process the query
# def query_rag_system(query):
#     # Retrieve the documents relevant to the query
#     retrieved_docs = retrieve_documents(query)

#     # Concatenate the documents to form the context
#     context = " ".join(retrieved_docs)

#     # Debugging: print the context to check it's being retrieved
#     print(f"Context for query '{query}':\n{context}\n")

#     # Generate the formatted input for the model using the prompt template
#     formatted_input = prompt.format(context=context, query=query)

#     # Get the model's response
#     model_response = qwen_llm(formatted_input)

#     return model_response


# # Create Gradio interface
# interface = gr.Interface(
#     fn=query_rag_system,  # Function to handle user input and query the RAG system
#     inputs=gr.Textbox(label="Enter your query", placeholder="Ask a question..."),  # User input field
#     outputs=gr.Textbox(label="RAG System Answer", lines=20),  # Output the response
#     live=True,  # Optional: update in real-time
#     title="RAG Query Interface",
#     description="Enter a question, and the system will provide an answer based on the retrieved context."
# )

# # Launch the Gradio interface
# interface.launch(debug=True, share=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://553f1e784bc701ad84.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Context for query 'r':
1110.2 Facilities serving Group R-2, R-3 and R-4 occupancies.

Recreational facilities that serve Group R-2, R-3 and Group R-4 occupancies shall comply with Sections 1110.2.1 through 1110.2.3, as applicable.

1110.2.1 Facilities serving Accessible units.

In Group R-2 and R-4 occupancies where recreational facilities serve Accessible units, every recreational facility of each type serving Accessible units shall be accessible.

1110.2.2 Facilities serving Type A and Type B units in a single building.

In Group R-2, R-3 and R-4 occupancies where recreational facilities serve a single building containing Type A units or Type B units, 25 percent, but not less than one, of each type of recreational facility shall be accessible. Every recreational facility of each type on a site shall be considered to determine the total number of each type that is required to be accessible.

1110.2.3 Facilities serving Type A and Type B units in multiple buildings.

In Group R-2, R-3 

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Context for query 'j':
Appendix J Grading.

Appendix J provides standards for the grading of properties. This appendix also provides standards for administration and enforcement of a grading program including permit and inspection requirements. Appendix J was originally developed in the 1960s and used for many years in jurisdictions throughout the western states. It is intended to provide consistent and uniform code requirements anywhere grading is considered an issue.

Appendix K Administrative Provisions.

Appendix K primarily provides administrative provisions for jurisdictions adopting and enforcing NFPA 70—the National Electrical Code (NEC). The provisions contained in this appendix are compatible with administrative and enforcement provisions contained in Chapter 1 of the IBC and the other International Codes. Annex H of NFPA 70 also contains administrative provisions for the NEC; however, some of its provisions are not compatible with IBC Chapter 1. Section K110 also contains te



In [12]:

# Example Workflow
query = "What are the requirements for fire exits in commercial buildings?"
retrieved_docs = retrieve_documents(query)
context = " ".join(retrieved_docs)

# Construct the input for the Qwen model
formatted_input = prompt.format(context=context, query=query)
response = qwen_llm(formatted_input)

# Output the response
print("Response:", response)


  response = qwen_llm(formatted_input)


Response: You are Qwen, created by Alibaba Cloud. You are a helpful assistant. You have extensive knowledge of the IBC 2018 International Building Code. Answer the following query based on your knowledge of the IBC, as if you are already familiar with the content. Do not mention or reference any specific document or context. Just provide a direct and concise answer. Query: What are the requirements for fire exits in commercial buildings?
Response: The IBC 2018 requires that all commercial buildings have at least two means of egress from each floor, including fire exits. Fire exits must be clearly marked and located in easily accessible areas. They must also be equipped with emergency lighting and exit signs. Additionally, fire exits must be kept clear of obstructions and must not be used for any other purpose.


When processing such a substantial document for a Retrieval-Augmented Generation (RAG) system, it's crucial to optimize the text chunking and embedding process to balance performance and accuracy.

Optimizing Text Chunking and Embedding:

Text Chunking:

Chunk Size: Given the document's length, consider setting the chunk_size to 1500 characters. This size is manageable for most language models and ensures that each chunk contains sufficient context.
Overlap: Maintain an overlap of 100 characters (chunk_overlap=100). This overlap helps preserve context between chunks, which is beneficial for understanding references across sections.
Embeddings:

Model Selection: The all-MiniLM-L6-v2 model is efficient and effective for generating embeddings. It's a good choice for balancing performance and computational efficiency.
Vector Store: Utilize Chroma as the vector store. It's optimized for handling large datasets and supports efficient similarity searches.

In [None]:
# from langchain_huggingface import HuggingFaceEmbeddings
# from langchain.vectorstores import Chroma
# from langchain.document_loaders import UnstructuredMarkdownLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter


# # Load the parsed markdown document
# loader = UnstructuredMarkdownLoader("IBC.md")
# docs = loader.load()

# # Split documents into chunks
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
# texts = text_splitter.split_documents(docs)


In [None]:

# # Create embeddings and vector store
# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# vectorstore = Chroma.from_documents(texts, embeddings)
# retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2})


# MODEL SETUP

In [None]:
# # Step 3: Load the Qwen Model
# from transformers import pipeline
# from langchain_huggingface import HuggingFacePipeline

# qwen_pipe = pipeline(
#     "text-generation",
#     model="Qwen/Qwen2.5-7B",
#     tokenizer="Qwen/Qwen2.5-7B",
#     device=0  # Use GPU
# )
# qwen_llm = HuggingFacePipeline(pipeline=qwen_pipe)

## Refine Prompt Template

In [None]:
# from langchain.prompts import PromptTemplate

# prompt = PromptTemplate(
#     input_variables=["context", "query"],
#     template=(
#         "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."
#         "You have extensive knowledge of the IBC 2018 International Building Code."
#         "Answer the following query based on your knowledge of the IBC, as if you are already familiar with the content."
#         "Do not mention or reference any specific document or context. Just provide a direct and concise answer."
#         "Query: {query}\n"
#         "Response:"
#     ),
# )


## Setup RetrivalQA Chain

In [None]:

# # Step 6: Test the RAG System
# query_1 = "What is the purpose of Appendix B: Board of Appeals?"
# response_1 = qa_chain.invoke({"query": query_1})
# print(f"Answer 1: {response_1}")


In [None]:

# query_2 = "Explain the key concepts discussed in the document?"
# response_2 = qa_chain.invoke({"query": query_2})
# print(f"Answer 2: {response_2}")


In [None]:
# # Example IBC-specific questions
# queries = [
#     "What is the purpose of Appendix B: Board of Appeals?",
#     "What are the occupancy classifications defined in Chapter 3?",
#     "How does the IBC define mixed-use occupancies?",
#     "What are the fire-resistance requirements for Type I construction?",
#     "What are the minimum design loads for buildings and structures?"
# ]


In [None]:
# # Loop through and retrieve answers
# for query in queries:
#     response = qa_chain.invoke({"query": query})
#     print(f"Query: {query}\nAnswer: {response}\n")


In [None]:
# from langchain.chains import RetrievalQA

# qa_chain = RetrievalQA.from_llm(llm=qwen_llm, retriever=retriever)


##Gradio implementation




In [13]:
import gradio as gr
import torch
from langchain.chains import RetrievalQA

# Define the function that will display the results in a more readable format
def query_rag_system(query):
    # Use the qa_chain.invoke to get the response for the query
    response = qa_chain.invoke({"query": query})
    # Return the response in a user-friendly format (you can format it as needed)
    return response.get('result', "No result found")

# Create a Gradio interface
interface = gr.Interface(
    fn=query_rag_system,  # This is the function that will be called to generate the output
    inputs=gr.Textbox(label="Enter your query"),  # The input for the user query
    outputs=gr.Textbox(label="RAG System Answer", lines=20),  # The output for displaying the result
    live=True,  # Optional: Allows for live updates as the user types
    title="RAG Query Interface",  # Title for the interface
    description="Enter a query related to the IBC 2018 International Building Code, and the system will provide an answer based on the context."
)

# Launch the interface
interface.launch(debug=True, share=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://577f4368a7914b045c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 2045, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1592, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 8

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://577f4368a7914b045c.gradio.live


