<a href="https://colab.research.google.com/github/etuckerman/SOCOTEC/blob/main/SOCOTEC_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

# Report the name of CUDA device 0, or say so when PyTorch cannot see a GPU.
print(f"GPU: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "No GPU found!")


GPU: NVIDIA A100-SXM4-40GB


In [2]:
# Prefer the CUDA device when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
import torch

# Session-wide PyTorch settings intended to speed up computation on the A100.
# NOTE(review): set_default_dtype(float16) changes the default floating-point dtype
# for the whole session; it is not autocast-style mixed precision. Confirm that
# this global switch is what is wanted.
torch.set_default_dtype(torch.float16)
# Permit TF32 in CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True
# Turn on cuDNN benchmark mode.
torch.backends.cudnn.benchmark = True


In [4]:
%%capture
!pip install llama_parse huggingface_hub langchain chromadb nest_asyncio langchain-community unstructured langchain-huggingface gradio

In [5]:
# Show driver/CUDA versions and current GPU memory usage via the NVIDIA CLI.
!nvidia-smi


Thu Jan 16 23:01:19 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              47W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# RAG PIPELINE

# Loading and Preprocessing

In [6]:
# import nest_asyncio
# from llama_parse import LlamaParse

# # Apply nest_asyncio to handle the event loop
# nest_asyncio.apply()

# ### BASIC PARSING
# # # Initialize the LlamaParse parser with optimized parsing instructions
# # parser = LlamaParse(
# #     api_key=os.environ["LLAMA_CLOUD_API_KEY"],  # secret redacted: read the key from the environment; revoke the key that was previously committed here
# #     result_type="markdown",  # Retain markdown format for structured output
# #     language="en",  # Set to English since the IBC is in English
# #     verbose=True,  # Enable detailed logs to monitor parsing performance
# #     is_formatting_instruction=True,  # Preserve formatting for context retrieval
# #     parsing_instruction="""
# #         Extract the following key elements from the document:
# #         1. Chapter titles and their numbers.
# #         2. Section headings and subheadings with their corresponding numbers.
# #         3. Key definitions and terms listed in the document.
# #         4. Detailed descriptions of occupancy classifications, fire-resistance requirements, and structural design criteria.
# #         5. All tables and their captions, including their associated data.
# #         6. Any reference codes, figures, or diagrams mentioned in the text.
# #         Format the extracted data in a structured and readable manner, preserving markdown styling for clarity (e.g., **bold** headings, bullet points for lists, etc.).
# #     """
# # )

# ### OPTIMISED PARSING TEST [currently costs 30$ so i cancelled it]
# # Initialize the LlamaParse parser with optimized parameters
# parser = LlamaParse(
#     api_key=os.environ["LLAMA_CLOUD_API_KEY"],  # secret redacted: read the key from the environment; revoke the key that was previously committed here
#     is_remote=False,  # Processing locally for faster iterations
#     verbose=True,  # Keep verbose for detailed logs
#     show_progress=True,  # Show progress for better tracking
#     language="en",  # Document language is English
#     split_by_page=True,  # Process document page by page for modularity
#     result_type="markdown",  # Export as markdown for better structuring
#     max_timeout=3000,  # Increase timeout for processing large documents
#     num_workers=6,  # Utilize 6 workers for concurrent processing
#     parsing_instruction=(
#         "Extract all critical information, including definitions, tables, figures, and important text "
#         "relevant to occupancy classifications, construction types, fire-resistance requirements, "
#         "design loads, and any other regulations. Focus on sections that may aid in answering queries."
#     ),
#     structured_output=False,  # Output as plain markdown, structured parsing is unnecessary here
#     annotate_links=True,  # Annotate links for better context during retrieval
#     auto_mode=True,  # Enable auto mode to trigger optimizations for certain elements
#     auto_mode_trigger_on_table_in_page=True,  # Prioritize tables (highly structured info)
#     auto_mode_trigger_on_image_in_page=True,  # Include charts/diagrams for completeness
#     disable_ocr=False,  # Allow OCR for text in non-standard formats
#     extract_charts=True,  # Include chart data in the parsed output
#     extract_layout=False,  # Skip layout info, focusing purely on content
#     premium_mode=True,  # Enable premium processing for improved accuracy
#     page_separator="\n\n---\n\n",  # Separate pages clearly for retrieval
#     max_pages=None,  # Process the entire document
#     continuous_mode=False,  # Avoid continuous mode; keep pages distinct
# )


# # Parse the IBC document
# parsed_documents = parser.load_data("/content/IBC.pdf")

# # Save the parsed results to a markdown or any preferred format
# with open('IBC.md', 'w') as f:
#     for doc in parsed_documents:
#         f.write(doc.text + '\n')


In [12]:
!pip install faiss-gpu-cu12

Collecting faiss-gpu-cu12
  Downloading faiss_gpu_cu12-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading faiss_gpu_cu12-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu-cu12
Successfully installed faiss-gpu-cu12-1.9.0.post1


# Embedding and Vector Store setup

In [13]:
import os

# `!export VAR=...` runs in a throwaway subshell, so the variable never reaches the
# Python kernel. Setting it through os.environ makes it visible to this process.
# NOTE(review): the allocator may only read this setting once, so it should be set
# before any CUDA memory is allocated; confirm this cell runs early enough.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [14]:
# Ask PyTorch to release its cached, currently unused GPU memory.
torch.cuda.empty_cache()


In [15]:
# Re-check GPU memory usage after clearing the cache.
!nvidia-smi


Thu Jan 16 23:04:20 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              47W / 400W |      5MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [20]:
import nltk
# Download the NLTK tokenizer and POS-tagger data.
# NOTE(review): presumably required by the `unstructured` markdown loader used
# later in the notebook; confirm which step actually needs them.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [21]:
from transformers import AutoTokenizer, AutoModel
import faiss
import torch
import numpy as np
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFacePipeline
from transformers import pipeline

# Step 1: Load the parsed markdown document
# "IBC.md" is resolved relative to the current working directory.
loader = UnstructuredMarkdownLoader("IBC.md")
docs = loader.load()

# Step 2: Split documents into chunks
# chunk_size / chunk_overlap are presumably measured in characters (splitter default); verify.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
texts = text_splitter.split_documents(docs)

# Step 3: Load the BGE model for embeddings
model_name = "BAAI/bge-m3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_model = AutoModel.from_pretrained(model_name)

# Move the model to GPU
# Note: this rebinds `device` to a plain string; an earlier cell bound it to a torch.device.
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = embedding_model.to(device)

# Adjust the embedding generation code
def get_embeddings(texts, model, tokenizer, batch_size=32):
    """
    Generate fixed-size embeddings for texts by mean-pooling the encoder's token states.

    Padding positions are excluded from the mean (via the attention mask), so a
    text's embedding does not depend on which other texts share its batch.

    Args:
        texts: A single string or a sequence of strings to embed.
        model: Encoder called as ``model(**inputs)`` whose output exposes
            ``last_hidden_state``. Inputs are moved to the device of its parameters.
        tokenizer: Tokenizer matching ``model``; called with padding and
            truncation to at most 512 tokens.
        batch_size: Number of texts encoded per forward pass, so a large corpus
            does not have to fit on the device as one batch.

    Returns:
        A float32 ``numpy.ndarray`` with one row per input text.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is not positive.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one string")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Follow the model's own placement instead of relying on a notebook-level global.
    model_device = next(model.parameters()).device

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch, padding=True, truncation=True, return_tensors="pt", max_length=512
        ).to(model_device)

        with torch.no_grad():
            outputs = model(**inputs)
            # Pool in float32 so half-precision models do not lose accuracy in the sum.
            hidden = outputs.last_hidden_state.float()
            # Zero out padding tokens and divide by the number of real tokens per text.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1.0)
            pooled = (hidden * mask).sum(dim=1) / token_counts

        # Move each batch back to the CPU right away to keep device memory bounded.
        pooled_batches.append(pooled.cpu().numpy())

    return np.concatenate(pooled_batches, axis=0)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

In [22]:

# Generate embeddings for the document chunks
# All chunk texts are passed to get_embeddings in a single call.
corpus_embeddings = get_embeddings([doc.page_content for doc in texts], embedding_model, tokenizer)

# Step 4: Initialize a FAISS vector store
embedding_dim = corpus_embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(embedding_dim)  # L2 distance for similarity search

# Cast the embeddings to float32 before handing them to FAISS.
corpus_embeddings = corpus_embeddings.astype(np.float32)
# Ensure the array uses a contiguous memory layout.
corpus_embeddings = np.ascontiguousarray(corpus_embeddings)


# Add embeddings to the vector store; row i of the index corresponds to texts[i].
index.add(corpus_embeddings)

def retrieve_documents(query, k=2):
    """
    Retrieve the top-k most similar document chunks for a given query.

    Args:
        query: The question text to embed and search for.
        k: Maximum number of chunks to return.

    Returns:
        A list of chunk texts (``page_content``) ordered from nearest to farthest
        by L2 distance. It holds fewer than ``k`` items when the index contains
        fewer than ``k`` vectors.
    """
    query_embedding = get_embeddings([query], embedding_model, tokenizer)
    # FAISS searches on a C-contiguous float32 matrix.
    query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)
    distances, indices = index.search(query_embedding, k)
    # FAISS fills missing neighbours with -1; without this filter Python's negative
    # indexing would silently return the last chunk instead.
    results = [texts[i].page_content for i in indices[0] if i >= 0]
    return results

# Step 5: Load the Qwen Model for Text Generation
# NOTE(review): "Qwen/Qwen2.5-7B" has no "-Instruct" suffix, while the prompt template
# defined later uses <|im_start|>/<|im_end|> chat markers. Confirm that this
# checkpoint is the intended one for chat-formatted prompts.
qwen_pipe = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-7B",
    tokenizer="Qwen/Qwen2.5-7B",
    device=0  # Use GPU
)
# Wrap the transformers pipeline so it can be used as a LangChain LLM.
qwen_llm = HuggingFacePipeline(pipeline=qwen_pipe)


config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Device set to use cuda:0


In [27]:
# Step 6: Define the Prompt Template
# The template takes two variables:
#   context - the retrieved text joined into one string
#   query   - the user's question
# `context` was declared (and is supplied by the caller) but was missing from the
# template text, so retrieved material never reached the model. It is now inserted
# just before the query.
prompt = PromptTemplate(
    input_variables=["context", "query"],
    template=(
        "<|im_start|>user\n"
        "You are Qwen, created by Alibaba Cloud. You are a helpful assistant. "
        "You have extensive knowledge of the IBC 2018 International Building Code. "
        "Answer the following query based on your knowledge of the IBC, as if you are already familiar with the content. "
        "Do not mention or reference any specific document or context. Just provide a direct and concise answer. \n"
        "Context: {context}\n"
        "Query: {query}\n"
        "<|im_end|>\n\n"
        "<|im_start|>assistant\n"
    ),
)


In [45]:
import torch
from transformers import pipeline
from langchain.document_loaders import UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate

# Define the prompt template for summarization
def summarize_chunk(chunk, llm, max_new_tokens=300):
    """
    Summarize one text chunk as bullet points using a text-generation callable.

    Args:
        chunk: The text to summarize.
        llm: Callable invoked as ``llm(prompt, max_new_tokens=..., truncation=True)``
            that returns a list whose first item has a ``"generated_text"`` entry.
        max_new_tokens: Generation budget forwarded to ``llm``.

    Returns:
        The summary text with the echoed prompt removed. A blank ``chunk`` yields
        ``""`` without calling ``llm``.
    """
    # Nothing to summarize: skip the model call rather than let it invent content.
    if not chunk or not chunk.strip():
        return ""

    summary_prompt = (
        "Provide a concise summary of the following text in bullet points. Be sure to include all essential information "
        "without leaving out key details. Avoid unnecessary explanations or introductory phrases.\n\n"
        f"{chunk}\n\n"
        "Summary:"
    )

    response = llm(summary_prompt, max_new_tokens=max_new_tokens, truncation=True)
    generated = response[0]["generated_text"]

    if generated.startswith(summary_prompt):
        # The output echoes the prompt: drop exactly that prefix so a "Summary:"
        # occurring inside the chunk or inside the generated text cannot cut it short.
        summary = generated[len(summary_prompt):].strip()
        if summary.startswith("Summary:"):
            summary = summary[len("Summary:"):].strip()
    elif "Summary:" in generated:
        # Prompt not echoed verbatim (e.g. it was truncated): fall back to the label.
        summary = generated.split("Summary:")[-1].strip()
    else:
        summary = generated

    return summary

# Load the parsed markdown document
# This repeats the load performed in the embedding cell and rebinds `loader` and `docs`.
loader = UnstructuredMarkdownLoader("IBC.md")
docs = loader.load()

# Split documents into chunks
# Same chunk_size / chunk_overlap as the embedding cell; rebinds `text_splitter` and `texts`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
texts = text_splitter.split_documents(docs)

# Summarise every chunk in order, echoing each result as soon as it is produced.
summarized_chunks = []
for chunk_number, doc in enumerate(texts, start=1):
    summarized_chunk = summarize_chunk(doc.page_content, qwen_pipe)
    summarized_chunks.append(summarized_chunk)
    print(f"Summary of Chunk {chunk_number}:\n{summarized_chunk}\n\n---\n")

# Persist the summaries to markdown, each under its own numbered heading.
with open('summarized_chunks.md', 'w') as f:
    f.writelines(
        f"### Summary of Chunk {position}\n\n{text}\n\n---\n\n"
        for position, text in enumerate(summarized_chunks, start=1)
    )

print("Summarized chunks have been saved to 'summarized_chunks.md'.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
- The dimensions range from 1-2 to 10-2, with some dimensions listed multiple times.
- The lumber is used in various combinations, such as 2x2x4, 2x2x6, 2x2x8, 2x2x10, and 2x2x12.
- The lumber is used in various clear span configurations, such as 2x2x4, 2x2x6, 2x2x8, 2x2x10, and 2x2x12.
- The lumber is used in various clear span configurations, such as 2x2x4, 2x2x6, 2x2x8, 2x2x10, and 2x2x12.
- The lumber is used in various clear span configurations, such as 2x2x4, 2x2x6, 2x2x8, 2x2x10, and 2x2x12.
- The lumber is used in various clear span configurations, such as 2x2x4, 2x2x6, 2x

---

Summary of Chunk 1749:
- 2-2 × 8, 4-3, 2, 3-3, 2, 2-8, 2, 4-3, 2, 3-3, 2, 2-8, 2, 4-1, 2, 3-2, 2, 2-8, 3
- 2-2 × 10, 5-0, 2, 3-10, 2, 3-2, 3, 5-0, 2, 3-10, 2, 3-2, 3, 4-10, 2, 3-9, 3, 3-2, 3
- 2-2 × 12, 5-11, 2, 4-6, 3, 3-9, 3, 5-11, 2, 4-6, 3, 3-9, 3, 5-8, 2, 4-5, 3, 3-9, 3
- 3-2 × 8, 5-3, 1, 4-0, 2, 3-5, 2, 5-3, 2, 4-0, 2,

---

Summary 

In [46]:
# Write the bare summaries (no headings), separated by horizontal rules.
with open('summarized_chunks_rag.md', 'w') as f:
    f.write("".join(f"{summary}\n\n---\n\n" for summary in summarized_chunks))


In [25]:
import gradio as gr

In [30]:
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from langchain_huggingface import HuggingFacePipeline
import gradio as gr

# Updated retrieve_documents function
def retrieve_documents(query, k=2, device="cuda"):
    """
    Retrieve the top-k summarized chunks closest to a query in the FAISS index.

    This definition replaces the earlier ``retrieve_documents`` in the notebook
    and returns a tuple rather than a plain list.

    Args:
        query: The question text to embed and search for.
        k: Maximum number of summaries to return.
        device: Device the embedding model and the tokenized query are moved to.

    Returns:
        ``(results, similarities)``: ``results`` is a list of summary strings and
        ``similarities`` is a NumPy array of the matching scores returned by
        ``index.search``. The index is an ``IndexFlatL2``, so these are L2
        distances (smaller means closer), not cosine similarities. Both hold fewer
        than ``k`` items when the index contains fewer than ``k`` vectors.
    """
    embedding_model.to(device)

    # Tokenize the query
    encoded_query = tokenizer(query, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        # Mean-pool the token states into one vector, as done for the corpus embeddings
        query_embedding = embedding_model(**encoded_query).last_hidden_state.mean(dim=1)

    # FAISS searches on a C-contiguous float32 matrix
    query_embedding = np.ascontiguousarray(query_embedding.cpu().numpy(), dtype=np.float32)

    # Perform the nearest-neighbour search in the FAISS index
    distances, indices = index.search(query_embedding, k)

    # FAISS fills missing neighbours with -1; drop them so negative indexing cannot
    # silently return the last summary.
    found = indices[0] >= 0

    # The summaries live in `summarized_chunks` (built by the summarization cell);
    # the previously referenced name `summaries` is not defined in this notebook.
    # NOTE(review): assumes summarized_chunks[i] corresponds to index row i — confirm.
    results = [summarized_chunks[i] for i in indices[0][found]]
    similarities = distances[0][found]

    return results, similarities

# Define the function to process queries
def query_rag_system(query, device="cuda"):
    """
    Answer a user query: retrieve summarized chunks, build the prompt, run the LLM.

    Args:
        query: The user's question. A blank or missing query returns ``""``
            without running retrieval or generation.
        device: Device forwarded to ``retrieve_documents``.

    Returns:
        A string containing the model response followed by the retrieval scores.
    """
    # The live interface can submit empty text; skip the expensive pipeline for it.
    if not query or not query.strip():
        return ""

    # Retrieve relevant summaries and their scores
    retrieved_summaries, similarities = retrieve_documents(query, device=device)

    # Concatenate summaries to form context
    context = " ".join(retrieved_summaries)

    # Debugging: print the context and scores
    print(f"Context for query '{query}':\n{context}\n")
    print(f"Similarity scores for retrieved summaries: {similarities}")

    # Format the context and query for the model
    formatted_input = prompt.format(context=context, query=query)

    # `invoke` is the supported way to call a LangChain LLM; calling the object
    # directly is deprecated.
    model_response = qwen_llm.invoke(formatted_input)

    # Return the response and scores
    return f"Model Response:\n{model_response}\n\nSimilarity Scores: {similarities}"

# Create Gradio interface: one text box in, one text box out, backed by query_rag_system.
interface = gr.Interface(
    fn=query_rag_system,  # Function to handle user input and query the RAG system
    inputs=gr.Textbox(label="Enter your query", placeholder="Ask a question..."),  # User input field
    outputs=gr.Textbox(label="RAG System Answer", lines=20),  # Output the response
    live=True,  # NOTE(review): live mode presumably re-runs retrieval and generation on every input change — confirm this is wanted for a 7B model
    title="RAG Query Interface",
    description="Enter a question, and the system will provide an answer based on the retrieved summaries."
)

# Launch the Gradio interface
# NOTE(review): share=True presumably publishes a public URL and debug=True keeps the cell running — confirm both are intended.
interface.launch(debug=True, share=True)


Both `max_new_tokens` (=2048) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=2048) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


KeyboardInterrupt: 