<a href="https://colab.research.google.com/github/etuckerman/SOCOTEC/blob/main/SOCOTEC_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

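# RAG Pipeline

This notebook builds a small retrieval-augmented generation (RAG) pipeline: it parses `IBC.pdf` to markdown with LlamaParse, splits the markdown into chunks and embeds them with `all-MiniLM-L6-v2` into a Chroma vector store, then answers questions with `Qwen/Qwen2.5-7B` through a LangChain `RetrievalQA` chain.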

In [None]:
!pip install llama_parse huggingface_hub langchain chromadb nest_asyncio


Collecting llama_parse
  Downloading llama_parse-0.5.19-py3-none-any.whl.metadata (7.0 kB)
Collecting chromadb
  Downloading chromadb-0.6.1-py3-none-any.whl.metadata (6.8 kB)
Collecting llama-index-core>=0.11.0 (from llama_parse)
  Downloading llama_index_core-0.12.10.post1-py3-none-any.whl.metadata (2.5 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.7.5-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxrunt

# Loading and Preprocessing

In [None]:
import getpass
import os

import nest_asyncio
from llama_parse import LlamaParse

# Input PDF (uploaded to the Colab session) and the markdown file the parse is written to.
PDF_PATH = "/content/IBC.pdf"
MARKDOWN_PATH = "IBC.md"

# Patch asyncio so LlamaParse can run inside the notebook's already-running event loop.
nest_asyncio.apply()

# Never hardcode the LlamaCloud key in the notebook: read it from the environment and
# fall back to an interactive prompt, so the secret is not saved with the file.
# SECURITY: any key that was ever committed in this cell must be treated as leaked
# and revoked in the LlamaCloud dashboard.
llama_api_key = os.environ.get("LLAMA_CLOUD_API_KEY") or getpass.getpass("LlamaCloud API key: ")

# Initialize the LlamaParse parser (markdown output, English documents).
parser = LlamaParse(
    api_key=llama_api_key,
    result_type="markdown",
    language="en",
    verbose=True,
    is_formatting_instruction=False,
)

# Parse the PDF into a list of documents.
parsed_documents = parser.load_data(PDF_PATH)

# Fail here, with a clear message, rather than writing an empty markdown file that
# would only surface as an obscure error in the embedding step.
if not parsed_documents:
    raise RuntimeError(f"LlamaParse returned no documents for {PDF_PATH!r}")

# Save the parsed text as markdown, one document per line group.
# The encoding is explicit so the output does not depend on the platform default.
with open(MARKDOWN_PATH, "w", encoding="utf-8") as markdown_file:
    for doc in parsed_documents:
        markdown_file.write(doc.text + "\n")


# Embedding and Vector Store

In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.14-py3-none-any.whl.metadata (2.9 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.14 (from langchain-community)
  Downloading langchain-0.3.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.29 (from langchain-community)
  Downloading langchain_core-0.3.29-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Downloading langchain_community-0.3.14-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m83.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Downloading langchain-0.3.14-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
!pip install unstructured

Collecting unstructured
  Downloading unstructured-0.16.12-py3-none-any.whl.metadata (24 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2024.10.22-py3-none-any.whl.metadata (13 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz (from unstructured)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting unstructured-client (from un

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# The integrations live in langchain_community; the old `langchain.embeddings`,
# `langchain.vectorstores` and `langchain.document_loaders` paths are deprecated shims.
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Init embeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Load the markdown file written by the parsing step
loader = UnstructuredMarkdownLoader("IBC.md")
docs = loader.load()

# Split the documents into smaller chunks for efficient retrieval
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)

# An empty chunk list means the parse produced no usable text; stop here with a clear
# message instead of handing an empty collection to the vector store.
if not texts:
    raise ValueError("No text chunks were produced from IBC.md; check the parsing step.")

# Create an in-memory Chroma vector store holding the embedded chunks
vectorstore = Chroma.from_documents(texts, embeddings)


  vectorstore.persist()


# Model Setup

In [None]:
from transformers import pipeline

# Upper bound on generated tokens per answer. Without a cap a single query can run
# for a very long time (the last query in this notebook had to be interrupted by hand).
MAX_NEW_TOKENS = 512

# Load Qwen 2.5 7B model using Hugging Face's pipeline
# NOTE(review): this is the base checkpoint; an instruction-tuned variant is presumably
# a better fit for question answering — confirm before switching.
qwen_model = "Qwen/Qwen2.5-7B"
qwen_pipe = pipeline(
    "text-generation",
    model=qwen_model,
    tokenizer=qwen_model,
    max_new_tokens=MAX_NEW_TOKENS,
    # Return only the completion. By default the pipeline returns prompt + completion,
    # which is why the printed "Answer" used to start with the RetrievalQA prompt text.
    return_full_text=False,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Device set to use cuda:0


## LangChain for RAG

In [None]:
from langchain.chains import RetrievalQA
# Import the wrapper from langchain_community; `langchain.llms` is a deprecated shim.
from langchain_community.llms import HuggingFacePipeline

# Wrap the Qwen transformers pipeline so LangChain can call it as an LLM
qwen_llm = HuggingFacePipeline(pipeline=qwen_pipe)

# Set up the RetrievalQA chain, linking the vector store and the Qwen model.
# "stuff" places all retrieved chunks into a single prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=qwen_llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
)

# Test the setup with a question.
# `Chain.run` is deprecated; `invoke` takes the chain's input key ("query") and
# returns a dict whose "result" entry is the answer string.
question = "What is the main topic of the document?"
response = qa_chain.invoke({"query": question})["result"]

# Output the response
print("Answer:", response)


  qwen_llm = HuggingFacePipeline(pipeline=qwen_pipe)
  response = qa_chain.run(question)


Answer: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Important Sections for Student or User Questions:

Copyright and Licensing: Important for understanding the legal use of the document.

Access Information: Useful for users who may need to reference how the document was accessed or who accessed it.

General Information about the IBC: While not detailed in the text provided, the IBC typically includes regulations and guidelines for building construction, safety, and compliance.

Potential Questions from Students or Users:

What are the legal implications of using the IBC document?

How can I access the full version of the 2018 International Building Code?

What are the main topics covered in the International Building Code?

For further assistance, users may need to refer to specific sections of the IBC or inquire about particular building regulations or codes.

Impor

## Full RAG System

In [None]:
# Ask a question
question = "Explain the key concepts discussed in the document?"

# Get the answer from the system.
# `Chain.run` is deprecated; `invoke` takes the chain's input key ("query") and
# returns a dict whose "result" entry is the answer string.
response = qa_chain.invoke({"query": question})["result"]

# Print the answer
print("Answer:", response)


KeyboardInterrupt: 