In [None]:
# Install the notebook's dependencies into the running kernel's environment.
# NOTE(review): versions are not pinned, and `unstructured[md]` / `nltk` are not
# referenced by any cell visible in this notebook — confirm they are still needed.
%pip install "unstructured[md]" nltk langchain-text-splitters

## **Single Document**

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain.document_loaders import TextLoader

# Load one Markdown file as raw text so the Markdown syntax (headers) is kept
# intact for the header-based splitter used in the next cell.
markdown_path = "./Processed_Files_Introduction_to_End/CANopen_Integration_7012_V10_Mar11.md"
loader = TextLoader(markdown_path, autodetect_encoding=True)
# `doc` is the list returned by the loader; the loaded document is element 0.
# NOTE(review): the name `doc` is rebound to single items by loops further down
# the notebook, so this cell must be re-run before re-running the split cell.
doc = loader.load()
doc[0]

In [None]:
# Header levels to split on, paired with the metadata key each level is stored
# under. Kept identical to the list used in the "All Documents" section so the
# single-file preview shows the same chunks the batch run will produce
# (level-2 headers were previously missing here).
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4"),
]

# MD splits: strip_headers=False keeps the header line inside the chunk text.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)
md_header_splits = markdown_splitter.split_text(doc[0].page_content)
md_header_splits

In [None]:
# Char-level splits: second pass that cuts the header-based sections further
# with a character-based splitter.
chunk_size = 1000  # passed to the splitter as `chunk_size`
chunk_overlap = 200  # passed to the splitter as `chunk_overlap`
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Split the header-level documents produced in the previous cell and display them.
splits = text_splitter.split_documents(md_header_splits)
splits

## **All Documents**

In [77]:
import glob
import os
from langchain_core.documents import Document

In [78]:
# 1. Path/pattern for markdown files
folder_path = "./Processed_Files_Introduction_to_End/*.md"

# 2. Parameters for the Header Splitter
#    (strip_headers=False keeps the header line inside the chunk text)
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)

# 3. Parameters for the character-level splitter
chunk_size = 1000
chunk_overlap = 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Final list for all splits from all files
all_splits = []

# sorted(): glob yields files in filesystem order, which differs between
# machines; a fixed order keeps the chunk sequence (and therefore the merge
# step that follows) reproducible across runs.
for file_path in sorted(glob.glob(folder_path)):
    # Base file name without directory and extension; stored on every chunk
    # as "name_file". Computed once per file.
    filename = os.path.splitext(os.path.basename(file_path))[0]

    # a) Load the file as raw text (preserving Markdown)
    loader = TextLoader(file_path, autodetect_encoding=True)
    docs = loader.load()  # usually returns a list [Document]

    # b) For each Document, perform two splits: by header, then by characters
    for doc in docs:
        md_header_splits = markdown_splitter.split_text(doc.page_content)
        splits = text_splitter.split_documents(md_header_splits)

        # c) Only `page_content` is handed to the header splitter, so the
        #    loader's own metadata is not forwarded to the chunks; the file
        #    of origin is therefore attached explicitly here.
        for splitted_doc in splits:
            splitted_doc.metadata["name_file"] = filename

        all_splits.extend(splits)

In [None]:
min_size = 1000  # Minimum size (in characters) for the merged chunks


def merge_small_chunks(documents, min_size):
    """Greedily concatenate consecutive chunks until each reaches ``min_size``.

    Chunks are consumed in order. Text is accumulated (joined with a newline)
    until the accumulated length is at least ``min_size`` characters, at which
    point one merged ``Document`` is emitted. A merged document keeps the
    metadata of the FIRST chunk that went into it; metadata of the following
    chunks is discarded.

    Chunks are never merged across files: when the ``name_file`` metadata of
    the incoming chunk differs from the buffered one, the buffer is emitted
    as-is (even if shorter than ``min_size``) so that text is not attributed
    to the wrong file.

    Args:
        documents: Iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (dict).
        min_size: Minimum number of characters a merged chunk should have.

    Returns:
        A list of new ``Document`` objects. Only the trailing chunk of each
        file may be shorter than ``min_size``.
    """
    merged = []
    pending_text = ""  # non-empty string <=> a buffer is being accumulated
    pending_meta = {}

    for doc in documents:
        text = doc.page_content.strip()
        if not text:
            # Whitespace-only chunks carry no information; skip them.
            continue

        # Flush at file boundaries so a merged chunk never mixes files.
        if pending_text and pending_meta.get("name_file") != doc.metadata.get("name_file"):
            merged.append(Document(page_content=pending_text, metadata=pending_meta))
            pending_text, pending_meta = "", {}

        if not pending_text:
            if len(text) >= min_size:
                # Already large enough: emit directly.
                merged.append(Document(page_content=text, metadata=dict(doc.metadata)))
            else:
                # Start a new buffer; copy the metadata so the merged
                # document does not share a dict with the source chunk.
                pending_text, pending_meta = text, dict(doc.metadata)
            continue

        # A buffer exists: append the chunk, separated by a line break.
        pending_text += "\n" + text
        if len(pending_text) >= min_size:
            merged.append(Document(page_content=pending_text, metadata=pending_meta))
            pending_text, pending_meta = "", {}

    # If there is remaining content in the buffer, add it as well.
    if pending_text:
        merged.append(Document(page_content=pending_text, metadata=pending_meta))

    return merged


merged_documents = merge_small_chunks(all_splits, min_size)

# Example of printing the results:
for i, document in enumerate(merged_documents):
    print(f"Document {i+1}:")
    print("Metadata:", document.metadata)
    print("Content (first 200 characters):", document.page_content[:200])
    print("-" * 50)

In [None]:
# Number of chunks produced by the two-stage split, before merging.
print(f"Generated {len(all_splits)} chunks in total.")

In [None]:
# Number of chunks left after merging small chunks (same message text as the
# pre-merge count, so compare the two numbers with care).
print(f"Generated {len(merged_documents)} chunks in total.")

In [None]:
# Display the full list of merged documents.
# NOTE(review): this dumps every chunk into the cell output; consider a slice.
merged_documents

## **RAG**

In [2]:
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain_ollama import ChatOllama
from typing_extensions import TypedDict
from typing import List
from langchain.schema import Document
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field
from langgraph.graph import START, StateGraph
from IPython.display import Image, display
from langchain_core.output_parsers import StrOutputParser
from pprint import pprint
from langchain_core.prompts import ChatPromptTemplate
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
import textwrap

In [3]:
# Embedding model used for the vector store (served through Ollama; another
# embeddings class such as OpenAIEmbeddings could be substituted).
embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")

# Disabled: build an in-memory local vector store (FAISS) from scratch.
# NOTE(review): after the "All Documents" loop, `splits` only holds the chunks
# of the last processed file; `merged_documents` (or `all_splits`) is
# presumably what should be indexed — confirm before re-enabling.
# # Create an in-memory local vector store (FAISS)
# vector_store = FAISS.from_documents(splits, embedding=embeddings)

In [4]:
# Load a previously saved FAISS index from the local "faiss_index" directory.
# SECURITY: `allow_dangerous_deserialization=True` opts in to unpickling the
# stored index data; only load index files you created yourself.
# NOTE(review): no visible cell builds or saves "faiss_index", so a fresh
# Restart & Run All depends on that directory already existing on disk.
vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [5]:
# Chat model used by every chain below (query decomposition, per-question
# answering and final synthesis); temperature is set to 0.
llm = ChatOllama(temperature=0, model="gemma3:12b")

In [6]:
# Prompt that asks the model to break the input question into 5 PROFIBUS-only
# sub-questions / search queries. `{question}` is the only template variable.
# NOTE: the name `template` is reused for other prompts in later cells.
template = """You are a helpful assistant that generates multiple sub-questions related to an input question about PROFIBUS-related documents and specifications in the context of industrial automation. \n

IMPORTANT: FOCUS **ONLY ON PROFIBUS**.  
DO NOT INCLUDE **ANYTHING** ABOUT PROFINET.  
DO NOT INCLUDE **ANYTHING** ABOUT CANOPEN.  
THIS IS **ONLY** ABOUT PROFIBUS.  
REPEAT: **PROFIBUS ONLY**.  
IGNORE ALL OTHER PROTOCOLS.

The goal is to break down the input into a set of sub-problems / sub-questions that can be answered in isolation. \n
Generate multiple search queries related to: {question} \n
Output (5 queries):
"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [None]:
def _split_queries(llm_output):
    """Turn the raw completion into a list with one query per non-empty line.

    Blank lines (which models commonly emit between numbered items) are
    dropped so that no empty string is later sent to the retriever or the
    LLM as a "sub-question". Any introductory line the model adds is kept,
    since it cannot be told apart from a query reliably.
    """
    return [line.strip() for line in llm_output.split("\n") if line.strip()]


# Chain: prompt -> LLM -> plain string -> list of sub-questions
generate_queries_decomposition = (
    prompt_decomposition | llm | StrOutputParser() | _split_queries
)

# Run
# Exactly one `question` assignment below should be active; the others are the
# remaining evaluation questions.
# 1. What is a GSD file, where can you get it, and what is its function?
question = "What is a GSD file, where can you get it, and what is its function?"

# # 2. What is a token message and how and which devices use it?
# question = "What is a token message and how and which devices use it?"

# 3. Which protocol parameter describes the slave's action time after receiving a message?
# question = "Which protocol parameter describes the slave's action time after receiving a message?"

# 4. How to calculate DP cycle time?
# question = "How to calculate DP cycle time?"

# 5. Briefly explain the function of a watchdog timer in a PROFIBUS DP slave.
# question = "Briefly explain the function of a watchdog timer in a PROFIBUS DP slave."

# 6. Briefly explain the meaning of “sync” and “freeze” modes. Which device(s) must support these modes?
# question = "Briefly explain the meaning of “sync” and “freeze” modes. Which device(s) must support these modes?"

# 7. Discuss the factors that have a significant effect on the overall cycle time of a DP network.
# question = "Discuss the factors that have a significant effect on the overall cycle time of a DP network."

# 8. What are the four operating modes of a DP class 1 master? Briefly explain the interaction between the master and its assigned slaves in each mode.
# question = "What are the four operating modes of a DP class 1 master? Briefly explain the interaction between the master and its assigned slaves in each mode."

# 9. The data unit of a configuration telegram contains the following 3 bytes represented in hexadecimal notation. Decode the meaning of the bytes. \n- Byte 0:  D1 \n- Byte 1:  23 \n- Byte 2:  70
# question = "The data unit of a configuration telegram contains the following 3 bytes represented in hexadecimal notation. Decode the meaning of the bytes. \n- Byte 0:  D1 \n- Byte 1:  23 \n- Byte 2:  70"

# 10. Briefly explain how a slave device, which is in cyclic data exchange, communicates the presence of a diagnostic fault to its controlling master. What does the master do about this?
# question = "Briefly explain how a slave device, which is in cyclic data exchange, communicates the presence of a diagnostic fault to its controlling master. What does the master do about this?"

questions = generate_queries_decomposition.invoke({"question":question})

In [None]:
# Inspect the sub-questions generated for the active `question`.
questions

In [65]:
# Prompt for answering ONE sub-question. Template variables:
#   {question}   - the sub-question being answered (appears twice)
#   {q_a_pairs}  - previously answered sub-questions with their answers
#   {context}    - retrieved context for the sub-question
# NOTE: this overwrites the `template` variable used for the decomposition prompt.
template = """Here is the question you need to answer in the context of industrial automation and PROFIBUS-related documents and specifications:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question: 

\n --- \n {context} \n --- \n

IMPORTANT: FOCUS **ONLY ON PROFIBUS**.  
DO NOT INCLUDE **ANYTHING** ABOUT PROFINET.  
DO NOT INCLUDE **ANYTHING** ABOUT CANOPEN.  
THIS IS **ONLY** ABOUT PROFIBUS.  
REPEAT: **PROFIBUS ONLY**.  
IGNORE ALL OTHER PROTOCOLS.

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""

decomposition_prompt = ChatPromptTemplate.from_template(template)

In [66]:
def format_qa_pair(question, answer):
    """Render one question and its answer as a ``Question: ... / Answer: ...`` block.

    Args:
        question: The question text.
        answer: The answer text.

    Returns:
        The two-line block with surrounding whitespace stripped.
    """
    pair_text = "Question: {}\nAnswer: {}\n\n".format(question, answer)
    return pair_text.strip()

In [67]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the list of Document objects would be interpolated into
    the prompt through its repr (metadata dicts, escaped newlines).
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The chain does not depend on the loop variable, so build it once.
# Input: {"question": ..., "q_a_pairs": ...}
rag_chain = (
    {
        "context": itemgetter("question") | vector_store.as_retriever() | _format_docs,
        "question": itemgetter("question"),
        "q_a_pairs": itemgetter("q_a_pairs"),
    }
    | decomposition_prompt
    | llm
    | StrOutputParser()
)

# Answer the sub-questions in order; each answer is appended to `q_a_pairs`
# so later sub-questions can build on earlier ones.
q_a_pairs = ""
for q in questions:
    # Skip blank entries (e.g. empty lines in the generated query list).
    if not q.strip():
        continue

    answer = rag_chain.invoke({"question": q, "q_a_pairs": q_a_pairs})
    q_a_pair = format_qa_pair(q, answer)
    q_a_pairs = q_a_pairs + "\n---\n" + q_a_pair

In [68]:
def wrap_paragraphs(text, width=80) -> str:
    """
    Wrap paragraphs of text to a given width, keeping list items on their own lines.

    Paragraphs are separated by blank lines (double newlines) and are wrapped
    independently. Inside a paragraph, single newlines are reflowed as before,
    except that a line starting with a Markdown list marker (``*``, ``-``,
    ``+`` or ``1.`` / ``1)`` followed by whitespace) begins a new wrapped
    block. This prevents bullet lists from being collapsed into one run-on
    paragraph.

    Args:
        text: The text to wrap.
        width: The width to wrap the text to.

    Returns:
        The wrapped text, with paragraphs still separated by blank lines.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    # A list marker must be followed by whitespace, so "**bold**" at the start
    # of a line is not mistaken for a bullet.
    list_item = re.compile(r"\s*(?:[*+-]|\d+[.)])\s+")

    wrapped_paragraphs = []
    for paragraph in text.split('\n\n'):  # split on double newlines
        # Group the paragraph's lines into blocks: the first line opens a
        # block, list-item lines open a new one, anything else continues the
        # current block and is reflowed with it.
        blocks = []
        for line in paragraph.split('\n'):
            if blocks and not list_item.match(line):
                blocks[-1] += "\n" + line
            else:
                blocks.append(line)
        wrapped_paragraphs.append(
            "\n".join(textwrap.fill(block, width=width) for block in blocks)
        )
    return "\n\n".join(wrapped_paragraphs)

In [69]:
# Print `answer` wrapped to 80 columns.
# NOTE(review): when run top to bottom, `answer` is the answer to the LAST
# sub-question handled by the sequential loop above, not a synthesized answer
# to the original `question`.
wrapped_answer = wrap_paragraphs(answer, width=80)
print(wrapped_answer)

The provided documentation does not detail the structure of the data bytes
within a PROFIBUS DPV1 diagnostic telegram. It references several PROFIBUS
guidelines, but these are described as providing installation profiles or
specifying communication profiles (CP 3/1, CP 3/3, CP 3/4, CP 3/5, and CP 3/6)
rather than outlining the internal structure of diagnostic telegram data.


In [70]:
# Prompt used to answer a single sub-question from retrieved context.
# Template variables: {question}, {context}.
# The raw template text is kept under its own name so `prompt_rag` always
# refers to the ChatPromptTemplate. (The original string opened with four
# double quotes, which put a stray `"` at the very start of the prompt.)
prompt_rag_template = """You are an AI language model assistant that understands PROFIBUS-related documents and specifications in the context of industrial automation.

IMPORTANT: FOCUS **ONLY ON PROFIBUS**.  
DO NOT INCLUDE **ANYTHING** ABOUT PROFINET.  
DO NOT INCLUDE **ANYTHING** ABOUT CANOPEN.  
THIS IS **ONLY** ABOUT PROFIBUS.  
REPEAT: **PROFIBUS ONLY**.  
IGNORE ALL OTHER PROTOCOLS.

### Instructions:
- Answer in a clear, informative, and technically accurate manner.
- Provide a **complete and relevant explanation**, but avoid excessive detail.
- Start with a **direct answer**, then expand with useful clarification, examples, or context if appropriate.
- Avoid unnecessary filler or repetition.
- Answer **in the same language as the question**.
- You can say "I don't know" if you don't know the answer.

### Now answer the following:

Question: {question}

Context: {context}

Answer (in the language of the question):  
"""

prompt_rag = ChatPromptTemplate.from_template(prompt_rag_template)

In [72]:
def retrieve_and_rag(question,prompt_rag,sub_question_generator_chain):
    """Answer each sub-question of ``question`` with its own retrieval + generation pass.

    Uses the notebook-level ``vector_store`` for retrieval and ``llm`` for
    generation.

    Args:
        question: The original user question.
        prompt_rag: Prompt template with ``{context}`` and ``{question}`` variables.
        sub_question_generator_chain: Runnable that maps ``{"question": ...}``
            to a list of sub-question strings.

    Returns:
        A tuple ``(rag_results, sub_questions)``: the answers and the
        sub-questions they belong to, in matching order. Blank sub-questions
        are dropped from both lists.
    """
    # Decompose the question; discard blank lines so that no empty query is
    # sent to the retriever or the LLM.
    generated = sub_question_generator_chain.invoke({"question":question})
    sub_questions = [q for q in generated if q.strip()]

    # Both objects are independent of the sub-question: build them once.
    retriever = vector_store.as_retriever()
    answer_chain = prompt_rag | llm | StrOutputParser()

    rag_results = []
    for sub_question in sub_questions:
        # Retrieve documents for this sub-question and pass only their text
        # as context (not the repr of the Document objects).
        retrieved_docs = retriever.invoke(sub_question)
        context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

        rag_results.append(
            answer_chain.invoke({"context": context_text, "question": sub_question})
        )

    return rag_results,sub_questions

# Run the per-sub-question RAG pass for the active `question` (a direct call;
# nothing is wrapped in a RunnableLambda here).
# Note: this rebinds `questions` to the sub-questions generated inside the call.
answers, questions = retrieve_and_rag(question, prompt_rag, generate_queries_decomposition)

In [None]:
# Inspect the per-sub-question answers together with their sub-questions.
answers, questions

In [74]:
def format_qa_pairs(questions, answers):
    """Render numbered question/answer pairs as one text block.

    Pairs are numbered from 1 and separated by a blank line; if the two
    sequences differ in length, the extra items of the longer one are ignored.
    Surrounding whitespace is stripped from the result.
    """
    numbered_pairs = [
        f"Question {idx}: {q_text}\nAnswer {idx}: {a_text}"
        for idx, (q_text, a_text) in enumerate(zip(questions, answers), start=1)
    ]
    return "\n\n".join(numbered_pairs).strip()

# Numbered Q+A text built from the sub-questions and their answers; this is
# what fills the {context} slot of the synthesis prompt below.
context = format_qa_pairs(questions, answers)

# Prompt for the final synthesis step. Template variables: {context}, {question}.
# NOTE(review): the forbidden-opening list sits between the "Here is a set of
# Q+A pairs:" lead-in and the pairs themselves — confirm this order is intended.
# NOTE: `template` is reused here and overwrites the earlier prompt text.
template = """Here is a set of Q+A pairs:

DO NOT — I REPEAT, DO NOT — START YOUR ANSWER WITH ANY OF THE FOLLOWING:
- "Here's a synthesized answer..."
- "Based on the provided Q&A pairs..."
- "Here's what I found..."
- "According to the context..."
- "Okay, here's a synthesized answer..."
OR ANY OTHER SIMILAR PHRASES. 

{context}

Use these to synthesize an answer (IN THE SAME LANGUAGE AS THE QUESTION) to the question: {question}

START THE ANSWER WITH A DIRECT RESPONSE TO THE QUESTION, THEN EXPAND WITH USEFUL CLARIFICATION, EXAMPLES, OR CONTEXT IF APPROPRIATE.
"""

prompt = ChatPromptTemplate.from_template(template)

# prompt -> LLM -> plain string
final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

# Synthesize the answer to the original `question` from the Q+A pairs.
final_answer = final_rag_chain.invoke({"context":context,"question":question})

## **Queries**

### 1 - What is a GSD file, where can you get it, and what is its function?

In [34]:
# Print the synthesized `final_answer`, wrapped to 80 columns.
# NOTE(review): this same cell is repeated under every query heading; its
# output depends on which `question` was active when the pipeline was last run.
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

A GSD file (short for Geräte-Stücklisten Datei, or Device Description File) is a
text file that describes a PROFIBUS device to a master controller, like a PLC or
distributed control system. Think of it as the device's "profile" for the
PROFIBUS network.

You can typically obtain GSD files from the device manufacturer. They are
usually available for download from the manufacturer's website or included in
the device's documentation. The PROFIBUS website
([http://www.profibus.com](http://www.profibus.com)) can also be a helpful
resource, although it primarily directs you to manufacturer resources.

The function of a GSD file is crucial for proper communication between a
PROFIBUS master and slave device. It contains vital information that the master
needs to configure and communicate with the device correctly. This information
includes:

*   **Device Identification:** Manufacturer, device name, and product code. *
**Module Information:** Details about the modules within the device. *   **D

### 2 - What is a token message and how and which devices use it?

In [22]:
# Print the synthesized `final_answer`, wrapped to 80 columns.
# NOTE(review): this same cell is repeated under every query heading; its
# output depends on which `question` was active when the pipeline was last run.
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

A token message in PROFIBUS refers to a special frame used to manage
communication on the bus, ensuring only one device transmits at a time. It's a
core element of the older, peer-to-peer PROFIBUS network topology (often
referred to as PROFIBUS DP in its original form), but its usage and significance
have diminished with the prevalence of master-slave architectures. Let's break
down what it is, how it's used, and which devices historically employed it.

**What is a Token Message?**

The token itself isn't the data being transmitted; it's a control signal. Think
of it like a "permission slip" to talk on the bus.  It contains information like
the destination address, source address, message type, and length.  The device
holding the token is the only one allowed to transmit data. Once the
transmission is complete, the token is released, allowing another device to
request it.

**How Token Messages are Used (Historically):**

In the original PROFIBUS DP network, devices would request the to

### 3 - Which protocol parameter describes the slave's action time after receiving a message?

In [58]:
# Print the synthesized `final_answer`, wrapped to 80 columns.
# NOTE(review): this same cell is repeated under every query heading; its
# output depends on which `question` was active when the pipeline was last run.
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

The protocol parameter that best describes the slave's action time after
receiving a message in PROFIBUS is its **reaction time**.

While the provided documentation doesn't explicitly define a specific "reaction
time parameter," this term encapsulates the time a slave device takes to process
data received from the master controller and prepare its response. It's a
crucial factor in overall system performance.

Here's a breakdown of why "reaction time" is the most appropriate term and how
it relates to other concepts:

*   **What it encompasses:** Slave reaction time includes the time for the slave
to read input data, perform any necessary calculations or control actions, and
then write output data back to the master. *   **Relationship to Cycle Time:**
The overall system latency (delay) is the sum of the PROFIBUS DP cycle time (the
time for the master to transmit data to all slaves and receive responses) and
the slave reaction time. Minimizing both is key for fast automation. *
**Facto

### 4 - How to calculate DP cycle time?

In [71]:
# Print the synthesized `final_answer`, wrapped to 80 columns.
# NOTE(review): this same cell is repeated under every query heading; its
# output depends on which `question` was active when the pipeline was last run.
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

**How to calculate DP cycle time?**

The DP cycle time isn't a simple calculation but rather a configuration
parameter that needs to be carefully considered based on system requirements and
network limitations. There's no single formula, but rather a process of
estimation and adjustment. Here's a breakdown of the factors involved and how to
approach it:

**1. Understanding the Components of Cycle Time:**

The total DP cycle time is the sum of several components:

*   **Transmission Time (T_trans):** This is the time it takes for the master to
transmit data to all slaves and receive responses. It depends on:     *   **Bus
Speed:** 1.5 Mbit/s or 3 Mbit/s. Higher speed reduces transmission time.     *
**Number of Bytes Transmitted:**  Each slave requires a certain number of bytes
for input data, output data, status words, and diagnostics.     *   **Number of
Slaves:** More slaves mean more data to transmit and receive. *   **Slave
Response Time (T_slave):** This is the time it takes for e

### 5 - Briefly explain the function of a watchdog timer in a PROFIBUS DP slave.

In [110]:
# Print the synthesized `final_answer`, wrapped to 80 columns.
# NOTE(review): this same cell is repeated under every query heading; its
# output depends on which `question` was active when the pipeline was last run.
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

Briefly, a watchdog timer in a PROFIBUS DP slave is a safety mechanism designed
to detect and respond to malfunctions that could prevent the slave from
communicating properly on the bus.

Here's a more detailed explanation: The PROFIBUS master periodically sends a
"heartbeat" signal to the slave device. The slave has a timer that must be
"kicked" or refreshed by acknowledging this heartbeat within a defined
timeframe. If the slave fails to acknowledge the heartbeat – perhaps due to a
software fault, hardware failure, or communication problem – the watchdog timer
expires. Upon expiration, the slave will typically enter a fail-safe state. This
might involve shutting down outputs, setting an alarm, or signaling an error
condition to the master.

The purpose of this mechanism is to prevent a malfunctioning slave from
continuing to operate in an unpredictable or potentially dangerous manner. It's
a crucial element in ensuring the safety and reliability of PROFIBUS systems,
especially in app

### 6 - Briefly explain the meaning of “sync” and “freeze” modes. Which device(s) must support these modes?

In [134]:
# Print the synthesized `final_answer`, wrapped to 80 columns.
# NOTE(review): this same cell is repeated under every query heading; its
# output depends on which `question` was active when the pipeline was last run.
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

"Sync" and "freeze" modes are functionalities within PROFIBUS networks primarily
used during installation, commissioning, and maintenance to simplify diagnostics
and troubleshooting. **No devices are *required* to support these modes, but
it's highly recommended, especially in critical applications.**

**Sync Mode:** This forces all slaves on the network to enter a defined state,
typically setting all outputs to a known, safe value. This allows for easy
visual inspection of the network's status and simplifies device identification
during commissioning. It's a diagnostic tool, not a normal operating mode. Think
of it as a way to temporarily put all devices into a predictable state for
inspection.

**Freeze Mode:** Similar to sync mode, freeze mode temporarily halts the
operation of devices. It's used to capture the state of the network at a
specific point in time, which can be helpful for diagnosing issues.

While not mandatory, support for these modes is common, particularly in PROFIBU

### 7 - Discuss the factors that have a significant effect on the overall cycle time of a DP network.

In [151]:
# Print the synthesized `final_answer`, wrapped to 80 columns.
# NOTE(review): this same cell is repeated under every query heading; its
# output depends on which `question` was active when the pipeline was last run.
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

The overall cycle time of a PROFIBUS DP network is significantly affected by a
number of interconnected factors, all of which need careful consideration during
network design and maintenance. These factors can be broadly categorized into
data volume, network topology and length, slave device characteristics, and
master controller configuration.

**1. Data Volume & Communication Profiles (RCPs):** The amount of data exchanged
between the master and each slave device is a primary driver of cycle time. Each
byte transmitted contributes to the overall load. Larger data blocks, whether
for process values, status information, or diagnostic data, directly increase
the time required for a complete scan. The chosen Communication Profile (RCP)
plays a crucial role here. Different RCPs dictate the data exchange parameters,
and some are more efficient than others in terms of data transmission. Selecting
an RCP that minimizes unnecessary data transmission is vital for keeping cycle
times low. For e

### 8 - What are the four operating modes of a DP class 1 master? Briefly explain the interaction between the master and its assigned slaves in each mode.

In [34]:
# Print the synthesized `final_answer`, wrapped to 80 columns.
# NOTE(review): this same cell is repeated under every query heading; its
# output depends on which `question` was active when the pipeline was last run.
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

The four operating modes of a DP Class 1 master are Initialization, Polling (or
Data Transfer), Error Handling, and Diagnostics.

**1. Initialization:** During this phase, the master configures itself and the
network. It establishes communication parameters, checks for connected devices,
and assigns addresses. Slave devices respond to the master's initialization
requests, confirming their presence and readiness. There's no direct data
exchange in this mode; it's purely about establishing a functional network.

**2. Polling (Data Transfer):** This is the primary mode of operation for a DP
Class 1 master. The master sequentially requests data from each slave device
connected to the network. The master sends a request frame to a specific slave,
and the slave responds with its data. This process repeats for each slave in a
predetermined order. The master controls the timing and sequence of data
exchange. This is a polled mode, meaning the master actively initiates
communication.  For examp

### 9 - The data unit of a configuration telegram contains the following 3 bytes represented in hexadecimal notation. Decode the meaning of the bytes. Byte 0: D1, Byte 1: 23, Byte 2: 70

In [19]:
# Print the synthesized `final_answer`, wrapped to 80 columns.
# NOTE(review): this same cell is repeated under every query heading; its
# output depends on which `question` was active when the pipeline was last run.
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

The data unit of a configuration telegram containing bytes D1, 23, and 70
represents a specific data area within a Siemens PLC (Programmable Logic
Controller) used for storing PROFIBUS network configuration data.

Here's a breakdown of what each byte signifies:

*   **Byte 0: D1** - This indicates data block D1 within the Siemens PLC's
memory. "D" signifies a data block, and "1" is the block number. This is the
starting point for the configuration data. *   **Byte 1: 23** - This is the byte
offset within data block D1. It specifies that the configuration data begins at
byte 23 within data block D1. *   **Byte 2: 70** - This represents the number of
consecutive bytes allocated to store the configuration data. In this case, 70
bytes are reserved for the PROFIBUS configuration information, starting at byte
23 of data block D1.

Therefore, the sequence "D1 23 70" defines a contiguous block of 70 bytes,
beginning at byte 23 within data block D1, dedicated to holding PROFIBUS
configuration i

### 10 - Briefly explain how a slave device, which is in cyclic data exchange, communicates the presence of a diagnostic fault to its controlling master. What does the master do about this?


In [48]:
# Print the synthesized `final_answer`, wrapped to 80 columns.
# NOTE(review): this same cell is repeated under every query heading; its
# output depends on which `question` was active when the pipeline was last run.
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

A slave device in cyclic data exchange communicates the presence of a diagnostic
fault to its controlling master primarily through error frames and diagnostic
data within the cyclic data transmission. When a fault occurs, the slave device
can insert an error frame into the communication stream, interrupting the normal
data flow and signaling the master to a problem. Alternatively, the fault
information can be encoded within a specific byte or set of bytes within the
slave’s cyclic data transmission – a designated area for diagnostic information.
This allows the master to receive the fault indication alongside the regular
data.

The master's response to this diagnostic fault varies depending on the severity
of the fault and the configured system behavior. Generally, the master will
acknowledge the error, potentially logging it for later analysis. It might also
attempt to recover from the fault by re-requesting data from the faulty slave,
essentially trying a retry. More sophisticated sy