In [1]:
# %pip install "unstructured[md]" nltk langchain-text-splitters

from dotenv import load_dotenv

load_dotenv()

True

## **Single Document**

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain.document_loaders import TextLoader

# Load
markdown_path = "./Processed_Files_Introduction_to_End/CANopen_Integration_7012_V10_Mar11.md"
loader = TextLoader(markdown_path, autodetect_encoding=True)
doc = loader.load()
doc[0]

In [None]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("###", "Header 3"),
    ("####", "Header 4"),
]

# MD splits
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)
md_header_splits = markdown_splitter.split_text(doc[0].page_content)
md_header_splits

In [None]:
# Char-level splits
chunk_size = 1000
chunk_overlap = 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Split
splits = text_splitter.split_documents(md_header_splits)
splits

## **All Documents**

In [77]:
import glob
import os
from langchain_core.documents import Document

In [78]:
# 1. Path/pattern for markdown files
folder_path = "./Processed_Files_Introduction_to_End/*.md"

# 2. Parameters for the Header Splitter
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, 
    strip_headers=False
)

# 3. Parameters for the character-level splitter
chunk_size = 1000
chunk_overlap = 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, 
    chunk_overlap=chunk_overlap
)

# Final list for all splits from all files
all_splits = []

for file_path in glob.glob(folder_path):
    # a) Load the file as raw text (preserving Markdown)
    loader = TextLoader(file_path, autodetect_encoding=True)
    docs = loader.load()  # usually returns a list [Document]
    
    # b) For each Document, perform two splits
    for doc in docs:
        md_header_splits = markdown_splitter.split_text(doc.page_content)
        splits = text_splitter.split_documents(md_header_splits)
        
        # c) Use the base file name as "name_file" (without the path and without extension)
        filename = os.path.splitext(os.path.basename(file_path))[0]

        for splitted_doc in splits:
            splitted_doc.metadata["name_file"] = filename
            # Optionally, remove the original "source" if not needed:
            # splitted_doc.metadata.pop("source", None)

        all_splits.extend(splits)

In [None]:
min_size = 1000  # Minimum size for the merged chunks
merged_documents = []  # List to store merged documents
buffer_content = ""
buffer_metadata = {}

for doc in all_splits:
    text = doc.page_content.strip()  # remove extra spaces if necessary
    # If there is no content accumulated in the buffer and the chunk is small,
    # initialize the buffer with this chunk.
    if not buffer_content and len(text) < min_size:
        buffer_content = text
        buffer_metadata = doc.metadata
    # If there is already content in the buffer, concatenate it with the current chunk.
    elif buffer_content:
        buffer_content += "\n" + text  # add a line break to separate the texts
        # When the buffer reaches or exceeds the minimum size, create a Document
        if len(buffer_content) >= min_size:
            merged_documents.append(
                Document(page_content=buffer_content, metadata=buffer_metadata)
            )
            buffer_content = ""
            buffer_metadata = {}
    # If the current chunk is already large enough and there is nothing in the buffer, add it directly.
    elif len(text) >= min_size:
        merged_documents.append(Document(page_content=text, metadata=doc.metadata))

# If there is remaining content in the buffer, add it as well.
if buffer_content:
    merged_documents.append(Document(page_content=buffer_content, metadata=buffer_metadata))

# Example of printing the results:
for i, document in enumerate(merged_documents):
    print(f"Document {i+1}:")
    print("Metadata:", document.metadata)
    print("Content (first 200 characters):", document.page_content[:200])
    print("-" * 50)

In [None]:
print(f"Generated {len(all_splits)} chunks in total.")

In [None]:
print(f"Generated {len(merged_documents)} chunks in total.")

In [None]:
merged_documents

## **RAG**

In [2]:
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain_ollama import ChatOllama
from typing_extensions import TypedDict
from typing import List
from langchain.schema import Document
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field
from langgraph.graph import START, StateGraph
from IPython.display import Image, display
from langchain_core.output_parsers import StrOutputParser
from pprint import pprint
from langchain_core.prompts import ChatPromptTemplate
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
import textwrap
from langchain.chat_models import ChatOpenAI

In [3]:
# Example embeddings (you can use OpenAIEmbeddings or another)
embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")

# # Create an in-memory local vector store (FAISS)
# vector_store = FAISS.from_documents(splits, embedding=embeddings)

In [4]:
vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [5]:
# Example of an LLM
# llm = ChatOllama(temperature=0, model="gemma3:12b")

llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

  llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")


In [6]:
template = """You are a helpful assistant that generates multiple sub-questions related to an input question about PROFIBUS-related documents and specifications in the context of industrial automation. \n

IMPORTANT: FOCUS **ONLY ON PROFIBUS**.  
DO NOT INCLUDE **ANYTHING** ABOUT PROFINET.  
DO NOT INCLUDE **ANYTHING** ABOUT CANOPEN.  
THIS IS **ONLY** ABOUT PROFIBUS.  
REPEAT: **PROFIBUS ONLY**.  
IGNORE ALL OTHER PROTOCOLS.

The goal is to break down the input into a set of sub-problems / sub-questions that can be answered in isolation. \n
Generate multiple search queries related to: {question} \n
Output (5 queries):
"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

In [None]:
# Chain
generate_queries_decomposition = ( prompt_decomposition | llm | StrOutputParser() | (lambda x: x.split("\n")))

# Run
# 1. What is a GSD file, where can you get it, and what is its function?
question = "What is a GSD file, where can you get it, and what is its function?"

# # 2. What is a token message and how and which devices use it?
# question = "What is a token message and how and which devices use it?"

# 3. Which protocol parameter describes the slave's action time after receiving a message?
# question = "Which protocol parameter describes the slave's action time after receiving a message?"

# 4. How to calculate DP cycle time?
# question = "How to calculate DP cycle time?"

# 5. Briefly explain the function of a watchdog timer in a PROFIBUS DP slave.
# question = "Briefly explain the function of a watchdog timer in a PROFIBUS DP slave."

# 6. Briefly explain the meaning of “sync” and “freeze” modes. Which device(s) must support these modes?
# question = "Briefly explain the meaning of “sync” and “freeze” modes. Which device(s) must support these modes?"

# 7. Discuss the factors that have a significant effect on the overall cycle time of a DP network.
# question = "Discuss the factors that have a significant effect on the overall cycle time of a DP network."

# 8. What are the four operating modes of a DP class 1 master? Briefly explain the interaction between the master and its assigned slaves in each mode.
# question = "What are the four operating modes of a DP class 1 master? Briefly explain the interaction between the master and its assigned slaves in each mode."

# 9. The data unit of a configuration telegram contains the following 3 bytes represented in hexadecimal notation. Decode the meaning of the bytes. \n- Byte 0:  D1 \n- Byte 1:  23 \n- Byte 2:  70
# question = "The data unit of a configuration telegram contains the following 3 bytes represented in hexadecimal notation. Decode the meaning of the bytes. \n- Byte 0:  D1 \n- Byte 1:  23 \n- Byte 2:  70"

# 10. Briefly explain how a slave device, which is in cyclic data exchange, communicates the presence of a diagnostic fault to its controlling master. What does the master do about this?
# question = "Briefly explain how a slave device, which is in cyclic data exchange, communicates the presence of a diagnostic fault to its controlling master. What does the master do about this?"

questions = generate_queries_decomposition.invoke({"question":question})

In [None]:
questions

In [153]:
# Prompt
template = """Here is the question you need to answer in the context of industrial automation and PROFIBUS-related documents and specifications:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question: 

\n --- \n {context} \n --- \n

IMPORTANT: FOCUS **ONLY ON PROFIBUS**.  
DO NOT INCLUDE **ANYTHING** ABOUT PROFINET.  
DO NOT INCLUDE **ANYTHING** ABOUT CANOPEN.  
THIS IS **ONLY** ABOUT PROFIBUS.  
REPEAT: **PROFIBUS ONLY**.  
IGNORE ALL OTHER PROTOCOLS.

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""

decomposition_prompt = ChatPromptTemplate.from_template(template)

In [154]:
def format_qa_pair(question, answer):
    """Format Q and A pair"""
    
    formatted_string = ""
    formatted_string += f"Question: {question}\nAnswer: {answer}\n\n"
    return formatted_string.strip()

In [155]:
q_a_pairs = ""
for q in questions:
    
    rag_chain = (
    {"context": itemgetter("question") | vector_store.as_retriever(), 
     "question": itemgetter("question"),
     "q_a_pairs": itemgetter("q_a_pairs")} 
    | decomposition_prompt
    | llm
    | StrOutputParser())

    answer = rag_chain.invoke({"question":q,"q_a_pairs":q_a_pairs})
    q_a_pair = format_qa_pair(q,answer)
    q_a_pairs = q_a_pairs + "\n---\n"+  q_a_pair

In [156]:
def wrap_paragraphs(text, width=80) -> str:
    """
    Wrap paragraphs of text to a given width.
    
    Args:
        text: The text to wrap.
        width: The width to wrap the text to.

    Returns:    
        The wrapped text.
    """
    paragraphs = text.split('\n\n')  # split on double newlines
    wrapped_paragraphs = []
    for paragraph in paragraphs:
        wrapped_paragraphs.append(textwrap.fill(paragraph, width=width))
    return "\n\n".join(wrapped_paragraphs)

In [None]:
wrapped_answer = wrap_paragraphs(answer, width=80)
print(wrapped_answer)

In [158]:
prompt_rag  = """"You are an AI language model assistant that understands PROFIBUS-related documents and specifications in the context of industrial automation.

IMPORTANT: FOCUS **ONLY ON PROFIBUS**.  
DO NOT INCLUDE **ANYTHING** ABOUT PROFINET.  
DO NOT INCLUDE **ANYTHING** ABOUT CANOPEN.  
THIS IS **ONLY** ABOUT PROFIBUS.  
REPEAT: **PROFIBUS ONLY**.  
IGNORE ALL OTHER PROTOCOLS.

### Instructions:
- Answer in a clear, informative, and technically accurate manner.
- Provide a **complete and relevant explanation**, but avoid excessive detail.
- Start with a **direct answer**, then expand with useful clarification, examples, or context if appropriate.
- Avoid unnecessary filler or repetition.
- Answer **in the same language as the question**.
- You can say "I don't know" if you don't know the answer.

### Now answer the following:

Question: {question}

Context: {context}

Answer (in the language of the question):  
"""

prompt_rag = ChatPromptTemplate.from_template(prompt_rag)

In [159]:
def retrieve_and_rag(question,prompt_rag,sub_question_generator_chain):
    """RAG on each sub-question"""
    
    # Use our decomposition / 
    sub_questions = sub_question_generator_chain.invoke({"question":question})
    
    # Initialize a list to hold RAG chain results
    rag_results = []
    
    for sub_question in sub_questions:
        
        # Retrieve documents for each sub-question
        retrieved_docs = vector_store.as_retriever().invoke(sub_question)
        
        # Use retrieved documents and sub-question in RAG chain
        answer = (prompt_rag | llm | StrOutputParser()).invoke({"context": retrieved_docs, 
                                                                "question": sub_question})
        rag_results.append(answer)
    
    return rag_results,sub_questions

# Wrap the retrieval and RAG process in a RunnableLambda for integration into a chain
answers, questions = retrieve_and_rag(question, prompt_rag, generate_queries_decomposition)

In [None]:
answers, questions

In [161]:
def format_qa_pairs(questions, answers):
    """Format Q and A pairs"""
    
    formatted_string = ""
    for i, (question, answer) in enumerate(zip(questions, answers), start=1):
        formatted_string += f"Question {i}: {question}\nAnswer {i}: {answer}\n\n"
    return formatted_string.strip()

context = format_qa_pairs(questions, answers)

# Prompt
template = """Here is a set of Q+A pairs:

DO NOT — I REPEAT, DO NOT — START YOUR ANSWER WITH ANY OF THE FOLLOWING:
- "Here's a synthesized answer..."
- "Based on the provided Q&A pairs..."
- "Here's what I found..."
- "According to the context..."
- "Okay, here's a synthesized answer..."
OR ANY OTHER SIMILAR PHRASES. 

{context}

Use these to synthesize an answer (IN THE SAME LANGUAGE AS THE QUESTION) to the question: {question}

START THE ANSWER WITH A DIRECT RESPONSE TO THE QUESTION, THEN EXPAND WITH USEFUL CLARIFICATION, EXAMPLES, OR CONTEXT IF APPROPRIATE.
"""

prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

final_answer = final_rag_chain.invoke({"context":context,"question":question})

## **Queries**

### 1 - What is a GSD file, where can you get it, and what is its function?

In [18]:
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

A GSD (General Station Description) file is a standardized file format used in
PROFIBUS networks to describe the capabilities and parameters of PROFIBUS
devices. Its primary function is to provide essential configuration and
communication information to engineering tools, enabling the integration and
management of devices within the network.

You can obtain a GSD file for a specific PROFIBUS device through several
avenues:

1. **Manufacturer's Website**: Most manufacturers offer GSD files in the support
or downloads section of their official websites. 2. **PROFIBUS International**:
This organization maintains a repository of GSD files for various devices, which
can be accessed through their website. 3. **Technical Documentation**: The
device's user manual or technical documentation often includes information on
where to find the GSD file. 4. **Engineering Tools**: Some engineering software
tools that support PROFIBUS configuration may come with a library of GSD files
or the capability 

### 2 - What is a token message and how and which devices use it?

In [30]:
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

A token message in the context of PROFIBUS communication protocols is a control
message that facilitates access to the communication medium among master
stations through a token-passing mechanism. This mechanism ensures that only the
master station holding the token can transmit data, promoting fair access to the
network.

In a PROFIBUS network, the specific devices that utilize token messages are the
master devices, particularly the DP-master (Class 1). These master devices are
responsible for initiating communication and managing data flow to and from
slave devices. When a DP-master holds the token, it can send messages to the
connected DP-slaves without needing an external request.

The token message serves several critical functions. It allows for organized
communication by ensuring that all master stations have equal opportunities to
transmit data, thus preventing any single station from monopolizing the bus.
Additionally, the token-passing protocol guarantees short reaction times

### 3 - Which protocol parameter describes the slave's action time after receiving a message?

In [54]:
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

The protocol parameter that describes the slave's action time after receiving a
message is the **Send Timeout**. This parameter defines the maximum time allowed
for a slave device to respond to a request from the master after the message has
been received.

The Send Timeout is crucial for ensuring timely communication within the
PROFIBUS network. It is negotiated between the DP-master and the DP-slave,
allowing the slave to adjust its response time based on the master's
requirements. For example, if a master device sends a request for data, the Send
Timeout will dictate how quickly the slave must process that request and send
back the appropriate response.

In practical applications, if the Send Timeout is set too long, it can lead to
delays in communication, potentially affecting the overall performance of the
system. Conversely, if it is set too short, the slave may not have enough time
to process the request, leading to communication errors. Therefore, configuring
the Send Timeout a

### 4 - How to calculate DP cycle time?

In [66]:
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

To calculate the DP cycle time in a PROFIBUS network, you can use the formula:

**DP Cycle Time (TDP) = TDX + TI + TO + TM**

Where: - **TDX** is the duration of the cyclic part of the DP cycle. - **TI** is
the Input Time, which is the time required to acquire and update the input data
from the DP slaves. - **TO** is the Output Time, which is the time required to
process and send the output data to the DP slaves. - **TM** is the Master Time,
which is the time allocated for the DP master to process application tasks after
the cyclic part of the cycle.

For example, if you have a system where TDX is 200 µs, TI is 100 µs, TO is 150
µs, and TM is 50 µs, the total DP cycle time would be:

**TDP = 200 µs + 100 µs + 150 µs + 50 µs = 500 µs**

This calculation is crucial for ensuring that the network operates efficiently
and meets the timing requirements of the application. Additionally, factors such
as the number of devices, data size, and network load can influence these time
parameters, so 

### 5 - Briefly explain the function of a watchdog timer in a PROFIBUS DP slave.

In [102]:
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

The function of a watchdog timer in a PROFIBUS DP slave is to monitor
communication between the DP master and the slave device, ensuring safe
operation. If the DP master fails to send signals within a specified timeframe,
the watchdog timer activates a safety mechanism that sets the slave's outputs to
a safe state. This is crucial for preventing unintended operations that could
lead to hazardous situations.

For instance, if the DPV1 feature is not supported, the watchdog timer's base
time is typically set to 10 ms. In cases where DPV1 is supported, the base time
can vary based on additional settings. The timer's duration is calculated using
a formula that incorporates the Watchdog Time Base and specific factors,
allowing the slave device to respond appropriately to communication failures.
This mechanism enhances the reliability and safety of industrial automation
systems by ensuring that devices can react to potential issues effectively.


### 6 - Briefly explain the meaning of “sync” and “freeze” modes. Which device(s) must support these modes?

In [114]:
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

Les modes "sync" et "freeze" dans le contexte de la communication PROFIBUS sont
des mécanismes essentiels pour la synchronisation et la gestion des données
entre les dispositifs maîtres et esclaves.

Le mode "sync" permet de synchroniser les objets de données de sortie dans les
esclaves DP. Lorsqu'une commande de synchronisation est envoyée par le maître,
les valeurs des objets de données de sortie sont mises à jour et gelées,
garantissant que tous les esclaves traitent les données de manière coordonnée.
Ce mode est crucial pour des applications nécessitant une synchronisation
précise, comme dans les systèmes de contrôle de mouvement.

Le mode "freeze", quant à lui, concerne la gestion des données d'entrée.
Lorsqu'il est activé, il permet de geler les valeurs des objets de données
d'entrée dans les esclaves DP, empêchant toute mise à jour jusqu'à ce qu'une
commande de dégel soit reçue. Cela assure que les données traitées restent
constantes pendant des opérations critiques.

Les dispos

### 7 - Discuss the factors that have a significant effect on the overall cycle time of a DP network.

In [126]:
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

The overall cycle time of a DP network is significantly affected by several key
factors, including input and output times, the number of connected devices, data
transmission rates, network topology, and device configuration.

Input Time (TI) and Output Time (TO) are critical components that determine how
quickly data can be acquired from and sent to the DP-slaves. A longer TI means
that the master takes more time to gather input data, while a longer TO delays
the output of data to the slaves. Both of these times must be minimized for
efficient operation.

The number of devices connected to the network also plays a crucial role. As
more DP-slaves are added, the master must spend additional time polling each
device, which can lead to increased cycle times. For instance, if a master
communicates with 10 slaves instead of 5, the cycle time will generally increase
due to the additional polling required.

Data transmission rates are another significant factor. Higher transmission
rates allow

### 8 - What are the four operating modes of a DP class 1 master? Briefly explain the interaction between the master and its assigned slaves in each mode.

In [138]:
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

Les quatre modes de fonctionnement d'un maître PROFIBUS DP de classe 1 sont : le
mode d'initialisation, le mode de paramétrage, le mode d'échange de données
utilisateur et le mode de diagnostic.

1. **Mode d'initialisation** : Dans ce mode, le maître DP vérifie la présence et
l'état des esclaves connectés. Il lit les informations de diagnostic et
configure les esclaves selon les paramètres définis. Par exemple, si un esclave
est déconnecté, le maître doit le détecter et ajuster son fonctionnement en
conséquence.

2. **Mode de paramétrage** : Après l'initialisation, le maître envoie des
requêtes de paramétrage aux esclaves pour les configurer correctement. Cela
garantit que chaque esclave est prêt à communiquer. Si un esclave nécessite des
paramètres spécifiques pour fonctionner, le maître s'assure de les appliquer à
ce stade.

3. **Mode d'échange de données utilisateur** : Une fois les esclaves configurés,
le maître entre dans ce mode pour échanger de manière cyclique des données
d'ent

### 9 - The data unit of a configuration telegram contains the following 3 bytes represented in hexadecimal notation. Decode the meaning of the bytes. \n- Byte 0:  D1 \n- Byte 1:  23 \n- Byte 2:  70

In [150]:
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

Les trois octets d'un telegramme de configuration, représentés en notation
hexadécimale, peuvent être décodés comme suit :

- **Octet 0 (D1)** : Cet octet représente généralement le code de commande ou la
fonction du telegramme. Il indique l'action spécifique que le maître demande à
l'esclave, comme une opération de lecture ou d'écriture.

- **Octet 1 (23)** : Cet octet sert souvent d'identifiant ou d'adresse. Dans le
contexte de PROFIBUS, il peut représenter l'adresse de l'esclave avec lequel le
maître communique. Chaque appareil sur le réseau PROFIBUS a une adresse unique,
et cet octet aide à diriger le telegramme vers le bon appareil.

- **Octet 2 (70)** : Cet octet peut contenir des données supplémentaires ou des
paramètres pertinents à la commande spécifiée dans l'octet D1. Par exemple, il
pourrait indiquer le type de configuration demandée ou des paramètres
spécifiques nécessaires pour l'opération en cours.

En résumé, ces trois octets forment un telegramme structuré qui permet a

### 10 - Briefly explain how a slave device, which is in cyclic data exchange, communicates the presence of a diagnostic fault to its controlling master. What does the master do about this?


In [48]:
wrapped_answer = wrap_paragraphs(final_answer, width=80)
print(wrapped_answer)

A slave device in cyclic data exchange communicates the presence of a diagnostic
fault to its controlling master primarily through error frames and diagnostic
data within the cyclic data transmission. When a fault occurs, the slave device
can insert an error frame into the communication stream, interrupting the normal
data flow and signaling the master to a problem. Alternatively, the fault
information can be encoded within a specific byte or set of bytes within the
slave’s cyclic data transmission – a designated area for diagnostic information.
This allows the master to receive the fault indication alongside the regular
data.

The master's response to this diagnostic fault varies depending on the severity
of the fault and the configured system behavior. Generally, the master will
acknowledge the error, potentially logging it for later analysis. It might also
attempt to recover from the fault by re-requesting data from the faulty slave,
essentially trying a retry. More sophisticated sy