In [19]:
# %pip install "unstructured[md]" nltk langchain-text-splitters

from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key used by the OpenAI
# embeddings / chat clients created further down — confirm.
load_dotenv()

True

## **Single Document**

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain.document_loaders import TextLoader

# Load a single Markdown file as raw text so its content can be inspected.
# NOTE(review): this path points to a different folder than the "./dev_files"
# pattern used in the "All Documents" section — confirm the file exists.
markdown_path = "./Processed_Files_Introduction_to_End/Copia di iec61158-1{ed1.0}b.md"
loader = TextLoader(markdown_path, autodetect_encoding=True)
# `doc` is the list returned by the loader; later cells reuse the name `doc`
# as a loop variable, which overwrites this value.
doc = loader.load()
# Last expression of the cell: displays the first loaded document.
doc[0]

## **All Documents**

In [11]:
import glob
import os
from langchain_core.documents import Document

In [12]:
# Split every Markdown file matched by `folder_path` into chunks:
# first along Markdown headers, then into size-bounded character chunks.
# NOTE(review): MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter and
# TextLoader are imported in the "Single Document" cell — that cell's imports
# must have run before this one.

# 1. Path/pattern for markdown files
folder_path = "./dev_files/*.md"

# 2. Parameters for the Header Splitter
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,  # keep the header lines inside the chunk text
)

# 3. Parameters for the character-level splitter
chunk_size = 1000
chunk_overlap = 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Final list for all splits from all files
all_splits = []

# glob returns matches in arbitrary, filesystem-dependent order; sort so the
# chunk order (and everything derived from it) is reproducible between runs.
markdown_files = sorted(glob.glob(folder_path))
if not markdown_files:
    # Fail here with a clear message instead of failing later, when the
    # vector store is built from an empty list.
    raise FileNotFoundError(f"No markdown files match the pattern {folder_path!r}")

for file_path in markdown_files:
    # Base file name without directory and extension; stored on every chunk
    # as "name_file". Computed once per file (it does not depend on the
    # inner loop).
    filename = os.path.splitext(os.path.basename(file_path))[0]

    # a) Load the file as raw text (preserving Markdown)
    loader = TextLoader(file_path, autodetect_encoding=True)
    docs = loader.load()  # usually returns a list [Document]

    # b) For each Document, perform two splits: by header, then by size
    for doc in docs:
        md_header_splits = markdown_splitter.split_text(doc.page_content)
        splits = text_splitter.split_documents(md_header_splits)

        # c) Tag every chunk with the file it came from
        for splitted_doc in splits:
            splitted_doc.metadata["name_file"] = filename
            # Optionally, remove the original "source" if not needed:
            # splitted_doc.metadata.pop("source", None)

        all_splits.extend(splits)

In [None]:
# Merge consecutive small chunks from `all_splits` until each merged chunk
# holds at least `min_size` characters. Chunks are only merged with chunks of
# the same source file ("name_file"); a merged chunk keeps the metadata of the
# first chunk that went into it.
min_size = 1000  # Minimum size (in characters) for the merged chunks
merged_documents = []  # List to store merged documents
buffer_content = ""    # Text accumulated from consecutive small chunks
buffer_metadata = {}   # Metadata of the first chunk in the current buffer

for doc in all_splits:
    text = doc.page_content.strip()  # remove surrounding whitespace

    # Do not merge across files: if the pending buffer belongs to another
    # file, emit it as-is (even if still below min_size) before continuing.
    # Otherwise text from this file would be attributed to the previous one.
    if buffer_content and doc.metadata.get("name_file") != buffer_metadata.get("name_file"):
        merged_documents.append(
            Document(page_content=buffer_content, metadata=buffer_metadata)
        )
        buffer_content = ""
        buffer_metadata = {}

    if buffer_content:
        # A buffer is pending: append the current chunk, separated by a line break.
        buffer_content += "\n" + text
    elif len(text) >= min_size:
        # Nothing pending and the chunk is already large enough: keep it unchanged.
        # The metadata dict is copied so the new Document does not share it
        # with the entry in all_splits.
        merged_documents.append(Document(page_content=text, metadata=dict(doc.metadata)))
        continue
    else:
        # Nothing pending and the chunk is small: start a new buffer with it.
        buffer_content = text
        buffer_metadata = dict(doc.metadata)

    # When the buffer reaches or exceeds the minimum size, emit it.
    if len(buffer_content) >= min_size:
        merged_documents.append(
            Document(page_content=buffer_content, metadata=buffer_metadata)
        )
        buffer_content = ""
        buffer_metadata = {}

# If there is remaining content in the buffer, add it as well
# (this last chunk may be shorter than min_size).
if buffer_content:
    merged_documents.append(Document(page_content=buffer_content, metadata=buffer_metadata))

# Example of printing the results:
for i, document in enumerate(merged_documents):
    print(f"Document {i+1}:")
    print("Metadata:", document.metadata)
    print("Content (first 200 characters):", document.page_content[:200])
    print("-" * 50)

In [None]:
# Number of chunks produced by the header + character splitters (before merging).
print(f"Generated {len(all_splits)} chunks in total.")

In [None]:
# Number of chunks that remain after merging small chunks.
print(f"Generated {len(merged_documents)} chunks in total.")

In [None]:
# Preview only the first few merged chunks; displaying the whole list would
# put every chunk of every file into the cell output.
merged_documents[:5]

## **RAG**

In [17]:
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain_ollama import ChatOllama
from langchain.schema import Document
from langgraph.graph import START, StateGraph
from IPython.display import Image, display
from pprint import pprint
from langchain_core.prompts import ChatPromptTemplate
from typing_extensions import TypedDict
from typing import List
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

In [20]:
# Embedding model used for both indexing and querying.
# Alternative (local model served through Ollama):
# embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Build a FAISS vector store from the merged chunks.
# NOTE(review): this embeds every chunk in merged_documents, so re-running the
# cell presumably repeats all embedding requests — confirm cost before re-running.
vector_store = FAISS.from_documents(merged_documents, embedding=embeddings)

In [22]:
# Persist the index to the local "faiss_index" directory so it can be reloaded.
vector_store.save_local("faiss_index")

In [23]:
# Reload the index saved above, using the same embeddings object for queries.
# SECURITY: allow_dangerous_deserialization=True opts in to unpickling the
# stored index data. Only load a "faiss_index" directory that this notebook
# (or another trusted source) wrote; never one obtained from an untrusted party.
vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [None]:
# Chat model used by the generation step; temperature=0 is set on both variants.
# Alternative (local model served through Ollama):
# llm = ChatOllama(temperature=0, model="gemma3:12b")

llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

In [25]:
# Prompt text for the generation step. It has two placeholders, {question} and
# {context}, which generate() fills with the user's question and the joined
# text of the retrieved documents.
template = """You are an AI language model assistant that understands PROFIBUS-related documents and specifications in the context of industrial automation.

IMPORTANT: FOCUS **ONLY ON PROFIBUS**.  
DO NOT INCLUDE **ANYTHING** ABOUT PROFINET.  
DO NOT INCLUDE **ANYTHING** ABOUT CANOPEN.  
THIS IS **ONLY** ABOUT PROFIBUS.  
REPEAT: **PROFIBUS ONLY**.  
IGNORE ALL OTHER PROTOCOLS.

### Instructions:
- Answer in a clear, informative, and technically accurate manner.
- Provide a **complete and relevant explanation**, but avoid excessive detail.
- Start with a **direct answer**, then expand with useful clarification, examples, or context if appropriate.
- Avoid unnecessary filler or repetition.
- Answer **in the same language as the question**.
- You can say "I don't know" if you don't know the answer.

### Now answer the following:

Question: {question}

Context: {context}

Answer (in the language of the question):  
"""

# Wrap the text in a chat prompt template so it can be invoked with a dict of values.
prompt = ChatPromptTemplate.from_template(template)

In [26]:
class State(TypedDict):
    """State dictionary shared by the graph nodes `retrieve` and `generate`."""
    question: str           # User's question (read by both nodes)
    context: List[Document] # Documents returned by the retrieve function
    answer: str             # Final answer from the LLM (written by generate)

def retrieve(state: State):
    """
    Retrieval node: look up the documents most similar to the question.

    Runs a similarity search against the module-level ``vector_store`` and
    asks for the 5 best matches.

    Args:
        state (State): Current graph state; only ``state["question"]`` is read.

    Returns:
        dict: ``{"context": [...]}`` — the matching documents, to be merged
        into the graph state under the "context" key.
    """
    query = state["question"]
    return {"context": vector_store.similarity_search(query, k=5)}

def generate(state: State):
    """
    Generation node: build the prompt from the question and the retrieved
    context, call the LLM, and return its answer.

    Args:
        state (State): Current graph state; reads ``state["question"]`` and
            ``state["context"]`` (the retrieved documents).

    Returns:
        dict: ``{"answer": <text>}`` with the answer produced by the LLM.

    Raises:
        ValueError: If the LLM result neither has a ``content`` attribute
            nor is a plain string.
    """
    # Retrieved documents are concatenated with a blank line between them.
    context_text = "\n\n".join([item.page_content for item in state["context"]])
    prompt_value = prompt.invoke(
        {"question": state["question"], "context": context_text}
    )

    reply = llm.invoke(prompt_value)  # or llm(prompt_value), depending on your setup

    # Message-like results expose the text via `.content`; use it when present.
    if hasattr(reply, "content"):
        return {"answer": reply.content}
    # Otherwise only a plain string is accepted.
    if not isinstance(reply, str):
        raise ValueError("Unexpected response format.")
    return {"answer": reply}

In [27]:
# Assemble the application graph: START -> retrieve -> generate.
graph_builder = StateGraph(State)
# Register both nodes and chain them in the given order.
graph_builder.add_sequence([retrieve, generate])
# Entry point: execution begins at the "retrieve" node.
graph_builder.add_edge(START, "retrieve")

# Compile the builder into the runnable `graph` object.
graph = graph_builder.compile()

In [None]:
# Render the compiled graph as a PNG (via its Mermaid representation) and show it inline.
# NOTE(review): draw_mermaid_png may need network access to render the image — confirm.
display(Image(graph.get_graph().draw_mermaid_png()))

## **Queries**

In [None]:
import re
from pprint import pprint

def clean_answer(answer: str) -> str:
    """
    Normalize a raw LLM answer for plain-text display.

    The following steps are applied in order:

    1. Drop whole log lines that start with ``INFO:`` (optionally indented).
       An "INFO:" that appears in the middle of a sentence is left untouched.
    2. If the text contains ``'answer': '<value>'`` (the printed form of a
       dict), keep only ``<value>``. The value is taken up to the next
       occurrence of the same quote character, so a value that itself
       contains that quote character is cut short.
    3. Strip Markdown bold markers (``**text**`` becomes ``text``).
    4. Collapse runs of three or more newlines into a single blank line,
       remove trailing spaces/tabs from every line, and strip the result.

    Args:
        answer (str): The raw answer generated by the LLM.

    Returns:
        str: The cleaned answer, formatted for better readability.
    """

    # Remove log lines such as "INFO:langchain...". The pattern is anchored to
    # the start of a line so that answer text following a mid-line "INFO:" is
    # not deleted together with it.
    cleaned = re.sub(r'^[ \t]*INFO:[^\n]*\n?', '', answer, flags=re.MULTILINE)

    # Extract the value of the 'answer' key if it's a dictionary string
    match = re.search(r"'answer':\s*([\"'])(.*?)\1", cleaned, re.DOTALL)
    if match:
        cleaned = match.group(2)

    # Remove markdown bold formatting **text**
    cleaned = re.sub(r"\*\*(.*?)\*\*", r"\1", cleaned)

    # Collapse 3+ consecutive line breaks and drop trailing whitespace per line
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
    cleaned = re.sub(r'[ \t]+$', '', cleaned, flags=re.MULTILINE)

    return cleaned.strip()

def clean_context(context):
    """
    Reduce retrieved documents to a compact, printable summary.

    Args:
        context (List[Document]): The documents retrieved as context. Each
            item must expose ``metadata`` (a dict) and ``page_content`` (str).

    Returns:
        List[dict]: One dict per document with two keys:
            - "id": the "name_file" metadata value, else the "source" value,
              else the string "unknown".
            - "content": the full text if it has at most 300 characters,
              otherwise its first 300 characters followed by "...".
    """
    summaries = []
    for item in context:
        meta = item.metadata
        fallback_id = meta.get("source", "unknown")
        identifier = meta.get("name_file", fallback_id)

        snippet = item.page_content
        if len(snippet) > 300:
            # Truncate long texts and mark the cut with an ellipsis.
            snippet = snippet[:300] + "..."

        summaries.append({"id": identifier, "content": snippet})
    return summaries

def process_question(question: str):
    """
    Run one question through the RAG graph and print each step's result.

    The graph is streamed in "updates" mode, so every yielded item is a dict
    keyed by the name of the node that just ran.

    Args:
        question (str): The user's question.

    Prints:
        A simplified view of the retrieved context (after the "retrieve"
        node) and the cleaned answer (after the "generate" node).
    """
    updates = graph.stream({"question": question}, stream_mode="updates")
    for update in updates:
        if "retrieve" in update:
            retrieved_docs = update.get("retrieve", {}).get("context", [])
            print("\n📚 Retrieved Context:\n")
            pprint(clean_context(retrieved_docs))

        if "generate" in update:
            answer_text = update.get("generate", {}).get("answer", "")
            print("\n🧠 Generated Answer:\n")
            print(clean_answer(answer_text))

### 1 - What is a GSD file, where can you get it, and what is its function?

In [32]:
question_1 = "What is a GSD file, where can you get it, and what is its function?"
process_question(question_1)


📚 Retrieved Context:

[{'content': '• General specifications  \n'
             'This section contains information on vendor and device names, '
             'hardware and software release states, baud rates supported, '
             'possible time intervals for monitoring times and the signal '
             'assignment on the bus connector.  \n'
             '• Master-related specifications  \n'
             'This section contains ...',
  'id': 'Copia di iec61784-1{ed4.0}b'},
 {'content': 'After the adjustment of the F-Parameters during network '
             'configuration, an F-Parameter record is compiled and stored '
             'within the F-Host/IO-controller for start-up of the network.  \n'
             'The F-Parameter "F\\_IO\\_StructureDescCRC" is used to ensure '
             'correct usage of the F-I/O data structure and data types by t...',
  'id': 'Copia di iec61784-3-3{ed2.0}en'},
 {'content': '#### **8.3.2.2 GSDML extensions**  \n'
             'The F-Parameters of a

### 2 - What is a token message and how and which devices use it?

In [33]:
question_2 = "What is a token message and how and which devices use it?"
process_question(question_2)


📚 Retrieved Context:

[{'content': 'a) Token-passing allows fair media access for all token '
             'holders.  \n'
             'EXAMPLE When four token holders produce the same amount of '
             'similar priority data they will share the media so that on '
             'average each of them can use 25 % of the available message '
             'transfer time. With the token-passing procedure, rules e...',
  'id': 'Copia di iec61158-4-3{ed3.0}b'},
 {'content': '#### **5.3.2 Token procedures**\n'
             '### **5.3.2.1 Token circulation**  \n'
             'The token is passed from master station to master station in '
             'ascending numerical order of station addresses by means of the '
             'token DLPDU (see 7.4). To close the logical token ring, the '
             'station with the highest address passes the toke...',
  'id': 'Copia di iec61158-4-3{ed3.0}b'},
 {'content': 'In the next token cycle (g), Master station 1 passes the token '
           

### 3 - Which protocol parameter describes the slave's action time after receiving a message?

In [None]:
question_3 = "Which protocol parameter describes the slave's action time after receiving a message?"
process_question(question_3)


📚 Retrieved Context:

[{'content': '## **Table 151 – Functions used by DMPMM2**\n'
             '## **11 Parameters for a DP-slave**  \n'
             'Table 152 contains limitations for datarate-depending AL timing '
             'parameters of Slaves.  \n'
             '| Datarate (kbit/s)         | ≤ 187,5 | 500 | 1 500 | 3 000 | 6 '
             '000 | 12 000 |\n'
             '|---------------------------|---------|-----|------...',
  'id': 'Copia di iec61158-6-3{ed3.0}b'},
 {'content': '## **Send Timeout**  \n'
             "Ce paramètre définit le temps de commande pour surveiller l'AR "
             'MS2 demandée par le maître DP (Classe 2). Le maître DP et '
             "l'esclave DP s'informent l'un l'autre du paramètre Send Timeout "
             "pris en charge. L'esclave DP réajuste son paramètre Send "
             'Timeout. Le maître DP (Classe 2) déc...',
  'id': 'Copia di iec61158-5-3{ed3.0}b'},
 {'content': '#### **Table 30 – Default reaction times and operating par

### 4 - How to calculate DP cycle time?

In [None]:
question_4 = "How to calculate DP cycle time?"
process_question(question_4)


📚 Retrieved Context:

[{'content': 'At least the following constraints have to be addressed:  \n'
             '- at a modular DP-slave the time parameters depend on the '
             'plugged in modules.\n'
             '- each module may add also a time offset (each module type may '
             'have an individual time offset).\n'
             '- the setting of TI and TO may be restricted.\n'
             '- the device itself m...',
  'id': 'Copia di iec61158-5-3{ed3.0}b'},
 {'content': '- Phase 3: conveyance of the value for the Output Data to the '
             'DP-slave and output of the new Output Data.  \n'
             'This model of synchronization requires enhanced processing '
             'capabilities of the DP-master (Class 1) and requires an '
             'optimized scheduling in the DP-master (Class 1) and in the '
             'DPslaves.  \n'
             'The de...',
  'id': 'Copia di iec61158-5-3{ed3.0}b'},
 {'content': '|       |                              | 

### 5 - Briefly explain the function of a watchdog timer in a PROFIBUS DP slave.

In [None]:
question_5 = "Briefly explain the function of a watchdog timer in a PROFIBUS DP slave."
process_question(question_5)


📚 Retrieved Context:

[{'content': '![](_page_95_Figure_10.jpeg)  \n'
             '**Figure 73 — Timing sections forming the FSCP 3/1 '
             'F\\_WD\\_Time**  \n'
             'The transfer of the new safety PDU to the F-Host characterises '
             'the next timing section (Bus). As soon as the F driver in the '
             'F-Host received the new safety PDU it restarts its watchdog '
             'timer and process...',
  'id': 'Copia di PROFIsafe-Profile_3192b_V24_Mar07'},
 {'content': 'The watchdog time that shall be assigned to the F-Parameter is '
             'longer than *the minimum watchdog time* to ensure that an '
             'emergency event has been caught.  \n'
             'According to [8.1.3](#page-70-0) the value to be assigned to '
             'F\\_WD\\_Time in the example of [Figure 73](#page-101-0) (Time '
             'trigger = 10 ms) would...',
  'id': 'Copia di iec61784-3-3{ed2.0}en'},
 {'content': '**idle-timer:** This timer monitors the id

### 6 - Briefly explain the meaning of “sync” and “freeze” modes. Which device(s) must support these modes?

In [None]:
question_6 = "Briefly explain the meaning of “sync” and “freeze” modes. Which device(s) must support these modes?"
process_question(question_6)


📚 Retrieved Context:

[{'content': '### **Sync Mode**  \n'
             'This dynamic attribute indicates if the Sync mode is activated. '
             'This attribute shall always be set to FALSE if the attribute '
             'Sync Supported has the value FALSE. If the attribute Sync '
             'Supported has the value TRUE the following rules shall apply for '
             'the attribute Sync Mode:  \n'
             '- It ...',
  'id': 'Copia di iec61158-5-3{ed3.0}b'},
 {'content': '### **Sync Command**  \n'
             'This parameter controls the Sync Operation.  \n'
             'The allowed values are shown in Table 33.\n'
             '#### **Table 33 – Sync Command**  \n'
             '| Value | Meaning   |\n'
             '|-------|-----------|\n'
             '| 0     | no action |\n'
             '| 1     | Sync      |\n'
             '| 2     | Unsync    |\n'
             '## **Freeze Command**  \n'
             'This parameter contr...',
  'id': 'Copia di iec61158-

### 7 - Discuss the factors that have a significant effect on the overall cycle time of a DP network.

In [None]:
question_7 = "Discuss the factors that have a significant effect on the overall cycle time of a DP network."
process_question(question_7)


📚 Retrieved Context:

[{'content': 'At least the following constraints have to be addressed:  \n'
             '- at a modular DP-slave the time parameters depend on the '
             'plugged in modules.\n'
             '- each module may add also a time offset (each module type may '
             'have an individual time offset).\n'
             '- the setting of TI and TO may be restricted.\n'
             '- the device itself m...',
  'id': 'Copia di iec61158-5-3{ed3.0}b'},
 {'content': '# **A.2 Response time measurements**  \n'
             'In 9.3.1 a simplified model for typical response times is '
             'described. The congruence between the model and a real '
             'multivendor application for 15 000 sample measurements is shown '
             'in Figure A.84. In this case the transmission rate had been 1,5 '
             'MBit/s and the F-Hos...',
  'id': 'Copia di PROFIsafe-Profile_3192b_V24_Mar07'},
 {'content': '## **A.2 Response time measurements**  \n'
  

### 8 - What are the four operating modes of a DP class 1 master? Briefly explain the interaction between the master and its assigned slaves in each mode.


In [None]:
question_8 = "What are the four operating modes of a DP class 1 master? Briefly explain the interaction between the master and its assigned slaves in each mode."
process_question(question_8)


📚 Retrieved Context:

[{'content': '### **Figure 3 – Example of DP communication between field '
             'devices**\n'
             '### **6.1.3.2 Device types**\n'
             '### **6.1.3.2.1 DP-master (class 1)**  \n'
             'The DP-master (Class 1) is a controlling device which is '
             'associated with one or more DPslaves (field devices). The '
             'DP-master (Class 1) performs one or more of the...',
  'id': 'Copia di iec61158-5-3{ed3.0}b'},
 {'content': 'If there exists a valid Master Parameter Set in a DP-master '
             '(Class 1), this device starts to check whether the DP-slaves '
             'dedicated to that DP-master are present or not (this will be '
             'done with a read of the diagnosis of a DP-slave). After an '
             'appropriate answer the DP-master (Class 1) will set the par...',
  'id': 'Copia di iec61158-5-3{ed3.0}b'},
 {'content': 'The DP-master (Class 1) is a controlling device, which is '
             'as

### 9 - The data unit of a configuration telegram contains the following 3 bytes represented in hexadecimal notation. Decode the meaning of the bytes. Byte 0: D1; Byte 1: 23; Byte 2: 70

In [None]:
question_9 = "The data unit of a configuration telegram contains the following 3 bytes represented in hexadecimal notation. Decode the meaning of the bytes. \n- Byte 0:  D1 \n- Byte 1:  23 \n- Byte 2:  70"
process_question(question_9)


📚 Retrieved Context:

[{'content': '- 00 basic configuration message;\n'
             '- 01 path-diversity control message;\n'
             '- 10 extendible configuration message;\n'
             '- 11 extendible status-report invocation message.\n'
             '### **8.2.2.4.2 Basic configuration message**  \n'
             'Following its initial two bits of (00), the basic configuration '
             'message specifies operationa...',
  'id': 'Copia di iec61158-2{ed6.0}b'},
 {'content': '### <span id="page-161-1"></span><span '
             'id="page-161-0"></span>**9.10.2 Telegrams and fill '
             'characters**  \n'
             'On the physical level, it is sufficient to know that a telegram '
             'shall start and end with the bit sequence 0111 1110. This bit '
             'sequence is also known as a delimiter. Due to bit-stuffing, this '
             'bit ...',
  'id': 'Copia di iec61158-2{ed6.0}b'},
 {'content': 'Les deux messages normalisés et les deux clas

### 10 - Briefly explain how a slave device, which is in cyclic data exchange, communicates the presence of a diagnostic fault to its controlling master. What does the master do about this?


In [None]:
question_10 = "Briefly explain how a slave device, which is in cyclic data exchange, communicates the presence of a diagnostic fault to its controlling master. What does the master do about this?"
process_question(question_10)


📚 Retrieved Context:

[{'content': 'For every slave device a fixed time slot according to its data '
             'width is allocated in the frame. The data package order is '
             'according to the physical order of the connected devices. Cyclic '
             'data (process data) and non cyclic data (parameter data) are '
             'transferred concurrently. Devices with parameter dat...',
  'id': 'Copia di iec61784-1{ed4.0}b'},
 {'content': 'Only in the data exchange mode the extended functionalities like '
             'the acyclic read and write of variables, the acyclic transfer of '
             'alarms, the up- and/or download of LR Data, the invocation of '
             'stateless and/or state-oriented functions, the DXB functionality '
             'and the Isochronous Mode are possible if su...',
  'id': 'Copia di iec61158-5-3{ed3.0}b'},
 {'content': '## **6.6 Cyclic communication between DP-master (class 1) and '
             'DP-slave**  \n'
             'The c