# CSRD RAG Assistant

In [55]:
%pip install requests beautifulsoup4 pandas networkx pyvis langchain langchain_openai faiss-cpu duckduckgo-search

Note: you may need to restart the kernel to use updated packages.


In [21]:
import requests
import pandas as pd
from bs4.element import Tag
import re
from bs4 import BeautifulSoup
from pyvis.network import Network
import uuid
import networkx as nx
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema.retriever import BaseRetriever
from langchain.docstore.document import Document
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from typing import List, Any  
from dotenv import load_dotenv

In [22]:
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key read by the
# LangChain clients created further down — confirm.
load_dotenv()

True

In [4]:

# EUR-Lex HTML rendition of the directive (CELEX 02013L0034, version 20240109).
csrd_report_url = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:02013L0034-20240109&qid=1712714544806'
# Bound the request so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page as the directive.
csrd_response = requests.get(csrd_report_url, timeout=30)
csrd_response.raise_for_status()
html_page = csrd_response.text

In [5]:
def get_directive_section(main_content):
  """Return the first ``div`` with class ``eli-main-title`` under ``main_content``.

  The result is whatever ``main_content.find`` yields for that lookup.
  """
  title_filter = {'class': 'eli-main-title'}
  directive_section = main_content.find('div', title_filter)
  return directive_section

def get_content_section(main_content):
  """Return the first ``div`` with class ``eli-subdivision`` under ``main_content``.

  The result is whatever ``main_content.find`` yields for that lookup.
  """
  subdivision_filter = {'class': 'eli-subdivision'}
  content_section = main_content.find('div', subdivision_filter)
  return content_section

def get_chapter_sections(content_section):
  """Return the ``div`` elements that are direct children of ``content_section``.

  The lookup is non-recursive, so nested ``div`` elements are not included.
  """
  chapter_sections = content_section.find_all('div', recursive=False)
  return chapter_sections

def get_article_sections(chapter_section):
  """Return the direct-child ``div`` elements of class ``eli-subdivision``.

  The lookup is non-recursive, so only immediate children of
  ``chapter_section`` are returned.
  """
  subdivision_filter = {'class': 'eli-subdivision'}
  article_sections = chapter_section.find_all('div', subdivision_filter, recursive=False)
  return article_sections

def get_directive_name(directive_section) -> str:
  """Build the directive title from its ``title-doc-first`` paragraphs.

  Every matching ``p`` element contributes its stripped text; the pieces are
  joined with single spaces. No matches yields an empty string.
  """
  title_parts = []
  for title_tag in directive_section.find_all('p', {'class': 'title-doc-first'}):
    title_parts.append(title_tag.text.strip())
  return ' '.join(title_parts)

def get_chapter_name(chapter_section) -> str:
  """Return the chapter title, stripped and with only its first letter upper-cased.

  The title is read from the first ``p`` of class ``title-division-2``.
  """
  title_tag = chapter_section.find('p', {'class': 'title-division-2'})
  raw_title = title_tag.text
  return raw_title.strip().capitalize()

def get_chapter_id(chapter_section) -> str:
  """Return the chapter identifier, i.e. the heading text without 'CHAPTER'.

  The heading is read from the first ``p`` of class ``title-division-1``;
  surrounding whitespace is removed before and after dropping the keyword.
  """
  heading_tag = chapter_section.find('p', {'class': 'title-division-1'})
  heading_text = heading_tag.text.strip()
  return heading_text.replace('CHAPTER', '').strip()

def get_article_name(article_section) -> str:
  """Return the article's subtitle with surrounding whitespace removed.

  The subtitle is read from the first ``p`` of class ``stitle-article-norm``.
  """
  subtitle_tag = article_section.find('p', {'class': 'stitle-article-norm'})
  article_name = subtitle_tag.text.strip()
  return article_name

def get_article_id(article_section) -> str:
  """Return the article identifier, e.g. '19a' for a heading 'Article 19a'.

  The heading is read from the first ``p`` of class ``title-article-norm``.
  An optional leading double quote, the word 'Article' and any whitespace
  that follows it are removed.
  """
  article_id = article_section.find('p', {'class': 'title-article-norm'}).text.strip()
  # Raw string: `\s` is a regex escape, not a valid Python string escape, and
  # the non-raw spelling triggers a SyntaxWarning on recent Python versions.
  article_id = re.sub(r'"?Article\s*', '', article_id).strip()
  return article_id

In [6]:
def _clean_paragraph(txt):
  """Normalise the raw text of one scraped paragraph.

  The substitutions run in a fixed order, each on the output of the previous
  one, so reordering them can change the result.

  Args:
    txt: raw paragraph text.

  Returns:
    The cleaned text, without leading or trailing whitespace.
  """
  # All patterns are raw strings so regex escapes such as \s, \d and \( are
  # not read as (invalid) Python string escapes.
  # remove multiple break lines
  txt = re.sub(r'\n+', '\n', txt)
  # simplifies bullet points: keep a marker such as "(a)" on the line of its text
  txt = re.sub(r'(\([\d\w]+\)\s?)\n', r'\1\t', txt)
  # simplifies quote: opening curly quote becomes a plain apostrophe
  txt = re.sub('‘', '\'', txt)
  # some weird references to other articles, shaped "(<newline>xx<newline>)"
  txt = re.sub(r'\(\n[\d\w]+\n\)', '', txt)
  # remove spaces before punctuation
  txt = re.sub(r'\s([\.;:])', r'\1', txt)
  # remove reference links: "▼" followed by word characters and a line break
  txt = re.sub(r'▼\w+\n', '', txt)
  # format numbers: drop whitespace that sits between two digits
  txt = re.sub(r'(?<=\d)\s(?=\d)', '', txt)
  # remove consecutive spaces
  txt = re.sub(r'\s{2,}', ' ', txt)
  # remove leading / trailing spaces
  txt = txt.strip()
  return txt

def get_paragraphs(article_section):
  """Split an article into its numbered paragraphs.

  Walks the direct children of ``article_section``. A ``p`` child of class
  ``norm`` and any child of class ``grid-container`` add text to the paragraph
  currently being built. A ``div`` child of class ``norm`` closes that
  paragraph and opens a new one, numbered from its ``no-parag`` span (the part
  before the first '.'). Text seen before any numbered paragraph is stored
  under '0'.

  Args:
    article_section: element whose ``children`` are iterated.

  Returns:
    Dict mapping paragraph number (str) to cleaned text. Paragraphs whose
    cleaned text is empty are left out.
  """
  content = {}
  paragraph_number = '0'
  paragraph_content = []
  for child in article_section.children:
    if not isinstance(child, Tag):
      continue
    # A tag without a class attribute yields None; treat it as "no classes"
    # instead of failing on the membership test.
    child_classes = child.attrs.get('class') or []
    if 'norm' in child_classes:
      if child.name == 'p':
        paragraph_content.append(child.text.strip())
      elif child.name == 'div':
        # close the paragraph built so far, then start the next one
        content[paragraph_number] = _clean_paragraph('\n'.join(paragraph_content))
        paragraph_number = child.find('span', {'class': 'no-parag'}).text.strip().split('.')[0]
        paragraph_content = [child.find('div', {'class': 'inline-element'}).text]
    elif 'grid-container' in child_classes:
      paragraph_content.append(child.text)
  # Store the last open paragraph once, after the walk; doing it on every
  # child gave the same final dict but re-cleaned the text each time.
  content[paragraph_number] = _clean_paragraph('\n'.join(paragraph_content))
  return {k: v for k, v in content.items() if len(v) > 0}

In [7]:
# Parse the downloaded page, then pull out the directive title and the section
# that holds the chapters.
main_content = BeautifulSoup(html_page, 'html.parser')
directive_section = get_directive_section(main_content)
directive_name = get_directive_name(directive_section)
content_section = get_content_section(main_content)

# Sanity check of the parse: chapter id, chapter name and article count,
# followed by a blank line, for every chapter.
for chapter_section in get_chapter_sections(content_section):
  chapter_id = get_chapter_id(chapter_section)
  chapter_name = get_chapter_name(chapter_section)
  articles = len(get_article_sections(chapter_section))
  print(f'Chapter {chapter_id}: {chapter_name}', f'{articles} article(s)', '', sep='\n')

Chapter 1: Scope, definitions and categories of undertakings and groups
3 article(s)

Chapter 2: General provisions and principles
5 article(s)

Chapter 3: Balance sheet and profit and loss account
6 article(s)

Chapter 4: Notes to the financial statements
4 article(s)

Chapter 5: Management report
3 article(s)

Chapter 6: Consolidated financial statements and reports
10 article(s)

Chapter 6a: Sustainability reporting standards
2 article(s)

Chapter 6b: Single electronic reporting format
1 article(s)

Chapter 7: Publication
5 article(s)

Chapter 8: Auditing and assurance of sustainability reporting
2 article(s)

Chapter 9: Provisions concerning exemptions and restrictions on exemptions
5 article(s)

Chapter 9a: Reporting concerning third-country undertakings
4 article(s)

Chapter 10: Report on payments to governments
8 article(s)

Chapter 10a: Report on income tax information
8 article(s)

Chapter 11: Transitional and final provisions
8 article(s)



In [8]:
# Node rows are [id, label, content, group]; edge rows are [src, dst, label].
# The directive itself is the root node, with id '0'.
nodes = [['0', 'CSRD', directive_name, 'DIRECTIVE']]
edges = []


In [9]:
# Walk chapter -> article -> paragraph, adding one node per element and a
# CONTAINS edge from each parent to its child.
for chapter_section in get_chapter_sections(content_section):
  chapter_id = get_chapter_id(chapter_section)
  chapter_name = get_chapter_name(chapter_section)

  # level 1: chapters hang off the root node '0'
  nodes.append([chapter_id, f'Chapter {chapter_id}', chapter_name, 'CHAPTER'])
  edges.append(['0', f'{chapter_id}', 'CONTAINS'])

  for article_section in get_article_sections(chapter_section):
    article_id = get_article_id(article_section)
    article_name = get_article_name(article_section)
    article_paragraphs = get_paragraphs(article_section)

    # level 2: articles hang off their chapter; ids are '<chapter>.<article>'
    article_key = f'{chapter_id}.{article_id}'
    nodes.append([article_key, f'Article {article_id}', article_name, 'ARTICLE'])
    edges.append([chapter_id, article_key, 'CONTAINS'])

    # level 3: paragraphs hang off their article; ids are '<chapter>.<article>.<paragraph>'
    for paragraph_id, paragraph_text in article_paragraphs.items():
      paragraph_key = f'{chapter_id}.{article_id}.{paragraph_id}'
      paragraph_label = f'Article {article_id}({paragraph_id})'
      nodes.append([paragraph_key, paragraph_label, paragraph_text, 'PARAGRAPH'])
      edges.append([article_key, paragraph_key, 'CONTAINS'])

In [10]:
# Tabular views of the collected rows: one row per node, one row per edge.
nodes_df = pd.DataFrame(data=nodes, columns=['id', 'label', 'content', 'group'])
edges_df = pd.DataFrame(data=edges, columns=['src', 'dst', 'label'])

In [11]:
# display(edges_df)
# List the distinct edge labels present in the edge table.
edges_df['label'].unique()

array(['CONTAINS'], dtype=object)

In [12]:
# Render the node table (id, label, content, group) for inspection.
display(nodes_df)

Unnamed: 0,id,label,content,group
0,0,CSRD,DIRECTIVE 2013/34/EU OF THE EUROPEAN PARLIAMEN...,DIRECTIVE
1,1,Chapter 1,"Scope, definitions and categories of undertaki...",CHAPTER
2,1.1,Article 1,Scope,ARTICLE
3,1.1.1,Article 1(1),The coordination measures prescribed by this D...,PARAGRAPH
4,1.1.1a,Article 1(1a),The coordination measures prescribed by Articl...,PARAGRAPH
...,...,...,...,...
370,11.53.2,Article 53(2),Member States shall communicate to the Commiss...,PARAGRAPH
371,11.54,Article 54,Entry into force,ARTICLE
372,11.54.0,Article 54(0),This Directive shall enter into force on the t...,PARAGRAPH
373,11.55,Article 55,Addressees,ARTICLE


In [31]:
# Directed graph of the directive. Each node keeps its label, its text (under
# 'title') and its type (under 'group'); only CONTAINS edges are added.
csrd_graph = nx.DiGraph()

for node in nodes_df.itertuples(index=False):
  csrd_graph.add_node(node.id, label=node.label, title=node.content, group=node.group)

for edge in edges_df.itertuples(index=False):
  if edge.label == 'CONTAINS':
    csrd_graph.add_edge(edge.src, edge.dst, label=edge.label)


In [32]:
# def displayGraph(graph):

#   net = Network(
#     height="750px", 
#     width="100%", 
#     directed=True, 
#     cdn_resources='remote',
#     notebook=True
#   )

#   net.options.groups = {
#       "DIRECTIVE": {
#         "icon": {
#             "face": 'FontAwesome',
#             "code": '\uf19c',
#         }
#       },
#       "CHAPTER": {
#           "icon": {
#               "face": 'FontAwesome',
#               "code": '\uf02d',
#           }
#       },
#       "ARTICLE": {                 
#         "icon": {
#             "face": 'FontAwesome',
#             "code": '\uf07c',
#           }
#       },
#       "PARAGRAPH": {                 
#         "icon": {
#             "face": 'FontAwesome',
#             "code": '\uf15b',
#           }
#       }
#   }

#   net.from_nx(graph)
#   net.show(f"/tmp/{uuid.uuid4().hex}.html")
#   return net.html.replace(
#     '<head>',
#     '<head><link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" type="text/css"/>'
#   )

In [34]:
from networkx import DiGraph  # Added this import


# Initialize embeddings and LLM
# Both clients use their constructor defaults, except temperature=0 on the chat model.
embeddings = OpenAIEmbeddings()
chat_model = ChatOpenAI(temperature=0)

# Create vector store
def create_vector_store(nodes_df):
    """Index the text of every node in a FAISS vector store.

    Args:
        nodes_df: frame with a ``content`` column (the texts to index) and an
            ``id`` column (stored as each entry's metadata).

    Returns:
        The store built by ``FAISS.from_texts`` with the module-level
        ``embeddings`` object; each entry's metadata is ``{'id': <node id>}``.
    """
    node_texts = nodes_df['content'].tolist()
    node_metadata = [{'id': node_id} for node_id in nodes_df['id'].tolist()]
    return FAISS.from_texts(node_texts, embeddings, metadatas=node_metadata)

# Index all nodes and wrap the store as a retriever (default retriever settings).
vector_store = create_vector_store(nodes_df)
CSRD_search = vector_store.as_retriever()


# Custom Retriever
class CustomRetriever(BaseRetriever):
    """Retriever that expands vector-search hits with paragraphs from the graph.

    Hits of the wrapped retriever are mapped to graph nodes through their
    ``id`` metadata. For every hit, in the order returned, the hit itself (when
    it is a PARAGRAPH node) and then its PARAGRAPH successors are emitted; a
    node is emitted at most once.
    """
    # wrapped retriever; only ``get_relevant_documents(query)`` is called on it
    retriever: Any
    # graph whose nodes carry 'group', 'title' and 'label' attributes
    knowledge_graph: DiGraph

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        """Return the paragraph documents supporting ``query``, most relevant first.

        Hits whose id is not a node of the graph are ignored. Each returned
        document has the node text as content and ``id``/``label`` metadata.
        """
        graph = self.knowledge_graph
        seen_ids = set()
        supporting_documents = []

        def emit_paragraph(candidate_id, candidate_data):
            # Only paragraph nodes become documents, and each of them only once.
            if candidate_data['group'] != 'PARAGRAPH' or candidate_id in seen_ids:
                return
            seen_ids.add(candidate_id)
            supporting_documents.append(
                Document(
                    page_content=candidate_data['title'],
                    metadata={'id': candidate_id, 'label': candidate_data['label']}
                )
            )

        # Resolve every hit to its graph node, dropping ids unknown to the graph.
        hits = self.retriever.get_relevant_documents(query)
        hit_ids = [hit.metadata['id'] for hit in hits]
        known_hits = [
            (hit_id, graph.nodes.get(hit_id))
            for hit_id in hit_ids
        ]
        known_hits = [(hit_id, hit_data) for hit_id, hit_data in known_hits if hit_data is not None]

        for hit_id, hit_data in known_hits:
            emit_paragraph(hit_id, hit_data)
            # Follow outgoing edges so a matched article brings its paragraphs along.
            for child_id in list(graph.successors(hit_id)):
                emit_paragraph(child_id, graph.nodes[child_id])

        return supporting_documents

# Setup prompt and chain
# Prompt for the question-answering chain. It exposes two fields, {context} and
# {question}, matching the input_variables declared below.
# NOTE(review): the name `prompt` is rebound to the agent prompt in a later
# cell, so re-running the chain cell after that one would use the wrong prompt.
TEMPLATE = """
Context information is below.

---------------------
{context}
---------------------

Given the context information and not prior knowledge.
Answer compliance issue related to the CSRD directive only.

If the question is not related to regulatory compliance, kindly decline to answer.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Keep the answer as concise as possible, citing articles and chapters whenever applicable.
Please do not repeat the answer and do not add any additional information.

Question: {question}

Answer:
"""

prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])

# Create chain
# "stuff" QA chain over the graph-aware retriever. return_source_documents=True
# is what makes the result carry the 'source_documents' entry that
# format_response prints.
chain_kg = RetrievalQA.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=CustomRetriever(retriever=CSRD_search, knowledge_graph=csrd_graph),
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True
)

# Example usage
def format_response(question, answer):
    """Print a question, the answer to it and the sources behind the answer.

    Args:
        question: the query text.
        answer: mapping with a 'result' entry and a 'source_documents' entry.
            Every source needs ``metadata['label']`` and ``page_content``; only
            the first 100 characters of the content are shown.
    """
    print(f"Question: {question}\n")
    print(f"Answer: {answer['result']}\n")
    print("Sources:")
    for source in answer['source_documents']:
        label, preview = source.metadata['label'], source.page_content[:100]
        print(f"- {label}: {preview}...")

# Run the QA chain on a sample question and print the answer with its sources.
question = "Create a guided checklist of standards i should follow while reporting my emissions."
answer = chain_kg.invoke({"query": question})
format_response(question, answer)

Question: Create a guided checklist of standards i should follow while reporting my emissions.

Answer: To ensure compliance with the CSRD directive when reporting emissions, follow these standards:
- Specify scope 1, scope 2, and where relevant, scope 3 greenhouse gas emissions (Article 19a(2)(a)(i))
- Include information on climate change mitigation and adaptation (Article 19a(2)(a)(ii))
- Disclose information on water and marine resources, resource use, circular economy, pollution, biodiversity, and ecosystems (Article 19a(2)(a)(iii)-(vi))

Sources:
- Article 29b(1): The Commission shall adopt delegated acts in accordance with Article 49 supplementing this Directive...
- Article 29b(2): The sustainability reporting standards shall ensure the quality of reported information, by requirin...
- Article 29b(3): The sustainability reporting standards shall specify the forward-looking, retrospective, qualitative...
- Article 29b(4): Sustainability reporting standards shall take account of 

# Agentic Approach: CSRD Agent

In [18]:
from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser
from langchain.prompts import StringPromptTemplate

from langchain import OpenAI, LLMChain
from langchain.tools import DuckDuckGoSearchRun

from typing import List, Union
from langchain.schema import AgentAction, AgentFinish
import re
import langchain

In [45]:

# Second index, built from the graph nodes so that every entry carries its
# label as well as its id.
graph_nodes = list(csrd_graph.nodes(data=True))
text = [node_data['title'] for _, node_data in graph_nodes]
metadata = [{'id': node_id, 'label': node_data['label']} for node_id, node_data in graph_nodes]

embeddings = OpenAIEmbeddings()
csrd_vector_store = FAISS.from_texts(text, embeddings, metadatas=metadata)


def search_csrd(query: str) -> str:
  """Search the CSRD index and describe every hit.

  Reads the module-level ``csrd_vector_store`` and ``csrd_graph``.

  Args:
    query: free-text search query.

  Returns:
    One text block per hit, in the order returned by the store, separated by
    blank lines. Each block lists the node id, label, content and the ids of
    the node's graph neighbours. An empty string when there are no hits.
  """
  csrd_results = csrd_vector_store.similarity_search_with_score(query)

  response = []

  # the similarity score is not reported, only the ranking it produced
  for doc, _score in csrd_results:
    doc_id = doc.metadata['id']
    doc_label = doc.metadata['label']
    doc_content = doc.page_content
    doc_references = ','.join(list(csrd_graph.neighbors(doc_id)))

    response.append(
      f'''######
      [Articel ID]: {doc_id}
      [Article Name]: {doc_label}
      [Article Content]: {doc_content}
      [References]: {doc_references}
      '''
    )

  # Join after the loop: returning from inside it reported only the first hit
  # and returned None when the search came back empty.
  return "\n\n".join(response)
    
def search_reference(article_reference: str) -> str:
  """Describe one node of the CSRD graph, looked up by its id.

  Reads the module-level ``csrd_graph``.

  Args:
    article_reference: node id, e.g. '1.1.1'. Surrounding whitespace is ignored.

  Returns:
    A text block with the node id, label, content and the ids of its graph
    neighbours. When no node has that id, a short message saying so is
    returned instead, so a caller passing model-generated text gets feedback
    rather than a KeyError.
  """
  node_id = article_reference.strip()
  if not csrd_graph.has_node(node_id):
    return f'No CSRD node found for reference "{node_id}". Use an ID listed by a previous search result.'

  result = csrd_graph.nodes[node_id]
  doc_references = ','.join(list(csrd_graph.neighbors(node_id)))
  doc_content = result['title']
  doc_label = result['label']

  return f'''###
  [Articel ID]: {node_id}
  [Article Name]: {doc_label}
  [Article Content]: {doc_content}
  [References]: {doc_references}
'''
  

## Tools Setup

In [None]:
# Define which tools the agent can use to answer user queries
search = DuckDuckGoSearchRun()

# The description is the only thing the agent reads when choosing a tool, so
# the two CSRD tools state distinct purposes and the input each one expects.
tools = [
    Tool(
        name = "Search",
        func=search.run,
        description="useful for when you need to answer questions about current events"
    ),
    Tool(
        name = "search_csrd",
        func=search_csrd,
        description="useful for when you need to find content of the CSRD directive related to a question. Input is a free-text search query."
    ),
    Tool(
        name = "expand_search_reference",
        func=search_reference,
        description="useful for when you need to read a CSRD node referenced by a previous result. Input is exactly one ID taken from the IDs listed in a previous result, for example 1.1.1"
    )
]

In [62]:
# Draft instructions for the agent.
# NOTE(review): no later cell visible here reads `prompt_template`; the agent
# prompt is built from `template` instead — confirm whether this can go.
prompt_template = """
You are an expert in sustainability reporting compliance. 
Given the context information, answer compliance issues related to the CSRD directive.
Start your search with content related to a given query using the [search_csrd] tool.
Each article may have [article_references] to other articles. Expand your search using the [expand_search_reference] tool.
Continue your search until all referenced information have been used to answer the question.

If the question is not related to regulatory compliance, kindly decline to answer.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Return concise information answering the question and citing all the relevant [article_name].

You can give your final anser by stating: 
Final Answer:
"""


In [72]:
# Agent prompt. Fields: {tools} and {tool_names} (the registered tools),
# {input} (the user question) and {agent_scratchpad} (the steps taken so far).
# The tool names quoted in the instructions must match the registered tools:
# the first lookup goes through search_csrd, follow-ups through
# expand_search_reference.
template = """
You're an expert in sustainability reporting compliance. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)

Start your search with content related to a given query using the [search_csrd] tool.
Each article may have [article_references] to other articles. Expand your search using the [expand_search_reference] tool.
Continue your search until all referenced information have been used to answer the question.

If the question is not related to regulatory compliance, kindly decline to answer.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Return concise information answering the question and citing all the relevant [article_name].
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin! Remember to answer as an expert sustainabiliy report compliance agent..

Question: {input}
{agent_scratchpad}"""

In [73]:
from langchain.prompts import StringPromptTemplate
from typing import List, Union

class CustomPromptTemplate(StringPromptTemplate):
    """Prompt that renders the agent's previous steps and tool list into ``template``."""
    # The template to use
    template: str
    # The list of tools available
    tools: List[Tool]

    def format(self, **kwargs) -> str:
        """Fill the template.

        ``intermediate_steps`` (pairs of action and observation) is removed
        from ``kwargs`` and rendered as ``agent_scratchpad``; ``tools`` and
        ``tool_names`` are derived from ``self.tools``. All remaining keyword
        arguments are passed to the template unchanged.
        """
        steps = kwargs.pop("intermediate_steps")
        # One entry per past step: the action's log, the observation it
        # produced, and an open "Thought: " for the model to continue from.
        scratchpad_parts = [
            action.log + f"\nObservation: {observation}\nThought: "
            for action, observation in steps
        ]
        kwargs["agent_scratchpad"] = "".join(scratchpad_parts)

        tool_lines = [f"{tool.name}: {tool.description}" for tool in self.tools]
        kwargs["tools"] = "\n".join(tool_lines)
        kwargs["tool_names"] = ", ".join(tool.name for tool in self.tools)
        return self.template.format(**kwargs)

In [74]:
# Agent prompt: `input` and `intermediate_steps` are supplied by the caller;
# the other template fields are filled in by CustomPromptTemplate.format.
# NOTE(review): this rebinds `prompt`, which an earlier cell uses for the QA chain prompt.
prompt = CustomPromptTemplate(
    template=template,
    tools=tools,
    input_variables=['input', 'intermediate_steps']
)

In [75]:
from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser

class CustomOutputParser(AgentOutputParser):
    """Turn raw LLM text into either the final answer or the next tool call."""

    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        """Parse one LLM completion.

        Returns an ``AgentFinish`` whose ``output`` is the stripped text after
        the last "Final Answer:" when that marker is present. Otherwise returns
        an ``AgentAction`` built from the "Action:" / "Action Input:" pair.

        Raises:
            ValueError: when neither form can be found in ``llm_output``.
        """
        final_marker = "Final Answer:"
        if final_marker in llm_output:
            final_text = llm_output.split(final_marker)[-1].strip()
            # Return values is generally always a dictionary with a single `output` key
            return AgentFinish(return_values={"output": final_text}, log=llm_output)

        # DOTALL lets the action input span several lines.
        action_pattern = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        found = re.search(action_pattern, llm_output, re.DOTALL)
        if found is None:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")

        tool_name, raw_input = found.group(1), found.group(2)
        # The input loses surrounding spaces first, then surrounding double quotes.
        return AgentAction(
            tool=tool_name.strip(),
            tool_input=raw_input.strip(" ").strip('"'),
            log=llm_output,
        )
     

In [76]:
# Parser instance handed to the agent below.
output_parser = CustomOutputParser()
     

In [77]:
from langchain_openai import OpenAI

# Completion model driving the agent; `OpenAI` here is the langchain_openai
# class imported in this cell, which replaces the earlier `langchain` import
# of the same name.
llm = OpenAI(temperature=0)

In [78]:
# Chain pairing the agent LLM with the agent prompt (the CustomPromptTemplate instance).
llm_chain = LLMChain(llm=llm, prompt=prompt)
     

In [79]:
# Names of the registered tools, used as the agent's allowed_tools.
tool_names = [tool.name for tool in tools]

# The stop sequence "\nObservation:" cuts generation at the point where the
# template expects a tool result.
# NOTE(review): presumably the executor then inserts the real observation — confirm.
agent = LLMSingleActionAgent(
    llm_chain=llm_chain,
    output_parser=output_parser,
    stop=["\nObservation:"],
    allowed_tools=tool_names
)
     

In [80]:
# Executor wiring the agent to its tools, created with verbose=True.
agent_executor = AgentExecutor.from_agent_and_tools(agent=agent,
                                                    tools=tools,
                                                    verbose=True)

In [81]:
# Sample run of the agent; the cell output below shows the trace and the returned dict.
agent_executor.invoke({"input": "What are the standards for reporting emissions?"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: It's important to stay up-to-date on current events and regulations related to sustainability reporting.
Action: Search
Action Input: "Sustainability reporting emissions standards"[0m

  ddgs_gen = ddgs.text(




Observation:[36;1m[1;3mIFRS Sustainability Standards are developed to enhance investor-company dialogue so that investors receive decision-useful, globally comparable sustainability-related disclosures that meet their information needs. ... 29 jurisdictions that have finalised or published proposals on climate-related disclosures have included Scope 3 GHG emissions ... Climate-related disclosure requirements from the International Sustainability Standards Board (ISSB), the EU and the Securities and Exchange Commission (SEC) are shaping the global climate reporting landscape. Although different in many ways, these requirements share a common factor: greenhouse gas emissions. On June 26, 2023, the International Sustainability Standards Board (ISSB), an International Financial Reporting Standards (IFRS) Foundation initiative, released IFRS S1, General Requirements for Disclosure of Sustainability-related Financial Information and IFRS S2, Climate-related Disclosures; both are effectiv

{'input': 'What are the standards for reporting emissions?',
 'output': 'The standards for reporting emissions include IFRS Sustainability Standards, ISO 14064, and GRI Standards. It is important to stay up-to-date on these standards and regulations in order to accurately report emissions and meet the information needs of investors.'}