# CSRD RAG Assistant

In [77]:
%pip install requests beautifulsoup4 pandas networkx pyvis langchain faiss-cpu dot-env

[31mERROR: Could not find a version that satisfies the requirement dot-env (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for dot-env[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [78]:
import requests
import pandas as pd
from bs4.element import Tag
import re
from bs4 import BeautifulSoup
from pyvis.network import Network
import uuid
import networkx as nx
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.schema.retriever import BaseRetriever
from langchain.docstore.document import Document
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from typing import List, Any  
from dotenv import load_dotenv

In [79]:
load_dotenv()

True

In [80]:

csrd_report_url = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:02013L0034-20240109&qid=1712714544806'
html_page = requests.get(csrd_report_url).text

In [81]:
def get_directive_section(main_content):
  return main_content.find('div', {'class': 'eli-main-title'})

def get_content_section(main_content):
  return main_content.find('div', {'class': 'eli-subdivision'})

def get_chapter_sections(content_section):
  return content_section.find_all('div', recursive=False)

def get_article_sections(chapter_section):
  return chapter_section.find_all('div', {'class': 'eli-subdivision'}, recursive=False)

def get_directive_name(directive_section) -> str:
  title_doc = directive_section.find_all('p', {'class': 'title-doc-first'})
  title_doc = ' '.join([t.text.strip() for t in title_doc])
  return title_doc

def get_chapter_name(chapter_section) -> str:
  return chapter_section.find('p', {'class': 'title-division-2'}).text.strip().capitalize()

def get_chapter_id(chapter_section) -> str:
  chapter_id = chapter_section.find('p', {'class': 'title-division-1'}).text.strip()
  chapter_id = chapter_id.replace('CHAPTER', '').strip()
  return chapter_id

def get_article_name(article_section) -> str:
  return article_section.find('p', {'class': 'stitle-article-norm'}).text.strip()

def get_article_id(article_section) -> str:
  article_id = article_section.find('p', {'class': 'title-article-norm'}).text.strip()
  article_id = re.sub('\"?Article\s*', '', article_id).strip()
  return article_id

In [82]:
def _clean_paragraph(txt):
  # remove multiple break lines
  txt = re.sub('\n+', '\n', txt)
  # simplifies bullet points
  txt = re.sub('(\([\d\w]+\)\s?)\n', r'\1\t', txt)
  # simplifies quote
  txt = re.sub('‘', '\'', txt)
  # some weird references to other articles
  txt = re.sub('\(\\n[\d\w]+\n\)', '', txt)
  # remove spaces before punctuation
  txt = re.sub(f'\s([\.;:])', r'\1', txt)
  # remove reference links
  txt = re.sub('▼\w+\n', '', txt)
  # format numbers
  txt = re.sub('(?<=\d)\s(?=\d)', '', txt)
  # remove consecutive spaces
  txt = re.sub('\s{2,}', ' ', txt)
  # remove leading / trailing spaces
  txt = txt.strip()
  return txt 

def get_paragraphs(article_section):
  content = {}
  paragraph_number = '0'
  paragraph_content = []
  for child in article_section.children:
    if isinstance(child, Tag):
      if 'norm' in child.attrs.get('class'):
        if child.name == 'p':
          paragraph_content.append(child.text.strip())
        elif child.name == 'div':
          content[paragraph_number] = _clean_paragraph('\n'.join(paragraph_content))
          paragraph_number = child.find('span', {'class': 'no-parag'}).text.strip().split('.')[0]
          paragraph_content = [child.find('div', {'class': 'inline-element'}).text]
      elif 'grid-container' in child.attrs.get('class'):
        paragraph_content.append(child.text)
    content[paragraph_number] = _clean_paragraph('\n'.join(paragraph_content))
  return {k:v for k, v in content.items() if len(v) > 0}

In [83]:
main_content = BeautifulSoup(html_page, 'html.parser')
directive_section = get_directive_section(main_content)
directive_name = get_directive_name(directive_section)
content_section = get_content_section(main_content)

for chapter_section in get_chapter_sections(content_section):
  chapter_id = get_chapter_id(chapter_section)
  chapter_name = get_chapter_name(chapter_section)
  articles = len(get_article_sections(chapter_section))
  print(f'Chapter {chapter_id}: {chapter_name}')
  print(f'{articles} article(s)')
  print('')

Chapter 1: Scope, definitions and categories of undertakings and groups
3 article(s)

Chapter 2: General provisions and principles
5 article(s)

Chapter 3: Balance sheet and profit and loss account
6 article(s)

Chapter 4: Notes to the financial statements
4 article(s)

Chapter 5: Management report
3 article(s)

Chapter 6: Consolidated financial statements and reports
10 article(s)

Chapter 6a: Sustainability reporting standards
2 article(s)

Chapter 6b: Single electronic reporting format
1 article(s)

Chapter 7: Publication
5 article(s)

Chapter 8: Auditing and assurance of sustainability reporting
2 article(s)

Chapter 9: Provisions concerning exemptions and restrictions on exemptions
5 article(s)

Chapter 9a: Reporting concerning third-country undertakings
4 article(s)

Chapter 10: Report on payments to governments
8 article(s)

Chapter 10a: Report on income tax information
8 article(s)

Chapter 11: Transitional and final provisions
8 article(s)



In [84]:
nodes = []
edges = []

nodes.append(['0', 'CSRD', directive_name, 'DIRECTIVE'])


In [85]:
for chapter_section in get_chapter_sections(content_section):

  chapter_id = get_chapter_id(chapter_section)
  chapter_name = get_chapter_name(chapter_section)

  # level 1, chapter
  # chapters are included in root node
  nodes.append([ chapter_id, f'Chapter {chapter_id}', chapter_name, 'CHAPTER'])
  edges.append(['0', f'{chapter_id}', 'CONTAINS'])

  for article_section in get_article_sections(chapter_section):
    article_id = get_article_id(article_section)
    article_name = get_article_name(article_section)
    article_paragraphs = get_paragraphs(article_section)

    # level 2, article
    # articles are included in chapters
    nodes.append([f'{chapter_id}.{article_id}', f'Article {article_id}', article_name, 'ARTICLE'])
    edges.append([chapter_id, f'{chapter_id}.{article_id}', 'CONTAINS'])

    for paragraph_id, paragraph_text in article_paragraphs.items():

      # level 3, paragraph
      # paragraphs are included in articles
      nodes.append([f'{chapter_id}.{article_id}.{paragraph_id}', f'Article {article_id}({paragraph_id})', paragraph_text, 'PARAGRAPH'])
      edges.append([f'{chapter_id}.{article_id}', f'{chapter_id}.{article_id}.{paragraph_id}', 'CONTAINS'])

In [86]:
nodes_df = pd.DataFrame(nodes, columns=['id', 'label', 'content', 'group'])
edges_df = pd.DataFrame(edges, columns=['src', 'dst', 'label'])

In [87]:
# display(edges_df)
edges_df['label'].unique()

array(['CONTAINS'], dtype=object)

In [88]:
display(nodes_df)

Unnamed: 0,id,label,content,group
0,0,CSRD,DIRECTIVE 2013/34/EU OF THE EUROPEAN PARLIAMEN...,DIRECTIVE
1,1,Chapter 1,"Scope, definitions and categories of undertaki...",CHAPTER
2,1.1,Article 1,Scope,ARTICLE
3,1.1.1,Article 1(1),The coordination measures prescribed by this D...,PARAGRAPH
4,1.1.1a,Article 1(1a),The coordination measures prescribed by Articl...,PARAGRAPH
...,...,...,...,...
370,11.53.2,Article 53(2),Member States shall communicate to the Commiss...,PARAGRAPH
371,11.54,Article 54,Entry into force,ARTICLE
372,11.54.0,Article 54(0),This Directive shall enter into force on the t...,PARAGRAPH
373,11.55,Article 55,Addressees,ARTICLE


In [89]:
CSRD = nx.DiGraph()

for i, n in nodes_df.iterrows():
  CSRD.add_node(n['id'], label=n['label'], title=n['content'], group=n['group'])

for i, e in edges_df.iterrows():
  if e['label'] == 'CONTAINS':
    CSRD.add_edge(e['src'], e['dst'], label=e['label'])


In [90]:
def displayGraph(graph):

  net = Network(
    height="750px", 
    width="100%", 
    directed=True, 
    cdn_resources='remote',
    notebook=True
  )

  net.options.groups = {
      "DIRECTIVE": {
        "icon": {
            "face": 'FontAwesome',
            "code": '\uf19c',
        }
      },
      "CHAPTER": {
          "icon": {
              "face": 'FontAwesome',
              "code": '\uf02d',
          }
      },
      "ARTICLE": {                 
        "icon": {
            "face": 'FontAwesome',
            "code": '\uf07c',
          }
      },
      "PARAGRAPH": {                 
        "icon": {
            "face": 'FontAwesome',
            "code": '\uf15b',
          }
      }
  }

  net.from_nx(graph)
  net.show(f"/tmp/{uuid.uuid4().hex}.html")
  return net.html.replace(
    '<head>',
    '<head><link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" type="text/css"/>'
  )

In [91]:
node_text = nodes_df['content'].tolist()

In [92]:
print(node_text)

['DIRECTIVE 2013/34/EU OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL of 26 June 2013 on the annual financial statements, consolidated financial statements and related reports of certain types of undertakings, amending Directive 2006/43/EC of the European Parliament and of the Council and repealing Council Directives 78/660/EEC and 83/349/EEC', 'Scope, definitions and categories of undertakings and groups', 'Scope', 'The coordination measures prescribed by this Directive shall apply to the laws, regulations and administrative provisions of the Member States relating to the types of undertakings listed:\n(a) in Annex I;\n(b) in Annex II, where all of the direct or indirect members of the undertaking having otherwise unlimited liability in fact have limited liability by reason of those members being undertakings which are:\n(i) of the types listed in Annex I; or\n(ii) not governed by the law of a Member State but which have a legal form comparable to those listed in Annex I.', 'The coordi

In [93]:
from networkx import DiGraph  # Added this import


# Initialize embeddings and LLM
embeddings = OpenAIEmbeddings()
chat_model = ChatOpenAI(temperature=0)

# Create graph
CSRD_graph = nx.DiGraph()
for i, n in nodes_df.iterrows():
    CSRD_graph.add_node(n['id'], label=n['label'], title=n['content'], group=n['group'])
for i, e in edges_df.iterrows():
    if e['src'] != e['dst']:
        CSRD_graph.add_edge(e['src'], e['dst'])

# Create vector store
def create_vector_store(nodes_df):
    texts = nodes_df['content'].tolist()
    metadatas = [{'id': id} for id in nodes_df['id'].tolist()]
    return FAISS.from_texts(texts, embeddings, metadatas=metadatas)

vector_store = create_vector_store(nodes_df)
CSRD_search = vector_store.as_retriever()


# Custom Retriever
class CustomRetriever(BaseRetriever):
    retriever: Any
    knowledge_graph: DiGraph

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        # Use existing retriever to get the documents
        documents = self.retriever.get_relevant_documents(query)

        # Retrieve document Ids
        doc_ids = [doc.metadata['id'] for doc in documents]

        # Retrieve nodes
        nodes = [[node_id, self.knowledge_graph.nodes.get(node_id)] for node_id in doc_ids]
        nodes = [[node_id, node_data] for node_id, node_data in nodes if node_data is not None]

        # Build documents in relevance order
        processed_ids = set()
        supporting_documents = []

        for node_id, node_data in nodes:
            if node_data['group'] == 'PARAGRAPH' and node_id not in processed_ids:
                processed_ids.add(node_id)
                supporting_documents.append(
                    Document(
                        page_content=node_data['title'],
                        metadata={'id': node_id, 'label': node_data['label']}
                    )
                )

            # Traverse graph to get cross reference articles
            children_id = list(self.knowledge_graph.successors(node_id))
            for child_id in children_id:
                child_data = self.knowledge_graph.nodes[child_id]
                if child_data['group'] == 'PARAGRAPH' and child_id not in processed_ids:
                    processed_ids.add(child_id)
                    supporting_documents.append(
                        Document(
                            page_content=child_data['title'],
                            metadata={'id': child_id, 'label': child_data['label']}
                        )
                    )

        return supporting_documents

# Setup prompt and chain
TEMPLATE = """
Context information is below.

---------------------
{context}
---------------------

Given the context information and not prior knowledge.
Answer compliance issue related to the CSRD directive only.

If the question is not related to regulatory compliance, kindly decline to answer.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Keep the answer as concise as possible, citing articles and chapters whenever applicable.
Please do not repeat the answer and do not add any additional information.

Question: {question}

Answer:
"""

prompt = PromptTemplate(template=TEMPLATE, input_variables=["context", "question"])

# Create chain
chain_kg = RetrievalQA.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=CustomRetriever(retriever=CSRD_search, knowledge_graph=CSRD_graph),
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True
)

# Example usage
def format_response(question, answer):
    print(f"Question: {question}\n")
    print(f"Answer: {answer['result']}\n")
    print("Sources:")
    for doc in answer['source_documents']:
        print(f"- {doc.metadata['label']}: {doc.page_content[:100]}...")

question = "List all the conditions whereby a company structure is allowed to limit their reporting to business strategy only."
answer = chain_kg.invoke({"query": question})
format_response(question, answer)

Question: List all the conditions whereby a company structure is allowed to limit their reporting to business strategy only.

Answer: The provided context does not specify conditions under which a company structure is allowed to limit their reporting to business strategy only in relation to the CSRD directive. Therefore, I don't know the answer.

Sources:
- Article 23(1): Small groups shall be exempted from the obligation to draw up consolidated financial statements and ...
- Article 19(3): Member States may exempt small undertakings from the obligation to prepare management reports, provi...
- Article 48b(3): Member States shall provide that the rule set out in paragraph 1 of this Article does not apply to s...
- Article 31(1): Member States may exempt small undertakings from the obligation to publish their profit and loss acc...
