# CSRD RAG Assistant

In [14]:
# Install the HTTP, HTML-parsing, tabular and graph libraries used by this notebook.
%pip install requests beautifulsoup4 pandas networkx pyvis

Note: you may need to restart the kernel to use updated packages.


In [15]:
import requests


# EUR-Lex HTML rendering of the consolidated directive text parsed by this notebook.
csrd_report_url = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:02013L0034-20240109&qid=1712714544806'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the directive.
response = requests.get(csrd_report_url, timeout=30)
response.raise_for_status()
html_page = response.text

In [16]:
# Preview only the beginning of the page; printing the whole document floods the cell output.
print(html_page[:1000])


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML//EN" "xhtml-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<!-- fmx2xhtml # converter_version:9.15.0 # generated_on:20240222-1913 -->
<head><meta name="format-detection" content="telephone=no"/><meta http-equiv='X-UA-Compatible' content='IE=edge'/><meta name='viewport' content='width=device-width, initial-scale=1'><meta name='WT.z_usr_lan' content='EN'>
   <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
   <title>Consolidated TEXT: 32013L0034 — EN — 09.01.2024</title>
   <script type="text/javascript" src="/ruxitagentjs_ICANVfgqrux_10299241001084140.js" data-dtconfig="app=47d4c64c3b67ec69|cuc=m097nmfl|agentId=2f5b9a82d4160179|mel=100000|mb=null|featureHash=ICANVfgqrux|dpvc=1|iub=null|lastModification=1736535059403|tp=500,50,0|rdnt=1|uxrgce=1|agentUri=/ruxitagentjs_ICANVfgqrux_10299241001084140.js|reportUrl=/rb_39a3e95b-5423-482c-879b-99ef235dffeb|rid=RID_-115705826|rpid=2105131548|domain=europa.eu"></script><link 

In [17]:
def get_directive_section(main_content):
  """Return the result of searching the parsed page for the `<div>` with class 'eli-main-title'."""
  title_selector = {'class': 'eli-main-title'}
  return main_content.find('div', title_selector)

def get_content_section(main_content):
  """Return the result of searching the parsed page for a `<div>` with class 'eli-subdivision'."""
  body_selector = {'class': 'eli-subdivision'}
  return main_content.find('div', body_selector)

def get_chapter_sections(content_section):
  """Return the `<div>` elements that are direct children of the content section (one per chapter)."""
  chapter_divs = content_section.find_all('div', recursive=False)
  return chapter_divs

def get_article_sections(chapter_section):
  """Return the direct-child `<div class="eli-subdivision">` elements of a chapter (one per article)."""
  article_selector = {'class': 'eli-subdivision'}
  return chapter_section.find_all('div', article_selector, recursive=False)

def get_directive_name(directive_section) -> str:
  """Build the directive title by joining every `<p class="title-doc-first">` text with spaces.

  Each fragment is stripped of surrounding whitespace before joining; an empty
  string is returned when no such paragraph exists.
  """
  title_parts = directive_section.find_all('p', {'class': 'title-doc-first'})
  return ' '.join(part.text.strip() for part in title_parts)

def get_chapter_name(chapter_section) -> str:
  """Return the chapter title: the `<p class="title-division-2">` text, stripped and capitalised."""
  heading = chapter_section.find('p', {'class': 'title-division-2'})
  raw_title = heading.text
  return raw_title.strip().capitalize()

def get_chapter_id(chapter_section) -> str:
  """Return the chapter identifier (e.g. '6a') from the `<p class="title-division-1">` heading.

  The literal word 'CHAPTER' is removed from the heading text and the remainder
  is stripped of surrounding whitespace.
  """
  heading_text = chapter_section.find('p', {'class': 'title-division-1'}).text.strip()
  return heading_text.replace('CHAPTER', '').strip()

def get_article_name(article_section) -> str:
  """Return the article title: the stripped text of its `<p class="stitle-article-norm">` element."""
  subtitle = article_section.find('p', {'class': 'stitle-article-norm'})
  return subtitle.text.strip()

def get_article_id(article_section) -> str:
  """Return the article identifier (e.g. '19a') from the `<p class="title-article-norm">` heading.

  The leading word 'Article' (optionally preceded by a double quote) and any
  whitespace that follows it are removed from the heading text.
  """
  article_id = article_section.find('p', {'class': 'title-article-norm'}).text.strip()
  # Raw string: the regex escape for whitespace must not be interpreted as an
  # (invalid) Python string escape, which triggers a SyntaxWarning on recent Pythons.
  article_id = re.sub(r'"?Article\s*', '', article_id).strip()
  return article_id

In [18]:
from bs4.element import Tag
import re


In [19]:
def _clean_paragraph(txt):
  # remove multiple break lines
  txt = re.sub('\n+', '\n', txt)
  # simplifies bullet points
  txt = re.sub('(\([\d\w]+\)\s?)\n', r'\1\t', txt)
  # simplifies quote
  txt = re.sub('‘', '\'', txt)
  # some weird references to other articles
  txt = re.sub('\(\\n[\d\w]+\n\)', '', txt)
  # remove spaces before punctuation
  txt = re.sub(f'\s([\.;:])', r'\1', txt)
  # remove reference links
  txt = re.sub('▼\w+\n', '', txt)
  # format numbers
  txt = re.sub('(?<=\d)\s(?=\d)', '', txt)
  # remove consecutive spaces
  txt = re.sub('\s{2,}', ' ', txt)
  # remove leading / trailing spaces
  txt = txt.strip()
  return txt 

def get_paragraphs(article_section):
  """Split an article element into its paragraphs.

  Walks the direct children of `article_section` and returns a dict mapping a
  paragraph number (string) to its cleaned text:

  * a `<p>` with class 'norm' is appended to the current paragraph;
  * a `<div>` with class 'norm' starts a new paragraph, whose number is the
    text of its `span.no-parag` up to the first '.', and whose first chunk is
    the text of its `div.inline-element`;
  * an element with class 'grid-container' is appended to the current paragraph.

  Text found before the first numbered paragraph is stored under '0'.
  Paragraphs whose cleaned text is empty are dropped from the result.
  """
  content = {}
  paragraph_number = '0'
  paragraph_content = []
  for child in article_section.children:
    if not isinstance(child, Tag):
      continue
    # Tags without a class attribute yield None here; treat that as "no classes"
    # instead of failing on the membership test.
    css_classes = child.attrs.get('class') or []
    if 'norm' in css_classes:
      if child.name == 'p':
        paragraph_content.append(child.text.strip())
      elif child.name == 'div':
        # A numbered paragraph begins: store what was collected so far, then start over.
        content[paragraph_number] = _clean_paragraph('\n'.join(paragraph_content))
        paragraph_number = child.find('span', {'class': 'no-parag'}).text.strip().split('.')[0]
        paragraph_content = [child.find('div', {'class': 'inline-element'}).text]
    elif 'grid-container' in css_classes:
      paragraph_content.append(child.text)
  # Store the last open paragraph once, after the loop, rather than re-cleaning it per child.
  content[paragraph_number] = _clean_paragraph('\n'.join(paragraph_content))
  return {k: v for k, v in content.items() if len(v) > 0}

In [20]:
from bs4 import BeautifulSoup

# Parse the downloaded page, then locate the directive title and the content section.
main_content = BeautifulSoup(html_page, 'html.parser')
directive_section = get_directive_section(main_content)
directive_name = get_directive_name(directive_section)
content_section = get_content_section(main_content)

# Print one summary per chapter: identifier, title and number of articles.
for chapter_section in get_chapter_sections(content_section):
  chapter_id = get_chapter_id(chapter_section)
  chapter_name = get_chapter_name(chapter_section)
  articles = len(get_article_sections(chapter_section))
  print(f'Chapter {chapter_id}: {chapter_name}\n{articles} article(s)\n')

Chapter 1: Scope, definitions and categories of undertakings and groups
3 article(s)

Chapter 2: General provisions and principles
5 article(s)

Chapter 3: Balance sheet and profit and loss account
6 article(s)

Chapter 4: Notes to the financial statements
4 article(s)

Chapter 5: Management report
3 article(s)

Chapter 6: Consolidated financial statements and reports
10 article(s)

Chapter 6a: Sustainability reporting standards
2 article(s)

Chapter 6b: Single electronic reporting format
1 article(s)

Chapter 7: Publication
5 article(s)

Chapter 8: Auditing and assurance of sustainability reporting
2 article(s)

Chapter 9: Provisions concerning exemptions and restrictions on exemptions
5 article(s)

Chapter 9a: Reporting concerning third-country undertakings
4 article(s)

Chapter 10: Report on payments to governments
8 article(s)

Chapter 10a: Report on income tax information
8 article(s)

Chapter 11: Transitional and final provisions
8 article(s)



In [21]:
import pandas as pd

# Node rows are [id, label, content, group]; edge rows are [src, dst, label].
# The directive itself is the root node, with id '0'.
nodes = [['0', 'CSRD', directive_name, 'DIRECTIVE']]
edges = []


In [22]:
# Make this cell safe to re-run: keep only the root directive node (first entry
# of `nodes`, created when the lists were initialised) and rebuild everything
# below it, so chapters, articles and paragraphs are never appended twice.
del nodes[1:]
edges.clear()

for chapter_section in get_chapter_sections(content_section):

  chapter_id = get_chapter_id(chapter_section)
  chapter_name = get_chapter_name(chapter_section)

  # level 1, chapter
  # chapters are included in root node
  nodes.append([chapter_id, f'Chapter {chapter_id}', chapter_name, 'CHAPTER'])
  edges.append(['0', chapter_id, 'CONTAINS'])

  for article_section in get_article_sections(chapter_section):
    article_id = get_article_id(article_section)
    article_name = get_article_name(article_section)
    article_paragraphs = get_paragraphs(article_section)
    # node ids are hierarchical: <chapter>.<article>[.<paragraph>]
    article_key = f'{chapter_id}.{article_id}'

    # level 2, article
    # articles are included in chapters
    nodes.append([article_key, f'Article {article_id}', article_name, 'ARTICLE'])
    edges.append([chapter_id, article_key, 'CONTAINS'])

    for paragraph_id, paragraph_text in article_paragraphs.items():
      paragraph_key = f'{article_key}.{paragraph_id}'

      # level 3, paragraph
      # paragraphs are included in articles
      nodes.append([paragraph_key, f'Article {article_id}({paragraph_id})', paragraph_text, 'PARAGRAPH'])
      edges.append([article_key, paragraph_key, 'CONTAINS'])

In [23]:
# Tabular views of the graph: one row per node and one row per edge.
node_columns = ['id', 'label', 'content', 'group']
edge_columns = ['src', 'dst', 'label']
nodes_df = pd.DataFrame(nodes, columns=node_columns)
edges_df = pd.DataFrame(edges, columns=edge_columns)

In [32]:
# Sanity check: list the distinct relationship labels present in the edge table.
edges_df['label'].unique()

array(['CONTAINS'], dtype=object)

In [30]:
# Inspect the node table (columns: id, label, content, group).
display(nodes_df)

Unnamed: 0,id,label,content,group
0,0,CSRD,DIRECTIVE 2013/34/EU OF THE EUROPEAN PARLIAMEN...,DIRECTIVE
1,1,Chapter 1,"Scope, definitions and categories of undertaki...",CHAPTER
2,1.1,Article 1,Scope,ARTICLE
3,1.1.1,Article 1(1),The coordination measures prescribed by this D...,PARAGRAPH
4,1.1.1a,Article 1(1a),The coordination measures prescribed by Articl...,PARAGRAPH
...,...,...,...,...
370,11.53.2,Article 53(2),Member States shall communicate to the Commiss...,PARAGRAPH
371,11.54,Article 54,Entry into force,ARTICLE
372,11.54.0,Article 54(0),This Directive shall enter into force on the t...,PARAGRAPH
373,11.55,Article 55,Addressees,ARTICLE


In [33]:
import networkx as nx

# Directed graph of the directive: node attributes come from the node table,
# and only 'CONTAINS' relationships are added as edges.
CSRD = nx.DiGraph()

for node in nodes_df.itertuples(index=False):
  CSRD.add_node(node.id, label=node.label, title=node.content, group=node.group)

for edge in edges_df.itertuples(index=False):
  if edge.label != 'CONTAINS':
    continue
  CSRD.add_edge(edge.src, edge.dst, label=edge.label)


DiGraph with 375 nodes and 374 edges


In [26]:
from pyvis.network import Network
import os
import uuid

def displayGraph(graph):
  """Render a networkx graph with pyvis and return the generated HTML as a string.

  Each node group (DIRECTIVE, CHAPTER, ARTICLE, PARAGRAPH) is drawn with its own
  Font Awesome icon. The page is written to a uniquely named file in the
  system temporary directory, and the returned HTML has a Font Awesome 4.7.0
  stylesheet link injected into its `<head>` so the icon glyphs can render.

  Args:
    graph: the networkx graph to visualise; node `group` attributes select the icon.

  Returns:
    The HTML source of the visualisation (str).
  """
  # Local import: keeps the function self-contained without touching the file's import cell.
  import tempfile

  net = Network(
    height="750px",
    width="100%",
    directed=True,
    cdn_resources='remote',
    notebook=True
  )

  # One icon per node group; `code` is the glyph's code point in the icon font.
  net.options.groups = {
      "DIRECTIVE": {
        "icon": {
            "face": 'FontAwesome',
            "code": '\uf19c',
        }
      },
      "CHAPTER": {
          "icon": {
              "face": 'FontAwesome',
              "code": '\uf02d',
          }
      },
      "ARTICLE": {
        "icon": {
            "face": 'FontAwesome',
            "code": '\uf07c',
          }
      },
      "PARAGRAPH": {
        "icon": {
            "face": 'FontAwesome',
            "code": '\uf15b',
          }
      }
  }

  net.from_nx(graph)
  # Use the platform's temp directory rather than a hard-coded '/tmp', which
  # does not exist on every operating system.
  output_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.html")
  net.show(output_path)
  return net.html.replace(
    '<head>',
    '<head><link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" type="text/css"/>'
  )

In [28]:
# Build the visualisation and print the returned HTML source as plain text.
print(displayGraph(CSRD))

/tmp/1c23aee5f2554d478f835ce6e8006a97.html
<html>
    <head><link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" type="text/css"/>
        <meta charset="utf-8">
        
            <script>function neighbourhoodHighlight(params) {
  // console.log("in nieghbourhoodhighlight");
  allNodes = nodes.get({ returnType: "Object" });
  // originalNodes = JSON.parse(JSON.stringify(allNodes));
  // if something is selected:
  if (params.nodes.length > 0) {
    highlightActive = true;
    var i, j;
    var selectedNode = params.nodes[0];
    var degrees = 2;

    // mark all nodes as hard to read.
    for (let nodeId in allNodes) {
      // nodeColors[nodeId] = allNodes[nodeId].color;
      allNodes[nodeId].color = "rgba(200,200,200,0.5)";
      if (allNodes[nodeId].hiddenLabel === undefined) {
        allNodes[nodeId].hiddenLabel = allNodes[nodeId].label;
        allNodes[nodeId].label = undefined;
      }
    }
    var connectedNodes = netw