# Data Cleaning and EDA

In [1]:
%pip install boto3

Note: you may need to restart the kernel to use updated packages.


In [27]:
import json
import boto3
import os 
from dotenv import load_dotenv 

In [28]:
load_dotenv()

True

In [52]:
file_path = "data/csrd_datapoints.json"
data = None 

with open(file_path, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [53]:
CSRD_STANDARDS = {
    "ESRS_2": "General Disclosures",
    "E1": "Climate Change",
    "E2": "Pollution",
    "E3": "Water and Marine Resources",
    "E4": "Biodiversity and Ecosystems",
    "E5": "Resources and Circular Economy",
    "S1": "Own Workforce",
    "S2": "Workers in the Value Chain",
    "S3": "Affected Communities",
    "S4": "Customers and End-Users",
    "G1": "Business Conduct",
    
}

In [54]:
csrd_datapoints = []

def clean_csrd_datapoints(data):
    for item in data:
        for _item in data[item]:
            esrs = str(_item["ESRS"]).replace(" ", "_")
            
            csrd_datapoints.append({
                "PK": f"{esrs}#{_item['ID']}",
                "SK": f"{esrs}#{_item['ID']}",
                "standard": CSRD_STANDARDS.get(esrs),
                **_item,
            })
    return csrd_datapoints 

In [55]:
csrd = clean_csrd_datapoints(data)

print(csrd[0])

{'PK': 'ESRS_2#BP-1_01', 'SK': 'ESRS_2#BP-1_01', 'standard': 'General Disclosures', 'ID': 'BP-1_01', 'ESRS': 'ESRS 2', 'DR': 'BP-1', 'Paragraph': '5 a', 'Related AR': '', 'Name': 'Basis for preparation of sustainability statement', 'Data Type': 'semi-narrative', 'Conditional or alternative DP': '', 'May \n[V]': '', 'Appendix B - ESRS 2 \n(SFDR + PILLAR 3 + Benchmark + CL)': '', 'DPs to be disclosed in case of phased-in [Appendix C - ESRS 1]\nUndertaking less than 750 employees': '', 'Appendix C - ESRS 1\n[DPs subject to phased-in]\n': ''}


## Store CSRD DataPoints to DDB 

In [56]:
dynamodb = boto3.resource(
    "dynamodb",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    region_name="us-east-2"
)

In [57]:
TABLE_NAME = "CoreTable-CoreTable97EB8292-12YM841LJM4YC"
table = dynamodb.Table(TABLE_NAME)

In [64]:
def batch_write(items):
    with table.batch_writer() as batch:
        for item in items:
            print(item)
            batch.put_item(Item=item)

In [65]:
batch_write(csrd)

{'PK': 'ESRS_2#BP-1_01', 'SK': 'ESRS_2#BP-1_01', 'standard': 'General Disclosures', 'ID': 'BP-1_01', 'ESRS': 'ESRS 2', 'DR': 'BP-1', 'Paragraph': '5 a', 'Related AR': '', 'Name': 'Basis for preparation of sustainability statement', 'Data Type': 'semi-narrative', 'Conditional or alternative DP': '', 'May \n[V]': '', 'Appendix B - ESRS 2 \n(SFDR + PILLAR 3 + Benchmark + CL)': '', 'DPs to be disclosed in case of phased-in [Appendix C - ESRS 1]\nUndertaking less than 750 employees': '', 'Appendix C - ESRS 1\n[DPs subject to phased-in]\n': ''}
{'PK': 'ESRS_2#BP-1_02', 'SK': 'ESRS_2#BP-1_02', 'standard': 'General Disclosures', 'ID': 'BP-1_02', 'ESRS': 'ESRS 2', 'DR': 'BP-1', 'Paragraph': '5 b i', 'Related AR': '', 'Name': 'Scope of consolidation of consolidated sustainability statement is same as for financial statements', 'Data Type': 'narrative', 'Conditional or alternative DP': 'Conditional', 'May \n[V]': '', 'Appendix B - ESRS 2 \n(SFDR + PILLAR 3 + Benchmark + CL)': '', 'DPs to be discl

ClientError: An error occurred (ValidationException) when calling the BatchWriteItem operation: Provided list of item keys contains duplicates