# Data Cleaning and EDA

In [1]:
%pip install boto3

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import boto3
import os 
from dotenv import load_dotenv 

In [3]:
load_dotenv()

True

In [4]:
file_path = "data/csrd_datapoints.json"
data = None 

with open(file_path, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [5]:
CSRD_STANDARDS = {
    "ESRS_2": "General Disclosures",
    "E1": "Climate Change",
    "E2": "Pollution",
    "E3": "Water and Marine Resources",
    "E4": "Biodiversity and Ecosystems",
    "E5": "Resources and Circular Economy",
    "S1": "Own Workforce",
    "S2": "Workers in the Value Chain",
    "S3": "Affected Communities",
    "S4": "Customers and End-Users",
    "G1": "Business Conduct",
    
}

In [6]:
csrd_datapoints = []

def clean_csrd_datapoints(data):
    for item in data:
        for _item in data[item]:
            esrs = str(_item["ESRS"]).replace(" ", "_")
            csrd_datapoints.append({
                "PK": f"{esrs}#{_item['ID']}",
                "SK": f"{esrs}#{_item['ID']}",
                "standard": CSRD_STANDARDS.get(esrs),
                **_item,
            })
    return csrd_datapoints 

In [7]:
data = clean_csrd_datapoints(data)

print(data[0])

{'PK': 'ESRS_2#BP-1_01', 'SK': 'ESRS_2#BP-1_01', 'standard': 'General Disclosures', 'ID': 'BP-1_01', 'ESRS': 'ESRS 2', 'DR': 'BP-1', 'Paragraph': '5 a', 'Related AR': nan, 'Name': 'Basis for preparation of sustainability statement', 'Data Type': 'semi-narrative', 'Conditional or alternative DP': nan, 'May \n[V]': nan, 'Appendix B - ESRS 2 \n(SFDR + PILLAR 3 + Benchmark + CL)': nan, 'DPs to be disclosed in case of phased-in [Appendix C - ESRS 1]\nUndertaking less than 750 employees': nan, 'Appendix C - ESRS 1\n[DPs subject to phased-in]\n': nan}


## Store CSRD DataPoints to DDB 

In [18]:
dynamodb = boto3.resource(
    "dynamodb",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    region_name="us-east-2"
)

In [19]:
TABLE_NAME = "CoreTable-CoreTable97EB8292-12YM841LJM4YC"
table = dynamodb.Table(TABLE_NAME)

In [20]:
item = {
    "PK": "test#212",
    "SK": "test#212",
    "id": "123",  # Primary Key
    "name": "John Doe",
    "email": "john.doe@example.com",
    "age": 30
}

response = table.put_item(Item=item)
print("Data inserted successfully:", response)


Data inserted successfully: {'ResponseMetadata': {'RequestId': 'P3VGC2P0IPR7F54JFQNL3HE4K3VV4KQNSO5AEMVJF66Q9ASUAAJG', 'HTTPStatusCode': 200, 'HTTPHeaders': {'server': 'Server', 'date': 'Sat, 15 Feb 2025 02:28:14 GMT', 'content-type': 'application/x-amz-json-1.0', 'content-length': '2', 'connection': 'keep-alive', 'x-amzn-requestid': 'P3VGC2P0IPR7F54JFQNL3HE4K3VV4KQNSO5AEMVJF66Q9ASUAAJG', 'x-amz-crc32': '2745614147'}, 'RetryAttempts': 0}}


In [21]:
def batch_write(items):
    with table.batch_writer() as batch:
        for item in items:
            batch.put_item(Item=item)

In [22]:
batch_write(data)

TypeError: Float types are not supported. Use Decimal types instead.