In [22]:
##################################
#XGBOOST for TCFD data prediction#
#Maintainer: Christopher Chan    #
#Version: 0.0.1                  #
#Date: 2023-05-15                #
##################################

import os, sys, re
import random
import pathlib
import xgboost

import numpy as np
import pyarrow as pa
import pandas as pd
import scipy as sp
import sklearn as sk

from tqdm import tqdm

# define paths
data_interim = pathlib.Path("../data/interim")
data_processed = pathlib.Path("../data/processed")
# Directory holding one semicolon-separated TCFD pivot CSV per company.
tcfd_pivot_dir = data_interim / "TCFD_pivot"

# Our prediction dataset is Mercedes: every file whose name starts with "Merc"
# is held out of the training pool built below.
# Directory listings come back in arbitrary order, so the names are sorted to
# keep the row order of concat_TCFD reproducible across machines and re-runs.
TCFDlist = sorted(
    entry.name
    for entry in tcfd_pivot_dir.iterdir()
    if entry.name.endswith("csv") and not entry.name.startswith("Merc")
)
if not TCFDlist:
    # Fail early with the offending path instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No training TCFD pivot CSVs found in {tcfd_pivot_dir}")

# Read every training file with pyarrow-backed dtypes, stack them, drop rows
# containing any missing value and renumber the index from 0.
# Built as one chain (no inplace mutation) so re-running the cell is idempotent.
# NOTE(review): dropna() only removes real NA values; the data appears to encode
# missing entries as the literal string "Not Disclosed", which survives this
# step - confirm whether those should be converted to NA first.
concat_TCFD = (
    pd.concat([pd.read_csv(tcfd_pivot_dir / file, sep = ";", dtype_backend = "pyarrow") for file in TCFDlist])
    .dropna()
    .reset_index(drop = True)
)

# Held-out prediction target, read with the same parser settings as the training files.
Merc_TCFD = pd.read_csv(tcfd_pivot_dir / "Mercedes-Benz_Group_AG_TCFDPivot.csv", sep = ";", dtype_backend = "pyarrow")

In [24]:
# Preview 8 random rows of the training frame (this cell does not modify it).
# Undisclosed values show up as the literal string "Not Disclosed";
# TODO: change "Not Disclosed" to NA.
# random_state is fixed so Restart & Run All displays the same rows every time.
concat_TCFD.sample(n = 8, random_state = 42)

Unnamed: 0,Date,Percentage GHG reduction target (scope unspecified),Percentage GHG reduction target – Scope 1,Percentage GHG reduction target – Scope 1+2,Percentage GHG reduction target – Scope 1+2+3,Percentage GHG reduction target – Scope 2,Percentage GHG reduction target – Scope 3,Percentage energy use reduction target,Percentage of company certified to ISO 50001,Total energy consumption,...,Total scope 3 emissions for Category 14 - Franchises,Total scope 3 emissions for Category 15 - Investments,Total scope 3 emissions for Category 2 - Capital goods,Total scope 3 emissions for Category 3 - Fuel- and energy-related activities,Total scope 3 emissions for Category 4 - Upstream transportation and distribution,Total scope 3 emissions for Category 5 - Waste generated in operations,Total scope 3 emissions for Category 6 - Business travel,Total scope 3 emissions for Category 7 - Employee commuting,Total scope 3 emissions for Category 8 - Upstream leased assets,Total scope 3 emissions for Category 9 - Downstream transportation and distribution
384,2021-03-31,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,255.18 m Gigajoule (GJ) Energy,...,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed
790,2021-12-31,Base year: 2019 Target year: 2030 : 20% Intensity,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,36.127586 m Kilowatt hour (kWh) Energy,...,Not Disclosed,Not Disclosed,Not Disclosed,1045 Metric tonnes (t) CO2e,Not Disclosed,170 Metric tonnes (t) CO2e,4172 Metric tonnes (t) CO2e,Not Disclosed,Not Disclosed,Not Disclosed
128,2022-12-31,Not Disclosed,Not Disclosed,Base year: 2019 Target year: 2024 : 20% Absolute,Base year: 2019 Target year: 2050 : 100% Absolute,Not Disclosed,Base year: 2019 Target year: 2024 : 6% Absolute,Not Disclosed,29 %,35472 Terajoule (TJ) Energy,...,Not Disclosed,Not Disclosed,510000 Metric tonnes (t) CO2e,550000 Metric tonnes (t) CO2e,820000 Metric tonnes (t) CO2e,Not Disclosed,150000 Metric tonnes (t) CO2e,Not Disclosed,Not Disclosed,Not Disclosed
873,2020-12-31,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,69.07212 m Gigajoule (GJ) Energy,...,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed
36,2021-12-31,Not Disclosed,Not Disclosed,Base year: 2017 Target year: 2024 : 27% Absolute,Not Disclosed,Not Disclosed,Base year: 2017 Target year: 2030 : 16% Absolute,Not Disclosed,Not Disclosed,2.696924 m Gigajoule (GJ) Energy,...,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed
697,2021-12-31,Base year: 2021 Target year: 2025 : 100% Absolute,Not Disclosed,Base year: 2021 Target year: 2030 : 100% Absolute,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,115000 Megawatt hour (MWh) Energy,...,Not Disclosed,Not Disclosed,Not Disclosed,2700 Metric tonnes (t) CO2e,Not Disclosed,Not Disclosed,6800 Metric tonnes (t) CO2e,5900 Metric tonnes (t) CO2e,Not Disclosed,Not Disclosed
741,2020-12-31,Not Disclosed,Not Disclosed,Base year: 2019 Target year: 2035 : 68% Absolute,Not Disclosed,Not Disclosed,Base year: 2018 Target year: 2035 : 20.9% Abso...,Not Disclosed,Not Disclosed,522606 Megawatt hour (MWh) Energy 1.779296 m G...,...,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed
560,2020-12-31,Base year: 2020 Target year: 2040 : 100% Absolute,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,1649 Terajoule (TJ) Energy,...,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed


In [25]:
# Show the column labels of the concatenated training frame.
concat_TCFD.columns

Index(['Date', 'Percentage GHG reduction target (scope unspecified)',
       'Percentage GHG reduction target – Scope 1',
       'Percentage GHG reduction target – Scope 1+2',
       'Percentage GHG reduction target – Scope 1+2+3',
       'Percentage GHG reduction target – Scope 2',
       'Percentage GHG reduction target – Scope 3',
       'Percentage energy use reduction target',
       'Percentage of company certified to ISO 50001',
       'Total energy consumption', 'Total energy consumption from biomass',
       'Total energy consumption from natural gas',
       'Total energy consumption from solid/liquid fossil fuels',
       'Total non-renewable energy consumption',
       'Total renewable energy consumption', 'Total scope 1 GHG emissions',
       'Total scope 1+2 GHG emissions (location-based)',
       'Total scope 1+2 GHG emissions (market-based)',
       'Total scope 1+2 GHG emissions (unspecified)',
       'Total scope 2 GHG emissions (location-based)',
       'Total scope 