In [2]:
##############################
#03_DataFrame example code   #
#Maintainer: Christopher Chan#
#Version: 0.0.2              #
#Date: 2023-05-10            #
##############################

# Just like we are used to, let's import all our necessary modules here
import os, sys, re
import random
import pathlib

import datetime as dt
import numpy as np
import pyarrow as pa
import pandas as pd


# Folders holding our data, given relative to this notebook's working directory
data_raw, data_interim, data_external, data_processed = (
    pathlib.Path(folder)
    for folder in (
        "../data/raw",
        "../data/interim",
        "../data/external",
        "../data/processed",
    )
)

## Welcome to 03_DataFrames
This is where what you learnt in the previous class will hopefully start to make sense in a tabular data environment.
Although manipulating tables and dataframes in Python/R or in Excel gives much the same outcome,
the massive gains in control, speed, and read/write ability justify moving your workflow to a programming language.

### 03_DataFrames Goals:
- Read and write DataFrames
- Exploring types in DataFrames
- Indexing and subsetting DataFrames
- Joins, Merge, Pivot

> This is the first part of the Data Science cycle, where we will learn to import and tidy the data
![data-science-explore](../readme_figs/data-science-explore.png)

#### Introducing CSV
If you have always used Excel, you may have noticed that a sheet has a row limit of 1,048,576; anything larger requires another sheet.
If you use any other operating system, you still need an Excel license to read xlsx files.
CSV (Comma-Separated Values) is a:
- Platform independent
- Basic
- Row-based dataframe format

As the name suggests, values are separated by commas; you can open up `../data/external/module_info.csv` to take a look!
The advantages of using CSV:
- Can be edited with any basic text editing software (notepad, excel, word...)
- Much faster read and write in any programming language
- Does not take up all your memory (i.e. No more excel crashes)
- Forces standard format in tables (1 row index, 1 column index)

The disadvantages of using CSV:
- No cool Excel-like collaborative editing
- No excel functions (<-- We can just do all of this in python)
- No colours
- No crazy multi-column, multi-row index support (This forces good practice)

In the E&S data, the Answer field contains a lot of commas (","), so we will use a semicolon (";") as the separator instead!

In [3]:
# Pandas 2.0
# Read our E&S data with 50k samples; this file uses ";" as its separator
sample_csv = f"{data_raw}/ESData_sample.csv"
ESData_sample = pd.read_csv(sample_csv, sep = ";")
# Alternative read that asks pandas for pyarrow-backed column types instead
#ESData_sample = pd.read_csv(sample_csv, sep = ";", dtype_backend = "pyarrow")

# Full E&S data, stored as parquet and read with pyarrow-backed column types
full_parquet = f"{data_raw}/ESData_full.parquet"
ESData = pd.read_parquet(full_parquet, engine = "pyarrow", dtype_backend = "pyarrow")
# .size is the total number of cells (rows x columns), not the number of rows
print(ESData.size)

# Ending the cell with the DataFrame's name displays its first and last rows
ESData_sample

16999983


Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
0,4904,EOPol1.2,Does the company disclose an anti-discriminati...,2021-07-27 00:00:00,Yes,2022-07-28 00:00:00,False,1049,"Expedia Group, Inc.",25301020,"USA,CHN,FRA,AUS,NOR,SWE,DNK,FIN,BEL,GBR,ITA,CH...",EMPLOYEE CODE OF CONDUCT,Policy,6,Wrong Source,We will not tolerate discrimination of any kin...,https://s27.q4cdn.com/708721433/files/doc_down...
1,4946,EMS6,Does the company disclose the number of incide...,2020-12-31 00:00:00,No,2023-02-15 00:00:00,False,844,"S&P Global, Inc.",40203040,"ARE,ARG,AUS,AUT,BEL,BLR,BMU,BRA,BRB,CAN,CHE,CH...",,,,,,
2,4527,WatMon1.2,Does the company disclose details on freshwate...,2021-12-31 00:00:00,Not Meaningful,2023-03-20 00:00:00,False,238,Alpha Services & Holdings SA,40101010,"ALB,BGR,CYP,DEU,GBR,GRC,IRL,JEY,LUX,MKD,ROU,SR...",,,,,,
3,4821,PosNRE2,Does the company disclose measures to mitigate...,2020-12-31 00:00:00,No,2023-03-30 00:00:00,False,736,Abbott Laboratories,35101010,"ARE,ARG,AUS,AUT,BEL,BGD,BGR,BHS,BIH,BMU,BOL,BR...",,,,,,
4,4657,SupSt1.0,Does the company disclose a supplier labour po...,2022-11-01 00:00:00,Yes,2023-03-31 00:00:00,False,1572,Hapag-Lloyd AG,20303010,"DEU,FRA,NLD,BEL,USA,VNM,JPN,MLT,MEX,NZL,GBR,IT...",Hapag-Lloyd Supplier Code of Conduct 2022,Policy (Supplier),1,,• Prohibition of any discrimination based on e...,https://www.hapag-lloyd.com/content/dam/websit...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,4307,HRDue1.13,Does the company disclose that its suppliers a...,2020-12-31 00:00:00,Yes,2023-03-27 00:00:00,False,887,Texas Instruments Incorporated,45301020,"USA,CAN,ISR,DNK,NLD,DEU,FRA,GBR,PHL,SGP,MYS,KO...",2020 CORPORATE CITIZENSHIP REPORT,Sustainability Report,32,,To manage human rights and eliminate violation...,https://www.ti.com/lit/ml/szzo015/szzo015.pdf?...
49996,4541,WatInv2.5,Does the company disclose metrics for withdraw...,2021-12-31 00:00:00,Yes,2023-03-04 00:00:00,False,556,Electricite de France SA,55101010,"AUS,BEL,BGR,BRA,CAN,CHE,CHL,CHN,CUW,CZE,DEU,DJ...","EDF group's Environmental, Social and Governan...",Company website,,,E-Water resources management (sheet)\nDrinking...,https://www.edf.fr/sites/groupe/files/2022-04/...
49997,5026,WatInv4.0-x,Total water withdrawn from areas with high wat...,2021-12-31 00:00:00,7.339262 m Cubic metre (m3),2023-02-23 00:00:00,False,854,"NextEra Energy, Inc.",55101010,"USA,LKA,NLD,CAN,CYM,ESP",2022 ESG Report,Sustainability Report,62,,percentage of\neach in regions of high or extr...,https://www.investor.nexteraenergy.com/~/media...
49998,4583,HRSup5.0,Does the company express an expectation on sup...,2021-12-31 00:00:00,Not Meaningful,2023-04-24 00:00:00,False,1125,Las Vegas Sands Corp.,25301010,"BMU,CHN,CYM,HKG,IND,JPN,KOR,MAC,MUS,MYS,NLD,PR...",,,,,,


In [3]:
# Check the size of our sample: .shape is a (rows, columns) tuple
sample_shape = ESData_sample.shape
n_rows, n_cols = sample_shape
print(sample_shape)
print(f"Our sample E&S DataFrame have {n_rows} rows, and {n_cols} columns!")

(50000, 17)
Our sample E&S DataFrame have 50000 rows, and 17 columns!


### We should probably explore the data a little bit more...
- Look at it
- Find some things about it
- Look at the data type

In [4]:
# Look at the heads and tails, and random samples
# head() with no argument shows the first 5 rows of the DataFrame
ESData_sample.head()

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
0,4904,EOPol1.2,Does the company disclose an anti-discriminati...,2021-07-27 00:00:00,Yes,2022-07-28 00:00:00,False,1049,"Expedia Group, Inc.",25301020,"USA,CHN,FRA,AUS,NOR,SWE,DNK,FIN,BEL,GBR,ITA,CH...",EMPLOYEE CODE OF CONDUCT,Policy,6.0,Wrong Source,We will not tolerate discrimination of any kin...,https://s27.q4cdn.com/708721433/files/doc_down...
1,4946,EMS6,Does the company disclose the number of incide...,2020-12-31 00:00:00,No,2023-02-15 00:00:00,False,844,"S&P Global, Inc.",40203040,"ARE,ARG,AUS,AUT,BEL,BLR,BMU,BRA,BRB,CAN,CHE,CH...",,,,,,
2,4527,WatMon1.2,Does the company disclose details on freshwate...,2021-12-31 00:00:00,Not Meaningful,2023-03-20 00:00:00,False,238,Alpha Services & Holdings SA,40101010,"ALB,BGR,CYP,DEU,GBR,GRC,IRL,JEY,LUX,MKD,ROU,SR...",,,,,,
3,4821,PosNRE2,Does the company disclose measures to mitigate...,2020-12-31 00:00:00,No,2023-03-30 00:00:00,False,736,Abbott Laboratories,35101010,"ARE,ARG,AUS,AUT,BEL,BGD,BGR,BHS,BIH,BMU,BOL,BR...",,,,,,
4,4657,SupSt1.0,Does the company disclose a supplier labour po...,2022-11-01 00:00:00,Yes,2023-03-31 00:00:00,False,1572,Hapag-Lloyd AG,20303010,"DEU,FRA,NLD,BEL,USA,VNM,JPN,MLT,MEX,NZL,GBR,IT...",Hapag-Lloyd Supplier Code of Conduct 2022,Policy (Supplier),1.0,,• Prohibition of any discrimination based on e...,https://www.hapag-lloyd.com/content/dam/websit...


In [5]:
# tail() with no argument shows the last 5 rows of the DataFrame
ESData_sample.tail()

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
49995,4307,HRDue1.13,Does the company disclose that its suppliers a...,2020-12-31 00:00:00,Yes,2023-03-27 00:00:00,False,887,Texas Instruments Incorporated,45301020,"USA,CAN,ISR,DNK,NLD,DEU,FRA,GBR,PHL,SGP,MYS,KO...",2020 CORPORATE CITIZENSHIP REPORT,Sustainability Report,32.0,,To manage human rights and eliminate violation...,https://www.ti.com/lit/ml/szzo015/szzo015.pdf?...
49996,4541,WatInv2.5,Does the company disclose metrics for withdraw...,2021-12-31 00:00:00,Yes,2023-03-04 00:00:00,False,556,Electricite de France SA,55101010,"AUS,BEL,BGR,BRA,CAN,CHE,CHL,CHN,CUW,CZE,DEU,DJ...","EDF group's Environmental, Social and Governan...",Company website,,,E-Water resources management (sheet)\nDrinking...,https://www.edf.fr/sites/groupe/files/2022-04/...
49997,5026,WatInv4.0-x,Total water withdrawn from areas with high wat...,2021-12-31 00:00:00,7.339262 m Cubic metre (m3),2023-02-23 00:00:00,False,854,"NextEra Energy, Inc.",55101010,"USA,LKA,NLD,CAN,CYM,ESP",2022 ESG Report,Sustainability Report,62.0,,percentage of\neach in regions of high or extr...,https://www.investor.nexteraenergy.com/~/media...
49998,4583,HRSup5.0,Does the company express an expectation on sup...,2021-12-31 00:00:00,Not Meaningful,2023-04-24 00:00:00,False,1125,Las Vegas Sands Corp.,25301010,"BMU,CHN,CYM,HKG,IND,JPN,KOR,MAC,MUS,MYS,NLD,PR...",,,,,,
49999,4680,SupSt1.104,Does the company disclose a supplier policy pr...,2022-08-31 00:00:00,No,2023-02-03 00:00:00,False,738,Accenture Plc,45102010,"AGO,AND,ARE,ARG,AUS,AUT,BEL,BGD,BGR,BMU,BOL,BR...",,,,,,


In [6]:
# Random sample of 4 rows
# No random_state is given, so every run of this cell shows a different set of rows
ESData_sample.sample(n = 4)

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
12940,4872,PMH4.1,Does the company disclose the existence of str...,2021-12-31 00:00:00,No,2023-03-06 00:00:00,False,773,"Lumen Technologies, Inc.",50101010,"ARG,AUS,AUT,BEL,BGR,BMU,BRA,CAN,CHE,CHL,CHN,CO...",,,,,,
22496,4753,SupCP2.03,Does the company disclose the existence of sup...,2022-02-28 00:00:00,No,2023-02-07 00:00:00,False,971,"CarMax, Inc.",25504050,"USA,BMU,LBN,VEN,SUR,BOL,CHN",,,,,,
22973,4949,SusRep13,Does the company disclose the level of externa...,2021-12-31 00:00:00,"Yes, limited assurance",2023-02-20 00:00:00,False,780,Colgate-Palmolive Company,30301010,"USA,CZE,NLD,ESP,BRA,CHE,DEU,GBR,SGP,ITA,FRA,ME...",Scope 1 and Scope 2 Assurance Statement,Company website,2.0,,Level of Assurance and Qualifications:\n Limited,https://www.colgatepalmolive.com/content/dam/c...
6918,4830,SupSt2.7,Does the company disclose a supplier policy th...,2020-12-31 00:00:00,No,2023-03-17 00:00:00,False,1190,Eversource Energy,55101010,"NLD,USA",,,,,,


In [7]:
# We can access an individual column by putting its name in square brackets
# The result is a single column (a pandas Series), not a full table
ESData_sample["Question"]

0        Does the company disclose an anti-discriminati...
1        Does the company disclose the number of incide...
2        Does the company disclose details on freshwate...
3        Does the company disclose measures to mitigate...
4        Does the company disclose a supplier labour po...
                               ...                        
49995    Does the company disclose that its suppliers a...
49996    Does the company disclose metrics for withdraw...
49997    Total water withdrawn from areas with high wat...
49998    Does the company express an expectation on sup...
49999    Does the company disclose a supplier policy pr...
Name: Question, Length: 50000, dtype: object

In [8]:
# Multiple columns: pass a list of column names inside the square brackets
# The result is a smaller DataFrame holding only those columns
ESData_sample[["FactorId", "Question"]]

Unnamed: 0,FactorId,Question
0,4904,Does the company disclose an anti-discriminati...
1,4946,Does the company disclose the number of incide...
2,4527,Does the company disclose details on freshwate...
3,4821,Does the company disclose measures to mitigate...
4,4657,Does the company disclose a supplier labour po...
...,...,...
49995,4307,Does the company disclose that its suppliers a...
49996,4541,Does the company disclose metrics for withdraw...
49997,5026,Total water withdrawn from areas with high wat...
49998,4583,Does the company express an expectation on sup...


In [9]:
# Describe and data type info
# describe() finds the numeric columns automatically and gives us summary
# statistics for each of them: count, mean, std, min, quartiles and max
ESData_sample.describe()

Unnamed: 0,FactorId,AgentId,AgentGics
count,50000.0,50000.0,50000.0
mean,4725.5569,713.90094,32121400.0
std,236.589686,394.393798,14124140.0
min,4244.0,1.0,10101010.0
25%,4558.0,404.0,20106020.0
50%,4743.0,702.0,30202030.0
75%,4919.0,1022.0,40301050.0
max,5255.0,1598.0,60201030.0


In [10]:
# Data Types that we learnt
# We see that Date and PublicationDate are still stored as generic "object"
# columns and not as dates yet, we can change that!
# info() prints its report by itself and returns None, so we call it directly:
# wrapping it in print() would add a stray "None" line underneath the report
ESData_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   FactorId            50000 non-null  int64 
 1   Name                50000 non-null  object
 2   Question            50000 non-null  object
 3   Date                50000 non-null  object
 4   Answer              49998 non-null  object
 5   PublicationDate     50000 non-null  object
 6   Restated            50000 non-null  bool  
 7   AgentId             50000 non-null  int64 
 8   AgentName           50000 non-null  object
 9   AgentGics           50000 non-null  int64 
 10  CountryOfOperation  50000 non-null  object
 11  SourceName          18815 non-null  object
 12  SourceType          18815 non-null  object
 13  Page                16514 non-null  object
 14  Comment             2497 non-null   object
 15  Excerpt             17230 non-null  object
 16  URL                 18

In [11]:
# Turn the two date columns from plain text into proper datetimes
date_columns = ["Date", "PublicationDate"]
ESData_sample[date_columns] = ESData_sample[date_columns].apply(pd.to_datetime)

# Both columns should now report a datetime dtype
print(ESData_sample[date_columns].dtypes)

# Spot-check a few converted values
ESData_sample[date_columns].sample(n = 8)

Date               datetime64[ns]
PublicationDate    datetime64[ns]
dtype: object


Unnamed: 0,Date,PublicationDate
35109,2021-12-31,2022-12-08
15044,2021-12-31,2022-12-29
19700,2021-12-31,2023-03-13
29918,2022-12-15,2023-03-07
32408,2021-12-31,2023-02-20
2186,2021-12-31,2023-05-03
12336,2021-12-31,2023-03-06
7228,2021-12-31,2023-03-03


### Subsetting Dataframes using Location and Integer-Location (loc, iloc)
- loc: indexing dataframes using names
    - This is useful when we have both row names/index and column names

- iloc: indexing dataframes using integer locations
    - This is useful when we count rows and columns

In [20]:
# loc

# We can select 1 specific row, and 1 specific column by name
print(ESData_sample.loc[999, "Question"])

# We can select a range of rows and a range of columns by name
# loc slices keep both ends: rows 997 through 999, columns FactorId through Question
print(ESData_sample.loc[997:999, "FactorId":"Question"])

# So why does this give us an error?
# loc looks rows up by their label, and no row is labelled -1 (our row labels
# run from 0 to 49999), so pandas raises a KeyError.
# We catch the error and print it, so that this demonstration does not stop
# "Run All" from reaching the cells below.
try:
    print(ESData_sample.loc[-1, "FactorId":"Question"])
except KeyError as missing_label:
    print(f"KeyError: {missing_label}")


Does the company disclose the existence of an anonymous hotline for supply chain workers?
     FactorId       Name                                           Question
997      4927  EOMsr1.23  Does the company disclose ethnic/racial divers...
998      4972    TraM2-x  Percentage of employees receiving career devel...
999      4778  SupCP4.11  Does the company disclose the existence of an ...


KeyError: -1

In [21]:
# iloc

# iloc selects by position: here the row at position 999 and the column at position 2
print(ESData_sample.iloc[999, 2])

# Note that in iloc we use 997:1000 instead of loc[997:999]
# loc slices by label and keeps both end labels, whereas iloc slices by position
# like a Python list and stops just before the end position
print(ESData_sample.iloc[997:1000, :3])

# Positions may also be negative, counting back from the end: -1 is the last row
print(ESData_sample.iloc[-1, :3])

Does the company disclose the existence of an anonymous hotline for supply chain workers?
     FactorId       Name                                           Question
997      4927  EOMsr1.23  Does the company disclose ethnic/racial divers...
998      4972    TraM2-x  Percentage of employees receiving career devel...
999      4778  SupCP4.11  Does the company disclose the existence of an ...
FactorId                                                 4680
Name                                               SupSt1.104
Question    Does the company disclose a supplier policy pr...
Name: 49999, dtype: object


### Let's take a look at a specific company perhaps?
Mercedes-Benz?

In [22]:
# Boolean mask: True for every row of the full dataset that belongs to Mercedes-Benz
is_mercedes = ESData["AgentName"] == "Mercedes-Benz Group AG"
Mercedes_df = ESData[is_mercedes]

# Remove the name of the full dataset now that we only work with the subset.
# After this line ESData no longer exists, so running this cell a second time
# raises a NameError until the full data has been loaded again.
del ESData

Mercedes_df.sample(n = 5)

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
367954,4595,HRSup8.2,Does the company express an expectation on sup...,2021-12-31 00:00:00,Not Meaningful,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
368821,4574,HRSup1.25,Does the company express an expectation on sup...,2022-07-22 00:00:00,Yes,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Responsible Sourcing Standards 2022,Policy (Supplier),22.0,,References • UN Global Compact,https://supplier.mercedes-benz.com/servlet/Jiv...
368020,4725,HFat1.0,Does the company disclose occupational health ...,2021-12-31 00:00:00,Yes,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Key figures Human Resources,Company website,,,"2017 2018 2019 2020 20213,4 Number of emp...",https://sustainabilityreport.mercedes-benz.com...
368866,4754,SupCP2.1,Does the company disclose detailed information...,2022-12-31 00:00:00,Yes,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report 2022,Sustainability Report,255.0,,"For effective, sustainable supplier management...",https://sustainabilityreport.mercedes-benz.com...
368358,4952,SusRep13.12,Does the company disclose external assurance f...,2022-12-31 00:00:00,Yes,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report 2022,Sustainability Report,245.0,,We have performed a limited assurance engageme...,https://sustainabilityreport.mercedes-benz.com...


### Writing and saving data

In [23]:
# Make sure the output folder exists before writing into it;
# otherwise open() fails with FileNotFoundError on a fresh copy of the project
data_processed.mkdir(parents = True, exist_ok = True)

# Let's save our Mercedes_df subset as excel
# The file is opened in binary write mode ("wb") and the open handle is given to pandas;
# the with-block closes the file for us once the write has finished
with open(f"{data_processed}/Mercedes_df.xlsx", "wb") as merc:
    Mercedes_df.to_excel(merc, index = False)

# Save as csv, using ";" as the separator and leaving the row index out
with open(f"{data_processed}/Mercedes_df.csv", "wb") as merc:
    Mercedes_df.to_csv(merc, sep = ";", index = False)

We can now read our newly created Mercedes CSV! <br>
Whoops, this way of reading the file is different! <br>
There seem to be a few ways to open a file!

In [24]:
# Read the Mercedes CSV back in through a file handle opened in binary read mode
mercedes_csv = f"{data_processed}/Mercedes_df.csv"
with open(mercedes_csv, "rb") as csv_handle:
    Mercedes_df = pd.read_csv(csv_handle, sep = ";", dtype_backend = "pyarrow")

Mercedes_df.sample(n = 5)

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
36,4541,WatInv2.5,Does the company disclose metrics for withdraw...,2021-12-31 00:00:00,No,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
1213,4475,Tax1,Does the company disclose a commitment to proh...,2022-12-31 00:00:00,No,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
906,4540,WatInv2.4,Does the company disclose metrics for withdraw...,2022-12-31 00:00:00,No,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
1067,4889,FoAMsr12,Does the company disclose the existence of for...,2022-12-31 00:00:00,No,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
1239,4836,CoC1.0,Does the company disclose a commitment to proh...,2023-01-25 00:00:00,Yes,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Our Integrity Code [1/25/23],Policy,19.0,,We do not give or take bribes,https://group.mercedes-benz.com/documents/comp...


#### Duplications, Check for duplications!

In [25]:
# duplicated() flags every row that repeats an earlier row in all of its columns
duplicate_rows = ESData_sample.duplicated()

# Keep only the flagged rows; an empty table means there are no duplicates
ESData_sample.loc[duplicate_rows]

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL


### Let's do some more complicated things
Now that we know the basics of reading a dataframe, let's do some joining! <br>
Joins (or merges) combine 2 tables based on matching values in a shared column!

![joins](../readme_figs/joins.jpg)

So, there is some domicile information about the published companies that we have not included yet. <br>
It lives in another xlsx file, in the external folder!

In [26]:
# Excel workbook in the external data folder with the published companies and their domicile
domicile_xlsx = f"{data_external}/PublishedCompanies_2023-04-18.xlsx"
Domicile_df = pd.read_excel(domicile_xlsx)

Domicile_df.sample(n = 5)

Unnamed: 0,Agent ID,ISS Company ID,Agent,Domicile
5656,79024,166200,"Weyco Group, Inc.",United States
415,1136108,3405428,Ariston Holding NV,Italy
3558,48999,522045,New Gold Inc.,Canada
4459,4101,514190,Sanlam Ltd.,South Africa
4168,501342,574170,PyroGenesis Canada Inc.,Canada


We have new information that we want to add to the E&S raw data! <br>
Both dataframes share 1 column, the agent identifier: it is called `AgentId` in the E&S data and `Agent ID` in the domicile table.

> Perform a left join on AgentId!

![left_join](../readme_figs/left_join.jpg)

In [27]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.merge.html
# The key column has a different name in each table, so we name both sides
join_keys = {"left_on": "AgentId", "right_on": "Agent ID"}

# Left join: keep every row of ESData_sample and attach the matching domicile columns
ESSample_wDomLEFT = ESData_sample.merge(Domicile_df, how = "left", **join_keys)
# Right join: keep every row of Domicile_df and attach the matching E&S columns
ESSample_wDomRIGHT = ESData_sample.merge(Domicile_df, how = "right", **join_keys)

ESSample_wDomLEFT.sample(n = 5)

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,...,SourceName,SourceType,Page,Comment,Excerpt,URL,Agent ID,ISS Company ID,Agent,Domicile
40376,4492,PayG1.3,Does the company disclose country-by-country r...,2022-12-31,Yes,2023-04-21,False,511,Bayer AG,35202010,...,Annual Report 2022,Annual Report,169.0,,Information on Geographical Areas> Net sales (...,https://www.bayer.com/sites/default/files/2023...,511.0,13876.0,Bayer AG,Germany
1828,4787,SupCP5.22,Does the company disclose training on supplier...,2020-12-31,No,2023-03-27,False,1057,Fifth Third Bancorp,40101010,...,,,,,,,1057.0,53359.0,Fifth Third Bancorp,United States
42495,5025,WatInv2.5-x,Quantity withdrawn from third-party water,2020-12-31,Not Disclosed,2022-12-21,False,1100,"The Interpublic Group of Companies, Inc.",50201010,...,,,,,,,1100.0,77994.0,"The Interpublic Group of Companies, Inc.",United States
13137,4847,CoPro5.0,Does the company refer to ISO 37001 Anti-bribe...,2021-12-31,Yes,2023-04-13,False,553,EDP-Energias de Portugal SA,55101010,...,ANNUAL REPORT 2021,Annual Report,113.0,,Obtained ISO 37001 (Anti-bribery Management Sy...,https://www.edp.com/sites/default/files/2022-0...,553.0,45432.0,EDP-Energias de Portugal SA,Portugal
23370,4698,Lobb1.0,Does the company disclose the monetary value o...,2021-12-31,No,2022-12-08,False,50,"Weichai Power Co., Ltd.",20106010,...,,,,,,,50.0,553636.0,"Weichai Power Co., Ltd.",China


In [28]:
# Five random rows of the right join, to compare against the left join above
ESSample_wDomRIGHT.sample(n = 5)

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,...,SourceName,SourceType,Page,Comment,Excerpt,URL,Agent ID,ISS Company ID,Agent,Domicile
15780,4910.0,EOPol1.06,Does the company disclose a commitment against...,2021-08-25,Yes,2023-03-17,False,1190.0,Eversource Energy,55101010.0,...,CODE OF BUSINESS CONDUCT,Policy,28.0,,Eversource provides fair treatment and equal e...,https://www.eversource.com/content/docs/defaul...,1190,110983,Eversource Energy,United States
35329,4563.0,HRSup1.0,Does the company express an expectation on sup...,2020-12-10,Yes,2022-12-13,False,1248.0,Regions Financial Corporation,40101015.0,...,Code Of Business Conduct and Ethics,Policy,4.0,,Honoring and affirming protections for human\n...,https://ir.regions.com/~/media/Files/R/Regions...,1248,53772,Regions Financial Corporation,United States
897,4608.0,CliInv30,Does the company disclose its aggregated scope...,2021-12-31,Yes,2023-03-16,False,229.0,ACS Actividades de Construccion y Servicios SA,20103010.0,...,Integrated Report of ACS Group 2021,Annual Report,101.0,,Scope 3,https://www.grupoacs.com/ficheros_editor/File/...,229,515575,ACS Actividades de Construccion y Servicios SA,Spain
48551,4297.0,HRPol2.1,Does the company embed in a public policy a co...,2020-12-31,Not Meaningful,2022-12-26,False,1342.0,"Yum! Brands, Inc.",25301040.0,...,,,,,,,1342,534028,"Yum! Brands, Inc.",United States
4985,4485.0,PayG1.00,Does the company provide country-by-country fi...,2021-12-31,No,2023-03-20,False,945.0,"AvalonBay Communities, Inc.",60106010.0,...,,,,,,,945,512735,"AvalonBay Communities, Inc.",United States


### Let's go back to our Mercedes data!
- Can we find out more about it?
- Let's focus on Environmental factors?

In [29]:
# FactorIds of the factors we want to keep for Mercedes
Clim_factors = [
    4980, 4981, 4982, 4983, 4984, 4985, 4986, 4987, 4988, 4989,
    4990, 4991, 4992, 4993, 4994, 4995, 4996, 4997, 4998, 4999,
    5000, 5001, 5002,
    5077, 5078, 5079, 5080, 5081, 5082,
    5003,
    5055, 5056, 5057, 5058, 5059,
    5083, 4963,
]

# Boolean mask: True where a row's FactorId is one of the ids listed above
is_listed_factor = Mercedes_df["FactorId"].isin(Clim_factors)
Merc_Clim = Mercedes_df[is_listed_factor]

# Which distinct questions did we end up with?
print(Merc_Clim["Question"].unique())
Merc_Clim.sample(n = 5)

<ArrowExtensionArray>
[                                       'Percentage of company certified to ISO 50001',
                                                         'Total scope 1 GHG emissions',
                                        'Total scope 2 GHG emissions (location-based)',
                                          'Total scope 2 GHG emissions (market-based)',
                                           'Total scope 2 GHG emissions (unspecified)',
                                      'Total scope 1+2 GHG emissions (location-based)',
                                        'Total scope 1+2 GHG emissions (market-based)',
                                         'Total scope 1+2 GHG emissions (unspecified)',
                                                         'Total scope 3 GHG emissions',
               'Total scope 3 emissions for Category 1 - Purchased goods and services',
                              'Total scope 3 emissions for Category 2 - Capital goods',
        'T

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
174,4994,CliInv21-x,Total scope 3 emissions for Category 7 - Emplo...,2021-12-31 00:00:00,122000 Metric tonnes (t) CO2,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report,Sustainability Report,140,,Scope 3 (r Mercedes-Benz Cars1) Absolute  sco...,https://group.mercedes-benz.com/documents/sust...
167,4987,CliInv30-x,Total scope 3 GHG emissions,2021-12-31 00:00:00,123.3 m Metric tonnes (t) CO2,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report,Sustainability Report,140,,Scope 3 (r Mercedes-Benz Cars1) Absolute  sco...,https://group.mercedes-benz.com/documents/sust...
151,4980,CliInv1.01-x,Total scope 1 GHG emissions,2021-12-31 00:00:00,681000 Metric tonnes (t) CO2,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report,Sustainability Report,153,,CO2 direct (Scope 1),https://group.mercedes-benz.com/documents/sust...
865,5003,EnSou1.1-x,Total energy consumption,2022-12-31 00:00:00,6087 Gigawatt hour (GWh) Energy,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report 2022,Sustainability Report,143,,Energy consumption (in GWh),https://sustainabilityreport.mercedes-benz.com...
850,4993,CliInv20-x,Total scope 3 emissions for Category 6 - Busin...,2021-12-31 00:00:00,19000 Metric tonnes (t) CO2e,2023-04-26 00:00:00,True,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report 2022,Sustainability Report,103,,Scope 3 Specific Absolute CO2 in million t Bus...,https://sustainabilityreport.mercedes-benz.com...


### Can we refine for TCFD disclosure? (Scopes measurements in questions?)
#### Introducing Regular Expression (aka. regex / grep / alienspeak)!
![regex](../readme_figs/regex.jpg)

Regular Expression, first invented in 1951, is a way of classifying and pattern matching languages.
It was the de facto way search engines were built before the existence of NLP-based methods.

For example, let's say we want to query for "scope" followed by any number [1, 2, 3] in the Question column of the Mercedes dataframe.
**Scope specified disclosure**
This means we have to match:
- scope 3
- scope 1+2

In [30]:
# Keep the questions that mention "scope", then one whitespace character, then a digit:
#   scope -> the literal word (lower case only, the match is case-sensitive)
#   \s    -> exactly one whitespace character
#   \d    -> one digit, e.g. the 3 in "scope 3" or the 1 in "scope 1+2"
# str.contains looks for the pattern anywhere in the text, so a trailing ".*"
# would not change which rows match and is left out
mentions_scope = Merc_Clim["Question"].str.contains(r"scope\s\d", regex = True)
Merc_TCFD = Merc_Clim[mentions_scope]

# Let's check our query results
print(Merc_TCFD["Question"].unique())

<ArrowExtensionArray>
[                                                        'Total scope 1 GHG emissions',
                                        'Total scope 2 GHG emissions (location-based)',
                                          'Total scope 2 GHG emissions (market-based)',
                                           'Total scope 2 GHG emissions (unspecified)',
                                      'Total scope 1+2 GHG emissions (location-based)',
                                        'Total scope 1+2 GHG emissions (market-based)',
                                         'Total scope 1+2 GHG emissions (unspecified)',
                                                         'Total scope 3 GHG emissions',
               'Total scope 3 emissions for Category 1 - Purchased goods and services',
                              'Total scope 3 emissions for Category 2 - Capital goods',
        'Total scope 3 emissions for Category 3 - Fuel- and energy-related activities',
   'Total 

In [31]:
# Five random rows from the scope-related questions selected above
Merc_TCFD.sample(n = 5)

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
856,4996,CliInv23-x,Total scope 3 emissions for Category 9 - Downs...,2022-12-31 00:00:00,Not Disclosed,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
857,4997,CliInv24-x,Total scope 3 emissions for Category 10 - Proc...,2022-12-31 00:00:00,Not Disclosed,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
180,5000,CliInv27-x,Total scope 3 emissions for Category 13 - Down...,2021-12-31 00:00:00,Not Disclosed,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
864,5002,CliInv29-x,Total scope 3 emissions for Category 15 - Inve...,2022-12-31 00:00:00,Not Disclosed,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
172,4992,CliInv19-x,Total scope 3 emissions for Category 5 - Waste...,2021-12-31 00:00:00,1 m Metric tonnes (t) CO2,2023-04-26 00:00:00,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report,Sustainability Report,140.0,,Scope 3 (r Mercedes-Benz Cars1) Absolute  sco...,https://group.mercedes-benz.com/documents/sust...
