In [1]:
##############################
#03_DataFrame example code   #
#Maintainer: Christopher Chan#
#Version: 0.2.2              #
#Date: 2023-06-01            #
##############################

# Just like we are used to, let's import all our necessary modules here
import os, sys, re
import random
import pathlib
import requests

import datetime as dt
import numpy as np
import pyarrow as pa
import pandas as pd


# Relative locations of the four data folders used throughout this notebook
data_raw, data_interim, data_external, data_processed = (
    pathlib.Path(folder)
    for folder in (
        "../data/raw",
        "../data/interim",
        "../data/external",
        "../data/processed",
    )
)

## Welcome to 03_DataFrames
This is where what you learnt in the previous classes will hopefully start to make sense in a tabular data environment.
Although manipulating tables and DataFrames in Python/R or in Excel does not differ much in outcome,
the massive gains in control, speed, and read/write ability justify moving your workflow to a programming language.

### 03_DataFrames Goals:
- Read and write DataFrames
- Exploring types in DataFrames
- Indexing and subsetting DataFrames
- Joins, Merge, Pivot

> This is the first part of the Data Science cycle, where we will learn to import and tidy the data

![data-science-explore](../readme_figs/data-science-explore.png)

#### Introducing CSV
If you have always used Excel, you may have noticed that a sheet has a row limit of 1,048,576 rows; beyond that you need another sheet.
If you use any other operating system, you need an Excel license to read xlsx files.
CSV (aka. Comma-Separated Values) is a:
- Platform independent
- Basic
- Row-based dataframe format

As the name suggests, the values are separated by commas; you can open up `../data/external/module_info.csv` to take a look!
The advantages of using CSV:
- Can be edited with any basic text editing software (notepad, excel, word...)
- Much faster read and write in any programming language
- Does not take up all your memory (i.e. No more excel crashes)
- Forces standard format in tables (1 row index, 1 column index)

The disadvantages of using CSV:
- No cool Excel-like collaborative editing
- No excel functions (<-- We can just do all of this in python)
- No colours
- No crazy multi-column, multi-row index support (This forces good practice)

In E&S data, we have a lot of "," in our Answers field, therefore we will use semi-colon ";" as a separator instead!

In [2]:
# Pandas 2.0
# Read our E&S data with 50k samples
# (";" is the separator because the Answer field itself contains commas)
ESData_sample = pd.read_csv(f"{data_raw}/ESData_sample.csv", sep = ";")
#ESData_sample = pd.read_csv(f"{data_raw}/ESData_sample.csv", sep = ";", dtype_backend = "pyarrow")

# Full E&S data
ESData = pd.read_parquet(f"{data_raw}/ESData_full.parquet", dtype_backend = "pyarrow", engine = "pyarrow")

# Names of every pyarrow timestamp column found in the full data
date_cols = ESData.select_dtypes(include = "timestamp[us][pyarrow]").columns.tolist()

# Two-step cast applied to every detected timestamp column (not just the first two):
# timestamp -> date64 -> string.
# NOTE(review): presumably this drops the time-of-day part before the values become text - confirm.
ESData = ESData.astype({col: "date64[pyarrow]" for col in date_cols})
ESData = ESData.astype({col: "string[pyarrow]" for col in date_cols})

# .size is the number of cells (rows x columns), not the number of rows
print(ESData.size)

#with open(f"{data_interim}/ESData_typeset.parquet", "wb") as data:
#    ESData.to_parquet(data, engine = "pyarrow", index = False)

# Calling our assigned E&S data automatically gives us the head of the DataFrame and the tail of the DataFrame
ESData_sample

16999983


Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
0,4904,EOPol1.2,Does the company disclose an anti-discriminati...,2021-07-27 00:00:00,Yes,2022-07-28 00:00:00,False,1049,"Expedia Group, Inc.",25301020,"USA,CHN,FRA,AUS,NOR,SWE,DNK,FIN,BEL,GBR,ITA,CH...",EMPLOYEE CODE OF CONDUCT,Policy,6,Wrong Source,We will not tolerate discrimination of any kin...,https://s27.q4cdn.com/708721433/files/doc_down...
1,4946,EMS6,Does the company disclose the number of incide...,2020-12-31 00:00:00,No,2023-02-15 00:00:00,False,844,"S&P Global, Inc.",40203040,"ARE,ARG,AUS,AUT,BEL,BLR,BMU,BRA,BRB,CAN,CHE,CH...",,,,,,
2,4527,WatMon1.2,Does the company disclose details on freshwate...,2021-12-31 00:00:00,Not Meaningful,2023-03-20 00:00:00,False,238,Alpha Services & Holdings SA,40101010,"ALB,BGR,CYP,DEU,GBR,GRC,IRL,JEY,LUX,MKD,ROU,SR...",,,,,,
3,4821,PosNRE2,Does the company disclose measures to mitigate...,2020-12-31 00:00:00,No,2023-03-30 00:00:00,False,736,Abbott Laboratories,35101010,"ARE,ARG,AUS,AUT,BEL,BGD,BGR,BHS,BIH,BMU,BOL,BR...",,,,,,
4,4657,SupSt1.0,Does the company disclose a supplier labour po...,2022-11-01 00:00:00,Yes,2023-03-31 00:00:00,False,1572,Hapag-Lloyd AG,20303010,"DEU,FRA,NLD,BEL,USA,VNM,JPN,MLT,MEX,NZL,GBR,IT...",Hapag-Lloyd Supplier Code of Conduct 2022,Policy (Supplier),1,,• Prohibition of any discrimination based on e...,https://www.hapag-lloyd.com/content/dam/websit...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,4307,HRDue1.13,Does the company disclose that its suppliers a...,2020-12-31 00:00:00,Yes,2023-03-27 00:00:00,False,887,Texas Instruments Incorporated,45301020,"USA,CAN,ISR,DNK,NLD,DEU,FRA,GBR,PHL,SGP,MYS,KO...",2020 CORPORATE CITIZENSHIP REPORT,Sustainability Report,32,,To manage human rights and eliminate violation...,https://www.ti.com/lit/ml/szzo015/szzo015.pdf?...
49996,4541,WatInv2.5,Does the company disclose metrics for withdraw...,2021-12-31 00:00:00,Yes,2023-03-04 00:00:00,False,556,Electricite de France SA,55101010,"AUS,BEL,BGR,BRA,CAN,CHE,CHL,CHN,CUW,CZE,DEU,DJ...","EDF group's Environmental, Social and Governan...",Company website,,,E-Water resources management (sheet)\nDrinking...,https://www.edf.fr/sites/groupe/files/2022-04/...
49997,5026,WatInv4.0-x,Total water withdrawn from areas with high wat...,2021-12-31 00:00:00,7.339262 m Cubic metre (m3),2023-02-23 00:00:00,False,854,"NextEra Energy, Inc.",55101010,"USA,LKA,NLD,CAN,CYM,ESP",2022 ESG Report,Sustainability Report,62,,percentage of\neach in regions of high or extr...,https://www.investor.nexteraenergy.com/~/media...
49998,4583,HRSup5.0,Does the company express an expectation on sup...,2021-12-31 00:00:00,Not Meaningful,2023-04-24 00:00:00,False,1125,Las Vegas Sands Corp.,25301010,"BMU,CHN,CYM,HKG,IND,JPN,KOR,MAC,MUS,MYS,NLD,PR...",,,,,,


In [3]:
# Let's make sure our sample has 50,000 rows
# .shape is a (number of rows, number of columns) tuple
print(ESData_sample.shape)
print(f"Our sample E&S DataFrame have {ESData_sample.shape[0]} rows, and {ESData_sample.shape[1]} columns!")

(50000, 17)
Our sample E&S DataFrame have 50000 rows, and 17 columns!


### We should probably explore the data a little bit more...
- Look at it
- Find some things about it
- Look at the data type

In [4]:
# Look at the heads and tails, and random samples
# head() without an argument shows the first 5 rows
ESData_sample.head()

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
0,4904,EOPol1.2,Does the company disclose an anti-discriminati...,2021-07-27 00:00:00,Yes,2022-07-28 00:00:00,False,1049,"Expedia Group, Inc.",25301020,"USA,CHN,FRA,AUS,NOR,SWE,DNK,FIN,BEL,GBR,ITA,CH...",EMPLOYEE CODE OF CONDUCT,Policy,6.0,Wrong Source,We will not tolerate discrimination of any kin...,https://s27.q4cdn.com/708721433/files/doc_down...
1,4946,EMS6,Does the company disclose the number of incide...,2020-12-31 00:00:00,No,2023-02-15 00:00:00,False,844,"S&P Global, Inc.",40203040,"ARE,ARG,AUS,AUT,BEL,BLR,BMU,BRA,BRB,CAN,CHE,CH...",,,,,,
2,4527,WatMon1.2,Does the company disclose details on freshwate...,2021-12-31 00:00:00,Not Meaningful,2023-03-20 00:00:00,False,238,Alpha Services & Holdings SA,40101010,"ALB,BGR,CYP,DEU,GBR,GRC,IRL,JEY,LUX,MKD,ROU,SR...",,,,,,
3,4821,PosNRE2,Does the company disclose measures to mitigate...,2020-12-31 00:00:00,No,2023-03-30 00:00:00,False,736,Abbott Laboratories,35101010,"ARE,ARG,AUS,AUT,BEL,BGD,BGR,BHS,BIH,BMU,BOL,BR...",,,,,,
4,4657,SupSt1.0,Does the company disclose a supplier labour po...,2022-11-01 00:00:00,Yes,2023-03-31 00:00:00,False,1572,Hapag-Lloyd AG,20303010,"DEU,FRA,NLD,BEL,USA,VNM,JPN,MLT,MEX,NZL,GBR,IT...",Hapag-Lloyd Supplier Code of Conduct 2022,Policy (Supplier),1.0,,• Prohibition of any discrimination based on e...,https://www.hapag-lloyd.com/content/dam/websit...


In [5]:
# tail() without an argument shows the last 5 rows
ESData_sample.tail()

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
49995,4307,HRDue1.13,Does the company disclose that its suppliers a...,2020-12-31 00:00:00,Yes,2023-03-27 00:00:00,False,887,Texas Instruments Incorporated,45301020,"USA,CAN,ISR,DNK,NLD,DEU,FRA,GBR,PHL,SGP,MYS,KO...",2020 CORPORATE CITIZENSHIP REPORT,Sustainability Report,32.0,,To manage human rights and eliminate violation...,https://www.ti.com/lit/ml/szzo015/szzo015.pdf?...
49996,4541,WatInv2.5,Does the company disclose metrics for withdraw...,2021-12-31 00:00:00,Yes,2023-03-04 00:00:00,False,556,Electricite de France SA,55101010,"AUS,BEL,BGR,BRA,CAN,CHE,CHL,CHN,CUW,CZE,DEU,DJ...","EDF group's Environmental, Social and Governan...",Company website,,,E-Water resources management (sheet)\nDrinking...,https://www.edf.fr/sites/groupe/files/2022-04/...
49997,5026,WatInv4.0-x,Total water withdrawn from areas with high wat...,2021-12-31 00:00:00,7.339262 m Cubic metre (m3),2023-02-23 00:00:00,False,854,"NextEra Energy, Inc.",55101010,"USA,LKA,NLD,CAN,CYM,ESP",2022 ESG Report,Sustainability Report,62.0,,percentage of\neach in regions of high or extr...,https://www.investor.nexteraenergy.com/~/media...
49998,4583,HRSup5.0,Does the company express an expectation on sup...,2021-12-31 00:00:00,Not Meaningful,2023-04-24 00:00:00,False,1125,Las Vegas Sands Corp.,25301010,"BMU,CHN,CYM,HKG,IND,JPN,KOR,MAC,MUS,MYS,NLD,PR...",,,,,,
49999,4680,SupSt1.104,Does the company disclose a supplier policy pr...,2022-08-31 00:00:00,No,2023-02-03 00:00:00,False,738,Accenture Plc,45102010,"AGO,AND,ARE,ARG,AUS,AUT,BEL,BGD,BGR,BMU,BOL,BR...",,,,,,


In [6]:
# Random sample of 4
# No random_state is passed, so a different set of rows comes back on every run
ESData_sample.sample(n = 4)

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
37652,4527,WatMon1.2,Does the company disclose details on freshwate...,2020-12-31 00:00:00,No,2023-05-02 00:00:00,False,1268,Sempra Energy,55103010,"BMU,CAN,CHL,DEU,ESP,GBR,JEY,LUX,MEX,NLD,PER,USA",,,,,,
28969,5079,CliTar1.31-x,Percentage GHG reduction target – Scope 1+2,2022-03-31 00:00:00,Base year: 2018 Target year: 2030 : 72.5% Abso...,2023-05-02 00:00:00,False,684,SSE Plc,55101010,"CAN,DEU,ESP,GBR,IMN,IRL,JPN,PRT,USA",SSE PLC SUSTAINABILITY REPORT 2022,Sustainability Report,15.0,,Reduce absolute scope 1 and\n2 GHG emissions b...,https://www.sse.com/media/bgnpjq2x/sustainabil...
9705,4303,HRDue1.0,Does the company disclose the existence of a h...,2021-07-04 00:00:00,No,2023-03-04 00:00:00,False,164,Shoprite Holdings Ltd.,30101030,"AGO,BWA,COD,GHA,LSO,MDG,MOZ,MUS,MWI,NAM,NGA,SW...",,,,,,
171,4527,WatMon1.2,Does the company disclose details on freshwate...,2021-12-31 00:00:00,No,2022-10-19 00:00:00,False,656,Renault SA,25102010,"ARG,AUT,BEL,BGR,BRA,CAN,CHE,CHL,CHN,COL,CZE,DE...",,,,,,


In [7]:
# We can access an individual column by putting its name in square brackets
# (selecting multiple columns is shown in the next cell); a single name gives back a Series
ESData_sample["Question"]

0        Does the company disclose an anti-discriminati...
1        Does the company disclose the number of incide...
2        Does the company disclose details on freshwate...
3        Does the company disclose measures to mitigate...
4        Does the company disclose a supplier labour po...
                               ...                        
49995    Does the company disclose that its suppliers a...
49996    Does the company disclose metrics for withdraw...
49997    Total water withdrawn from areas with high wat...
49998    Does the company express an expectation on sup...
49999    Does the company disclose a supplier policy pr...
Name: Question, Length: 50000, dtype: object

In [8]:
#Multiple Columns
# Passing a list of names (note the double brackets) gives back a DataFrame with just those columns
ESData_sample[["FactorId", "Question"]]

Unnamed: 0,FactorId,Question
0,4904,Does the company disclose an anti-discriminati...
1,4946,Does the company disclose the number of incide...
2,4527,Does the company disclose details on freshwate...
3,4821,Does the company disclose measures to mitigate...
4,4657,Does the company disclose a supplier labour po...
...,...,...
49995,4307,Does the company disclose that its suppliers a...
49996,4541,Does the company disclose metrics for withdraw...
49997,5026,Total water withdrawn from areas with high wat...
49998,4583,Does the company express an expectation on sup...


In [9]:
# Describe and data type info
# describe() finds the numeric columns automatically and gives us their summary statistics
# (count, mean, std, min, quartiles, max)
ESData_sample.describe()

Unnamed: 0,FactorId,AgentId,AgentGics
count,50000.0,50000.0,50000.0
mean,4725.5569,713.90094,32121400.0
std,236.589686,394.393798,14124140.0
min,4244.0,1.0,10101010.0
25%,4558.0,404.0,20106020.0
50%,4743.0,702.0,30202030.0
75%,4919.0,1022.0,40301050.0
max,5255.0,1598.0,60201030.0


In [10]:
# Data Types that we learnt
# We see that Date and PublicationDate are still plain "object" (text) columns, not date objects yet; we can change that!
# info() prints its report itself and returns None, so wrapping it in print() would add a stray "None" line
ESData_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   FactorId            50000 non-null  int64 
 1   Name                50000 non-null  object
 2   Question            50000 non-null  object
 3   Date                50000 non-null  object
 4   Answer              49998 non-null  object
 5   PublicationDate     50000 non-null  object
 6   Restated            50000 non-null  bool  
 7   AgentId             50000 non-null  int64 
 8   AgentName           50000 non-null  object
 9   AgentGics           50000 non-null  int64 
 10  CountryOfOperation  50000 non-null  object
 11  SourceName          18815 non-null  object
 12  SourceType          18815 non-null  object
 13  Page                16514 non-null  object
 14  Comment             2497 non-null   object
 15  Excerpt             17230 non-null  object
 16  URL                 18

In [11]:
# Turn the two text date columns into real datetime columns
datetime_columns = ["Date", "PublicationDate"]
ESData_sample[datetime_columns] = ESData_sample[datetime_columns].apply(pd.to_datetime)

# Confirm the new dtypes, then look at a few converted values
print(ESData_sample[datetime_columns].dtypes)

ESData_sample[datetime_columns].sample(n = 8)

Date               datetime64[ns]
PublicationDate    datetime64[ns]
dtype: object


Unnamed: 0,Date,PublicationDate
1866,2021-12-31,2023-04-19
16247,2022-12-31,2023-03-06
13353,2022-12-31,2023-04-24
44049,2022-01-27,2023-01-13
17489,2021-12-31,2023-03-07
17414,2021-06-30,2023-01-13
42376,2021-12-31,2023-03-10
4485,2020-12-31,2023-03-31


### Subsetting Dataframes using Location and Integer-Location (loc, iloc)
- loc: indexing dataframes using names
    - This is useful when we have both row names/index and column names

- iloc: indexing dataframes using integer locations
    - This is useful when we count rows and columns

In [12]:
# loc

# We can select 1 specific row, and 1 specific column by name
print(ESData_sample.loc[999, "Question"])

# We can select a range of rows and a range of columns by name
# (with loc, both ends of the range are included)
print(ESData_sample.loc[997:999, "FactorId":"Question"])

# So why does this give us an error?
# loc looks for a row *labelled* -1, and our index only has the labels 0 to 49999.
# The error is caught and printed so the rest of the notebook still runs with "Run All".
try:
    print(ESData_sample.loc[-1, "FactorId":"Question"])
except KeyError as missing_label:
    print(f"KeyError: {missing_label}")


Does the company disclose the existence of an anonymous hotline for supply chain workers?
     FactorId       Name                                           Question
997      4927  EOMsr1.23  Does the company disclose ethnic/racial divers...
998      4972    TraM2-x  Percentage of employees receiving career devel...
999      4778  SupCP4.11  Does the company disclose the existence of an ...


KeyError: -1

In [13]:
# iloc

# We can achieve the same result of location with iloc
# (row position 999, column position 2 - the "Question" column)
print(ESData_sample.iloc[999, 2])

# Note that in iloc we use 997:1000 instead of loc[997:999]
# This is because iloc slices by integer position and, like Python list slicing, leaves out the end position,
# whereas loc slices by label and includes both ends
print(ESData_sample.iloc[997:1000, 0:3])

# But now we can also do negatives: position -1 is the last row
print(ESData_sample.iloc[-1, 0:3])

Does the company disclose the existence of an anonymous hotline for supply chain workers?
     FactorId       Name                                           Question
997      4927  EOMsr1.23  Does the company disclose ethnic/racial divers...
998      4972    TraM2-x  Percentage of employees receiving career devel...
999      4778  SupCP4.11  Does the company disclose the existence of an ...
FactorId                                                 4680
Name                                               SupSt1.104
Question    Does the company disclose a supplier policy pr...
Name: 49999, dtype: object


### Let's take a look at a specific company perhaps?
Mercedes-Benz?

In [14]:
# Keep only the rows of the full data whose AgentName is Mercedes-Benz
Mercedes_df = ESData.loc[ESData["AgentName"] == "Mercedes-Benz Group AG"]

# Drop the name of the full frame now that the subset has been taken
del ESData

Mercedes_df.sample(n = 5)

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
368245,5008,GPG1.13-x,Gender pay gap or ratio [Mean - Adjusted],2021-12-31,Not Disclosed,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
368088,4806,TraM1,Does the company disclose the existence of a s...,2021-12-31,Yes,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Annual Report,Annual Report,104.0,,Training and professional development Due to e...,https://group.mercedes-benz.com/documents/inve...
368189,4907,EOPol1.03,Does the company disclose a commitment against...,2022-07-14,Yes,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Diversity & Inclusion,Company website,,,This is why at Mercedes-Benz we create and str...,https://group.mercedes-benz.com/sustainability...
368272,5054,HAcc2.3-x,Aggregate employee-contractor injury rate,2021-12-31,Not Disclosed,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
368060,4765,SupCP2.52,Does the company disclose social metrics on au...,2021-12-31,No,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,


### Writing and saving data

In [15]:
# Write the Mercedes_df subset to the processed folder, first as an Excel workbook...
xlsx_target = f"{data_processed}/Mercedes_df.xlsx"
with open(xlsx_target, "wb") as xlsx_file:
    Mercedes_df.to_excel(xlsx_file, index = False)

# ...then as a semicolon-separated CSV (the row index is left out of both files)
csv_target = f"{data_processed}/Mercedes_df.csv"
with open(csv_target, "wb") as csv_file:
    Mercedes_df.to_csv(csv_file, sep = ";", index = False)

We can now read our newly created Mercedes CSV! <br>
Woops, this way of reading the file is different! <br>
There seem to be a few ways to open a file!

In [16]:
# Read the CSV back through an open (binary) file handle instead of passing a path to pandas.
# The file was written without its index, so the rows read back are numbered from 0 again.
with open(f"{data_processed}/Mercedes_df.csv", "rb") as merc:
    Mercedes_df = pd.read_csv(merc, dtype_backend = "pyarrow", sep = ";")

Mercedes_df.sample(n = 5)

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
1335,4765,SupCP2.52,Does the company disclose social metrics on au...,2022-12-31,Yes,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report 2022,Sustainability Report,225.0,Company has conducted 825 audit on suppliers,The Mercedes-Benz AG also continued to conduct...,https://sustainabilityreport.mercedes-benz.com...
365,4489,SPayG12,Does the company disclose payments to governme...,2021-12-31,Not Meaningful,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
722,5041,EOMsr4.2-x,Total monetary losses from employment discrimi...,2021-12-31,Not Disclosed,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
324,4326,HRDue7.23,Does the company disclose that its contractors...,2021-12-31,No,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
552,4809,Dtra1,Does the company disclose the average annual h...,2021-12-31,Yes,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report,Sustainability Report,201.0,,Qualification hours per employee per year2 21....,https://group.mercedes-benz.com/documents/sust...


#### Duplications, Check for duplications!

In [17]:
# duplicated() marks every row that is an exact repeat of an earlier row;
# an empty result means the sample has no fully duplicated rows
ESData_sample.loc[ESData_sample.duplicated()]

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL


### Let's do some more complicated things
Now that we know the basics of reading a DataFrame, let's do some joining! <br>
Joins (also called merges) combine 2 tables based on matching values in a column they have in common!

![joins](../readme_figs/joins.jpg)

So, there is some information about the published companies' domicile that we have not included yet. <br>
It lives in another xlsx file, in the external folder!

In [18]:
# Load the published-companies workbook from the external data folder
Domicile_df = pd.read_excel(f"{data_external}/PublishedCompanies_2023-04-18.xlsx")

Domicile_df.sample(n = 5)

Unnamed: 0,Agent ID,ISS Company ID,Agent,Domicile
2541,2642,75015,"IDEXX Laboratories, Inc.",United States
4464,156114,529301,Santam Ltd.,South Africa
2698,398915,630902,Invitae Corporation,United States
2848,155771,2770541,KazMunayGas NC JSC,Kazakhstan
3847,388,510001,Outokumpu Oyj,Finland


We have new information that we wanted to include into the E&S raw data! <br>
We have 1 column that is common between both dataframes: the agent ID (named `AgentId` in the E&S data and `Agent ID` in the domicile table).

> Perform a left join on AgentId!

![left_join](../readme_figs/left_join.jpg)

In [19]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.merge.html
# Both joins match "AgentId" in ESData_sample against "Agent ID" in Domicile_df;
# they differ only in which table is guaranteed to keep all of its rows.
ESSample_wDomLEFT = ESData_sample.merge(
    Domicile_df,
    how = "left",
    left_on = "AgentId",
    right_on = "Agent ID",
)
ESSample_wDomRIGHT = ESData_sample.merge(
    Domicile_df,
    how = "right",
    left_on = "AgentId",
    right_on = "Agent ID",
)

ESSample_wDomLEFT.sample(n = 5)

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,...,SourceName,SourceType,Page,Comment,Excerpt,URL,Agent ID,ISS Company ID,Agent,Domicile
10395,4915,EOPol1.11,Does the company disclose a commitment against...,2021-07-04,No,2023-03-04,False,164,Shoprite Holdings Ltd.,30101030,...,,,,,,,164.0,508261.0,Shoprite Holdings Ltd.,South Africa
23588,5081,CliTar1.51-x,Percentage GHG reduction target – Scope 1+2+3,2021-12-31,Not Disclosed,2023-04-04,False,661,Rio Tinto Plc,15104020,...,,,,,,,661.0,131071.0,Rio Tinto Plc,United Kingdom
44735,4488,PayG1.03,Does the company provide country-by-country pa...,2021-12-31,The company is not a multinational,2023-03-13,False,770,Perdoceo Education Corporation,25302010,...,,,,,,,770.0,513316.0,Perdoceo Education Corporation,United States
46748,4988,CliInv15-x,Total scope 3 emissions for Category 1 - Purch...,2020-12-31,Not Disclosed,2022-12-08,False,250,Banco de Sabadell SA,40101010,...,Consolidated directors’ report,Regulation Documents,,,,https://www.grupbancsabadell.com/corp/files/14...,250.0,529190.0,Banco de Sabadell SA,Spain
15728,4614,CliInv20,Does the company disclose its scope 3 emission...,2021-12-31,Yes,2023-04-25,False,1583,Rheinmetall AG,20101010,...,ESG Report 2022 (For FYE 2021),Sustainability Report,52.0,,Other indirect greenhouse gas emissions | Scop...,https://www.rheinmetall.com/media/en/editor_me...,1583.0,509212.0,Rheinmetall AG,Germany


In [20]:
# Peek at the right join, which keeps every row of Domicile_df.
# NOTE(review): FactorId / AgentId display as floats here (e.g. 4904.0) - presumably because companies
# without any E&S rows leave missing values on the left side; confirm.
ESSample_wDomRIGHT.sample(n = 5)

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,...,SourceName,SourceType,Page,Comment,Excerpt,URL,Agent ID,ISS Company ID,Agent,Domicile
2615,4904.0,EOPol1.2,Does the company disclose an anti-discriminati...,2021-10-08,Yes,2023-04-19,False,926.0,"American Electric Power Company, Inc.",55101010.0,...,AEP’S PRINCIPLES OF BUSINESS CONDUCT,Policy,9.0,,AEP is committed to providing an inclusive wor...,https://www.aep.com/assets/docs/investors/gove...,926,5133,"American Electric Power Company, Inc.",United States
12335,4987.0,CliInv30-x,Total scope 3 GHG emissions,2021-12-31,612812 Metric tonnes (t) CO2e,2023-04-25,True,724.0,Demant A/S,35101010.0,...,Sustainability Report 2022,Sustainability Report,35.0,,Scope 3 emissions,http://wdh01.azureedge.net/-/media/demant/shar...,724,504826,Demant A/S,Denmark
40186,4778.0,SupCP4.11,Does the company disclose the existence of an ...,2021-12-02,Yes,2023-04-25,False,685.0,Standard Chartered Plc,40101010.0,...,Supplier Policy 2018,Policy (Supplier),3.0,,Ethics\nPrinciple 1\nTo enforce a culture of s...,https://av.sc.com/corp-en/content/docs/supplie...,685,509628,Standard Chartered Plc,United Kingdom
30336,5026.0,WatInv4.0-x,Total water withdrawn from areas with high wat...,2021-12-31,7.339262 m Cubic metre (m3),2023-02-23,False,854.0,"NextEra Energy, Inc.",55101010.0,...,2022 ESG Report,Sustainability Report,62.0,,percentage of\neach in regions of high or extr...,https://www.investor.nexteraenergy.com/~/media...,854,51230,"NextEra Energy, Inc.",United States
17200,4972.0,TraM2-x,Percentage of employees receiving career devel...,2021-12-31,Not Disclosed,2023-03-29,False,307.0,Fortum Oyj,55101010.0,...,,,,,,,307,504329,Fortum Oyj,Finland


### Let's go back to our Mercedes data!
- Can we find out more about it?
- Let's focus on Environmental factors?

In [21]:
# FactorIds of the questions we want to keep for the environmental view
Clim_factors = [
    4980, 4981, 4982, 4983, 4984, 4985, 4986, 4987, 4988, 4989,
    4990, 4991, 4992, 4993, 4994, 4995, 4996, 4997, 4998, 4999,
    5000, 5001, 5002, 5077, 5078, 5079, 5080, 5081, 5082, 5003,
    5055, 5056, 5057, 5058, 5059, 5083, 4963,
]

# Rows of Mercedes_df whose FactorId appears in the list above
Merc_Clim = Mercedes_df.loc[Mercedes_df["FactorId"].isin(Clim_factors)]

# Which distinct questions did we end up with?
print(Merc_Clim["Question"].unique())
Merc_Clim.sample(n = 5)

<ArrowExtensionArray>
[                                       'Percentage of company certified to ISO 50001',
                                                         'Total scope 1 GHG emissions',
                                        'Total scope 2 GHG emissions (location-based)',
                                          'Total scope 2 GHG emissions (market-based)',
                                           'Total scope 2 GHG emissions (unspecified)',
                                      'Total scope 1+2 GHG emissions (location-based)',
                                        'Total scope 1+2 GHG emissions (market-based)',
                                         'Total scope 1+2 GHG emissions (unspecified)',
                                                         'Total scope 3 GHG emissions',
               'Total scope 3 emissions for Category 1 - Purchased goods and services',
                              'Total scope 3 emissions for Category 2 - Capital goods',
        'T

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
854,4994,CliInv21-x,Total scope 3 emissions for Category 7 - Emplo...,2022-12-31,107000 Metric tonnes (t) CO2e,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report 2022,Sustainability Report,103.0,,Scope 3 Specific Absolute CO2 in million t Emp...,https://sustainabilityreport.mercedes-benz.com...
855,4995,CliInv22-x,Total scope 3 emissions for Category 8 - Upstr...,2022-12-31,Not Disclosed,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
153,4981,CliInv4.1-x,Total scope 2 GHG emissions (location-based),2021-12-31,1.123 m Metric tonnes (t) CO2,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report,Sustainability Report,153.0,,CO2 indirect (Scope 2) — location-based,https://group.mercedes-benz.com/documents/sust...
165,4985,CliInv11.2-x,Total scope 1+2 GHG emissions (market-based),2019-12-31,2.516 m Metric tonnes (t) CO2,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report,Sustainability Report,153.0,,Total — market-based,https://group.mercedes-benz.com/documents/sust...
869,5058,EnSou3.2-x,Total energy consumption from solid/liquid fos...,2022-12-31,Not Disclosed,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report 2022,Sustainability Report,,,,https://sustainabilityreport.mercedes-benz.com...


### Can we refine for TCFD disclosure? (Scopes measurements in questions?)
#### Introducing Regular Expression (aka. regex / grep / alienspeak)!
![regex](../readme_figs/regex.jpg)

Regular expressions, first invented in 1951, are a way of classifying and pattern-matching languages.
They were the de facto way search engines were built before the existence of NLP-based methods.

For example, let's say we want to query for "scope" followed by any number [1, 2, 3] in the Mercedes dataframe questions.
**Scope specified disclosure**
This means we have to match:
- scope 3
- scope 1+2

In [22]:
# Keep only the questions that contain "scope", one whitespace character and then a digit
# (e.g. "scope 3", "scope 1+2")
# NOTE(review): the match is case-sensitive, so a question written with a capital "Scope"
# (e.g. "Percentage GHG reduction target – Scope 1+2") would be left out - confirm this is intended
Merc_TCFD = Merc_Clim[Merc_Clim["Question"].str.contains(r"scope\s\d.*", regex = True)]

# Let's check our query results
print(Merc_TCFD["Question"].unique())

<ArrowExtensionArray>
[                                                        'Total scope 1 GHG emissions',
                                        'Total scope 2 GHG emissions (location-based)',
                                          'Total scope 2 GHG emissions (market-based)',
                                           'Total scope 2 GHG emissions (unspecified)',
                                      'Total scope 1+2 GHG emissions (location-based)',
                                        'Total scope 1+2 GHG emissions (market-based)',
                                         'Total scope 1+2 GHG emissions (unspecified)',
                                                         'Total scope 3 GHG emissions',
               'Total scope 3 emissions for Category 1 - Purchased goods and services',
                              'Total scope 3 emissions for Category 2 - Capital goods',
        'Total scope 3 emissions for Category 3 - Fuel- and energy-related activities',
   'Total 

In [23]:
# Random look at 5 of the rows kept by the "scope" filter
Merc_TCFD.sample(n = 5)

Unnamed: 0,FactorId,Name,Question,Date,Answer,PublicationDate,Restated,AgentId,AgentName,AgentGics,CountryOfOperation,SourceName,SourceType,Page,Comment,Excerpt,URL
857,4997,CliInv24-x,Total scope 3 emissions for Category 10 - Proc...,2022-12-31,Not Disclosed,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
864,5002,CliInv29-x,Total scope 3 emissions for Category 15 - Inve...,2022-12-31,Not Disclosed,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
180,5000,CliInv27-x,Total scope 3 emissions for Category 13 - Down...,2021-12-31,Not Disclosed,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
170,4990,CliInv17-x,Total scope 3 emissions for Category 3 - Fuel-...,2021-12-31,Not Disclosed,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",,,,,,
852,4994,CliInv21-x,Total scope 3 emissions for Category 7 - Emplo...,2020-12-31,125000 Metric tonnes (t) CO2e,2023-04-26,False,540,Mercedes-Benz Group AG,25102010,"ARE,ARG,AUS,AUT,BEL,BGR,BRA,CAN,CHE,CHN,COL,CZ...",Sustainability Report 2022,Sustainability Report,103.0,,Scope 3 Specific Absolute CO2 in million t Emp...,https://sustainabilityreport.mercedes-benz.com...


### Pivot tables
Sometimes, we want to turn row values into columns for a detailed comparison.
Now that we have the TCFD scopes of Mercedes-Benz, let's make the Question into columns and their Answers as values

In [24]:
# Pivot tables
# One row per reporting Date, one column per Question, Answers as the cell values.
# A (Date, Question) pair can hold more than one Answer, so the aggregation glues
# them together with a space -- that is all the lambda does, we can learn about lambda later.
Merc_TCFDPivot = Merc_TCFD.pivot_table(
    index = "Date",
    columns = "Question",
    values = "Answer",
    aggfunc = lambda answers: " ".join(answers),
)

# Nice!
# Let's save it as a semicolon-separated file. pandas opens and closes the file
# itself when given a path, so no manual file handle is needed.
Merc_TCFDPivot.to_csv(f"{data_processed}/Merc_TCFDPivot.csv", sep = ";")

Merc_TCFDPivot

Question,Total scope 1 GHG emissions,Total scope 1+2 GHG emissions (location-based),Total scope 1+2 GHG emissions (market-based),Total scope 1+2 GHG emissions (unspecified),Total scope 2 GHG emissions (location-based),Total scope 2 GHG emissions (market-based),Total scope 2 GHG emissions (unspecified),Total scope 3 GHG emissions,Total scope 3 emissions for Category 1 - Purch...,Total scope 3 emissions for Category 10 - Proc...,...,Total scope 3 emissions for Category 14 - Fran...,Total scope 3 emissions for Category 15 - Inve...,Total scope 3 emissions for Category 2 - Capit...,Total scope 3 emissions for Category 3 - Fuel-...,Total scope 3 emissions for Category 4 - Upstr...,Total scope 3 emissions for Category 5 - Waste...,Total scope 3 emissions for Category 6 - Busin...,Total scope 3 emissions for Category 7 - Emplo...,Total scope 3 emissions for Category 8 - Upstr...,Total scope 3 emissions for Category 9 - Downs...
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-31,1.239 m Metric tonnes (t) CO2,2.946 m Metric tonnes (t) CO2,2.516 m Metric tonnes (t) CO2,,1.706 m Metric tonnes (t) CO2,1.276 m Metric tonnes (t) CO2,,,,,...,,,,,,,,,,
2020-12-31,1.027 m Metric tonnes (t) CO2,2.519 m Metric tonnes (t) CO2,2.062 m Metric tonnes (t) CO2,,1.492 m Metric tonnes (t) CO2,1.035 m Metric tonnes (t) CO2,,103.2 m Metric tonnes (t) CO2e,17 m Metric tonnes (t) CO2e,,...,,,,,,800000 Metric tonnes (t) CO2e,12000 Metric tonnes (t) CO2e,125000 Metric tonnes (t) CO2e,,
2021-12-31,681000 Metric tonnes (t) CO2,1.805 m Metric tonnes (t) CO2 1.804 m Metric t...,1.148 m Metric tonnes (t) CO2 1.147 m Metric t...,Not Disclosed,1.123 m Metric tonnes (t) CO2,466000 Metric tonnes (t) CO2,Not Disclosed,123.3 m Metric tonnes (t) CO2 99.2 m Metric to...,20.4 m Metric tonnes (t) CO2 17 m Metric tonne...,Not Disclosed,...,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,1 m Metric tonnes (t) CO2 800000 Metric tonnes...,22000 Metric tonnes (t) CO2 19000 Metric tonne...,122000 Metric tonnes (t) CO2 107000 Metric ton...,Not Disclosed,Not Disclosed
2022-12-31,569000 Metric tonnes (t) CO2e,1.69 m Metric tonnes (t) CO2e,663000 Metric tonnes (t) CO2e,Not Disclosed,1.121 m Metric tonnes (t) CO2e,94000 Metric tonnes (t) CO2e,Not Disclosed,97.8 m Metric tonnes (t) CO2e,17.7 m Metric tonnes (t) CO2e,Not Disclosed,...,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,Not Disclosed,800000 Metric tonnes (t) CO2e,57000 Metric tonnes (t) CO2e,107000 Metric tonnes (t) CO2e,Not Disclosed,Not Disclosed


## Newsroom data

Can we find duplicates in the News Desk January 2023 export?

In [35]:
# Load the processed News Desk export from the raw data folder
processed_newsDF = pd.read_excel(io = f"{data_raw}/Processed_Jan_Dump_All_Languages.xlsx")

# Preview eight randomly drawn articles
processed_newsDF.sample(8)

Unnamed: 0,article_id,is_influential,date_processed,user_name,status,language_x,title,content,url
5135,49631663245,False,2023-01-03,sananoo,processed,english,Salmonella outbreak linked to Nebraska plant,"FREMONT, Neb. (KCAU) — A multistate outbreak o...",http://ct.moreover.com/?a=49631663245&p=2me&v=...
9918,49687046400,False,2023-01-09,magnant,processed,spanish,En riesgo la entrega del ahorro a trabajadores...,"MONCLOVA, COAH.\n- Al menos 6 mil obreros de l...",http://ct.moreover.com/?a=49687046400&p=2me&v=...
6182,49749586343,False,2023-01-16,santlui,processed,english,Nursing union warns that next strike will be t...,Double the variety of nurses can be requested ...,http://ct.moreover.com/?a=49749586343&p=2me&v=...
9444,49875710466,False,2023-01-30,baneamb,processed,spanish,Un juzgado investiga como delito ambiental las...,Un juzgado de Donostia ha abierto diligencias ...,http://ct.moreover.com/?a=49875710466&p=2me&v=...
1734,49815431210,False,2023-01-25,juanang,processed,french,Retraites : Le SNALC appelle à la grève le 31 ...,“Après la participation historique à la grève ...,http://ct.moreover.com/?a=49815431210&p=2me&v=...
4676,49844225785,False,2023-01-26,skayamb,processed,english,Adani Group says it is looking for ‘remedial a...,Short-seller Hindenburg accused the Adani Grou...,http://ct.moreover.com/?a=49844225785&p=2me&v=...
6154,49747448473,False,2023-01-16,santlui,processed,english,Lawsuit filed against former Camp Kieve employee,The civil action accuses Bill McCook of child ...,http://ct.moreover.com/?a=49747448473&p=2me&v=...
1206,49754499428,False,2023-01-18,juanang,processed,french,"Montpellier : ""zéro transport"", l'appel à la g...",Les syndicats CGT-Tam et UGICT-CGT-Tam appelle...,http://ct.moreover.com/?a=49754499428&p=2me&v=...


Most of our newsroom URLs point to "ct.moreover.com", which looks the article up in a database and redirects to the original source. Can we recover the original URL?

We can use the HTTP response status code to test each link and find out what the original URL is. For reference:
https://developer.mozilla.org/en-US/docs/Web/HTTP/Status

In [36]:
# EXPERIMENTAL

import aiohttp
import asyncio
import nest_asyncio

# Allow nested event loops, so that the asyncio.run() call below can be used
# from inside the notebook (which presumably already has a loop running).
nest_asyncio.apply()

async def get_url(session, url):
    """Request ``url`` and report where the article actually lives.

    Parameters
    ----------
    session
        Object whose ``get(url)`` returns an async context manager yielding a
        response with ``status``, ``url`` and ``headers`` attributes (the caller
        in this notebook passes an ``aiohttp.ClientSession``).
    url
        The link to request.

    Returns
    -------
    str or None
        - the final response URL when the status is 200;
        - the ``Location`` header when the status is a redirect code and the
          header is present;
        - the status code as a string (e.g. ``"403"``) for any other status;
        - ``None`` when the request raised, or when a redirect response carried
          no ``Location`` header.
    """
    try:
        async with session.get(url) as res:
            if res.status == 200:
                print(f"The original URL is {res.url}")
                return str(res.url)
            elif res.status in (301, 302, 303, 307, 308):
                redir_url = res.headers.get("Location")
                if redir_url is None:
                    # Without this guard str(None) would hand back the literal
                    # text "None", which looks like data instead of a missing value.
                    print(f"Skipping url {url}: status code {res.status} without a Location header")
                    return None
                print(f"The original URL is redirected and the location is {redir_url}")
                return str(redir_url)
            else:
                print(f"The original URL {url} is corrupted with status code {res.status}")
                return str(res.status)

    except Exception as e:
        # Deliberate best effort: one unreachable link must not abort the whole batch
        print(f"Skipping url {url} due to error: {e}")
        return None

async def OG_url(ct_url: list) -> list:
    """Look up every link in ``ct_url`` concurrently over one shared session.

    Each link is handed to ``get_url``; the returned list holds one result per
    input link, in the same order as ``ct_url``.
    """
    async with aiohttp.ClientSession(trust_env = True) as session:
        lookups = []
        for link in ct_url:
            lookups.append(get_url(session, link))
        resolved = await asyncio.gather(*lookups)
    return resolved

# Run all lookups and store each result next to the link it was requested for
processed_newsDF["OG_url"] = asyncio.run(
    OG_url(processed_newsDF["url"].to_list())
)

The original URL is https://senego.com/les-travailleurs-sociaux-un-corps-deshabille-au-coeur-des-fonctionnaires_1500224.html
The original URL is https://leclaireur.fnac.com/article/219840-une-base-de-donnees-biometriques-de-larmee-americaine-vendue-sur-ebay/
The original URL http://ct.moreover.com/?a=49606329528&p=2me&v=1&x=8XmWeL_qwr_ThAR7n1Ftag is corrupted with status code 403
The original URL http://ct.moreover.com/?a=49617746364&p=2me&v=1&x=nJzjmVU0apGTBFm-e6f31Q is corrupted with status code 403
The original URL is https://www.magazine-greenlife.com/news/info/w6xw.html
The original URL is https://www.generation-nt.com/actualites/twitter-elon-musk-gestion-piratage-donnees-personnelles-2032055
The original URL is https://www.cointribune.com/la-banque-hsbc-depose-une-demande-de-licence-americaine/
The original URL is https://www.tahiti-infos.com/L-entreprise-Ace-Deco-Center-condamnee-a-payer-500-000-francs-par-l-autorite-polynesienne-de-la-concurrence_a214277.html
The original URL i

In [33]:
# Find OG URL duplication
# Keep only the rows whose lookup produced an actual link: failed requests are
# stored as missing values and non-200 answers as bare status codes, and neither
# of those starts with "http".
OG_newsDF = processed_newsDF.loc[processed_newsDF["OG_url"].str.startswith("http", na = False)]

# Every repeat of an already-seen OG_url (the first occurrence is not flagged)
duplicated_news = OG_newsDF.loc[OG_newsDF["OG_url"].duplicated()]

In [34]:
# Save the results. pandas opens and closes each file itself when given a path,
# so no manual handle (previously bound to the DataFrame-like name `df`) is needed.

# Full article table, including the resolved links, as semicolon-separated CSV
processed_newsDF.to_csv(f"{data_interim}/articles_OGurl.csv", sep = ";", index = False)

# The same table as an Excel workbook
processed_newsDF.to_excel(f"{data_interim}/articles_OGurl.xlsx", index = False)

# Only the rows flagged as repeated links
duplicated_news.to_excel(f"{data_interim}/articles_duplicated.xlsx", index = False)