### Sustainability Aware Asset Management: **Group A: North America // Scope 1 + 2**
#### **Data import and cleaning**

In [72]:
## Packages lists:
import pandas as pd
import openpyxl

In [73]:
## Load 'Static.xlsx', which holds the descriptive fields of each company
## (ISIN, Name, Sector, Country and Region).
companies = pd.read_excel('Data/Static.xlsx')
## Retain only the rows whose Region is "AMER" (this notebook's North America
## universe) and index them by ISIN. drop=False leaves ISIN available as a
## regular column in addition to being the index.
companies = companies.loc[companies["Region"] == "AMER"].set_index("ISIN", drop=False)

In [74]:
## Preview the first five rows of the filtered company table.
companies.head(n=5)

Unnamed: 0_level_0,Company,ISIN,GICSSectorName,Country,Region
ISIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AN8068571086,Schlumberger Limited,AN8068571086,Energy,UNITED STATES,AMER
BMG0450A1053,Arch Capital Group Ltd.,BMG0450A1053,Financials,UNITED STATES,AMER
BMG3223R1088,"Everest Group, Ltd.",BMG3223R1088,Financials,UNITED STATES,AMER
BMG6359F1370,Nabors Industries Ltd.,BMG6359F1370,Energy,UNITED STATES,AMER
BMG7496G1033,RenaissanceRe Holdings Ltd.,BMG7496G1033,Financials,UNITED STATES,AMER


In [86]:
## Load the monthly and yearly return files (one row per stock).
## NOTE(review): the file names contain 'RI' - confirm whether they hold
## return-index levels rather than period returns.
df_m = pd.read_excel("Data/DS_RI_T_USD_M.xlsx")
df_y = pd.read_excel("Data/DS_RI_T_USD_Y_CAI.xlsx")

## Keep only the stocks whose ISIN is present in the filtered `companies` index.
df_m = df_m.loc[df_m["ISIN"].isin(companies.index)]
df_y = df_y.loc[df_y["ISIN"].isin(companies.index)]

## For each frequency: drop the ISIN column, key the rows by company NAME,
## clear the index label, then transpose so that every company becomes a
## column and the former column headers become the row index.
monthly_return = (
    df_m.drop(columns="ISIN")
    .set_index("NAME")
    .rename_axis(None)
    .T
)
yearly_return = (
    df_y.drop(columns="ISIN")
    .set_index("NAME")
    .rename_axis(None)
    .T
)


In [37]:
def has_duplicates(arr):
    """Return True when `arr` contains at least one repeated element.

    The number of distinct elements (obtained through `set`) is compared with
    the total number of elements, so every element must be hashable.
    """
    distinct = set(arr)
    return len(distinct) < len(arr)

## Check every column of monthly_return for repeated values.
## Fix: the column is now selected with the loop counter and the number of
## columns is taken from the frame itself; previously column 0 was inspected
## on every pass and the column count was hard-coded to 595.
dup = []
for i in range(monthly_return.shape[1]):
    column_values = monthly_return.iloc[:, i].values
    dup.append(has_duplicates(column_values))

print(dup.count(True))
print(dup.count(False))

## dup.count(True)  = number of columns holding at least one repeated value.
## dup.count(False) = number of columns without any repeated value.
## NOTE(review): re-run this cell - any output saved before this fix reflects
## the first column only, so the "no repeating values" conclusion must be
## re-confirmed.

0
595


In [16]:
## MV data
mv = pd.read_excel('Data/DS_MV_USD_M.xlsx')

mv = mv[mv.ISIN.isin(companies.index)]
mv = mv.drop('ISIN', axis=1)
mv = mv.set_index('NAME')
mv.index.name = None

In [87]:
## Scope 1 intensity. The first spreadsheet column is read as the index and
## rows are kept when that index value appears in `companies.index`
## (presumably the ISIN - confirm). The rows are then re-keyed by NAME.
## NOTE(review): `.iloc[:, 7:]` discards the first seven remaining columns -
## presumably descriptive fields or unwanted periods; TODO confirm the offset.
scope1_int = pd.read_excel('Data/TC_Scope1Intensity.xlsx', index_col=0)
scope1_int = (
    scope1_int.loc[scope1_int.index.isin(companies.index)]
    .reset_index()
    .set_index('NAME')
    .iloc[:, 7:]
)

## Scope 1 emissions: same steps applied to the emissions file.
scope1_em = pd.read_excel('Data/TC_Scope1.xlsx', index_col=0)
scope1_em = (
    scope1_em.loc[scope1_em.index.isin(companies.index)]
    .reset_index()
    .set_index('NAME')
    .iloc[:, 7:]
)

In [88]:
## Scope 2 intensity. The first spreadsheet column is read as the index and
## rows are kept when that index value appears in `companies.index`
## (presumably the ISIN - confirm). The rows are then re-keyed by NAME.
## NOTE(review): `.iloc[:, 7:]` discards the first seven remaining columns -
## presumably descriptive fields or unwanted periods; TODO confirm the offset.
scope2_int = pd.read_excel('Data/TC_Scope2Intensity.xlsx', index_col=0)
scope2_int = (
    scope2_int.loc[scope2_int.index.isin(companies.index)]
    .reset_index()
    .set_index('NAME')
    .iloc[:, 7:]
)

## Scope 2 emissions: same steps applied to the emissions file.
scope2_em = pd.read_excel('Data/TC_Scope2.xlsx', index_col=0)
scope2_em = (
    scope2_em.loc[scope2_em.index.isin(companies.index)]
    .reset_index()
    .set_index('NAME')
    .iloc[:, 7:]
)