# Merging all S&P 500 companies with the ESG companies shortlisted by S&P within the S&P 500

## Load the Data

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Folders holding the raw inputs and the transformed outputs
input_folder_path = '../../data/raw'
output_folder_path = '../../data/transform'

# Paths of the two source CSV files
snp_full = f'{input_folder_path}/snp_companies_raw_full_15-Jan-2022.csv'
snp_esg = f'{input_folder_path}/sp-500-esg-index-01-15-2022.csv'

# Load both files; the 'Unnamed: 0' column of the full constituents file is discarded
snp_esg_df = pd.read_csv(snp_esg)
snp_full_df = pd.read_csv(snp_full).drop(columns='Unnamed: 0')

In [3]:
# Report the (rows, columns) of each frame right after loading
print('Full shape of all S&P companies', snp_full_df.shape)
print('Shape of ESG companies', snp_esg_df.shape)

Full shape of all S&P companies (505, 9)
Shape of ESG companies (313, 9)


## Light Data Exploration for data quality

In [4]:
# Display the full S&P companies frame loaded from the raw CSV
snp_full_df

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,AOS,A. O. Smith,reports,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
3,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
...,...,...,...,...,...,...,...,...,...
500,YUM,Yum! Brands,reports,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
501,ZBRA,Zebra Technologies,reports,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
502,ZBH,Zimmer Biomet,reports,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
503,ZION,Zions Bancorp,reports,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


In [5]:
# Display the ESG companies frame loaded from the raw CSV
snp_esg_df

Unnamed: 0,Symbol,Name,Last,Change,%Chg,High,Low,Volume,Time
0,A,Agilent Technologies,144.68,-0.49,-0.34%,145.150,142.3600,2225400.0,2022-01-14
1,AAL,American Airlines Gp,18.49,-0.85,-4.40%,19.160,18.4000,44983102.0,2022-01-14
2,AAPL,Apple Inc,173.07,0.88,+0.51%,173.780,171.0900,80440703.0,2022-01-14
3,ABBV,Abbvie Inc,135.87,2.35,+1.76%,136.050,132.2906,10484500.0,2022-01-14
4,ABC,Amerisourcebergen Corp,136.20,1.31,+0.97%,136.305,133.3300,926900.0,2022-01-14
...,...,...,...,...,...,...,...,...,...
308,XOM,Exxon Mobil Corp,71.87,1.24,+1.76%,72.145,70.6000,23012800.0,2022-01-14
309,XYL,Xylem Inc,111.54,-2.88,-2.52%,113.430,109.5500,1725200.0,2022-01-14
310,YUM,Yum! Brands,127.06,-1.75,-1.36%,128.550,126.3400,1493500.0,2022-01-14
311,ZBRA,Zebra Technologies,528.00,2.20,+0.42%,533.220,517.6000,321800.0,2022-01-14


In [6]:
# Data Cleaning and Data reduction

# Remove the Barchart 'watermark' row.
# na=False treats a missing Symbol as "not a watermark", so the mask is always
# purely boolean and can be negated / used for indexing; regex=False matches
# the text literally.
snp_esg_df = snp_esg_df[
    ~snp_esg_df['Symbol'].str.contains('Downloaded', na=False, regex=False)
]

# Drop irrelevant columns.
# errors='ignore' keeps this cell re-runnable: both names are overwritten here,
# so on a second execution the columns are already gone and a plain drop would
# raise KeyError. NOTE: this also means a misspelled column name is skipped
# silently, so verify the resulting shapes afterwards.
snp_full_df = snp_full_df.drop(
    columns=['SEC filings', 'Headquarters Location', 'CIK', 'Founded', 'Date first added'],
    errors='ignore',
)
snp_esg_df = snp_esg_df.drop(
    columns=['Name', 'Last', 'Change', '%Chg', 'High', 'Low', 'Volume', 'Time'],
    errors='ignore',
)

In [7]:
# Check the shape of each frame after cleaning
print(snp_full_df.shape)
print(snp_esg_df.shape)

# Check for duplication. Prints False when there is no duplication.
# duplicated() returns one boolean per row, so comparing its length with the
# row count can never differ; .any() reports whether at least one row is an
# exact repeat of an earlier row.
print(snp_esg_df.duplicated().any())
print(snp_full_df.duplicated().any())

(505, 4)
(312, 1)
False
False


## Find ESG companies and their relevant info

In [8]:
snp_esg_full = (
    snp_full_df
    # Left join keeps every row of the full list; the indicator column
    # '_merge' records whether the Symbol was also found in the ESG list
    .merge(snp_esg_df, how='left', on='Symbol', indicator=True)

    # Turn the indicator into a 1/0 flag: 1 when the Symbol is in both frames
    .assign(_merge=lambda merged: np.where(merged['_merge'] == 'both', 1, 0))

    # Rename _merge column for readability
    .rename(columns={'_merge': 'is_esg'})

    # Sort values and rebuild a clean 0..n-1 index
    .sort_values(['Symbol', 'GICS Sector'])
    .reset_index(drop=True)
)



In [9]:
# Display the merged frame, including the is_esg flag column
snp_esg_full

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,is_esg
0,A,Agilent Technologies,Health Care,Health Care Equipment,1
1,AAL,American Airlines Group,Industrials,Airlines,1
2,AAP,Advance Auto Parts,Consumer Discretionary,Automotive Retail,0
3,AAPL,Apple,Information Technology,"Technology Hardware, Storage & Peripherals",1
4,ABBV,AbbVie,Health Care,Pharmaceuticals,1
...,...,...,...,...,...
500,YUM,Yum! Brands,Consumer Discretionary,Restaurants,1
501,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,0
502,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,1
503,ZION,Zions Bancorp,Financials,Regional Banks,0


In [10]:
# Stamp the output file name with the date of this run (day-month abbreviation-year)
run_date = datetime.now().strftime("%d-%b-%Y")
output_file = f'{output_folder_path}/snp_esg_full_{run_date}.csv'

# Written with pandas' default settings, so the row index is saved as the first column
snp_esg_full.to_csv(output_file)