# Merging all S&P 500 companies with the ESG companies shortlisted by S&P within the S&P 500

## Load the Data

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

In [None]:
# Folders holding the raw inputs and the transformed outputs
input_folder_path = '../../data/raw'
output_folder_path = '../../data/transform'

# Paths of the two source CSV files
snp_full = f'{input_folder_path}/snp_companies_raw_full_15-Jan-2022.csv'
snp_esg = f'{input_folder_path}/sp-500-esg-index-01-15-2022.csv'

# Load the full company list and discard its 'Unnamed: 0' column
# (presumably an index written out when the CSV was saved -- verify).
snp_full_df = pd.read_csv(snp_full)
snp_full_df = snp_full_df.drop(columns='Unnamed: 0')

# Load the ESG list as-is
snp_esg_df = pd.read_csv(snp_esg)

In [None]:
# Report the (rows, columns) shape of each freshly loaded table
print(f'Full shape of all S&P companies {snp_full_df.shape}')
print(f'Shape of ESG companies {snp_esg_df.shape}')

## Light Data Exploration for data quality

In [None]:
# Display the full S&P companies table as the cell output for a visual check
snp_full_df

In [None]:
# Display the ESG companies table as the cell output for a visual check
snp_esg_df

In [None]:
# Data Cleaning and Data reduction

# Remove the Barchart 'watermark' row, i.e. any row whose 'Symbol' cell
# contains the text 'Downloaded'.
# na=False treats a missing Symbol as "no match" so the mask stays strictly
# boolean (a NaN inside the mask would make the `~` negation fail);
# regex=False matches the text literally.
is_watermark = snp_esg_df['Symbol'].str.contains('Downloaded', na=False, regex=False)
snp_esg_df = snp_esg_df[~is_watermark]

# Drop irrelevant columns
snp_full_df = snp_full_df.drop(
    columns=['SEC filings', 'Headquarters Location', 'CIK', 'Founded', 'Date first added']
)
snp_esg_df = snp_esg_df.drop(
    columns=['Name', 'Last', 'Change', '%Chg', 'High', 'Low', 'Volume', 'Time']
)

In [None]:
# Check for df shape
print(snp_full_df.shape)
print(snp_esg_df.shape)

# Check for duplication. False when there is no duplication.
# duplicated() returns one boolean per row, so comparing its length with the
# row count can never detect anything; .any() reports whether at least one
# row is actually flagged as a repeat of an earlier row.
print(snp_esg_df.duplicated().any())
print(snp_full_df.duplicated().any())

## Find ESG companies and their relevant info

In [None]:
# Left-join the ESG list onto the full company list on 'Symbol'.
# indicator=True adds a '_merge' column recording whether each row was found
# in both tables or only in the left one.
snp_esg_full = snp_full_df.merge(
    snp_esg_df,
    how='left',
    on='Symbol',
    indicator=True,
)

# Turn the indicator into a 1/0 flag: 1 when the symbol is present in both tables
snp_esg_full['_merge'] = np.where(snp_esg_full['_merge'] == 'both', 1, 0)

# Table clean up
snp_esg_full = (
    snp_esg_full
    # Rename _merge column for readability
    .rename(columns={'_merge': 'is_esg'})
    # Sort values, then rebuild a clean 0..n-1 index. drop=True discards the
    # old index directly instead of materialising it as an 'index' column and
    # dropping that column by name afterwards.
    .sort_values(['Symbol', 'GICS Sector'], ascending=True)
    .reset_index(drop=True)
)



In [None]:
# Convert date_first_added_temp to datetime object
# NOTE(review): no 'date_first_added_temp' column is created in the cells
# visible here (and 'Date first added' is dropped during cleaning), so this
# line raises KeyError unless the raw CSV already provides that column.
# TODO confirm against the input file.
snp_esg_full['date_first_added_temp'] = pd.to_datetime(snp_esg_full['date_first_added_temp'])

In [None]:
# Mask the table with its own not-null map. Every row and column is kept;
# cells that are missing simply remain missing.
# NOTE(review): this does not remove rows with missing values -- if that was
# the intent, dropna() would be needed. TODO confirm.
test = snp_esg_full.where(snp_esg_full.notna())

In [None]:
# Display the masked table as the cell output for inspection
test

In [None]:
# snp_esg_full.to_csv(f'{output_folder_path}/snp_esg_full_{datetime.now().strftime("%d-%b-%Y")}.csv')