In [1]:
# Importing necessary packages
import pandas as pd
import numpy as np

# Loading in the Refinitiv Data

In [2]:
# Here, 'Refinitiv_1_30_25.csv' is the .csv file downloaded directly from the WRDS Refinitiv V2 query
refinitiv = pd.read_csv('Refinitiv_1_30_25.csv', header=0, encoding='latin-1')

  refinitiv = pd.read_csv('Refinitiv_1_30_25.csv', header=0, encoding='latin-1')


In [3]:
# Drop any unnecessary columns (Here, we droped 'fieldid')
df = refinitiv.copy()
df.drop(['fieldid'], axis=1, inplace=True)
print(df)

          orgpermid  year        cusip  naicscode                     comname  \
0        4295533401  2017    777780107        NaN          ROSETTA STONE INC.   
1        4295533401  2017    777780107        NaN          ROSETTA STONE INC.   
2        4295533401  2017    777780107        NaN          ROSETTA STONE INC.   
3        4295533401  2017    777780107        NaN          ROSETTA STONE INC.   
4        4295533401  2017    777780107        NaN          ROSETTA STONE INC.   
...             ...   ...          ...        ...                         ...   
2234360  8589934524  2022  880591300.0   221112.0  TENNESSEE VALLEY AUTHORITY   
2234361  8589934524  2022  880591300.0   221112.0  TENNESSEE VALLEY AUTHORITY   
2234362  8589934524  2022  880591300.0   221112.0  TENNESSEE VALLEY AUTHORITY   
2234363  8589934524  2022  880591300.0   221112.0  TENNESSEE VALLEY AUTHORITY   
2234364  8589934524  2022  880591300.0   221112.0  TENNESSEE VALLEY AUTHORITY   

          hierarchy       p

# Cleaning the CUSIPs

**Drop observations with NaN as CUSIP (means it's non-US)**; This scope limitation is in place because CRSP monthly returns are only provided for stocks with CUSIPs (US only). Additionally, my research only focuses on US companies anyways!

In [4]:
# Drop observations with missing CUSIPs and convert all CUSIPs to strings!
df.dropna(subset=['cusip'], inplace=True)
df['cusip'] = df['cusip'].astype(str)
print(df['cusip'])

0            777780107
1            777780107
2            777780107
3            777780107
4            777780107
              ...     
2234360    880591300.0
2234361    880591300.0
2234362    880591300.0
2234363    880591300.0
2234364    880591300.0
Name: cusip, Length: 746467, dtype: object


In [5]:
# Want all CUSIP strings to look the same (i.e. not have ".0" at the end)
df['cusip'] = df['cusip'].str.replace(r'\.0$', '', regex=True)

In [6]:
# Printing how many digits the current (uncleaned) CUSIPs have
df_cusipcopy = df.copy()
df_cusipcopy['cusipDigits'] = df_cusipcopy['cusip'].str.len()
print(df_cusipcopy['cusipDigits'].value_counts())

cusipDigits
9     745907
8        526
10        34
Name: count, dtype: int64


The first 6 digits of a CUSIP identify the company, digits 7-8 describe the security and the 9th is a check digit

With CRSP and Compustat, we can use 8-digit CRSPs, so...
* We need to add 0 to the end of the 7-digit cusips
* Take first 8 digits of every string in the column

In [7]:
def add_zero(string):
    if len(string) == 7:
        return string + '0'
    else:
        return string

df['cusip'] = df['cusip'].apply(add_zero)

In [8]:
# Using str.slice() to keep the first 8 characters
df['cusip'] = df['cusip'].str.slice(0, 8)

In [9]:
# Checking that all CUSIPs are the same length
df_cusipcopy2 = df.copy()
df_cusipcopy2['cusipDigits'] = df_cusipcopy2['cusip'].str.len()
print(df_cusipcopy2['cusipDigits'].value_counts())

cusipDigits
8    746467
Name: count, dtype: int64


In [10]:
# Printing all the unique fields so we can more easily delete the ones that we are no longer interested in below
unique_fields = df['fieldname'].unique()
print(unique_fields)

['ESGScore' 'ESGCombinedScore' 'EnvironmentPillarScore'
 'GovernancePillarScore' 'SocialPillarScore' 'ESGResourceUseScore'
 'ESGEmissionsScore' 'ESGInnovationScore' 'ESGWorkforceScore'
 'ESGHumanRightsScore' 'ESGCommunityScore' 'ESGProductResponsibilityScore'
 'ESGManagementScore' 'ESGShareholdersScore' 'ESGCsrStrategyScore'
 'AnalyticCO2EstimationMethod'
 'AnalyticEstimatesCO2EquivalentsEmissionTotal'
 'CO2EquivalentsEmissionDirectScope1'
 'CO2EquivalentsEmissionIndirectScope3'
 'CO2EquivalentsEmissionIndirectScope2' 'CO2EquivalentsEmissionTotal']


In [11]:
# We downloaded data with many different ESG data scores and data points to experiment with
# However, now knowing that we are only using 'AnalyticCO2EstimationMethod' and 'AnalyticEstimatesCO2EquivalentsEmissionTotal', 
    # we can remove all other fields
df = df[df['fieldname'] != 'ESGScore']
df = df[df['fieldname'] != 'ESGCombinedScore']
df = df[df['fieldname'] != 'EnvironmentPillarScore']
df = df[df['fieldname'] != 'GovernancePillarScore']
df = df[df['fieldname'] != 'SocialPillarScore']
df = df[df['fieldname'] != 'ESGResourceUseScore']
df = df[df['fieldname'] != 'ESGEmissionsScore']
df = df[df['fieldname'] != 'ESGInnovationScore']
df = df[df['fieldname'] != 'ESGWorkforceScore']
df = df[df['fieldname'] != 'ESGHumanRightsScore']
df = df[df['fieldname'] != 'ESGCommunityScore']
df = df[df['fieldname'] != 'ESGProductResponsibilityScore']
df = df[df['fieldname'] != 'ESGManagementScore']
df = df[df['fieldname'] != 'ESGShareholdersScore']
df = df[df['fieldname'] != 'ESGCsrStrategyScore']
df = df[df['fieldname'] != 'CO2EquivalentsEmissionDirectScope1']
df = df[df['fieldname'] != 'CO2EquivalentsEmissionIndirectScope3']
df = df[df['fieldname'] != 'CO2EquivalentsEmissionIndirectScope2']
df = df[df['fieldname'] != 'CO2EquivalentsEmissionTotal']

# Adding Sectors

While the provided North American Industry Classification System (NAICS) codes have 6-digits (representing the most detailed, “National Industry” categorization), only the first two digits were taken to obtain the more general sector.

In [12]:
# Only keep the first two digits of the NAICS codes
df['sectorcode'] = df['naicscode'].astype(str).str.slice(0, 2)
df.sectorcode.unique()

array(['na', '52', '33', '32', '62', '21', '22', '23', '53', '54', '71',
       '48', '42', '44', '31', '51', '56', '11', '81', '72', '61', '49',
       '55', '92'], dtype=object)

**Dropping**:
* '56': Administrative and Support and Waste Management and Remediation Services
* '81': Other Services (except Public Administration)
* '92': Public Administration (not covered in economic census)
* '61': Educational Services

In [13]:
df = df[df['sectorcode'] != '56']
df = df[df['sectorcode'] != '81']
df = df[df['sectorcode'] != '92']
df = df[df['sectorcode'] != '61']

**Combining**:
* '31', '32', '33': Manufacturing
* '48', '49: Transportation and Warehousing --> TransportWarehouse

In [14]:
df.loc[df['sectorcode'] == '32', 'sectorcode'] = '31'
df.loc[df['sectorcode'] == '33', 'sectorcode'] = '31'
df.loc[df['sectorcode'] == '49', 'sectorcode'] = '48'

In [15]:
def get_sector(sectorcode):
    if sectorcode == 'na':
        return 'DidNotIdentify'

    elif sectorcode == '31':
        return 'Manufacturing'
    elif sectorcode == '48':
        return 'TransportWarehouse'

    elif sectorcode == '52':
        return 'FinanceIns'
    elif sectorcode == '62':
        return 'Healthcare'
    elif sectorcode == '21':
        return 'OilGas'
    elif sectorcode == '53':
        return 'RealEstate'
    elif sectorcode == '54':
        return 'Services'
    elif sectorcode == '71':
        return 'ArtsEntRec'
    elif sectorcode == '11':
        return 'Agriculture'
    elif sectorcode == '55':
        return 'Management'
    elif sectorcode == '72':
        return 'AccomFood'

    elif sectorcode == '22':
        return 'Utilities'
    elif sectorcode == '23':
        return 'Construction'
    elif sectorcode == '42':
        return 'Wholesale'
    elif sectorcode == '44':
        return 'Retail'
    elif sectorcode == '51':
        return 'Information'

In [16]:
# Adding a sector column with the strings (names of the sectors, instead of sector codes) that we defined above
df['sector'] = df['sectorcode'].apply(get_sector)
df.drop(['naicscode', 'sectorcode'], axis=1, inplace=True)
print(df)

          orgpermid  year     cusip                      comname   hierarchy  \
15       4295533401  2017  77778010           ROSETTA STONE INC.  Data Point   
16       4295533401  2017  77778010           ROSETTA STONE INC.  Data Point   
32       4295533401  2018  77778010           ROSETTA STONE INC.  Data Point   
33       4295533401  2018  77778010           ROSETTA STONE INC.  Data Point   
49       4295533401  2019  77778010           ROSETTA STONE INC.  Data Point   
...             ...   ...       ...                          ...         ...   
2233464  8589934339  2022  60505104  BANK OF AMERICA CORPORATION  Data Point   
2233484  8589934339  2023  60505104  BANK OF AMERICA CORPORATION  Data Point   
2233485  8589934339  2023  60505104  BANK OF AMERICA CORPORATION  Data Point   
2234359  8589934524  2022  88059130   TENNESSEE VALLEY AUTHORITY  Data Point   
2234360  8589934524  2022  88059130   TENNESSEE VALLEY AUTHORITY  Data Point   

              pillar                   

In [17]:
# Export this cleaned dataset (with updated CUSIPs and Sectors) as a .csv and proceed to Step 2
df.to_csv('CleanedNonPivot_1_30_25.csv')