In [1]:
# Importing necessary packages
import pandas as pd
import numpy as np

In [2]:
# Load the Refinitiv ESG / total-assets data and discard the leftover
# 'Unnamed: 0' column that came along with the CSV export
ESGmeasures = pd.read_csv('RefinitivAndAssets_1_30_25.csv').drop(columns='Unnamed: 0')
print(ESGmeasures)

       year     cusip                                 comname          sector  \
0      2002  00105510                      AFLAC INCORPORATED      FinanceIns   
1      2002  00195750                              AT&T CORP.  DidNotIdentify   
2      2002  00282410                     ABBOTT LABORATORIES   Manufacturing   
3      2002  00724F10                              ADOBE INC.  DidNotIdentify   
4      2002  00790310            ADVANCED MICRO DEVICES, INC.   Manufacturing   
...     ...       ...                                     ...             ...   
30469  2023  98980G10                           ZSCALER, INC.  DidNotIdentify   
30470  2023  98980L10               ZOOM COMMUNICATIONS, INC.  DidNotIdentify   
30471  2023  98981710                             ZUMIEZ INC.  DidNotIdentify   
30472  2023  98983L10  ZURN ELKAY WATER SOLUTIONS CORPORATION   Manufacturing   
30473  2023  98983V10                             ZUORA, INC.      FinanceIns   

      AnalyticCO2Estimation

In [3]:
# List the distinct values that appear in the CO2 estimation-method column
print(ESGmeasures['AnalyticCO2EstimationMethod'].unique())

['Median' 'Reported' 'Energy' 'CO2']


# Monthly Fama-French Factors from Fama-French Data Library

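Download the desired monthly Fama-French factor data (here the five factors Mkt-RF, SMB, HML, RMW, CMA plus the risk-free rate RF, all reported in percent) from the Fama-French Data Library: https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html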

In [4]:
# Read the factor file and discard the unnecessary 'Unnamed: 0' column
# (the data file was edited in Excel before being imported here)
FF = pd.read_csv('FF_Factors_1_30_25.csv').drop(columns='Unnamed: 0')
print(FF)

     Year  Month  Mkt-RF   SMB   HML   RMW   CMA    RF
0    2002      1   -1.44  1.26  3.44  4.69  2.86  0.14
1    2002      2   -2.29 -0.36  2.16  8.07  5.11  0.13
2    2002      3    4.24  4.25  1.06 -1.78  0.59  0.13
3    2002      4   -5.20  6.72  3.88  4.56  5.37  0.15
4    2002      5   -1.38 -3.01  1.53  2.36  2.44  0.14
..    ...    ...     ...   ...   ...   ...   ...   ...
259  2023      8   -2.39 -3.68 -1.08  3.42 -2.37  0.45
260  2023      9   -5.24 -1.79  1.45  1.85 -0.84  0.43
261  2023     10   -3.18 -4.05  0.19  2.47 -0.67  0.47
262  2023     11    8.83 -0.11  1.66 -3.81 -0.99  0.44
263  2023     12    4.87  7.33  4.92 -3.04  1.30  0.43

[264 rows x 8 columns]


In [5]:
# Check the data types of the imported Fama-French factor data
# (Year and Month are used later as the keys for merging onto the CRSP returns)
print(FF.dtypes)

Year        int64
Month       int64
Mkt-RF    float64
SMB       float64
HML       float64
RMW       float64
CMA       float64
RF        float64
dtype: object


# Monthly Stock Returns from CRSP

Export updated list of represented CUSIPs to pass through CRSP (to get monthly return data)

In [6]:
# Collect every distinct CUSIP that appears in the ESG data
unique_values = ESGmeasures['cusip'].unique()
# Save them one per line to a text file
with open('finaluniquecusips.txt', 'w') as cusip_file:
    cusip_file.writelines(str(cusip) + '\n' for cusip in unique_values)

The 'CRSP_Returns_2_23_25.csv' file is the data I downloaded from CRSP (date, CUSIP, and RET are the key fields to request), using the 'finaluniquecusips.txt' file from above as the list of CUSIPs for the data query.

In [7]:
# Load the CRSP monthly returns. CUSIP is forced to string so the identifiers
# keep their leading zeros and letters no matter how pandas infers types while
# reading a file of this size.
CRSP = pd.read_csv('CRSP_Returns_2_23_25.csv', dtype={'CUSIP': str})
CRSP = CRSP.rename(columns={'CUSIP': 'cusip'})  # same key name as in ESGmeasures
CRSP = CRSP.drop(columns='PERMNO')  # removing an unnecessary column
print(CRSP)

              date     cusip        RET
0       2002-01-31  46603210   0.226585
1       2002-02-28  46603210   0.234078
2       2002-03-28  46603210   0.009997
3       2002-04-30  46603210   0.030498
4       2002-05-31  46603210  -0.060228
...            ...       ...        ...
684765  2023-08-31  12503M10   0.075745
684766  2023-09-29  12503M10   0.043417
684767  2023-10-31  12503M10   0.049165
684768  2023-11-30  12503M10   0.115016
684769  2023-12-29  12503M10  -0.019924

[684770 rows x 3 columns]


Certain CUSIP-months do not have a usable RET: the value is either blank (NaN) or a non-numeric code such as 'C' (possibly also a CUSIP-formatting issue, but it is not many rows out of the large dataset). We first look at what those rows look like and then get rid of them.

In [8]:
def find_non_float_values(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` value is not a usable float.

    A value is treated as unusable when it is missing (NaN/None) or when it
    cannot be parsed as a number (for example a letter code such as 'C').
    The rule is applied the same way whether or not the column contains any
    unparsable entries, so the result does not depend on the rest of the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column_name : str
        Name of the column to test.

    Returns
    -------
    pandas.DataFrame
        The offending rows with their original index and all original
        columns. When every value is usable the result is empty but still
        has the same columns as ``df``.
    """
    # errors='coerce' turns anything non-numeric into NaN instead of raising,
    # so a single mask covers both missing and unparsable values.
    mask = pd.to_numeric(df[column_name], errors='coerce').isna()
    return df[mask]

# Show the CRSP rows that find_non_float_values flags for the RET column
result = find_non_float_values(CRSP, 'RET')
print(result)

              date     cusip  RET
387     2012-04-30  29402E10  NaN
388     2012-05-31  29402E10  NaN
389     2012-06-29  29402E10  NaN
390     2012-07-31  29402E10  NaN
391     2012-08-31  29402E10  NaN
...            ...       ...  ...
684512  2010-05-28  11133B40  NaN
684513  2010-06-30  11133B40    C
684605  2018-02-28  11133B40  NaN
684606  2010-05-28  12503M10  NaN
684607  2010-06-30  12503M10    C

[12017 rows x 3 columns]


In [9]:
# Keep only the rows whose RET parses as a number (drops blanks and letter codes).
# .copy() makes the result an independent frame, so the column assignments that
# follow do not operate on a slice of the unfiltered data.
CRSP = CRSP[pd.to_numeric(CRSP['RET'], errors='coerce').notnull()].copy()

Converting the CRSP columns to proper data types (RET to float, date to datetime) and extracting Year and Month, so that the keys have the same form as in our other dataset when the two are merged

In [10]:
# Cast RET to float and parse the date strings into datetimes
CRSP = CRSP.assign(
    RET=CRSP['RET'].astype(float),
    date=pd.to_datetime(CRSP['date']),
)
# Derive the Year and Month columns from the parsed date
CRSP = CRSP.assign(Year=CRSP['date'].dt.year, Month=CRSP['date'].dt.month)
print(CRSP)

             date     cusip       RET  Year  Month
0      2002-01-31  46603210  0.226585  2002      1
1      2002-02-28  46603210  0.234078  2002      2
2      2002-03-28  46603210  0.009997  2002      3
3      2002-04-30  46603210  0.030498  2002      4
4      2002-05-31  46603210 -0.060228  2002      5
...           ...       ...       ...   ...    ...
684765 2023-08-31  12503M10  0.075745  2023      8
684766 2023-09-29  12503M10  0.043417  2023      9
684767 2023-10-31  12503M10  0.049165  2023     10
684768 2023-11-30  12503M10  0.115016  2023     11
684769 2023-12-29  12503M10 -0.019924  2023     12

[672753 rows x 5 columns]


# Merging FF Factors onto CRSP Dataset

In [11]:
# Attach the monthly factor values to every stock-month observation.
# validate='many_to_one' makes pandas raise if FF ever holds more than one row
# for a (Year, Month) pair, which would otherwise silently duplicate return rows.
CRSP_FF = pd.merge(CRSP, FF, how='inner', on=['Year', 'Month'], validate='many_to_one')

In [12]:
# Count the missing values in each column of the merged frame.
# Note: the merge above is an inner join, so unmatched rows are dropped rather
# than filled with NaN and would not show up in this count.
nan_count_per_column = CRSP_FF.isnull().sum(axis=0)
print(nan_count_per_column)

date      0
cusip     0
RET       0
Year      0
Month     0
Mkt-RF    0
SMB       0
HML       0
RMW       0
CMA       0
RF        0
dtype: int64


# Calculating Excess Returns of Stocks

In [13]:
# CRSP's RET is a decimal return (0.2266 means 22.66%), whereas the Fama-French
# RF is quoted in percent (0.14 means 0.14%). Subtracting them directly mixes
# units, so RET is scaled to percent first. ExcessReturn is therefore expressed
# in percent, the same unit as Mkt-RF, SMB, HML, RMW and CMA; RET itself is left
# in its original decimal form.
CRSP_FF = (
    CRSP_FF
    .assign(ExcessReturn=lambda frame: frame['RET'] * 100 - frame['RF'])
    .drop(columns='date')  # Year and Month already carry the timing information
    .rename(columns={'Year': 'year'})  # match the 'year' key used in ESGmeasures
)
print(CRSP_FF)

           cusip       RET  year  Month  Mkt-RF   SMB   HML   RMW   CMA    RF  \
0       46603210  0.226585  2002      1   -1.44  1.26  3.44  4.69  2.86  0.14   
1       46603210  0.234078  2002      2   -2.29 -0.36  2.16  8.07  5.11  0.13   
2       46603210  0.009997  2002      3    4.24  4.25  1.06 -1.78  0.59  0.13   
3       46603210  0.030498  2002      4   -5.20  6.72  3.88  4.56  5.37  0.15   
4       46603210 -0.060228  2002      5   -1.38 -3.01  1.53  2.36  2.44  0.14   
...          ...       ...   ...    ...     ...   ...   ...   ...   ...   ...   
672748  12503M10  0.075745  2023      8   -2.39 -3.68 -1.08  3.42 -2.37  0.45   
672749  12503M10  0.043417  2023      9   -5.24 -1.79  1.45  1.85 -0.84  0.43   
672750  12503M10  0.049165  2023     10   -3.18 -4.05  0.19  2.47 -0.67  0.47   
672751  12503M10  0.115016  2023     11    8.83 -0.11  1.66 -3.81 -0.99  0.44   
672752  12503M10 -0.019924  2023     12    4.87  7.33  4.92 -3.04  1.30  0.43   

        ExcessReturn  
0   

# Merging EVERYTHING Together (ESG, Total Assets, FF Factors, CRSP Returns)

In [14]:
# Join the ESG / total-assets data onto the monthly return-and-factor rows,
# matching on year and CUSIP
allrelevantdata = CRSP_FF.merge(ESGmeasures, on=['year', 'cusip'])

In [15]:
# Double checking that no column of the merged frame contains missing values
# (this counts NaNs per column; it does not report rows that found no match in the merge)
print(allrelevantdata.isna().sum())

cusip                                           0
RET                                             0
year                                            0
Month                                           0
Mkt-RF                                          0
SMB                                             0
HML                                             0
RMW                                             0
CMA                                             0
RF                                              0
ExcessReturn                                    0
comname                                         0
sector                                          0
AnalyticCO2EstimationMethod                     0
AnalyticEstimatesCO2EquivalentsEmissionTotal    0
TotalAssets                                     0
dtype: int64


In [16]:
# Take a look at the final merged dataset
print(allrelevantdata)

           cusip       RET  year  Month  Mkt-RF   SMB   HML   RMW   CMA    RF  \
0       46603210 -0.097913  2015      1   -3.11 -0.92 -3.59  1.61 -1.65  0.00   
1       46603210  0.031288  2015      2    6.13  0.32 -1.86 -1.12 -1.82  0.00   
2       46603210  0.058010  2015      3   -1.12  3.07 -0.38  0.09 -0.52  0.00   
3       46603210 -0.022212  2015      4    0.59 -3.09  1.82  0.06 -0.61  0.00   
4       46603210  0.033260  2015      5    1.36  0.84 -1.15 -1.80 -0.75  0.00   
...          ...       ...   ...    ...     ...   ...   ...   ...   ...   ...   
362828  12503M10  0.075745  2023      8   -2.39 -3.68 -1.08  3.42 -2.37  0.45   
362829  12503M10  0.043417  2023      9   -5.24 -1.79  1.45  1.85 -0.84  0.43   
362830  12503M10  0.049165  2023     10   -3.18 -4.05  0.19  2.47 -0.67  0.47   
362831  12503M10  0.115016  2023     11    8.83 -0.11  1.66 -3.81 -0.99  0.44   
362832  12503M10 -0.019924  2023     12    4.87  7.33  4.92 -3.04  1.30  0.43   

        ExcessReturn       

In [17]:
# Export this cleaned dataset (contains everything we need to create factordata!)
# index=False keeps the row index out of the CSV file
allrelevantdata.to_csv('relevantdata.csv', index=False)