In [1]:
############### Import packages
import os, numpy as np, pandas as pd, time, glob, re, math, statsmodels.api as sm, patsy as ps
from tqdm import tqdm
from time import process_time
from datetime import datetime
from datetime import date
from openpyxl import load_workbook
from patsy import dmatrices

##########################################################
##################### parameter ##########################
##########################################################
obj_type = '8-K'
data_type_text = 'text_data'
data_type_id = 'id_data'

############### Set working directory to parent directory
if os.getcwd() != 'F:\\github\\narrative_conservatism\\code':
    os.chdir('F:\\github\\narrative_conservatism\\code')
    
############### Set pandas column printing constraint
pd.set_option('display.max_columns', None)

In [2]:
########################################################################################
############ Merge CRSP daily data with COMPUSTAT quarterly data #######################
########################################################################################

########### Read CRSP_daily raw data files
crsp_cols = ['date', 'PERMCO', 'CUSIP', 'RET', 'vwretd']
crsp = pd.read_csv(r'c:\users\fengzhi\desktop\crsp_daily.csv', usecols = crsp_cols, dtype={'CUSIP': str, 'RET': str})
crsp.columns = ['date', 'permco', 'cusip', 'ret', 'vwretd']

### Prepare merge: create date_key
crsp['date_key'] = crsp['date'].astype(str)

### Drop CRSP raw rows that contains non-numeric returns ('B' and 'C'), or with missing ret
print('number of daily obs. in CRSP: ' + str(crsp.shape[0]))
crsp = crsp[(crsp['ret'] != 'B') & (crsp['ret'] != 'C')]
crsp = crsp[crsp['ret'].notnull()]
crsp = crsp[crsp['vwretd'].notnull()] # none missing vwretd
print('number of daily data in CRSP that contains only numeric returns: ' + str(crsp.shape[0]))

### Mutate adjusted daily returns RET and delete ret and vwretd
crsp['RET'] = crsp['ret'].astype(float) - crsp['vwretd'].astype(float)
crsp = crsp.drop(columns=['ret', 'vwretd'])

### Mutate change in daily returns (comparing to last date with available ret)
crsp['lag_RET'] = crsp.groupby(['permco'])['RET'].shift(1)
crsp['DRET'] = crsp['RET'] - crsp['lag_RET']

crsp

number of daily obs. in CRSP: 51027516
number of daily data in CRSP that contains only numeric returns: 50284832


Unnamed: 0,date,permco,cusip,date_key,RET,lag_RET,DRET
0,19930104,7953,36720410,19930104,0.038326,,
1,19930105,7953,36720410,19930105,-0.015765,0.038326,-0.054091
2,19930106,7953,36720410,19930106,-0.001458,-0.015765,0.014307
3,19930107,7953,36720410,19930107,0.024909,-0.001458,0.026367
4,19930108,7953,36720410,19930108,0.003884,0.024909,-0.021025
...,...,...,...,...,...,...,...
51027511,20191224,53453,88160R10,20191224,0.014005,0.032654,-0.018649
51027512,20191226,53453,88160R10,20191226,0.008762,0.014005,-0.005243
51027513,20191227,53453,88160R10,20191227,-0.000656,0.008762,-0.009418
51027514,20191230,53453,88160R10,20191230,-0.031325,-0.000656,-0.030669


In [3]:
########### Read COMPUSTAT raw data files
comp_cols = ['gvkey', 'datadate', 'fyearq', 'fqtr', 'fyr', 'cusip', 'conm', 'actq', \
             'atq', 'ceqq', 'cheq', 'cshoq', 'dlcq', 'dlttq', 'dpq', 'ibq', 'intanq', 'lctq', 'ppegtq', 'rectq', \
             'revtq', 'txditcq', 'xsgaq', 'iby', 'oancfy', 'xidocy', 'exchg', 'cik', 'costat', 'prccq', 'addzip', 'incorp', 'sic', 'ipodate']
comp = pd.read_csv('..\\filings\\compustat.csv', usecols = comp_cols)

### Reorder compustat column
# 1st line: merge keys
# 2nd line: extra id info
# 3rd line: financial data
# 4th line: financial data (CONT.)
comp = comp[['cusip', 'cik', 'datadate', \
'gvkey', 'conm', 'sic', 'incorp', 'addzip', 'fyearq', 'fqtr', 'fyr', 'ipodate', 'costat', 'exchg', \
'actq', 'atq', 'ceqq', 'cheq', 'cshoq', 'dlcq', 'dlttq', 'dpq', 'ibq', 'intanq', 'lctq', 'revtq', 'txditcq', 'xsgaq', 'oancfy', 'prccq', \
'iby', 'xidocy', 'rectq', 'ppegtq']]

print('number of quarterly filings in Compustat: ' + str(comp.shape[0]))

### Creat lagged variables in compustat raw data
comp['lag_prccq'] = comp.groupby(['gvkey'])['prccq'].shift(1)
comp['lag_cshoq'] = comp.groupby(['gvkey'])['cshoq'].shift(1)
comp['lag_ceqq'] = comp.groupby(['gvkey'])['ceqq'].shift(1)
comp['lag_dlcq'] = comp.groupby(['gvkey'])['dlcq'].shift(1)
comp['lag_dlttq'] = comp.groupby(['gvkey'])['dlttq'].shift(1)
comp['lag_atq'] = comp.groupby(['gvkey'])['atq'].shift(1)
comp['lag_ibq'] = comp.groupby(['gvkey'])['ibq'].shift(1)
comp['lag_revtq'] = comp.groupby(['gvkey'])['revtq'].shift(1)
comp['lag_rectq'] = comp.groupby(['gvkey'])['rectq'].shift(1)
comp['lag_oancfy'] = comp.groupby(['gvkey'])['oancfy'].shift(1)
comp['lag_xidocy'] = comp.groupby(['gvkey'])['xidocy'].shift(1)

#####################################################################
####################### Create ABTONE variables for Huang et al. 2014
#####################################################################

### EARN: earnings before extraordinary items (Compustat data item ibq) scaled by lagged total assets (Compustat data item atq)
comp['EARN'] = comp['ibq']/comp['lag_atq']
### LOSS, an indicator variable set to 1 when EARN is negative, and is 0 otherwise
comp['LOSS'] = 0 
comp.loc[comp['EARN'] < 0, 'LOSS'] = 1
### DEARN: change in earnings before extraordinary item scaled by beginning total assets (Compustat data item atq)
comp['DEARN'] = (comp['ibq'] - comp['lag_ibq'])/comp['lag_atq']
### STD_EARN: standard deviation of EARN calculated over the last five quarters
comp['STD_EARN'] = comp['EARN'].rolling(5).std()
### CFO: quarterly operating cash flows (Compustat data item oancfy) scaled by beginning total assets (Compustat data item atq);
comp['CFO'] = (comp['oancfy'] - comp['lag_oancfy'])/comp['lag_atq']
### TACC: total accruals, defined as quarterly income before extraordinary items (Compustat data item ibq) minus \
### the difference between quarterly operating cash flows (Compustat data item oancfy) and \
### quarterly extraordinary items and discontinued operations included in CFO (Compustat data item xidocy);
comp['TACC'] = comp['ibq'] - ((comp['oancfy']-comp['lag_oancfy']) - (comp['xidocy'] - comp['lag_xidocy']))
### TA: total assets, scaled by lagged total assets (Compustat data item atq);
comp['TA'] = comp['atq']/comp['lag_atq']
comp['LAG_TA'] = comp.groupby(['gvkey'])['TA'].shift(1)
comp['LAG_TA_REV'] = 1/comp['LAG_TA']
### DSALES: quarterly change in revenue (Compustat data item revtq), scaled by lagged total assets (Compustat data item atq);
comp['DSALES'] = (comp['revtq'] - comp['lag_revtq'])/comp['lag_atq']
### DAR: quarterly change in accounts receivable (Compustat data item rectq), scaled by lagged total assets (Compustat data item atq);
comp['DAR'] = (comp['rectq'] - comp['lag_rectq'])/comp['lag_atq']
### DSAR = DSALES - DAR
comp['DSAR'] = comp['DSALES'] - comp['DAR']
### PPE: gross property, plant, and equipment (Compustat data item ppegtq), scaled by lagged total assets (Compustat data item atq);
comp['PPE'] = comp['ppegtq']/comp['lag_atq']

### leap1_EARN, leap2_EARN, leap3_EARN and leap1_CFO, leap2_CFO, leap3_CFO for Huang et al. 2014 TABLE 4 replication
comp['leap1_EARN'] = comp.groupby(['gvkey'])['EARN'].shift(-1)
comp['leap2_EARN'] = comp.groupby(['gvkey'])['EARN'].shift(-2)
comp['leap3_EARN'] = comp.groupby(['gvkey'])['EARN'].shift(-3)

comp['leap1_CFO'] = comp.groupby(['gvkey'])['CFO'].shift(-1)
comp['leap2_CFO'] = comp.groupby(['gvkey'])['CFO'].shift(-2)
comp['leap3_CFO'] = comp.groupby(['gvkey'])['CFO'].shift(-3)

### Create first 2-digits SIC:SIC2
comp['SIC2'] = comp['sic'].astype(str).str[:-2]

#####################################################################
####################################################### Prepare merge
#####################################################################

### Delete the 9th digit of compustat filings' cusip, and filter filings that have 8-digits cusip after deletion 
comp['cusip'] = comp['cusip'].astype(str).str[:-1]
del_cusip = comp.loc[comp['cusip'].str.len() != 8].shape[0]
comp = comp.loc[comp['cusip'].str.len() == 8]

### Prepare merge: create date_key
comp['date_key'] = comp['datadate'].astype(str)

### Prepare merge: drop duplicated cusip-datekey obs.
del_merge = comp[comp.duplicated(subset=['cusip', 'date_key'])].shape[0]
comp = comp[comp.duplicated(subset=['cusip', 'date_key']) == False]
print('number of quarterly filings in Compustat for merge with CRSP: ' + str(comp.shape[0]))

comp

number of quarterly filings in Compustat: 1142966
number of quarterly filings in Compustat for merge with CRSP: 1140302


Unnamed: 0,cusip,cik,datadate,gvkey,conm,sic,incorp,addzip,fyearq,fqtr,fyr,ipodate,costat,exchg,actq,atq,ceqq,cheq,cshoq,dlcq,dlttq,dpq,ibq,intanq,lctq,revtq,txditcq,xsgaq,oancfy,prccq,iby,xidocy,rectq,ppegtq,lag_prccq,lag_cshoq,lag_ceqq,lag_dlcq,lag_dlttq,lag_atq,lag_ibq,lag_revtq,lag_rectq,lag_oancfy,lag_xidocy,EARN,LOSS,DEARN,STD_EARN,CFO,TACC,TA,LAG_TA,LAG_TA_REV,DSALES,DAR,DSAR,PPE,leap1_EARN,leap2_EARN,leap3_EARN,leap1_CFO,leap2_CFO,leap3_CFO,SIC2,date_key
0,00036110,1750.0,19910228,1004,AAR CORP,5080,DE,60191,1990,3.0,5,19880101.0,A,11.0,271.238,383.354,191.761,5.019,15.891,11.436,69.020,2.937,3.977,,86.573,117.820,36.000,14.768,40.454,12.875,,0.0,87.347,,,,,,,,,,,,,,0,,,,,,,,,,,,0.010437,0.009064,0.007132,-0.009294,-0.098529,-0.012275,50,19910228
1,00036110,1750.0,19910531,1004,AAR CORP,5080,DE,60191,1990,4.0,5,19880101.0,A,11.0,268.399,379.958,193.778,1.553,15.891,16.500,68.953,3.092,4.001,,79.227,116.822,38.000,16.280,36.891,14.125,,0.0,91.238,113.060,12.875,15.891,191.761,11.436,69.020,383.354,3.977,117.820,87.347,40.454,0.0,0.010437,0,0.000063,,-0.009294,7.564,0.991141,,,-0.002603,0.010150,-0.012753,0.294923,0.009064,0.007132,0.007364,-0.098529,-0.012275,-0.002260,50,19910531
2,00036110,1750.0,19910831,1004,AAR CORP,5080,DE,60191,1991,1.0,5,19880101.0,A,11.0,267.629,376.618,195.076,2.937,15.891,21.430,68.322,3.107,3.444,,75.220,107.339,38.000,13.804,-0.546,16.250,3.444,0.0,79.519,,14.125,15.891,193.778,16.500,68.953,379.958,4.001,116.822,91.238,36.891,0.0,0.009064,0,-0.001466,,-0.098529,40.881,0.991210,0.991141,1.008938,-0.024958,-0.030843,0.005885,,0.007132,0.007364,0.002544,-0.012275,-0.002260,0.037451,50,19910831
3,00036110,1750.0,19911130,1004,AAR CORP,5080,DE,60191,1991,2.0,5,19880101.0,A,11.0,283.514,391.089,195.379,3.796,15.897,30.405,67.631,2.643,2.686,,90.079,101.948,38.000,14.256,-5.169,12.375,,0.0,86.196,,16.250,15.891,195.076,21.430,68.322,376.618,3.444,107.339,79.519,-0.546,0.0,0.007132,0,-0.002013,,-0.012275,7.309,1.038424,0.991210,1.008868,-0.014314,0.017729,-0.032043,,0.007364,0.002544,0.007849,-0.002260,0.037451,-0.024067,50,19911130
4,00036110,1750.0,19920229,1004,AAR CORP,5080,DE,60191,1991,3.0,5,19880101.0,A,11.0,287.538,396.971,197.076,2.610,15.897,35.977,67.390,3.095,2.880,,94.505,104.920,38.000,14.026,-6.053,15.500,,0.0,92.847,,12.375,15.897,195.379,30.405,67.631,391.089,2.686,101.948,86.196,-5.169,0.0,0.007364,0,0.000496,,-0.002260,3.764,1.015040,1.038424,0.962998,0.007599,0.017006,-0.009407,,0.002544,0.007849,0.003948,0.037451,-0.024067,0.002457,50,19920229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1142961,4525EP10,1280776.0,20191231,331856,IMMUNIC INC,2836,DE,92127,2019,4.0,12,,A,14.0,32.230,65.955,58.363,29.369,10.745,,0.520,0.012,-7.691,32.970,7.072,0.000,0.000,7.274,-28.545,9.700,-34.933,0.0,0.758,,10.000,10.071,60.527,,,67.604,-8.215,0.000,0.000,-22.280,0.0,-0.113765,1,0.007751,,-0.092672,-1.426,0.975608,0.931518,1.073516,0.000000,0.011212,-0.011212,,,,,,,,28,20191231
1142962,04216R10,921114.0,20190331,332115,ARMATA PHARMACEUTICALS INC,2836,WA,90292,2019,1.0,12,,A,12.0,,,,,,,,0.348,-3.739,,,0.000,,2.493,,,-3.739,,,,,,,,,,,,,,,,0,,,,,,,,,,,,,-0.199192,-0.156497,,-0.130800,-0.093127,28,20190331
1142963,04216R10,921114.0,20190630,332115,ARMATA PHARMACEUTICALS INC,2836,WA,90292,2019,2.0,12,,A,12.0,14.202,34.916,22.630,13.192,9.959,1.264,2.413,0.335,-4.199,13.746,5.634,0.000,3.077,4.323,-8.286,4.010,-7.938,0.0,0.000,16.391,,,,,,,-3.739,0.000,,,,,0,,,,,,,,,,,,-0.199192,-0.156497,,-0.130800,-0.093127,,28,20190630
1142964,04216R10,921114.0,20190930,332115,ARMATA PHARMACEUTICALS INC,2836,WA,90292,2019,3.0,12,,A,12.0,9.934,29.304,18.132,8.690,9.934,1.251,1.900,0.366,-6.955,13.746,4.965,0.000,3.077,6.411,-12.853,3.700,-14.893,0.0,0.000,13.878,4.010,9.959,22.630,1.264,2.413,34.916,-4.199,0.000,0.000,-8.286,0.0,-0.199192,1,-0.078932,,-0.130800,-2.388,0.839271,,,0.000000,0.000000,0.000000,0.397468,-0.156497,,,-0.093127,,,28,20190930


In [4]:
# ################ Create COMP_DA to calculate DA and then merge with COMP (in order to preserve obs.)
# ### Drop obs. with non-positive assets in compustat raw data
# comp_DA = comp.loc[(comp['LAG_TA_REV'] != np.inf) & (comp['LAG_TA_REV'].isnull() == False) & (comp['LAG_TA_REV'] != -np.inf)]
# comp_DA = comp_DA.loc[(comp_DA['DSAR'] != np.inf) & (comp_DA['DSAR'].isnull() == False) & (comp_DA['DSAR'] != -np.inf)]
# comp_DA = comp_DA.loc[(comp_DA['PPE'] != np.inf) & (comp_DA['PPE'].isnull() == False) & (comp_DA['PPE'] != -np.inf)]
# print('number of quarterly filings in Compustat after deleting missing or infinite LAG_TA_REV, DSAR and PPE: ' + str(comp_DA.shape[0]))

# ############## Winsorize ##############################
# ###### Define a function that winsorize a variable at 1% and 99% 
# def winsorize (df, colnames):
#     for col in colnames:
#         varq01 = df[col].quantile(.01)
#         varq99 = df[col].quantile(.99)
#         df[col] = df[col].clip(varq01, varq99)
#     return df

# ############## Winsorize DA variables
# comp_DA = winsorize(comp_DA, ['TACC', 'LAG_TA_REV', 'DSAR', 'PPE'])

# ######## Create DA: residual from TACC regression by each two-digit SIC-year
# def TACC_reg(data):
#     try:
#         if data.shape[0] >= 20:
#             y, X = ps.dmatrices('TACC ~ 1 + LAG_TA_REV + DSAR + PPE', data = data, return_type = 'dataframe')
#             model = sm.OLS(y, X)
#             res = model.fit()
#             data['DA'] = res.resid
#             return data
#     except:
#         pass

# comp_DA = comp_DA.groupby(['SIC2','fyearq']).apply(TACC_reg)
# # comp_DA['DA'].describe()

# ####### Drop duplicate gvkey-datadate in comp_DA
# comp_DA = comp_DA[comp_DA.duplicated(subset=['gvkey', 'datadate']) == False]

# ####### Join DA to COMP
# comp = pd.merge(comp, comp_DA[['gvkey', 'datadate', 'DA']], on = ['gvkey', 'datadate'], how='left', validate = 'm:1')

# comp

In [5]:
##################### Left merge CRSP and COMPUSTAT, key unique in both datasets
crsp_comp = pd.merge(crsp, comp[['cusip','date_key','cik', 'conm']], on = ['cusip', 'date_key'], how='left', validate = '1:1')
# crsp_comp

###### after merging, count number of firms (identified by cusip) that has unique cik
print('total number of firms in CRSP: ' + str((crsp_comp.groupby(['cusip'])['cik'].nunique()).shape[0]))
print('number of firms without cik: ' + str((crsp_comp.groupby(['cusip'])['cik'].nunique() == 0).sum()))
print('number of firms with unique cik: ' + str((crsp_comp.groupby(['cusip'])['cik'].nunique() == 1).sum()))

###### fill cik with first and last observed non-null value by cusip group ########### Takes some time!!
def bfill (df, groupby, colnames):
    for col in colnames:
        df[col] = df.groupby(groupby)[col].fillna(method='bfill')
    return df

print('number of obs. with missing cik before fill: ' + str(crsp_comp['cik'].isnull().sum()))
crsp_comp['cik'] = crsp_comp.groupby(['cusip'])['cik'].fillna(method='ffill')
crsp_comp['cik'] = crsp_comp.groupby(['cusip'])['cik'].fillna(method='bfill')
print('number of obs. with missing cik after fill: ' + str(crsp_comp['cik'].isnull().sum()))
# print('number of firms with n cik: ' + str((crsp_comp.groupby(['cusip'])['cik'].nunique() == n).sum()))

###### drop obs. without cik
crsp_comp = crsp_comp[crsp_comp['cik'].notnull()]

###### backward fill financial variables with last observed non-null value by cik group (now equal to cusip group)
fillcols = ['conm']
groupcols = ['cik']
crsp_comp = bfill(crsp_comp, groupcols, fillcols)

print(crsp_comp.isnull().sum())

############## Save merged CRSP_COMP dataframe into local file crsp_comp_10-Q.csv (4GB: too big for Github, therefore stored in desktop)
crsp_comp.to_csv(r'c:\users\fengzhi\desktop\crsp_comp_' + obj_type + '.csv', index = 0)

crsp_comp

total number of firms in CRSP: 23423
number of firms without cik: 6698
number of firms with unique cik: 16725
number of obs. with missing cik before fill: 49842200
number of obs. with missing cik after fill: 9774801
date              0
permco            0
cusip             0
date_key          0
RET               0
lag_RET       16504
DRET          16504
cik               0
conm        1316709
dtype: int64


Unnamed: 0,date,permco,cusip,date_key,RET,lag_RET,DRET,cik,conm
0,19930104,7953,36720410,19930104,0.038326,,,43350.0,GAS NATURAL INC
1,19930105,7953,36720410,19930105,-0.015765,0.038326,-0.054091,43350.0,GAS NATURAL INC
2,19930106,7953,36720410,19930106,-0.001458,-0.015765,0.014307,43350.0,GAS NATURAL INC
3,19930107,7953,36720410,19930107,0.024909,-0.001458,0.026367,43350.0,GAS NATURAL INC
4,19930108,7953,36720410,19930108,0.003884,0.024909,-0.021025,43350.0,GAS NATURAL INC
...,...,...,...,...,...,...,...,...,...
50284827,20191224,53453,88160R10,20191224,0.014005,0.032654,-0.018649,1318605.0,TESLA INC
50284828,20191226,53453,88160R10,20191226,0.008762,0.014005,-0.005243,1318605.0,TESLA INC
50284829,20191227,53453,88160R10,20191227,-0.000656,0.008762,-0.009418,1318605.0,TESLA INC
50284830,20191230,53453,88160R10,20191230,-0.031325,-0.000656,-0.030669,1318605.0,TESLA INC


In [6]:
########################################################################################
############ Concatenate and prepare merge: ID_DATA and TEXT_DATA ######################
########################################################################################

############## Define a function to concatenate all csv files with file name that matches a certain pattern into one data frame
def concatenate (indir, file_name_match):
    os.chdir(indir)
    file_list = glob.glob(file_name_match)
    df_list = list()
    colnames = pd.read_csv(file_list[0], header = None).loc[0]
    
    for filename in file_list:
        # print(filename)
        df = pd.read_csv(filename, low_memory = False)
        df_list.append(df)

    df_concat = pd.concat(df_list, axis = 0)
    df_concat.columns = colnames
    return df_concat

############## Concatenate id_data and text_data files and create two data frames
id_data = concatenate('..\\filings', data_type_id + '_'+ obj_type + '_' + '*.csv')
text_data = concatenate('..\\filings', data_type_text + '_'+ obj_type + '_' + '*.csv')
############## Save id_data dataframe into local file id_data_10-Q.csv
id_data.to_csv('..\\filings\\' + data_type_id + '_'+ obj_type + '.csv', index = 0)

############################################################
#################################### text_data modifications
############################################################

############## Correct modal words labels in text_data
text_data.columns = ['accnum', 'nw', 'nvocab', 'n_neg', 'n_pos', 'n_uctt', 'n_lit', 'n_cstr', \
                     'n_modal_strong', 'n_modal_moderate', 'n_modal_weak', 'n_negation']

############## Save text_data dataframe into local file text_data_10-Q.csv
text_data.to_csv('..\\filings\\' + data_type_text + '_'+ obj_type + '.csv', index = 0)

print('Number of ' + obj_type + ' in edgar from 1993 Q1 to 2020 Q1: ' + str(id_data.shape[0]))
print('Number of ' + obj_type + ' parsed and downloaded: ' + str(text_data.shape[0]))

Number of 8-K in edgar from 1993 Q1 to 2020 Q1: 1628467
Number of 8-K parsed and downloaded: 1578861


In [7]:
########################################################################################
######################## Merge ID_DATA with TEXT_DATA into EDGAR #######################
########################################################################################

id_data = concatenate('..\\filings', data_type_id + '_'+ obj_type + '_' + '*.csv')

############## Prepare merge: ID_DATA
id_data['fd'] = id_data['fd'].str.replace('-', '')
id_data['rp'] = id_data['rp'].str.replace('-', '')
id_data['date_key'] = id_data['rp'].astype(str)

############## Drop unnecessary columns and reorder id_data
id_data = id_data.drop(columns=['file_type', 'sic', 'fye', 'bazip', 'state'])
id_data = id_data[['cik', 'rp', 'fd', 'accnum', 'item8k', 'name', 'date_key']].sort_values(by = ['cik', 'rp'])

############## Drop 8-Ks that has duplicated accnum and cik-rp
del_accnum = id_data[id_data.duplicated('accnum')].shape[0]
id_data = id_data[id_data.duplicated('accnum') == False]
del_cik_rp = id_data[id_data.duplicated(subset=['cik', 'rp','item8k'])].shape[0]
id_data = id_data[id_data.duplicated(subset=['cik', 'rp','item8k']) == False]
print('Number of ' + obj_type + ' with duplicated accnum and cik-rp-item8k: ' + str(del_accnum + del_cik_rp))

############## Drop 8-Ks without items
del_item8k = id_data[id_data['item8k'].isnull()].shape[0]
id_data = id_data[id_data['item8k'].notnull()]
print('Number of ' + obj_type + ' with missing items8k: ' + str(del_item8k))

############## Drop 8-Ks with reporting period larger than filing date
del_rp_fd = id_data[id_data['rp'].astype(int) > id_data['fd'].astype(int)].shape[0]
id_data = id_data[id_data['rp'].astype(int) <= id_data['fd'].astype(int)]
print('Number of ' + obj_type + ' with reporting period larger than filing date: ' + str(del_rp_fd))

print('Number of remaining ' + obj_type + ' with after screening: ' + str(id_data.shape[0]))

############## Count number of 8k per cik-day
id_data = id_data.join(id_data.groupby(['cik', 'rp']).size().rename('n8k'), on=['cik', 'rp'])

Number of 8-K with duplicated accnum and cik-rp: 83967
Number of 8-K with missing items8k: 542
Number of 8-K with reporting period larger than filing date: 2931
Number of remaining 8-K with after screening: 1541027


In [10]:
##############################################################################
########################## disaggregate item 8-K into separate count variables
##############################################################################

id_data_before2004 = id_data[id_data['fd'].astype(int) < 20040823]
id_data_after2004 = id_data[id_data['fd'].astype(int) >= 20040823]

def count_item_occurances (df, item):
    for i in item:
        df.loc[:,'item_' + i] = df.groupby(['cik', 'rp'])['item8k'].transform(lambda x: x[x.str.contains(i)].count())
    return df

item_before2004 = ['1','2','3','4','5','6','7','8','9','10','11','12']
item_after2004 = ['1.01','1.02','1.03','1.04', \
                  '2.01','2.02','2.03','2.04','2.05','2.06', \
                  '3.01','3.02','3.03', \
                  '4.01','4.02',
                  '5.01','5.02','5.03','5.04','5.05','5.06','5.07','5.08', \
                  '6.01','6.02','6.03','6.04','6.05', \
                  '7.01', \
                  '8.01', \
                  '9.01']

##############################################################################
###################################################### Roughly takes 7 hours!!
##############################################################################
id_data_before2004 = count_item_occurances(id_data_before2004, item_before2004)

for i in item_after2004:
    id_data_before2004.loc[:,'item_' + i] = 0

for i in item_before2004:
    id_data_after2004.loc[:,'item_' + i] = 0
    
id_data_after2004 = count_item_occurances(id_data_after2004, item_after2004)

############################ Concatenate id_data_before2004 and id_data_after2004 and save it to local id_data_8-K_item_count.csv file
id_data = pd.concat([id_data_before2004, id_data_after2004]).sort_values(by = ['cik', 'rp'])

# id_data.to_csv('..\\filings\\' + data_type_id + '_'+ obj_type + '_item_count.csv', index = 0)

########## Merge id_data and text_data by accnum
edgar = pd.merge(id_data, text_data, on = ['accnum'], how = 'inner', validate = '1:1')

########## Sum up text count by cik-rp
text_count_vars = ['nw', 'nvocab', 'n_neg', 'n_pos', 'n_uctt', 'n_lit', 'n_cstr', \
              'n_modal_strong', 'n_modal_moderate', 'n_modal_weak', 'n_negation']
edgar_daily = edgar.groupby(['cik','rp'])[text_count_vars].sum()
edgar_daily.columns = text_count_vars
edgar = edgar.drop(columns = text_count_vars)
edgar = edgar.join(edgar_daily, on=['cik', 'rp'])

############## Calculate tone : tone = (n_pos - n_negation - n_neg)/nw
edgar['tone'] = (edgar['n_pos'] - edgar['n_negation'] - edgar['n_neg'])/edgar['nw']

########## Drop duplicate cik-rp
edgar = edgar[edgar.duplicated(subset=['cik', 'rp']) == False]

edgar.to_csv('..\\filings\\edgar_'+ obj_type + '.csv', index = 0)

In [28]:
########## Load edgar_8-K dataset
edgar_cols = ['cik', 'rp', 'fd', 'item8k', 'name', 'date_key', 'n8k', 'nw', 'nvocab', 'n_neg', 'n_pos', 'n_negation', 'tone']
edgar = pd.read_csv('..\\filings\\edgar_8-K.csv', usecols = edgar_cols)

########## Inspect edgar_8-K dataset
edgar

Unnamed: 0,cik,rp,fd,item8k,name,date_key,n8k,nw,nvocab,n_neg,n_pos,n_negation,tone
0,20,19950623,19950710,"2, 5, 7",K TRON INTERNATIONAL INC,19950623,1,13525,2257,252,56,2,-0.014640
1,20,20011016,20011017,"5, 7",K TRON INTERNATIONAL INC,20011016,1,1687,467,4,4,0,0.000000
2,20,20020622,20020626,"4, 7",K TRON INTERNATIONAL INC,20020622,1,452,193,3,1,1,-0.006637
3,20,20030102,20030115,"2, 7",K TRON INTERNATIONAL INC,20030102,1,813,335,2,0,0,-0.002460
4,20,20030417,20030421,"7, 9",K TRON INTERNATIONAL INC,20030417,1,201,112,0,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1489621,1799463,20200220,20200220,"8.01, 9.01",Morgan Stanley Capital I Trust 2020-L4,20200220,1,2529,522,9,3,1,-0.002768
1489622,1800294,20200226,20200226,"8.01, 9.01",CITIGROUP COMMERCIAL MORTGAGE TRUST 2020-GC46,20200226,1,1807,498,9,5,0,-0.002214
1489623,1801338,20200228,20200228,"8.01, 9.01",Benchmark 2020-IG1 Mortgage Trust,20200228,1,1332,395,0,4,1,0.002252
1489624,1801661,20200305,20200310,"1.01, 3.02, 5.02, 5.03, 8.01, 9.01",Flying Eagle Acquisition Corp.,20200305,1,1465,419,8,4,0,-0.002730


In [13]:
########################################################################################
######################## Merge EDGAR with CRSP_COMPUSTAT ###############################
########################################################################################

############## Left merge ID_DATA and CRSP_COMP, key not unique in both data sets
crsp_comp_edgar = pd.merge(crsp_comp, edgar, on = ['cik', 'date_key'], how = 'outer', validate = '1:1')

# ### Delete duplicated columns in ID_DATA and CRSP_COMP, duplicated rows in accnum (edgar) and [cik * date] (compustat), and rename sic, date_key, date and datadate
# id_crsp_comp = id_crsp_comp[id_crsp_comp.duplicated('accnum') == False]
# id_crsp_comp = id_crsp_comp[id_crsp_comp.duplicated(subset=['cik', 'rp']) == False]
# id_crsp_comp = id_crsp_comp.rename(columns={'sic_y': 'SIC', 'date_key': 'cquarter', 'date': 'date_crsp', 'datadate': 'date_comp'})

# ### Reorder id_crsp_comp columns
# # 1st line: merge keys
# # 2nd line: extra id info
# # 3rd line: financial raw data (not lagged variables)
# # 4th line: financial raw data (lagged variables)
# # 5th line: ready-to-use regression variables
# id_crsp_comp = id_crsp_comp[['cusip', 'cik', 'rp', 'accnum', \
# 'name', 'gvkey', 'SIC', 'fd', 'date_crsp', 'date_comp', 'cquarter', 'fyearq', 'fqtr', 'incorp', 'state', 'addzip', 'costat', 'age', \
# 'actq', 'cheq', 'dpq', 'ibq', 'intanq', 'lctq', 'revtq', 'txditcq', 'xsgaq', \
# 'atq', 'lag_atq', 'ceqq', 'lag_ceqq', 'cshoq', 'lag_cshoq', 'dlcq', 'lag_dlcq', 'dlttq', 'lag_dlttq', 'prccq', 'lag_prccq', 'ibq', 'lag_ibq', \
# 'RET', 'STD_RET', 'EARN', 'LOSS', 'DEARN', 'STD_EARN', 'CFO', 'leap1_EARN', 'leap2_EARN', 'leap3_EARN', 'leap1_CFO', 'leap2_CFO', 'leap3_CFO', 'DA']]

MergeError: Merge keys are not unique in left dataset; not a one-to-one merge

In [12]:
################## Inspect id_crsp_comp 
print('number of observations after merging with edgar data: ' + str(id_crsp_comp.shape[0]))
id_crsp_comp

number of observations after merging with edgar data: 303034


Unnamed: 0,cusip,cik,rp,accnum,name,gvkey,SIC,fd,date_crsp,date_comp,cquarter,fyearq,fqtr,incorp,state,addzip,costat,age,actq,cheq,dpq,ibq,intanq,lctq,revtq,txditcq,xsgaq,atq,lag_atq,ceqq,lag_ceqq,cshoq,lag_cshoq,dlcq,lag_dlcq,dlttq,lag_dlttq,prccq,lag_prccq,ibq.1,lag_ibq,RET,STD_RET,EARN,LOSS,DEARN,STD_EARN,CFO,leap1_EARN,leap2_EARN,leap3_EARN,leap1_CFO,leap2_CFO,leap3_CFO,DA
0,54626810,60512,19930630,0000060512-94-000005,LOUISIANA LAND & EXPLORATION CO,6819.0,1311.0,19930813,1993-06-30,19930630,199306,1993.0,2.0,MD,MD,70112,I,11324,193.800,64.300,27.300,5.600,,208.200,189.300,136.100,23.900,1278.000,1203.400,424.200,421.300,28.729,28.647,86.500,0.000,356.300,379.500,42.5000,45.25,5.600,2.700,-0.059108,0.054464,0.004653,0.0,0.002410,0.002744,0.031660,-0.001408,0.003730,0.003372,0.037089,0.041509,-0.075597,-31.417469
1,88579Y10,66740,19930630,0000066740-94-000015,MINNESOTA MINING & MANUFACTURING CO,7435.0,2670.0,19930813,1993-06-30,19930630,199306,1993.0,2.0,DE,DE,55144,A,17348,6382.000,650.000,264.000,331.000,,3365.000,3540.000,0.000,893.000,12145.000,12037.000,6590.000,6710.000,216.975,218.753,829.000,657.000,679.000,635.000,108.0000,109.50,331.000,330.000,-0.014779,0.036793,0.027499,0.0,0.000083,0.001605,0.038797,0.026019,0.023387,0.025088,0.050391,0.040150,-0.135689,-140.858059
2,08750910,11860,19930930,0000011860-94-000005,BETHLEHEM STEEL CORP /DE/,2189.0,3312.0,19931112,1993-09-30,19930930,199309,1993.0,3.0,DE,DE,18016-7699,I,24715,1074.900,232.500,69.100,30.700,,908.400,1055.300,0.000,38.400,5168.098,5463.199,580.800,958.900,91.307,91.025,100.300,88.400,728.500,650.200,14.3750,18.75,30.700,-13.600,-0.270146,0.119124,0.005619,0.0,0.008109,0.007031,0.007871,-0.046942,0.002195,0.004469,0.025212,-0.013205,0.004332,-6.732774
3,54626810,60512,19930930,0000060512-94-000007,LOUISIANA LAND & EXPLORATION CO,6819.0,1311.0,19931110,1993-09-30,19930930,199309,1993.0,3.0,MD,MD,70112,I,11416,180.200,9.400,28.200,-1.800,,195.100,187.900,124.800,23.700,1662.300,1278.000,417.100,424.200,33.137,28.729,0.000,86.500,747.700,356.300,44.6250,42.50,-1.800,5.600,0.025969,0.094378,-0.001408,1.0,-0.005790,0.004136,0.037089,0.003730,0.003372,0.000335,0.041509,-0.075597,0.026837,-48.974745
4,88579Y10,66740,19930930,0000066740-94-000016,MINNESOTA MINING & MANUFACTURING CO,7435.0,2670.0,19931112,1993-09-30,19930930,199309,1993.0,3.0,DE,DE,55144,A,17440,6445.000,665.000,262.000,316.000,,3404.000,3481.000,0.000,859.000,12229.000,12145.000,6600.000,6590.000,215.791,216.975,796.000,829.000,682.000,679.000,102.8750,108.00,316.000,331.000,-0.076728,0.030508,0.026019,0.0,-0.001235,0.001592,0.050391,0.023387,0.025088,0.026336,0.040150,-0.135689,0.031711,-302.706484
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306668,87182910,96021,20191228,0000096021-20-000017,SYSCO CORP,10247.0,5140.0,20200205,2019-12-31,20191231,201912,2020.0,2.0,DE,DE,77077,A,17807,8661.610,539.625,185.011,383.410,4879.128,6931.968,15025.042,142.301,2218.801,19372.034,18956.575,2527.526,2454.748,508.843,510.864,897.619,160.338,8654.524,9183.272,85.5400,79.40,383.410,453.781,-0.001390,0.025118,0.020226,0.0,-0.003712,0.005675,0.030749,,,,,,,
306669,87840910,96699,20191228,0001171843-20-000876,TECHNICAL COMMUNICATIONS CORP,10364.0,3663.0,20200211,2019-12-31,20191231,201912,2020.0,1.0,MA,MA,01742-2892,A,13271,2.146,0.748,0.006,-0.480,0.000,0.574,0.666,0.000,0.789,2.847,2.917,1.752,2.219,1.850,1.850,0.148,0.000,0.521,0.000,5.0500,2.50,-0.480,1.032,1.266332,0.797053,-0.164553,1.0,-0.518341,0.266651,-0.162153,,,,,,,-94.810686
306670,86737U10,96793,20191231,0001564590-20-004619,SUNLINK HEALTH SYSTEMS INC,10380.0,5912.0,20200213,2019-12-31,20191231,201912,2020.0,2.0,OH,OH,30339,A,17197,14.384,4.607,0.350,0.203,1.295,6.440,12.805,0.000,0.994,22.823,25.811,14.762,14.810,6.983,6.987,0.782,3.270,0.851,0.937,1.0700,1.13,0.203,-0.143,-0.135859,0.028797,0.007865,0.0,0.013405,0.021058,-0.013483,,,,,,,
306671,87288520,98338,20191130,0001213900-20-000888,TSR INC,10305.0,7371.0,20200113,2019-11-29,20191130,201911,2019.0,2.0,DE,DE,11788,A,15218,10.365,2.988,0.002,0.061,0.000,5.241,15.233,0.000,2.634,11.826,12.109,6.287,6.225,1.962,1.962,0.252,0.281,0.271,0.327,3.2001,4.29,0.061,-0.663,-0.315755,0.142523,0.005038,0.0,0.059790,0.027598,-0.008671,-0.079909,,,-0.032133,,,6.976358


In [13]:
########################################################################################
###################### Merge TEXT_DATA with ID_CRSP_COMPUSTAT ##########################
########################################################################################

############## Left merge ID_CRSP_COMP and TEXT_DATA, key unique in both data sets
id_crsp_comp_text = pd.merge(id_crsp_comp, text_data, on = ['accnum'], how = 'inner', validate = '1:1')
print('number of firm-quarters of merged id_crsp_comp_text: ' + str(id_crsp_comp_text.shape[0]))

number of firm-quarters of merged id_crsp_comp_text: 303034


In [14]:
############### Inspect if firm-quarter key is unique : YES, key is unique
# print(id_crsp_comp_text[id_crsp_comp_text.duplicated('accnum')])
# print(id_crsp_comp_text[id_crsp_comp_text.duplicated(subset=['cik', 'rp'])])

In [16]:
####################### Modify data type in ID_CRSP_COMP_TEXT
########### Define a function that changes pandas series data type to string
def columns_to_str (df, colnames):
    for col in colnames:
        df[col] = df[col].astype(str)
    return df

########### Apply columns_to_str to various identification variables
id_crsp_comp_text = columns_to_str(id_crsp_comp_text, ['cik', 'gvkey', 'fyearq', 'fqtr'])

########## Convert date variables to date format
id_crsp_comp_text['fd'] = pd.to_datetime(id_crsp_comp_text['fd'], format='%Y%m%d')
id_crsp_comp_text['rp'] = pd.to_datetime(id_crsp_comp_text['rp'], format='%Y%m%d')
id_crsp_comp_text['date_comp'] = pd.to_datetime(id_crsp_comp_text['date_comp'], format='%Y%m%d')
# id_crsp_comp_text['ipodate'] = pd.to_datetime(id_crsp_comp_text['ipodate'], format='%Y%m%d')

########## Convert SIC variables to integer
id_crsp_comp_text['SIC'] = id_crsp_comp_text['SIC'].astype(int)

########### Inspect column data types
# print(id_crsp_comp_text.dtypes)

In [17]:
########################################################################################
############################### Variable Creation ######################################
########################################################################################

############################## Main Variables ##########################################
######## NW: natural log of 1 + total number of words in the document
id_crsp_comp_text['NW'] = np.log(1 + id_crsp_comp_text['nw'])

######## TONE: number of net positive words (n_pos - n_neg - n_negations) per 1000 total words
id_crsp_comp_text['TONE'] = id_crsp_comp_text['tone']*1000

######## TLAG: Time lag between the news release date (CRSP date) and document filing date (EDGAR filing date)
id_crsp_comp_text['TLAG'] = (id_crsp_comp_text['fd'] - id_crsp_comp_text['date_crsp']).dt.days

######## NEG: An indicator variable takes the value of 1 when market-adjusted stock return (RET) is negative and is 0 otherwise
id_crsp_comp_text['NEG'] = 0 
id_crsp_comp_text.loc[id_crsp_comp_text['RET'] < 0, 'NEG'] = 1

############################## Control Variables #######################################
######## Size: Firm size, defined as the natural logarithm of market value of equity [at the beginning of the quarter] \
######## defined as [beginning-of-quarter] common share price (Compustat data item prccq) \
######## times [beginning-of-quarter] common shares outstanding (Compustat data item cshoq)
id_crsp_comp_text['SIZE'] = np.log(id_crsp_comp_text['lag_prccq']*id_crsp_comp_text['lag_cshoq'])

######## MTB: Market-to-book ratio, defined as [beginning-of-quarter] market value of equity \
######## defined as common share price (Compustat data item prccq) times common shares outstanding (Compustat data item cshoq) \
######## divided by [beginning-of-quarter] book value of equity (Compustat data item ceqq) 
id_crsp_comp_text['MTB'] = (id_crsp_comp_text['lag_prccq']*id_crsp_comp_text['lag_cshoq'])/id_crsp_comp_text['lag_ceqq']

######## LEV: Leverage, defined as [beginning-of-quarter] short term debt plus [beginning-of-quarter] long term debt \
######## (Compustat data item dlcq + Compustat data item dlttq) scaled by [beginning-of-quarter] total assets (Compustat data item atq)
id_crsp_comp_text['LEV'] = (id_crsp_comp_text['lag_dlcq'] + id_crsp_comp_text['lag_dlttq'])/id_crsp_comp_text['lag_atq']

######## AGE: log(1 + age from the first year the firm entered the CRSP dataset)
id_crsp_comp_text['AGE'] = np.log(1 + id_crsp_comp_text['age'])

In [18]:
########################################################################################
############################### Variable Screening #####################################
########################################################################################

############## Drop financial and utility firms (SIC codes between 6000 and 6999 and between 4900 and 4999, respectively)
del_fin = id_crsp_comp_text.loc[(id_crsp_comp_text['SIC'] >= 6000) & (id_crsp_comp_text['SIC'] <= 6999)].shape[0]
id_crsp_comp_text = id_crsp_comp_text.loc[(id_crsp_comp_text['SIC'] < 6000) | (id_crsp_comp_text['SIC'] > 6999)] # financial
del_ut = id_crsp_comp_text.loc[(id_crsp_comp_text['SIC'] >= 4900) & (id_crsp_comp_text['SIC'] <= 4999)].shape[0]
id_crsp_comp_text = id_crsp_comp_text.loc[(id_crsp_comp_text['SIC'] < 4900) | (id_crsp_comp_text['SIC'] > 4999)] # utility
print('number of firm-quarters from utility and financial firms: ' + str(del_fin + del_ut))

############## Drop files (firm-quarter) that have missing SIZE, MTB, LEV, or with non-positive total assets or book value of equity, \
############## or with [beginning-of-quarter] common share price less than $1
del_size = id_crsp_comp_text[id_crsp_comp_text['SIZE'].isnull()].shape[0]
id_crsp_comp_text = id_crsp_comp_text[id_crsp_comp_text['SIZE'].isnull() == False]
del_mtb = id_crsp_comp_text[id_crsp_comp_text['MTB'].isnull()].shape[0]
id_crsp_comp_text = id_crsp_comp_text[id_crsp_comp_text['MTB'].isnull() == False]
del_lev = id_crsp_comp_text[id_crsp_comp_text['LEV'].isnull()].shape[0]
id_crsp_comp_text = id_crsp_comp_text[id_crsp_comp_text['LEV'].isnull() == False]
del_atq = id_crsp_comp_text.loc[(id_crsp_comp_text['atq'] <= 0) | (id_crsp_comp_text['atq'].isnull())].shape[0]
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['atq'] > 0]
del_ceqq = id_crsp_comp_text.loc[(id_crsp_comp_text['ceqq'] <= 0) | (id_crsp_comp_text['ceqq'].isnull())].shape[0]
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['ceqq'] > 0]
del_prccq = id_crsp_comp_text.loc[(id_crsp_comp_text['lag_prccq'] < 1) | (id_crsp_comp_text['lag_prccq'].isnull())].shape[0]
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['lag_prccq'] >= 1]
print('number of firm-quarters with missing SIZE, MTB, LEV or non-positive total assets or non-positive book value of equity, or lag_prcc < 1: ' \
      + str(del_size + del_mtb + del_lev + del_atq + del_ceqq + del_prccq))

## Change SIC back to str
id_crsp_comp_text['SIC'] = id_crsp_comp_text['SIC'].astype(str)

########## Drop files (firm-quarter) that contain number of words less than 1% threshold
nwq01 = id_crsp_comp_text['nw'].quantile(.01)
print('number of words, 1% quantile: ' + str(nwq01))
del_word01 = id_crsp_comp_text.loc[id_crsp_comp_text['nw'] < nwq01].shape[0]
print('number of files that contain total words less than 1% threshold: ' + str(del_word01))
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['nw'] >= nwq01]

########## Drop files (firm-quarter) that contain negative TLAG
# Rationale to drop negative TLAG: By construction, filings with filing date prior to news release date cannot be addressing the news. 
# ANTICIPATION is not purpose of the paper.
del_TLAG0 = id_crsp_comp_text[id_crsp_comp_text['TLAG'] < 0].shape[0]
# print('number of files that contain negative TLAG: ' + str(del_TLAG0))
id_crsp_comp_text = id_crsp_comp_text[id_crsp_comp_text['TLAG'] >= 0]

########## Drop files (firm-quarter) that contain larger than 99% TLAG
tlagq99 = id_crsp_comp_text['TLAG'].quantile(.99)
print('TLAG 99% quantile: ' + str(tlagq99))
del_TLAG99 = id_crsp_comp_text.loc[id_crsp_comp_text['TLAG'] > tlagq99].shape[0]
print('number of files that contain negative or larger than 99% TLAG: ' + str(del_TLAG99 + del_TLAG0))
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['TLAG'] <= tlagq99]

############## Inspect sample size after variable screening
print('Number of firm-quarters after variable screening: ' + str(id_crsp_comp_text.shape[0]))

############## Winsorize SIZE, MTB, LEV
###### Define a function that winsorize a variable at 1% and 99% 
def winsorize (df, colnames):
    for col in colnames:
        varq01 = df[col].quantile(.01)
        varq99 = df[col].quantile(.99)
        df[col] = df[col].clip(varq01, varq99)
    return df

id_crsp_comp_text = winsorize(id_crsp_comp_text, ['SIZE', 'MTB', 'LEV'])

############## Save merged ID_CRSP_COMP_TEXT to csv file
id_crsp_comp_text.to_csv('..\\filings\\id_crsp_comp_text_' + obj_type + '.csv', index = 0)

number of firm-quarters from utility and financial firms: 82612
number of firm-quarters with missing SIZE, MTB, LEV or non-positive total assets or non-positive book value of equity, or lag_prcc < 1: 26450
number of words, 1% quantile: 1145.0
number of files that contain total words less than 1% threshold: 1934
TLAG 99% quantile: 52.0
number of files that contain negative or larger than 99% TLAG: 1697
Number of firm-quarters after variable screening: 190341


In [19]:
########################################################################################
############### Table 1: Summary Statistics and Correlation Matrix #####################
########################################################################################

############# Table 1 Panel A: Summary statistics for selected variables
######### Variable groups:
# 1st line: textual variables, generally consistent with LM's summary statistics
# 2nd line: fundamental variables (main)
# 3rd line: abtone
selected_vars = id_crsp_comp_text[['NW','nw', 'TONE','TLAG', \
                                   'RET', 'NEG', 'SIZE', 'MTB', 'LEV' \
                                 # 'AGE', 'age', 'EARN', 'STD_RET', 'STD_EARN', 'LOSS', 'DEARN' \
                                 ]]

T1PA = selected_vars.describe().transpose() 

############# Summary statistics for all raw and processed variables
full_summary = id_crsp_comp_text.describe().transpose()

############# Save T1PA
table_path = '..\\output\\Tables.xlsx'
if os.path.exists(table_path) == True:
    book = load_workbook(table_path)
    writer = pd.ExcelWriter(table_path, engine = 'openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    T1PA.to_excel(writer, sheet_name='T1PA_raw', float_format="%.4f")

    writer.save()
    writer.close()
    
else:
    T1PA.to_excel(table_path, sheet_name='T1PA_raw', float_format="%.4f")

T1PA

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
NW,190341.0,8.998194,0.784043,7.044033,8.458928,9.068777,9.538924,13.490002
nw,190341.0,10887.473256,9968.794523,1145.0,4716.0,8679.0,13889.0,722159.0
TONE,190341.0,-8.540406,6.800588,-64.54289,-12.440758,-7.436261,-3.745318,22.28739
TLAG,190341.0,39.45554,6.176093,0.0,36.0,40.0,45.0,52.0
RET,190341.0,0.008255,0.285653,-1.833079,-0.131376,-0.002584,0.12766,18.312252
NEG,190341.0,0.505861,0.499967,0.0,0.0,1.0,1.0,1.0
SIZE,190341.0,6.005431,1.979078,2.001575,4.546041,5.906342,7.304254,11.206073
MTB,190341.0,3.647577,4.505002,0.288135,1.355554,2.261444,3.992795,30.900821
LEV,190341.0,0.197492,0.189621,0.0,0.009608,0.161434,0.32721,0.724242


In [20]:
# full_summary

In [21]:
############# Table 1 Panel B: Correlation matrix for selected variables
######### pearson correlation
T1PB_pearson = selected_vars.corr(method='pearson')

# T1PB_pearson

In [22]:
######### spearman correlation
T1PB_spearman = selected_vars.corr(method='spearman')

# T1PB_spearman

In [23]:
######### Combine two correlation matrices. right-up matrix: pearson; left-down matrix: spearman 
for row in list(range(0, len(T1PB_spearman.index))):
    T1PB_spearman.iloc[row, row+1:] = T1PB_pearson.iloc[row, row+1:]
    
##### Save T1PB
table_path = '..\\output\\Tables.xlsx'
if os.path.exists(table_path) == True:
    book = load_workbook(table_path)
    writer = pd.ExcelWriter(table_path, engine = 'openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    T1PB_spearman.to_excel(writer, sheet_name='T1PB_raw', float_format="%.4f")

    writer.save()
    writer.close()
    
else:
    T1PB_spearman.to_excel(table_path, sheet_name='T1PB_raw', float_format="%.4f")

T1PB_spearman

Unnamed: 0,NW,nw,TONE,TLAG,RET,NEG,SIZE,MTB,LEV
NW,1.0,0.818686,-0.445295,-0.224474,-0.018596,0.007607,0.330459,0.095023,0.066745
nw,1.0,1.0,-0.377403,-0.121655,-0.013106,0.007425,0.234296,0.083641,0.069477
TONE,-0.454737,-0.454737,1.0,0.017566,0.024041,-0.019339,-0.072872,-0.028026,0.059154
TLAG,-0.295092,-0.295092,0.018932,1.0,-0.030915,0.045384,-0.409253,-0.038506,-0.026983
RET,-0.015184,-0.015184,0.032852,-0.05196,1.0,-0.664697,-0.031521,-0.018909,-0.005821
NEG,0.008848,0.008848,-0.022683,0.044987,-0.865966,1.0,-0.02546,0.012195,-0.002767
SIZE,0.342373,0.342373,-0.057625,-0.423105,0.015557,-0.027306,1.0,0.233385,0.119777
MTB,0.114767,0.114767,0.014027,-0.090102,-0.04585,0.026327,0.398114,1.0,0.056133
LEV,0.056743,0.056743,0.059015,-0.033975,0.002259,-0.004462,0.150865,-0.075261,1.0
