In [1]:
############### Import packages
import os, numpy as np, pandas as pd, time, glob, re, math, statsmodels.api as sm, patsy as ps
from tqdm import tqdm
from time import process_time
from datetime import datetime
from datetime import date
from openpyxl import load_workbook
from patsy import dmatrices

############### Set working directory to parent directory
if os.getcwd() != 'F:\\github\\narrative_conservatism\\code':
    os.chdir('F:\\github\\narrative_conservatism\\code')

############### Set pandas column printing constraint
pd.set_option('display.max_columns', None)

In [2]:
#############################################################################################
####### Merge IBES annual data (oen-year forecast) with CRSP_COMP_EDGAR quarterly data ######
#############################################################################################

########### Read IBES raw data file
ibes_cols = ['cusip8', 'cname', 'fpi', 'value', 'fpedats', 'anndats', 'actual', 'anndats_act']
ibes = pd.read_csv('..\\filings\\ibes.csv', usecols = ibes_cols)
print('number of cusip-fpedats-analyst: ' + str(ibes.shape[0]))

### Reorder and rename IBES columns
ibes = ibes[['cusip8', 'cname', 'fpedats', 'value', 'actual', 'anndats_act', 'anndats', 'fpi']]
ibes = ibes.rename(columns={'cusip8': 'cusip'})

### correct fpedats format
ibes['fpedats'] = ibes['fpedats'].str.replace('jan','01')
ibes['fpedats'] = ibes['fpedats'].str.replace('feb','02')
ibes['fpedats'] = ibes['fpedats'].str.replace('mar','03')
ibes['fpedats'] = ibes['fpedats'].str.replace('apr','04')
ibes['fpedats'] = ibes['fpedats'].str.replace('may','05')
ibes['fpedats'] = ibes['fpedats'].str.replace('jun','06')
ibes['fpedats'] = ibes['fpedats'].str.replace('jul','07')
ibes['fpedats'] = ibes['fpedats'].str.replace('aug','08')
ibes['fpedats'] = ibes['fpedats'].str.replace('sep','09')
ibes['fpedats'] = ibes['fpedats'].str.replace('oct','10')
ibes['fpedats'] = ibes['fpedats'].str.replace('nov','11')
ibes['fpedats'] = ibes['fpedats'].str.replace('dec','12')
ibes['fpedats'] = pd.to_datetime(ibes['fpedats'], format='%d%m%Y')

print(ibes['fpedats'].describe())

### Delete missing cusip8 and actual
del_cusip = ibes[ibes['cusip'].isnull()].shape[0]
ibes = ibes[ibes['cusip'].isnull() == False]
print('number of obs. that contain missing cusip: ' + str(del_cusip))

del_actual = ibes[ibes['actual'].isnull()].shape[0]
ibes = ibes[ibes['actual'].isnull() == False]
print('number of obs. that contain missing actual: ' + str(del_actual))

### aggregate by cusip-fpedats and get actual, median, afe and consensus from ibes raw data
ibes_css = ibes.groupby(['cusip', 'fpedats'])['actual'].median().to_frame()
ibes_css['median'] = ibes.groupby(['cusip', 'fpedats'])['value'].median().to_frame()
ibes_css['afe'] = ibes_css['actual'] - ibes_css['median']
ibes_css['consensus'] = ibes.groupby(['cusip', 'fpedats'])['value'].mean().to_frame()
ibes_css['leap_consensus'] = ibes_css.groupby(['cusip'])['consensus'].shift(-1)

### make a consensus forecast dataset by cusip-fpedats
ibes = ibes.loc[ibes.duplicated(subset=['cusip', 'fpedats']) == False, ['cusip','fpedats']]
ibes = ibes.join(ibes_css, on=['cusip', 'fpedats'])

### create merge date_key - keep %y only
ibes['date_key'] = ibes['fpedats'].astype(str).str[:-6].astype(int)

print('number of cusip-fpedats, dropping missing cusip and actual: ' + str(ibes.shape[0]))

number of cusip-fpedats-analyst: 9812071
count                 9812071
unique                    446
top       2015-12-31 00:00:00
freq                   426980
first     1981-12-31 00:00:00
last      2019-07-31 00:00:00
Name: fpedats, dtype: object
number of obs. that contain missing cusip: 60781
number of obs. that contain missing actual: 1305946
number of cusip-fpedats, dropping missing cusip and actual: 155539


In [3]:
##################### Read EDGAR_CRSP_COMP and creat merge date_key
crsp_comp_edgar = pd.read_csv('..\\filings\\crsp_comp_edgar_10-Q.csv')
crsp_comp_edgar['date_key'] = crsp_comp_edgar['fyearq'].astype(int)

##################### Inner merge EDGAR_CRSP_COMP and IBES, key not unique in both datasets
crsp_comp_edgar_ibes = pd.merge(crsp_comp_edgar, ibes, on = ['cusip', 'date_key'], how='inner', validate = 'm:m')
print('number of cusip-date after merging: ' + str(crsp_comp_edgar_ibes.shape[0]))

crsp_comp_edgar_ibes.sort_values(by = ['cusip', 'cquarter'])

number of cusip-date after merging: 110095


Unnamed: 0,cusip,cik,rp,accnum,name,gvkey,SIC,fd,date_crsp,date_comp,cquarter,fyearq,fqtr,incorp,state,addzip,costat,age,actq,cheq,dpq,ibq,intanq,lctq,revtq,txditcq,xsgaq,atq,lag_atq,ceqq,lag_ceqq,cshoq,lag_cshoq,dlcq,lag_dlcq,dlttq,lag_dlttq,prccq,lag_prccq,ibq.1,lag_ibq,RET,STD_RET,EARN,LOSS,DEARN,STD_EARN,CFO,leap1_EARN,leap2_EARN,leap3_EARN,leap1_CFO,leap2_CFO,leap3_CFO,TACC,LAG_TA_REV,DSAR,PPE,SIC2,nw,nvocab,n_neg,n_pos,n_uctt,n_lit,n_cstr,n_modal_strong,n_modal_moderate,n_modal_weak,n_negation,tone,NW,TONE,TLAG,NEG,SIZE,MTB,LEV,AGE,date_key,fpedats,actual,median,afe,consensus,leap_consensus
107231,00030710,1606180,2015-03-31,0001564590-15-003247,"AAC Holdings, Inc.",178698.0,8060,2015-05-05,2015-03-31,2015-03-31,201503,2015.0,1.0,NV,NV,37027,A,182,99.585,59.186,1.340,2.038,26.089,17.742,42.823,0.749,,188.983,145.952,101.145,97.474,21.781,21.374,4.440,4.357,71.628,24.284,30.580,30.9200,2.038,3.354,0.066935,0.234733,0.013963,0.0,-0.009017,,-0.072387,0.029394,0.011661,0.001540,0.048258,-0.003657,0.001268,12.603,0.699826,-0.022871,0.484228,80,15621,1738,160,47,186,78,97,23,26,31,1,-0.007298,9.656435,-7.297868,35,0,6.493578,6.780106,0.196236,5.209486,2015,2015-12-31,0.9700,0.7800,0.1900,0.753333,
107232,00030710,1606180,2015-06-30,0001564590-15-005942,"AAC Holdings, Inc.",178698.0,8060,2015-08-03,2015-06-30,2015-06-30,201506,2015.0,2.0,NV,NV,37027,A,273,97.666,45.021,1.676,5.555,28.972,30.118,53.784,0.303,,210.265,188.983,109.824,101.145,21.816,21.781,5.227,4.440,70.641,71.628,43.560,30.5800,5.555,2.038,0.375746,0.011999,0.029394,0.0,0.018610,,0.048258,0.011661,0.001540,0.001854,-0.003657,0.001268,-0.006056,-3.565,0.772302,0.007080,0.491055,80,20683,2073,296,84,292,170,131,31,33,90,13,-0.010878,9.937116,-10.878499,34,0,6.501384,6.585229,0.402512,5.613128,2015,2015-12-31,0.9700,0.7800,0.1900,0.753333,
107233,00030710,1606180,2015-09-30,0001564590-15-010373,"AAC Holdings, Inc.",178698.0,8060,2015-11-10,2015-09-30,2015-09-30,201509,2015.0,3.0,NV,NV,37027,A,365,76.724,10.764,1.921,2.452,112.594,32.120,57.372,1.942,,290.911,210.265,139.720,109.824,22.407,21.816,4.856,5.227,116.710,70.641,22.250,43.5600,2.452,5.555,-0.482160,0.121349,0.011661,0.0,-0.014758,0.009461,-0.003657,0.001540,0.001854,0.002768,0.001268,-0.006056,0.006473,3.221,0.898785,-0.034033,0.518812,80,20179,1987,226,65,209,132,102,27,32,37,7,-0.008325,9.912447,-8.325487,41,1,6.856783,8.652981,0.360821,5.902633,2015,2015-12-31,0.9700,0.7800,0.1900,0.753333,
9683,00036020,824142,1996-09-30,0000106455-96-000015,AAON INC,21542.0,3585,1996-11-01,1996-09-30,1996-09-30,199609,1996.0,3.0,NV,NV,74107,A,2100,26.423,0.025,0.621,0.522,,13.827,17.173,0.000,1.796,36.501,33.936,14.972,14.450,6.122,6.122,0.172,0.714,7.702,10.217,5.625,5.2500,0.522,0.475,0.038653,0.046985,0.015382,0.0,0.001385,0.005044,0.100660,0.018054,0.020130,0.020577,0.002548,-0.090078,0.024802,-2.894,0.888820,-0.029143,0.501680,35,1178,468,3,4,7,5,4,2,1,1,0,0.000849,7.072422,0.848896,32,0,3.470117,2.224256,0.322106,7.650169,1996,1996-12-31,0.0307,0.0702,-0.0395,0.071073,0.045367
25427,00036020,824142,1999-03-31,0001026608-99-000042,AAON INC,21542.0,3585,1999-05-14,1999-03-31,1999-03-31,199903,1999.0,1.0,NV,NV,74107,A,3012,31.559,0.369,0.732,1.764,,15.500,30.036,,4.266,50.827,50.506,26.223,24.411,6.227,6.219,0.757,0.757,9.104,10.980,11.000,9.3125,1.764,1.456,0.167819,0.136016,0.034927,0.0,0.006098,0.003367,-0.043361,0.047632,0.052270,0.047343,0.060263,0.034482,0.059992,3.954,1.011721,0.014573,0.647012,35,1547,625,17,6,23,11,6,3,7,4,0,-0.007111,7.344719,-7.110537,44,0,4.058967,2.372473,0.232388,8.010692,1999,1999-12-31,0.1308,0.0922,0.0386,0.102540,0.157771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75032,Y9369110,1352341,2009-04-30,0001193125-09-126243,Verigy Ltd.,166689.0,3559,2009-06-05,2009-04-30,2009-04-30,200904,2009.0,2.0,,U0,768923,I,1065,407.000,262.000,4.000,-30.000,18.000,113.000,71.000,0.000,48.300,572.000,613.000,410.000,432.000,59.191,58.119,0.000,0.000,0.000,0.000,11.000,8.3100,-30.000,-64.000,0.262954,0.120079,-0.048940,1.0,0.055465,0.046828,-0.060359,-0.036713,-0.017021,-0.008439,-0.019231,0.001418,0.116737,7.000,1.197390,-0.014682,0.138662,35,25449,2739,606,183,505,172,256,75,107,239,4,-0.016779,10.144471,-16.778655,36,0,6.179952,1.117984,0.000000,6.971669,2009,2009-10-31,-1.4900,-0.5800,-0.9100,-0.047253,0.181420
75033,Y9369110,1352341,2009-07-31,0001193125-09-187864,Verigy Ltd.,166689.0,3559,2009-09-04,2009-07-31,2009-07-31,200907,2009.0,3.0,,U0,768923,I,1157,527.000,385.000,4.000,-21.000,20.000,110.000,87.000,0.000,50.000,705.000,572.000,402.000,410.000,58.727,59.191,0.000,0.000,138.000,0.000,13.290,11.0000,-21.000,-30.000,0.050553,0.006159,-0.036713,1.0,0.015734,0.039254,-0.019231,-0.017021,-0.008439,-0.001389,0.001418,0.116737,0.005556,-10.000,1.071678,0.043706,0.155594,35,28287,2879,617,205,557,194,258,103,126,246,5,-0.014742,10.250193,-14.741754,35,0,6.478665,1.588051,0.000000,7.054450,2009,2009-10-31,-1.4900,-0.5800,-0.9100,-0.047253,0.181420
79771,Y9369110,1352341,2010-01-31,0001193125-10-048983,Verigy Ltd.,166689.0,3559,2010-03-05,2010-01-29,2010-01-31,201001,2010.0,1.0,,U0,768923,I,1339,531.000,360.000,5.000,-6.000,19.000,131.000,106.000,0.000,50.800,720.000,711.000,402.000,402.000,59.259,58.841,0.000,2.000,138.000,138.000,10.840,9.8400,-6.000,-12.000,0.087924,0.135557,-0.008439,1.0,0.008439,0.030990,0.116737,-0.001389,0.017639,0.012870,0.005556,0.029851,0.039897,-89.000,0.991561,0.004219,0.136428,35,25506,2720,543,215,506,208,227,103,114,225,14,-0.013409,10.146708,-13.408610,35,0,6.361295,1.440287,0.196906,7.200425,2010,2010-10-31,0.5700,0.2400,0.3300,0.181420,
79772,Y9369110,1352341,2010-04-30,0001193125-10-133113,Verigy Ltd.,166689.0,3559,2010-06-04,2010-04-30,2010-04-30,201004,2010.0,2.0,,U0,768923,I,1430,558.000,368.000,5.000,-1.000,19.000,137.000,120.000,0.000,54.500,737.000,720.000,405.000,402.000,59.369,59.259,0.000,0.000,138.000,138.000,11.920,10.8400,-1.000,-6.000,-0.010976,0.079584,-0.001389,1.0,0.006944,0.019840,0.005556,0.017639,0.012870,-0.006120,0.029851,0.039897,-0.094247,-5.000,0.987500,0.004167,0.138889,35,25412,2675,532,197,488,182,207,90,122,222,14,-0.013734,10.143016,-13.733669,35,1,6.465161,1.597929,0.191667,7.266129,2010,2010-10-31,0.5700,0.2400,0.3300,0.181420,


In [4]:
########################################################################################
############ Merge CRSP_COMP_EDGAR_IBES data with SEG data #############################
########################################################################################

########### Read SEG raw data file
seg_cols = ['gvkey', 'stype', 'sid', 'datadate', 'snms', 'cusip', 'cik']
seg = pd.read_csv('..\\filings\\compustat_seg.csv', usecols = seg_cols)
print('number of cusip-fpedats-analyst: ' + str(seg.shape[0]))

### Reorder and rename IBES columns
seg = seg[['gvkey', 'cik', 'cusip', 'datadate', 'stype', 'sid', 'snms']]
seg = seg.rename(columns={'datadate': 'date_comp'})

### parse date_comp format
seg['date_comp'] = pd.to_datetime(seg['date_comp'], format='%Y%m%d')

### aggregate by gvkey-date_comp-stype and get count of sid
seg_count = seg.groupby(['gvkey', 'date_comp', 'stype'])['sid'].count().to_frame()

### make a segment dataset by gvkey-date_comp-stype
seg = seg.loc[seg.duplicated(subset=['gvkey', 'date_comp', 'stype']) == False, ['gvkey', 'date_comp', 'stype']]
seg = seg.join(seg_count, on=['gvkey', 'date_comp', 'stype'])
seg_bus = seg[seg['stype']=='BUSSEG']
seg_bus = seg_bus.drop(columns=['stype'])
seg_bus = seg_bus.rename(columns={'sid': 'nseg_bus'})
seg_geo = seg[seg['stype']=='GEOSEG']
seg_geo = seg_geo.drop(columns=['stype'])
seg_geo = seg_geo.rename(columns={'sid': 'nseg_geo'})
seg = seg.loc[seg.duplicated(subset=['gvkey', 'date_comp']) == False, ['gvkey', 'date_comp']]
seg = pd.merge(seg, seg_bus, on = ['gvkey', 'date_comp'], how='left', validate = '1:1')
seg = pd.merge(seg, seg_geo, on = ['gvkey', 'date_comp'], how='left', validate = '1:1')
seg = seg.sort_values(by=['gvkey', 'date_comp'])
seg.loc[seg['nseg_geo'].isnull(), 'nseg_geo'] = 1
seg.loc[seg['nseg_bus'].isnull(), 'nseg_bus'] = 1
seg['nseg_bus'] = seg['nseg_bus'].astype(int)
seg['nseg_geo'] = seg['nseg_geo'].astype(int)

### create merge date_key - %y only
seg['date_key'] = seg['date_comp'].astype(str).str[:-6].astype(int)
seg = seg.drop(columns=['date_comp'])

print('number of gvkey-datadate in SEGMENT data: ' + str(seg.shape[0]))

number of cusip-fpedats-analyst: 452653
number of gvkey-datadate in SEGMENT data: 50876


In [5]:
##################### left merge EDGAR_CRSP_COMP_IBES and SEG, key not unique in both datasets
crsp_comp_edgar_ibes_seg = pd.merge(crsp_comp_edgar_ibes, seg, on = ['gvkey', 'date_key'], how='left', validate = 'm:m')
print('Number of gvkey-quarter obs. after merging, left: '+ str(crsp_comp_edgar_ibes_seg.shape[0]))
# crsp_comp_edgar_ibes_seg_inner = pd.merge(crsp_comp_edgar_ibes, seg, on = ['gvkey', 'date_key'], how='inner', validate = 'm:m')
# print('Number of gvkey-quarter obs. after merging, inner: '+ str(crsp_comp_edgar_ibes_seg_inner.shape[0]))

##################### Fill missing segments in crsp_comp_edgar_ibes_seg_left with 1
crsp_comp_edgar_ibes_seg.loc[crsp_comp_edgar_ibes_seg['nseg_bus'].isnull(), 'nseg_bus'] = 1
crsp_comp_edgar_ibes_seg.loc[crsp_comp_edgar_ibes_seg['nseg_geo'].isnull(), 'nseg_geo'] = 1

####### Drop duplicated gykey-cquarter
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg.duplicated(subset=['gvkey', 'cquarter']) == False]

Number of gvkey-quarter obs. after merging, left: 110114


In [6]:
########################################################################################
############################### Variable Creation ######################################
########################################################################################

################### CRSP_COMP_EDGAR_IBES_SEG_LEFT ######################################

######## AGE: log(1 + age from the first year the firm entered the CRSP dataset)
crsp_comp_edgar_ibes_seg['AGE'] = np.log(1 + crsp_comp_edgar_ibes_seg['age'])

######## BUSSEG: log(1 + number of business segments), or 1 if item is missing from Compustat; and
######## GEOSEG: log(1 + number of geographic segments), or 1 if item is missing from Compustat.
crsp_comp_edgar_ibes_seg['BUSSEG'] = np.log(1 + crsp_comp_edgar_ibes_seg['nseg_bus'])
crsp_comp_edgar_ibes_seg['GEOSEG'] = np.log(1 + crsp_comp_edgar_ibes_seg['nseg_geo'])

######## AFE, analyst forecast error, \
######## defined as I/B/E/S earnings per share minus the median of the most recent analysts’ forecasts, \
######## deflated by stock price per share at the end of the fiscal quarter
crsp_comp_edgar_ibes_seg['AFE'] = crsp_comp_edgar_ibes_seg['afe']/crsp_comp_edgar_ibes_seg['prccq']

######## AF, analyst consensus forecast for one-year-ahead earnings per share, scaled by stock price per share at the end of the fiscal quarter
crsp_comp_edgar_ibes_seg['AF'] = crsp_comp_edgar_ibes_seg['leap_consensus']/crsp_comp_edgar_ibes_seg['prccq']

In [7]:
########################################################################################
########################## Variable Screening ##########################################
########################################################################################

########## Drop files (firm-quarter) that have positive/negative infinity DEARN
del_DEARN = crsp_comp_edgar_ibes_seg[(crsp_comp_edgar_ibes_seg['DEARN']==np.inf) | (crsp_comp_edgar_ibes_seg['DEARN']==-np.inf)].shape[0]
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[(crsp_comp_edgar_ibes_seg['DEARN']!=np.inf) & (crsp_comp_edgar_ibes_seg['DEARN']!=-np.inf)]
print('number of files that have positive/negative infinity DEARN: ' + str(del_DEARN))

### Drop missing AF
del_AF = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['AF'].isnull()].shape[0]
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['AF'].isnull() == False]
print('number of obs. that contain missing AF: ' + str(del_AF))

### Drop missing AFE
del_AFE = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['AFE'].isnull()].shape[0]
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['AFE'].isnull() == False]
print('number of obs. that contain missing AFE: ' + str(del_AFE))

### Drop missing EARN
del_EARN = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['EARN'].isnull()].shape[0]
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['EARN'].isnull() == False]
print('number of obs. that contain missing EARN: ' + str(del_EARN))

### Drop missing STD_EARN
del_STD_EARN = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['STD_EARN'].isnull()].shape[0]
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['STD_EARN'].isnull() == False]
print('number of obs. that contain missing STD_EARN: ' + str(del_STD_EARN))

### Drop missing EARN
del_DEARN = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['DEARN'].isnull()].shape[0]
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['DEARN'].isnull() == False]
print('number of obs. that contain missing DEARN: ' + str(del_DEARN))

############## Winsorize ##############################
###### Define a function that winsorize a variable at 1% and 99% 
def winsorize (df, colnames):
    for col in colnames:
        varq01 = df[col].quantile(.01)
        varq99 = df[col].quantile(.99)
        df[col] = df[col].clip(varq01, varq99)
    return df

############## Winsorize TABLE 1 (line 1) and TABLE 4 (line 2) variables
crsp_comp_edgar_ibes_seg = winsorize(crsp_comp_edgar_ibes_seg, \
                                          ['AF', 'AFE', 'BUSSEG','GEOSEG','AGE','EARN', 'DEARN', 'STD_EARN', 'STD_RET', \
                                          'CFO', 'leap1_EARN', 'leap2_EARN', 'leap3_EARN', 'leap1_CFO', 'leap2_CFO', 'leap3_CFO'])

############## Inspect sample size after variable screening
print('Number of firm-quarters after variable screening: ' + str(crsp_comp_edgar_ibes_seg.shape[0]))

number of files that have positive/negative infinity DEARN: 0
number of obs. that contain missing AF: 15228
number of obs. that contain missing AFE: 0
number of obs. that contain missing EARN: 64
number of obs. that contain missing STD_EARN: 3163
number of obs. that contain missing DEARN: 0
Number of firm-quarters after variable screening: 91606


In [8]:
######## Create ABTONE: residual from TONE regression 
y, X = ps.dmatrices('TONE ~ 1 + EARN + RET + SIZE + MTB + STD_RET + STD_EARN + AGE + BUSSEG + GEOSEG + LOSS + DEARN + AFE + AF', \
                    data = crsp_comp_edgar_ibes_seg, return_type = 'dataframe')
model = sm.OLS(y, X)
res = model.fit()
# res.summary()
crsp_comp_edgar_ibes_seg['ABTONE'] = res.resid

######## Create abtone for REPLICATION: residual from tone regression 
y, X = ps.dmatrices('tone ~ 1 + EARN + RET + SIZE + MTB + STD_RET + STD_EARN + AGE + BUSSEG + GEOSEG + LOSS + DEARN + AFE + AF', \
                    data = crsp_comp_edgar_ibes_seg, return_type = 'dataframe')
model = sm.OLS(y, X)
res = model.fit()
# res.summary()
crsp_comp_edgar_ibes_seg['abtone'] = res.resid

############## Save merged ID_CRSP_COMP_TEXT to csv file
crsp_comp_edgar_ibes_seg.to_csv('..\\filings\\crsp_comp_edgar_ibes_seg_10-Q.csv', index = 0)

crsp_comp_edgar_ibes_seg

Unnamed: 0,cusip,cik,rp,accnum,name,gvkey,SIC,fd,date_crsp,date_comp,cquarter,fyearq,fqtr,incorp,state,addzip,costat,age,actq,cheq,dpq,ibq,intanq,lctq,revtq,txditcq,xsgaq,atq,lag_atq,ceqq,lag_ceqq,cshoq,lag_cshoq,dlcq,lag_dlcq,dlttq,lag_dlttq,prccq,lag_prccq,ibq.1,lag_ibq,RET,STD_RET,EARN,LOSS,DEARN,STD_EARN,CFO,leap1_EARN,leap2_EARN,leap3_EARN,leap1_CFO,leap2_CFO,leap3_CFO,TACC,LAG_TA_REV,DSAR,PPE,SIC2,nw,nvocab,n_neg,n_pos,n_uctt,n_lit,n_cstr,n_modal_strong,n_modal_moderate,n_modal_weak,n_negation,tone,NW,TONE,TLAG,NEG,SIZE,MTB,LEV,AGE,date_key,fpedats,actual,median,afe,consensus,leap_consensus,nseg_bus,nseg_geo,BUSSEG,GEOSEG,AFE,AF,ABTONE,abtone
0,54626810,60512,1993-06-30,0000060512-94-000005,LOUISIANA LAND & EXPLORATION CO,6819.0,1311,1993-08-13,1993-06-30,1993-06-30,199306,1993.0,2.0,MD,MD,70112,I,11324,193.800,64.300,27.300,5.600,,208.200,189.300,136.100,23.900,1278.000,1203.400,424.200,421.300,28.729,28.647,86.500,0.000,356.300,379.500,42.500,45.250,5.600,2.700,-0.059108,0.054464,0.004653,0.0,0.002410,0.002744,0.031660,-0.001408,0.003730,0.003372,0.037089,0.041509,-0.075597,-32.500,1.004737,-0.000249,1.996925,13,3018,820,58,6,22,41,5,9,5,5,0,-0.017230,8.012681,-17.229954,44,1,7.167251,3.076850,0.315356,9.334768,1993,1993-12-31,0.52,1.00,-0.48,0.990359,0.714757,1.0,1.0,0.693147,0.693147,-0.011294,0.016818,-10.133707,-0.010134
1,54626810,60512,1993-09-30,0000060512-94-000007,LOUISIANA LAND & EXPLORATION CO,6819.0,1311,1993-11-10,1993-09-30,1993-09-30,199309,1993.0,3.0,MD,MD,70112,I,11416,180.200,9.400,28.200,-1.800,,195.100,187.900,124.800,23.700,1662.300,1278.000,417.100,424.200,33.137,28.729,0.000,86.500,747.700,356.300,44.625,42.500,-1.800,5.600,0.025969,0.094378,-0.001408,1.0,-0.005790,0.004136,0.037089,0.003730,0.003372,0.000335,0.041509,-0.075597,0.026837,-49.200,0.941628,-0.031768,2.206729,13,3771,880,68,10,20,51,2,15,7,4,0,-0.015381,8.235361,-15.380536,41,0,7.107411,2.878318,0.346479,9.342859,1993,1993-12-31,0.52,1.00,-0.48,0.990359,0.714757,1.0,1.0,0.693147,0.693147,-0.010756,0.016017,-4.960103,-0.004960
2,08750910,11860,1993-09-30,0000011860-94-000005,BETHLEHEM STEEL CORP /DE/,2189.0,3312,1993-11-12,1993-09-30,1993-09-30,199309,1993.0,3.0,DE,DE,18016-7699,I,24715,1074.900,232.500,69.100,30.700,,908.400,1055.300,0.000,38.400,5168.098,5463.199,580.800,958.900,91.307,91.025,100.300,88.400,728.500,650.200,14.375,18.750,30.700,-13.600,-0.270146,0.119124,0.005619,0.0,0.008109,0.007031,0.007871,-0.046942,0.002195,0.004469,0.025212,-0.013205,0.004332,-12.300,1.004723,-0.008823,1.315529,33,3725,1048,71,29,18,53,28,16,10,4,0,-0.011275,8.223091,-11.275168,43,1,7.442328,1.779871,0.135196,10.115206,1993,1993-12-31,0.41,0.30,0.11,0.610301,1.489195,1.0,1.0,0.693147,0.693147,0.007652,0.103596,-3.315425,-0.003315
3,91335310,101929,1993-11-30,0000101929-94-000002,UNIVAR CORP,11003.0,5160,1994-01-13,1993-11-30,1993-11-30,199311,1993.0,3.0,DE,DE,98033,I,9041,372.727,18.416,6.890,0.458,,281.853,430.299,,60.117,654.434,660.336,160.280,161.722,19.643,19.635,42.298,47.584,150.563,152.012,11.000,13.250,0.458,1.957,-0.170798,0.066070,0.000694,0.0,-0.002270,0.002216,0.037449,-0.000348,0.002294,-0.026588,-0.015458,-0.057851,0.009150,-24.271,1.084569,-0.049554,,51,2584,715,16,8,10,52,10,1,5,3,0,-0.003096,7.857481,-3.095975,44,1,5.561311,1.608710,0.302264,9.109636,1993,1993-02-28,0.27,0.58,-0.31,0.636296,0.586216,1.0,1.0,0.693147,0.693147,-0.028182,0.053292,3.279267,0.003279
4,91345610,102037,1993-12-31,0000950118-94-000014,UNIVERSAL CORP /VA/,11017.0,5150,1994-02-11,1993-12-31,1993-12-31,199312,1994.0,2.0,VA,VA,23235,A,24168,1285.950,66.170,,20.210,,947.603,866.774,28.519,73.103,1783.210,1758.267,406.682,393.913,35.634,35.631,578.773,577.178,294.965,302.786,25.625,22.375,20.210,18.459,0.133202,0.057684,0.011494,0.0,0.000996,0.008327,0.003720,0.005239,-0.003138,0.002387,0.017610,0.089023,-0.102212,13.669,0.888372,0.068531,0.303600,51,1900,693,18,16,19,16,15,3,2,1,0,-0.001053,7.550135,-1.052632,42,0,6.681160,2.023908,0.500472,10.092826,1994,1994-06-30,1.41,2.50,-1.09,2.223333,1.772414,1.0,1.0,0.693147,0.693147,-0.042537,0.069167,5.974779,0.005975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109485,23719410,940944,2015-08-30,0000940944-15-000068,DARDEN RESTAURANTS INC,31846.0,5812,2015-10-02,2015-08-31,2015-08-31,201508,2015.0,1.0,FL,FL,32837,A,7430,1204.600,690.100,81.100,81.000,872.400,1142.600,1687.000,331.600,160.900,6026.500,5994.700,2424.100,2333.500,128.139,126.669,15.000,18.600,1437.600,1578.500,68.010,65.540,81.000,118.100,0.119210,0.049890,0.013512,0.0,-0.006189,0.012658,-0.060287,0.004995,0.020879,0.031187,0.008197,0.078151,0.047535,817.300,1.001368,-0.029042,0.904666,58,9477,1426,142,58,120,65,78,23,32,31,0,-0.008864,9.156729,-8.863564,32,0,9.024238,3.557697,0.266419,8.913416,2015,2015-05-31,2.56,2.87,-0.31,3.034414,3.052478,15.0,1.0,2.772589,0.693147,-0.004558,0.044883,-1.157436,-0.001157
109486,23719410,940944,2015-11-29,0000940944-16-000090,DARDEN RESTAURANTS INC,31846.0,5812,2016-01-06,2015-11-30,2015-11-30,201511,2015.0,2.0,FL,FL,32837,A,7521,1383.500,857.600,75.300,30.100,872.400,1815.400,1608.800,226.200,167.800,5182.300,6026.500,1989.800,2424.100,128.236,128.139,759.400,15.000,439.500,1437.600,56.170,68.010,30.100,81.000,-0.103575,0.091373,0.004995,0.0,-0.008446,0.011176,0.008197,0.020879,0.031187,0.024244,0.078151,0.047535,-0.133527,-22.500,0.994723,-0.013391,0.634680,58,12000,1519,161,81,135,73,88,23,31,32,0,-0.006667,9.392745,-6.666667,37,1,9.072770,3.595039,0.241035,8.925587,2015,2015-05-31,2.56,2.87,-0.31,3.034414,3.052478,15.0,1.0,2.772589,0.693147,-0.005519,0.054344,1.369508,0.001370
109487,23719410,940944,2016-02-28,0000940944-16-000101,DARDEN RESTAURANTS INC,31846.0,5812,2016-04-06,2016-02-29,2016-02-29,201602,2015.0,3.0,FL,FL,32837,A,7612,729.900,215.800,67.000,108.200,872.300,1201.100,1847.500,220.800,143.900,4501.900,5182.300,1918.200,1989.800,126.726,128.236,8.000,759.400,439.700,439.500,63.880,56.170,108.200,30.100,0.223430,0.059833,0.020879,0.0,0.015071,0.007116,0.078151,0.031187,0.024244,0.018237,0.047535,-0.133527,0.026955,-298.400,1.162901,0.047894,0.744283,58,12228,1543,149,79,143,71,86,26,32,33,0,-0.005725,9.411565,-5.724567,37,0,8.882255,3.619970,0.231345,8.937613,2015,2015-05-31,2.56,2.87,-0.31,3.034414,3.052478,15.0,1.0,2.772589,0.693147,-0.004853,0.047785,1.998064,0.001998
109497,31787A50,1094739,2016-01-31,0001094739-16-000109,FINISAR CORP,126417.0,3577,2016-03-10,2016-01-29,2016-01-31,201601,2015.0,3.0,DE,DE,94089,I,5936,1102.280,531.056,23.518,12.084,127.422,221.573,309.206,,76.071,1597.728,1577.254,1125.829,1107.269,107.521,106.785,0.000,0.000,228.561,226.151,12.700,11.370,12.084,6.644,0.216592,0.121350,0.007661,0.0,0.003449,0.002539,0.020076,0.008182,0.014541,0.029227,0.034175,-0.082694,0.036045,-19.581,0.994044,-0.015977,,35,14694,2048,475,127,369,222,122,69,98,214,12,-0.024500,9.595263,-24.499796,41,0,7.101796,1.096523,0.143383,8.688959,2015,2015-04-30,1.04,1.40,-0.36,1.404350,1.351429,3.0,12.0,1.386294,2.564949,-0.028346,0.106412,-14.592596,-0.014593


In [9]:
# ####################################################################################################################
# ############# Create COMP_DA: if not replicating TABLE 4, DO NOT run because cause large sample reduction ##########
# ####################################################################################################################

# ### Drop missing or infinite LAG_TA_REV, DSAR and PPE
# crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg.loc[(crsp_comp_edgar_ibes_seg['LAG_TA_REV'] != np.inf) & \
#                                                         (crsp_comp_edgar_ibes_seg['LAG_TA_REV'].isnull() == False) & \
#                                                         (crsp_comp_edgar_ibes_seg['LAG_TA_REV'] != -np.inf)]
# crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg.loc[(crsp_comp_edgar_ibes_seg['DSAR'] != np.inf) & \
#                                                         (crsp_comp_edgar_ibes_seg['DSAR'].isnull() == False) & \
#                                                         (crsp_comp_edgar_ibes_seg['DSAR'] != -np.inf)]
# crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg.loc[(crsp_comp_edgar_ibes_seg['PPE'] != np.inf) & \
#                                                         (crsp_comp_edgar_ibes_seg['PPE'].isnull() == False) & \
#                                                         (crsp_comp_edgar_ibes_seg['PPE'] != -np.inf)]
# print('number of obs. after deleting missing or infinite LAG_TA_REV, DSAR and PPE: ' + str(crsp_comp_edgar_ibes_seg.shape[0]))

# ############## Winsorize DA variables
# crsp_comp_edgar_ibes_seg = winsorize(crsp_comp_edgar_ibes_seg, ['TACC', 'LAG_TA_REV', 'DSAR', 'PPE'])

# ######## Create DA: residual from TACC regression by each two-digit SIC-year
# def TACC_reg(data):
#     try:
#         y, X = ps.dmatrices('TACC ~ 1 + LAG_TA_REV + DSAR + PPE', data = data, return_type = 'dataframe')
#         model = sm.OLS(y, X)
#         res = model.fit()
#         data['DA'] = res.resid
#         return data
#     except:
#         pass

# crsp_comp_edgar_ibes_seg1 = crsp_comp_edgar_ibes_seg.groupby(['SIC2','fyearq']).apply(TACC_reg)
# # comp_DA['DA'].describe()

# ####### Join DA to COMP
# crsp_comp_edgar_ibes_seg = pd.merge(crsp_comp_edgar_ibes_seg, crsp_comp_edgar_ibes_seg1[['cik', 'rp', 'DA']], on = ['cik', 'rp'], how='left', validate = '1:1')

# crsp_comp_edgar_ibes_seg

# ############## Dropping missing DA to replicate Huang et al. 2014 TABLE 4, \
# del_DA = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['DA'].isnull()].shape[0]
# crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['DA'].isnull() == False]
# print('number of obs. that contain missing DA: ' + str(del_DA))

# ############## Inspect sample size after variable screening
# print('Number of firm-quarters after dropping missing DA: ' + str(crsp_comp_edgar_ibes_seg.shape[0]))

# ############## Save merged ID_CRSP_COMP_TEXT to csv file
# crsp_comp_edgar_ibes_seg.to_csv('..\\filings\\crsp_comp_edgar_ibes_seg_DA_10-Q.csv', index = 0)

In [13]:
########################################################################################
########### Table 1 - Panel A: Summary Statistics and Correlation Matrix ###############
########################################################################################

############# Table 1 Panel A: Summary statistics for selected variables
######### Variable groups:
# 1st line: textual variables, generally consistent with LM's summary statistics
# 2nd line: fundamental variables (main)
# 3rd line: abtone
selected_vars = crsp_comp_edgar_ibes_seg[['NW','nw', 'TONE','TLAG', \
                                          'RET', 'NEG', 'SIZE', 'MTB', 'LEV', \
                                          'AF', 'AFE', 'BUSSEG','GEOSEG','AGE','EARN', 'DEARN', 'STD_EARN', 'STD_RET', 'LOSS', 'ABTONE',\
                                        # 'DA' 
                                        ]]

T1PA = selected_vars.describe().transpose() 

############# Summary statistics for all raw and processed variables
full_summary = crsp_comp_edgar_ibes_seg.describe().transpose()

############# Save T3PA
table_path = '..\\output\\Tables.xlsx'
if os.path.exists(table_path) == True:
    book = load_workbook(table_path)
    writer = pd.ExcelWriter(table_path, engine = 'openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    T1PA.to_excel(writer, sheet_name='T1PA_raw', float_format="%.4f")

    writer.save()
    writer.close()
    
else:
    T1PA.to_excel(table_path, sheet_name='T1PA_raw', float_format="%.4f")

T1PA.to_latex(buf = r'..\latex\table\T1PA.tex')
T1PA

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
NW,91606.0,8.945716,0.763607,7.044033,8.423542,9.00957,9.477233,13.490002
nw,91606.0,10215.34,9672.940271,1145.0,4552.0,8180.0,13058.0,722159.0
TONE,91606.0,-8.456529,6.884934,-64.54289,-12.433977,-7.471574,-3.641299,22.28739
TLAG,91606.0,39.02064,6.248857,0.0,36.0,40.0,44.0,52.0
RET,91606.0,0.01827066,0.253076,-1.578704,-0.112598,0.007312,0.129855,4.849226
NEG,91606.0,0.4828941,0.49971,0.0,0.0,0.0,1.0,1.0
SIZE,91606.0,6.447034,1.776382,2.001575,5.175115,6.317271,7.563028,11.206073
MTB,91606.0,3.515453,4.008952,0.288135,1.485381,2.343218,3.901803,30.900821
LEV,91606.0,0.1924244,0.182145,0.0,0.010645,0.162378,0.315125,0.724242
AF,91606.0,0.04305316,0.066424,-0.262045,0.022604,0.048521,0.073182,0.227262


In [14]:
# full_summary

In [19]:
############# Table 3 Panel B: Correlation matrix for selected variables
######### pearson correlation
T1PB_pearson = selected_vars.loc[:, (selected_vars.columns != 'nw') & \
                                 (selected_vars.columns != 'BUSSEG') & \
                                 (selected_vars.columns != 'GEOSEG') & \
                                 (selected_vars.columns != 'LOSS')].corr(method='pearson')

# T3PB_pearson

In [20]:
######### spearman correlation
T1PB_spearman = selected_vars.loc[:, (selected_vars.columns != 'nw') & \
                                 (selected_vars.columns != 'BUSSEG') & \
                                 (selected_vars.columns != 'GEOSEG') & \
                                 (selected_vars.columns != 'LOSS')].corr(method='spearman')

# T3PB_spearman

In [21]:
######### Combine two correlation matrices. right-up matrix: pearson; left-down matrix: spearman 
for row in list(range(0, len(T1PB_spearman.index))):
    T1PB_spearman.iloc[row, row+1:] = T1PB_pearson.iloc[row, row+1:]
    
##### Save T3PB
table_path = '..\\output\\Tables.xlsx'
if os.path.exists(table_path) == True:
    book = load_workbook(table_path)
    writer = pd.ExcelWriter(table_path, engine = 'openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    T1PB_spearman.to_excel(writer, sheet_name='T1PB_raw', float_format="%.4f")

    writer.save()
    writer.close()
    
else:
    T1PB_spearman.to_excel(table_path, sheet_name='T1PB_raw', float_format="%.4f")

T1PB_spearman

Unnamed: 0,NW,TONE,TLAG,RET,NEG,SIZE,MTB,LEV,AF,AFE,AGE,EARN,DEARN,STD_EARN,STD_RET,ABTONE
NW,1.0,-0.461386,-0.195007,-0.00768,0.00324,0.257685,0.058613,0.036671,-0.067092,0.01213,-0.037679,-0.115276,0.001401,0.089847,-0.033637,-0.3842147
TONE,-0.485896,1.0,0.025296,0.02072,-0.021142,-0.069657,-0.016027,0.069069,0.069426,0.098054,0.053899,0.155902,-0.001898,-0.144103,-0.081156,0.955246
TLAG,-0.266132,0.029085,1.0,-0.022311,0.034239,-0.330851,-0.02174,0.009413,-0.092233,-0.127317,-0.228381,-0.136735,-0.004552,0.121026,0.189183,0.01959272
RET,-0.008074,0.028965,-0.03245,1.0,-0.68408,-0.064444,-0.025789,0.001942,-0.018167,0.154661,0.001746,0.06349,0.036183,0.010708,0.265984,-7.428794000000001e-17
NEG,0.004378,-0.024177,0.032909,-0.865518,1.0,0.000133,0.012464,-0.00185,0.015139,-0.124117,-0.01798,-0.071386,-0.018752,0.015758,-0.117523,-5.771174e-05
SIZE,0.266473,-0.052456,-0.332657,-0.02414,-0.00097,1.0,0.23348,0.099718,0.077092,0.26962,0.343996,0.258795,-0.023574,-0.197617,-0.310002,9.64711e-16
MTB,0.048338,0.037422,-0.042133,-0.054641,0.032891,0.381892,1.0,0.045756,-0.156107,0.120141,-0.088287,-0.041126,0.022218,0.158511,0.035846,1.04091e-15
LEV,0.014756,0.074522,-0.00043,0.002952,-0.004016,0.142932,-0.111043,1.0,0.166873,-0.06822,0.100915,0.039331,0.033861,-0.124368,-0.071983,0.06819981
AF,-0.017067,0.060423,-0.12463,-0.086575,0.071535,0.025461,-0.299087,0.250834,1.0,0.057255,0.202039,0.472202,0.015962,-0.256445,-0.145217,-1.166235e-16
AFE,0.039804,0.09693,-0.148947,0.180751,-0.157267,0.231409,0.226087,-0.052059,0.06037,1.0,0.071767,0.240869,0.004317,-0.143257,-0.158632,2.986362e-16
