In [1]:
############### Import packages
import os, numpy as np, pandas as pd, time, glob, re, math, statsmodels.api as sm, patsy as ps
from tqdm import tqdm
from time import process_time
from datetime import datetime
from datetime import date
from openpyxl import load_workbook
from patsy import dmatrices

############### Make the project's code folder the working directory, so that the relative
############### '..\\filings' and '..\\output' paths used by later cells resolve from there
code_dir = 'F:\\github\\narrative_conservatism\\code'
if os.getcwd() != code_dir:
    os.chdir(code_dir)

############### Show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

In [2]:
#############################################################################################
####### Merge IBES annual data (one-year forecast) with CRSP_COMP_EDGAR quarterly data ######
#############################################################################################

########### Load the raw IBES file (per the label printed below: one row per cusip-fpedats-analyst)
ibes_cols = ['cusip8', 'cname', 'fpi', 'value', 'fpedats', 'anndats', 'actual', 'anndats_act']
ibes = pd.read_csv('..\\filings\\ibes.csv', usecols = ibes_cols)
print('number of cusip-fpedats-analyst: {}'.format(ibes.shape[0]))

### Fix the column order and rename 'cusip8' to 'cusip' (the name of the merge key used later)
column_order = ['cusip8', 'cname', 'fpedats', 'value', 'actual', 'anndats_act', 'anndats', 'fpi']
ibes = ibes[column_order].rename(columns={'cusip8': 'cusip'})

### fpedats: replace each lowercase month abbreviation by its two-digit month number,
### then parse the resulting day-month-year digit string into a datetime
month_abbrs = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
for month_number, month_abbr in enumerate(month_abbrs, start=1):
    ibes['fpedats'] = ibes['fpedats'].str.replace(month_abbr, '{:02d}'.format(month_number))
ibes['fpedats'] = pd.to_datetime(ibes['fpedats'], format='%d%m%Y')

print(ibes['fpedats'].describe())

### Remove rows without a cusip, then rows without an actual, reporting each count
no_cusip = ibes['cusip'].isnull()
del_cusip = ibes[no_cusip].shape[0]
ibes = ibes[~no_cusip]
print('number of obs. that contain missing cusip: {}'.format(del_cusip))

no_actual = ibes['actual'].isnull()
del_actual = ibes[no_actual].shape[0]
ibes = ibes[~no_actual]
print('number of obs. that contain missing actual: {}'.format(del_actual))

### Collapse to one row per cusip-fpedats:
###   actual         = median of 'actual' within the group
###   median         = median of the forecast 'value'
###   afe            = actual - median
###   consensus      = mean of the forecast 'value'
###   leap_consensus = consensus of the same cusip's next fpedats row (shift(-1) within cusip)
by_firm_period = ibes.groupby(['cusip', 'fpedats'])
ibes_css = by_firm_period['actual'].median().to_frame()
ibes_css['median'] = by_firm_period['value'].median()
ibes_css['afe'] = ibes_css['actual'] - ibes_css['median']
ibes_css['consensus'] = by_firm_period['value'].mean()
ibes_css['leap_consensus'] = ibes_css.groupby(['cusip'])['consensus'].shift(-1)

### Keep the first row of every cusip-fpedats pair and attach the aggregated measures to it
ibes = ibes[['cusip', 'fpedats']].drop_duplicates().join(ibes_css, on=['cusip', 'fpedats'])

### Merge key: the calendar year of fpedats, as an integer
ibes['date_key'] = ibes['fpedats'].dt.year.astype(int)

print('number of cusip-fpedats, dropping missing cusip and actual: {}'.format(ibes.shape[0]))

number of cusip-fpedats-analyst: 9812071
count                 9812071
unique                    446
top       2015-12-31 00:00:00
freq                   426980
first     1981-12-31 00:00:00
last      2019-07-31 00:00:00
Name: fpedats, dtype: object
number of obs. that contain missing cusip: 60781
number of obs. that contain missing actual: 1305946
number of cusip-fpedats, dropping missing cusip and actual: 155539


In [3]:
##################### Load the EDGAR_CRSP_COMP panel and build the yearly merge key from fyearq
crsp_comp_edgar = pd.read_csv('..\\filings\\crsp_comp_edgar_8-K.csv')
crsp_comp_edgar['date_key'] = crsp_comp_edgar['fyearq'].astype(int)

##################### Inner-join the IBES measures on (cusip, date_key); the key is not unique
##################### in either dataset, hence the many-to-many validation
merge_keys = ['cusip', 'date_key']
crsp_comp_edgar_ibes = crsp_comp_edgar.merge(ibes, on = merge_keys, how = 'inner', validate = 'm:m')
print('number of cusip-date after merging: {}'.format(crsp_comp_edgar_ibes.shape[0]))

##### Display only: the sorted frame is shown as the cell output and is not stored
crsp_comp_edgar_ibes.sort_values(by = ['cusip', 'cmonth'])

number of cusip-date after merging: 144634


Unnamed: 0,cusip,date_crsp,cik,gvkey,rp,name,SIC,fd,cmonth,fyearq,atq,lag_atq,ceqq,prccq,lag_ceqq,lag_cshoq,lag_dlcq,lag_dlttq,lag_prccq,RET,DRET,STD_RET,EARN,LOSS,DEARN,STD_EARN,BN,n8k,nitem,nw,nvocab,n_neg,n_pos,n_negation,tone,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_1.01,item_1.02,item_1.03,item_1.04,item_2.01,item_2.02,item_2.03,item_2.04,item_2.05,item_2.06,item_3.01,item_3.02,item_3.03,item_4.01,item_4.02,item_5.01,item_5.02,item_5.03,item_5.04,item_5.05,item_5.06,item_5.07,item_5.08,item_6.01,item_6.02,item_6.03,item_6.04,item_6.05,item_7.01,item_8.01,item_9.01,NW,TONE,TLAG,RLAG,SIZE,MTB,LEV,lag_date_crsp,date_key,fpedats,actual,median,afe,consensus,leap_consensus
64480,00030710,2015-02-26,1606180,178698.0,2015-01-28,"AAC Holdings, Inc.",8060,2015-02-03,201502,2015.0,188.983,145.952,101.145,30.58,97.474,21.374,4.357,24.284,30.92,-0.017751,-0.139429,0.038404,0.013963,0.0,-0.009017,0.009461,1,1,3.0,333,177,4,1,0,-0.009009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,5.811141,-9.009009,6,-29,6.493578,6.780106,0.196236,2014-12-10,2015,2015-12-31,0.97,0.78,0.19,0.753333,
64481,00030710,2015-05-01,1606180,178698.0,2015-04-29,"AAC Holdings, Inc.",8060,2015-04-29,201504,2015.0,210.265,188.983,109.824,43.56,101.145,21.781,4.440,71.628,30.58,-0.015936,-0.133657,0.030279,0.029394,0.0,0.018610,0.009461,1,1,2.0,298,150,0,1,0,0.003356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.700444,3.355705,0,-2,6.501384,6.585229,0.402512,2015-02-26,2015,2015-12-31,0.97,0.78,0.19,0.753333,
64482,00030710,2015-06-10,1606180,178698.0,2015-06-16,"AAC Holdings, Inc.",8060,2015-06-19,201506,2015.0,210.265,188.983,109.824,43.56,101.145,21.781,4.440,71.628,30.58,0.096483,0.158494,0.030279,0.029394,0.0,0.018610,0.009461,0,1,2.0,448,190,4,1,0,-0.006696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.107023,-6.696429,9,6,6.501384,6.585229,0.402512,2015-05-01,2015,2015-12-31,0.97,0.78,0.19,0.753333,
64483,00030710,2015-08-03,1606180,178698.0,2015-07-27,"AAC Holdings, Inc.",8060,2015-07-31,201507,2015.0,290.911,210.265,139.720,22.25,109.824,21.816,5.227,70.641,43.56,-0.133643,-0.148280,0.069242,0.011661,0.0,-0.014758,0.009461,1,1,1.0,272,150,7,3,0,-0.014706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.609472,-14.705882,4,-7,6.856783,8.652981,0.360821,2015-06-10,2015,2015-12-31,0.97,0.78,0.19,0.753333,
64484,00030710,2015-08-10,1606180,178698.0,2015-08-10,"AAC Holdings, Inc.",8060,2015-08-10,201508,2015.0,290.911,210.265,139.720,22.25,109.824,21.816,5.227,70.641,43.56,0.045946,0.155929,0.069242,0.011661,0.0,-0.014758,0.009461,0,2,5.0,753,359,3,2,0,-0.001328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,6.625392,-1.328021,0,0,6.856783,8.652981,0.360821,2015-08-03,2015,2015-12-31,0.97,0.78,0.19,0.753333,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53298,Y9369110,2009-04-17,1352341,166689.0,2009-04-06,Verigy Ltd.,3559,2009-04-07,200904,2009.0,572.000,613.000,410.000,11.00,432.000,58.119,0.000,0.000,8.31,-0.007074,-0.088684,0.036977,-0.048940,1.0,0.055465,0.046828,1,1,2.0,409,184,5,4,0,-0.002445,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.016157,-2.444988,1,-11,6.179952,1.117984,0.000000,2009-02-11,2009,2009-10-31,-1.49,-0.58,-0.91,-0.047253,0.18142
53299,Y9369110,2009-07-10,1352341,166689.0,2009-06-15,Verigy Ltd.,3559,2009-06-16,200906,2009.0,705.000,572.000,402.000,13.29,410.000,59.191,0.000,0.000,11.00,0.032941,0.105364,0.024128,-0.036713,1.0,0.015734,0.039254,0,1,2.0,359,152,0,1,0,0.002786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.886104,2.785515,1,-25,6.478665,1.588051,0.000000,2009-04-17,2009,2009-10-31,-1.49,-0.58,-0.91,-0.047253,0.18142
53300,Y9369110,2009-08-21,1352341,166689.0,2009-08-20,Verigy Ltd.,3559,2009-08-20,200908,2010.0,737.000,720.000,405.000,11.92,402.000,59.259,0.000,138.000,10.84,-0.110608,-0.094245,0.022473,-0.001389,1.0,0.006944,0.019840,1,1,2.0,327,147,1,1,0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.793014,0.000000,0,-1,6.465161,1.597929,0.191667,2009-07-10,2010,2010-10-31,0.57,0.24,0.33,0.181420,
53301,Y9369110,2009-11-20,1352341,166689.0,2009-11-17,Verigy Ltd.,3559,2009-11-19,200911,2010.0,737.000,720.000,405.000,11.92,402.000,59.259,0.000,138.000,10.84,0.079485,0.108029,0.022473,-0.001389,1.0,0.006944,0.019840,0,1,3.0,598,249,7,6,0,-0.001672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.395262,-1.672241,2,-3,6.465161,1.597929,0.191667,2009-08-21,2010,2010-10-31,0.57,0.24,0.33,0.181420,


In [4]:
########################################################################################
############ Merge CRSP_COMP_EDGAR_IBES data with SEG data #############################
########################################################################################

########### Read SEG (Compustat segment) raw data file
seg_cols = ['gvkey', 'stype', 'sid', 'datadate', 'snms', 'cusip', 'cik']
seg = pd.read_csv('..\\filings\\compustat_seg.csv', usecols = seg_cols)
### Label corrected: it used to repeat the IBES wording "cusip-fpedats-analyst",
### which does not describe the rows of the segment file
print('number of rows in raw SEGMENT data: ' + str(seg.shape[0]))

### Reorder SEG columns and rename datadate to date_comp
seg = seg[['gvkey', 'cik', 'cusip', 'datadate', 'stype', 'sid', 'snms']]
seg = seg.rename(columns={'datadate': 'date_comp'})

### Parse date_comp (yyyymmdd) into a datetime
seg['date_comp'] = pd.to_datetime(seg['date_comp'], format='%Y%m%d')

### Count the sid rows within each gvkey-date_comp-stype group
### NOTE(review): count() counts rows, so a sid that appears on several rows of the same
### group is counted more than once - confirm whether nunique() is what is intended
seg_count = seg.groupby(['gvkey', 'date_comp', 'stype'])['sid'].count().to_frame()

### One row per gvkey-date_comp-stype, carrying that count in column 'sid'
seg = seg.loc[~seg.duplicated(subset=['gvkey', 'date_comp', 'stype']), ['gvkey', 'date_comp', 'stype']]
seg = seg.join(seg_count, on=['gvkey', 'date_comp', 'stype'])

### Split the counts by segment type; only the BUSSEG and GEOSEG types are used
seg_bus = seg[seg['stype']=='BUSSEG'].drop(columns=['stype']).rename(columns={'sid': 'nseg_bus'})
seg_geo = seg[seg['stype']=='GEOSEG'].drop(columns=['stype']).rename(columns={'sid': 'nseg_geo'})

### One row per gvkey-date_comp with both counts attached (each side is unique on the key)
seg = seg.loc[~seg.duplicated(subset=['gvkey', 'date_comp']), ['gvkey', 'date_comp']]
seg = pd.merge(seg, seg_bus, on = ['gvkey', 'date_comp'], how='left', validate = '1:1')
seg = pd.merge(seg, seg_geo, on = ['gvkey', 'date_comp'], how='left', validate = '1:1')
seg = seg.sort_values(by=['gvkey', 'date_comp'])

### A gvkey-date_comp with no rows of a given type gets a count of 1; counts are stored as integers
for count_col in ['nseg_geo', 'nseg_bus']:
    seg[count_col] = seg[count_col].fillna(1).astype(int)

### Merge key: the calendar year of date_comp, as an integer; date_comp itself is then dropped
seg['date_key'] = seg['date_comp'].dt.year.astype(int)
seg = seg.drop(columns=['date_comp'])

print('number of gvkey-datadate in SEGMENT data: ' + str(seg.shape[0]))

number of cusip-fpedats-analyst: 452653
number of gvkey-datadate in SEGMENT data: 50876


In [5]:
##################### Left-join the segment counts on (gvkey, date_key); the key is not unique
##################### in either dataset, hence the many-to-many validation
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes.merge(seg, on = ['gvkey', 'date_key'], how = 'left', validate = 'm:m')
print('Number of gvkey-quarter obs. after merging, left: '+ str(crsp_comp_edgar_ibes_seg.shape[0]))
# crsp_comp_edgar_ibes_seg_inner = pd.merge(crsp_comp_edgar_ibes, seg, on = ['gvkey', 'date_key'], how='inner', validate = 'm:m')
# print('Number of gvkey-quarter obs. after merging, inner: '+ str(crsp_comp_edgar_ibes_seg_inner.shape[0]))

##################### Rows left without a segment match get a segment count of 1
for seg_col in ['nseg_bus', 'nseg_geo']:
    crsp_comp_edgar_ibes_seg[seg_col] = crsp_comp_edgar_ibes_seg[seg_col].fillna(1)

####### Keep only the first row of every gvkey-cmonth pair
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg.drop_duplicates(subset=['gvkey', 'cmonth'])

Number of gvkey-quarter obs. after merging, left: 144687


In [6]:
########################################################################################
############################### Variable Creation ######################################
########################################################################################

################### CRSP_COMP_EDGAR_IBES_SEG_LEFT ######################################

######## BUSSEG = log(1 + nseg_bus) and GEOSEG = log(1 + nseg_geo), where the segment
######## counts were set to 1 upstream whenever they were missing
for seg_var, count_col in [('BUSSEG', 'nseg_bus'), ('GEOSEG', 'nseg_geo')]:
    crsp_comp_edgar_ibes_seg[seg_var] = np.log(crsp_comp_edgar_ibes_seg[count_col] + 1)

######## Stock price per share at the end of the fiscal quarter, used as the deflator below
price = crsp_comp_edgar_ibes_seg['prccq']

######## AFE, analyst forecast error: I/B/E/S actual earnings per share minus the median
######## analyst forecast (column 'afe'), deflated by price
crsp_comp_edgar_ibes_seg['AFE'] = crsp_comp_edgar_ibes_seg['afe'].div(price)

######## AF, analyst consensus forecast for one-year-ahead earnings per share
######## (column 'leap_consensus'), deflated by price
crsp_comp_edgar_ibes_seg['AF'] = crsp_comp_edgar_ibes_seg['leap_consensus'].div(price)

In [7]:
########################################################################################
########################## Variable Screening ##########################################
########################################################################################

########## Drop files (firm-quarter) whose DEARN is positive or negative infinity
infinite_DEARN = crsp_comp_edgar_ibes_seg['DEARN'].isin([np.inf, -np.inf])
del_DEARN = crsp_comp_edgar_ibes_seg[infinite_DEARN].shape[0]
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[~infinite_DEARN]
print('number of files that have positive/negative infinity DEARN: ' + str(del_DEARN))

### Drop rows with a missing value, one variable at a time and in this order; each count is
### taken on the rows that survived the previous drops
screened_vars = ['AF', 'AFE', 'EARN', 'STD_EARN', 'DEARN']
missing_counts = {}
for screened_var in screened_vars:
    is_missing = crsp_comp_edgar_ibes_seg[screened_var].isnull()
    missing_counts[screened_var] = crsp_comp_edgar_ibes_seg[is_missing].shape[0]
    crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[~is_missing]
    print('number of obs. that contain missing ' + screened_var + ': ' + str(missing_counts[screened_var]))

### Expose the counts under their individual names (del_DEARN now holds the missing-DEARN count)
del_AF, del_AFE, del_EARN, del_STD_EARN, del_DEARN = [missing_counts[name] for name in screened_vars]

############## Winsorize ##############################
###### Define a function that winsorize a variable at 1% and 99% 
def winsorize (df, colnames, lower=.01, upper=.99):
    """Return a copy of ``df`` in which each listed column is clipped to its own quantile range.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified; the work is done on a copy.
    colnames : iterable of str
        Names of the columns to winsorize. Other columns are returned unchanged.
    lower, upper : float, optional
        Quantiles used as the clipping bounds (defaults: 1% and 99%).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns clipped to [lower quantile, upper quantile].
    """
    # Work on a copy: the frame passed in is typically a boolean-filtered slice, and assigning
    # columns on it directly mutates the caller's data and raises SettingWithCopyWarning.
    df = df.copy()
    for col in colnames:
        # Bounds are computed per column, from that column's own distribution
        lower_bound = df[col].quantile(lower)
        upper_bound = df[col].quantile(upper)
        df[col] = df[col].clip(lower_bound, upper_bound)
    return df

############## Winsorize the TABLE 1 and TABLE 4 variables with winsorize()
winsorized_vars = ['AF', 'AFE', 'BUSSEG','GEOSEG','EARN', 'DEARN', 'STD_EARN', 'STD_RET']
crsp_comp_edgar_ibes_seg = winsorize(crsp_comp_edgar_ibes_seg, winsorized_vars)

############## Report the sample size that remains after variable screening
print('Number of firm-quarters after variable screening: {}'.format(crsp_comp_edgar_ibes_seg.shape[0]))

number of files that have positive/negative infinity DEARN: 0
number of obs. that contain missing AF: 19713
number of obs. that contain missing AFE: 0
number of obs. that contain missing EARN: 4
number of obs. that contain missing STD_EARN: 48
number of obs. that contain missing DEARN: 0
Number of firm-quarters after variable screening: 114703


In [8]:
# ######## Create ABTONE: residual from TONE regression 
# y, X = ps.dmatrices('TONE ~ 1 + EARN + RET + SIZE + MTB + STD_RET + STD_EARN + AGE + BUSSEG + GEOSEG + LOSS + DEARN + AFE + AF', \
#                     data = crsp_comp_edgar_ibes_seg, return_type = 'dataframe')
# model = sm.OLS(y, X)
# res = model.fit()
# # res.summary()
# crsp_comp_edgar_ibes_seg['ABTONE'] = res.resid

# ######## Create abtone for REPLICATION: residual from tone regression 
# y, X = ps.dmatrices('tone ~ 1 + EARN + RET + SIZE + MTB + STD_RET + STD_EARN + AGE + BUSSEG + GEOSEG + LOSS + DEARN + AFE + AF', \
#                     data = crsp_comp_edgar_ibes_seg, return_type = 'dataframe')
# model = sm.OLS(y, X)
# res = model.fit()
# # res.summary()
# crsp_comp_edgar_ibes_seg['abtone'] = res.resid

############## Write the merged CRSP_COMP_EDGAR_IBES_SEG sample to csv, without the row index
merged_csv_path = '..\\filings\\crsp_comp_edgar_ibes_seg_8-K.csv'
crsp_comp_edgar_ibes_seg.to_csv(merged_csv_path, index = False)

##### Display the merged sample as the cell output
crsp_comp_edgar_ibes_seg

Unnamed: 0,cusip,date_crsp,cik,gvkey,rp,name,SIC,fd,cmonth,fyearq,atq,lag_atq,ceqq,prccq,lag_ceqq,lag_cshoq,lag_dlcq,lag_dlttq,lag_prccq,RET,DRET,STD_RET,EARN,LOSS,DEARN,STD_EARN,BN,n8k,nitem,nw,nvocab,n_neg,n_pos,n_negation,tone,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_1.01,item_1.02,item_1.03,item_1.04,item_2.01,item_2.02,item_2.03,item_2.04,item_2.05,item_2.06,item_3.01,item_3.02,item_3.03,item_4.01,item_4.02,item_5.01,item_5.02,item_5.03,item_5.04,item_5.05,item_5.06,item_5.07,item_5.08,item_6.01,item_6.02,item_6.03,item_6.04,item_6.05,item_7.01,item_8.01,item_9.01,NW,TONE,TLAG,RLAG,SIZE,MTB,LEV,lag_date_crsp,date_key,fpedats,actual,median,afe,consensus,leap_consensus,nseg_bus,nseg_geo,BUSSEG,GEOSEG,AFE,AF
0,59100210,1998-10-01,1000015,61641.0,1998-06-12,META GROUP INC,8700,1998-06-12,199806,1998.0,112.187,93.984,72.690,29.750,62.785,11.586,0.000,0.000,32.6875,-0.112568,-0.124841,0.050776,0.029941,0.0,0.004873,0.004313,1,1,1.0,225,129,0,0,0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.420535,0.000000,0,-111,5.936790,6.031972,0.000000,,1998,1998-12-31,0.703,0.65330,0.04970,0.665152,0.838958,1.0,1.0,0.693147,0.693147,0.001671,0.028200
1,59100210,1998-10-20,1000015,61641.0,1998-10-19,META GROUP INC,8700,1998-11-03,199811,1998.0,112.187,93.984,72.690,29.750,62.785,11.586,0.000,0.000,32.6875,-0.088660,-0.173595,0.050776,0.029941,0.0,0.004873,0.004313,1,1,1.0,39162,4271,813,192,7,-0.016036,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.575488,-16.035953,15,-1,5.936790,6.031972,0.000000,1998-10-01,1998,1998-12-31,0.703,0.65330,0.04970,0.665152,0.838958,1.0,1.0,0.693147,0.693147,0.001671,0.028200
2,59100210,1999-04-15,1000015,61641.0,1999-04-15,META GROUP INC,8700,1999-04-15,199904,1999.0,91.403,109.766,54.284,15.375,74.721,10.932,0.000,0.000,15.3750,-0.330093,-0.468600,0.075509,0.018658,0.0,0.006733,0.007034,1,1,2.0,793,391,6,6,0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.677083,0.000000,0,0,5.124437,2.249428,0.000000,1998-10-20,1999,1999-12-31,0.790,0.84665,-0.05665,0.838958,0.964103,1.0,1.0,0.693147,0.693147,-0.003685,0.062706
3,59100210,1999-05-05,1000015,61641.0,1999-05-03,META GROUP INC,8700,1999-05-04,199905,1999.0,91.403,109.766,54.284,15.375,74.721,10.932,0.000,0.000,15.3750,-0.032899,-0.155471,0.075509,0.018658,0.0,0.006733,0.007034,1,1,2.0,1022,445,4,6,0,0.001957,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.930495,1.956947,1,-2,5.124437,2.249428,0.000000,1999-04-15,1999,1999-12-31,0.790,0.84665,-0.05665,0.838958,0.964103,1.0,1.0,0.693147,0.693147,-0.003685,0.062706
4,59100210,2000-11-13,1000015,61641.0,2000-09-19,META GROUP INC,8700,2000-10-03,200010,2001.0,135.015,142.577,52.786,2.190,66.053,12.830,16.122,5.778,2.0000,0.045878,0.241004,0.080060,-0.091361,1.0,-0.065677,0.041664,0,1,2.0,273,140,2,0,0,-0.007326,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.613128,-7.326007,14,-55,3.244933,0.388476,0.153601,1999-05-05,2001,2001-12-31,-0.320,1.21000,-1.53000,0.807500,-0.175000,1.0,1.0,0.693147,0.693147,-0.462636,-0.079909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144677,06780610,2014-07-29,9984,2049.0,2014-09-10,BARNES GROUP INC,3490,2014-09-16,201409,2014.0,2155.961,2214.624,1155.671,30.350,1173.699,54.261,68.971,501.945,38.5400,-0.018778,-0.047415,0.011815,0.015489,0.0,0.001839,0.001950,1,1,1.0,207,126,3,2,0,-0.004831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.337538,-4.830918,49,43,7.645502,1.781734,0.257794,2014-07-25,2014,2014-12-31,2.340,2.29000,0.05000,2.285692,2.504462,9.0,9.0,2.302585,2.302585,0.001647,0.082519
144678,06780610,2014-10-24,9984,2049.0,2014-10-15,BARNES GROUP INC,3490,2014-10-17,201410,2014.0,2073.885,2155.961,1111.793,37.010,1155.671,54.441,26.580,513.215,30.3500,0.051437,0.046161,0.012510,0.015426,0.0,-0.000484,0.001977,0,1,4.0,776,297,17,1,0,-0.020619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,6.655440,-20.618557,2,-9,7.409914,1.429719,0.250373,2014-07-29,2014,2014-12-31,2.340,2.29000,0.05000,2.285692,2.504462,9.0,9.0,2.302585,2.302585,0.001351,0.067670
144684,87253410,1994-11-22,99974,10751.0,1994-10-11,TJ INTERNATIONAL INC,2430,1994-10-26,199410,1995.0,616.617,614.477,237.509,17.500,237.914,16.916,4.323,102.499,17.7500,0.067745,0.123024,0.029218,0.002954,0.0,0.008874,0.007033,0,1,1.0,45306,3988,829,268,11,-0.012625,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.721217,-12.625259,15,-42,5.704645,1.262048,0.173842,,1995,1995-12-31,0.310,1.06000,-0.75000,1.037398,1.067200,1.0,1.0,0.693147,0.693147,-0.042857,0.060983
144685,87253410,1995-01-19,99974,10751.0,1995-01-04,TJ INTERNATIONAL INC,2430,1995-01-11,199501,1995.0,616.617,614.477,237.509,17.500,237.914,16.916,4.323,102.499,17.7500,0.050123,0.118422,0.029218,0.002954,0.0,0.008874,0.007033,0,1,1.0,454,246,3,9,0,0.013216,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.120297,13.215859,7,-15,5.704645,1.262048,0.173842,1994-11-22,1995,1995-12-31,0.310,1.06000,-0.75000,1.037398,1.067200,1.0,1.0,0.693147,0.693147,-0.042857,0.060983


In [9]:
# ####################################################################################################################
# ############# Create COMP_DA: if not replicating TABLE 4, DO NOT run, because it causes a large sample reduction ##########
# ####################################################################################################################

# ### Drop missing or infinite LAG_TA_REV, DSAR and PPE
# crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg.loc[(crsp_comp_edgar_ibes_seg['LAG_TA_REV'] != np.inf) & \
#                                                         (crsp_comp_edgar_ibes_seg['LAG_TA_REV'].isnull() == False) & \
#                                                         (crsp_comp_edgar_ibes_seg['LAG_TA_REV'] != -np.inf)]
# crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg.loc[(crsp_comp_edgar_ibes_seg['DSAR'] != np.inf) & \
#                                                         (crsp_comp_edgar_ibes_seg['DSAR'].isnull() == False) & \
#                                                         (crsp_comp_edgar_ibes_seg['DSAR'] != -np.inf)]
# crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg.loc[(crsp_comp_edgar_ibes_seg['PPE'] != np.inf) & \
#                                                         (crsp_comp_edgar_ibes_seg['PPE'].isnull() == False) & \
#                                                         (crsp_comp_edgar_ibes_seg['PPE'] != -np.inf)]
# print('number of obs. after deleting missing or infinite LAG_TA_REV, DSAR and PPE: ' + str(crsp_comp_edgar_ibes_seg.shape[0]))

# ############## Winsorize DA variables
# crsp_comp_edgar_ibes_seg = winsorize(crsp_comp_edgar_ibes_seg, ['TACC', 'LAG_TA_REV', 'DSAR', 'PPE'])

# ######## Create DA: residual from TACC regression by each two-digit SIC-year
# def TACC_reg(data):
#     try:
#         y, X = ps.dmatrices('TACC ~ 1 + LAG_TA_REV + DSAR + PPE', data = data, return_type = 'dataframe')
#         model = sm.OLS(y, X)
#         res = model.fit()
#         data['DA'] = res.resid
#         return data
#     except:
#         pass

# crsp_comp_edgar_ibes_seg1 = crsp_comp_edgar_ibes_seg.groupby(['SIC2','fyearq']).apply(TACC_reg)
# # comp_DA['DA'].describe()

# ####### Join DA to COMP
# crsp_comp_edgar_ibes_seg = pd.merge(crsp_comp_edgar_ibes_seg, crsp_comp_edgar_ibes_seg1[['cik', 'rp', 'DA']], on = ['cik', 'rp'], how='left', validate = '1:1')

# crsp_comp_edgar_ibes_seg

# ############## Dropping missing DA to replicate Huang et al. 2014 TABLE 4, \
# del_DA = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['DA'].isnull()].shape[0]
# crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['DA'].isnull() == False]
# print('number of obs. that contain missing DA: ' + str(del_DA))

# ############## Inspect sample size after variable screening
# print('Number of firm-quarters after dropping missing DA: ' + str(crsp_comp_edgar_ibes_seg.shape[0]))

# ############## Save merged ID_CRSP_COMP_TEXT to csv file
# crsp_comp_edgar_ibes_seg.to_csv('..\\filings\\crsp_comp_edgar_ibes_seg_DA_10-Q.csv', index = 0)

In [10]:
########################################################################################
########### Table 2 - Panel A: Summary Statistics and Correlation Matrix ###############
########################################################################################
# NOTE(review): the banner says "Panel A" while the variable and the sheet written below
# are named T2PB / 'T2PB_raw' - confirm which panel label is the intended one.

############# Summary statistics for selected variables
######### Variable groups:
# 1st line: textual variables, generally consistent with LM's summary statistics
# 2nd line: fundamental variables (main)
# 3rd line: abtone
selected_vars = crsp_comp_edgar_ibes_seg[['NW','nw', 'TONE','TLAG', \
                                          'RET', 'BN', 'SIZE', 'MTB', 'LEV', \
                                          'AF', 'AFE', 'BUSSEG','GEOSEG', 'EARN', 'DEARN', 'STD_EARN', 'STD_RET', 'LOSS' \
                                        # 'DA' 
                                        ]]

# One row per variable: count, mean, std, min, quartiles, max
T2PB = selected_vars.describe().transpose() 

############ count number of unique firms (distinct cik)
# Label corrected from "10-Q": this notebook reads and writes the *_8-K.csv sample
print('Number of unique firms in final 8-K sample: ' + str(crsp_comp_edgar_ibes_seg['cik'].unique().shape[0]))

############# Summary statistics for all raw and processed variables
full_summary = crsp_comp_edgar_ibes_seg.describe().transpose()

############# Save T2PB to sheet 'T2PB_raw' of the shared tables workbook
table_path = '..\\output\\Tables.xlsx'
if os.path.exists(table_path):
    # The workbook must be loaded BEFORE the writer is created, so that its existing sheets
    # can be handed to the writer and are written back together with the new table.
    book = load_workbook(table_path)
    # The with-block saves and closes the writer exactly once, also when to_excel raises
    # (the earlier save() + close() pair saved twice and leaked the writer on errors).
    # NOTE(review): assigning writer.book / writer.sheets is the legacy pandas pattern and
    # newer pandas releases no longer allow it; there the documented equivalent is
    # pd.ExcelWriter(table_path, engine='openpyxl', mode='a', if_sheet_exists=...).
    # Confirm the installed pandas version before switching.
    with pd.ExcelWriter(table_path, engine = 'openpyxl') as writer:
        writer.book = book
        writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

        T2PB.to_excel(writer, sheet_name='T2PB_raw', float_format="%.4f")

else:
    # No workbook yet: create it with this single sheet
    T2PB.to_excel(table_path, sheet_name='T2PB_raw', float_format="%.4f")

T2PB

Number of unique firms in final 10-Q sample: 5126


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
NW,114703.0,6.04175,0.916342,4.89784,5.509388,5.777652,6.302619,13.57975
nw,114703.0,1288.640437,6828.238693,133.0,246.0,322.0,545.0,789969.0
TONE,114703.0,-0.313201,7.337511,-97.850863,-2.568349,0.0,3.816794,50.373134
TLAG,114703.0,7.302642,12.627903,0.0,0.0,2.0,7.0,80.0
RET,114703.0,0.007262,0.102503,-0.860449,-0.044753,0.004569,0.059176,3.086302
BN,114703.0,0.494991,0.499977,0.0,0.0,0.0,1.0,1.0
SIZE,114703.0,6.75868,1.738731,2.172419,5.531843,6.630625,7.862536,11.427072
MTB,114703.0,3.577172,4.203067,0.182014,1.484778,2.367583,3.929893,32.67415
LEV,114703.0,0.195923,0.186259,0.0,0.009287,0.165424,0.317592,0.747503
AF,114703.0,0.042871,0.069606,-0.290177,0.025485,0.051308,0.073988,0.220809


In [11]:
# full_summary

In [12]:
############# Correlation matrix for selected variables (T2PD)
######### Pearson correlations, computed without nw, BUSSEG, GEOSEG and LOSS
pearson_excluded = ['nw', 'BUSSEG', 'GEOSEG', 'LOSS']
pearson_keep = ~selected_vars.columns.isin(pearson_excluded)
T2PD_pearson = selected_vars.loc[:, pearson_keep].corr(method='pearson')

# T3PB_pearson

In [13]:
######### Spearman correlations, computed without nw, BUSSEG, GEOSEG and LOSS
spearman_excluded = ['nw', 'BUSSEG', 'GEOSEG', 'LOSS']
spearman_keep = ~selected_vars.columns.isin(spearman_excluded)
T2PD_spearman = selected_vars.loc[:, spearman_keep].corr(method='spearman')

# T3PB_spearman

In [14]:
######### Combine two correlation matrices. right-up matrix: pearson; left-down matrix: spearman 
# For every row, the cells to the right of the diagonal are overwritten with the Pearson
# values; the diagonal and the cells to its left keep the Spearman values.
for row in range(len(T2PD_spearman.index)):
    T2PD_spearman.iloc[row, row+1:] = T2PD_pearson.iloc[row, row+1:]
    
##### Save the combined matrix to sheet 'T2PD_raw' of the shared tables workbook
table_path = '..\\output\\Tables.xlsx'
if os.path.exists(table_path):
    # The workbook must be loaded BEFORE the writer is created, so that its existing sheets
    # can be handed to the writer and are written back together with the new table.
    book = load_workbook(table_path)
    # The with-block saves and closes the writer exactly once, also when to_excel raises
    # (the earlier save() + close() pair saved twice and leaked the writer on errors).
    # NOTE(review): assigning writer.book / writer.sheets is the legacy pandas pattern and
    # newer pandas releases no longer allow it; there the documented equivalent is
    # pd.ExcelWriter(table_path, engine='openpyxl', mode='a', if_sheet_exists=...).
    # Confirm the installed pandas version before switching.
    with pd.ExcelWriter(table_path, engine = 'openpyxl') as writer:
        writer.book = book
        writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

        T2PD_spearman.to_excel(writer, sheet_name='T2PD_raw', float_format="%.4f")

else:
    # No workbook yet: create it with this single sheet
    T2PD_spearman.to_excel(table_path, sheet_name='T2PD_raw', float_format="%.4f")

T2PD_spearman

Unnamed: 0,NW,TONE,TLAG,RET,BN,SIZE,MTB,LEV,AF,AFE,EARN,DEARN,STD_EARN,STD_RET
NW,1.0,-0.404999,0.158925,0.012604,0.004689,-0.024262,0.026626,0.073131,-0.015063,-0.021109,-0.040407,-0.019146,0.045799,0.087932
TONE,-0.376816,1.0,-0.086095,0.002687,-0.006614,0.060733,0.01708,-0.034822,0.006328,0.056486,0.040247,-4e-06,-0.042609,-0.096373
TLAG,0.188456,-0.153152,1.0,-0.012869,0.045415,-0.084863,-0.0061,-0.001785,-0.019131,-0.015524,-0.019936,-0.005081,0.022804,0.071984
RET,0.0017,8.4e-05,-0.021713,1.0,-0.687053,-0.052947,0.009173,0.000707,-0.028488,0.022678,-0.010311,0.00417,0.036078,0.061256
BN,0.007315,-0.00494,0.046684,-0.851155,1.0,0.000302,-0.003306,-0.003657,-0.002312,-0.020209,-0.016445,-0.001554,0.003501,0.006456
SIZE,0.012405,0.067806,-0.117741,-0.0335,-0.002021,1.0,0.187592,0.133103,0.167313,0.258593,0.281103,0.002396,-0.242965,-0.52522
MTB,0.025504,0.038379,-0.035118,2.4e-05,-0.0044,0.342816,1.0,0.086891,-0.138099,0.119902,-0.028584,0.041238,0.138717,0.004756
LEV,0.059366,-0.043945,0.007692,-0.003397,-0.00383,0.190378,-0.064218,1.0,0.143487,-0.074592,0.026037,0.025249,-0.105067,-0.065805
AF,-0.017077,0.028606,-0.03494,-0.035844,0.006364,0.116098,-0.269302,0.237247,1.0,0.068731,0.463156,0.026282,-0.261614,-0.219973
AFE,-0.021929,0.047199,-0.043235,0.055735,-0.03579,0.223947,0.238159,-0.05571,0.067184,1.0,0.269332,0.065719,-0.156531,-0.324624
