In [1]:
############### Import packages
import os, numpy as np, pandas as pd, time, glob, re, math, statsmodels.api as sm, patsy as ps
from tqdm import tqdm
from time import process_time
from datetime import datetime
from datetime import date
from openpyxl import load_workbook
from patsy import dmatrices

############### Make the project's code folder the working directory, so that the
############### relative '..\\filings' and '..\\output' paths used in this notebook resolve.
# NOTE(review): absolute, machine-specific path -- has to be edited on any other machine.
code_dir = 'F:\\github\\narrative_conservatism\\code'
if os.getcwd() != code_dir:
    os.chdir(code_dir)

############### Show every column when a DataFrame is displayed (None = no column limit)
pd.set_option('display.max_columns', None)

In [2]:
#############################################################################################
####### Merge IBES annual data (one-year forecast) with CRSP_COMP_EDGAR quarterly data ######
#############################################################################################

########### Load the raw IBES file; only the columns named in ibes_cols are read
ibes_cols = ['cusip8', 'cname', 'fpi', 'value', 'fpedats', 'anndats', 'actual', 'anndats_act']
ibes = pd.read_csv('..\\filings\\ibes.csv', usecols=ibes_cols)
print('number of cusip-fpedats-analyst: ' + str(len(ibes)))

### Put the columns in working order and rename 'cusip8' to 'cusip'
### ('cusip' is the name used as a merge key later in the notebook).
ibes = ibes.loc[
    :, ['cusip8', 'cname', 'fpedats', 'value', 'actual', 'anndats_act', 'anndats', 'fpi']
].rename(columns={'cusip8': 'cusip'})

### Convert fpedats to datetime: swap every three-letter month abbreviation for its
### two-digit month number, then parse the result with the day-month-year format below.
month_to_number = {
    'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
    'may': '05', 'jun': '06', 'jul': '07', 'aug': '08',
    'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12',
}
fpedats_text = ibes['fpedats']
for month_abbr, month_number in month_to_number.items():
    fpedats_text = fpedats_text.str.replace(month_abbr, month_number)
ibes['fpedats'] = pd.to_datetime(fpedats_text, format='%d%m%Y')

# Quick sanity check of the parsed dates (count, range, most frequent period end)
print(ibes['fpedats'].describe())

### Drop rows with a missing cusip, then rows with a missing 'actual';
### the number of rows removed at each step is printed.
cusip_missing = ibes['cusip'].isnull()
del_cusip = int(cusip_missing.sum())
ibes = ibes[~cusip_missing]
print('number of obs. that contain missing cusip: ' + str(del_cusip))

# Counted after the cusip filter, so rows already dropped above are not counted again
actual_missing = ibes['actual'].isnull()
del_actual = int(actual_missing.sum())
ibes = ibes[~actual_missing]
print('number of obs. that contain missing actual: ' + str(del_actual))

### Collapse the forecast-level rows to one row per cusip-fpedats:
###   actual         - median of 'actual' within the group
###   median         - median of the forecasts ('value')
###   afe            - actual minus median
###   consensus      - mean of the forecasts ('value')
###   leap_consensus - consensus of the same cusip's next fpedats row
# NOTE(review): shift(-1) picks the next period that exists for the cusip; if a year is
# absent from the data this is not the one-year-ahead consensus -- confirm this is acceptable.
firm_period = ibes.groupby(['cusip', 'fpedats'])
ibes_css = firm_period['actual'].median().to_frame()
ibes_css['median'] = firm_period['value'].median()
ibes_css['afe'] = ibes_css['actual'].sub(ibes_css['median'])
ibes_css['consensus'] = firm_period['value'].mean()
ibes_css['leap_consensus'] = ibes_css.groupby(['cusip'])['consensus'].shift(-1)

### Keep one (cusip, fpedats) pair per group and attach the aggregates to it
ibes = ibes[['cusip', 'fpedats']].drop_duplicates().join(ibes_css, on=['cusip', 'fpedats'])

### Merge key: calendar year of fpedats, as an integer
ibes['date_key'] = ibes['fpedats'].dt.year.astype(int)

print('number of cusip-fpedats, dropping missing cusip and actual: ' + str(len(ibes)))

number of cusip-fpedats-analyst: 9812071
count                 9812071
unique                    446
top       2015-12-31 00:00:00
freq                   426980
first     1981-12-31 00:00:00
last      2019-07-31 00:00:00
Name: fpedats, dtype: object
number of obs. that contain missing cusip: 60781
number of obs. that contain missing actual: 1305946
number of cusip-fpedats, dropping missing cusip and actual: 155539


In [3]:
##################### Read EDGAR_CRSP_COMP and create the merge key (fyearq as an integer)
crsp_comp_edgar = pd.read_csv('..\\filings\\crsp_comp_edgar_8-K.csv')
crsp_comp_edgar['date_key'] = crsp_comp_edgar['fyearq'].astype(int)

##################### Inner merge EDGAR_CRSP_COMP and IBES; the key is not unique in either dataset.
# validate='m:m' documents that expectation but makes pandas perform no uniqueness check.
# NOTE(review): date_key is fyearq on the left and the calendar year of fpedats on the
# right -- confirm the two line up for firms whose fiscal year does not end in December.
crsp_comp_edgar_ibes = crsp_comp_edgar.merge(
    ibes, how='inner', on=['cusip', 'date_key'], validate='m:m'
)
print('number of cusip-date after merging: ' + str(len(crsp_comp_edgar_ibes)))

# Display only: the sorted view is shown as the cell output, the merged frame itself stays unsorted
crsp_comp_edgar_ibes.sort_values(by=['cusip', 'cmonth'])

number of cusip-date after merging: 68614


Unnamed: 0,cusip,date_crsp,cik,gvkey,rp,name,SIC,fd,cmonth,fyearq,atq,lag_atq,ceqq,prccq,lag_ceqq,lag_cshoq,lag_dlcq,lag_dlttq,lag_prccq,RET,DRET,STD_RET,EARN,LOSS,DEARN,STD_EARN,BN,n8k,nitem,nw,nvocab,n_neg,n_pos,n_negation,tone,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_1.01,item_1.02,item_1.03,item_1.04,item_2.01,item_2.02,item_2.03,item_2.04,item_2.05,item_2.06,item_3.01,item_3.02,item_3.03,item_4.01,item_4.02,item_5.01,item_5.02,item_5.03,item_5.04,item_5.05,item_5.06,item_5.07,item_5.08,item_6.01,item_6.02,item_6.03,item_6.04,item_6.05,item_7.01,item_8.01,item_9.01,NW,TONE,TLAG,RLAG,SIZE,MTB,LEV,lag_date_crsp,date_key,fpedats,actual,median,afe,consensus,leap_consensus
30110,00030710,2015-06-10,1606180,178698.0,2015-06-16,"AAC Holdings, Inc.",8060,2015-06-19,201506,2015.0,210.265,188.983,109.824,43.560,101.145,21.781,4.440,71.628,30.580,0.096483,0.158494,0.030279,0.029394,0.0,0.018610,0.009461,0,1,2.0,448,190,4,1,0,-0.006696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.107023,-6.696429,9,6,6.501384,6.585229,0.402512,2014-12-10,2015,2015-12-31,0.9700,0.7800,0.1900,0.753333,
30111,00030710,2015-08-10,1606180,178698.0,2015-08-10,"AAC Holdings, Inc.",8060,2015-08-10,201508,2015.0,290.911,210.265,139.720,22.250,109.824,21.816,5.227,70.641,43.560,0.045946,0.155929,0.069242,0.011661,0.0,-0.014758,0.009461,0,2,5.0,753,359,3,2,0,-0.001328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,6.625392,-1.328021,0,0,6.856783,8.652981,0.360821,2015-06-10,2015,2015-12-31,0.9700,0.7800,0.1900,0.753333,
30112,00030710,2015-08-24,1606180,178698.0,2015-08-24,"AAC Holdings, Inc.",8060,2015-08-26,201508,2015.0,290.911,210.265,139.720,22.250,109.824,21.816,5.227,70.641,43.560,-0.043702,-0.186945,0.069242,0.011661,0.0,-0.014758,0.009461,1,1,1.0,429,219,32,2,0,-0.069930,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6.063785,-69.930070,2,0,6.856783,8.652981,0.360821,2015-08-10,2015,2015-12-31,0.9700,0.7800,0.1900,0.753333,
50891,00036020,1996-02-27,824142,21542.0,1996-03-04,AAON INC,3585,1996-03-20,199603,1996.0,36.501,33.936,14.972,5.625,14.450,6.122,0.714,10.217,5.250,-0.102249,-0.258375,0.043777,0.015382,0.0,0.001385,0.005044,1,1,1.0,887,323,14,2,0,-0.013529,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.788972,-13.528749,22,6,3.470117,2.224256,0.322106,,1996,1996-12-31,0.0307,0.0702,-0.0395,0.071073,0.045367
50892,00036020,1997-02-26,824142,21542.0,1997-03-06,AAON INC,3585,1997-03-12,199703,1997.0,34.553,35.569,16.387,6.937,15.640,6.129,0.091,8.976,4.875,0.094302,0.128583,0.065730,0.020130,0.0,0.001603,0.002720,0,1,1.0,379,182,0,0,0,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.940171,0.000000,14,8,3.397152,1.910414,0.254913,1996-02-27,1997,1997-12-31,0.0430,0.0439,-0.0009,0.045367,0.068925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24991,Y9369110,2007-08-10,1352341,166689.0,2007-08-22,Verigy Ltd.,3559,2007-08-22,200708,2007.0,771.000,714.000,498.000,22.990,479.000,59.650,0.000,0.000,24.460,0.039112,0.099563,0.035907,0.044818,0.0,0.002801,0.011957,0,1,2.0,293,155,0,1,0,0.003413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.683580,3.412969,12,12,7.285533,3.046010,0.000000,2007-08-08,2007,2007-10-31,1.7900,1.5750,0.2150,1.478500,1.590441
24992,Y9369110,2007-09-19,1352341,166689.0,2007-10-01,Verigy Ltd.,3559,2007-10-02,200710,2007.0,771.000,714.000,498.000,22.990,479.000,59.650,0.000,0.000,24.460,0.065650,0.098135,0.035907,0.044818,0.0,0.002801,0.011957,0,1,2.0,264,161,0,1,0,0.003788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.579730,3.787879,13,12,7.285533,3.046010,0.000000,2007-08-10,2007,2007-10-31,1.7900,1.5750,0.2150,1.478500,1.590441
24993,Y9369110,2007-11-29,1352341,166689.0,2007-12-03,Verigy Ltd.,3559,2007-12-06,200712,2008.0,787.000,771.000,533.000,20.880,498.000,59.705,0.000,0.000,22.990,-0.053090,-0.303970,0.040032,0.041505,0.0,0.000000,0.010772,1,1,3.0,853,344,7,6,0,-0.001172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6.749931,-1.172333,7,4,7.224475,2.756261,0.000000,2007-09-19,2008,2008-10-31,1.1900,1.4300,-0.2400,1.590441,-0.047253
24994,Y9369110,2008-05-27,1352341,166689.0,2008-06-05,Verigy Ltd.,3559,2008-06-09,200806,2008.0,831.000,819.000,561.000,22.230,551.000,60.022,0.000,0.000,21.370,0.017333,-0.108319,0.026767,0.021978,0.0,0.004884,0.013251,1,1,3.0,565,245,4,3,0,-0.001770,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.338594,-1.769912,13,9,7.156699,2.327895,0.000000,2007-11-29,2008,2008-10-31,1.1900,1.4300,-0.2400,1.590441,-0.047253


In [4]:
########################################################################################
############ Merge CRSP_COMP_EDGAR_IBES data with SEG data #############################
########################################################################################

########### Read SEG raw data file; only the columns named in seg_cols are read
seg_cols = ['gvkey', 'stype', 'sid', 'datadate', 'snms', 'cusip', 'cik']
seg = pd.read_csv('..\\filings\\compustat_seg.csv', usecols = seg_cols)
# The label used to read 'number of cusip-fpedats-analyst' (copied from the IBES cell),
# which mislabelled this count: these are rows of the raw segment file.
print('number of obs. in raw SEGMENT data: ' + str(seg.shape[0]))

### Reorder and rename SEG columns ('datadate' becomes 'date_comp')
seg = seg[['gvkey', 'cik', 'cusip', 'datadate', 'stype', 'sid', 'snms']]
seg = seg.rename(columns={'datadate': 'date_comp'})

### Parse date_comp from its YYYYMMDD representation into datetime
seg['date_comp'] = pd.to_datetime(seg['date_comp'], format='%Y%m%d')

### Number of non-null segment ids ('sid') per gvkey-date_comp-stype
seg_count = seg.groupby(['gvkey', 'date_comp', 'stype'])['sid'].count().to_frame()

### One row per gvkey-date_comp-stype, carrying that count in column 'sid'
seg = seg.drop_duplicates(subset=['gvkey', 'date_comp', 'stype'])[['gvkey', 'date_comp', 'stype']]
seg = seg.join(seg_count, on=['gvkey', 'date_comp', 'stype'])

### Split the counts by segment type: business (BUSSEG) and geographic (GEOSEG)
seg_bus = (seg.loc[seg['stype'] == 'BUSSEG']
              .drop(columns=['stype'])
              .rename(columns={'sid': 'nseg_bus'}))
seg_geo = (seg.loc[seg['stype'] == 'GEOSEG']
              .drop(columns=['stype'])
              .rename(columns={'sid': 'nseg_geo'}))

### One row per gvkey-date_comp with both counts attached side by side
seg = seg[['gvkey', 'date_comp']].drop_duplicates()
seg = seg.merge(seg_bus, how='left', on=['gvkey', 'date_comp'], validate='1:1')
seg = seg.merge(seg_geo, how='left', on=['gvkey', 'date_comp'], validate='1:1')
seg = seg.sort_values(by=['gvkey', 'date_comp'])

### A gvkey-date_comp without rows of a given type gets a count of 1; counts are stored as int
for count_col in ['nseg_geo', 'nseg_bus']:
    seg[count_col] = seg[count_col].fillna(1).astype(int)

### Merge key: calendar year of date_comp, as an integer; the full date is then dropped
seg['date_key'] = seg['date_comp'].dt.year.astype(int)
seg = seg.drop(columns=['date_comp'])

print('number of gvkey-datadate in SEGMENT data: ' + str(len(seg)))

number of cusip-fpedats-analyst: 452653
number of gvkey-datadate in SEGMENT data: 50876


In [5]:
##################### Left merge EDGAR_CRSP_COMP_IBES and SEG; the key is not unique in either dataset
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes.merge(
    seg, how='left', on=['gvkey', 'date_key'], validate='m:m'
)
print('Number of gvkey-quarter obs. after merging, left: '+ str(len(crsp_comp_edgar_ibes_seg)))
# crsp_comp_edgar_ibes_seg_inner = pd.merge(crsp_comp_edgar_ibes, seg, on = ['gvkey', 'date_key'], how='inner', validate = 'm:m')
# print('Number of gvkey-quarter obs. after merging, inner: '+ str(crsp_comp_edgar_ibes_seg_inner.shape[0]))

##################### Rows without a segment match get a segment count of 1
for seg_col in ['nseg_bus', 'nseg_geo']:
    crsp_comp_edgar_ibes_seg[seg_col] = crsp_comp_edgar_ibes_seg[seg_col].fillna(1)

####### Keep only the first row of every gvkey-cmonth pair
# NOTE(review): the merged preview printed earlier contains distinct filings that share a
# gvkey-cmonth (e.g. gvkey 178698 has two rows with cmonth 201508), so this step may remove
# more than the duplicates created by the m:m merges -- confirm that is intended.
is_repeat = crsp_comp_edgar_ibes_seg.duplicated(subset=['gvkey', 'cmonth'])
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[~is_repeat]

Number of gvkey-quarter obs. after merging, left: 68638


In [6]:
########################################################################################
############################### Variable Creation ######################################
########################################################################################

################### CRSP_COMP_EDGAR_IBES_SEG_LEFT ######################################

######## BUSSEG / GEOSEG: log(1 + number of business / geographic segments).
######## The segment counts were already set to 1 where no segment data matched.
for seg_var, count_col in [('BUSSEG', 'nseg_bus'), ('GEOSEG', 'nseg_geo')]:
    crsp_comp_edgar_ibes_seg[seg_var] = np.log(1 + crsp_comp_edgar_ibes_seg[count_col])

######## Both forecast variables below are deflated by prccq
######## (stock price per share at the end of the fiscal quarter).
price = crsp_comp_edgar_ibes_seg['prccq']

######## AFE, analyst forecast error: I/B/E/S actual earnings per share minus the
######## median analyst forecast ('afe'), deflated by price
crsp_comp_edgar_ibes_seg['AFE'] = crsp_comp_edgar_ibes_seg['afe'].div(price)

######## AF, analyst consensus forecast for one-year-ahead earnings per share
######## ('leap_consensus'), deflated by price
crsp_comp_edgar_ibes_seg['AF'] = crsp_comp_edgar_ibes_seg['leap_consensus'].div(price)

In [7]:
########################################################################################
########################## Variable Screening ##########################################
########################################################################################

########## Drop rows whose DEARN is positive or negative infinity
dearn_infinite = crsp_comp_edgar_ibes_seg['DEARN'].isin([np.inf, -np.inf])
del_DEARN = int(dearn_infinite.sum())
crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[~dearn_infinite]
print('number of files that have positive/negative infinity DEARN: ' + str(del_DEARN))


def _drop_missing(frame, col):
    """Report and remove the rows of `frame` in which `col` is null.

    Prints the number of affected rows, then returns a tuple
    (frame without those rows, number of rows removed).
    """
    is_missing = frame[col].isnull()
    n_missing = int(is_missing.sum())
    print('number of obs. that contain missing ' + col + ': ' + str(n_missing))
    return frame[~is_missing], n_missing


### Drop missing values one variable at a time. The order matters for the printed
### counts: each count is taken after the preceding variables have been screened.
### Drop missing AF
crsp_comp_edgar_ibes_seg, del_AF = _drop_missing(crsp_comp_edgar_ibes_seg, 'AF')
### Drop missing AFE
crsp_comp_edgar_ibes_seg, del_AFE = _drop_missing(crsp_comp_edgar_ibes_seg, 'AFE')
### Drop missing EARN
crsp_comp_edgar_ibes_seg, del_EARN = _drop_missing(crsp_comp_edgar_ibes_seg, 'EARN')
### Drop missing STD_EARN
crsp_comp_edgar_ibes_seg, del_STD_EARN = _drop_missing(crsp_comp_edgar_ibes_seg, 'STD_EARN')
### Drop missing DEARN (del_DEARN now holds the missing count, replacing the infinity count)
crsp_comp_edgar_ibes_seg, del_DEARN = _drop_missing(crsp_comp_edgar_ibes_seg, 'DEARN')

############## Winsorize ##############################
###### Define a function that winsorizes variables (by default at 1% and 99%)
def winsorize (df, colnames, lower=.01, upper=.99):
    """Return a copy of `df` in which each listed column is clipped to its own quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is NOT modified; the function works on a copy, so it can be
        called on a filtered slice without triggering SettingWithCopyWarning.
    colnames : iterable of str
        Columns to winsorize. Each column is treated independently.
    lower, upper : float, optional
        Quantile levels in [0, 1] used as the lower / upper clipping bounds.
        The defaults (.01 and .99) reproduce the original 1% / 99% winsorization.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with values below the `lower` quantile raised to it and values
        above the `upper` quantile lowered to it. Missing values stay missing.

    Raises
    ------
    ValueError
        If the quantile levels are not ordered as 0 <= lower <= upper <= 1.
    """
    if not 0 <= lower <= upper <= 1:
        raise ValueError('winsorize requires 0 <= lower <= upper <= 1')

    winsorized = df.copy()
    for col in colnames:
        # Both bounds come from the un-clipped column, in a single quantile call
        low_bound, high_bound = winsorized[col].quantile([lower, upper])
        winsorized[col] = winsorized[col].clip(lower=low_bound, upper=high_bound)
    return winsorized

############## Winsorize the continuous analyst, segment, earnings and volatility variables
winsorized_cols = ['AF', 'AFE', 'BUSSEG', 'GEOSEG', 'EARN', 'DEARN', 'STD_EARN', 'STD_RET']
crsp_comp_edgar_ibes_seg = winsorize(crsp_comp_edgar_ibes_seg, winsorized_cols)

############## Report the sample size left after variable screening
print('Number of firm-quarters after variable screening: ' + str(len(crsp_comp_edgar_ibes_seg)))

number of files that have positive/negative infinity DEARN: 0
number of obs. that contain missing AF: 9574
number of obs. that contain missing AFE: 0
number of obs. that contain missing EARN: 3
number of obs. that contain missing STD_EARN: 29
number of obs. that contain missing DEARN: 0
Number of firm-quarters after variable screening: 56245


In [8]:
# ######## Create ABTONE: residual from TONE regression 
# y, X = ps.dmatrices('TONE ~ 1 + EARN + RET + SIZE + MTB + STD_RET + STD_EARN + AGE + BUSSEG + GEOSEG + LOSS + DEARN + AFE + AF', \
#                     data = crsp_comp_edgar_ibes_seg, return_type = 'dataframe')
# model = sm.OLS(y, X)
# res = model.fit()
# # res.summary()
# crsp_comp_edgar_ibes_seg['ABTONE'] = res.resid

# ######## Create abtone for REPLICATION: residual from tone regression 
# y, X = ps.dmatrices('tone ~ 1 + EARN + RET + SIZE + MTB + STD_RET + STD_EARN + AGE + BUSSEG + GEOSEG + LOSS + DEARN + AFE + AF', \
#                     data = crsp_comp_edgar_ibes_seg, return_type = 'dataframe')
# model = sm.OLS(y, X)
# res = model.fit()
# # res.summary()
# crsp_comp_edgar_ibes_seg['abtone'] = res.resid

############## Write the screened CRSP_COMP_EDGAR_IBES_SEG sample to csv (row index not written)
crsp_comp_edgar_ibes_seg.to_csv('..\\filings\\crsp_comp_edgar_ibes_seg_8-K.csv', index=False)

# Cell output: show the saved frame
crsp_comp_edgar_ibes_seg

Unnamed: 0,cusip,date_crsp,cik,gvkey,rp,name,SIC,fd,cmonth,fyearq,atq,lag_atq,ceqq,prccq,lag_ceqq,lag_cshoq,lag_dlcq,lag_dlttq,lag_prccq,RET,DRET,STD_RET,EARN,LOSS,DEARN,STD_EARN,BN,n8k,nitem,nw,nvocab,n_neg,n_pos,n_negation,tone,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_1.01,item_1.02,item_1.03,item_1.04,item_2.01,item_2.02,item_2.03,item_2.04,item_2.05,item_2.06,item_3.01,item_3.02,item_3.03,item_4.01,item_4.02,item_5.01,item_5.02,item_5.03,item_5.04,item_5.05,item_5.06,item_5.07,item_5.08,item_6.01,item_6.02,item_6.03,item_6.04,item_6.05,item_7.01,item_8.01,item_9.01,NW,TONE,TLAG,RLAG,SIZE,MTB,LEV,lag_date_crsp,date_key,fpedats,actual,median,afe,consensus,leap_consensus,nseg_bus,nseg_geo,BUSSEG,GEOSEG,AFE,AF
0,59100210,1999-04-15,1000015,61641.0,1999-04-15,META GROUP INC,8700,1999-04-15,199904,1999.0,91.403,109.766,54.284,15.375,74.721,10.932,0.000,0.000,15.375,-0.330093,-0.468600,0.075509,0.018658,0.0,0.006733,0.007034,1,1,2.0,793,391,6,6,0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.677083,0.000000,0,0,5.124437,2.249428,0.000000,,1999,1999-12-31,0.79,0.84665,-0.05665,0.838958,0.964103,1.0,1.0,0.693147,0.693147,-0.003685,0.062706
1,59100210,2000-12-06,1000015,61641.0,2000-12-11,META GROUP INC,8700,2000-12-13,200012,2001.0,135.015,142.577,52.786,2.190,66.053,12.830,16.122,5.778,2.000,-0.059721,-0.200798,0.080060,-0.091361,1.0,-0.065677,0.041664,1,1,2.0,276,153,0,0,0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.624018,0.000000,7,5,3.244933,0.388476,0.153601,1999-04-15,2001,2001-12-31,-0.32,1.21000,-1.53000,0.807500,-0.175000,1.0,1.0,0.693147,0.693147,-0.489510,-0.079909
2,59100210,2001-04-05,1000015,61641.0,2001-04-05,META GROUP INC,8700,2001-04-10,200104,2001.0,135.015,142.577,52.786,2.190,66.053,12.830,16.122,5.778,2.000,0.216629,0.472604,0.080060,-0.091361,1.0,-0.065677,0.041664,0,1,2.0,201,123,1,0,0,-0.004975,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.308268,-4.975124,5,0,3.244933,0.388476,0.153601,2000-12-06,2001,2001-12-31,-0.32,1.21000,-1.53000,0.807500,-0.175000,1.0,1.0,0.693147,0.693147,-0.489510,-0.079909
3,59100210,2001-05-22,1000015,61641.0,2001-06-15,META GROUP INC,8700,2001-06-20,200106,2001.0,135.015,142.577,52.786,2.190,66.053,12.830,16.122,5.778,2.000,-0.042545,-0.248976,0.080060,-0.091361,1.0,-0.065677,0.041664,1,1,2.0,1009,433,11,15,1,0.002973,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.917706,2.973241,29,24,3.244933,0.388476,0.153601,2001-04-05,2001,2001-12-31,-0.32,1.21000,-1.53000,0.807500,-0.175000,1.0,1.0,0.693147,0.693147,-0.489510,-0.079909
8,80004C10,2001-12-13,1000180,61513.0,2001-12-17,SANDISK CORP,3572,2001-12-19,200112,2001.0,932.348,724.834,675.379,14.400,598.818,68.289,0.000,0.000,9.860,-0.073572,-0.195703,0.057305,0.035357,0.0,0.190477,0.100249,1,1,1.0,9456,2117,279,93,15,-0.021256,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.154510,-21.256345,6,4,6.512235,1.124431,0.000000,,2001,2001-12-31,-0.34,0.50000,-0.84000,0.272634,-0.008545,1.0,1.0,0.693147,0.693147,-0.058333,-0.000593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68628,06780610,2013-02-26,9984,2049.0,2013-02-27,BARNES GROUP INC,3490,2013-02-27,201302,2013.0,1779.174,1734.970,1016.546,34.920,974.990,52.402,67.140,177.242,29.990,0.005832,0.059853,0.013956,0.012323,0.0,0.007023,0.003645,0,1,1.0,1209,358,19,15,0,-0.003309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.098376,-3.308519,1,1,7.359809,1.611848,0.140857,2012-08-06,2013,2013-12-31,1.83,2.03500,-0.20500,2.038276,2.285692,9.0,9.0,2.302585,2.302585,-0.005871,0.065455
68629,06780610,2013-03-12,9984,2049.0,2013-04-22,BARNES GROUP INC,3490,2013-04-22,201304,2013.0,1779.174,1734.970,1016.546,34.920,974.990,52.402,67.140,177.242,29.990,-0.004298,-0.039333,0.013956,0.012323,0.0,0.007023,0.003645,1,1,4.0,137,92,0,1,0,0.007299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,4.927254,7.299270,41,41,7.359809,1.611848,0.140857,2013-02-26,2013,2013-12-31,1.83,2.03500,-0.20500,2.038276,2.285692,9.0,9.0,2.302585,2.302585,-0.005871,0.065455
68630,06780610,2013-10-25,9984,2049.0,2013-10-25,BARNES GROUP INC,3490,2013-10-25,201310,2013.0,2123.673,1779.174,1141.414,38.310,1016.546,52.744,62.533,285.600,34.920,-0.061480,-0.054819,0.011304,0.014789,0.0,0.002773,0.004217,1,1,2.0,140,94,0,1,0,0.007143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.948760,7.142857,0,0,7.518510,1.811842,0.195671,2013-03-12,2013,2013-12-31,1.83,2.03500,-0.20500,2.038276,2.285692,9.0,9.0,2.302585,2.302585,-0.005351,0.059663
68632,06780610,2014-07-29,9984,2049.0,2014-09-10,BARNES GROUP INC,3490,2014-09-16,201409,2014.0,2155.961,2214.624,1155.671,30.350,1173.699,54.261,68.971,501.945,38.540,-0.018778,-0.047415,0.011815,0.015489,0.0,0.001839,0.001950,1,1,1.0,207,126,3,2,0,-0.004831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.337538,-4.830918,49,43,7.645502,1.781734,0.257794,2013-10-28,2014,2014-12-31,2.34,2.29000,0.05000,2.285692,2.504462,9.0,9.0,2.302585,2.302585,0.001647,0.082519


In [9]:
# ####################################################################################################################
# ############# Create COMP_DA: if not replicating TABLE 4, DO NOT run because cause large sample reduction ##########
# ####################################################################################################################

# ### Drop missing or infinite LAG_TA_REV, DSAR and PPE
# crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg.loc[(crsp_comp_edgar_ibes_seg['LAG_TA_REV'] != np.inf) & \
#                                                         (crsp_comp_edgar_ibes_seg['LAG_TA_REV'].isnull() == False) & \
#                                                         (crsp_comp_edgar_ibes_seg['LAG_TA_REV'] != -np.inf)]
# crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg.loc[(crsp_comp_edgar_ibes_seg['DSAR'] != np.inf) & \
#                                                         (crsp_comp_edgar_ibes_seg['DSAR'].isnull() == False) & \
#                                                         (crsp_comp_edgar_ibes_seg['DSAR'] != -np.inf)]
# crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg.loc[(crsp_comp_edgar_ibes_seg['PPE'] != np.inf) & \
#                                                         (crsp_comp_edgar_ibes_seg['PPE'].isnull() == False) & \
#                                                         (crsp_comp_edgar_ibes_seg['PPE'] != -np.inf)]
# print('number of obs. after deleting missing or infinite LAG_TA_REV, DSAR and PPE: ' + str(crsp_comp_edgar_ibes_seg.shape[0]))

# ############## Winsorize DA variables
# crsp_comp_edgar_ibes_seg = winsorize(crsp_comp_edgar_ibes_seg, ['TACC', 'LAG_TA_REV', 'DSAR', 'PPE'])

# ######## Create DA: residual from TACC regression by each two-digit SIC-year
# def TACC_reg(data):
#     try:
#         y, X = ps.dmatrices('TACC ~ 1 + LAG_TA_REV + DSAR + PPE', data = data, return_type = 'dataframe')
#         model = sm.OLS(y, X)
#         res = model.fit()
#         data['DA'] = res.resid
#         return data
#     except:
#         pass

# crsp_comp_edgar_ibes_seg1 = crsp_comp_edgar_ibes_seg.groupby(['SIC2','fyearq']).apply(TACC_reg)
# # comp_DA['DA'].describe()

# ####### Join DA to COMP
# crsp_comp_edgar_ibes_seg = pd.merge(crsp_comp_edgar_ibes_seg, crsp_comp_edgar_ibes_seg1[['cik', 'rp', 'DA']], on = ['cik', 'rp'], how='left', validate = '1:1')

# crsp_comp_edgar_ibes_seg

# ############## Dropping missing DA to replicate Huang et al. 2014 TABLE 4, \
# del_DA = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['DA'].isnull()].shape[0]
# crsp_comp_edgar_ibes_seg = crsp_comp_edgar_ibes_seg[crsp_comp_edgar_ibes_seg['DA'].isnull() == False]
# print('number of obs. that contain missing DA: ' + str(del_DA))

# ############## Inspect sample size after variable screening
# print('Number of firm-quarters after dropping missing DA: ' + str(crsp_comp_edgar_ibes_seg.shape[0]))

# ############## Save merged ID_CRSP_COMP_TEXT to csv file
# crsp_comp_edgar_ibes_seg.to_csv('..\\filings\\crsp_comp_edgar_ibes_seg_DA_10-Q.csv', index = 0)

In [11]:
########################################################################################
########### Table 2 - Panel A: Summary Statistics and Correlation Matrix ###############
########################################################################################

############# Table 2 Panel A: Summary statistics for selected variables
######### Variable groups:
# 1st line: textual variables, generally consistent with LM's summary statistics
# 2nd line: fundamental variables (main)
# 3rd line: analyst-forecast, segment, earnings and volatility variables
#           ('DA' is left out; the cell that would create it is commented out)
selected_vars = crsp_comp_edgar_ibes_seg[['NW', 'nw', 'TONE', 'TLAG',
                                          'RET', 'BN', 'SIZE', 'MTB', 'LEV',
                                          'AF', 'AFE', 'BUSSEG', 'GEOSEG', 'EARN', 'DEARN', 'STD_EARN', 'STD_RET', 'LOSS',
                                          # 'DA'
                                          ]]

# One row per variable: count, mean, std, min, quartiles, max
T2PD = selected_vars.describe().transpose()

############ count number of unique firms (distinct cik values)
# The label used to say '10-Q sample'; this notebook reads and writes the 8-K files.
print('Number of unique firms in final 8-K sample: ' + str(crsp_comp_edgar_ibes_seg['cik'].unique().shape[0]))

############# Summary statistics for all raw and processed variables
full_summary = crsp_comp_edgar_ibes_seg.describe().transpose()

############# Save T2PD to sheet 'T2PD_raw' of the tables workbook
# NOTE(review): assigning writer.book / writer.sheets and calling writer.save() relies on
# an older pandas ExcelWriter API -- confirm the pandas version this notebook is pinned to.
table_path = '..\\output\\Tables.xlsx'
if not os.path.exists(table_path):
    # No workbook yet: create it with this single sheet
    T2PD.to_excel(table_path, sheet_name='T2PD_raw', float_format="%.4f")
else:
    # Workbook exists: load it and attach it (and its sheets) to the writer before
    # writing -- presumably so that the sheets already in the file are kept.
    book = load_workbook(table_path)
    writer = pd.ExcelWriter(table_path, engine = 'openpyxl')
    writer.book = book
    writer.sheets = {ws.title: ws for ws in book.worksheets}
    T2PD.to_excel(writer, sheet_name='T2PD_raw', float_format="%.4f")
    writer.save()
    writer.close()

T2PD

Number of unique firms in final 10-Q sample: 4810


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
NW,56245.0,6.054312,0.93938,4.89784,5.509388,5.783825,6.309918,12.486371
nw,56245.0,1369.27974,6775.12045,133.0,246.0,324.0,549.0,264704.0
TONE,56245.0,-0.456177,7.473017,-97.850863,-2.853067,0.0,3.816794,45.070423
TLAG,56245.0,15.523424,17.67333,0.0,2.0,9.0,22.0,93.0
RET,56245.0,0.001976,0.07811,-0.806857,-0.034195,-0.003005,0.037667,2.24474
BN,56245.0,0.540848,0.498333,0.0,0.0,1.0,1.0,1.0
SIZE,56245.0,6.734119,1.745431,2.122387,5.497065,6.612832,7.853723,11.379219
MTB,56245.0,3.547457,4.236065,0.123438,1.460405,2.334456,3.89133,33.390382
LEV,56245.0,0.198029,0.186825,0.0,0.011474,0.168023,0.31966,0.750668
AF,56245.0,0.042911,0.070738,-0.294933,0.025453,0.051698,0.074539,0.222972


In [12]:
# full_summary

In [13]:
############# Table 3 Panel B: Correlation matrix for selected variables
# NOTE(review): the header says Table 3 Panel B while the objects are named T2PD_* -- confirm the label.
######### Pearson correlation over selected_vars, leaving out nw, BUSSEG, GEOSEG and LOSS
not_in_pearson = ['nw', 'BUSSEG', 'GEOSEG', 'LOSS']
keep_for_pearson = ~selected_vars.columns.isin(not_in_pearson)
T2PD_pearson = selected_vars.loc[:, keep_for_pearson].corr(method='pearson')

# T3PB_pearson

In [14]:
######### Spearman (rank) correlation over selected_vars, leaving out nw, BUSSEG, GEOSEG and LOSS
not_in_spearman = ['nw', 'BUSSEG', 'GEOSEG', 'LOSS']
keep_for_spearman = ~selected_vars.columns.isin(not_in_spearman)
T2PD_spearman = selected_vars.loc[:, keep_for_spearman].corr(method='spearman')

# T3PB_spearman

In [14]:
######### Combine the two correlation matrices inside T2PD_spearman:
######### cells above the diagonal take the Pearson values, the diagonal and below stay Spearman
n_corr_vars = len(T2PD_spearman.index)
for i in range(n_corr_vars):
    right_of_diagonal = slice(i + 1, None)
    T2PD_spearman.iloc[i, right_of_diagonal] = T2PD_pearson.iloc[i, right_of_diagonal]
    
##### Save the combined correlation matrix (upper triangle Pearson, lower triangle Spearman)
# The summary-statistics cell writes T2PD to sheet 'T2PD_raw' of this same workbook.
# Writing the correlation matrix to that sheet as well put it on top of the summary
# table in the same cell range, so it now goes to a sheet of its own.
# NOTE(review): sheet name chosen here -- rename if the tables workbook expects another one.
# NOTE(review): assigning writer.book / writer.sheets and calling writer.save() relies on
# an older pandas ExcelWriter API -- confirm the pandas version this notebook is pinned to.
table_path = '..\\output\\Tables.xlsx'
corr_sheet = 'T2PD_corr_raw'
if os.path.exists(table_path):
    # Workbook exists: load it and attach it (and its sheets) to the writer before writing
    book = load_workbook(table_path)
    writer = pd.ExcelWriter(table_path, engine = 'openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    T2PD_spearman.to_excel(writer, sheet_name=corr_sheet, float_format="%.4f")

    writer.save()
    writer.close()

else:
    # No workbook yet: create it with this single sheet
    T2PD_spearman.to_excel(table_path, sheet_name=corr_sheet, float_format="%.4f")

T2PD_spearman

Unnamed: 0,NW,TONE,TLAG,READ,RET,NEG,SIZE,MTB,LEV,AF,AFE,AGE,EARN,DEARN,STD_EARN,STD_RET,ABTONE
NW,1.0,-0.455693,-0.192174,-0.082593,-0.006464,0.002245,0.255384,0.057453,0.036042,-0.067753,0.010805,-0.040214,-0.116293,0.001621,0.091258,-0.030267,-0.3849668
TONE,-0.481466,1.0,0.015539,0.086127,0.020357,-0.020807,-0.061743,-0.013969,0.071639,0.071545,0.101819,0.059052,0.157343,-0.002893,-0.14801,-0.088735,0.9561762
TLAG,-0.263261,0.021266,1.0,0.047966,-0.02221,0.034012,-0.330778,-0.02175,0.009432,-0.09225,-0.127343,-0.228377,-0.136763,-0.00455,0.121034,0.189194,0.01693974
READ,-0.251936,0.169254,0.124632,1.0,-0.016437,0.015696,-0.014264,-0.035026,0.062856,0.045042,0.002045,0.088265,0.059172,0.002208,-0.047387,-0.050854,0.06109228
RET,-0.006911,0.028302,-0.0323,-0.028803,1.0,-0.684122,-0.064464,-0.02581,0.001968,-0.018165,0.154627,0.001703,0.063448,0.036141,0.0108,0.266061,-6.976698000000001e-17
NEG,0.003372,-0.023506,0.032701,0.028259,-0.865519,1.0,9.8e-05,0.012509,-0.001902,0.015156,-0.124063,-0.017917,-0.071295,-0.018664,0.015664,-0.117616,0.0004217309
SIZE,0.263978,-0.046533,-0.332576,-0.077923,-0.024161,-0.001004,1.0,0.233536,0.099647,0.07719,0.269668,0.343981,0.258838,-0.023619,-0.197631,-0.310121,2.52273e-16
MTB,0.046309,0.039931,-0.042137,-0.025544,-0.054681,0.03294,0.382007,1.0,0.045733,-0.156078,0.120161,-0.088334,-0.041167,0.022203,0.158318,0.035832,5.055858e-16
LEV,0.013848,0.076414,-0.000416,0.075072,0.003009,-0.004073,0.142834,-0.111068,1.0,0.166881,-0.068197,0.101104,0.039389,0.033884,-0.124458,-0.072001,0.06996245
AF,-0.018333,0.061766,-0.124698,0.035299,-0.086554,0.071511,0.025571,-0.299097,0.250819,1.0,0.057264,0.202124,0.472273,0.015995,-0.256432,-0.145248,1.843502e-17
