In [1]:
############### Import packages
import os, numpy as np, pandas as pd, time, glob, re, math, statsmodels.api as sm, patsy as ps
from tqdm import tqdm
from time import process_time
from datetime import datetime
from datetime import date
from openpyxl import load_workbook
from patsy import dmatrices

############### Make the project's code folder the working directory
# The relative paths used by later cells ('..\\filings\\...', '..\\output\\...') are resolved from here.
# NOTE(review): this is a machine-specific absolute path; adjust it when running on another machine.
code_dir = 'F:\\github\\narrative_conservatism\\code'
if os.getcwd() != code_dir:
    os.chdir(code_dir)

############### Lift the cap on how many columns pandas prints
pd.set_option('display.max_columns', None)

In [2]:
########################################################################################
############ Merge IBES quarterly data with CRSP_COMP_EDGAR data #######################
########################################################################################

########### Load the IBES raw file, keeping only the listed columns
ibes_cols = ['cusip8', 'cname', 'fpi', 'value', 'fpedats', 'anndats', 'actual', 'anndats_act']
ibes = pd.read_csv('..\\filings\\ibes.csv', usecols = ibes_cols)
print('number of cusip-fpedats-analyst: ' + str(ibes.shape[0]))

### Fix the column order and rename cusip8 -> cusip
ibes = ibes[['cusip', 'cname', 'fpedats', 'value', 'actual', 'anndats_act', 'anndats', 'fpi'][:0]
            + ['cusip8', 'cname', 'fpedats', 'value', 'actual', 'anndats_act', 'anndats', 'fpi']]
ibes = ibes.rename(columns={'cusip8': 'cusip'})

### Convert fpedats to datetimes: each lower-case month abbreviation is swapped for its
### two-digit number so the text matches the day-month-year format parsed afterwards
month_abbreviations = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
for month_number, month_abbreviation in enumerate(month_abbreviations, start=1):
    ibes['fpedats'] = ibes['fpedats'].str.replace(month_abbreviation, '%02d' % month_number)
ibes['fpedats'] = pd.to_datetime(ibes['fpedats'], format='%d%m%Y')

print(ibes['fpedats'].describe())

### Remove rows without a cusip, then rows without an actual, and report how many were removed
cusip_missing = ibes['cusip'].isnull()
del_cusip = int(cusip_missing.sum())
ibes = ibes[~cusip_missing]
print('number of obs. that contain missing cusip: ' + str(del_cusip))

actual_missing = ibes['actual'].isnull()
del_actual = int(actual_missing.sum())
ibes = ibes[~actual_missing]
print('number of obs. that contain missing actual: ' + str(del_actual))

### Aggregate by cusip-fpedats:
###   actual         - median of actual
###   median         - median of the forecast values
###   afe            - actual minus median
###   consensus      - mean of the forecast values
###   leap_consensus - consensus of the same cusip's next fpedats row
by_cusip_period = ibes.groupby(['cusip', 'fpedats'])
ibes_css = by_cusip_period['actual'].median().to_frame()
ibes_css['median'] = by_cusip_period['value'].median()
ibes_css['afe'] = ibes_css['actual'] - ibes_css['median']
ibes_css['consensus'] = by_cusip_period['value'].mean()
ibes_css['leap_consensus'] = ibes_css.groupby(level='cusip')['consensus'].shift(-1)

### Reduce the raw rows to one per cusip-fpedats and attach the aggregated columns
ibes = ibes[['cusip', 'fpedats']].drop_duplicates()
ibes = ibes.join(ibes_css, on=['cusip', 'fpedats'])

### Merge key date_key: the year of fpedats, taken as the 'YYYY' prefix of its date string
fpedats_text = ibes['fpedats'].astype(str)
ibes['date_key'] = fpedats_text.str[:-6].astype(int)

print('number of cusip-fpedats, dropping missing cusip and actual: ' + str(ibes.shape[0]))

number of cusip-fpedats-analyst: 9812071
count                 9812071
unique                    446
top       2015-12-31 00:00:00
freq                   426980
first     1981-12-31 00:00:00
last      2019-07-31 00:00:00
Name: fpedats, dtype: object
number of obs. that contain missing cusip: 60781
number of obs. that contain missing actual: 1305946
number of cusip-fpedats, dropping missing cusip and actual: 155539


In [3]:
##################### Read EDGAR_CRSP_COMP and create the merge key date_key from fyearq
crsp_comp_edgar = pd.read_csv('..\\filings\\id_crsp_comp_text_10-Q.csv')
crsp_comp_edgar['date_key'] = crsp_comp_edgar['fyearq'].astype(int)

##################### Inner-join EDGAR_CRSP_COMP with IBES on cusip and date_key; the key repeats on both sides
# NOTE(review): date_key is fyearq on this side and the year of fpedats on the IBES side - confirm they denote the same period.
crsp_comp_edgar_ibes = crsp_comp_edgar.merge(ibes, how='inner', on=['cusip', 'date_key'], validate='m:m')
print('number of cusip-date after merging: ' + str(crsp_comp_edgar_ibes.shape[0]))

number of cusip-date after merging: 110095


In [4]:
########################################################################################
############ Merge CRSP_COMP_EDGAR_IBES data with SEG data #############################
########################################################################################

########### Load the SEG raw file, keeping only the listed columns
seg_cols = ['gvkey', 'stype', 'sid', 'datadate', 'snms', 'cusip', 'cik']
seg = pd.read_csv('..\\filings\\compustat_seg.csv', usecols = seg_cols)
# NOTE(review): the label printed here is the same text used for the IBES row count; the number is the SEG row count.
print('number of cusip-fpedats-analyst: ' + str(seg.shape[0]))

### Fix the SEG column order and rename datadate -> date_comp
seg = seg[['gvkey', 'cik', 'cusip', 'datadate', 'stype', 'sid', 'snms']].rename(columns={'datadate': 'date_comp'})

### Parse date_comp from YYYYMMDD
seg['date_comp'] = pd.to_datetime(seg['date_comp'], format='%Y%m%d')

### Count the sid entries of every gvkey-date_comp-stype combination
segment_key = ['gvkey', 'date_comp', 'stype']
seg_count = seg.groupby(segment_key)['sid'].count().to_frame()

### One row per gvkey-date_comp-stype, carrying that count in the sid column
seg = seg[segment_key].drop_duplicates().join(seg_count, on=segment_key)

### Split the counts by segment type: BUSSEG -> nseg_bus, GEOSEG -> nseg_geo
seg_bus = seg[seg['stype']=='BUSSEG'].drop(columns=['stype']).rename(columns={'sid': 'nseg_bus'})
seg_geo = seg[seg['stype']=='GEOSEG'].drop(columns=['stype']).rename(columns={'sid': 'nseg_geo'})

### One row per gvkey-date_comp with both counts attached, ordered by gvkey and date
firm_date_key = ['gvkey', 'date_comp']
seg = seg[firm_date_key].drop_duplicates()
seg = seg.merge(seg_bus, on=firm_date_key, how='left', validate='1:1')
seg = seg.merge(seg_geo, on=firm_date_key, how='left', validate='1:1')
seg = seg.sort_values(by=firm_date_key)

### A missing count is set to 1; both counts are then stored as integers
seg = seg.fillna({'nseg_geo': 1, 'nseg_bus': 1})
seg = seg.astype({'nseg_bus': int, 'nseg_geo': int})

### Merge key date_key: the year of date_comp ('YYYY' prefix of its date string); date_comp itself is dropped
seg['date_key'] = seg['date_comp'].astype(str).str[:-6].astype(int)
seg = seg.drop(columns=['date_comp'])

print('number of gvkey-datadate in SEGMENT data: ' + str(seg.shape[0]))

number of cusip-fpedats-analyst: 452653
number of gvkey-datadate in SEGMENT data: 50876


In [11]:
##################### Left-join SEG onto EDGAR_CRSP_COMP_IBES by gvkey and date_key; the key repeats on both sides
crsp_comp_edgar_ibes_seg_left = crsp_comp_edgar_ibes.merge(seg, how='left', on=['gvkey', 'date_key'], validate='m:m')
print('Number of gvkey-quarter obs. after merging, left: '+ str(crsp_comp_edgar_ibes_seg_left.shape[0]))
# crsp_comp_edgar_ibes_seg_inner = pd.merge(crsp_comp_edgar_ibes, seg, on = ['gvkey', 'date_key'], how='inner', validate = 'm:m')
# print('Number of gvkey-quarter obs. after merging, inner: '+ str(crsp_comp_edgar_ibes_seg_inner.shape[0]))

##################### Rows left without a segment count by the join get a count of 1
for segment_count_col in ['nseg_bus', 'nseg_geo']:
    crsp_comp_edgar_ibes_seg_left[segment_count_col] = crsp_comp_edgar_ibes_seg_left[segment_count_col].fillna(1)

####### Keep only the first row of each gvkey-cquarter
crsp_comp_edgar_ibes_seg_left = crsp_comp_edgar_ibes_seg_left.drop_duplicates(subset=['gvkey', 'cquarter'])

Number of gvkey-quarter obs. after merging, left: 110114


In [12]:
########################################################################################
############################### Variable Creation ######################################
########################################################################################

################### CRSP_COMP_EDGAR_IBES_SEG_LEFT ######################################

######## Log-transformed variables, each computed as log(1 + raw column):
########   AGE    from age (age from the first year the firm entered the CRSP dataset)
########   BUSSEG from nseg_bus (number of business segments; set to 1 earlier when missing)
########   GEOSEG from nseg_geo (number of geographic segments; set to 1 earlier when missing)
for log_col, raw_col in [('AGE', 'age'), ('BUSSEG', 'nseg_bus'), ('GEOSEG', 'nseg_geo')]:
    crsp_comp_edgar_ibes_seg_left[log_col] = np.log(1 + crsp_comp_edgar_ibes_seg_left[raw_col])

######## Price-scaled variables, each divided by prccq (stock price per share at the end of the fiscal quarter):
########   AFE, analyst forecast error: afe (I/B/E/S actual minus the median analyst forecast)
########   AF, analyst consensus forecast: leap_consensus (the consensus of the following IBES period)
for scaled_col, raw_col in [('AFE', 'afe'), ('AF', 'leap_consensus')]:
    crsp_comp_edgar_ibes_seg_left[scaled_col] = crsp_comp_edgar_ibes_seg_left[raw_col] / crsp_comp_edgar_ibes_seg_left['prccq']

# ################### CRSP_COMP_EDGAR_IBES_SEG_INNER ######################################

# ######## AGE: log(1 + age from the first year the firm entered the CRSP dataset)
# crsp_comp_edgar_ibes_seg_inner['AGE'] = np.log(1 + crsp_comp_edgar_ibes_seg_inner['age'])
# ######## BUSSEG: log(1 + number of business segments), or 1 if item is missing from Compustat; and
# ######## GEOSEG: log(1 + number of geographic segments), or 1 if item is missing from Compustat.
# crsp_comp_edgar_ibes_seg_inner['BUSSEG'] = np.log(1 + crsp_comp_edgar_ibes_seg_inner['nseg_bus'])
# crsp_comp_edgar_ibes_seg_inner['GEOSEG'] = np.log(1 + crsp_comp_edgar_ibes_seg_inner['nseg_geo'])
# ######## AFE, analyst forecast error, \
# ######## defined as I/B/E/S earnings per share minus the median of the most recent analysts’ forecasts, \
# ######## deflated by stock price per share at the end of the fiscal quarter
# crsp_comp_edgar_ibes_seg_inner['AFE'] = crsp_comp_edgar_ibes_seg_inner['afe']/crsp_comp_edgar_ibes_seg_inner['prccq']
# ######## AF, analyst consensus forecast for one-year-ahead earnings per share, scaled by stock price per share at the end of the fiscal quarter
# crsp_comp_edgar_ibes_seg_inner['AF'] = crsp_comp_edgar_ibes_seg_inner['leap_consensus']/crsp_comp_edgar_ibes_seg_inner['prccq']

In [13]:
########################################################################################
########################## Variable Screening (LEFT) ###################################
########################################################################################

########## Drop files (firm-quarter) whose DEARN is positive or negative infinity
dearn_is_infinite = crsp_comp_edgar_ibes_seg_left['DEARN'].isin([np.inf, -np.inf])
del_DEARN = int(dearn_is_infinite.sum())
crsp_comp_edgar_ibes_seg_left = crsp_comp_edgar_ibes_seg_left[~dearn_is_infinite]
print('number of files that have positive/negative infinity DEARN: ' + str(del_DEARN))

### Drop rows with a missing value, one variable at a time in this order: AF, AFE, EARN, STD_EARN, DEARN.
### Each count is taken after the earlier drops have been applied.
missing_counts = {}
for screened_col in ['AF', 'AFE', 'EARN', 'STD_EARN', 'DEARN']:
    is_missing = crsp_comp_edgar_ibes_seg_left[screened_col].isnull()
    missing_counts[screened_col] = int(is_missing.sum())
    crsp_comp_edgar_ibes_seg_left = crsp_comp_edgar_ibes_seg_left[~is_missing]
    print('number of obs. that contain missing ' + screened_col + ': ' + str(missing_counts[screened_col]))

### Keep the counts available under the del_* names; del_DEARN now holds the missing-DEARN count
del_AF = missing_counts['AF']
del_AFE = missing_counts['AFE']
del_EARN = missing_counts['EARN']
del_STD_EARN = missing_counts['STD_EARN']
del_DEARN = missing_counts['DEARN']

############## Winsorize ##############################
###### Define a function that winsorize a variable at 1% and 99% 
def winsorize(df, colnames, lower=.01, upper=.99):
    """Winsorize columns of ``df`` by clipping each one to its own quantiles.

    For every column in ``colnames`` the ``lower`` and ``upper`` quantiles of
    that column are computed, and values outside that range are replaced by
    the nearest of the two cutoffs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns. It is modified in place: each listed
        column is overwritten with its clipped version.
    colnames : str or iterable of str
        Column name(s) to winsorize. A single name may be passed as a string.
    lower, upper : float, optional
        Quantile cutoffs in [0, 1]. The defaults (0.01 and 0.99) winsorize at
        the 1st and 99th percentiles.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If the cutoffs do not satisfy ``0 <= lower <= upper <= 1``.
    """
    if not 0 <= lower <= upper <= 1:
        raise ValueError('winsorize cutoffs must satisfy 0 <= lower <= upper <= 1')

    # A bare string would otherwise be iterated character by character.
    if isinstance(colnames, str):
        colnames = [colnames]

    for col in colnames:
        low_cut = df[col].quantile(lower)
        high_cut = df[col].quantile(upper)
        df[col] = df[col].clip(low_cut, high_cut)
    return df

############## Winsorize the TABLE 1 variables (first list) and the TABLE 4 variables (second list)
table1_vars = ['AF', 'AFE', 'BUSSEG','GEOSEG','AGE','EARN', 'DEARN', 'STD_EARN', 'STD_RET']
table4_vars = ['CFO', 'leap1_EARN', 'leap2_EARN', 'leap3_EARN', 'leap1_CFO', 'leap2_CFO', 'leap3_CFO']
crsp_comp_edgar_ibes_seg_left = winsorize(crsp_comp_edgar_ibes_seg_left, table1_vars + table4_vars)

############## Report the sample size left after variable screening
print('Number of firm-quarters after variable screening: ' + str(crsp_comp_edgar_ibes_seg_left.shape[0]))

number of files that have positive/negative infinity DEARN: 0
number of obs. that contain missing AF: 15228
number of obs. that contain missing AFE: 0
number of obs. that contain missing EARN: 64
number of obs. that contain missing STD_EARN: 3163
number of obs. that contain missing DEARN: 0
Number of firm-quarters after variable screening: 91606


In [14]:
############## Drop missing DA only when replicating Huang et al. 2014 TABLE 4; otherwise skip this step, because it causes a large sample reduction #######
### Remove rows whose DA is missing and report how many were removed
da_missing = crsp_comp_edgar_ibes_seg_left['DA'].isnull()
del_DA = int(da_missing.sum())
crsp_comp_edgar_ibes_seg_left = crsp_comp_edgar_ibes_seg_left[~da_missing]
print('number of obs. that contain missing DA: ' + str(del_DA))

############## Report the sample size left after dropping missing DA
print('Number of firm-quarters after dropping missing DA: ' + str(crsp_comp_edgar_ibes_seg_left.shape[0]))

number of obs. that contain missing DA: 38388
Number of firm-quarters after dropping missing DA: 53218


In [15]:
######## Abnormal tone: OLS residuals of a tone measure regressed on a constant and the controls below.
########   ABTONE - residual from the TONE regression
########   abtone - residual from the tone regression (for REPLICATION)
tone_controls = 'EARN + RET + SIZE + MTB + STD_RET + STD_EARN + AGE + BUSSEG + GEOSEG + LOSS + DEARN + AFE + AF'
for tone_col, resid_col in [('TONE', 'ABTONE'), ('tone', 'abtone')]:
    y, X = ps.dmatrices(tone_col + ' ~ 1 + ' + tone_controls, \
                        data = crsp_comp_edgar_ibes_seg_left, return_type = 'dataframe')
    model = sm.OLS(y, X)
    res = model.fit()
    # res.summary()
    # The residuals are assigned by index label, so they line up with the rows used in the fit.
    crsp_comp_edgar_ibes_seg_left[resid_col] = res.resid

############## Save the merged CRSP_COMP_EDGAR_IBES_SEG data to a csv file (without the index)
crsp_comp_edgar_ibes_seg_left.to_csv('..\\filings\\crsp_comp_edgar_ibes_seg_10-Q.csv', index = 0)

In [16]:
# ########################################################################################
# ########################## Variable Screening (INNER) ##################################
# ########################################################################################

# ########## Drop files (firm-quarter) that have positive/negative infinity DEARN
# del_DEARN = crsp_comp_edgar_ibes_seg_inner[(crsp_comp_edgar_ibes_seg_inner['DEARN']==np.inf) | (crsp_comp_edgar_ibes_seg_inner['DEARN']==-np.inf)].shape[0]
# crsp_comp_edgar_ibes_seg_inner = crsp_comp_edgar_ibes_seg_inner[(crsp_comp_edgar_ibes_seg_inner['DEARN']!=np.inf) & (crsp_comp_edgar_ibes_seg_inner['DEARN']!=-np.inf)]
# print('number of files that have positive/negative infinity DEARN: ' + str(del_DEARN))

# ### Drop missing AF
# del_AF = crsp_comp_edgar_ibes_seg_inner[crsp_comp_edgar_ibes_seg_inner['AF'].isnull()].shape[0]
# crsp_comp_edgar_ibes_seg_inner = crsp_comp_edgar_ibes_seg_inner[crsp_comp_edgar_ibes_seg_inner['AF'].isnull() == False]
# print('number of obs. that contain missing AF: ' + str(del_AF))

# ### Drop missing AFE
# del_AFE = crsp_comp_edgar_ibes_seg_inner[crsp_comp_edgar_ibes_seg_inner['AFE'].isnull()].shape[0]
# crsp_comp_edgar_ibes_seg_inner = crsp_comp_edgar_ibes_seg_inner[crsp_comp_edgar_ibes_seg_inner['AFE'].isnull() == False]
# print('number of obs. that contain missing AFE: ' + str(del_AFE))

# ### Drop missing EARN
# del_EARN = crsp_comp_edgar_ibes_seg_inner[crsp_comp_edgar_ibes_seg_inner['EARN'].isnull()].shape[0]
# crsp_comp_edgar_ibes_seg_inner = crsp_comp_edgar_ibes_seg_inner[crsp_comp_edgar_ibes_seg_inner['EARN'].isnull() == False]
# print('number of obs. that contain missing EARN: ' + str(del_EARN))

# ### Drop missing STD_EARN
# del_STD_EARN = crsp_comp_edgar_ibes_seg_inner[crsp_comp_edgar_ibes_seg_inner['STD_EARN'].isnull()].shape[0]
# crsp_comp_edgar_ibes_seg_inner = crsp_comp_edgar_ibes_seg_inner[crsp_comp_edgar_ibes_seg_inner['STD_EARN'].isnull() == False]
# print('number of obs. that contain missing STD_EARN: ' + str(del_STD_EARN))

# ### Drop missing DEARN
# del_DEARN = crsp_comp_edgar_ibes_seg_inner[crsp_comp_edgar_ibes_seg_inner['DEARN'].isnull()].shape[0]
# crsp_comp_edgar_ibes_seg_inner = crsp_comp_edgar_ibes_seg_inner[crsp_comp_edgar_ibes_seg_inner['DEARN'].isnull() == False]
# print('number of obs. that contain missing DEARN: ' + str(del_DEARN))

# ############## Inspect sample size after variable screening
# print('Number of firm-quarters after variable screening: ' + str(crsp_comp_edgar_ibes_seg_inner.shape[0]))

# ############## Winsorize SIZE, MTB, LEV
# ###### Define a function that winsorize a variable at 1% and 99% 
# def winsorize (df, colnames):
#     for col in colnames:
#         varq01 = df[col].quantile(.01)
#         varq99 = df[col].quantile(.99)
#         df[col] = df[col].clip(varq01, varq99)
#     return df

# crsp_comp_edgar_ibes_seg_inner = winsorize(crsp_comp_edgar_ibes_seg_inner, ['AF', 'AFE', 'BUSSEG','GEOSEG','AGE','EARN', 'DEARN', 'STD_EARN', 'STD_RET'])

In [17]:
# ######## Create ABTONE for INNER: residual from TONE regression
# y, X = ps.dmatrices('TONE ~ 1 + EARN + RET + SIZE + MTB + STD_RET + STD_EARN + AGE + BUSSEG + GEOSEG + LOSS + DEARN + AFE + AF', \
#                     data = crsp_comp_edgar_ibes_seg_inner, return_type = 'dataframe')
# model = sm.OLS(y,X)
# res = model.fit()
# # res.summary()
# crsp_comp_edgar_ibes_seg_inner['ABTONE'] = res.resid

# ############## Save merged ID_CRSP_COMP_TEXT to csv file
# crsp_comp_edgar_ibes_seg_inner.to_csv('..\\filings\\crsp_comp_edgar_ibes_seg_10-Q.csv', index = 0)

In [18]:
########################################################################################
############### Table 3: Summary Statistics and Correlation Matrix #####################
########################################################################################

############# Table 3 Panel A: Summary statistics for selected variables
######### The selected columns, in the order they appear in the table:
# text_vars: textual variables, generally consistent with LM's summary statistics
# fundamental_vars: fundamental variables (main)
# control_and_tone_vars: remaining regression variables, ABTONE and DA
text_vars = ['NW','nw', 'TONE','TLAG']
fundamental_vars = ['RET', 'NEG', 'SIZE', 'MTB', 'LEV']
control_and_tone_vars = ['AF', 'AFE', 'BUSSEG','GEOSEG','AGE','EARN', 'DEARN', 'STD_EARN', 'STD_RET', 'LOSS', 'ABTONE', 'DA']
selected_vars = crsp_comp_edgar_ibes_seg_left[text_vars + fundamental_vars + control_and_tone_vars]

# One row per variable, one column per statistic
T3PA = selected_vars.describe().T

############# Summary statistics for all raw and processed variables
full_summary = crsp_comp_edgar_ibes_seg_left.describe().transpose()

############# Save T3PA to the sheet 'T3PA_raw' of the tables workbook
table_path = '..\\output\\Tables.xlsx'
if not os.path.exists(table_path):
    # No workbook yet: to_excel creates the file
    T3PA.to_excel(table_path, sheet_name='T3PA_raw', float_format="%.4f")
else:
    # Workbook exists: load it and hand its sheets to the writer before writing this one
    # NOTE(review): assigning writer.book / writer.sheets and calling writer.save() depend on the
    # installed pandas ExcelWriter API - confirm against the pandas version in use before upgrading.
    book = load_workbook(table_path)
    writer = pd.ExcelWriter(table_path, engine = 'openpyxl')
    writer.book = book
    writer.sheets = {ws.title: ws for ws in book.worksheets}

    T3PA.to_excel(writer, sheet_name='T3PA_raw', float_format="%.4f")

    writer.save()
    writer.close()

T3PA

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
NW,53218.0,8.859646,0.803466,7.044033,8.249575,8.906529,9.423999,13.490002
nw,53218.0,9766.948,10329.650478,1145.0,3825.0,7379.0,12381.0,722159.0
TONE,53218.0,-7.820618,6.804595,-64.54289,-11.522656,-6.867647,-3.181965,22.28739
TLAG,53218.0,39.10487,6.411387,0.0,36.0,40.0,44.0,52.0
RET,53218.0,0.01518333,0.24825,-1.578704,-0.112544,0.004833,0.123663,4.849226
NEG,53218.0,0.4880304,0.499861,0.0,0.0,0.0,1.0,1.0
SIZE,53218.0,6.52238,1.859641,2.001575,5.165372,6.394151,7.713877,11.206073
MTB,53218.0,3.487086,3.989004,0.288135,1.492856,2.320074,3.856557,30.900821
LEV,53218.0,0.206296,0.180075,0.0,0.029917,0.185856,0.327026,0.724242
AF,53218.0,0.04777429,0.061236,-0.262045,0.024551,0.050306,0.075099,0.227262


In [19]:
# full_summary

In [20]:
############# Table 3 Panel B: Correlation matrix for selected variables
######### Pearson correlations between every pair of columns in selected_vars
T3PB_pearson = selected_vars.corr(method='pearson')

# T3PB_pearson

In [21]:
######### Spearman (rank) correlations between every pair of columns in selected_vars
T3PB_spearman = selected_vars.corr(method='spearman')

# T3PB_spearman

In [22]:
######### Combine the two correlation matrices inside T3PB_spearman:
######### above the diagonal -> pearson; diagonal and below -> spearman
n_vars = len(T3PB_spearman.index)
for row in range(n_vars):
    right_of_diagonal = slice(row + 1, None)
    T3PB_spearman.iloc[row, right_of_diagonal] = T3PB_pearson.iloc[row, right_of_diagonal]

##### Save the combined matrix to the sheet 'T3PB_raw' of the tables workbook
table_path = '..\\output\\Tables.xlsx'
if not os.path.exists(table_path):
    # No workbook yet: to_excel creates the file
    T3PB_spearman.to_excel(table_path, sheet_name='T3PB_raw', float_format="%.4f")
else:
    # Workbook exists: load it and hand its sheets to the writer before writing this one
    # NOTE(review): assigning writer.book / writer.sheets and calling writer.save() depend on the
    # installed pandas ExcelWriter API - confirm against the pandas version in use before upgrading.
    book = load_workbook(table_path)
    writer = pd.ExcelWriter(table_path, engine = 'openpyxl')
    writer.book = book
    writer.sheets = {ws.title: ws for ws in book.worksheets}

    T3PB_spearman.to_excel(writer, sheet_name='T3PB_raw', float_format="%.4f")

    writer.save()
    writer.close()

T3PB_spearman

Unnamed: 0,NW,nw,TONE,TLAG,RET,NEG,SIZE,MTB,LEV,AF,AFE,BUSSEG,GEOSEG,AGE,EARN,DEARN,STD_EARN,STD_RET,LOSS,ABTONE,DA
NW,1.0,0.779746,-0.458335,-0.206129,-0.003128,-0.004318,0.285207,0.045326,0.074812,-0.02387,-0.00024,0.267205,0.284015,-0.019665,-0.08774,0.001769,0.074511,-0.043551,0.108578,-0.3714064,-0.00371
nw,1.0,1.0,-0.337293,-0.088771,-0.012154,0.007163,0.166525,0.036074,0.067465,-0.024129,-0.003928,0.154138,0.169282,-0.053224,-0.068417,-0.003456,0.056163,-0.016808,0.081034,-0.2830784,-0.018223
TONE,-0.487814,-0.487814,1.0,0.037364,0.016876,-0.015511,-0.090978,-0.01155,0.033661,0.033529,0.104498,-0.059368,-0.106902,0.025027,0.140626,-0.002177,-0.127607,-0.064185,-0.191872,0.9556586,-0.02265
TLAG,-0.281663,-0.281663,0.044266,1.0,-0.026248,0.040641,-0.334315,-0.011505,0.016146,-0.088282,-0.122665,-0.242778,-0.261878,-0.228667,-0.130399,-0.004635,0.125367,0.182615,0.133969,0.01087082,-0.062972
RET,0.00132,0.00132,0.025074,-0.040368,1.0,-0.680841,-0.04805,-0.012287,-0.001696,-0.043629,0.163737,-0.011919,-0.020486,0.001249,0.069754,0.046671,0.016795,0.26893,-0.069761,-2.556391e-16,0.010097
NEG,-0.005063,-0.005063,-0.018307,0.039037,-0.865777,1.0,-0.010069,0.007441,0.000674,0.032137,-0.131812,-0.000754,0.009853,-0.020885,-0.078632,-0.027181,0.012636,-0.117417,0.079748,0.002182002,-0.023831
SIZE,0.304281,0.304281,-0.081895,-0.340068,-0.005862,-0.011724,1.0,0.250158,0.073293,0.0317,0.284274,0.20954,0.22667,0.374121,0.249422,-0.023397,-0.201512,-0.314832,-0.249523,-3.339142e-16,0.176836
MTB,0.032837,0.032837,0.044097,-0.030597,-0.041436,0.025192,0.39309,1.0,0.04306,-0.165244,0.115611,-0.009132,-0.0099,-0.085884,-0.011865,0.018089,0.164418,0.048611,0.021177,-3.778512e-16,0.035505
LEV,0.064236,0.064236,0.026422,-0.000178,0.001141,-0.002087,0.118709,-0.1113,1.0,0.15436,-0.065615,0.029462,-0.011746,0.081173,0.004522,0.035996,-0.111133,-0.065184,-0.029599,0.04074629,0.007675
AF,0.024783,0.024783,0.016409,-0.120976,-0.105167,0.083425,-0.015376,-0.306237,0.225651,1.0,0.065443,0.06471,0.070265,0.174959,0.388148,0.014462,-0.216031,-0.120225,-0.347881,-9.692121e-18,0.029244
