In [1]:
############# This code creates *crsp_comp_edgar_section.csv* for sectional analysis in 10-Q ###################

############### Import packages
import os, numpy as np, pandas as pd, time, glob, re, math, statsmodels.api as sm, patsy as ps
from tqdm import tqdm
from time import process_time
from datetime import datetime
from datetime import date
from openpyxl import load_workbook
from patsy import dmatrices

############### Make the project's code folder the working directory
project_code_dir = 'F:\\github\\narrative_conservatism\\code'
if os.getcwd() != project_code_dir:
    os.chdir(project_code_dir)

############### Lift pandas' limit on the number of columns shown when a frame is displayed
pd.set_option('display.max_columns', None)

In [2]:
########################################################################################
############ Concatenate and prepare merge: ID_DATA and TEXT_DATA ######################
########################################################################################

############## Define a function to concatenate all csv files with file name that matches a certain pattern into one data frame
def concatenate (indir, file_name_match):
    """Read every CSV in ``indir`` whose name matches a glob pattern and stack them row-wise.

    Parameters
    ----------
    indir : str
        Directory to search for input files.
    file_name_match : str
        Glob pattern applied to file names inside ``indir``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation (``axis = 0``) of all matching files, read in
        sorted path order. Each file's own row index is kept as-is.

    Raises
    ------
    FileNotFoundError
        If no file in ``indir`` matches ``file_name_match``.
    """
    # Search inside indir without os.chdir(): changing the process working
    # directory here would silently re-base every relative path used afterwards.
    # glob.escape keeps special characters in the directory name literal.
    pattern = os.path.join(glob.escape(indir), file_name_match)

    # glob returns matches in arbitrary order; sorting makes the row order reproducible.
    file_list = sorted(glob.glob(pattern))
    if not file_list:
        raise FileNotFoundError('No files matching ' + repr(file_name_match) + ' found in ' + repr(indir))

    df_list = [pd.read_csv(filename, low_memory = False) for filename in file_list]
    return pd.concat(df_list, axis = 0)

############## Stack every file matching text_data_section_*.csv into one text_data frame
text_data = concatenate('..\\filings', 'text_data_section_' + '*.csv')

############## Section tone : tone = (n_pos - n_negation - n_neg)/nw
net_positive_mda = text_data['n_pos_mda'] - text_data['n_negation_mda'] - text_data['n_neg_mda']
net_positive_note = text_data['n_pos_note'] - text_data['n_negation_note'] - text_data['n_neg_note']
text_data['tone_mda'] = net_positive_mda/text_data['nw_mda']
text_data['tone_note'] = net_positive_note/text_data['nw_note']

############## Write the combined frame (index omitted) to text_data_section.csv
text_data.to_csv('..\\filings\\text_data_section.csv', index = 0)

n_sections_parsed = text_data.shape[0]
print('Number of 10-Q sections parsed: ' + str(n_sections_parsed))

Number of 10-Q sections parsed: 282391


In [3]:
########################################################################################
############### Merge SECTION_DATA with CRSP_COMPUSTAT_EDGAR_10-Q ######################
########################################################################################

############## Load crsp_comp_edgar_ibes_seg_10-Q.csv, the left side of the merge
crsp_comp_edgar = pd.read_csv(r'..\filings\crsp_comp_edgar_ibes_seg_10-Q.csv')

############## Inner join with the section text data on accnum;
############## validate = '1:1' makes pandas raise if accnum is not unique on both sides
crsp_comp_edgar_section = crsp_comp_edgar.merge(
    text_data,
    how = 'inner',
    on = ['accnum'],
    validate = '1:1',
)
n_merged = crsp_comp_edgar_section.shape[0]
print('number of observations after merging with section data: ' + str(n_merged))
crsp_comp_edgar_section

number of observations after merging with section data: 56015


Unnamed: 0,cusip,cik,rp,accnum,name,gvkey,SIC,fd,date_crsp,date_comp,cquarter,fyearq,fqtr,incorp,state,addzip,costat,age,actq,aqcy,cheq,dpq,ibq,intanq,lctq,revtq,txditcq,xsgaq,sstky,xrdq,atq,lag_atq,ceqq,lag_ceqq,cshoq,lag_cshoq,dlcq,lag_dlcq,dlttq,lag_dlttq,prccq,lag_prccq,ibq.1,lag_ibq,nw,READ,NW,TONE,TONE_GI,TONE_HE,TLAG,RET,STD_RET,EARN,LOSS,DEARN,STD_EARN,CFO,lag1_CFO,lag2_CFO,lag3_CFO,PPE,SIC2,SG,LAG_SG,SKEW_RET,TURNOVER,LAG1_RET,LAG2_RET,LAG3_RET,LAG1_NW,LAG2_NW,LAG3_NW,LAG1_TONE,LAG2_TONE,LAG3_TONE,LAG1_TLAG,LAG2_TLAG,LAG3_TLAG,NEG,SIZE,MTB,LEV,AGE,date_key,fpedats,actual,median,afe,consensus,leap_consensus,nseg_bus,nseg_geo,BUSSEG,GEOSEG,AFE,AF,nw_mda,n_neg_mda,n_pos_mda,n_negation_mda,nw_note,n_neg_note,n_pos_note,n_negation_note,tone_mda,tone_note
0,48273010,20,1997-03-29,0000893220-97-000850,K TRON INTERNATIONAL INC,6314.0,3823,1997-04-30,1997-03-31,1997-03-31,199703,1997.0,1.0,NJ,NJ,08071-0888,I,5967,33.912,0.000,4.426,0.770,1.050,,19.139,21.344,0.459,7.684,0.05,0.722,53.037,55.330,13.770,13.194,3.143,3.137,0.325,0.861,18.316,20.807,10.375,10.250,1.050,1.185,1756,26.60,7.471363,-2.277904,21.640091,-3.416856,32.0,0.006413,0.006832,0.018977,0.0,-0.002440,0.003929,-0.189301,0.073235,0.060155,0.070127,0.732171,38,-0.009823,-0.012326,0.247687,3587.0,0.041001,0.055580,0.053657,9.420277,8.053887,7.869019,-9.809485,-7.949126,-8.033665,32.0,27.0,33.0,0,3.470545,2.437036,0.391614,8.694167,1997,1997-12-31,1.7000,1.4000,0.3000,1.334444,1.717500,1.0,1.0,0.693147,0.693147,0.028916,0.165542,16,0,0,0,275,0,1,1,0.000000,0.000000
1,00036110,1750,2000-02-29,0000912057-00-018126,AAR CORP,1004.0,5080,2000-04-14,2000-02-29,2000-02-29,200002,1999.0,3.0,DE,DE,60191,A,10197,526.935,0.000,4.431,4.585,10.955,,176.733,272.331,52.701,24.404,,,753.755,747.043,342.482,339.088,26.963,27.181,25.353,23.934,180.639,180.715,23.750,16.500,10.955,10.906,2757,23.61,7.922261,-6.166123,23.213638,14.508524,45.0,0.344770,0.139071,0.014664,0.0,0.000066,0.000966,0.015011,0.021036,-0.067703,0.006691,0.265871,50,-0.011520,0.046992,0.687084,83422.0,-0.313551,0.085637,0.222281,7.914983,7.863267,7.948032,-4.018999,-5.771451,-4.241782,45.0,45.0,45.0,0,6.105879,1.322626,0.273945,9.229947,1999,1999-05-31,1.4889,1.4589,0.0300,1.464510,1.721429,1.0,1.0,0.693147,0.693147,0.001263,0.072481,1473,10,3,0,2115,15,1,0,-0.004752,-0.006619
2,00036110,1750,2000-08-31,0000912057-00-044710,AAR CORP,1004.0,5080,2000-10-13,2000-08-31,2000-08-31,200008,2000.0,1.0,DE,DE,60191,A,10381,510.278,0.000,0.669,4.694,3.159,,168.282,241.770,58.441,24.544,0.00,,747.543,740.998,339.253,339.515,26.857,26.865,48.151,26.314,180.367,180.447,11.250,13.875,3.159,2.471,1885,23.51,7.542213,-6.896552,26.525199,-1.061008,43.0,-0.299449,0.090377,0.004263,0.0,0.000928,0.006116,-0.024014,0.005816,0.015011,0.021036,0.252117,50,0.022328,-0.063768,0.553016,69232.0,-0.423595,0.344770,-0.313551,7.922261,7.914983,7.863267,-6.166123,-4.018999,-5.771451,45.0,45.0,45.0,1,5.920913,1.097895,0.279030,9.247829,2000,2000-05-31,1.3890,1.6987,-0.3097,1.721429,1.460187,1.0,1.0,0.693147,0.693147,-0.027529,0.129794,783,7,3,0,1345,11,0,0,-0.005109,-0.008178
3,00036110,1750,2000-11-30,0000912057-01-001318,AAR CORP,1004.0,5080,2001-01-12,2000-11-30,2000-11-30,200011,2000.0,2.0,DE,DE,60191,A,10472,514.749,3.200,3.069,4.516,4.278,,189.367,211.335,60.937,23.879,0.00,,772.941,747.543,341.264,339.253,26.932,26.857,50.437,48.151,180.173,180.367,10.375,11.250,4.278,3.159,2585,19.48,7.857868,-5.415861,24.758221,-3.481625,43.0,0.114928,0.047362,0.005723,0.0,0.001497,0.005817,0.023936,-0.024014,0.005816,0.015011,,50,-0.039376,0.022328,-0.692640,66592.0,-0.299449,-0.423595,0.344770,7.542213,7.922261,7.914983,-6.896552,-6.166123,-4.018999,43.0,45.0,45.0,0,5.710895,0.890607,0.305692,9.256556,2000,2000-05-31,1.3890,1.6987,-0.3097,1.721429,1.460187,1.0,1.0,0.693147,0.693147,-0.029851,0.140741,1189,13,7,0,1944,17,5,0,-0.005046,-0.006173
4,00036110,1750,2001-02-28,0000912057-01-508531,AAR CORP,1004.0,5080,2001-04-13,2001-02-28,2001-02-28,200102,2000.0,3.0,DE,DE,60191,A,10562,526.537,3.200,0.767,4.788,5.388,,167.852,200.071,60.695,23.287,0.00,,754.718,772.941,344.865,341.264,26.945,26.932,49.665,50.437,180.106,180.173,13.600,10.375,5.388,4.278,2551,22.46,7.844633,-6.272050,25.088201,-5.488044,44.0,0.361173,0.089707,0.006971,0.0,0.001436,0.004520,-0.010654,0.023936,-0.024014,0.005816,,50,-0.014925,-0.039376,-0.662325,48964.0,0.114928,-0.299449,-0.423595,7.857868,7.542213,7.922261,-5.415861,-6.896552,-6.166123,43.0,43.0,45.0,0,5.632714,0.818778,0.298354,9.265113,2000,2000-05-31,1.3890,1.6987,-0.3097,1.721429,1.460187,1.0,1.0,0.693147,0.693147,-0.022772,0.107367,1245,16,8,0,2003,20,5,0,-0.006426,-0.007489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56010,G9381B10,1740547,2019-06-30,0001564590-19-031967,Urovant Sciences Ltd.,33816.0,2836,2019-08-14,2019-06-28,2019-06-30,201906,2019.0,1.0,,D0,SW1Y 4LB,A,301,71.019,0.000,62.603,0.054,-28.485,0.000,11.447,0.000,0.000,27.425,0.07,22.014,76.252,100.121,47.720,74.770,30.340,30.323,0.092,0.000,17.085,13.534,7.910,10.050,-28.485,-28.968,51904,19.89,10.857170,-21.250771,27.358200,2.196363,45.0,-0.243291,0.157806,-0.221086,1.0,0.004824,0.197081,0.281247,-0.213546,-0.263239,-0.192147,,28,0.000000,0.000000,-0.372995,35348.0,0.413040,-0.397484,,10.803811,10.779768,,-20.342634,-20.254798,,45.0,44.0,,1,5.719479,4.075781,0.135176,5.808142,2019,2019-03-31,-4.4300,-3.0050,-1.4250,-3.430000,-4.004091,1.0,1.0,0.693147,0.693147,-0.180152,-0.319823,6355,55,50,2,6997,76,40,17,-0.001101,-0.007575
56011,G9381B10,1740547,2019-09-30,0001564590-19-041855,Urovant Sciences Ltd.,33816.0,2836,2019-11-07,2019-09-30,2019-09-30,201909,2019.0,2.0,,D0,SW1Y 4LB,A,395,76.862,0.000,68.039,0.059,-25.741,0.000,11.565,0.000,0.000,25.172,0.22,17.796,82.028,76.252,23.756,47.720,30.381,30.340,0.275,0.092,46.707,17.085,9.470,7.910,-25.741,-28.485,55155,20.31,10.917921,-20.959115,27.993836,2.574563,38.0,0.179101,0.015114,-0.221086,1.0,0.035986,0.197081,-0.256573,0.294746,-0.213546,-0.192147,,28,0.000000,0.000000,0.198884,23278.0,-0.243291,0.413040,-0.397484,10.857170,10.803811,10.779768,-21.250771,-20.342634,-20.254798,45.0,45.0,44.0,0,5.480595,5.029116,0.225266,5.981414,2019,2019-03-31,-4.4300,-3.0050,-1.4250,-3.430000,-4.004091,1.0,1.0,0.693147,0.693147,-0.150475,-0.319823,7623,65,58,1,7812,89,42,15,-0.001049,-0.007937
56012,G8473T10,1757898,2019-06-30,0001757898-19-000012,STERIS plc,25313.0,3841,2019-08-07,2019-06-28,2019-06-30,201906,2019.0,1.0,,L2,D02 R296,A,9891,1044.282,34.970,238.067,47.102,84.590,2949.508,432.774,696.803,154.385,188.880,,15.585,5229.723,5073.071,3236.900,3177.810,84.754,84.517,20.139,0.000,1312.491,1183.227,148.880,128.030,84.590,108.745,14359,19.92,9.572202,-10.516053,20.474963,6.755345,38.0,0.116403,0.040292,0.016674,0.0,-0.004761,0.004466,-0.084794,0.035740,0.026662,0.024926,0.424755,38,-0.013655,0.014188,-0.362726,357669.0,0.055432,0.090596,0.030175,,,,,,,,,,0,9.289217,3.405084,0.233237,9.199482,2019,2019-03-31,4.8900,4.8000,0.0900,4.783333,5.495152,5.0,3.0,1.791759,1.386294,0.000605,0.036910,5643,112,51,1,7503,117,37,2,-0.010987,-0.010929
56013,G8473T10,1757898,2019-09-30,0001757898-19-000018,STERIS plc,25313.0,3841,2019-11-07,2019-09-30,2019-09-30,201909,2019.0,2.0,,L2,D02 R296,A,9985,1031.954,87.935,225.536,49.634,94.769,2895.441,428.023,736.840,152.748,189.523,,16.249,5185.717,5229.723,3228.668,3236.900,84.797,84.754,18.461,20.139,1285.105,1312.491,144.490,148.880,94.769,84.590,15919,19.35,9.675331,-11.244425,19.222313,7.224072,38.0,-0.032340,0.055242,0.018121,0.0,0.001946,0.004461,0.028809,-0.084794,0.035740,0.026662,0.416714,38,0.007721,-0.013655,0.033550,314667.0,0.116403,0.055432,0.090596,9.572202,,,-10.516053,,,38.0,,,1,9.442894,3.898228,0.254818,9.208939,2019,2019-03-31,4.8900,4.8000,0.0900,4.783333,5.495152,5.0,3.0,1.791759,1.386294,0.000623,0.038031,6539,127,51,1,7743,118,37,2,-0.011776,-0.010719


In [4]:
########################################################################################
############################### Variable Creation ######################################
########################################################################################

############################## Main Variables ##########################################
######## NW_MDA / NW_NOTE: natural log of 1 + number of words in the section
mda_words = crsp_comp_edgar_section['nw_mda']
note_words = crsp_comp_edgar_section['nw_note']
crsp_comp_edgar_section['NW_MDA'] = np.log(mda_words + 1)
crsp_comp_edgar_section['NW_NOTE'] = np.log(note_words + 1)

######## TONE_MDA / TONE_NOTE: section tone (n_pos - n_neg - n_negations per word) rescaled to per 1000 words
for tone_col, scaled_col in (('tone_mda', 'TONE_MDA'), ('tone_note', 'TONE_NOTE')):
    crsp_comp_edgar_section[scaled_col] = 1000*crsp_comp_edgar_section[tone_col]

In [5]:
########################################################################################
############################### Variable Screening #####################################
########################################################################################

## Store SIC as a string column
crsp_comp_edgar_section['SIC'] = crsp_comp_edgar_section['SIC'].astype(str)

########## Drop firm-quarters whose MDA, and then whose NOTES, contain fewer than 100 words
# NOTE: the printed messages mention a "1% threshold", but the cutoff applied is the fixed value 100.
# The screens run in sequence, so the NOTES count is taken on rows that already passed the MDA screen.
section_screens = (
    ('nw_mda', 'number of MDAs that contain total words less than 1% threshold: '),
    ('nw_note', 'number of NOTES that contain total words less than 1% threshold: '),
)
for word_col, message in section_screens:
    word_counts = crsp_comp_edgar_section[word_col]
    del_word01 = crsp_comp_edgar_section.loc[word_counts < 100].shape[0]
    print(message + str(del_word01))
    # keep rows with an explicit >= test (rather than negating the < test)
    crsp_comp_edgar_section = crsp_comp_edgar_section.loc[word_counts >= 100]

############## Report the sample size that survives screening
n_screened = crsp_comp_edgar_section.shape[0]
print('Number of firm-quarters after variable screening: ' + str(n_screened))

############## Write the screened frame (index omitted) to crsp_comp_edgar_section.csv
crsp_comp_edgar_section.to_csv('..\\filings\\crsp_comp_edgar_section.csv', index = 0)

number of MDAs that contain total words less than 1% threshold: 7000
number of NOTES that contain total words less than 1% threshold: 926
Number of firm-quarters after variable screening: 48089


In [6]:
########################################################################################
############### Table 1: Summary Statistics and Correlation Matrix #####################
########################################################################################

############# Table 1 Panel A: Summary statistics for selected variables
######### Variable groups:
# 1st group: textual variables, generally consistent with LM's summary statistics
# 2nd group: fundamental variables (main)
textual_vars = ['NW','nw', 'NW_MDA','nw_mda', 'NW_NOTE','nw_note', 'TONE', 'TONE_MDA', 'TONE_NOTE', 'TLAG']
fundamental_vars = ['RET', 'NEG', 'SIZE', 'MTB', 'LEV']
# currently left out: 'AGE', 'age', 'EARN', 'STD_RET', 'STD_EARN', 'LOSS', 'DEARN'
selected_vars = crsp_comp_edgar_section[textual_vars + fundamental_vars]

# one row per variable, one column per statistic
T1PA = selected_vars.describe().T

############# Summary statistics for all raw and processed variables
full_summary = crsp_comp_edgar_section.describe().T

T1PA

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
NW,48089.0,9.333314,0.60279,7.12206,8.972337,9.365804,9.729134,13.544145
nw,48089.0,13481.952234,9570.164514,1238.0,7881.0,11681.0,16799.0,762337.0
NW_MDA,48089.0,8.451062,0.612698,4.615121,8.131531,8.506941,8.844048,11.378822
nw_mda,48089.0,5488.946869,3072.490616,100.0,3399.0,4948.0,6932.0,87449.0
NW_NOTE,48089.0,8.124058,0.85122,4.615121,7.671827,8.282736,8.709135,11.018432
nw_note,48089.0,4535.583834,3388.737248,100.0,2146.0,3954.0,6057.0,60987.0
TONE,48089.0,-9.638662,7.074007,-60.5274,-13.743455,-8.409623,-4.589654,24.214907
TONE_MDA,48089.0,-4.964037,6.613193,-56.976744,-8.379511,-4.21301,-0.717703,26.845638
TONE_NOTE,48089.0,-9.938572,9.249293,-106.083086,-13.769363,-8.084475,-4.057344,26.086957
TLAG,48089.0,37.280813,5.924626,9.0,34.0,38.0,41.0,51.0
