In [1]:
############# This code creates *crsp_comp_edgar_section.csv* for sectional analysis in 10-Q ###################

############### Import packages
import os, numpy as np, pandas as pd, time, glob, re, math, statsmodels.api as sm, patsy as ps
from tqdm import tqdm
from time import process_time
from datetime import datetime
from datetime import date
from openpyxl import load_workbook
from patsy import dmatrices

############### Set working directory to the project's code directory
# Later cells address data files relative to the working directory (e.g. '..\\filings').
# NOTE(review): this is an absolute, machine-specific Windows path; it has to be edited
# before the notebook can run from any other checkout location.
CODE_DIR = 'F:\\github\\narrative_conservatism\\code'
if os.getcwd() != CODE_DIR:
    os.chdir(CODE_DIR)

############### Set pandas column printing constraint
# None lifts the cap on how many columns are shown when a data frame is displayed
pd.set_option('display.max_columns', None)

In [2]:
########################################################################################
############ Concatenate and prepare merge: ID_DATA and TEXT_DATA ######################
########################################################################################

############## Define a function to concatenate all csv files with file name that matches a certain pattern into one data frame
def concatenate(indir, file_name_match):
    """Read every CSV in ``indir`` whose name matches a glob pattern and stack them row-wise.

    Parameters
    ----------
    indir : str
        Directory to search (absolute, or relative to the current working directory).
    file_name_match : str
        Glob pattern applied to the file names, e.g. ``'text_data_section_*.csv'``.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files, concatenated in sorted file-name order.
        Each file keeps its own row index (the index is not reset).

    Raises
    ------
    FileNotFoundError
        If no file in ``indir`` matches ``file_name_match``.
    """
    # Build the search pattern from indir instead of calling os.chdir(indir):
    # changing the process-wide working directory would silently alter how every
    # relative path in later cells is resolved. glob.escape keeps special
    # characters in the directory name from being treated as wildcards.
    pattern = os.path.join(glob.escape(indir), file_name_match)

    # glob returns matches in arbitrary order; sort so the row order is reproducible.
    file_list = sorted(glob.glob(pattern))
    if not file_list:
        raise FileNotFoundError(
            'No files matching ' + repr(file_name_match) + ' found in ' + repr(indir)
        )

    # low_memory=False makes pandas read each file in one pass rather than in chunks
    df_list = [pd.read_csv(filename, low_memory = False) for filename in file_list]
    return pd.concat(df_list, axis = 0)

############## Stack all section-level text files (text_data_section_*.csv) into one data frame
text_data = concatenate('..\\filings', 'text_data_section_' + '*.csv')

############## Calculate tone : tone = (n_pos - n_negation - n_neg)/nw
# One entry per section: output column -> (positive, negation, negative, total-word) input columns
tone_inputs = {
    'tone_mda': ('n_pos_mda', 'n_negation_mda', 'n_neg_mda', 'nw_mda'),
    'tone_note': ('n_pos_note', 'n_negation_note', 'n_neg_note', 'nw_note'),
}
for tone_col, (pos_col, negation_col, neg_col, nw_col) in tone_inputs.items():
    net_positive = text_data[pos_col] - text_data[negation_col] - text_data[neg_col]
    text_data[tone_col] = net_positive/text_data[nw_col]

############## Save text_data dataframe into local file text_data_section.csv (row index not written)
text_data.to_csv('..\\filings\\text_data_section.csv', index = False)

print('Number of 10-Q sections parsed: ' + str(len(text_data)))

Number of 10-Q sections parsed: 282391


In [3]:
########################################################################################
############### Merge SECTION_DATA with CRSP_COMPUSTAT_EDGAR_10-Q ######################
########################################################################################

############## Load the firm-quarter panel that the section-level text data is attached to
crsp_comp_edgar = pd.read_csv(r'..\filings\crsp_comp_edgar_ibes_seg_10-Q.csv')

############## Merge on the accession number (accnum)
# how='inner' keeps only filings present in both frames;
# validate='1:1' makes pandas raise if accnum is duplicated on either side.
crsp_comp_edgar_section = crsp_comp_edgar.merge(
    text_data,
    on = ['accnum'],
    how = 'inner',
    validate = '1:1',
)
print('number of observations after merging with section data: ' + str(len(crsp_comp_edgar_section)))
crsp_comp_edgar_section

number of observations after merging with section data: 44635


Unnamed: 0,cusip,cik,rp,accnum,name,gvkey,SIC,fd,date_crsp,date_comp,cquarter,fyearq,fqtr,incorp,state,addzip,costat,age,actq,aqcy,cheq,dpq,ibq,intanq,lctq,revtq,txditcq,xsgaq,sstky,atq,lag_atq,ceqq,lag_ceqq,cshoq,lag_cshoq,dlcq,lag_dlcq,dlttq,lag_dlttq,prccq,lag_prccq,ibq.1,lag_ibq,RET,STD_RET,EARN,LOSS,DEARN,STD_EARN,CFO,leap1_EARN,leap2_EARN,leap3_EARN,leap1_CFO,leap2_CFO,leap3_CFO,TACC,LAG_TA_REV,DSAR,PPE,SIC2,SG,SKEW_RET,TURNOVER,nw,nvocab,n_neg,n_pos,n_uctt,n_lit,n_cstr,n_modal_week,n_modal_moderate,n_modal_weak,n_negation,READ,tone,NW,TONE,TLAG,NEG,SIZE,MTB,LEV,AGE,date_key,fpedats,actual,median,afe,consensus,leap_consensus,nseg_bus,nseg_geo,LNASSETS,BUSSEG,GEOSEG,AFE,AF,ABTONE,abtone,nw_mda,n_neg_mda,n_pos_mda,n_negation_mda,nw_note,n_neg_note,n_pos_note,n_negation_note,tone_mda,tone_note
0,08373910,11454,1993-12-31,0000011454-94-000006,BERGEN BRUNSWIG CORP,2169.0,5122,1994-02-14,1993-12-31,1993-12-31,199312,1994.0,1.0,NJ,NJ,92868,I,10349,1474.257,0.000,22.616,7.381,10.331,,1244.074,1834.936,2.278,81.343,0.025,2013.332,1772.337,417.292,417.800,35.577,36.431,2.026,2.129,337.515,285.873,17.875,17.500,10.331,-13.128,0.173416,0.082377,0.005829,0.0,0.013236,0.006858,-0.150724,0.007216,0.007869,0.008513,-0.047104,0.064367,0.002421,280.035,0.990116,0.005681,0.111794,51,-0.004626,0.464055,47936.0,6291,1096,128,37,38,211,26,12,1,15,0,64.98,-0.014465,8.747034,-14.465109,45,0,6.457621,1.525951,0.162498,9.244742,1994,1994-09-30,0.6629,0.5524,0.1105,0.556018,0.635560,1.0,1.0,7.480054,0.693147,0.693147,0.006182,0.035556,-6.968715,-0.006969,2294,13,17,0,1264,3,5,0,0.001744,0.001582
1,08373910,11454,1994-03-31,0000011454-94-000008,BERGEN BRUNSWIG CORP,2169.0,5122,1994-05-16,1994-03-31,1994-03-31,199403,1994.0,2.0,NJ,NJ,92868,I,10439,1371.205,0.000,15.983,7.344,14.529,,1030.680,1845.449,2.244,80.081,0.099,1909.540,2013.332,427.282,417.292,36.443,35.577,1.791,2.026,437.561,337.515,16.750,17.875,14.529,10.331,-0.004270,0.078973,0.007216,0.0,0.002085,0.006707,-0.047104,0.007869,0.008513,0.006791,0.064367,0.002421,0.003467,109.364,0.880300,0.037470,0.101453,51,0.039004,-0.340700,38087.0,6475,1088,146,32,50,159,27,14,12,21,1,75.67,-0.017761,8.775858,-17.760618,46,1,6.455102,1.523966,0.168646,9.253400,1994,1994-09-30,0.6629,0.5524,0.1105,0.556018,0.635560,1.0,1.0,7.607546,0.693147,0.693147,0.006597,0.037944,-10.406271,-0.010406,3521,31,20,1,2337,16,7,1,-0.003408,-0.004279
2,08373910,11454,1994-06-30,0000011454-94-000016,BERGEN BRUNSWIG CORP,2169.0,5122,1994-08-15,1994-06-30,1994-06-30,199406,1994.0,3.0,NJ,NJ,92868,I,10530,1363.878,0.000,14.224,7.841,15.027,,1109.997,1878.548,2.138,79.872,0.180,1906.824,1909.540,451.746,427.282,37.253,36.443,1.293,1.791,332.178,437.561,16.750,16.750,15.027,14.529,0.018873,0.029090,0.007869,0.0,0.000261,0.006585,0.064367,0.008513,0.006791,0.008711,0.002421,0.003467,0.010880,-107.885,1.054354,0.012764,0.108509,51,0.005506,0.090021,33761.0,6684,1106,148,33,61,166,28,14,12,31,1,76.10,-0.017355,8.807621,-17.354877,46,0,6.414148,1.428612,0.230083,9.262079,1994,1994-09-30,0.6629,0.5524,0.1105,0.556018,0.635560,1.0,1.0,7.554618,0.693147,0.693147,0.006597,0.037944,-10.377356,-0.010377,3722,32,20,1,2403,16,7,1,-0.003493,-0.004161
3,14428510,17843,1993-12-31,0000017843-94-000005,CARPENTER TECHNOLOGY CORP,2787.0,3312,1994-02-10,1993-12-31,1993-12-31,199312,1994.0,2.0,DE,DE,19103,A,20671,183.206,22.200,18.047,7.224,7.360,,107.094,147.127,69.218,22.825,1.138,736.769,740.379,217.685,214.525,8.010,7.989,27.208,43.187,176.826,180.405,58.000,50.250,7.360,2.772,0.140644,0.044119,0.009941,0.0,0.006197,0.006358,0.052169,0.017407,0.021006,0.006757,0.022080,0.031095,-0.119186,-31.265,0.944874,0.027108,0.971353,33,-0.039219,-0.674115,10066.0,33206,2001,425,110,212,1228,267,68,33,186,3,128.45,-0.009577,10.410516,-9.576583,41,0,5.995076,1.871331,0.301997,9.936535,1994,1994-06-30,1.0300,0.9275,0.1025,0.896750,1.170000,1.0,1.0,6.607162,0.693147,0.693147,0.001767,0.020172,-3.143362,-0.003143,402,3,2,0,2091,10,9,0,-0.002488,-0.000478
4,19247910,21510,1993-12-25,0000912057-94-000209,COHERENT INC,3157.0,3826,1994-02-07,1993-12-31,1993-12-31,199312,1994.0,1.0,DE,DE,95054,A,7701,141.164,1.500,36.470,,1.429,,50.855,47.026,1.643,20.341,1.355,189.397,193.796,119.437,117.023,10.101,9.913,7.847,9.498,12.015,14.122,12.750,14.250,1.429,2.250,-0.126372,0.029781,0.007374,0.0,-0.004236,0.003236,-0.105549,0.015607,0.014496,0.014704,0.023221,0.027511,0.003111,21.884,0.959793,-0.018138,0.387454,38,0.022611,0.630859,25708.0,2451,637,29,20,15,15,12,3,5,1,3,77.39,-0.004896,7.804659,-4.895961,38,1,4.950604,1.207115,0.121881,8.949235,1994,1994-09-30,0.5000,0.5000,0.0000,0.493000,0.694545,1.0,1.0,5.266806,0.693147,0.693147,0.000000,0.054474,1.073312,0.001073,8,0,0,0,2037,27,20,3,0.000000,-0.004909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44630,70339510,891024,2016-01-30,0000891024-16-000007,"PATTERSON COMPANIES, INC.",25880.0,5047,2016-03-10,2016-01-29,2016-01-31,201601,2015.0,3.0,MN,MN,55120,A,8521,1710.856,1106.583,97.701,8.149,57.190,815.316,915.598,1400.853,,244.135,0.000,3555.670,3451.838,1362.547,1363.257,99.100,99.500,214.500,95.991,1030.250,1034.884,42.460,47.400,57.190,42.563,-0.025536,0.023640,0.016568,0.0,0.004237,0.004967,-0.009728,0.018455,0.011050,0.013076,0.068302,-0.065073,0.004993,90.770,1.228231,0.013672,,50,0.071365,-0.544070,513011.0,9166,1248,168,24,62,119,42,13,17,14,0,16.28,-0.015710,9.123365,-15.710233,41,1,8.458780,3.459582,0.327615,9.050406,2015,2015-04-30,2.2700,2.3600,-0.0900,2.364094,2.495757,9.0,9.0,8.146662,2.302585,2.302585,-0.002120,0.058779,-6.791241,-0.006791,2449,17,7,0,5278,87,16,0,-0.004083,-0.013452
44631,40161710,912463,2015-10-31,0000912463-15-000048,GUESS INC,63447.0,2330,2015-12-02,2015-10-30,2015-10-31,201510,2015.0,3.0,DE,DE,90021,A,7030,1050.145,1.263,402.386,17.570,12.444,41.605,327.050,493.214,0.000,162.826,-0.983,1522.472,1563.612,1001.941,1046.453,83.774,85.763,4.450,4.840,2.189,2.057,21.050,21.890,12.444,18.289,-0.008415,0.065393,0.007958,0.0,-0.003738,0.011523,0.005111,0.031381,-0.016363,0.020781,0.075339,-0.136652,0.008524,4.453,0.994645,-0.015869,,23,0.043475,-0.286724,713387.0,22573,2030,353,184,276,293,162,72,44,81,2,16.12,-0.007575,10.024554,-7.575422,33,1,7.537618,1.794015,0.004411,8.858084,2015,2015-01-31,1.1100,1.8750,-0.7650,1.745702,1.260522,1.0,15.0,7.354754,0.693147,2.772589,-0.036342,0.059882,2.877073,0.002877,10547,124,102,0,9402,152,64,2,-0.002086,-0.009572
44632,02553E10,919012,2015-10-31,0001564590-15-011286,AMERICAN EAGLE OUTFITTERS INC,30059.0,5600,2015-12-04,2015-10-30,2015-10-31,201510,2015.0,3.0,DE,DE,15203,A,7883,1050.347,0.000,363.116,37.624,69.267,59.734,578.143,919.072,0.000,220.798,6.347,1887.836,1760.209,1209.273,1165.678,194.438,195.429,0.000,0.000,0.000,0.000,15.280,17.750,69.267,33.263,-0.117398,0.044256,0.039352,0.0,0.020454,0.014900,0.061675,0.043280,0.025105,0.026055,0.094837,-0.189328,0.044789,-46.099,0.950319,0.061676,1.014006,56,0.055623,0.289382,3611576.0,11530,1421,165,94,108,34,102,14,36,31,3,15.58,-0.006418,9.352794,-6.418040,35,1,8.151583,2.975834,0.000000,8.972591,2015,2015-01-31,0.6300,0.9000,-0.2700,0.990995,0.974897,3.0,6.0,7.473188,1.386294,1.945910,-0.017670,0.063802,3.104451,0.003104,4966,64,54,1,5693,99,37,1,-0.002215,-0.011066
44633,25674610,935703,2015-10-31,0000935703-15-000109,DOLLAR TREE INC,31587.0,5331,2015-11-24,2015-10-30,2015-10-31,201510,2015.0,3.0,VA,VA,23320,A,7549,4628.000,6527.700,1110.900,211.900,81.900,8725.200,2161.200,4945.300,1634.700,1151.900,6.200,16699.000,16565.100,4142.200,4047.000,234.800,234.700,95.500,83.000,8248.000,8265.500,65.490,78.030,81.900,-98.000,-0.146328,0.060909,0.004944,0.0,0.010860,0.028283,-0.000290,0.013713,0.014628,0.010597,0.049787,-0.029551,0.022994,86.700,0.656151,0.116764,,53,0.050371,0.707065,2300996.0,13870,1661,242,39,192,190,104,48,36,58,2,74.03,-0.014780,9.537556,-14.780101,25,1,9.815401,4.525239,0.503981,8.929303,2015,2015-01-31,3.1200,3.1900,-0.0700,3.214515,3.400779,3.0,1.0,9.715053,1.386294,0.693147,-0.001069,0.051928,-4.785626,-0.004786,10895,205,34,2,1795,24,5,0,-0.015879,-0.010585


In [4]:
########################################################################################
############################### Variable Creation ######################################
########################################################################################

############################## Main Variables ##########################################
######## NW_<section>: natural log of 1 + total number of words in the section
# Mapping is new column -> raw word-count column; columns are added in this order.
log_word_counts = {'NW_MDA': 'nw_mda', 'NW_NOTE': 'nw_note'}
for new_col, raw_col in log_word_counts.items():
    crsp_comp_edgar_section[new_col] = np.log(1 + crsp_comp_edgar_section[raw_col])

######## TONE_<section>: number of net positive words (n_pos - n_neg - n_negations) per 1000 total words
# Mapping is new column -> per-word tone column, which is rescaled to a per-1000-words measure.
tone_per_thousand = {'TONE_MDA': 'tone_mda', 'TONE_NOTE': 'tone_note'}
for new_col, raw_col in tone_per_thousand.items():
    crsp_comp_edgar_section[new_col] = crsp_comp_edgar_section[raw_col]*1000

In [5]:
########################################################################################
############################### Variable Screening #####################################
########################################################################################

## Change SIC to str
crsp_comp_edgar_section['SIC'] = crsp_comp_edgar_section['SIC'].astype(str)

########## Drop firm-quarters whose MDA or NOTES section has fewer words than the cutoff
# NOTE(review): the printed messages speak of a "1% threshold", but the cutoff applied is the
# fixed word count below; the quantile-based alternative (quantile(.01)) is left disabled.
MIN_SECTION_WORDS = 100

# (word-count column, message printed with the number of rows dropped), applied in this order;
# the NOTES screen therefore only sees rows that already passed the MDA screen.
word_count_screens = (
    ('nw_mda', 'number of MDAs that contain total words less than 1% threshold: '),
    ('nw_note', 'number of NOTES that contain total words less than 1% threshold: '),
)
for word_col, message in word_count_screens:
    # Count the rows about to be removed before filtering
    del_word01 = int((crsp_comp_edgar_section[word_col] < MIN_SECTION_WORDS).sum())
    print(message + str(del_word01))
    long_enough = crsp_comp_edgar_section[word_col] >= MIN_SECTION_WORDS
    crsp_comp_edgar_section = crsp_comp_edgar_section.loc[long_enough]

############## Inspect sample size after variable screening
print('Number of firm-quarters after variable screening: ' + str(len(crsp_comp_edgar_section)))

############## Save merged crsp_comp_edgar_section to csv file (row index not written)
crsp_comp_edgar_section.to_csv('..\\filings\\crsp_comp_edgar_section.csv', index = False)

number of MDAs that contain total words less than 1% threshold: 6565
number of NOTES that contain total words less than 1% threshold: 854
Number of firm-quarters after variable screening: 37216


In [6]:
########################################################################################
############### Table 1: Summary Statistics and Correlation Matrix #####################
########################################################################################

############# Table 1 Panel A: Summary statistics for selected variables
summary_columns = [
    # textual variables, generally consistent with LM's summary statistics
    'NW', 'nw', 'NW_MDA', 'nw_mda', 'NW_NOTE', 'nw_note', 'TONE', 'TONE_MDA', 'TONE_NOTE', 'TLAG',
    # fundamental variables (main)
    'RET', 'NEG', 'SIZE', 'MTB', 'LEV',
    # currently left out: 'AGE', 'age', 'EARN', 'STD_RET', 'STD_EARN', 'LOSS', 'DEARN'
]
selected_vars = crsp_comp_edgar_section[summary_columns]

# One row per variable, one column per statistic
T1PA = selected_vars.describe().T

############# Summary statistics for all raw and processed variables
full_summary = crsp_comp_edgar_section.describe().T

T1PA

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
NW,37216.0,9.246983,0.610583,7.12206,8.8688,9.282103,9.664024,13.544145
nw,37216.0,12449.497662,9571.648064,1238.0,7105.75,10743.0,15740.0,762337.0
NW_MDA,37216.0,8.412272,0.639065,4.615121,8.067071,8.465584,8.831858,11.378822
nw_mda,37216.0,5358.583862,3136.592091,100.0,3186.75,4747.5,6848.0,87449.0
NW_NOTE,37216.0,7.971237,0.865844,4.615121,7.467228,8.123261,8.579276,11.018432
nw_note,37216.0,3957.618739,3090.43433,100.0,1748.75,3371.0,5319.25,60987.0
TONE,37216.0,-9.839382,7.055617,-60.5274,-13.965015,-8.749336,-4.840774,24.214907
TONE_MDA,37216.0,-5.425726,6.862283,-56.976744,-9.043313,-4.615465,-0.985594,26.845638
TONE_NOTE,37216.0,-10.523478,9.560204,-106.083086,-14.59854,-8.603112,-4.366812,26.086957
TLAG,37216.0,38.048689,6.160908,1.0,35.0,39.0,42.0,52.0
