In [1]:
############### import packages
import os, numpy as np, pandas as pd, time, glob, re
from tqdm import tqdm
from time import process_time

##########################################################
##################### parameter ##########################
##########################################################
# Filing form type; later cells splice it into the CSV file names and glob patterns.
obj_type = '10-Q'
# File-name prefix of the text_data CSV files.
data_type_text = 'text_data'
# File-name prefix of the id_data CSV files.
data_type_id = 'id_data'

############### Show the current working directory
# This is the cell's last expression, so the notebook displays its value.
# It only reports the directory; nothing here changes it.
os.getcwd()
# Disabled: would switch into a machine-specific absolute path.
# os.chdir('F:\\github\\narrative_conservatism\\code')

'F:\\github\\narrative_conservatism\\code'

In [2]:
############## Define a function to concatenate all csv files with file name that matches a certain pattern into one data frame
def concatenate (indir, file_name_match):
    """Stack every CSV in ``indir`` whose name matches a glob pattern into one data frame.

    Parameters
    ----------
    indir : str
        Directory that holds the CSV files.
    file_name_match : str
        Glob pattern for the file names inside ``indir`` (e.g. ``'id_data_10-Q_*.csv'``).

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files appended in sorted file-name order. The column
        labels are the raw header cells of the first matching file. The per-file
        row index is kept as read (it is not reset).

    Raises
    ------
    FileNotFoundError
        If no file in ``indir`` matches ``file_name_match``.
    """
    # Search inside indir through the pattern itself instead of os.chdir(indir):
    # changing the process-wide working directory would silently alter how every
    # later relative path in the notebook resolves.
    pattern = os.path.join(glob.escape(indir), file_name_match)
    # glob returns matches in arbitrary order; sorting makes the row order reproducible.
    file_list = sorted(glob.glob(pattern))
    if not file_list:
        raise FileNotFoundError('No file matches ' + repr(file_name_match) + ' in ' + repr(indir))

    # Raw header labels, read without pandas' renaming of duplicate names.
    # nrows = 1 avoids parsing the whole first file a second time.
    colnames = pd.read_csv(file_list[0], header = None, nrows = 1).iloc[0].tolist()

    df_list = [pd.read_csv(filename, low_memory = False) for filename in file_list]

    df_concat = pd.concat(df_list, axis = 0)
    df_concat.columns = colnames
    return df_concat

############## Load the per-chunk id_data and text_data CSV files into one frame each
filings_dir = '..\\filings'
id_data = concatenate(filings_dir, '_'.join([data_type_id, obj_type, '*.csv']))
text_data = concatenate(filings_dir, '_'.join([data_type_text, obj_type, '*.csv']))

############## Save the combined id_data frame as <data_type_id>_<obj_type>.csv (id_data_10-Q.csv with the parameters above)
id_data.to_csv('..\\filings\\' + '_'.join([data_type_id, obj_type]) + '.csv', index = False)

############## Row counts, printed again by the Table 1 Panel B cell
first_T1PB, second_T1PB = id_data.shape[0], text_data.shape[0]
print('Number of 10-Qs in edgar from 1993 Q1 to 2020 Q1: ', first_T1PB, sep = '')
print('Number of 10-Qs parsed and downloaded: ', second_T1PB, sep = '')

Number of 10-Qs in edgar from 1993 Q1 to 2020 Q1: 594017
Number of 10-Qs parsed and downloaded: 575579


In [3]:
# ############## Inspect number of missing values in two data frames
# print(id_data.isnull().sum())
# print(text_data.isnull().sum())

In [4]:
###################################################################################
########################### text_data modifications ###############################
###################################################################################
# NOTE: this cell rebinds text_data to its own filtered result, so running it a
# second time without reloading text_data trims the sample again.

############## Calculate tone: tone = (n_pos - n_negation - n_neg) / nw
text_data = text_data.assign(tone = (text_data['n_pos'] - text_data['n_negation'] - text_data['n_neg'])/text_data['nw'])

############## Drop files whose number of words lies outside the 1% and 99% quantiles
nwq01 = text_data['nw'].quantile(.01)
print('number of words, 1% quantile: ' + str(nwq01))
nwq99 = text_data['nw'].quantile(.99)
print('number of words, 99% quantile: ' + str(nwq99))
# between() includes both end points, i.e. nwq01 <= nw <= nwq99.
text_data = text_data.loc[text_data['nw'].between(nwq01, nwq99)]

# Kept as an int, like first_T1PB and second_T1PB; converted to text only when printed.
third_T1PB = len(text_data.index)
print('Number of 10-Qs after dropping files that contain number of words beyond 1% and 99% threshold: ' + str(third_T1PB))

### Correct modal words labels
# Positional relabel of all 13 columns (12 loaded columns + tone).
# Assumes the loaded columns arrive in exactly this order -- TODO confirm against the CSV header.
text_data.columns = ['accnum', 'nw', 'nvocab', 'n_neg', 'n_pos', 'n_uctt', 'n_lit', 'n_cstr', \
                     'n_modal_strong', 'n_modal_moderate', 'n_modal_weak', 'n_negation', 'tone']
text_data.to_csv('..\\filings\\' + data_type_text + '_'+ obj_type + '.csv', index = False)

number of words, 1% quantile: 890.0
number of words, 99% quantile: 45537.659999999916
Number of 10-Qs after dropping files that contain number of words beyond 1% and 99% threshold: 564075


In [5]:
############# Table 1 (sheet T1PA): summary statistics for text data
# describe() yields one column per variable; transposing puts each variable on
# its own row (generally consistent with LM's summary statistics).
text_summary = text_data.describe()
T1PA = text_summary.T
T1PA.to_excel('..\\output\\Tables.xlsx', sheet_name = 'T1PA', float_format = "%.2f")

T1PA

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
nw,564075.0,9456.234724,7330.084401,890.0,4078.0,7423.0,12625.0,45537.0
nvocab,564075.0,1485.947454,684.937999,103.0,974.0,1396.0,1863.0,12326.0
n_neg,564075.0,148.089516,170.516014,0.0,42.0,90.0,185.0,2465.0
n_pos,564075.0,55.278101,56.92271,0.0,18.0,38.0,73.0,722.0
n_uctt,564075.0,126.774339,129.018217,0.0,43.0,89.0,164.0,1460.0
n_lit,564075.0,100.044047,152.590848,0.0,24.0,52.0,112.0,2851.0
n_cstr,564075.0,63.71263,64.953599,0.0,20.0,43.0,85.0,785.0
n_modal_strong,564075.0,28.149611,28.411316,0.0,10.0,20.0,37.0,767.0
n_modal_moderate,564075.0,30.2226,28.207681,0.0,11.0,22.0,40.0,322.0
n_modal_weak,564075.0,49.056952,74.929505,0.0,12.0,26.0,52.0,1038.0


In [6]:
########### Table 1 Panel B: Data screening process
# One (label, count) pair per screening step, printed in screening order.
screening_steps = [
    ('Number of 10-Qs in edgar from 1993 Q1 to 2020 Q1: ', first_T1PB),
    ('Number of 10-Qs parsed and downloaded: ', second_T1PB),
    ('Number of 10-Qs after dropping files that contain number of words beyond 1% and 99% threshold: ', third_T1PB),
]
for step_label, step_count in screening_steps:
    print(step_label + str(step_count))

Number of 10-Qs in edgar from 1993 Q1 to 2020 Q1: 594017
Number of 10-Qs parsed and downloaded: 575579
Number of 10-Qs after dropping files that contain number of words beyond 1% and 99% threshold: 564075


In [None]:
########## TODO: Mutate tone
########## TODO: Merge with CRSP data and look for a proxy for news
########## TODO: Look into how quarterly RET is defined
########## TODO: Work out how to separate tone