In [2]:
############### Import packages
import os, numpy as np, pandas as pd, time, glob, re, math
from tqdm import tqdm
from time import process_time
from datetime import datetime
from datetime import date
from openpyxl import load_workbook

##########################################################
##################### parameter ##########################
##########################################################
obj_type = '10-Q'
data_type_text = 'text_data'
data_type_id = 'id_data'

############### Set working directory to parent directory
os.getcwd()
# os.chdir('F:\\github\\narrative_conservatism\\code')

############### Set pandas column printing constraint
pd.set_option('display.max_columns', None)

In [3]:
########################################################################################
############ Concatenate and prepare merge: ID_DATA and TEXT_DATA ######################
########################################################################################

############## Define a function to concatenate all csv files with file name that matches a certain pattern into one data frame
def concatenate (indir, file_name_match):
    os.chdir(indir)
    file_list = glob.glob(file_name_match)
    df_list = list()
    colnames = pd.read_csv(file_list[0], header = None).loc[0]
    
    for filename in file_list:
        # print(filename)
        df = pd.read_csv(filename, low_memory = False)
        df_list.append(df)

    df_concat = pd.concat(df_list, axis = 0)
    df_concat.columns = colnames
    return df_concat

############## Concatenate id_data and text_data files and create two data frames
id_data = concatenate('..\\filings', data_type_id + '_'+ obj_type + '_' + '*.csv')
text_data = concatenate('..\\filings', data_type_text + '_'+ obj_type + '_' + '*.csv')
############## Save id_data dataframe into local file id_data_10-Q.csv
id_data.to_csv('..\\filings\\' + data_type_id + '_'+ obj_type + '.csv', index = 0)

############## text_data modifications #####################
############## Calculate tone : tone = (n_pos - n_negation - n_neg)/nw
text_data['tone'] = (text_data['n_pos'] - text_data['n_negation'] - text_data['n_neg'])/text_data['nw']

############## Correct modal words labels in text_data
text_data.columns = ['accnum', 'nw', 'nvocab', 'n_neg', 'n_pos', 'n_uctt', 'n_lit', 'n_cstr', \
                     'n_modal_strong', 'n_modal_moderate', 'n_modal_weak', 'n_negation', 'tone']

############## Save text_data dataframe into local file text_data_10-Q.csv
text_data.to_csv('..\\filings\\' + data_type_text + '_'+ obj_type + '.csv', index = 0)

print('Number of ' + obj_type + ' in edgar from 1993 Q1 to 2020 Q1: ' + str(len(id_data.index)))
print('Number of ' + obj_type + ' parsed and downloaded: ' + str(len(text_data.index)))

Number of 10-Q in edgar from 1993 Q1 to 2020 Q1: 594017
Number of 10-Q parsed and downloaded: 575579


In [4]:
########################################################################################
############ Merge COMPUSTAT quarterly data with CRSP monthly data #####################
########################################################################################

########### Read compustat raw data files
comp_cols = ['gvkey', 'datadate', 'fyearq', 'fqtr', 'fyr', 'cusip', 'conm', 'curcdq', 'actq', \
             'atq', 'ceqq', 'cheq', 'cshoq', 'dlcq', 'dlttq', 'dpq', 'ibq', 'intanq', 'lctq', 'revtq', 'txditcq', 'exchg', \
             'cik', 'costat', 'prccq', 'addzip', 'incorp', 'sic', 'ipodate']
comp = pd.read_csv('..\\filings\\compustat.csv', usecols = comp_cols)

### Reorder compustat column
# 1st line: merge keys
# 2nd line: extra id info
# 3rd line: financial data
comp = comp[['cusip', 'cik', 'datadate', \
             'gvkey', 'conm', 'sic', 'incorp', 'addzip', 'fyearq', 'fqtr', 'fyr', 'ipodate', 'costat', 'curcdq', 'exchg', \
             'actq', 'atq', 'ceqq', 'cheq', 'cshoq', 'dlcq', 'dlttq', 'dpq', 'ibq', 'intanq', 'lctq', 'revtq', 'txditcq', 'prccq']]

########### Read crsp raw data files
crsp_cols = ['date', 'CUSIP', 'RET', 'vwretd']
crsp = pd.read_csv('..\\filings\\crsp.csv', usecols = crsp_cols)
crsp.columns = ['date', 'cusip', 'ret', 'vwretd']

### Delete the two 'day digits' of compustat and crsp filings' data date and create the date_key
comp['date_key'] = comp['datadate'].astype(str).str[:-2]
crsp['date_key'] = crsp['date'].astype(str).str[:-2]

### Delete the 9th digit of compustat filings' cusip, and filter filings that have 8-digits cusip after deletion 
comp['cusip'] = comp['cusip'].astype(str).str[:-1]
print('number of quarterly filings in Compustat: ' + str(len(comp.index)))
comp = comp.loc[comp['cusip'].str.len() == 8]
print('number of quarterly filings in Compustat after deleting non-9-digits cusips: ' + str(len(comp.index)))

### Delete CRSP raw rows that contains non-numeric returns ('B' and 'C'), fill NA with 0
print('number of monthly data in CRSP: ' + str(len(crsp.index)))
crsp = crsp[(crsp['ret'] != 'B') & (crsp['ret'] != 'C')]
crsp['ret'] = crsp['ret'].fillna(0)
print('number of monthly data in CRSP that contains only numeric returns: ' + str(len(crsp.index)))

### Mutate adjusted monthly returns and delete ret and vwretd
crsp = crsp.assign(adj_ret_m = crsp['ret'].astype(float) - crsp['vwretd'].astype(float))
crsp = crsp.drop(columns=['ret', 'vwretd'])
crsp

number of quarterly filings in Compustat: 1240141
number of quarterly filings in Compustat after deleting non-9-digits cusips: 1239632
number of monthly data in CRSP: 2477547
number of monthly data in CRSP that contains only numeric returns: 2427687


Unnamed: 0,date,cusip,date_key,adj_ret_m
0,19921130,36720410,199211,-0.057287
1,19921231,36720410,199212,-0.032898
2,19930129,36720410,199301,-0.012644
3,19930226,36720410,199302,0.012455
4,19930331,36720410,199303,-0.014063
...,...,...,...,...
2477542,20190830,88160R10,201908,-0.045951
2477543,20190930,88160R10,201909,0.051615
2477544,20191031,88160R10,201910,0.288162
2477545,20191129,88160R10,201911,0.012724


In [5]:
##################### Left merge CRSP and Compstat, key not unique in Compustat
crsp_comp = pd.merge(crsp, comp, on = ['cusip', 'date_key'], how='left', validate = '1:m')
crsp_comp

Unnamed: 0,date,cusip,date_key,adj_ret_m,cik,datadate,gvkey,conm,sic,incorp,addzip,fyearq,fqtr,fyr,ipodate,costat,curcdq,exchg,actq,atq,ceqq,cheq,cshoq,dlcq,dlttq,dpq,ibq,intanq,lctq,revtq,txditcq,prccq
0,19921130,36720410,199211,-0.057287,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,19921231,36720410,199212,-0.032898,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,19930129,36720410,199301,-0.012644,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,19930226,36720410,199302,0.012455,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,19930331,36720410,199303,-0.014063,43350.0,19930331.0,12994.0,GAS NATURAL INC,4924.0,OH,44114,1993.0,3.0,6.0,,I,USD,12.0,7.992,28.055,9.147,1.548,1.085,3.61,8.55,0.347,1.260,0.0,7.66,11.718,2.493,14.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2428674,20190830,88160R10,201908,-0.045951,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2428675,20190930,88160R10,201909,0.051615,1318605.0,20190930.0,184996.0,TESLA INC,3711.0,DE,94304,2019.0,3.0,12.0,20100629.0,A,USD,14.0,10940.000,32795.000,6040.000,5571.000,180.000,2253.00,12383.00,530.851,143.469,537.0,10146.00,6302.860,0.000,240.87
2428676,20191031,88160R10,201910,0.288162,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2428677,20191129,88160R10,201911,0.012724,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [6]:
########## Aggregate the monthly returns in CRSP to quarterly returns by summing up 3-months returns in a quarter
adj_ret = list()
gvkey = crsp_comp['gvkey'].values.tolist()
adj_ret_m = crsp_comp['adj_ret_m'].values.tolist()

for index, value in enumerate(gvkey):
    ### requires a quarter-end flag to be non-zero, and the quarter-end flag of previous two months to be zero (avoid time-slot mismatch)
    if math.isnan(value) == False and math.isnan(gvkey[index-1]) == True and math.isnan(gvkey[index-2]) == True:
        adj_ret_i = adj_ret_m[index] + adj_ret_m[index-1] + adj_ret_m[index-2]
    else:
        adj_ret_i = float('NaN')
    adj_ret.append(adj_ret_i)

crsp_comp = crsp_comp.assign(RET = adj_ret)

########## Delete adj_ret_m column, and any rows that contains missing values of gvkey or RET
crsp_comp = crsp_comp.drop(columns=['adj_ret_m'])
crsp_comp = crsp_comp[crsp_comp['gvkey'].notnull()]
crsp_comp = crsp_comp[crsp_comp['RET'].notnull()]
crsp_comp['datadate'] = crsp_comp['datadate'].astype(str).str[:-2]

############## Save text_data dataframe into local file text_data_10-Q.csv
crsp_comp.to_csv('..\\filings\\crsp_comp_' + obj_type + '.csv', index = 0)

In [7]:
################# Inspect crsp_comp 
first_fig1 = len(crsp_comp.index)
print('number of quarterly fillings after merging CRSP and Compustat: ' + str(first_fig1))
crsp_comp

number of quarterly fillings after merging CRSP and Compustat: 707333


Unnamed: 0,date,cusip,date_key,cik,datadate,gvkey,conm,sic,incorp,addzip,fyearq,fqtr,fyr,ipodate,costat,curcdq,exchg,actq,atq,ceqq,cheq,cshoq,dlcq,dlttq,dpq,ibq,intanq,lctq,revtq,txditcq,prccq,RET
4,19930331,36720410,199303,43350.0,19930331,12994.0,GAS NATURAL INC,4924.0,OH,44114,1993.0,3.0,6.0,,I,USD,12.0,7.992,28.055,9.147,1.548,1.085,3.610,8.550,0.347,1.260,0.000,7.660,11.718,2.493,14.25,-0.014252
7,19930630,36720410,199306,43350.0,19930630,12994.0,GAS NATURAL INC,4924.0,OH,44114,1993.0,4.0,6.0,,I,USD,12.0,6.761,27.434,8.733,1.509,1.090,1.663,11.050,0.390,-0.296,0.000,4.881,4.058,2.763,16.50,0.154818
10,19930930,36720410,199309,43350.0,19930930,12994.0,GAS NATURAL INC,4924.0,OH,44114,1994.0,1.0,6.0,,I,USD,12.0,7.452,29.751,8.326,0.975,1.091,4.323,10.750,0.362,-0.320,0.000,6.753,3.978,3.309,16.75,-0.011296
13,19931231,36720410,199312,43350.0,19931231,12994.0,GAS NATURAL INC,4924.0,OH,44114,1994.0,2.0,6.0,,I,USD,12.0,10.962,33.954,8.757,0.440,1.091,4.423,10.768,0.366,0.621,0.000,10.299,9.172,3.323,18.50,0.086561
16,19940331,36720410,199403,43350.0,19940331,12994.0,GAS NATURAL INC,4924.0,OH,44114,1994.0,3.0,6.0,,I,USD,12.0,9.107,32.087,9.656,1.337,1.091,2.623,10.763,0.380,1.090,0.000,7.651,9.768,3.197,17.25,-0.013350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2428666,20181231,88160R10,201812,1318605.0,20181231,184996.0,TESLA INC,3711.0,DE,94304,2018.0,4.0,12.0,20100629.0,A,USD,14.0,8306.308,29739.614,4923.243,3878.169,172.603,2629.460,9454.055,496.737,139.483,350.651,9992.136,7225.873,0.000,332.80,0.407883
2428669,20190329,88160R10,201903,1318605.0,20190331,184996.0,TESLA INC,3711.0,DE,94304,2019.0,1.0,12.0,20100629.0,A,USD,14.0,7677.822,28912.524,4605.596,2329.119,173.682,1914.073,10834.402,467.577,-702.135,347.880,9242.800,4541.464,0.000,279.86,-0.294656
2428672,20190628,88160R10,201906,1318605.0,20190630,184996.0,TESLA INC,3711.0,DE,94304,2019.0,2.0,12.0,20100629.0,A,USD,14.0,10181.952,31872.597,5715.393,5082.746,179.118,2011.177,12309.747,578.572,-408.334,480.833,9588.773,6349.676,0.000,223.46,-0.208027
2428675,20190930,88160R10,201909,1318605.0,20190930,184996.0,TESLA INC,3711.0,DE,94304,2019.0,3.0,12.0,20100629.0,A,USD,14.0,10940.000,32795.000,6040.000,5571.000,180.000,2253.000,12383.000,530.851,143.469,537.000,10146.000,6302.860,0.000,240.87,0.075005


In [8]:
########################################################################################
######################## Merge ID_DATA with CRSP_COMPUSTAT #############################
########################################################################################

# id_data = concatenate('..\\filings', data_type_id + '_'+ obj_type + '_' + '*.csv')

############## prepare merge: ID_DATA
id_data['fd'] = id_data['fd'].str.replace('-', '')
id_data['rp'] = id_data['rp'].str.replace('-', '')
id_data['date_key'] = id_data['rp'].astype(str).str[:-2]

In [9]:
############## Left merge ID_DATA and CRSP_COMP, key not unique in both data sets
id_crsp_comp = pd.merge(id_data, crsp_comp, on = ['cik', 'date_key'], how = 'inner', validate = 'm:m')

### Delete duplicated columns in ID_DATA and CRSP_COMP, duplicated rows in accnum (edgar) and [cik * date] (compustat), and rename sic, date_key, date and datadate
# id_crsp_comp = id_crsp_comp.drop(columns=['conm', 'sic_x', 'bazip', 'file_type', 'fye', 'curcdq'])
id_crsp_comp = id_crsp_comp[id_crsp_comp.duplicated('accnum') == False]
id_crsp_comp = id_crsp_comp[id_crsp_comp.duplicated(subset=['cik', 'rp']) == False]
id_crsp_comp = id_crsp_comp.rename(columns={'sic_y': 'SIC', 'date_key': 'cquarter', 'date': 'date_crsp', 'datadate': 'date_comp'})

### Reorder id_crsp_comp columns
# 1st line: merge keys
# 2nd line: extra id info
# 3rd line: financial data
id_crsp_comp = id_crsp_comp[['cusip', 'cik', 'rp', 'accnum', \
'name', 'gvkey', 'SIC', 'fd', 'date_crsp', 'date_comp', 'cquarter', 'fyearq', 'fqtr', 'fyr', 'ipodate', 'incorp', 'state', 'addzip', 'costat', 'exchg', \
'actq', 'atq', 'ceqq', 'cheq', 'cshoq', 'dlcq', 'dlttq', 'dpq', 'ibq', 'intanq', 'lctq', 'revtq', 'txditcq', 'prccq', 'RET']]

In [10]:
################## Inspect id_crsp_comp 
second_fig1 = len(id_crsp_comp.index)
print('number of observations after merging with edgar data: ' + str(second_fig1))

id_crsp_comp

number of observations after merging with edgar data: 303843


Unnamed: 0,cusip,cik,rp,accnum,name,gvkey,SIC,fd,date_crsp,date_comp,cquarter,fyearq,fqtr,fyr,ipodate,incorp,state,addzip,costat,exchg,actq,atq,ceqq,cheq,cshoq,dlcq,dlttq,dpq,ibq,intanq,lctq,revtq,txditcq,prccq,RET
0,54626810,60512,19930630,0000060512-94-000005,LOUISIANA LAND & EXPLORATION CO,6819.0,1311.0,19930813,19930630,19930630,199306,1993.0,2.0,12.0,,MD,MD,70112,I,11.0,193.800,1278.000,424.200,64.300,28.729,86.500,356.300,27.300,5.600,,208.200,189.300,136.100,42.5000,-0.059108
1,88579Y10,66740,19930630,0000066740-94-000015,MINNESOTA MINING & MANUFACTURING CO,7435.0,2670.0,19930813,19930630,19930630,199306,1993.0,2.0,12.0,19831230.0,DE,DE,55144,A,11.0,6382.000,12145.000,6590.000,650.000,216.975,829.000,679.000,264.000,331.000,,3365.000,3540.000,0.000,108.0000,-0.014779
2,08750910,11860,19930930,0000011860-94-000005,BETHLEHEM STEEL CORP /DE/,2189.0,3312.0,19931112,19930930,19930930,199309,1993.0,3.0,12.0,,DE,DE,18016-7699,I,13.0,1074.900,5168.098,580.800,232.500,91.307,100.300,728.500,69.100,30.700,,908.400,1055.300,0.000,14.3750,-0.270146
3,54626810,60512,19930930,0000060512-94-000007,LOUISIANA LAND & EXPLORATION CO,6819.0,1311.0,19931110,19930930,19930930,199309,1993.0,3.0,12.0,,MD,MD,70112,I,11.0,180.200,1662.300,417.100,9.400,33.137,0.000,747.700,28.200,-1.800,,195.100,187.900,124.800,44.6250,0.025969
4,88579Y10,66740,19930930,0000066740-94-000016,MINNESOTA MINING & MANUFACTURING CO,7435.0,2670.0,19931112,19930930,19930930,199309,1993.0,3.0,12.0,19831230.0,DE,DE,55144,A,11.0,6445.000,12229.000,6600.000,665.000,215.791,796.000,682.000,262.000,316.000,,3404.000,3481.000,0.000,102.8750,-0.076728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307477,87182910,96021,20191228,0000096021-20-000017,SYSCO CORP,10247.0,5140.0,20200205,20191231,20191231,201912,2020.0,2.0,6.0,,DE,DE,77077,A,11.0,8661.610,19372.034,2527.526,539.625,508.843,897.619,8654.524,185.011,383.410,4879.128,6931.968,15025.042,142.301,85.5400,-0.001390
307478,87840910,96699,20191228,0001171843-20-000876,TECHNICAL COMMUNICATIONS CORP,10364.0,3663.0,20200211,20191231,20191231,201912,2020.0,1.0,9.0,,MA,MA,01742-2892,A,14.0,2.146,2.847,1.752,0.748,1.850,0.148,0.521,0.006,-0.480,0.000,0.574,0.666,0.000,5.0500,1.266332
307479,86737U10,96793,20191231,0001564590-20-004619,SUNLINK HEALTH SYSTEMS INC,10380.0,5912.0,20200213,20191231,20191231,201912,2020.0,2.0,6.0,,OH,OH,30339,A,12.0,14.384,22.823,14.762,4.607,6.983,0.782,0.851,0.350,0.203,1.295,6.440,12.805,0.000,1.0700,-0.135859
307480,87288520,98338,20191130,0001213900-20-000888,TSR INC,10305.0,7371.0,20200113,20191129,20191130,201911,2019.0,2.0,5.0,,DE,DE,11788,A,14.0,10.365,11.826,6.287,2.988,1.962,0.252,0.271,0.002,0.061,0.000,5.241,15.233,0.000,3.2001,-0.315755


In [11]:
########################################################################################
###################### Merge TEXT_DATA with ID_CRSP_COMPUSTAT ##########################
########################################################################################

############## Left merge ID_CRSP_COMP and TEXT_DATA, key unique in both data sets
id_crsp_comp_text = pd.merge(id_crsp_comp, text_data, on = ['accnum'], how = 'inner', validate = '1:1')

############## Save id_crsp_comp_text data frame as it is before variable creation and sample screening
# id_crsp_comp_text.to_csv('..\\filings\\id_crsp_comp_text_' + obj_type + '.csv', index = 0)

In [12]:
############### Inspect if firm-quarter key is unique : YES, key is unique
# print(id_crsp_comp_text[id_crsp_comp_text.duplicated('accnum')])
# print(id_crsp_comp_text[id_crsp_comp_text.duplicated(subset=['cik', 'rp'])])

In [13]:
####################### Modify data type in ID_CRSP_COMP_TEXT
########### Define a function that changes pandas series data type to string
def columns_to_str (df, colnames):
    for col in colnames:
        df[col] = df[col].astype(str)
    return df

########### Apply columns_to_str to various identification variables
id_crsp_comp_text = columns_to_str(id_crsp_comp_text, ['cik', 'gvkey', 'exchg', 'fyearq', 'fqtr', 'fyr'])

########## Convert date variables to date format
id_crsp_comp_text['fd'] = pd.to_datetime(id_crsp_comp_text['fd'])
id_crsp_comp_text['rp'] = pd.to_datetime(id_crsp_comp_text['rp'])
id_crsp_comp_text['date_crsp'] = id_crsp_comp_text['date_crsp'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d'))
id_crsp_comp_text['date_comp'] = pd.to_datetime(id_crsp_comp_text['date_comp'])
id_crsp_comp_text['ipodate'] = pd.to_datetime(id_crsp_comp_text['ipodate'])

########## Convert SIC variables to integer
id_crsp_comp_text['SIC'] = id_crsp_comp_text['SIC'].astype(int)

########### Inspect column data types
# print(id_crsp_comp_text.dtypes)

In [14]:
########################################################################################
############################### Variable Creation ######################################
########################################################################################

############################## Main Variables ##########################################
######## NW: natural log of total number of words in the document
id_crsp_comp_text['NW'] = np.log(id_crsp_comp_text['nw'])
######## TONE: number of net positive words (n_pos - n_neg - n_negations) per 1000 total words
id_crsp_comp_text['TONE'] = id_crsp_comp_text['tone']*1000
######## TLAG: Time lag between the news release date (CRSP date) and document filing date (EDGAR filing date)
id_crsp_comp_text['TLAG'] = (id_crsp_comp_text['fd'] - id_crsp_comp_text['date_crsp']).dt.days
######## NEG: An indicator variable takes the value of 1 when market-adjusted stock return (RET) is negative and is 0 otherwise
id_crsp_comp_text['NEG'] = 0 
id_crsp_comp_text['NEG'][id_crsp_comp_text['RET'] < 0] = 1

############################## Control Variables #######################################
######## Size: Firm size, defined as the natural logarithm of market value of equity [at the beginning of the quarter] \
######## defined as [beginning-of-quarter] common share price (Compustat data item prccq) \
######## times [beginning-of-quarter] common shares outstanding (Compustat data item cshoq)
id_crsp_comp_text['SIZE'] = np.log(id_crsp_comp_text['prccq']*id_crsp_comp_text['cshoq'])
######## MTB: Market-to-book ratio, defined as [beginning-of-quarter] market value of equity \
######## defined as common share price (Compustat data item prccq) times common shares outstanding (Compustat data item cshoq) \
######## divided by [beginning-of-quarter] book value of equity (Compustat data item ceqq) 
id_crsp_comp_text['MTB'] = (id_crsp_comp_text['prccq']*id_crsp_comp_text['cshoq'])/id_crsp_comp_text['ceqq']
######## LEV: Leverage, defined as [beginning-of-quarter] short term debt plus [beginning-of-quarter] long term debt \
######## (Compustat data item dlcq + Compustat data item dlttq) scaled by [beginning-of-quarter] total assets (Compustat data item atq)
id_crsp_comp_text['LEV'] = (id_crsp_comp_text['dlcq'] + id_crsp_comp_text['dlttq'])/id_crsp_comp_text['atq']

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
########################################################################################
############################### Variable Screening #####################################
########################################################################################

############## Drop financial and utility firms (SIC codes between 6000 and 6999 and between 4900 and 4999, respectively)
id_crsp_comp_text = id_crsp_comp_text.loc[(id_crsp_comp_text['SIC'] < 6000) | (id_crsp_comp_text['SIC'] > 6999)] # financial
id_crsp_comp_text = id_crsp_comp_text.loc[(id_crsp_comp_text['SIC'] < 4900) | (id_crsp_comp_text['SIC'] > 4999)] # utility

## Change SIC back to str
id_crsp_comp_text['SIC'] = id_crsp_comp_text['SIC'].astype(str)

########## Drop files (firm-quarter) that contain number of words less than 1% threshold, with negative TLAG.  #### And TLAG larger than 99%?
# Rationale to drop negative TLAG: By construction, filings with filing date prior to news release date cannot be addressing the news. 
# ANTICIPATION is not purpose of the paper.
nwq01 = id_crsp_comp_text['nw'].quantile(.01)
print('number of words, 1% quantile: ' + str(nwq01))
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['nw'] >= nwq01]

id_crsp_comp_text = id_crsp_comp_text[id_crsp_comp_text['TLAG'] >= 0]

# tlagq99 = id_crsp_comp_text['TLAG'].quantile(.99)
# print('TLAG 99% quantile: ' + str(tlagq99))
# id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['TLAG'] <= tlagq99]

############## Drop files (firm-quarter) that have missing SIZE, MTB, LEV, or with non-positive total assets or book value of equity, \
############## or with common share price less than $1
id_crsp_comp_text = id_crsp_comp_text[id_crsp_comp_text['SIZE'].isnull() == False]
id_crsp_comp_text = id_crsp_comp_text[id_crsp_comp_text['MTB'].isnull() == False]
id_crsp_comp_text = id_crsp_comp_text[id_crsp_comp_text['LEV'].isnull() == False]
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['atq'] > 0]
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['ceqq'] > 0]
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['prccq'] >= 1]

############## Winsorize SIZE, MTB, LEV
###### Define a function that winsorize a variable at 1% and 99% 
def winsorize (df, colnames):
    for col in colnames:
        varq01 = df[col].quantile(.01)
        varq99 = df[col].quantile(.99)
        df[col] = df[col].clip(varq01, varq99)
    return df

id_crsp_comp_text = winsorize(id_crsp_comp_text, ['SIZE', 'MTB', 'LEV'])

############## Inspect sample size after variable screening
third_fig1 = str(len(id_crsp_comp_text.index))
print('Number of firm-quarters after variable screening: ' + str(third_fig1))

############## Inspect sample size after variable screening
id_crsp_comp_text.to_csv('..\\filings\\id_crsp_comp_text_' + obj_type + '.csv', index = 0)

number of words, 1% quantile: 1157.0
Number of firm-quarters after variable screening: 190926


In [23]:
########################################################################################
############### Table 1: Summary Statistics and Correlation Matrix #####################
########################################################################################

############# Table 1 Panel A: Summary statistics for selected variables
######### Variable groups:
# DEPENDENT: textual variables, generally consistent with LM's summary statistics
# INDEPENDENT: news variables
# control variables
selected_vars = id_crsp_comp_text[['NW','nw', 'TONE','TLAG', 'n_neg', 'n_pos', 'n_negation', 'nvocab', \
                                   'RET', 'NEG', \
                                   'SIZE', 'MTB', 'LEV' \
                                 ]]

T1PA = selected_vars.describe().transpose() 

############# Summary statistics for all raw and processed variables
full_summary = id_crsp_comp_text.describe().transpose()

############# Save T1PA
table_path = '..\\output\\Tables.xlsx'
if os.path.exists(table_path) == True:
    book = load_workbook(table_path)
    writer = pd.ExcelWriter(table_path, engine = 'openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    T1PA.to_excel(writer, sheet_name='T1PA_raw', float_format="%.2f")

    writer.save()
    writer.close()
    
else:
    T1PA.to_excel(table_path, sheet_name='T1PA_raw', float_format="%.2f")

T1PA

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
NW,190926.0,9.003971,0.784468,7.053586,8.465057,9.073604,9.544811,13.490001
nw,190926.0,10957.889219,10063.73551,1157.0,4746.0,8722.0,13972.0,722159.0
TONE,190926.0,-8.569267,6.810352,-64.54289,-12.484366,-7.467071,-3.759398,22.28739
TLAG,190926.0,40.35883,18.190442,0.0,36.0,41.0,45.0,4069.0
n_neg,190926.0,184.408064,237.22286,0.0,48.0,108.0,220.0,9603.0
n_pos,190926.0,69.471434,76.475261,0.0,22.0,48.0,88.0,2828.0
n_negation,190926.0,4.84649,10.213958,0.0,0.0,1.0,5.0,285.0
nvocab,190926.0,1630.270031,784.84231,275.0,1089.0,1525.0,1995.0,13330.0
RET,190926.0,0.018267,0.301876,-1.578704,-0.127016,0.000257,0.132224,18.312252
NEG,190926.0,0.49934,0.500001,0.0,0.0,0.0,1.0,1.0


In [20]:
# full_summary

In [57]:
############# Table 1 Panel B: Correlation matrix for selected variables
######### pearson correlation
T1PB_pearson = selected_vars.corr(method='pearson')

# T1PB_pearson

Unnamed: 0,NW,nw,TONE,TLAG,n_neg,n_pos,n_negation,nvocab,RET,NEG,SIZE,MTB,LEV
NW,1.0,0.81772,-0.446541,-0.05124,0.737747,0.757811,0.460927,0.888264,-0.017706,0.008663,0.327615,0.098998,0.075658
nw,0.81772,1.0,-0.377635,-0.016631,0.905377,0.863815,0.498125,0.870054,-0.015039,0.009846,0.232475,0.089438,0.080859
TONE,-0.446541,-0.377635,1.0,-0.035789,-0.606668,-0.348897,-0.388,-0.468451,0.014181,-0.017202,-0.066448,-0.031748,0.056896
TLAG,-0.05124,-0.016631,-0.035789,1.0,-0.000765,-0.037206,0.000305,-0.006326,-0.015928,0.022416,-0.146794,-0.017116,-0.006105
n_neg,0.737747,0.905377,-0.606668,-0.000765,1.0,0.874465,0.595762,0.813621,-0.011989,0.008755,0.198726,0.099327,0.007866
n_pos,0.757811,0.863815,-0.348897,-0.037206,0.874465,1.0,0.645316,0.781423,-0.006257,0.005046,0.232675,0.129686,-0.003045
n_negation,0.460927,0.498125,-0.388,0.000305,0.595762,0.645316,1.0,0.50027,-0.001351,0.007661,0.059091,0.110528,-0.082404
nvocab,0.888264,0.870054,-0.468451,-0.006326,0.813621,0.781423,0.50027,1.0,-0.007932,0.007453,0.236813,0.097391,0.026878
RET,-0.017706,-0.015039,0.014181,-0.015928,-0.011989,-0.006257,-0.001351,-0.007932,1.0,-0.636247,0.04749,0.145642,-0.018847
NEG,0.008663,0.009846,-0.017202,0.022416,0.008755,0.005046,0.007661,0.007453,-0.636247,1.0,-0.0969,-0.100578,0.006635


In [58]:
######### spearman correlation
T1PB_spearman = selected_vars.corr(method='spearman')

# T1PB_spearman

Unnamed: 0,NW,nw,TONE,TLAG,n_neg,n_pos,n_negation,nvocab,RET,NEG,SIZE,MTB,LEV
NW,1.0,1.0,-0.456268,-0.283161,0.92276,0.926495,0.557829,0.941156,-0.016249,0.009986,0.339723,0.120104,0.065951
nw,1.0,1.0,-0.456268,-0.283161,0.92276,0.926495,0.557829,0.941156,-0.016249,0.009986,0.339723,0.120104,0.065951
TONE,-0.456268,-0.456268,1.0,0.010636,-0.723991,-0.370403,-0.422142,-0.501422,0.026635,-0.019752,-0.051104,0.011538,0.057433
TLAG,-0.283161,-0.283161,0.010636,1.0,-0.227952,-0.298559,-0.079695,-0.216435,-0.039572,0.037491,-0.426779,-0.101822,-0.030623
n_neg,0.92276,0.92276,-0.723991,-0.227952,1.0,0.881211,0.583516,0.905143,-0.020083,0.012983,0.290204,0.094524,0.014305
n_pos,0.926495,0.926495,-0.370403,-0.298559,0.881211,1.0,0.582217,0.886088,-0.007294,0.00199,0.351394,0.153152,0.013638
n_negation,0.557829,0.557829,-0.422142,-0.079695,0.583516,0.582217,1.0,0.573631,-0.016331,0.012926,0.083644,0.105468,-0.097449
nvocab,0.941156,0.941156,-0.501422,-0.216435,0.905143,0.886088,0.573631,1.0,-0.012476,0.006795,0.288782,0.120466,0.015728
RET,-0.016249,-0.016249,0.026635,-0.039572,-0.020083,-0.007294,-0.016331,-0.012476,1.0,-0.866025,0.107467,0.193086,-0.009499
NEG,0.009986,0.009986,-0.019752,0.037491,0.012983,0.00199,0.012926,0.006795,-0.866025,1.0,-0.100816,-0.161871,0.003967


In [61]:
######### Combine two correlation matrices. right-up matrix: pearson; left-down matrix: spearman 
for row in list(range(0, len(T1PB_spearman.index))):
    T1PB_spearman.iloc[row, row+1:] = T1PB_pearson.iloc[row, row+1:]
    
##### Save T1PB
table_path = '..\\output\\Tables.xlsx'
if os.path.exists(table_path) == True:
    book = load_workbook(table_path)
    writer = pd.ExcelWriter(table_path, engine = 'openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    T1PB_spearman.to_excel(writer, sheet_name='T1PB', float_format="%.3f")

    writer.save()
    writer.close()
    
else:
    T1PB_spearman.to_excel(table_path, sheet_name='T1PB', float_format="%.3f")

T1PB_spearman

Unnamed: 0,NW,nw,TONE,TLAG,n_neg,n_pos,n_negation,nvocab,RET,NEG,SIZE,MTB,LEV
NW,1.0,0.81772,-0.446541,-0.05124,0.737747,0.757811,0.460927,0.888264,-0.017706,0.008663,0.327615,0.098998,0.075658
nw,1.0,1.0,-0.377635,-0.016631,0.905377,0.863815,0.498125,0.870054,-0.015039,0.009846,0.232475,0.089438,0.080859
TONE,-0.456268,-0.456268,1.0,-0.035789,-0.606668,-0.348897,-0.388,-0.468451,0.014181,-0.017202,-0.066448,-0.031748,0.056896
TLAG,-0.283161,-0.283161,0.010636,1.0,-0.000765,-0.037206,0.000305,-0.006326,-0.015928,0.022416,-0.146794,-0.017116,-0.006105
n_neg,0.92276,0.92276,-0.723991,-0.227952,1.0,0.874465,0.595762,0.813621,-0.011989,0.008755,0.198726,0.099327,0.007866
n_pos,0.926495,0.926495,-0.370403,-0.298559,0.881211,1.0,0.645316,0.781423,-0.006257,0.005046,0.232675,0.129686,-0.003045
n_negation,0.557829,0.557829,-0.422142,-0.079695,0.583516,0.582217,1.0,0.50027,-0.001351,0.007661,0.059091,0.110528,-0.082404
nvocab,0.941156,0.941156,-0.501422,-0.216435,0.905143,0.886088,0.573631,1.0,-0.007932,0.007453,0.236813,0.097391,0.026878
RET,-0.016249,-0.016249,0.026635,-0.039572,-0.020083,-0.007294,-0.016331,-0.012476,1.0,-0.636247,0.04749,0.145642,-0.018847
NEG,0.009986,0.009986,-0.019752,0.037491,0.012983,0.00199,0.012926,0.006795,-0.866025,1.0,-0.0969,-0.100578,0.006635


In [21]:
########### Figure 1: Data screening process
print('Number of firm-quarters from Compustat/CRSP merged data, 1993Q1 to 2019Q4: ' + str(first_fig1))
print('Number of firm-quarters after merging edgar data: ' + str(second_fig1))
print('Number of firm-quarters after data screening: ' + str(third_fig1))

Number of firm-quarters from Compustat/CRSP merged data, 1993Q1 to 2019Q4: 707333
Number of firm-quarters after merging edgar data: 303843
Number of firm-quarters after data screening: 190926
