In [1]:
############### Import packages
import os, numpy as np, pandas as pd, time, glob, re, math, statsmodels.api as sm, patsy as ps, matplotlib.pyplot as plt
from tqdm import tqdm
from time import process_time
from datetime import datetime
from datetime import date
from openpyxl import load_workbook
from patsy import dmatrices
from matplotlib import rc

##########################################################
##################### parameter ##########################
##########################################################
obj_type = '8-K'
data_type_text = 'text_data'
data_type_id = 'id_data'

############### Set working directory to parent directory
if os.getcwd() != 'F:\\github\\narrative_conservatism\\code':
    os.chdir('F:\\github\\narrative_conservatism\\code')
    
############### Set pandas column printing constraint
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
########################################################################################
################## Merge EDGAR with CRSP_COMPUSTAT: RETURN #############################
########################################################################################

########## Load CRSP_COMP dataset
crsp_comp_cols = ['date_crsp', 'cusip', 'cik', 'RET', 'DRET', 'date_key', 'fyearq', 'sic', 'BN', 'NEWS', \
                  'atq', 'lag_atq', 'ceqq', 'lag_ceqq', 'lag_cshoq', 'lag_dlcq', 'lag_dlttq', 'lag_prccq']
crsp_comp = pd.read_csv(r'H:\data\github_big_files\crsp_comp_' + obj_type + '.csv', \
                        usecols = crsp_comp_cols, dtype = {'cik': str, 'cusip': str, 'date_key': str})

### Delete last two digits (.0) of crsp_comp['cik'] 
crsp_comp['cik'] = crsp_comp['cik'].astype(str).str[:-2]

### Sort CRSP_COMP by cik-date_key in order to match NEWS and EDGAR later
crsp_comp = crsp_comp.sort_values(by = ['cik', 'date_key'])
crsp_comp['date_key'] = pd.to_datetime(crsp_comp['date_key'], format='%Y%m%d')

########## Load EDGAR dataset
edgar_cols = ['cik', 'rp', 'fd', 'item8k', 'name', 'n8k', 'nw', 'nvocab', 'n_neg', 'n_pos', 'n_negation', 'tone']
edgar = pd.read_csv('..\\filings\\edgar_8-K.csv', usecols = edgar_cols, dtype = {'cik': str, 'date_key': str})

### Sort CRSP_COMP by cik-date_key in order to match NEWS and EDGAR later
edgar['date_key'] = edgar['fd']
edgar['date_key'] = pd.to_datetime(edgar['date_key'], format='%Y%m%d')
edgar = edgar.sort_values(by = ['cik', 'date_key'])

############## Left merge CRSP_COMP and EDGAR, key unique in both data sets
crsp_comp_edgar = pd.merge(crsp_comp, edgar, on = ['cik', 'date_key'], how = 'left', validate = '1:m')

del edgar, crsp_comp

############# restrict samples to good and bad news as defined as
# Firm-day's positive (negative) RET change is three times larger than the firm’s calander yearly average increase (decrease) in RET, 
# and 0 otherwise.
crsp_comp_edgar = crsp_comp_edgar[crsp_comp_edgar['NEWS']==1]
crsp_comp_edgar = crsp_comp_edgar.drop(['NEWS'], axis = 1) 

# ############# generate CR and NEG
# crsp_comp_edgar['lag1_RET'] = crsp_comp_edgar.groupby(['cik'])['RET'].shift(1)
# crsp_comp_edgar['leap1_RET'] = crsp_comp_edgar.groupby(['cik'])['RET'].shift(-1)
# crsp_comp_edgar['CR3'] = crsp_comp_edgar['RET'] + crsp_comp_edgar['leap1_RET'] + crsp_comp_edgar['lag1_RET']
# crsp_comp_edgar.drop(['lag1_RET', 'leap1_RET'], axis=1)

# ############# restrict samples to good and bad news as defined as
# # (a)
# # 3-day cummulative return is three times larger than the firm’s calander yearly average increase (decrease) in RET
# crsp_comp_edgar['NEG3'] = 0
# crsp_comp_edgar.loc[crsp_comp_edgar['CR3']<0, 'NEG3'] = 1

# crsp_comp_edgar['lag2_RET'] = crsp_comp_edgar.groupby(['cik'])['RET'].shift(2)
# crsp_comp_edgar['leap2_RET'] = crsp_comp_edgar.groupby(['cik'])['RET'].shift(-2)
# crsp_comp_edgar['CR5'] = crsp_comp_edgar['CR3'] + crsp_comp_edgar['leap2_RET'] + crsp_comp_edgar['lag2_RET']
# crsp_comp_edgar.drop(['lag2_RET', 'leap2_RET'], axis=1)

# crsp_comp_edgar['NEG5'] = 0
# crsp_comp_edgar.loc[crsp_comp_edgar['CR5']<0, 'NEG3'] = 1

# crsp_comp_edgar['lag3_RET'] = crsp_comp_edgar.groupby(['cik'])['RET'].shift(3)
# crsp_comp_edgar['leap3_RET'] = crsp_comp_edgar.groupby(['cik'])['RET'].shift(-3)
# crsp_comp_edgar['CR7'] = crsp_comp_edgar['CR5'] + crsp_comp_edgar['leap3_RET'] + crsp_comp_edgar['lag3_RET']
# crsp_comp_edgar.drop(['lag3_RET', 'leap3_RET'], axis=1)

# crsp_comp_edgar['NEG7'] = 0
# crsp_comp_edgar.loc[crsp_comp_edgar['CR7']<0, 'NEG3'] = 1

############ delete other firm-days without 8-K
crsp_comp_edgar = crsp_comp_edgar[crsp_comp_edgar['rp'].notnull()]

############## Rename SIC
crsp_comp_edgar = crsp_comp_edgar.rename(columns={'sic': 'SIC'})

############### Create item_8k count variable set and Load processed EDGAR_8-K

item_before2004 = ['1','2','3','4','5','6','7','8','9','10','11','12']
item_before2004 = ['item_' + item for item in item_before2004]
item_after2004 = ['1.01','1.02','1.03','1.04', \
                  '2.01','2.02','2.03','2.04','2.05','2.06', \
                  '3.01','3.02','3.03', \
                  '4.01','4.02',
                  '5.01','5.02','5.03','5.04','5.05','5.06','5.07','5.08', \
                  '6.01','6.02','6.03','6.04','6.05', \
                  '7.01', \
                  '8.01', \
                  '9.01']
item_after2004 = ['item_' + item for item in item_after2004]
item  = item_before2004 + item_after2004

edgar_cols = ['cik', 'rp'] + item
edgar = pd.read_csv(r'..\filings\edgar_' + obj_type + '.csv', usecols = edgar_cols, dtype={'cik': str})

############## Add financial variables from CRSP_COMP to CRSP_COMP_EDGAR by merging
crsp_comp_edgar = pd.merge(crsp_comp_edgar, edgar, on = ['cik', 'rp'], how = 'left', validate = '1:1')
del edgar

############## Create filing date year-month fixed effects identifier cmonth
crsp_comp_edgar['cmonth'] = crsp_comp_edgar['fd'].astype(str).str[:-2]

############## Create nitem as count of total number of items appeared in one firm-day
crsp_comp_edgar['nitem'] = crsp_comp_edgar.loc[:,item].sum(axis = 1)

############# Reorder crsp_comp_edgar columns
# 1st line: merge keys
# 2nd line: extra id info
# 3th line: financial raw data (lagged variables)
# 4th line: text variables
crsp_comp_edgar = crsp_comp_edgar[['cusip', 'date_crsp', 'cik', 'rp', \
                                   'name', 'SIC', 'fd', 'cmonth', 'fyearq', \
                                   'atq', 'lag_atq', 'ceqq', 'lag_ceqq', 'lag_cshoq', 'lag_dlcq', 'lag_dlttq', 'lag_prccq', 'RET', 'NEG', 'DRET', 'BN', \
                                   'n8k', 'nitem', 'nw', 'nvocab', 'n_neg', 'n_pos', 'n_negation', 'tone'] + item]

print('Number of obs. in CRSP_COMP_EDGAR: ' + str(crsp_comp_edgar.shape[0]))

KeyboardInterrupt: 

In [7]:
####################### Modify data type in ID_CRSP_COMP_TEXT
########### Define a function that changes pandas series data type to string
def columns_to_str (df, colnames):
    for col in colnames:
        df[col] = df[col].astype(str)
    return df

########### Apply columns_to_str to various identification variables
crsp_comp_edgar = columns_to_str(crsp_comp_edgar, ['cik', 'cusip', 'fyearq'])

########## Convert date variables to date format
crsp_comp_edgar['date_crsp'] = pd.to_datetime(crsp_comp_edgar['date_crsp'], format='%Y%m%d')
crsp_comp_edgar['fd'] = pd.to_datetime(crsp_comp_edgar['fd'], format='%Y%m%d')
crsp_comp_edgar['rp'] = pd.to_datetime(crsp_comp_edgar['rp'], format='%Y%m%d')

############## Drop obs. with missing SIC and convert SIC variables to integer
del_sic = crsp_comp_edgar[crsp_comp_edgar['SIC'].isnull()].shape[0]
crsp_comp_edgar = crsp_comp_edgar[crsp_comp_edgar['SIC'].notnull()]
print('Number of obs. with missing SIC: ' + str(del_sic))

crsp_comp_edgar['SIC'] = crsp_comp_edgar['SIC'].astype(int)

########### Inspect column data types
# print(crsp_comp_edgar.dtypes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http

Number of obs. with missing SIC: 3409


In [8]:
########################################################################################
############################### Variable Creation ######################################
########################################################################################

############################## Main Variables ##########################################
######## NW: natural log of 1 + total number of words in the document
crsp_comp_edgar['NW'] = np.log(1 + crsp_comp_edgar['nw'])

######## TONE: number of net positive words (n_pos - n_neg - n_negations) per 1000 total words
crsp_comp_edgar['TONE'] = crsp_comp_edgar['tone']*1000

######## TLAG: Time lag between the news release date (CRSP date) and document filing date (EDGAR filing date)
crsp_comp_edgar['TLAG'] = (crsp_comp_edgar['fd'] - crsp_comp_edgar['rp']).dt.days

### BN: An indicator of bad news, which takes 1 in a firm-day when the firm-day's negative RET change \
### is three times larger than the firm’s current calander-yearly average decrease in RET, and 0 if there is good news.

############## Drop obs. with [beginning-of-quarter] common share price less than $1
del_prccq = crsp_comp_edgar.loc[(crsp_comp_edgar['lag_prccq'] < 1) | (crsp_comp_edgar['lag_prccq'].isnull())].shape[0]
crsp_comp_edgar = crsp_comp_edgar.loc[crsp_comp_edgar['lag_prccq'] >= 1]
print('Number of obs. with beginning common share price less than $1: ' + str(del_prccq))

############## Drop obs. with [beginning-of-quarter] non-positive common shares outstanding
del_cshoq = crsp_comp_edgar.loc[(crsp_comp_edgar['lag_cshoq'] == 0) | (crsp_comp_edgar['lag_cshoq'].isnull())].shape[0]
crsp_comp_edgar = crsp_comp_edgar.loc[crsp_comp_edgar['lag_cshoq'] > 0]
print('Number of obs. with non-positive common shares outstanding: ' + str(del_cshoq))

############################## Control Variables #######################################
######## Size: Firm size, defined as the natural logarithm of market value of equity [at the beginning of the quarter] \
######## defined as [beginning-of-quarter] common share price (Compustat data item prccq) \
######## times [beginning-of-quarter] common shares outstanding (Compustat data item cshoq)
crsp_comp_edgar['SIZE'] = np.log(crsp_comp_edgar['lag_prccq']*crsp_comp_edgar['lag_cshoq'])

######## MTB: Market-to-book ratio, defined as [beginning-of-quarter] market value of equity \
######## defined as common share price (Compustat data item prccq) times common shares outstanding (Compustat data item cshoq) \
######## divided by [beginning-of-quarter] book value of equity (Compustat data item ceqq) 
crsp_comp_edgar['MTB'] = (crsp_comp_edgar['lag_prccq']*crsp_comp_edgar['lag_cshoq'])/crsp_comp_edgar['lag_ceqq']

######## LEV: Leverage, defined as [beginning-of-quarter] short term debt plus [beginning-of-quarter] long term debt \
######## (Compustat data item dlcq + Compustat data item dlttq) scaled by [beginning-of-quarter] total assets (Compustat data item atq)
crsp_comp_edgar['LEV'] = (crsp_comp_edgar['lag_dlcq'] + crsp_comp_edgar['lag_dlttq'])/crsp_comp_edgar['lag_atq']

# ######## AGE: log(1 + age from the first year the firm entered the CRSP dataset)
# id_crsp_comp_text['AGE'] = np.log(1 + id_crsp_comp_text['age'])

Number of obs. with beginning common share price less than $1: 2427
Number of obs. with non-positive common shares outstanding: 15


In [9]:
#######################################################################################
############################### Variable Screening #####################################
########################################################################################

############## Drop financial and utility firms (SIC codes between 6000 and 6999 and between 4900 and 4999, respectively)
del_fin = crsp_comp_edgar.loc[(crsp_comp_edgar['SIC'] >= 6000) & (crsp_comp_edgar['SIC'] <= 6999)].shape[0]
crsp_comp_edgar = crsp_comp_edgar.loc[(crsp_comp_edgar['SIC'] < 6000) | (crsp_comp_edgar['SIC'] > 6999)] # financial
del_ut = crsp_comp_edgar.loc[(crsp_comp_edgar['SIC'] >= 4900) & (crsp_comp_edgar['SIC'] <= 4999)].shape[0]
crsp_comp_edgar = crsp_comp_edgar.loc[(crsp_comp_edgar['SIC'] < 4900) | (crsp_comp_edgar['SIC'] > 4999)] # utility
print('number of obs. from utility and financial firms: ' + str(del_fin + del_ut))

############## Drop files (firm-quarter) that have missing SIC, SIZE, MTB, LEV, or with non-positive total assets or book value of equity
del_size = crsp_comp_edgar[crsp_comp_edgar['SIZE'].isnull()].shape[0]
crsp_comp_edgar = crsp_comp_edgar[crsp_comp_edgar['SIZE'].notnull()]
del_mtb = crsp_comp_edgar[crsp_comp_edgar['MTB'].isnull()].shape[0]
crsp_comp_edgar = crsp_comp_edgar[crsp_comp_edgar['MTB'].notnull()]
del_lev = crsp_comp_edgar[crsp_comp_edgar['LEV'].isnull()].shape[0]
crsp_comp_edgar = crsp_comp_edgar[crsp_comp_edgar['LEV'].notnull()]
del_atq = crsp_comp_edgar.loc[(crsp_comp_edgar['atq'] <= 0) | (crsp_comp_edgar['atq'].isnull())].shape[0]
crsp_comp_edgar = crsp_comp_edgar.loc[crsp_comp_edgar['atq'] > 0]
del_ceqq = crsp_comp_edgar.loc[(crsp_comp_edgar['ceqq'] <= 0) | (crsp_comp_edgar['ceqq'].isnull())].shape[0]
crsp_comp_edgar = crsp_comp_edgar.loc[crsp_comp_edgar['ceqq'] > 0]
print('number of firm-quarters with missing SIC, SIZE, MTB, LEV or non-positive (total assets or book value of equity or common shares outstanding), or lag_prcc < 1: ' \
      + str(del_sic + del_size + del_mtb + del_lev + del_atq + del_ceqq + del_prccq + del_cshoq))

########## Drop files (firm-quarter) that contain number of words less than 1% threshold (or larger than 99% threshold)
nwq01 = crsp_comp_edgar['nw'].quantile(.01)
print('number of words, 1% quantile: ' + str(nwq01))
del_word01 = crsp_comp_edgar.loc[crsp_comp_edgar['nw'] < nwq01].shape[0]
print('number of obs. that contain total words less than 1% threshold: ' + str(del_word01))
crsp_comp_edgar = crsp_comp_edgar.loc[crsp_comp_edgar['nw'] >= nwq01]

# nwq99 = crsp_comp_edgar['nw'].quantile(.99)
# print('number of words, 99% quantile: ' + str(nwq99))
# del_word99 = crsp_comp_edgar.loc[crsp_comp_edgar['nw'] > nwq99].shape[0]
# print('number of obs. that contain total words more than 99% threshold: ' + str(del_word99))
# crsp_comp_edgar = crsp_comp_edgar.loc[crsp_comp_edgar['nw'] <= nwq99]

########## Drop files (firm-quarter) that contain negative RLAG
# Rationale to drop negative TLAG: By construction, filings with filing date prior to news release date cannot be addressing the news. 
# ANTICIPATION is not purpose of the paper.
del_TLAG0 = crsp_comp_edgar[crsp_comp_edgar['TLAG'] < 0].shape[0]
crsp_comp_edgar = crsp_comp_edgar[crsp_comp_edgar['TLAG'] >= 0]

########## Drop files (firm-quarter) that contain larger than 99% TLAG
tlagq99 = crsp_comp_edgar['TLAG'].quantile(.99)
print('TLAG 99% quantile: ' + str(tlagq99))
del_TLAG99 = crsp_comp_edgar.loc[crsp_comp_edgar['TLAG'] > tlagq99].shape[0]
print('number of obs. that contain negative or larger than 99% TLAG: ' + str(del_TLAG99 + del_TLAG0))
crsp_comp_edgar = crsp_comp_edgar.loc[crsp_comp_edgar['TLAG'] <= tlagq99]

############## Winsorize SIZE, MTB, LEV
###### Define a function that winsorize a variable at 1% and 99% 
def winsorize (df, colnames):
    for col in colnames:
        varq01 = df[col].quantile(.01)
        varq99 = df[col].quantile(.99)
        df[col] = df[col].clip(varq01, varq99)
    return df

crsp_comp_edgar = winsorize(crsp_comp_edgar, ['SIZE', 'MTB', 'LEV'])

############## Inspect sample size after variable screening
print('Number of obs. after variable screening: ' + str(crsp_comp_edgar.shape[0]))

############## Manual correction for one observation (with nitem=0): https://www.sec.gov/Archives/edgar/data/1164727/000089109204004189/0000891092-04-004189-index.htm
crsp_comp_edgar.loc[crsp_comp_edgar['nitem'] == 0, 'item_5'] = 1
crsp_comp_edgar.loc[crsp_comp_edgar['nitem'] == 0, 'item_7'] = 1
crsp_comp_edgar.loc[crsp_comp_edgar['nitem'] == 0, 'nitem'] = 2

############# Save processed CRSP_COMP_EDGAR to .csv file
crsp_comp_edgar.to_csv('..\\filings\\crsp_comp_edgar_'+ obj_type + '_no_match.csv', index = 0)

number of obs. from utility and financial firms: 15201
number of firm-quarters with missing SIC, SIZE, MTB, LEV or non-positive (total assets or book value of equity or common shares outstanding), or lag_prcc < 1: 9111
number of words, 1% quantile: 131.0
number of obs. that contain total words less than 1% threshold: 476
TLAG 99% quantile: 16.0
number of obs. that contain negative or larger than 99% TLAG: 466
Number of obs. after variable screening: 47837
