In [1]:
############### Import packages
import os, numpy as np, pandas as pd, time, glob, re, math
from tqdm import tqdm
from time import process_time
from datetime import datetime
from datetime import date
from openpyxl import load_workbook

##########################################################
##################### parameter ##########################
##########################################################
obj_type = '10-Q'
data_type_text = 'text_data'
data_type_id = 'id_data'

############### Set working directory to parent directory
os.getcwd()
# os.chdir('F:\\github\\narrative_conservatism\\code')

############### Set pandas column printing constraint
pd.set_option('display.max_columns', None)

In [4]:
########################################################################################
############ Concatenate and prepare merge: ID_DATA and TEXT_DATA ######################
########################################################################################

############## Define a function to concatenate all csv files with file name that matches a certain pattern into one data frame
def concatenate (indir, file_name_match):
    os.chdir(indir)
    file_list = glob.glob(file_name_match)
    df_list = list()
    colnames = pd.read_csv(file_list[0], header = None).loc[0]
    
    for filename in file_list:
        # print(filename)
        df = pd.read_csv(filename, low_memory = False)
        df_list.append(df)

    df_concat = pd.concat(df_list, axis = 0)
    df_concat.columns = colnames
    return df_concat

############## Concatenate id_data and text_data files and create two data frames
id_data = concatenate('..\\filings', data_type_id + '_'+ obj_type + '_' + '*.csv')
text_data = concatenate('..\\filings', data_type_text + '_'+ obj_type + '_' + '*.csv')
############## Save id_data dataframe into local file id_data_10-Q.csv
id_data.to_csv('..\\filings\\' + data_type_id + '_'+ obj_type + '.csv', index = 0)

############## text_data modifications #####################
############## Calculate tone : tone = (n_pos - n_negation - n_neg)/nw
text_data['tone'] = (text_data['n_pos'] - text_data['n_negation'] - text_data['n_neg'])/text_data['nw']

############## Correct modal words labels in text_data
text_data.columns = ['accnum', 'nw', 'nvocab', 'n_neg', 'n_pos', 'n_uctt', 'n_lit', 'n_cstr', \
                     'n_modal_strong', 'n_modal_moderate', 'n_modal_weak', 'n_negation', 'tone']

############## Save text_data dataframe into local file text_data_10-Q.csv
text_data.to_csv('..\\filings\\' + data_type_text + '_'+ obj_type + '.csv', index = 0)

print('Number of ' + obj_type + ' in edgar from 1993 Q1 to 2020 Q1: ' + str(len(id_data.index)))
print('Number of ' + obj_type + ' parsed and downloaded: ' + str(len(text_data.index)))

Number of 10-Q in edgar from 1993 Q1 to 2020 Q1: 594017
Number of 10-Q parsed and downloaded: 575579


In [5]:
id_data

Unnamed: 0,accnum,file_type,cik,name,sic,fd,rp,fye,item8k,bazip,state
0,0000060512-94-000005,10-Q,60512,LOUISIANA LAND & EXPLORATION CO,1311.0,1993-08-13,1993-06-30,1231.0,,70112,MD
1,0000066740-94-000015,10-Q,66740,MINNESOTA MINING & MANUFACTURING CO,2670.0,1993-08-13,1993-06-30,1231.0,,55144,DE
2,0000011860-94-000005,10-Q,11860,BETHLEHEM STEEL CORP /DE/,3312.0,1993-11-12,1993-09-30,1231.0,,18016,DE
3,0000205239-94-000003,10-Q,205239,DATAPOINT CORP,7373.0,1993-12-14,1993-10-30,731.0,,75008,DE
4,0000950131-94-000025,10-Q,20762,CLARK REFINING & MARKETING INC,2911.0,1993-11-12,1993-09-30,1231.0,,63105,DE
...,...,...,...,...,...,...,...,...,...,...,...
1049,0001171843-20-000876,10-Q,96699,TECHNICAL COMMUNICATIONS CORP,3663.0,2020-02-11,2019-12-28,1003.0,,01742,MA
1050,0001564590-20-004619,10-Q,96793,SUNLINK HEALTH SYSTEMS INC,8062.0,2020-02-13,2019-12-31,630.0,,30339,OH
1051,0001185185-20-000178,10-Q,96885,TEL INSTRUMENT ELECTRONICS CORP,3670.0,2020-02-13,2019-12-31,331.0,,07073,NJ
1052,0001213900-20-000888,10-Q,98338,TSR INC,7371.0,2020-01-13,2019-11-30,531.0,,11788,DE


In [6]:
########################################################################################
############ Merge compustat quarterly data with CRSP monthly data #####################
########################################################################################

########### Read compustat raw data files
comp_cols = ['gvkey', 'datadate', 'fyearq', 'fqtr', 'fyr', 'cusip', 'conm', 'curcdq', 'actq', \
             'atq', 'ceqq', 'cheq', 'cshoq', 'dlcq', 'dlttq', 'dpq', 'ibq', 'intanq', 'lctq', 'revtq', 'txditcq', 'exchg', \
             'cik', 'costat', 'prccq', 'addzip', 'incorp', 'sic', 'ipodate']
comp = pd.read_csv('..\\filings\\compustat.csv', usecols = comp_cols)
comp.columns = ['gvkey', 'date', 'fyearq', 'fqtr', 'fyr', 'cusip', 'conm', 'curcdq', 'actq', \
             'atq', 'ceqq', 'cheq', 'cshoq', 'dlcq', 'dlttq', 'dpq', 'ibq', 'intanq', 'lctq', 'revtq', 'txditcq', 'exchg', \
             'cik', 'costat', 'prccq', 'addzip', 'incorp', 'sic', 'ipodate']

### Reassign compustat column order
# 1st line: merge keys
# 2nd line: extra id info
# 3rd line: financial data
comp = comp[['cusip', 'cik', 'date', \
             'gvkey', 'conm', 'sic', 'incorp', 'addzip', 'fyearq', 'fqtr', 'fyr', 'ipodate', 'costat', 'curcdq', 'exchg', \
             'actq', 'atq', 'ceqq', 'cheq', 'cshoq', 'dlcq', 'dlttq', 'dpq', 'ibq', 'intanq', 'lctq', 'revtq', 'txditcq', 'prccq']]

########### Read crsp raw data files
crsp_cols = ['date', 'CUSIP', 'RET', 'vwretd']
crsp = pd.read_csv('..\\filings\\crsp.csv', usecols = crsp_cols)
crsp.columns = ['date', 'cusip', 'ret', 'vwretd']

### Delete the two 'day digits' of compustat and crsp filings' datadate
comp['date'] = comp['date'].astype(str).str[:-2]
crsp['date'] = crsp['date'].astype(str).str[:-2]

### Delete the 9th digit of compustat filings' cusip, and filter filings that have 8-digits cusip after deletion 
comp['cusip'] = comp['cusip'].astype(str).str[:-1]
print('number of quarterly filings in Compustat: ' + str(len(comp.index)))
comp = comp.loc[comp['cusip'].str.len() == 8]
print('number of quarterly filings in Compustat after deleting non-9-digits cusips: ' + str(len(comp.index)))

### Delete CRSP raw rows that contains non-numeric returns ('B' and 'C'), fill NA with 0
print('number of monthly data in CRSP: ' + str(len(crsp.index)))
crsp = crsp[(crsp['ret'] != 'B') & (crsp['ret'] != 'C')]
crsp['ret'] = crsp['ret'].fillna(0)
print('number of monthly data in CRSP that contains only numeric returns: ' + str(len(crsp.index)))

### Mutate adjusted monthly returns and delete ret and vwretd
crsp = crsp.assign(adj_ret_m = crsp['ret'].astype(float) - crsp['vwretd'].astype(float))
crsp = crsp.drop(columns=['ret', 'vwretd'])
crsp

number of quarterly filings in Compustat: 1240141
number of quarterly filings in Compustat after deleting non-9-digits cusips: 1239632
number of monthly data in CRSP: 2477547
number of monthly data in CRSP that contains only numeric returns: 2427687


Unnamed: 0,date,cusip,adj_ret_m
0,199211,36720410,-0.057287
1,199212,36720410,-0.032898
2,199301,36720410,-0.012644
3,199302,36720410,0.012455
4,199303,36720410,-0.014063
...,...,...,...
2477542,201908,88160R10,-0.045951
2477543,201909,88160R10,0.051615
2477544,201910,88160R10,0.288162
2477545,201911,88160R10,0.012724


In [7]:
##################### Left merge CRSP and Compstat, key not unique in Compustat
crsp_comp = pd.merge(crsp, comp, on = ['cusip', 'date'], how='left', validate = '1:m')
crsp_comp

Unnamed: 0,date,cusip,adj_ret_m,cik,gvkey,conm,sic,incorp,addzip,fyearq,fqtr,fyr,ipodate,costat,curcdq,exchg,actq,atq,ceqq,cheq,cshoq,dlcq,dlttq,dpq,ibq,intanq,lctq,revtq,txditcq,prccq
0,199211,36720410,-0.057287,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,199212,36720410,-0.032898,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,199301,36720410,-0.012644,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,199302,36720410,0.012455,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,199303,36720410,-0.014063,43350.0,12994.0,GAS NATURAL INC,4924.0,OH,44114,1993.0,3.0,6.0,,I,USD,12.0,7.992,28.055,9.147,1.548,1.085,3.61,8.55,0.347,1.260,0.0,7.66,11.718,2.493,14.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2428674,201908,88160R10,-0.045951,,,,,,,,,,,,,,,,,,,,,,,,,,,
2428675,201909,88160R10,0.051615,1318605.0,184996.0,TESLA INC,3711.0,DE,94304,2019.0,3.0,12.0,20100629.0,A,USD,14.0,10940.000,32795.000,6040.000,5571.000,180.000,2253.00,12383.00,530.851,143.469,537.0,10146.00,6302.860,0.000,240.87
2428676,201910,88160R10,0.288162,,,,,,,,,,,,,,,,,,,,,,,,,,,
2428677,201911,88160R10,0.012724,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [8]:
########## Aggregate the monthly returns in CRSP to quarterly returns by summing up 3-months returns in a quarter
adj_ret = list()
gvkey = crsp_comp['gvkey'].values.tolist()
adj_ret_m = crsp_comp['adj_ret_m'].values.tolist()

for index, value in enumerate(gvkey):
    ### requires a quarter-end flag to be non-zero, and the quarter-end flag of previous two months to be zero (avoid time-slot mismatch)
    if math.isnan(value) == False and math.isnan(gvkey[index-1]) == True and math.isnan(gvkey[index-2]) == True:
        adj_ret_i = adj_ret_m[index] + adj_ret_m[index-1] + adj_ret_m[index-2]
    else:
        adj_ret_i = float('NaN')
    adj_ret.append(adj_ret_i)

crsp_comp = crsp_comp.assign(RET = adj_ret)

########## Delete adj_ret_m column, and any rows that contains missing values of gvkey or RET
crsp_comp = crsp_comp.drop(columns=['adj_ret_m'])
crsp_comp = crsp_comp[crsp_comp['gvkey'].notnull()]
crsp_comp = crsp_comp[crsp_comp['RET'].notnull()]

############## Save text_data dataframe into local file text_data_10-Q.csv
crsp_comp.to_csv('..\\filings\\crsp_comp_' + obj_type + '.csv', index = 0)

In [9]:
################# Inspect crsp_comp 
first_T1PB = len(crsp_comp.index)
print('number of quarterly fillings after merging CRSP and Compustat: ' + str(first_T1PB))
crsp_comp

number of quarterly fillings after merging CRSP and Compustat: 707333


Unnamed: 0,date,cusip,cik,gvkey,conm,sic,incorp,addzip,fyearq,fqtr,fyr,ipodate,costat,curcdq,exchg,actq,atq,ceqq,cheq,cshoq,dlcq,dlttq,dpq,ibq,intanq,lctq,revtq,txditcq,prccq,RET
4,199303,36720410,43350.0,12994.0,GAS NATURAL INC,4924.0,OH,44114,1993.0,3.0,6.0,,I,USD,12.0,7.992,28.055,9.147,1.548,1.085,3.610,8.550,0.347,1.260,0.000,7.660,11.718,2.493,14.25,-0.014252
7,199306,36720410,43350.0,12994.0,GAS NATURAL INC,4924.0,OH,44114,1993.0,4.0,6.0,,I,USD,12.0,6.761,27.434,8.733,1.509,1.090,1.663,11.050,0.390,-0.296,0.000,4.881,4.058,2.763,16.50,0.154818
10,199309,36720410,43350.0,12994.0,GAS NATURAL INC,4924.0,OH,44114,1994.0,1.0,6.0,,I,USD,12.0,7.452,29.751,8.326,0.975,1.091,4.323,10.750,0.362,-0.320,0.000,6.753,3.978,3.309,16.75,-0.011296
13,199312,36720410,43350.0,12994.0,GAS NATURAL INC,4924.0,OH,44114,1994.0,2.0,6.0,,I,USD,12.0,10.962,33.954,8.757,0.440,1.091,4.423,10.768,0.366,0.621,0.000,10.299,9.172,3.323,18.50,0.086561
16,199403,36720410,43350.0,12994.0,GAS NATURAL INC,4924.0,OH,44114,1994.0,3.0,6.0,,I,USD,12.0,9.107,32.087,9.656,1.337,1.091,2.623,10.763,0.380,1.090,0.000,7.651,9.768,3.197,17.25,-0.013350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2428666,201812,88160R10,1318605.0,184996.0,TESLA INC,3711.0,DE,94304,2018.0,4.0,12.0,20100629.0,A,USD,14.0,8306.308,29739.614,4923.243,3878.169,172.603,2629.460,9454.055,496.737,139.483,350.651,9992.136,7225.873,0.000,332.80,0.407883
2428669,201903,88160R10,1318605.0,184996.0,TESLA INC,3711.0,DE,94304,2019.0,1.0,12.0,20100629.0,A,USD,14.0,7677.822,28912.524,4605.596,2329.119,173.682,1914.073,10834.402,467.577,-702.135,347.880,9242.800,4541.464,0.000,279.86,-0.294656
2428672,201906,88160R10,1318605.0,184996.0,TESLA INC,3711.0,DE,94304,2019.0,2.0,12.0,20100629.0,A,USD,14.0,10181.952,31872.597,5715.393,5082.746,179.118,2011.177,12309.747,578.572,-408.334,480.833,9588.773,6349.676,0.000,223.46,-0.208027
2428675,201909,88160R10,1318605.0,184996.0,TESLA INC,3711.0,DE,94304,2019.0,3.0,12.0,20100629.0,A,USD,14.0,10940.000,32795.000,6040.000,5571.000,180.000,2253.000,12383.000,530.851,143.469,537.000,10146.000,6302.860,0.000,240.87,0.075005


In [10]:
########################################################################################
######################## Merge ID_DATA with CRSP_COMPUSTAT #############################
########################################################################################

# id_data = concatenate('..\\filings', data_type_id + '_'+ obj_type + '_' + '*.csv')

############## prepare merge: ID_DATA
id_data['fd'] = id_data['fd'].str.replace('-', '')
id_data['rp'] = id_data['rp'].str.replace('-', '')
id_data['date'] = id_data['rp'].astype(str).str[:-2]

In [11]:
############## Left merge ID_DATA and CRSP_COMP, key not unique in both data sets
id_crsp_comp = pd.merge(id_data, crsp_comp, on = ['cik', 'date'], how = 'inner', validate = 'm:m')

### Delete duplicated columns in ID_DATA and CRSP_COMP, duplicated rows in accnum (edgar) and [cik * date] (compustat), and rename sic
id_crsp_comp = id_crsp_comp.drop(columns=['conm', 'sic_x', 'date', 'bazip', 'file_type', 'fye', 'curcdq'])
id_crsp_comp = id_crsp_comp[id_crsp_comp.duplicated('accnum') == False]
id_crsp_comp = id_crsp_comp[id_crsp_comp.duplicated(subset=['cik', 'rp']) == False]
id_crsp_comp = id_crsp_comp.rename(columns={'sic_y': 'sic'})

### Reassign compustat column order
# 1st line: merge keys
# 2nd line: extra id info
# 3rd line: financial data
id_crsp_comp = id_crsp_comp[['cusip', 'cik', 'rp', 'accnum', \
            'name', 'gvkey', 'sic', 'fd', 'fyearq', 'fqtr', 'fyr', 'ipodate', 'incorp', 'state', 'addzip', 'costat', 'exchg', \
            'actq', 'atq', 'ceqq', 'cheq', 'cshoq', 'dlcq', 'dlttq', 'dpq', 'ibq', 'intanq', 'lctq', 'revtq', 'txditcq', 'prccq', 'RET']]

In [12]:
################## Inspect id_crsp_comp 
second_T1PB = len(id_crsp_comp.index)
print('number of observations after merging with edgar data: ' + str(second_T1PB))

id_crsp_comp

number of observations after merging with edgar data: 303843


Unnamed: 0,cusip,cik,rp,accnum,name,gvkey,sic,fd,fyearq,fqtr,fyr,ipodate,incorp,state,addzip,costat,exchg,actq,atq,ceqq,cheq,cshoq,dlcq,dlttq,dpq,ibq,intanq,lctq,revtq,txditcq,prccq,RET
0,54626810,60512,19930630,0000060512-94-000005,LOUISIANA LAND & EXPLORATION CO,6819.0,1311.0,19930813,1993.0,2.0,12.0,,MD,MD,70112,I,11.0,193.800,1278.000,424.200,64.300,28.729,86.500,356.300,27.300,5.600,,208.200,189.300,136.100,42.5000,-0.059108
1,88579Y10,66740,19930630,0000066740-94-000015,MINNESOTA MINING & MANUFACTURING CO,7435.0,2670.0,19930813,1993.0,2.0,12.0,19831230.0,DE,DE,55144,A,11.0,6382.000,12145.000,6590.000,650.000,216.975,829.000,679.000,264.000,331.000,,3365.000,3540.000,0.000,108.0000,-0.014779
2,08750910,11860,19930930,0000011860-94-000005,BETHLEHEM STEEL CORP /DE/,2189.0,3312.0,19931112,1993.0,3.0,12.0,,DE,DE,18016-7699,I,13.0,1074.900,5168.098,580.800,232.500,91.307,100.300,728.500,69.100,30.700,,908.400,1055.300,0.000,14.3750,-0.270146
3,54626810,60512,19930930,0000060512-94-000007,LOUISIANA LAND & EXPLORATION CO,6819.0,1311.0,19931110,1993.0,3.0,12.0,,MD,MD,70112,I,11.0,180.200,1662.300,417.100,9.400,33.137,0.000,747.700,28.200,-1.800,,195.100,187.900,124.800,44.6250,0.025969
4,88579Y10,66740,19930930,0000066740-94-000016,MINNESOTA MINING & MANUFACTURING CO,7435.0,2670.0,19931112,1993.0,3.0,12.0,19831230.0,DE,DE,55144,A,11.0,6445.000,12229.000,6600.000,665.000,215.791,796.000,682.000,262.000,316.000,,3404.000,3481.000,0.000,102.8750,-0.076728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307477,87182910,96021,20191228,0000096021-20-000017,SYSCO CORP,10247.0,5140.0,20200205,2020.0,2.0,6.0,,DE,DE,77077,A,11.0,8661.610,19372.034,2527.526,539.625,508.843,897.619,8654.524,185.011,383.410,4879.128,6931.968,15025.042,142.301,85.5400,-0.001390
307478,87840910,96699,20191228,0001171843-20-000876,TECHNICAL COMMUNICATIONS CORP,10364.0,3663.0,20200211,2020.0,1.0,9.0,,MA,MA,01742-2892,A,14.0,2.146,2.847,1.752,0.748,1.850,0.148,0.521,0.006,-0.480,0.000,0.574,0.666,0.000,5.0500,1.266332
307479,86737U10,96793,20191231,0001564590-20-004619,SUNLINK HEALTH SYSTEMS INC,10380.0,5912.0,20200213,2020.0,2.0,6.0,,OH,OH,30339,A,12.0,14.384,22.823,14.762,4.607,6.983,0.782,0.851,0.350,0.203,1.295,6.440,12.805,0.000,1.0700,-0.135859
307480,87288520,98338,20191130,0001213900-20-000888,TSR INC,10305.0,7371.0,20200113,2019.0,2.0,5.0,,DE,DE,11788,A,14.0,10.365,11.826,6.287,2.988,1.962,0.252,0.271,0.002,0.061,0.000,5.241,15.233,0.000,3.2001,-0.315755


In [62]:
############## Left merge ID_CRSP_COMP and TEXT_DATA, key unique in both data sets
id_crsp_comp_text = pd.merge(id_crsp_comp, text_data, on = ['accnum'], how = 'inner', validate = '1:1')
id_crsp_comp_text.to_csv('..\\filings\\id_crsp_comp_text_' + obj_type + '.csv', index = 0)

In [63]:
############### Inspect if firm-quarter key is unique : YES, key is unique
# print(id_crsp_comp_text[id_crsp_comp_text.duplicated('accnum')])
# print(id_crsp_comp_text[id_crsp_comp_text.duplicated(subset=['cik', 'rp'])])

Empty DataFrame
Columns: [cusip, cik, rp, accnum, name, gvkey, sic, fd, fyearq, fqtr, fyr, ipodate, incorp, state, addzip, costat, exchg, actq, atq, ceqq, cheq, cshoq, dlcq, dlttq, dpq, ibq, intanq, lctq, revtq, txditcq, prccq, RET, nw, nvocab, n_neg, n_pos, n_uctt, n_lit, n_cstr, n_modal_strong, n_modal_moderate, n_modal_weak, n_negation, tone]
Index: []
Empty DataFrame
Columns: [cusip, cik, rp, accnum, name, gvkey, sic, fd, fyearq, fqtr, fyr, ipodate, incorp, state, addzip, costat, exchg, actq, atq, ceqq, cheq, cshoq, dlcq, dlttq, dpq, ibq, intanq, lctq, revtq, txditcq, prccq, RET, nw, nvocab, n_neg, n_pos, n_uctt, n_lit, n_cstr, n_modal_strong, n_modal_moderate, n_modal_weak, n_negation, tone]
Index: []


In [64]:
####################### Modify data type
########### Define a function that changes pandas series data type to string
def columns_to_str (df, colnames):
    for col in colnames:
        df[col] = df[col].astype(str)
    return df

########### apply columns_to_str to various identification variables
id_crsp_comp_text = columns_to_str(id_crsp_comp_text, ['cik', 'gvkey', 'sic', 'exchg', 'fyearq', 'fqtr', 'fyr'])

########## convert date variables to date format
id_crsp_comp_text['fd'] = pd.to_datetime(id_crsp_comp_text['fd'])
id_crsp_comp_text['rp'] = pd.to_datetime(id_crsp_comp_text['rp'])
id_crsp_comp_text['ipodate'] = pd.to_datetime(id_crsp_comp_text['ipodate'])

########### Inspect column data types
# print(id_crsp_comp_text.dtypes)

In [65]:
########################################################################################
############################### Variable Creation ######################################
########################################################################################

############################## Main Variables ##########################################
######## TONE and NW: rename
id_crsp_comp_text = id_crsp_comp_text.rename(columns={'tone': 'TONE', 'nw': 'NW'})
######## TLAG: Time lag between the news release date and document filing date
id_crsp_comp_text['TLAG'] = (id_crsp_comp_text['fd'] - id_crsp_comp_text['rp']).dt.days
######## NEG: An indicator variable takes the value of 1 when market-adjusted stock return (RET) is negative and is 0 otherwise
id_crsp_comp_text['NEG'] = 0 
id_crsp_comp_text['NEG'][id_crsp_comp_text['RET'] < 0] = 1

############################## Control Variables #######################################
######## Size: Firm size, defined as the natural logarithm of market value of equity [at the beginning of the quarter] \
######## defined as [beginning-of-quarter] common share price (Compustat data item prccq) \
######## times [beginning-of-quarter] common shares outstanding (Compustat data item cshoq)
id_crsp_comp_text['SIZE'] = np.log(id_crsp_comp_text['prccq']*id_crsp_comp_text['cshoq'])
######## MTB: Market-to-book ratio, defined as [beginning-of-quarter] market value of equity \
######## defined as common share price (Compustat data item prccq) times common shares outstanding (Compustat data item cshoq) \
######## divided by [beginning-of-quarter] book value of equity (Compustat data item ceqq) 
id_crsp_comp_text['MTB'] = (id_crsp_comp_text['prccq']*id_crsp_comp_text['cshoq'])/id_crsp_comp_text['ceqq']
######## LEV: Leverage, defined as [beginning-of-quarter] short term debt plus [beginning-of-quarter] long term debt \
######## (Compustat data item dlcq + Compustat data item dlttq) scaled by [beginning-of-quarter] total assets (Compustat data item atq)
id_crsp_comp_text['LEV'] = (id_crsp_comp_text['dlcq'] + id_crsp_comp_text['dlttq'])/id_crsp_comp_text['atq']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [66]:
########################################################################################
############################### Variable Screening #####################################
########################################################################################

############## Drop financial and utility firms


############## Drop files (firm-quarter) that contain number of words beyond 1% (and 99%) threshold
nwq01 = id_crsp_comp_text['NW'].quantile(.01)
print('number of words, 1% quantile: ' + str(nwq01))
# nwq99 = id_crsp_comp_text['nw'].quantile(.99)
# print('number of words, 99% quantile: ' + str(nwq99))
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['NW'] >= nwq01]

############## Drop files (firm-quarter) that have missing SIZE, MTB, LEV, or with non-positive total assets or book value of equity, \
############## or with common share price less than $1
id_crsp_comp_text = id_crsp_comp_text[id_crsp_comp_text['SIZE'].isnull() == False]
id_crsp_comp_text = id_crsp_comp_text[id_crsp_comp_text['MTB'].isnull() == False]
id_crsp_comp_text = id_crsp_comp_text[id_crsp_comp_text['LEV'].isnull() == False]
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['atq'] > 0]
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['ceqq'] > 0]
id_crsp_comp_text = id_crsp_comp_text.loc[id_crsp_comp_text['prccq'] >= 1]

############## Winsorize SIZE, MTB, LEV
###### Define a function that winsorize a variable at 1% and 99% 
def winsorize (df, colnames):
    for col in colnames:
        varq01 = df[col].quantile(.01)
        varq99 = df[col].quantile(.99)
        df[col] = df[col].clip(varq01, varq99)
    return df

id_crsp_comp_text = winsorize(id_crsp_comp_text, ['SIZE', 'MTB', 'LEV'])

############## Inspect sample size after variable screening
third_T1PB = str(len(id_crsp_comp_text.index))
print('Number of firm-quarters after variable screening: ' + str(third_T1PB))

number of words, 1% quantile: 1185.0
Number of firm-quarters after variable screening: 253295


In [67]:
############# Table 1 Panel A: Summary statistics for text data
T1PA = id_crsp_comp_text.describe().transpose() # generally consistent with LM's summary statistics

############# Save T1PA
table_path = '..\\output\\Tables.xlsx'
if os.path.exists(table_path) == True:
    book = load_workbook(table_path)
    writer = pd.ExcelWriter(table_path, engine = 'openpyxl')
    writer.book = book
    writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

    T1PA.to_excel(writer, sheet_name='T1PA_raw', float_format="%.2f")

    writer.save()
    writer.close()
    
else:
    T1PA.to_excel(table_path, sheet_name='T1PA_raw', float_format="%.2f")

T1PA

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
actq,202882.0,994.172785,4316.581293,0.0,42.97125,141.5025,499.67975,167633.0
atq,253295.0,7021.516869,61050.654679,0.001,112.553,475.073,1952.4095,2764661.0
ceqq,253295.0,1450.058569,7986.675765,0.001,50.455,161.431,613.3395,397609.0
cheq,253212.0,818.700494,11245.564411,-26.0,9.711,40.218,148.16975,748548.0
cshoq,253295.0,97.202747,405.305957,0.0,10.445,25.4,61.6405,29206.44
dlcq,253295.0,674.672333,10687.725216,-0.094,0.0,2.862,32.8505,519230.0
dlttq,253295.0,1197.973402,9904.437847,0.0,0.222,36.802,334.4825,616814.0
dpq,232652.0,32.33607,174.126071,-33.0,0.52,2.4265,12.523,8166.0
ibq,252950.0,45.851606,368.692606,-41847.9,-0.4,2.291,15.2845,22628.0
intanq,180278.0,992.088326,5868.831769,-9.185,0.0,22.014,238.43975,312576.0


In [69]:
########### Table 1 Panel B: Data screening process
print('Number of firm-quarters from Compustat/CRSP merged data, 1993Q1 to 2019Q4: ' + str(first_T1PB))
print('Number of firm-quarters after merging edgar data: ' + str(second_T1PB))
print('Number of firm-quarters after data screening: ' + str(third_T1PB))

Number of firm-quarters from Compustat/CRSP merged data, 1993Q1 to 2019Q4: 707333
Number of firm-quarters after merging edgar data: 303843
Number of firm-quarters after data screening: 253295
