In [1]:
############### import packages
import os, nltk, numpy as np, pandas as pd, time, textstat
from nltk import word_tokenize
from tqdm import tqdm

##########################################################
##################### parameter ##########################
##########################################################
obj_type = '10-Q'
period_start = 1993 # included
period_end = 1995 # included

############### Set working directory to parent directory
os.getcwd()
# os.chdir('F:\\github\\narrative_conservatism\\code')

'F:\\github\\narrative_conservatism\\code'

In [2]:
############### Read LM disctionary
LM = pd.read_excel('..\\LM\\LoughranMcDonald_MasterDictionary_2018.xlsx', encoding = "utf-8")

############### Create negative, positive, uncertainty, litigious, constraining and modal word lists
lm_neg = LM.loc[LM['Negative'] != 0]['Word'].values.tolist()
lm_pos = LM.loc[LM['Positive'] != 0]['Word'].values.tolist()
lm_uctt = LM.loc[LM['Uncertainty'] != 0]['Word'].values.tolist()
lm_lit = LM.loc[LM['Litigious'] != 0]['Word'].values.tolist()
lm_cstr = LM.loc[LM['Constraining'] != 0]['Word'].values.tolist()

lm_modal1 = LM.loc[LM['Modal'] == 1]['Word'].values.tolist()
lm_modal2 = LM.loc[LM['Modal'] == 2]['Word'].values.tolist()
lm_modal3 = LM.loc[LM['Modal'] == 3]['Word'].values.tolist()

lm_neg = [w.lower() for w in lm_neg]
lm_pos = [w.lower() for w in lm_pos]
lm_uctt = [w.lower() for w in lm_uctt]
lm_lit = [w.lower() for w in lm_lit]
lm_cstr = [w.lower() for w in lm_cstr]
lm_modal1 = [w.lower() for w in lm_modal1]
lm_modal2 = [w.lower() for w in lm_modal2]
lm_modal3 = [w.lower() for w in lm_modal3]

############## Read and create stop words list
lm_stop = list()
with open('..\\LM\\StopWords_Generic.txt', "r") as f:
    for line in f:
        line = line.replace('\n', '')
        lm_stop.append(line)
        
lm_stop = [w.lower() for w in lm_stop]

############# Create a negation word list
gt_negation = ['no', 'not', 'none', 'neither', 'never', 'nobody'] ## Gunnel Totie, 1991, Negation in Speech and Writing

In [3]:
############### Read GI disctionary
GI_cols = ['Entry', 'Source', 'Positiv', 'Negativ']
GI = pd.read_excel('..\\LM\\GI\\inquirerbasic.xls', encoding = "utf-8", usecols = GI_cols)
GI = GI[(GI['Entry'].str.endswith('#1') == True) | (GI['Entry'].str.contains('#') == False)]
GI['Entry'] = GI['Entry'].str.replace('#1','') 

############### Create negative, positive, uncertainty, litigious, constraining and modal word lists
gi_neg = GI.loc[GI['Negativ'].notnull()]['Entry'].values.tolist()
gi_pos = GI.loc[GI['Positiv'].notnull()]['Entry'].values.tolist()

gi_neg = [w.lower() for w in gi_neg]
gi_pos = [w.lower() for w in gi_pos]

############### Create Henry disctionary (HENRY 2008)
he_neg = ['negative', 'negatives', 'fail', 'fails', 'failing', 'failure', 'weak', 'weakness', 'weaknesses', 'difficult', 'difficulty', 'hurdle', 'hurdles', 'obstacle', 'obstacles', 'slump', 'slumps', 'slumping', 'slumped', 'uncertain', 'uncertainty', 'unsettled', 'unfavorable', 'downturn', 'depressed', 'disappoint', 'disappoints', 'disappointing', 'disappointed', 'disappointment', 'risk', 'risks', 'risky', 'threat', 'threats', 'penalty', 'penalties', 'down', 'decrease', 'decreases', 'decreasing', 'decreased', 'decline', 'declines', 'declining', 'declined', 'fall', 'falls', 'falling', 'fell', 'fallen', 'drop', 'drops', 'dropping', 'dropped', 'deteriorate', 'deteriorates', 'deteriorating', 'deteriorated', 'worsen', 'worsens', 'worsening', 'weaken', 'weakens', 'weakening', 'weakened', 'worse', 'worst', 'low', 'lower', 'lowest', 'less', 'least', 'smaller', 'smallest', 'shrink']
he_pos = ['positive', 'positives', 'success', 'successes', 'successful', 'succeed', 'succeeds', 'succeeding', 'succeeded', 'accomplish', 'accomplishes', 'accomplishing', 'accomplished', 'accomplishment', 'accomplishments', 'strong', 'strength', 'strengths', 'certain', 'certainty', 'definite', 'solid', 'excellent', 'good', 'leading', 'achieve', 'achieves', 'achieved', 'achieving', 'achievement', 'achievements', 'progress', 'progressing', 'deliver', 'delivers', 'delivered', 'delivering', 'leader', 'leading', 'pleased', 'reward', 'rewards', 'rewarding', 'rewarded', 'opportunity', 'opportunities', 'enjoy', 'enjoys', 'enjoying', 'enjoyed', 'encouraged', 'encouraging', 'up', 'increase', 'increases', 'increasing', 'increased', 'rise', 'rises', 'rising', 'rose', 'risen', 'improve', 'improves', 'improving', 'improved', 'improvement', 'improvements', 'strengthen', 'strengthens', 'strengthening', 'strengthened', 'stronger', 'strongest', 'better', 'best', 'more', 'most', 'above', 'record', 'high', 'higher', 'highest', 'greater', 'greatest', 'larger', 'largest', 'grow', 'grows', 'growing', 'grew', 'grown', 'growth', 'expand', 'expands', 'expanding', 'expanded', 'expansion', 'exceed', 'exceeds', 'exceeded', 'exceeding', 'beat', 'beats', 'beating']

In [4]:
#####################################################################
#################### FOR ALL PROCESSED FILES LOOP ###################
#####################################################################

############# Create input txt file index
processed = list()
for subdir, dirs, files in os.walk("H:\\data\\edgar\\processed\\" + obj_type + '\\' + str(period_start) + '-' + str(period_end)):
    for file in files:
        processed.append(os.path.join(subdir, file))

len(processed)

20346

In [5]:
#### Define a function count_occurrence to count the number of words in tup that pertaining to a list 
def count_occurrence(tup, lst): 
    count = 0
    for item in tup: 
        if item in lst: 
            count+= 1
      
    return count

### Define a function count_negation to count cases where negation occurs within four or fewer words from a word identified in list.
def count_negation(tup, lst, negation): 
    count = 0
    for item in tup: 
        if item in lst:
            if tup.index(item)-4 > 0 and tup.index(item)+4 < len(tup):
                neighbor = tup[tup.index(item)-4:tup.index(item)+4]
                for neighborw in neighbor:
                    if neighborw in negation:
                        count+= 1

            if tup.index(item)-4 < 0:
                pre = tup[0:tup.index(item)+4]
                for prew in pre:
                    if prew in negation:
                        count+= 1
                        
            if tup.index(item)+4 > len(tup):
                post = tup[tup.index(item)-4:len(tup)]
                for postw in post:
                    if postw in negation:
                        count+= 1
    return count

In [6]:
############ Full Text Raw Count
accnum = list()

nw = list()
nvocab = list()

n_neg = list()
n_pos = list()
n_neg_gi = list()
n_pos_gi = list()
n_neg_he = list()
n_pos_he = list()
n_uctt = list()
n_lit = list()
n_cstr = list()
n_modal1 = list()
n_modal2 = list()
n_modal3 = list()
n_negation = list()
READ = list()

############ Word Tokenization, count nword and nvocab, count negative, positive, uncertainty, litigious, constraining and modal words
for text in tqdm(processed):
    ############# Create an array of accession number
    a = text.split("\\")[6].split(".")[0]
    accnum.append(a)
    
    ############# Read processed txt file
    with open(text, 'r',  encoding = "utf-8") as file:
        contents = file.read().replace('\n', ' ').replace(u'\xa0', u' ')
        # print(repr(contents))
        
        ############ Word Tokenization
        ## Raw tokens: including punctuations, numbers etc.
        tokens = word_tokenize(contents)

        ## Convert all words into small cases
        ## Keep tokens that purely consist of alphabetic characters only
        ## Delete single-character words except for 'I'
        words = [w.lower() for w in tokens if w.isalpha() and len(w)>1 or w =='i']
        
        ########### Delete words with lenth smaller than 1% and largr than 99% of the document
        # wordlen99 = np.quantile([len(w) for w in words], 0.99)
        # wordlen1 = np.quantile([len(w) for w in words], 0.01)
        # words = [w for w in words if len(w)<wordlen99 and len(w)>wordlen1]
        vocab = sorted(set(words))
        
        ########### Save text statistics
        ##### 1. nw 2. nvocab 3. tone 4. readability
        
        ## 1. nw
        a = len(words)
        nw.append(a)
        
        ## 2. nvocab
        b = len(vocab)
        nvocab.append(b)
        
        ## 3. tone
        neg = count_occurrence(words, lm_neg)
        n_neg.append(neg)
        pos = count_occurrence(words, lm_pos)
        n_pos.append(pos)
        uctt = count_occurrence(words, lm_uctt)
        n_uctt.append(uctt)
        lit = count_occurrence(words, lm_lit)
        n_lit.append(lit)
        cstr = count_occurrence(words, lm_cstr)
        n_cstr.append(cstr)
        modal1 = count_occurrence(words, lm_modal1)
        n_modal1.append(modal1)
        modal2 = count_occurrence(words, lm_modal2)
        n_modal2.append(modal2)
        modal3 = count_occurrence(words, lm_modal3)
        n_modal3.append(modal3)
        negation = count_negation(words, lm_pos, gt_negation)
        n_negation.append(negation)
        
        neg_gi = count_occurrence(words, gi_neg)
        n_neg_gi.append(neg_gi)
        pos_gi = count_occurrence(words, gi_pos)
        n_pos_gi.append(pos_gi)
        
        neg_he = count_occurrence(words, he_neg)
        n_neg_he.append(neg_he)
        pos_he = count_occurrence(words, he_pos)
        n_pos_he.append(pos_he)
        
        ## 4. readability
        read = textstat.gunning_fog(contents)
        READ.append(read)

100%|██████████████████████████████████████████████████████████████████████████| 20346/20346 [9:55:45<00:00,  1.76s/it]


In [7]:
############### Create Data Frame: full document
d = {'accnum': accnum, 'nw': nw, 'nvocab': nvocab, 'n_neg': n_neg, 'n_pos': n_pos, 'n_neg_gi': n_neg_gi, 'n_pos_gi': n_pos_gi, 'n_neg_he': n_neg_he, 'n_pos_he': n_pos_he, 'n_uctt': n_uctt, 'n_lit': n_lit, 'n_cstr': n_cstr, \
     'n_modal_week': n_modal1, 'n_modal_moderate': n_modal2, 'n_modal_strong': n_modal3, 'n_negation': n_negation, 'READ': READ}

text_data = pd.DataFrame(data=d)
text_data.to_csv('..\\filings\\text_data_' + obj_type + '_' + str(period_start) + '-' + str(period_end) + '.csv', index=False)

text_data

Unnamed: 0,accnum,nw,nvocab,n_neg,n_pos,n_neg_gi,n_pos_gi,n_neg_he,n_pos_he,n_uctt,n_lit,n_cstr,n_modal_week,n_modal_moderate,n_modal_strong,n_negation,READ
0,0000001952-95-000005,3301,832,110,13,80,119,22,39,30,111,10,11,4,4,0,140.51
1,0000001952-95-000006,2662,735,89,9,44,99,12,30,28,102,10,10,3,4,0,95.49
2,0000001952-95-000012,2768,760,95,8,44,108,14,27,27,108,10,10,3,3,0,159.52
3,0000001985-95-000004,6057,980,111,34,96,281,35,40,33,63,45,14,16,8,0,96.27
4,0000001988-95-000005,6273,1147,112,29,96,290,22,49,27,266,60,18,7,18,0,82.37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20341,0001001606-95-000010,3851,791,48,15,53,168,15,37,35,23,16,4,4,6,0,69.97
20342,0001001746-95-000008,1948,590,8,13,24,67,15,35,9,8,5,4,0,4,0,28.46
20343,0001003017-95-000003,2698,631,15,10,22,133,15,45,21,14,18,4,4,2,0,61.34
20344,0001004003-95-000003,1153,396,15,1,24,37,9,16,6,6,4,1,0,0,0,39.97


In [5]:
############ MDA and NOTE Raw Count
accnum = list()

nw_mda = list()
nvocab_mda = list()

n_neg_mda = list()
n_pos_mda = list()
n_uctt_mda = list()
n_lit_mda = list()
n_cstr_mda = list()
n_modal1_mda = list()
n_modal2_mda = list()
n_modal3_mda = list()
n_negation_mda = list()

READ_mda = list()

nw_note = list()
nvocab_note = list()

n_neg_note = list()
n_pos_note = list()
n_uctt_note = list()
n_lit_note = list()
n_cstr_note = list()
n_modal1_note = list()
n_modal2_note = list()
n_modal3_note = list()
n_negation_note = list()

READ_note = list()

############ Word Tokenization, count nword and nvocab, count negative, positive, uncertainty, litigious, constraining and modal words
for text in tqdm(processed):
    ############# Create an array of accession number
    a = text.split("\\")[6].split(".")[0]
    accnum.append(a)
    
    ############# Read processed txt file
    with open(text, 'r',  encoding = "utf-8") as file:
        contents = file.read().replace('\n', ' ').replace(u'\xa0', u' ')
        ############################## TO EXTRACT MDA AND NOTES SECTION, UNCOMMENT THIS SECTION ################################
        try:
            mda = contents[contents.index("ITEM 2."):contents.index("ITEM 3.")]
        except:
            try:
                mda = contents[contents.index("Item 2."):contents.index("Item 3.")]
            except:
                try:
                    mda = contents[contents.index("ITEM 2"):contents.index("ITEM 3")]
                except:
                    try:
                        mda = contents[contents.index("Item 2"):contents.index("Item 3")]
                    except:
                        mda = ''
                        pass
                    
        try:
            note = contents[contents.index("NOTES TO"):contents.index("ITEM 2.")]
        except:
            try:
                note = contents[contents.index("NOTES TO"):contents.index("ITEM 2")]
            except:
                try:
                    note = contents[contents.index("Notes to"):contents.index("Item 2.")]
                except:
                    try:
                        note = contents[contents.index("Notes to"):contents.index("Item 2")]
                    except:
                        note = ''
                        pass
        ###########################################################################################################
        
        ############ Word Tokenization
        ## Raw tokens: including punctuations, numbers etc.
        tokens_mda = word_tokenize(mda)
        tokens_note = word_tokenize(note)
        
        ####################################################################

        ## Convert all words into small cases
        ## Keep tokens that purely consist of alphabetic characters only
        ## Delete single-character words except for 'I'
        words_mda = [w.lower() for w in tokens_mda if w.isalpha() and len(w)>1 or w =='i']
        words_note = [w.lower() for w in tokens_note if w.isalpha() and len(w)>1 or w =='i']
        
        ########### Delete words with lenth smaller than 1% and largr than 99% of the document
        # wordlen99 = np.quantile([len(w) for w in words], 0.99)
        # wordlen1 = np.quantile([len(w) for w in words], 0.01)
        # words = [w for w in words if len(w)<wordlen99 and len(w)>wordlen1]
        vocab_mda = sorted(set(words_mda))
        vocab_note = sorted(set(words_note))
        
        ########### Save text statistics
        ##### 1. nw 2. nvocab 3. tone 4. readability
        
        ## 1. nw
        a_mda = len(words_mda)
        nw_mda.append(a_mda)
        a_note = len(words_note)
        nw_note.append(a_note)
        
        ## 2. nvocab
#         b_mda = len(vocab_mda)
#         nvocab_mda.append(b_mda)
#         b_note = len(vocab_note)
#         nvocab_note.append(b_note)
        
        ## 3. tone
        neg_mda = count_occurrence(words_mda, lm_neg)
        n_neg_mda.append(neg_mda)
        pos_mda = count_occurrence(words_mda, lm_pos)
        n_pos_mda.append(pos_mda)
#         uctt_mda = count_occurrence(words_mda, lm_uctt)
#         n_uctt_mda.append(uctt_mda)
#         lit_mda = count_occurrence(words_mda, lm_lit)
#         n_lit_mda.append(lit_mda)
#         cstr_mda = count_occurrence(words_mda, lm_cstr)
#         n_cstr_mda.append(cstr_mda)
#         modal1_mda = count_occurrence(words_mda, lm_modal1)
#         n_modal1_mda.append(modal1_mda)
#         modal2_mda = count_occurrence(words_mda, lm_modal2)
#         n_modal2_mda.append(modal2_mda)
#         modal3_mda = count_occurrence(words_mda, lm_modal3)
#         n_modal3_mda.append(modal3_mda)
        negation_mda = count_negation(words_mda, lm_pos, gt_negation)
        n_negation_mda.append(negation_mda)
        
        neg_note = count_occurrence(words_note, lm_neg)
        n_neg_note.append(neg_note)
        pos_note = count_occurrence(words_note, lm_pos)
        n_pos_note.append(pos_note)
#         uctt_note = count_occurrence(words_note, lm_uctt)
#         n_uctt_note.append(uctt_note)
#         lit_note = count_occurrence(words_note, lm_lit)
#         n_lit_note.append(lit_note)
#         cstr_note = count_occurrence(words_note, lm_cstr)
#         n_cstr_note.append(cstr_note)
#         modal1_note = count_occurrence(words_note, lm_modal1)
#         n_modal1_note.append(modal1_note)
#         modal2_note = count_occurrence(words_note, lm_modal2)
#         n_modal2_note.append(modal2_note)
#         modal3_note = count_occurrence(words_note, lm_modal3)
#         n_modal3_note.append(modal3_note)
        negation_note = count_negation(words_note, lm_pos, gt_negation)
        n_negation_note.append(negation_note)
        
        ## 4. readability
        read_mda = textstat.gunning_fog(mda)
        READ_mda.append(read_mda)
        read_note = textstat.gunning_fog(note)
        READ_note.append(read_note)

100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [05:42<00:00,  3.07it/s]


In [6]:
############### Create Data Frame: MDA and NOTES
d = {'accnum': accnum, 'nw_mda': nw_mda, 'n_neg_mda': n_neg_mda, 'n_pos_mda': n_pos_mda, 'n_negation_mda': n_negation_mda, 'nw_note': nw_note, 'n_neg_note': n_neg_note, 'n_pos_note': n_pos_note, 'n_negation_note': n_negation_note, 'READ_MDA':READ_mda, 'READ_NOTE':READ_note}
#      'nvocab_mda': nvocab_mda, 'n_uctt_mda': n_uctt_mda, 'n_lit_mda': n_lit_mda, 'n_cstr_mda': n_cstr_mda, \
#      'n_modal_strong_mda': n_modal1_mda, 'n_modal_moderate_mda': n_modal2_mda, 'n_modal_weak_mda': n_modal3_mda, \
#      'nvocab_note': nvocab_note, 'n_uctt_note': n_uctt_note, 'n_lit_note': n_lit_note, 'n_cstr_note': n_cstr_note, \
#      'n_modal_strong_note': n_modal1_note, 'n_modal_moderate_note': n_modal2_note, 'n_modal_weak_note': n_modal3_note}

text_data = pd.DataFrame(data=d)
print('percentage of filings whose MDA and NOTES are successfully extracted: ' + str(text_data[(text_data['nw_mda']!=0) & (text_data['nw_note']!=0)].shape[0]/text_data.shape[0]))
text_data = text_data[(text_data['nw_mda']!=0) & (text_data['nw_note']!=0)]

text_data.to_csv('..\\filings\\text_data_section_' + str(period_start) + '-' + str(period_end) + '.csv', index=False)
text_data 

percentage of filings whose MDA and NOTES are successfully extracted: 0.5975261655566128


Unnamed: 0,accnum,nw_mda,n_neg_mda,n_pos_mda,n_negation_mda,nw_note,n_neg_note,n_pos_note,n_negation_note
2,0000004127-20-000007,1544,13,8,1,2585,34,8,0
3,0000004457-20-000027,8667,89,48,0,7863,73,36,0
4,0000006281-20-000013,3067,10,18,1,4851,30,32,0
6,0000006951-20-000014,6441,75,39,0,7804,90,47,0
8,0000008670-20-000007,5946,46,81,0,4037,44,21,0
...,...,...,...,...,...,...,...,...,...
1043,0001744489-20-000046,6929,80,30,0,15947,196,73,0
1044,0001748790-20-000010,4016,86,28,0,5728,103,33,0
1047,0001757898-20-000003,6543,127,53,1,7777,119,38,2
1049,0001772016-20-000018,2997,43,20,0,6041,74,30,0


In [33]:
# #####################################################################
# ################### FOR SINGLE FILE INSPECTION ######################
# #####################################################################

# ############ Word Tokenization
# ## Raw tokens: including punctuations, numbers etc.
# with open(processed[1], 'r',  encoding = "utf-8") as file:
#     contents = file.read().replace('\n', ' ').replace('\xa0', ' ')
# tokens = word_tokenize(contents)

# #tokens

# ## Convert all words into small cases
# ## And keep tokens that purely consist of alphabetic characters only
# words = [w.lower() for w in tokens if w.isalpha() and len(w)>1 or w =='i']
# vocab = sorted(set(words))

# # words[2500:2600]
# # vocab[:50]

In [7]:
# def count_occurrence(tup, lst): 
#     count = 0
#     for item in tup: 
#         if item in lst: 
#             count+= 1
      
#     return count

# count_occurrence(words, lm_neg)

In [None]:
# gt_negation = ['no', 'not', 'none', 'neither', 'never', 'nobody'] ## Gunnel Totie, 1991, Negation in Speech and Writing

# def count_negation(tup, lst, negation): 
#     count = 0
#     for item in tup: 
#         if item in lst:
#             if tup.index(item)-4 > 0 and tup.index(item)+4 < len(tup):
#                 neighbor = tup[tup.index(item)-4:tup.index(item)+4]
#                 for neighborw in neighbor:
#                     if neighborw in negation:
#                         count+= 1

#             if tup.index(item)-4 < 0:
#                 pre = tup[0:tup.index(item)+4]
#                 for prew in pre:
#                     if prew in negation:
#                         count+= 1
                        
#             if tup.index(item)+4 > len(tup):
#                 post = tup[tup.index(item)-4:len(tup)]
#                 for postw in post:
#                     if postw in negation:
#                         count+= 1
#     return count

# count_negation(words, lm_pos, gt_negation)

In [21]:
# ########### Winsorize words with lenth smaller than 1% and largr than 99% of the document
# wordlen99 = np.quantile([len(w) for w in words], 0.99)
# wordlen1 = np.quantile([len(w) for w in words], 0.01)
# words = [w for w in words if len(w)<wordlen99 and len(w)>wordlen1]
# vocab = sorted(set(words))

# vocab[:50]

In [23]:
######### See the most common 20 words
# fdist = nltk.FreqDist(words)
# fdist.most_common(30)