In [1]:
############### import packages
import os, nltk, numpy as np, pandas as pd, time
from nltk import word_tokenize
from tqdm import tqdm
from time import process_time

##########################################################
##################### parameter ##########################
##########################################################
obj_type = '10-Q'
period_start = 2011 # included
period_end = 2013 # included

############### Set working directory to parent directory
os.getcwd()
# os.chdir('F:\\github\\narrative_conservatism\\code')

'F:\\github\\narrative_conservatism\\code'

In [2]:
############### Read LM disctionary
LM = pd.read_excel('..\\LM\\LoughranMcDonald_MasterDictionary_2018.xlsx', encoding = "utf-8")

############### Create negative, positive, uncertainty, litigious, constraining and modal word lists
lm_neg = LM.loc[LM['Negative'] != 0]['Word'].values.tolist()
lm_pos = LM.loc[LM['Positive'] != 0]['Word'].values.tolist()
lm_uctt = LM.loc[LM['Uncertainty'] != 0]['Word'].values.tolist()
lm_lit = LM.loc[LM['Litigious'] != 0]['Word'].values.tolist()
lm_cstr = LM.loc[LM['Constraining'] != 0]['Word'].values.tolist()

lm_modal1 = LM.loc[LM['Modal'] == 1]['Word'].values.tolist()
lm_modal2 = LM.loc[LM['Modal'] == 2]['Word'].values.tolist()
lm_modal3 = LM.loc[LM['Modal'] == 3]['Word'].values.tolist()

lm_neg = [w.lower() for w in lm_neg]
lm_pos = [w.lower() for w in lm_pos]
lm_uctt = [w.lower() for w in lm_uctt]
lm_lit = [w.lower() for w in lm_lit]
lm_cstr = [w.lower() for w in lm_cstr]
lm_modal1 = [w.lower() for w in lm_modal1]
lm_modal2 = [w.lower() for w in lm_modal2]
lm_modal3 = [w.lower() for w in lm_modal3]

############## Read and create stop words list
lm_stop = list()
with open('..\\LM\\StopWords_Generic.txt', "r") as f:
    for line in f:
        line = line.replace('\n', '')
        lm_stop.append(line)
        
lm_stop = [w.lower() for w in lm_stop]

############# Create a negation word list
gt_negation = ['no', 'not', 'none', 'neither', 'never', 'nobody'] ## Gunnel Totie, 1991, Negation in Speech and Writing

In [3]:
#####################################################################
#################### FOR ALL PROCESSED FILES LOOP ###################
#####################################################################

############# Create input txt file index
processed = list()
for subdir, dirs, files in os.walk("H:\\data\\edgar\\processed\\" + obj_type + '\\' + str(period_start) + '-' + str(period_end)):
    for file in files:
        processed.append(os.path.join(subdir, file))

len(processed)

71497

In [4]:
#### Define a function count_occurrence to count the number of words in tup that pertaining to a list 
def count_occurrence(tup, lst): 
    count = 0
    for item in tup: 
        if item in lst: 
            count+= 1
      
    return count

### Define a function count_negation to count cases where negation occurs within four or fewer words from a word identified in list.
def count_negation(tup, lst, negation): 
    count = 0
    for item in tup: 
        if item in lst:
            if tup.index(item)-4 > 0 and tup.index(item)+4 < len(tup):
                neighbor = tup[tup.index(item)-4:tup.index(item)+4]
                for neighborw in neighbor:
                    if neighborw in negation:
                        count+= 1

            if tup.index(item)-4 < 0:
                pre = tup[0:tup.index(item)+4]
                for prew in pre:
                    if prew in negation:
                        count+= 1
                        
            if tup.index(item)+4 > len(tup):
                post = tup[tup.index(item)-4:len(tup)]
                for postw in post:
                    if postw in negation:
                        count+= 1
    return count

In [11]:
############ Full Text Raw Count
accnum = list()

nw = list()
nvocab = list()

n_neg = list()
n_pos = list()
n_uctt = list()
n_lit = list()
n_cstr = list()
n_modal1 = list()
n_modal2 = list()
n_modal3 = list()
n_negation = list()

############ Word Tokenization, count nword and nvocab, count negative, positive, uncertainty, litigious, constraining and modal words
for text in tqdm(processed):
    ############# Create an array of accession number
    a = text.split("\\")[6].split(".")[0]
    accnum.append(a)
    
    ############# Read processed txt file
    with open(text, 'r',  encoding = "utf-8") as file:
        contents = file.read().replace('\n', ' ').replace('\xa0', ' ')
        # print(repr(contents))
        
        ############ Word Tokenization
        ## Raw tokens: including punctuations, numbers etc.
        tokens = word_tokenize(contents)

        ## Convert all words into small cases
        ## Keep tokens that purely consist of alphabetic characters only
        ## Delete single-character words except for 'I'
        words = [w.lower() for w in tokens if w.isalpha() and len(w)>1 or w =='i']
        
        ########### Delete words with lenth smaller than 1% and largr than 99% of the document
        # wordlen99 = np.quantile([len(w) for w in words], 0.99)
        # wordlen1 = np.quantile([len(w) for w in words], 0.01)
        # words = [w for w in words if len(w)<wordlen99 and len(w)>wordlen1]
        vocab = sorted(set(words))
        
        ########### Save text statistics
        ##### 1. nw: 1) nw 2) nw_mda 3) nw_notes
        ##### 2. nvocab: 1) nvvocab 2) nvocab_mda 3) nvocab_notes
        ##### 3. tone: 1) tone 2) tone_mda 3) tone_notes
        
        ## 1.1) nw
        a = len(words)
        nw.append(a)
        
        ## 2.1) nvocab
        b = len(vocab)
        nvocab.append(b)
        
        ## 3.1) tone
        neg = count_occurrence(words, lm_neg)
        n_neg.append(neg)
        pos = count_occurrence(words, lm_pos)
        n_pos.append(pos)
        uctt = count_occurrence(words, lm_uctt)
        n_uctt.append(uctt)
        lit = count_occurrence(words, lm_lit)
        n_lit.append(lit)
        cstr = count_occurrence(words, lm_cstr)
        n_cstr.append(cstr)
        modal1 = count_occurrence(words, lm_modal1)
        n_modal1.append(modal1)
        modal2 = count_occurrence(words, lm_modal2)
        n_modal2.append(modal2)
        modal3 = count_occurrence(words, lm_modal3)
        n_modal3.append(modal3)
        negation = count_negation(words, lm_pos, gt_negation)
        n_negation.append(negation)

100%|██████████████████████████████████████████████████████████████████████████████| 1051/1051 [05:49<00:00,  3.01it/s]


In [12]:
############### Create Data Frame: full document
d = {'accnum': accnum, 'nw': nw, 'nvocab': nvocab, 'n_neg': n_neg, 'n_pos': n_pos, 'n_uctt': n_uctt, 'n_lit': n_lit, 'n_cstr': n_cstr, \
     'n_modal_week': n_modal1, 'n_modal_moderate': n_modal2, 'n_modal_strong': n_modal3, 'n_negation': n_negation}

text_data = pd.DataFrame(data=d)
text_data.to_csv('..\\filings\\text_data_' + obj_type + '_' + str(period_start) + '-' + str(period_end) + '.csv', index=False)

text_data

Unnamed: 0,accnum,nw_mda,nvocab_mda,n_neg_mda,n_pos_mda,n_uctt_mda,n_lit_mda,n_cstr_mda,n_modal_strong_mda,n_modal_moderate_mda,n_modal_weak_mda,n_negation_mda
0,0000002969-20-000010,4088,738,43,52,30,33,14,1,9,9,0
1,0000003545-20-000039,3662,923,25,19,65,20,19,12,13,12,0
2,0000004127-20-000007,1544,497,13,8,22,9,9,3,5,7,1
3,0000004457-20-000027,8667,1158,89,48,93,50,36,16,19,22,0
4,0000006281-20-000013,3067,757,10,18,49,16,21,14,4,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1046,0001753926-20-000021,0,0,0,0,0,0,0,0,0,0,0
1047,0001757898-20-000003,6543,1064,127,53,92,64,24,13,13,33,1
1048,0001766016-20-000002,0,0,0,0,0,0,0,0,0,0,0
1049,0001772016-20-000018,2997,756,43,20,38,23,19,4,8,10,0


In [5]:
############ MDA and NOTE Raw Count
accnum = list()

nw_mda = list()
nvocab_mda = list()

n_neg_mda = list()
n_pos_mda = list()
n_uctt_mda = list()
n_lit_mda = list()
n_cstr_mda = list()
n_modal1_mda = list()
n_modal2_mda = list()
n_modal3_mda = list()
n_negation_mda = list()

nw_note = list()
nvocab_note = list()

n_neg_note = list()
n_pos_note = list()
n_uctt_note = list()
n_lit_note = list()
n_cstr_note = list()
n_modal1_note = list()
n_modal2_note = list()
n_modal3_note = list()
n_negation_note = list()

############ Word Tokenization, count nword and nvocab, count negative, positive, uncertainty, litigious, constraining and modal words
for text in tqdm(processed):
    ############# Create an array of accession number
    a = text.split("\\")[6].split(".")[0]
    accnum.append(a)
    
    ############# Read processed txt file
    with open(text, 'r',  encoding = "utf-8") as file:
        contents = file.read().replace('\n', ' ').replace('\xa0', ' ')
        ############################## TO EXTRACT MDA AND NOTES SECTION, UNCOMMENT THIS SECTION ################################
        try:
            mda = contents[contents.index("ITEM 2."):contents.index("ITEM 3.")]
        except:
            try:
                mda = contents[contents.index("Item 2."):contents.index("Item 3.")]
            except:
                try:
                    mda = contents[contents.index("ITEM 2"):contents.index("ITEM 3")]
                except:
                    try:
                        mda = contents[contents.index("Item 2"):contents.index("Item 3")]
                    except:
                        mda = ''
                        pass
                    
        try:
            note = contents[contents.index("NOTES TO"):contents.index("ITEM 2.")]
        except:
            try:
                note = contents[contents.index("NOTES TO"):contents.index("ITEM 2")]
            except:
                try:
                    note = contents[contents.index("Notes to"):contents.index("Item 2.")]
                except:
                    try:
                        note = contents[contents.index("Notes to"):contents.index("Item 2")]
                    except:
                        note = ''
                        pass
        ###########################################################################################################
        # print(repr(contents))
        
        ############ Word Tokenization
        ## Raw tokens: including punctuations, numbers etc.
        tokens_mda = word_tokenize(mda)
        tokens_note = word_tokenize(note)
        
        ####################################################################

        ## Convert all words into small cases
        ## Keep tokens that purely consist of alphabetic characters only
        ## Delete single-character words except for 'I'
        words_mda = [w.lower() for w in tokens_mda if w.isalpha() and len(w)>1 or w =='i']
        words_note = [w.lower() for w in tokens_note if w.isalpha() and len(w)>1 or w =='i']
        
        ########### Delete words with lenth smaller than 1% and largr than 99% of the document
        # wordlen99 = np.quantile([len(w) for w in words], 0.99)
        # wordlen1 = np.quantile([len(w) for w in words], 0.01)
        # words = [w for w in words if len(w)<wordlen99 and len(w)>wordlen1]
        vocab_mda = sorted(set(words_mda))
        vocab_note = sorted(set(words_note))
        
        ########### Save text statistics
        ##### 1. nw: 1) nw 2) nw_mda 3) nw_notes
        ##### 2. nvocab: 1) nvvocab 2) nvocab_mda 3) nvocab_notes
        ##### 3. tone: 1) tone 2) tone_mda 3) tone_notes
        
        ## 1.1) nw
        a_mda = len(words_mda)
        nw_mda.append(a_mda)
        a_note = len(words_note)
        nw_note.append(a_note)
        
        ## 2.1) nvocab
#         b_mda = len(vocab_mda)
#         nvocab_mda.append(b_mda)
#         b_note = len(vocab_note)
#         nvocab_note.append(b_note)
        
        ## 3.1) tone
        neg_mda = count_occurrence(words_mda, lm_neg)
        n_neg_mda.append(neg_mda)
        pos_mda = count_occurrence(words_mda, lm_pos)
        n_pos_mda.append(pos_mda)
#         uctt_mda = count_occurrence(words_mda, lm_uctt)
#         n_uctt_mda.append(uctt_mda)
#         lit_mda = count_occurrence(words_mda, lm_lit)
#         n_lit_mda.append(lit_mda)
#         cstr_mda = count_occurrence(words_mda, lm_cstr)
#         n_cstr_mda.append(cstr_mda)
#         modal1_mda = count_occurrence(words_mda, lm_modal1)
#         n_modal1_mda.append(modal1_mda)
#         modal2_mda = count_occurrence(words_mda, lm_modal2)
#         n_modal2_mda.append(modal2_mda)
#         modal3_mda = count_occurrence(words_mda, lm_modal3)
#         n_modal3_mda.append(modal3_mda)
        negation_mda = count_negation(words_mda, lm_pos, gt_negation)
        n_negation_mda.append(negation_mda)
        
        neg_note = count_occurrence(words_note, lm_neg)
        n_neg_note.append(neg_note)
        pos_note = count_occurrence(words_note, lm_pos)
        n_pos_note.append(pos_note)
#         uctt_note = count_occurrence(words_note, lm_uctt)
#         n_uctt_note.append(uctt_note)
#         lit_note = count_occurrence(words_note, lm_lit)
#         n_lit_note.append(lit_note)
#         cstr_note = count_occurrence(words_note, lm_cstr)
#         n_cstr_note.append(cstr_note)
#         modal1_note = count_occurrence(words_note, lm_modal1)
#         n_modal1_note.append(modal1_note)
#         modal2_note = count_occurrence(words_note, lm_modal2)
#         n_modal2_note.append(modal2_note)
#         modal3_note = count_occurrence(words_note, lm_modal3)
#         n_modal3_note.append(modal3_note)
        negation_note = count_negation(words_note, lm_pos, gt_negation)
        n_negation_note.append(negation_note)

100%|█████████████████████████████████████████████████████████████████████████| 71497/71497 [14:14:25<00:00,  1.39it/s]


In [6]:
############### Create Data Frame: MDA and NOTES
d = {'accnum': accnum, 'nw_mda': nw_mda, 'n_neg_mda': n_neg_mda, 'n_pos_mda': n_pos_mda, 'n_negation_mda': n_negation_mda, 'nw_note': nw_note, 'n_neg_note': n_neg_note, 'n_pos_note': n_pos_note, 'n_negation_note': n_negation_note}
#      'nvocab_mda': nvocab_mda, 'n_uctt_mda': n_uctt_mda, 'n_lit_mda': n_lit_mda, 'n_cstr_mda': n_cstr_mda, \
#      'n_modal_strong_mda': n_modal1_mda, 'n_modal_moderate_mda': n_modal2_mda, 'n_modal_weak_mda': n_modal3_mda, \
#      'nvocab_note': nvocab_note, 'n_uctt_note': n_uctt_note, 'n_lit_note': n_lit_note, 'n_cstr_note': n_cstr_note, \
#      'n_modal_strong_note': n_modal1_note, 'n_modal_moderate_note': n_modal2_note, 'n_modal_weak_note': n_modal3_note}

text_data = pd.DataFrame(data=d)
print('percentage of filings whose MDA and NOTES are successfully extracted: ' + str(text_data[(text_data['nw_mda']!=0) & (text_data['nw_note']!=0)].shape[0]/text_data.shape[0]))
text_data = text_data[(text_data['nw_mda']!=0) & (text_data['nw_note']!=0)]

text_data.to_csv('..\\filings\\text_data_section_' + str(period_start) + '-' + str(period_end) + '.csv', index=False)
text_data 

percentage of filings whose MDA and NOTES are successfully extracted: 0.5557296110326307


Unnamed: 0,accnum,nw_mda,n_neg_mda,n_pos_mda,n_negation_mda,nw_note,n_neg_note,n_pos_note,n_negation_note
24,0000003570-12-000037,6363,33,27,0,2866,33,6,0
25,0000003570-12-000104,7135,45,28,0,3511,40,7,0
26,0000003570-12-000143,7721,47,25,0,5261,65,23,0
27,0000003570-13-000071,7512,48,22,0,6515,71,31,0
28,0000003570-13-000161,8241,55,25,0,7840,75,30,0
...,...,...,...,...,...,...,...,...,...
71487,0001582718-13-000032,713,7,3,0,1752,17,8,0
71488,0001582718-13-000043,713,7,3,0,1753,17,8,0
71489,0001582741-13-000024,1839,34,22,0,2313,31,13,0
71493,0001594062-13-000006,2209,25,18,0,2189,25,13,0


In [33]:
# #####################################################################
# ################### FOR SINGLE FILE INSPECTION ######################
# #####################################################################

# ############ Word Tokenization
# ## Raw tokens: including punctuations, numbers etc.
# with open(processed[6], 'r',  encoding = "utf-8") as file:
#     contents = file.read().replace('\n', ' ').replace('\xa0', ' ')
# tokens = word_tokenize(contents)

# #tokens

# ## Convert all words into small cases
# ## And keep tokens that purely consist of alphabetic characters only
# words = [w.lower() for w in tokens if w.isalpha() and len(w)>1 or w =='i']
# vocab = sorted(set(words))

# # words[2500:2600]
# # vocab[:50]

In [7]:
# def count_occurrence(tup, lst): 
#     count = 0
#     for item in tup: 
#         if item in lst: 
#             count+= 1
      
#     return count

# count_occurrence(words, lm_neg)

In [None]:
# gt_negation = ['no', 'not', 'none', 'neither', 'never', 'nobody'] ## Gunnel Totie, 1991, Negation in Speech and Writing

# def count_negation(tup, lst, negation): 
#     count = 0
#     for item in tup: 
#         if item in lst:
#             if tup.index(item)-4 > 0 and tup.index(item)+4 < len(tup):
#                 neighbor = tup[tup.index(item)-4:tup.index(item)+4]
#                 for neighborw in neighbor:
#                     if neighborw in negation:
#                         count+= 1

#             if tup.index(item)-4 < 0:
#                 pre = tup[0:tup.index(item)+4]
#                 for prew in pre:
#                     if prew in negation:
#                         count+= 1
                        
#             if tup.index(item)+4 > len(tup):
#                 post = tup[tup.index(item)-4:len(tup)]
#                 for postw in post:
#                     if postw in negation:
#                         count+= 1
#     return count

# count_negation(words, lm_pos, gt_negation)

In [21]:
# ########### Winsorize words with lenth smaller than 1% and largr than 99% of the document
# wordlen99 = np.quantile([len(w) for w in words], 0.99)
# wordlen1 = np.quantile([len(w) for w in words], 0.01)
# words = [w for w in words if len(w)<wordlen99 and len(w)>wordlen1]
# vocab = sorted(set(words))

# vocab[:50]

In [23]:
######### See the most common 20 words
# fdist = nltk.FreqDist(words)
# fdist.most_common(30)