In [1]:
############### import packages
import os, nltk, numpy as np, pandas as pd, time
from nltk import word_tokenize
from tqdm import tqdm
from time import process_time

##########################################################
##################### parameter ##########################
##########################################################
obj_type = '10-Q'
period_start = 2011 # included
period_end = 2013 # included

############### Set working directory to parent directory
os.getcwd()
# os.chdir('F:\\github\\narrative_conservatism\\code')

'F:\\github\\narrative_conservatism\\code'

In [2]:
############### Read LM disctionary
LM = pd.read_excel('..\\LM\\LoughranMcDonald_MasterDictionary_2018.xlsx', encoding = "utf-8")

############### Create negative, positive, uncertainty, litigious, constraining and modal word lists
lm_neg = LM.loc[LM['Negative'] != 0]['Word'].values.tolist()
lm_pos = LM.loc[LM['Positive'] != 0]['Word'].values.tolist()
lm_uctt = LM.loc[LM['Uncertainty'] != 0]['Word'].values.tolist()
lm_lit = LM.loc[LM['Litigious'] != 0]['Word'].values.tolist()
lm_cstr = LM.loc[LM['Constraining'] != 0]['Word'].values.tolist()

lm_modal1 = LM.loc[LM['Modal'] == 1]['Word'].values.tolist()
lm_modal2 = LM.loc[LM['Modal'] == 2]['Word'].values.tolist()
lm_modal3 = LM.loc[LM['Modal'] == 3]['Word'].values.tolist()

lm_neg = [w.lower() for w in lm_neg]
lm_pos = [w.lower() for w in lm_pos]
lm_uctt = [w.lower() for w in lm_uctt]
lm_lit = [w.lower() for w in lm_lit]
lm_cstr = [w.lower() for w in lm_cstr]
lm_modal1 = [w.lower() for w in lm_modal1]
lm_modal2 = [w.lower() for w in lm_modal2]
lm_modal3 = [w.lower() for w in lm_modal3]

############## Read and create stop words list
lm_stop = list()
with open('..\\LM\\StopWords_Generic.txt', "r") as f:
    for line in f:
        line = line.replace('\n', '')
        lm_stop.append(line)
        
lm_stop = [w.lower() for w in lm_stop]

############# Create a negation word list
gt_negation = ['no', 'not', 'none', 'neither', 'never', 'nobody'] ## Gunnel Totie, 1991, Negation in Speech and Writing

In [3]:
#####################################################################
#################### FOR ALL PROCESSED FILES LOOP ###################
#####################################################################

############# Create processed txt file index
processed = list()
for subdir, dirs, files in os.walk("H:\\data\\edgar\\processed\\" + obj_type + '\\' + str(period_start) + '-' + str(period_end)):
    for file in files:
        processed.append(os.path.join(subdir, file))

len(processed)

71497

In [4]:
############ Word Tokenization, count nword and nvocab, count negative, positive, uncertainty, litigious, constraining and modal words
accnum = list()

nw = list()
nvocab = list()

### Define a function count_occurrence to count the number of words in tup that pertaining to a lst 
def count_occurrence(tup, lst): 
    count = 0
    for item in tup: 
        if item in lst: 
            count+= 1
      
    return count

### Define a function count_negation to count cases where negation occurs within four or fewer words from a word identified in list.
def count_negation(tup, lst, negation): 
    count = 0
    for item in tup: 
        if item in lst:
            if tup.index(item)-4 > 0 and tup.index(item)+4 < len(tup):
                neighbor = tup[tup.index(item)-4:tup.index(item)+4]
                for neighborw in neighbor:
                    if neighborw in negation:
                        count+= 1

            if tup.index(item)-4 < 0:
                pre = tup[0:tup.index(item)+4]
                for prew in pre:
                    if prew in negation:
                        count+= 1
                        
            if tup.index(item)+4 > len(tup):
                post = tup[tup.index(item)-4:len(tup)]
                for postw in post:
                    if postw in negation:
                        count+= 1
    return count

n_neg = list()
n_pos = list()
n_uctt = list()
n_lit = list()
n_cstr = list()
n_modal1 = list()
n_modal2 = list()
n_modal3 = list()
n_negation = list()
net_pos = list()

# t1_start = process_time()
t1_start = time.time()

for text in tqdm(processed):
    ############# Create an array of accession number
    a = text.split("\\")[6].split(".")[0]
    accnum.append(a)
    
    ############# Read processed txt file
    with open(text, 'r',  encoding = "utf-8") as file:
        contents = file.read().replace('\n', '').replace('\xa0', '')
        # print(repr(contents))
        
        ############ Word Tokenization
        ## Raw tokens: including punctuations, numbers etc.
        tokens = word_tokenize(contents)

        ## Convert all words into small cases
        ## Keep tokens that purely consist of alphabetic characters only
        ## Delete single-character words except for 'I'
        words = [w.lower() for w in tokens if w.isalpha() and len(w)>1 or w =='i']
        
        ########### Delete words with lenth smaller than 1% and largr than 99% of the document
        # wordlen99 = np.quantile([len(w) for w in words], 0.99)
        # wordlen1 = np.quantile([len(w) for w in words], 0.01)
        # words = [w for w in words if len(w)<wordlen99 and len(w)>wordlen1]
        vocab = sorted(set(words))
        
        ########### Save text statistics
        ##### 1. nw: 1) nw 2) nw_mda 3) nw_notes
        ##### 2. nvocab: 1) nvvocab 2) nvocab_mda 3) nvocab_notes
        ##### 3. tone: 1) tone 2) tone_mda 3) tone_notes
        
        ## 1.1) nw
        a = len(words)
        nw.append(a)
        
        ## 2.1) nvocab
        b = len(vocab)
        nvocab.append(b)
        
        ## 3.1) tone
        neg = count_occurrence(words, lm_neg)
        n_neg.append(neg)
        pos = count_occurrence(words, lm_pos)
        n_pos.append(pos)
        uctt = count_occurrence(words, lm_uctt)
        n_uctt.append(uctt)
        lit = count_occurrence(words, lm_lit)
        n_lit.append(lit)
        cstr = count_occurrence(words, lm_cstr)
        n_cstr.append(cstr)
        modal1 = count_occurrence(words, lm_modal1)
        n_modal1.append(modal1)
        modal2 = count_occurrence(words, lm_modal2)
        n_modal2.append(modal2)
        modal3 = count_occurrence(words, lm_modal3)
        n_modal3.append(modal3)
        negation = count_negation(words, lm_pos, gt_negation)
        n_negation.append(negation)
        netpos = pos - negation
        net_pos.append(netpos)

# t1_end = process_time()
t1_end = time.time()
print("Elapsed time during the whole program in seconds:", t1_end - t1_start)

100%|█████████████████████████████████████████████████████████████████████████| 71497/71497 [43:56:58<00:00,  2.21s/it]


Elapsed time during the whole program in seconds: 158218.68824601173


In [5]:
############### Create Data Frame
d = {'accnum': accnum, 'nw': nw, 'nvocab': nvocab, 'n_neg': n_neg, 'n_pos': n_pos, 'n_uctt': n_uctt, 'n_lit': n_lit, 'n_cstr': n_cstr, \
     'n_modal_week': n_modal1, 'n_modal_moderate': n_modal2, 'n_modal_strong': n_modal3, 'n_negation': n_negation}

text_data = pd.DataFrame(data=d)
text_data.to_csv('..\\filings\\text_data_' + obj_type + '_' + str(period_start) + '-' + str(period_end) + '.csv', index=False)

text_data

Unnamed: 0,accnum,nw,nvocab,n_neg,n_pos,n_uctt,n_lit,n_cstr,n_modal_week,n_modal_moderate,n_modal_strong,n_negation
0,0000002178-11-000022,5709,1203,68,33,72,44,34,10,17,18,4
1,0000002178-11-000032,5878,1232,72,37,73,44,34,12,18,15,4
2,0000002178-11-000049,5912,1246,73,33,75,45,35,11,18,17,0
3,0000002178-12-000026,6303,1262,74,42,76,55,40,10,18,21,0
4,0000002178-12-000037,6418,1271,80,38,73,58,40,10,20,19,0
...,...,...,...,...,...,...,...,...,...,...,...,...
71492,0001589728-13-000008,22017,2096,208,216,248,141,82,103,44,81,0
71493,0001594062-13-000006,4837,975,47,31,54,34,39,20,19,14,0
71494,0001594062-13-000013,4197,825,43,25,22,33,19,6,12,7,1
71495,0001594062-13-000020,7217,1313,98,37,68,40,59,44,30,18,0


In [6]:
# #####################################################################
# ################### FOR SINGLE FILE INSPECTION ######################
# #####################################################################

# ############ Word Tokenization
# ## Raw tokens: including punctuations, numbers etc.
# with open(processed[1], 'r',  encoding = "utf-8") as file:
#     contents = file.read().replace('\n', '').replace('\xa0', '')
# tokens = word_tokenize(contents)

# #tokens

# ## Convert all words into small cases
# ## And keep tokens that purely consist of alphabetic characters only
# words = [w.lower() for w in tokens if w.isalpha() and len(w)>1 or w =='i']
# vocab = sorted(set(words))

# # words[2500:2600]
# # vocab[:50]

In [7]:
# def count_occurrence(tup, lst): 
#     count = 0
#     for item in tup: 
#         if item in lst: 
#             count+= 1
      
#     return count

# count_occurrence(words, lm_neg)

In [None]:
# gt_negation = ['no', 'not', 'none', 'neither', 'never', 'nobody'] ## Gunnel Totie, 1991, Negation in Speech and Writing

# def count_negation(tup, lst, negation): 
#     count = 0
#     for item in tup: 
#         if item in lst:
#             if tup.index(item)-4 > 0 and tup.index(item)+4 < len(tup):
#                 neighbor = tup[tup.index(item)-4:tup.index(item)+4]
#                 for neighborw in neighbor:
#                     if neighborw in negation:
#                         count+= 1

#             if tup.index(item)-4 < 0:
#                 pre = tup[0:tup.index(item)+4]
#                 for prew in pre:
#                     if prew in negation:
#                         count+= 1
                        
#             if tup.index(item)+4 > len(tup):
#                 post = tup[tup.index(item)-4:len(tup)]
#                 for postw in post:
#                     if postw in negation:
#                         count+= 1
#     return count

# count_negation(words, lm_pos, gt_negation)

In [21]:
# ########### Winsorize words with lenth smaller than 1% and largr than 99% of the document
# wordlen99 = np.quantile([len(w) for w in words], 0.99)
# wordlen1 = np.quantile([len(w) for w in words], 0.01)
# words = [w for w in words if len(w)<wordlen99 and len(w)>wordlen1]
# vocab = sorted(set(words))

# vocab[:50]

In [23]:
######### See the most common 20 words
# fdist = nltk.FreqDist(words)
# fdist.most_common(30)