In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import string

import pandas as pd
import numpy as np

import math

In [2]:
def make_word_tokens(filename):
    file = open(filename, 'rt')
    text = file.read()
    file.close()

    # split into words
    tokens = word_tokenize(text)

    # convert to lower case
    tokens = [w.lower() for w in tokens]
    
    return tokens

In [None]:
def make_sentence_tokens(filename):
    file = open(filename, 'rt')
    text = file.read()
    file.close()
    
    sent_tokenize_list = sent_tokenize(text)
    
    return sent_tokenize_list

In [3]:
def make_stopwords_list():
    stopwords_list = make_word_tokens('stopwords/StopWords_Generic.txt') + \
    make_word_tokens('stopwords/StopWords_Auditor.txt') + make_word_tokens('stopwords/StopWords_Currencies.txt') + \
    make_word_tokens('stopwords/StopWords_DatesandNumbers.txt') + make_word_tokens('stopwords/StopWords_GenericLong.txt') + \
    make_word_tokens('stopwords/StopWords_Geographic.txt') + make_word_tokens('stopwords/StopWords_Names.txt')
    
    return stopwords_list

In [4]:
def clean_text(filename):
    tokens = make_word_tokens(filename)

    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]

    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    
    #print(words)
    print(len(words))
    
    stopwords_list = make_stopwords_list()
    
    words = [w for w in words if not w in stopwords_list]
    
    #print(words)
    print(len(words))
    
    return words

In [5]:
def get_master_dictionary():
    # Assign spreadsheet filename to `file`
    file = 'masterdictionary/LoughranMcDonald_MasterDictionary_2016.xlsx'
    # Load spreadsheet
    xl = pd.ExcelFile(file)
    # Print the sheet names
    print(xl.sheet_names)
    
    # Load a sheet into a DataFrame by name: df1
    dict_df = xl.parse('LoughranMcDonald_MasterDictiona')
    dict_df = dict_df[['Word', 'Negative', 'Positive']]
    dict_df.replace(0, np.nan, inplace=True)
    
    dict_list = dict_df['Word'].tolist()
    temp = []
    for i in range(len(dict_list)):
        temp.append(str(dict_list[i]).lower())
    dict_list = temp
    
    return dict_df, dict_list

In [6]:
def derived_variables(filename):
    words = clean_text(filename)
    dict_df, dict_list = get_master_dictionary()
    
    positive = 0
    negative = 0
    for temp_word in words:
        flag = False
        for i in range(len(dict_df)):
            if temp_word in dict_list[i]:
                flag = True
                break
        if flag:
            if not math.isnan(dict_df.iloc[i]['Positive']):
                positive = positive + 1
                print("'{}' is positive".format(temp_word.upper()))
            if not math.isnan(dict_df.iloc[i]['Negative']):
                negative = negative - 1
                print("'{}' is negative".format(temp_word.upper()))
        else:
            print("'{}' is neither positive nor negative".format(temp_word.upper()))
        
    print('Positive Score: {}'.format(positive))
    print('Negative Score: {}'.format(negative))    
    polarity = (positive - negative)/((positive + negative) + 0.000001)
    subjectivity = (positive + negative)/((len(words)) + 0.000001)
    print('Polarity: {}'.format(polarity))
    print('Subjectivity: {}'.format(subjectivity))

In [7]:
filename = 'data/0000004457-00-000018.txt'
derived_variables(filename)

8635
4392
['LoughranMcDonald_MasterDictiona']
'PRIVACYENHANCED' is neither positive nor negative
'PROCTYPE' is neither positive nor negative
'MICCLEAR' is neither positive nor negative
'ORIGINATORNAME' is neither positive nor negative
'WWWSECGOV' is neither positive nor negative
'ORIGINATORKEYASYMMETRIC' is neither positive nor negative
'MICINFO' is neither positive nor negative
'RSA' is negative
'SECDOCUMENT' is neither positive nor negative
'SECHEADER' is neither positive nor negative
'AMERCO' is neither positive nor negative
'SERVICESAUTO' is neither positive nor negative
'VALUES' is negative
'AIRMOTIVE' is neither positive nor negative
'AIRMOTIVE' is neither positive nor negative
'AMERCO' is neither positive nor negative
'SECHEADER' is neither positive nor negative
'AMERCO' is neither positive nor negative
'AIRMOTIVE' is neither positive nor negative
'UHAUL' is neither positive nor negative
'AMERCO' is neither positive nor negative
'UHAUL' is neither positive nor negative
'UHAUL' i

'UNPAID' is negative
'PAID' is negative
'PAID' is negative
'UNPAID' is negative
'UNFORESEEN' is negative
'INTERESTSENSITIVE' is neither positive nor negative
'BENEFIT' is positive
'BENEFIT' is positive
'RATIO' is negative
'LIQUID' is negative
'LIQUIDITY' is negative
'AMERCO' is neither positive nor negative
'PAID' is negative
'ADVERSE' is negative
'AMERCO' is neither positive nor negative
'UHAUL' is neither positive nor negative
'UHAUL' is neither positive nor negative
'AMERCO' is neither positive nor negative
'MEDIUMTERM' is neither positive nor negative
'AMERCO' is neither positive nor negative
'AMERCO' is neither positive nor negative
'AMERCO' is neither positive nor negative
'PLACING' is negative
'AMERCO' is neither positive nor negative
'AMERCO' is neither positive nor negative
'AMERCO' is neither positive nor negative
'AMERCO' is neither positive nor negative
'AMERCO' is neither positive nor negative
'STATEMENTS' is negative
'AMERCO' is neither positive nor negative
'PROBLEMS' is