In [20]:
import collections
import re
import sys
import time
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
# English stop words from the NLTK corpus, held in a set for fast membership
# tests.
stopWords = set(stopwords.words('english'))

# Extra tokens to drop on top of the NLTK list. Tokens are compared after
# lowercasing and splitting on non-word characters, so every entry must be a
# single lowercase word.
myStopWords = [
    "reddit", "x200b", "http", "https", "www", "com", "free", "luck",
    "sure", "good", "sorry", "pm",
    # The original entry is misspelled ("cananda"); it is kept so existing
    # behaviour is unchanged, and the correctly spelled token is added so it
    # is actually filtered.
    "personalfinancecananda", "personalfinancecanada",
    "r", "would", "get", "also", "may", "need", "hilariously", "sad",
    "iis", "make", "like", "one", "take", "know", "wanting",
]

def tokenize(string):
    """Lowercase the input and return its words as a list.

    A "word" is any maximal run of regex word characters (letters, digits
    and underscore); punctuation and whitespace only act as separators.
    """
    lowered = string.lower()
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]


def count_ngrams(lines, min_length=1, max_length=3):
    """Count n-gram frequencies over an iterable of text lines.

    Each line is split with tokenize(); tokens found in stopWords or
    myStopWords are dropped *before* n-grams are formed. The sliding window
    is not reset between lines or at dropped tokens, so an n-gram may join
    words that were separated by a line break or by stop words in the
    original text.

    Args:
        lines: iterable of strings (a file object or a list of lines).
        min_length: smallest n-gram size to count.
        max_length: largest n-gram size to count.

    Returns:
        dict mapping each n in [min_length, max_length] to a
        collections.Counter of {n-gram tuple: occurrence count}.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = {length: collections.Counter() for length in lengths}
    # Sliding window over the most recent max_length kept words.
    queue = collections.deque(maxlen=max_length)
    # Build the combined stop-word set once instead of scanning the
    # myStopWords list for every token.
    excluded = set(stopWords) | set(myStopWords)

    # Helper function to add n-grams at start of current queue to dict
    def add_queue():
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                ngrams[length][current[:length]] += 1

    # Loop through all lines and words and add n-grams to dict
    for line in lines:
        for word in tokenize(line):
            if word in excluded:
                continue
            queue.append(word)
            # Only count once the window is full; shorter prefixes are
            # picked up by the tail handling below.
            if len(queue) >= max_length:
                add_queue()

    # If fewer than max_length words were kept, the window never filled and
    # nothing has been counted yet. Count the n-grams anchored at the first
    # word now; otherwise the drain loop below would pop it uncounted.
    if queue and len(queue) < max_length:
        add_queue()

    # Make sure we get the n-grams at the tail end of the queue
    while len(queue) > min_length:
        queue.popleft()
        add_queue()

    return ngrams


def print_most_frequent(ngrams, num=10):
    """Print the top `num` n-grams for every n-gram size in `ngrams`.

    `ngrams` maps an n-gram size to a Counter of word tuples. Sizes are
    reported in ascending order; each section has a header line, one
    "words: count" line per n-gram, and a trailing blank line.
    """
    for size in sorted(ngrams):
        header = '----- {} most common {}-grams -----'.format(num, size)
        print(header)
        top_entries = ngrams[size].most_common(num)
        for words, frequency in top_entries:
            phrase = ' '.join(words)
            print('{0}: {1}'.format(phrase, frequency))
        print('')


if __name__ == '__main__':
    start_time = time.time()

    # Both sentiment corpora get the same treatment: announce the label,
    # count n-grams straight from the open file, then print the top entries
    # once the file has been closed.
    for label, path in (
        ("positive sentiment", "finaldata_insurance_positive.txt"),
        ("negative sentiment", "finaldata_insurance_negative.txt"),
    ):
        print(label)
        with open(path, encoding='utf-8') as f:
            ngrams = count_ngrams(f)
        print_most_frequent(ngrams)

    # Report wall-clock time for the whole run.
    elapsed_time = time.time() - start_time
    print('Took {:.03f} seconds'.format(elapsed_time))

positive sentiment
----- 10 most common 1-grams -----
insurance: 169
life: 154
policy: 117
money: 88
pay: 78
trust: 59
help: 51
financial: 51
wife: 50
family: 44

----- 10 most common 2-grams -----
life insurance: 84
whole life: 34
social security: 15
insurance policy: 14
insurance company: 12
credit card: 12
life policy: 11
cash value: 10
financial planner: 9
family members: 8

----- 10 most common 3-grams -----
life insurance policy: 11
whole life policy: 8
beneficiary life insurance: 6
1 2 million: 5
whole life policies: 4
amount received financial: 4
received financial advisers: 4
financial advisers help: 4
advisers help percentage: 4
life insurance payout: 3

negative sentiment
----- 10 most common 1-grams -----
life: 485
insurance: 439
policy: 373
money: 219
term: 178
years: 159
whole: 138
pay: 134
family: 120
trust: 120

----- 10 most common 2-grams -----
life insurance: 233
whole life: 118
life policy: 59
insurance policy: 45
death benefit: 42
term life: 41
cash value: 36
term 