In [17]:
import collections
import re
import sys
import time
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
# English stop words from the NLTK corpus, stored as a set.
stopWords = set(stopwords.words('english'))

# Additional words excluded from the n-gram counts alongside stopWords.
myStopWords = [
    "x200b", "http", "https", "www", "com",
    "good", "also", "would", "like", "get",
    "luck", "people", "sure", "use", "one",
    "need", "make", "hope", "best", "level",
    "really", "work", "want", "think", "business",
    "much", "know",
]

def tokenize(string):
    """Return the list of lowercase word tokens found in ``string``.

    The text is lowercased first, then every maximal run of regex word
    characters is extracted in order; punctuation and whitespace only
    act as separators and never appear in the result.
    """
    lowered = string.lower()
    return re.findall(r'\w+', lowered)


def count_ngrams(lines, min_length=1, max_length=3):
    """Count n-gram frequencies over the non-stop-words in ``lines``.

    Every line is split with ``tokenize``. Words found in ``stopWords``
    or ``myStopWords`` are dropped *before* n-grams are formed, and the
    sliding window is not reset between lines, so an n-gram may join
    words that were separated by a stop word or a line break.

    Args:
        lines: Iterable of strings (an open text file or a list of lines).
        min_length: Smallest n-gram size to count.
        max_length: Largest n-gram size to count.

    Returns:
        A dict mapping each n in ``min_length``..``max_length`` to a
        ``collections.Counter`` keyed by n-gram tuple, with the number of
        times that n-gram occurred as the value.
    """
    lengths = range(min_length, max_length + 1)
    ngrams = {length: collections.Counter() for length in lengths}
    # Sliding window holding the most recent max_length kept words.
    queue = collections.deque(maxlen=max_length)
    # Merge both stop lists once so each word costs one hash lookup
    # instead of a linear scan through the myStopWords list.
    excluded = set(stopWords) | set(myStopWords)

    def add_queue():
        """Count every n-gram that starts at the front of the window."""
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                ngrams[length][current[:length]] += 1

    # Slide the window over every kept word; once it is full, each new
    # word exposes one new window start to count.
    for line in lines:
        for word in tokenize(line):
            if word in excluded:
                continue
            queue.append(word)
            if len(queue) >= max_length:
                add_queue()

    # If fewer than max_length words were kept in total, the window never
    # filled and nothing has been counted yet: count the n-grams starting
    # at its first word before draining, otherwise they would be lost.
    if 0 < len(queue) < max_length:
        add_queue()

    # Drain the window so the shorter n-grams at the very end of the
    # input are counted as well.
    while len(queue) > min_length:
        queue.popleft()
        add_queue()

    return ngrams


def print_most_frequent(ngrams, num=10):
    """Print the ``num`` most frequent n-grams for each n-gram length.

    ``ngrams`` maps an n-gram length to a Counter of n-gram tuples.
    Lengths are reported in ascending order: a header line, then one
    "words: count" line per n-gram, then an empty line.
    """
    for size in sorted(ngrams):
        header = '----- {} most common {}-grams -----'.format(num, size)
        print(header)
        top_entries = ngrams[size].most_common(num)
        for words, freq in top_entries:
            phrase = ' '.join(words)
            print('{0}: {1}'.format(phrase, freq))
        print('')


if __name__ == '__main__':
    start_time = time.time()

    # Produce the same n-gram report for each corpus, positive first.
    for label, path in (
        ("positive sentiment", "finaldata_marketing_positive.txt"),
        ("negative sentiment", "finaldata_marketing_negative.txt"),
    ):
        print(label)
        with open(path, encoding='utf-8') as f:
            ngrams = count_ngrams(f)
        print_most_frequent(ngrams)

    # Report the wall-clock time spent on both corpora.
    elapsed_time = time.time() - start_time
    print('Took {:.03f} seconds'.format(elapsed_time))

positive sentiment
----- 10 most common 1-grams -----
marketing: 1421
digital: 813
google: 751
content: 572
ads: 516
social: 490
facebook: 478
seo: 471
time: 381
learn: 359

----- 10 most common 2-grams -----
digital marketing: 560
social media: 299
google ads: 108
google analytics: 100
email marketing: 91
content marketing: 79
facebook ads: 59
digital marketer: 56
landing page: 54
search engine: 47

----- 10 most common 3-grams -----
social media marketing: 46
digital marketing course: 15
part digital marketing: 15
digital marketing strategy: 14
marketing digital marketing: 13
seo social media: 12
message compose r: 11
google tag manager: 11
digital marketing manager: 10
social media examiner: 10

negative sentiment
----- 10 most common 1-grams -----
marketing: 1160
digital: 702
google: 399
social: 380
time: 364
media: 323
ads: 304
help: 246
way: 239
seo: 238

----- 10 most common 2-grams -----
digital marketing: 507
social media: 259
google analytics: 65
google ads: 57
digital market