[Issue #3] Unit Tests (#8)
* Fleshed out the husk of the unit tests

* Modularized blog parser a bit and removed error.txt

* Added simple happy path tests for analyzer

* Added autopep8 pre-commit hook
dvfeinblum committed Oct 20, 2018
1 parent 7f062c6 commit 67d598c
Showing 9 changed files with 117 additions and 29 deletions.
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,8 @@
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v1.3.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: autopep8-wrapper
+    -   id: requirements-txt-fixer
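For context, these hooks only fire once pre-commit wires them into the local clone; a typical one-time setup, assuming pre-commit has been installed from requirements.txt:

    pre-commit install            # registers the git hook in .git/hooks
    pre-commit run --all-files    # optional: apply every hook to the existing tree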
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,5 +1,6 @@
 beautifulsoup4 >= 4.6.3
 nltk >= 3.3
+pre-commit==1.10.3
 redis >= 2.10.6
 requests >= 2.19.1
-validators
+validators
Empty file added src/__init__.py
69 changes: 41 additions & 28 deletions src/blog_parser.py
@@ -5,8 +5,8 @@
 import re
 import requests as r
 
-from redis_init import nltk_client, LINKS_KEY, word_client
 import utils
+from redis_init import nltk_client, LINKS_KEY, word_client
 
 # Some useful constants for parsing blog html
 POST_URL_REL = "alternate"
@@ -15,9 +15,6 @@
 word_count = 0
 pos_counts = {}
 
-# Makes for easier debugging when it comes to nltk failures
-error_file = open('error.txt', 'w')
-
 
 def get_blogpost_links():
     """
Expand All @@ -44,13 +41,13 @@ async def fetch_posts(urls):

async def parse_blog_post(blog_link):
"""
Given a blog post's URL, this function GETs it and pulls the body out
Given a blog post's URL, this function GETs it and pulls the body out.
We then analyze each word with NTLK and write some data into redis.
:param blog_link: String
:return: Raw text from the blog post w/html tags stripped out
"""
global blogs_scraped_counter
global word_count
global error_file
print('Fetching raw text for {}'.format(blog_link))
soup = BeautifulSoup(r.get(blog_link).content,
features='html.parser')
@@ -61,31 +58,48 @@ async def parse_blog_post(blog_link):
     if utils.DEBUG_MODE:
         print('\nSanitized blogpost:\n{clean}\n\nOriginal text:{orig}'.format(clean=sanitized_post_text,
                                                                               orig=post_text))
-    for word in sanitized_post_text.split(' '):
-        # First we hit the word count cache
-        word_client.incr(word)
-        word_count = word_count + 1
-
-        # Now we do some nltk wizardry
-        try:
-            pos_array = nltk.pos_tag([word])
-
-            pos_tuple = pos_array[0]
-            pos = pos_tuple[1]
-            nltk_client.incr(pos)
-            if pos in pos_counts:
-                pos_counts[pos] = pos_counts[pos] + 1
-            else:
-                pos_counts[pos] = 1
-        except Exception as e:
-            # This is the only instance in which an exception is actually cause for concern
-            if len(word) > 0:
-                print('failed to nltk-ify a post.\nURL: {url}\nException: {e}'.format(e=e, url=blog_link))
-                error_file.write('URL: ' + blog_link + '\n' + repr(sanitized_post_text) + '\n')
-
+    [analyze_word(word, blog_link) for word in sanitized_post_text.split(' ')]
 
     blogs_scraped_counter = blogs_scraped_counter + 1
 
 
+def analyze_word(word, blog_link):
+    """
+    Given a word, we figure out its POS and store various info in redis.
+    :param word: str
+    :param blog_link: url that the word came from, only used for logging
+    :return: tuple containing the word and POS of that word
+    """
+    global word_count
+    # First we hit the word count cache
+    word_client.incr(word)
+    word_count = word_count + 1
+
+    # Now we do some nltk wizardry
+    try:
+        pos_array = nltk.pos_tag([word])
+
+        pos_tuple = pos_array[0]
+        pos = pos_tuple[1]
+        nltk_client.incr(pos)
+        if pos in pos_counts:
+            pos_counts[pos] = pos_counts[pos] + 1
+        else:
+            pos_counts[pos] = 1
+
+        # we don't actually need this but it's useful for testing
+        return pos_tuple
+    except IndexError:
+        # This is the only instance in which an exception is actually cause for concern
+        if len(word) > 0:
+            print('Failed to nltk-ify a post.\nURL: {url}\nWord: {word}'.format(url=blog_link,
+                                                                                word=word))
+    except Exception as e:
+        print('Word analyzer encountered an unexpected exception on word: {w}\n Exception:{e}'.format(w=word,
+                                                                                                       e=e))
+
+
 def get_results():
     """
     Once the run is complete, we'll spit out some stats.
@@ -121,4 +135,3 @@ def main():
     loop.close()
 
     get_results()
-    error_file.close()
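An aside on the IndexError branch above: nltk.pos_tag tags a list of tokens, and its default perceptron tagger raises IndexError on an empty string, which is exactly the benign case analyze_word swallows. A minimal sketch of the behavior the tests below rely on (assuming the stock English tagger model):

    import nltk

    nltk.download('averaged_perceptron_tagger')  # one-time model fetch

    print(nltk.pos_tag(['foo']))  # [('foo', 'NN')] -- the pos_tuple analyze_word returns
    nltk.pos_tag([''])            # raises IndexError, which analyze_word treats as benign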
Empty file added test/__init__.py
Empty file added test/mocks/__init__.py
15 changes: 15 additions & 0 deletions test/mocks/redis_client.py
@@ -0,0 +1,15 @@
+class MockRedis:
+    def __init__(self, cache=None):
+        self.cache = cache if cache is not None else {}
+
+    def incr(self, key):
+        self.cache[key] = self.cache.setdefault(key, 0) + 1
+
+    def dbsize(self):
+        return len(self.cache)
+
+    def set(self, k, v):
+        self.cache[k] = v
+
+    def get(self, k):
+        return self.cache.get(k)
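A quick sketch of the mock in action; it implements only the redis-py calls the parser actually makes (incr, get, set, dbsize), backed by a plain dict:

    cache = {}
    client = MockRedis(cache=cache)

    client.incr('foo')
    client.incr('foo')

    assert client.get('foo') == 2  # incr created the key, then bumped it
    assert client.dbsize() == 1    # one distinct key counted so far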
25 changes: 25 additions & 0 deletions test/test_blog_parser.py
@@ -0,0 +1,25 @@
+from unittest import TestCase, mock
+
+from src.blog_parser import get_results, analyze_word
+from test.mocks.redis_client import MockRedis
+
+
+class TestBlogParser(TestCase):
+
+    @mock.patch('src.blog_parser.word_client', new=MockRedis(cache={'foo': 2, 'bar': 1}))
+    @mock.patch('src.blog_parser.blogs_scraped_counter', new=1)
+    def test_result_generator(self):
+        """
+        Test ensures that stats can be calculated, given a functioning redis client
+        """
+        get_results()
+
+    @mock.patch('src.blog_parser.word_client', new=MockRedis(cache={'foo': 2, 'bar': 1}))
+    def test_word_analyze(self):
+        """
+        Check that the analyzer runs and doesn't bark at empty strings
+        """
+        self.assertEqual(analyze_word('foo', 'https://amifired.today'), ('foo', 'NN'))
+        self.assertIsNone(analyze_word('', 'https://foo.bar'))
26 changes: 26 additions & 0 deletions test/test_utils.py
@@ -0,0 +1,26 @@
+from unittest import TestCase
+
+from src.utils import sanitize_blogpost
+
+
+class TestUtilities(TestCase):
+    def test_sanitizer(self):
+        """
+        Ensures that the blog post sanitizer works as intended.
+        """
+        fake_post = 'Words! Mere words! \n\nHow terrible they were! ' \
+                    'How clear\n, and vivid, ~and cruel! \"One could not ' \
+                    'escape from them.\n And yet\n what a \'subtle magic there ' \
+                    'was in them! They seemed .to be able :to give a plastic ' \
+                    'form to\n\n formless things, and. to ;have a music \nof their ' \
+                    'own as sweet as that\n of viol or of lute. Mere words! Was ' \
+                    'there anything so real as words? Also here\'s a hyphenated-word.'
+        expected_result = 'words mere words how terrible they were ' \
+                          'how clear and vivid and cruel one could not ' \
+                          'escape from them and yet what a subtle magic ' \
+                          'there was in them they seemed to be able to give ' \
+                          'a plastic form to formless things and to have a music ' \
+                          'of their own as sweet as that of viol or of lute mere ' \
+                          'words was there anything so real as words also heres a hyphenated-word'
+
+        self.assertEqual(sanitize_blogpost(fake_post), expected_result)
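With the new __init__.py files making src and test importable packages, the whole suite should run from the repository root via stock unittest discovery; a typical invocation (assuming redis_init and the NLTK tagger data resolve at import time):

    python -m unittest discover -v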
