[Issue #3] Unit Tests (#8)
* Fleshed out the husk of the unit tests

* Modularized blog parser a bit and removed error.txt

* Added simple happy path tests for analyzer

* Added autopep8 pre-commit hook
dvfeinblum committed Oct 20, 2018
1 parent 7f062c6 commit 67d598c
Showing 9 changed files with 117 additions and 29 deletions.
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,8 @@
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v1.3.0
+    hooks:
+    -   id: trailing-whitespace
+    -   id: end-of-file-fixer
+    -   id: autopep8-wrapper
+    -   id: requirements-txt-fixer
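For context, these hooks only fire once pre-commit wires them into the local clone; a typical one-time setup, assuming pre-commit has been installed from requirements.txt:

    pre-commit install            # registers the git hook in .git/hooks
    pre-commit run --all-files    # optional: apply every hook to the existing tree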
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,5 +1,6 @@
 beautifulsoup4 >= 4.6.3
 nltk >= 3.3
+pre-commit==1.10.3
 redis >= 2.10.6
 requests >= 2.19.1
-validators
+validators
Empty file added src/__init__.py
69 changes: 41 additions & 28 deletions src/blog_parser.py
@@ -5,8 +5,8 @@
 import re
 import requests as r
 
-from redis_init import nltk_client, LINKS_KEY, word_client
 import utils
+from redis_init import nltk_client, LINKS_KEY, word_client
 
 # Some useful constants for parsing blog html
 POST_URL_REL = "alternate"
@@ -15,9 +15,6 @@
 word_count = 0
 pos_counts = {}
 
-# Makes for easier debugging when it comes to nltk failures
-error_file = open('error.txt', 'w')
-
 
 def get_blogpost_links():
     """
Expand All @@ -44,13 +41,13 @@ async def fetch_posts(urls):

async def parse_blog_post(blog_link):
"""
Given a blog post's URL, this function GETs it and pulls the body out
Given a blog post's URL, this function GETs it and pulls the body out.
We then analyze each word with NTLK and write some data into redis.
:param blog_link: String
:return: Raw text from the blog post w/html tags stripped out
"""
global blogs_scraped_counter
global word_count
global error_file
print('Fetching raw text for {}'.format(blog_link))
soup = BeautifulSoup(r.get(blog_link).content,
features='html.parser')
@@ -61,31 +58,48 @@ async def parse_blog_post(blog_link):
     if utils.DEBUG_MODE:
         print('\nSanitized blogpost:\n{clean}\n\nOriginal text:{orig}'.format(clean=sanitized_post_text,
                                                                               orig=post_text))
-    for word in sanitized_post_text.split(' '):
-        # First we hit the word count cache
-        word_client.incr(word)
-        word_count = word_count + 1
-
-        # Now we do some nltk wizardry
-        try:
-            pos_array = nltk.pos_tag([word])
-
-            pos_tuple = pos_array[0]
-            pos = pos_tuple[1]
-            nltk_client.incr(pos)
-            if pos in pos_counts:
-                pos_counts[pos] = pos_counts[pos] + 1
-            else:
-                pos_counts[pos] = 1
-        except Exception as e:
-            # This is the only instance in which an exception is actually cause for concern
-            if len(word) > 0:
-                print('failed to nltk-ify a post.\nURL: {url}\nException: {e}'.format(e=e, url=blog_link))
-                error_file.write('URL: ' + blog_link + '\n' + repr(sanitized_post_text) + '\n')
-
+    [analyze_word(word, blog_link) for word in sanitized_post_text.split(' ')]
 
     blogs_scraped_counter = blogs_scraped_counter + 1
 
 
+def analyze_word(word, blog_link):
+    """
+    Given a word, we figure out its POS and store various info in redis.
+    :param word: str
+    :param blog_link: url that the word came from, only used for logging
+    :return: tuple containing the word and POS of that word
+    """
+    global word_count
+    # First we hit the word count cache
+    word_client.incr(word)
+    word_count = word_count + 1
+
+    # Now we do some nltk wizardry
+    try:
+        pos_array = nltk.pos_tag([word])
+
+        pos_tuple = pos_array[0]
+        pos = pos_tuple[1]
+        nltk_client.incr(pos)
+        if pos in pos_counts:
+            pos_counts[pos] = pos_counts[pos] + 1
+        else:
+            pos_counts[pos] = 1
+
+        # we don't actually need this but it's useful for testing
+        return pos_tuple
+    except IndexError:
+        # This is the only instance in which an exception is actually cause for concern
+        if len(word) > 0:
+            print('Failed to nltk-ify a post.\nURL: {url}\nWord: {word}'.format(url=blog_link,
+                                                                                word=word))
+    except Exception as e:
+        print('Word analyzer encountered an unexpected exception on word: {w}\n Exception:{e}'.format(w=word,
+                                                                                                       e=e))
+
+
 def get_results():
     """
     Once the run is complete, we'll spit out some stats.
@@ -121,4 +135,3 @@ def main():
     loop.close()
 
     get_results()
-    error_file.close()
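An aside on the IndexError branch above: nltk.pos_tag tags a list of tokens, and its default perceptron tagger raises IndexError on an empty string, which is exactly the benign case analyze_word swallows. A minimal sketch of the behavior the tests below rely on (assuming the stock English tagger model):

    import nltk

    nltk.download('averaged_perceptron_tagger')  # one-time model fetch

    print(nltk.pos_tag(['foo']))  # [('foo', 'NN')] -- the pos_tuple analyze_word returns
    nltk.pos_tag([''])            # raises IndexError, which analyze_word treats as benign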
Empty file added test/__init__.py
Empty file added test/mocks/__init__.py
15 changes: 15 additions & 0 deletions test/mocks/redis_client.py
@@ -0,0 +1,15 @@
+class MockRedis:
+    def __init__(self, cache=None):
+        self.cache = cache if cache is not None else {}
+
+    def incr(self, key):
+        self.cache[key] = self.cache.setdefault(key, 0) + 1
+
+    def dbsize(self):
+        return len(self.cache)
+
+    def set(self, k, v):
+        self.cache[k] = v
+
+    def get(self, k):
+        return self.cache.get(k)
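A quick sketch of the mock in action; it implements only the redis-py calls the parser actually makes (incr, get, set, dbsize), backed by a plain dict:

    cache = {}
    client = MockRedis(cache=cache)

    client.incr('foo')
    client.incr('foo')

    assert client.get('foo') == 2  # incr created the key, then bumped it
    assert client.dbsize() == 1    # one distinct key counted so far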
25 changes: 25 additions & 0 deletions test/test_blog_parser.py
@@ -0,0 +1,25 @@
+from unittest import TestCase, mock
+
+from src.blog_parser import get_results, analyze_word
+from test.mocks.redis_client import MockRedis
+
+
+class TestBlogParser(TestCase):
+
+    @mock.patch('src.blog_parser.word_client', new=MockRedis(cache={'foo': 2, 'bar': 1}))
+    @mock.patch('src.blog_parser.blogs_scraped_counter', new=1)
+    def test_result_generator(self):
+        """
+        Test ensures that stats can be calculated, given a functioning redis client
+        """
+        get_results()
+
+    @mock.patch('src.blog_parser.word_client', new=MockRedis(cache={'foo': 2, 'bar': 1}))
+    def test_word_analyze(self):
+        """
+        Check that the analyzer runs and doesn't bark at empty strings
+        """
+        self.assertEqual(analyze_word('foo', 'https://amifired.today'), ('foo', 'NN'))
+        self.assertIsNone(analyze_word('', 'https://foo.bar'))
26 changes: 26 additions & 0 deletions test/test_utils.py
@@ -0,0 +1,26 @@
+from unittest import TestCase
+
+from src.utils import sanitize_blogpost
+
+
+class TestUtilities(TestCase):
+    def test_sanitizer(self):
+        """
+        Ensures that the blog post sanitizer works as intended.
+        """
+        fake_post = 'Words! Mere words! \n\nHow terrible they were! ' \
+                    'How clear\n, and vivid, ~and cruel! \"One could not ' \
+                    'escape from them.\n And yet\n what a \'subtle magic there ' \
+                    'was in them! They seemed .to be able :to give a plastic ' \
+                    'form to\n\n formless things, and. to ;have a music \nof their ' \
+                    'own as sweet as that\n of viol or of lute. Mere words! Was ' \
+                    'there anything so real as words? Also here\'s a hyphenated-word.'
+        expected_result = 'words mere words how terrible they were ' \
+                          'how clear and vivid and cruel one could not ' \
+                          'escape from them and yet what a subtle magic ' \
+                          'there was in them they seemed to be able to give ' \
+                          'a plastic form to formless things and to have a music ' \
+                          'of their own as sweet as that of viol or of lute mere ' \
+                          'words was there anything so real as words also heres a hyphenated-word'
+
+        self.assertEqual(sanitize_blogpost(fake_post), expected_result)
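With the new __init__.py files making src and test importable packages, the whole suite should run from the repository root via stock unittest discovery; a typical invocation (assuming redis_init and the NLTK tagger data resolve at import time):

    python -m unittest discover -v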
