[Issue #4] Use New Lexicount DB (#11)
* Added a blog_parser role and fleshed out basic db_utils

* Added a mock client for testing and built out some queries

* Finished all queries and they actually work.

* Updated unittests

* Added myself as a user for tomfoolery
dvfeinblum committed Oct 27, 2018
1 parent ffb55f4 commit 38a4946
Showing 14 changed files with 173 additions and 20 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -4,11 +4,17 @@ This is a little tool I wrote to see how obnoxious a writer I am.
# Getting Started
## Setup
The backend for this tool is a simple postgres database. For ease of use, you can set it up using docker:
```
```bash
cd sqitch/
docker-compose up
```

Also, there's a Redis instance used for caching. That's also nice and easy to set up:
```bash
docker pull redis
docker run -d -p 6379:6379 redis
```

Now that you have the db, you're actually ready to go. Set up a `venv` and install requirements with:
```
python3 -m venv venv
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,6 +1,7 @@
beautifulsoup4 >= 4.6.3
nltk >= 3.3
pre-commit==1.10.3
psycopg2 >= 2.7.5
redis >= 2.10.6
requests >= 2.19.1
validators
22 changes: 22 additions & 0 deletions sqitch/deploy/role.blog_parser.sql
@@ -0,0 +1,22 @@
-- Deploy lexicount:role.blog_parser to pg

BEGIN;

SET ROLE sqitch;

DO
$do$
BEGIN
    IF NOT EXISTS (
        SELECT *
        FROM pg_catalog.pg_roles
        WHERE rolname = 'blog_parser') THEN
        CREATE ROLE blog_parser LOGIN;
    END IF;
END
$do$;

GRANT SELECT, INSERT, UPDATE ON public.blog_details TO blog_parser;
GRANT SELECT, INSERT, UPDATE ON public.word_details TO blog_parser;

COMMIT;
5 changes: 3 additions & 2 deletions sqitch/deploy/table.public.blog_details.sql
@@ -5,9 +5,10 @@ BEGIN;
SET ROLE sqitch;

CREATE TABLE IF NOT EXISTS public.blog_details (
    url VARCHAR(100) PRIMARY KEY,
    url VARCHAR(100),
    word VARCHAR(100),
    count INTEGER
    count INTEGER,
    PRIMARY KEY(url, word)
);

COMMIT;
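The switch from a single-column key to `PRIMARY KEY(url, word)` is what makes per-post word counts upsertable: the `_BLOG_UPDATE_QUERY` in `src/db_utils.py` below conflicts on exactly this pair. A minimal sketch of that upsert (the row values here are hypothetical):

```sql
-- Hypothetical row; the conflict target matches the composite key above
INSERT INTO public.blog_details (url, word, count)
VALUES ('https://example.com/post', 'lexicount', 1)
ON CONFLICT (url, word) DO UPDATE
SET count = blog_details.count + 1;
```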
21 changes: 21 additions & 0 deletions sqitch/revert/role.blog_parser.sql
@@ -0,0 +1,21 @@
-- Revert lexicount:role.blog_parser from pg

BEGIN;

SET ROLE sqitch;

DO
$do$
BEGIN
    -- Only revoke and drop if the role actually exists
    IF EXISTS (
        SELECT *
        FROM pg_catalog.pg_roles
        WHERE rolname = 'blog_parser') THEN
        REVOKE ALL PRIVILEGES ON TABLE "blog_details" FROM blog_parser;
        REVOKE ALL PRIVILEGES ON TABLE "word_details" FROM blog_parser;
        DROP ROLE blog_parser;
    END IF;
END
$do$;

COMMIT;
3 changes: 3 additions & 0 deletions sqitch/sqitch.plan
@@ -5,3 +5,6 @@
table.public.word_details 2018-10-27T07:04:48Z David Feinblum <dvfeinblum@gmail.com> # created word_details table
table.public.blog_details 2018-10-27T14:26:55Z David Feinblum <dvfeinblum@gmail.com> # created blog_details table
@0.0.1 2018-10-27T15:22:37Z David Feinblum <dvfeinblum@gmail.com> # initial table creation

role.blog_parser 2018-10-27T17:13:37Z David Feinblum <dvfeinblum@gmail.com> # added a role for the blog parser
@0.1.0 2018-10-27T19:50:01Z David Feinblum <dvfeinblum@gmail.com> # Finished necessary tables and permissioning for parser
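Tags like `@0.1.0` mark named points in the plan that you can deploy or revert to. Assuming stock Sqitch and the target URI used in `start_postgres.sh`, rolling back to the pre-role state would look something like:

```bash
sqitch revert --to @0.0.1 db:postgres://sqitch@mypgdb:5432/lexicount
```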
2 changes: 1 addition & 1 deletion sqitch/start_postgres.sh
@@ -4,6 +4,6 @@ until psql -h mypgdb -p 5432 --username sqitch -d lexicount -c '\l'; do
    >&2 echo "DB is unavailable - sleeping"
    sleep 1
done

psql -h mypgdb -p 5432 --username sqitch -d lexicount -c 'CREATE ROLE dfeinblu LOGIN SUPERUSER'
cd /sqitch
sqitch deploy db:postgres://sqitch@mypgdb:5432/lexicount
15 changes: 15 additions & 0 deletions sqitch/verify/role.blog_parser.sql
@@ -0,0 +1,15 @@
-- Verify lexicount:role.blog_parser on pg

BEGIN;

SET ROLE blog_parser;

SELECT has_table_privilege('public.blog_details', 'UPDATE');
SELECT has_table_privilege('public.blog_details', 'INSERT');
SELECT has_table_privilege('public.blog_details', 'SELECT');

SELECT has_table_privilege('public.word_details', 'UPDATE');
SELECT has_table_privilege('public.word_details', 'INSERT');
SELECT has_table_privilege('public.word_details', 'SELECT');

ROLLBACK;
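Sqitch runs this script during its `verify` step and fails the change if any statement errors. Against the docker-compose target used in `start_postgres.sh`, that would look something like:

```bash
sqitch verify db:postgres://sqitch@mypgdb:5432/lexicount
```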
22 changes: 14 additions & 8 deletions src/blog_parser.py
@@ -5,8 +5,9 @@
import re
import requests as r

from db_utils import update_blog_details, update_word_details, get_unique_words, close_db_connection
import utils
from redis_init import nltk_client, LINKS_KEY, word_client
from redis_init import LINKS_KEY, word_client

# Some useful constants for parsing blog html
POST_URL_REL = "alternate"
@@ -72,8 +73,6 @@ def analyze_word(word, blog_link):
    :return: tuple containing the word and POS of that word
    """
    global word_count
    # First we hit the word count cache
    word_client.incr(word)
    word_count = word_count + 1

    # Now we do some nltk wizardry
@@ -82,7 +81,10 @@

    pos_tuple = pos_array[0]
    pos = pos_tuple[1]
    nltk_client.incr(pos)

    # Send some info to the db
    update_word_details(word, pos)
    update_blog_details(word, blog_link)
    if pos in pos_counts:
        pos_counts[pos] = pos_counts[pos] + 1
    else:
@@ -105,21 +107,24 @@ def get_results():
    Once the run is complete, we'll spit out some stats.
    """
    # we subtract one because of the blog_links entry
    unique_word_count = word_client.dbsize() - 1
    unique_word_count = get_unique_words()
    print('\nRESULTS\n')
    print('Number of words found across all posts: {}'.format(word_count))
    print('Number of unique words found across all posts: {}'.format(unique_word_count))
    print('Number of posts scraped: {}\n'.format(blogs_scraped_counter))
    print('Average repeat-rate of all words: {}'.format(word_count / unique_word_count))
    print('Average words per post: {}'.format(word_count / blogs_scraped_counter))
    print('Unique words per post: {}\n'.format(unique_word_count / blogs_scraped_counter))
    print('Average words per post: {}'.format(
        word_count / blogs_scraped_counter))
    print('Unique words per post: {}\n'.format(
        unique_word_count / blogs_scraped_counter))
    print('Part of Speech stats: {}\n'.format(pos_counts))


def main():
    blog_links = word_client.get(LINKS_KEY)
    if blog_links is None:
        print('Link cache is currently empty. Scraping blog feed at {}'.format(utils.BLOG_FEED_URL))
        print('Link cache is currently empty. Scraping blog feed at {}'.format(
            utils.BLOG_FEED_URL))
        blog_links = get_blogpost_links()
    else:
        print('Link cache was hit.')
@@ -135,3 +140,4 @@ def main():
    loop.close()

    get_results()
    close_db_connection()
71 changes: 71 additions & 0 deletions src/db_utils.py
@@ -0,0 +1,71 @@
import psycopg2 as pg

_WORD_DETAILS_TABLE = 'public.word_details'
_BLOG_DETAILS_TABLE = 'public.blog_details'
_PG_USER = 'blog_parser'
_PG_HOST = '0.0.0.0'
_PG_PORT = 5432
_PG_DB = 'lexicount'

_WORD_UPDATE_QUERY = "INSERT INTO " + _WORD_DETAILS_TABLE + \
    " (word, count, part_of_speech) " \
    "VALUES ('{word}', 1, '{pos}') " \
    "ON CONFLICT (word) DO UPDATE SET count = " + \
    _WORD_DETAILS_TABLE + ".count + 1;"
_BLOG_UPDATE_QUERY = "INSERT INTO " + _BLOG_DETAILS_TABLE + \
    " (word, count, url) " \
    "VALUES ('{word}', 1, '{url}') " \
    "ON CONFLICT (word, url) DO UPDATE SET count = " + \
    _BLOG_DETAILS_TABLE + ".count + 1;"
_GET_WORD_COUNT_QUERY = 'SELECT COUNT(DISTINCT word) FROM word_details;'

_db_conn = pg.connect(host=_PG_HOST,
                      port=_PG_PORT,
                      user=_PG_USER,
                      database=_PG_DB)
_db_cursor = _db_conn.cursor()


def execute_query(query):
    """
    Runs a query against our pg db, returning any rows it produces
    """
    _db_cursor.execute(query)
    try:
        result = _db_cursor.fetchall()
    except pg.ProgrammingError:
        result = None
    return result


def update_word_details(word, pos):
    """
    Given a word and a part of speech, we update the word_details table
    :param word: it's uh.. a word. Pulled from the blog post being parsed
    :param pos: part of speech as determined by NLTK
    """
    execute_query(_WORD_UPDATE_QUERY.format(word=word,
                                            pos=pos))


def update_blog_details(word, url):
    """
    Given a word and a url, we update the blog_details table
    :param word: yeah again.. it's a word
    :param url: blog's url
    """
    execute_query(_BLOG_UPDATE_QUERY.format(word=word,
                                            url=url))


def get_unique_words():
    """
    Runs a COUNT DISTINCT on the word_details table
    """
    return execute_query(_GET_WORD_COUNT_QUERY)[0][0]


def close_db_connection():
    _db_cursor.close()
    _db_conn.commit()
    _db_conn.close()
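One caveat: these statements are assembled with `str.format`, so a scraped word containing a single quote (say, "don't") would break the SQL. A minimal sketch of the same word upsert using psycopg2's bound parameters instead (not part of this commit; connection constants mirrored from the module above):

```python
import psycopg2 as pg

# Connection mirrors the module constants above
conn = pg.connect(host='0.0.0.0', port=5432,
                  user='blog_parser', database='lexicount')
cur = conn.cursor()

# Same upsert as _WORD_UPDATE_QUERY, except psycopg2 quotes the
# values itself, so apostrophes can't break the statement
WORD_UPDATE = ("INSERT INTO public.word_details (word, count, part_of_speech) "
               "VALUES (%s, 1, %s) "
               "ON CONFLICT (word) DO UPDATE "
               "SET count = public.word_details.count + 1;")

cur.execute(WORD_UPDATE, ("don't", 'NN'))
conn.commit()
```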
5 changes: 1 addition & 4 deletions src/redis_init.py
@@ -1,8 +1,5 @@
from redis import StrictRedis

# Redis stuff
WORD_DB_ID = 0
NLTK_DB_ID = 1
word_client = StrictRedis(db=WORD_DB_ID)
nltk_client = StrictRedis(db=NLTK_DB_ID)
word_client = StrictRedis()
LINKS_KEY = 'blog_links'
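With the two clients consolidated, `StrictRedis()` just uses its defaults (localhost:6379, db 0), which is exactly what the `docker run` command in the README publishes. A quick sanity check against that instance, using a hypothetical key:

```python
from redis import StrictRedis

client = StrictRedis()          # defaults: localhost:6379, db 0
client.incr('some_word')        # hypothetical counter key
print(client.get('some_word'))  # b'1' on a fresh instance
```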
3 changes: 2 additions & 1 deletion src/utils.py
@@ -22,7 +22,8 @@
else:
    BLOG_FEED_URL = 'http://avagadbro.blogspot.com/feeds/posts/default'

POST_PREFIX_REGEX = '^{uri.scheme}://{uri.netloc}/2'.format(uri=urlparse(BLOG_FEED_URL))
POST_PREFIX_REGEX = '^{uri.scheme}://{uri.netloc}/2'.format(
    uri=urlparse(BLOG_FEED_URL))


def sanitize_blogpost(post):
7 changes: 7 additions & 0 deletions test/mocks/pg_client.py
@@ -0,0 +1,7 @@
class MockPostgresCursor:
    def __init__(self, query_cache=None):
        # Avoid a shared mutable default between instances
        self.query_cache = query_cache if query_cache is not None else set()

    def execute(self, sql):
        print('Received the following sql: %s' % sql)
        self.query_cache.add(sql)

    def fetchall(self):
        # Canned count row so db_utils.execute_query has rows to fetch
        return [(1,)]
8 changes: 5 additions & 3 deletions test/test_blog_parser.py
@@ -2,24 +2,26 @@

from src.blog_parser import get_results, analyze_word
from test.mocks.redis_client import MockRedis
from test.mocks.pg_client import MockPostgresCursor


class TestBlogParser(TestCase):

    @mock.patch('src.blog_parser.word_client', new=MockRedis(cache={'foo': 2, 'bar': 1}))
    @mock.patch('src.blog_parser.blogs_scraped_counter', new=1)
    @mock.patch('src.db_utils._db_cursor', new=MockPostgresCursor())
    def test_result_generator(self):
        """
        Test ensures that stats can be calculated, given a functioning redis client
        """
        get_results()

    @mock.patch('src.blog_parser.word_client', new=MockRedis(cache={'foo': 2, 'bar': 1}))
    @mock.patch('src.db_utils._db_cursor', new=MockPostgresCursor())
    def test_word_analyze(self):
        """
        Check that the analyzer runs and doesn't bark at empty strings
        """
        self.assertEqual(analyze_word('foo', 'https://amifired.today'), ('foo', 'NN'))
        self.assertEqual(analyze_word(
            'foo', 'https://amifired.today'), ('foo', 'NN'))
        self.assertIsNone(analyze_word('', 'https://foo.bar'))
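With `word_client` and `_db_cursor` both patched, the suite runs without a live Redis or Postgres. Assuming the standard unittest runner from the repo root:

```bash
python3 -m unittest test.test_blog_parser
```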

