[Issue #4] Use New Lexicount DB (#11)
* Added a blog_parser role and fleshed out basic db_utils

* Added a mock client for testing and built out some queries

* Finished all queries and they actually work.

* Updated unittests

* Added myself as a user for tomfoolery
dvfeinblum committed Oct 27, 2018
1 parent ffb55f4 commit 38a4946
Showing 14 changed files with 173 additions and 20 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -4,11 +4,17 @@ This is a little tool I wrote to see how obnoxious a writer I am.
# Getting Started
## Setup
The backend for this tool is a simple postgres database. For ease of use, you can set it up using docker:
```
```bash
cd sqitch/
docker-compose up
```

Also, there's a Redis instance used for caching. That's also nice and easy to set up:
```bash
docker pull redis
docker run -d -p 6379:6379 redis
```

Now that you have the db, you're actually ready to go. Set up a `venv` and install requirements with:
```
python3 -m venv venv
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,6 +1,7 @@
beautifulsoup4 >= 4.6.3
nltk >= 3.3
pre-commit==1.10.3
psycopg2 >= 2.7.5
redis >= 2.10.6
requests >= 2.19.1
validators
22 changes: 22 additions & 0 deletions sqitch/deploy/role.blog_parser.sql
@@ -0,0 +1,22 @@
-- Deploy lexicount:role.blog_parser to pg

BEGIN;

SET ROLE sqitch;

DO
$do$
BEGIN
    IF NOT EXISTS (
        SELECT *
        FROM pg_catalog.pg_roles
        WHERE rolname = 'blog_parser') THEN
        CREATE ROLE blog_parser LOGIN;
    END IF;
END
$do$;

GRANT SELECT, INSERT, UPDATE ON public.blog_details TO blog_parser;
GRANT SELECT, INSERT, UPDATE ON public.word_details TO blog_parser;

COMMIT;
5 changes: 3 additions & 2 deletions sqitch/deploy/table.public.blog_details.sql
@@ -5,9 +5,10 @@ BEGIN;
SET ROLE sqitch;

CREATE TABLE IF NOT EXISTS public.blog_details (
    url VARCHAR(100) PRIMARY KEY,
    url VARCHAR(100),
    word VARCHAR(100),
    count INTEGER
    count INTEGER,
    PRIMARY KEY(url, word)
);

COMMIT;
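The switch from a single-column key to `PRIMARY KEY(url, word)` is what makes per-post word counts upsertable: the `_BLOG_UPDATE_QUERY` in `src/db_utils.py` below conflicts on exactly this pair. A minimal sketch of that upsert (the row values here are hypothetical):

```sql
-- Hypothetical row; the conflict target matches the composite key above
INSERT INTO public.blog_details (url, word, count)
VALUES ('https://example.com/post', 'lexicount', 1)
ON CONFLICT (url, word) DO UPDATE
SET count = blog_details.count + 1;
```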
21 changes: 21 additions & 0 deletions sqitch/revert/role.blog_parser.sql
@@ -0,0 +1,21 @@
-- Revert lexicount:role.blog_parser from pg

BEGIN;

SET ROLE sqitch;

DO
$do$
BEGIN
    -- Only revoke and drop if the role actually exists
    IF EXISTS (
        SELECT *
        FROM pg_catalog.pg_roles
        WHERE rolname = 'blog_parser') THEN
        REVOKE ALL PRIVILEGES ON TABLE "blog_details" FROM blog_parser;
        REVOKE ALL PRIVILEGES ON TABLE "word_details" FROM blog_parser;
        DROP ROLE blog_parser;
    END IF;
END
$do$;

COMMIT;
3 changes: 3 additions & 0 deletions sqitch/sqitch.plan
@@ -5,3 +5,6 @@
table.public.word_details 2018-10-27T07:04:48Z David Feinblum <dvfeinblum@gmail.com> # created word_details table
table.public.blog_details 2018-10-27T14:26:55Z David Feinblum <dvfeinblum@gmail.com> # created blog_details table
@0.0.1 2018-10-27T15:22:37Z David Feinblum <dvfeinblum@gmail.com> # initial table creation

role.blog_parser 2018-10-27T17:13:37Z David Feinblum <dvfeinblum@gmail.com> # added a role for the blog parser
@0.1.0 2018-10-27T19:50:01Z David Feinblum <dvfeinblum@gmail.com> # Finished necessary tables and permissioning for parser
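Tags like `@0.1.0` mark named points in the plan that you can deploy or revert to. Assuming stock Sqitch and the target URI used in `start_postgres.sh`, rolling back to the pre-role state would look something like:

```bash
sqitch revert --to @0.0.1 db:postgres://sqitch@mypgdb:5432/lexicount
```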
2 changes: 1 addition & 1 deletion sqitch/start_postgres.sh
@@ -4,6 +4,6 @@ until psql -h mypgdb -p 5432 --username sqitch -d lexicount -c '\l'; do
    >&2 echo "DB is unavailable - sleeping"
    sleep 1
done

psql -h mypgdb -p 5432 --username sqitch -d lexicount -c 'CREATE ROLE dfeinblu LOGIN SUPERUSER'
cd /sqitch
sqitch deploy db:postgres://sqitch@mypgdb:5432/lexicount
15 changes: 15 additions & 0 deletions sqitch/verify/role.blog_parser.sql
@@ -0,0 +1,15 @@
-- Verify lexicount:role.blog_parser on pg

BEGIN;

SET ROLE blog_parser;

SELECT has_table_privilege('public.blog_details', 'UPDATE');
SELECT has_table_privilege('public.blog_details', 'INSERT');
SELECT has_table_privilege('public.blog_details', 'SELECT');

SELECT has_table_privilege('public.word_details', 'UPDATE');
SELECT has_table_privilege('public.word_details', 'INSERT');
SELECT has_table_privilege('public.word_details', 'SELECT');

ROLLBACK;
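Sqitch runs this script during its `verify` step and fails the change if any statement errors. Against the docker-compose target used in `start_postgres.sh`, that would look something like:

```bash
sqitch verify db:postgres://sqitch@mypgdb:5432/lexicount
```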
22 changes: 14 additions & 8 deletions src/blog_parser.py
@@ -5,8 +5,9 @@
import re
import requests as r

from db_utils import update_blog_details, update_word_details, get_unique_words, close_db_connection
import utils
from redis_init import nltk_client, LINKS_KEY, word_client
from redis_init import LINKS_KEY, word_client

# Some useful constants for parsing blog html
POST_URL_REL = "alternate"
@@ -72,8 +73,6 @@ def analyze_word(word, blog_link):
    :return: tuple containing the word and POS of that word
    """
    global word_count
    # First we hit the word count cache
    word_client.incr(word)
    word_count = word_count + 1

    # Now we do some nltk wizardry
@@ -82,7 +81,10 @@

    pos_tuple = pos_array[0]
    pos = pos_tuple[1]
    nltk_client.incr(pos)

    # Send some info to the db
    update_word_details(word, pos)
    update_blog_details(word, blog_link)
    if pos in pos_counts:
        pos_counts[pos] = pos_counts[pos] + 1
    else:
@@ -105,21 +107,24 @@ def get_results():
    Once the run is complete, we'll spit out some stats.
    """
    # we subtract one because of the blog_links entry
    unique_word_count = word_client.dbsize() - 1
    unique_word_count = get_unique_words()
    print('\nRESULTS\n')
    print('Number of words found across all posts: {}'.format(word_count))
    print('Number of unique words found across all posts: {}'.format(unique_word_count))
    print('Number of posts scraped: {}\n'.format(blogs_scraped_counter))
    print('Average repeat-rate of all words: {}'.format(word_count / unique_word_count))
    print('Average words per post: {}'.format(word_count / blogs_scraped_counter))
    print('Unique words per post: {}\n'.format(unique_word_count / blogs_scraped_counter))
    print('Average words per post: {}'.format(
        word_count / blogs_scraped_counter))
    print('Unique words per post: {}\n'.format(
        unique_word_count / blogs_scraped_counter))
    print('Part of Speech stats: {}\n'.format(pos_counts))


def main():
    blog_links = word_client.get(LINKS_KEY)
    if blog_links is None:
        print('Link cache is currently empty. Scraping blog feed at {}'.format(utils.BLOG_FEED_URL))
        print('Link cache is currently empty. Scraping blog feed at {}'.format(
            utils.BLOG_FEED_URL))
        blog_links = get_blogpost_links()
    else:
        print('Link cache was hit.')
@@ -135,3 +140,4 @@ def main():
    loop.close()

    get_results()
    close_db_connection()
71 changes: 71 additions & 0 deletions src/db_utils.py
@@ -0,0 +1,71 @@
import psycopg2 as pg

_WORD_DETAILS_TABLE = 'public.word_details'
_BLOG_DETAILS_TABLE = 'public.blog_details'
_PG_USER = 'blog_parser'
_PG_HOST = '0.0.0.0'
_PG_PORT = 5432
_PG_DB = 'lexicount'

_WORD_UPDATE_QUERY = "INSERT INTO " + _WORD_DETAILS_TABLE + \
    " (word, count, part_of_speech) " \
    "VALUES ('{word}', 1, '{pos}') " \
    "ON CONFLICT (word) DO UPDATE SET count = " + \
    _WORD_DETAILS_TABLE + ".count + 1;"
_BLOG_UPDATE_QUERY = "INSERT INTO " + _BLOG_DETAILS_TABLE + \
    " (word, count, url) " \
    "VALUES ('{word}', 1, '{url}') " \
    "ON CONFLICT (word, url) DO UPDATE SET count = " + \
    _BLOG_DETAILS_TABLE + ".count + 1;"
_GET_WORD_COUNT_QUERY = 'SELECT COUNT(DISTINCT word) FROM word_details;'

_db_conn = pg.connect(host=_PG_HOST,
                      port=_PG_PORT,
                      user=_PG_USER,
                      database=_PG_DB)
_db_cursor = _db_conn.cursor()


def execute_query(query):
    """
    Runs a query against our pg db, returning any rows it produces
    """
    _db_cursor.execute(query)
    try:
        result = _db_cursor.fetchall()
    except pg.ProgrammingError:
        result = None
    return result


def update_word_details(word, pos):
    """
    Given a word and a part of speech, we update the word_details table
    :param word: it's uh.. a word. Pulled from the blog post being parsed
    :param pos: part of speech as determined by NLTK
    """
    execute_query(_WORD_UPDATE_QUERY.format(word=word,
                                            pos=pos))


def update_blog_details(word, url):
    """
    Given a word and a url, we update the blog_details table
    :param word: yeah again.. it's a word
    :param url: blog's url
    """
    execute_query(_BLOG_UPDATE_QUERY.format(word=word,
                                            url=url))


def get_unique_words():
    """
    Runs a COUNT DISTINCT on the word_details table
    """
    return execute_query(_GET_WORD_COUNT_QUERY)[0][0]


def close_db_connection():
    _db_cursor.close()
    _db_conn.commit()
    _db_conn.close()
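One caveat: these statements are assembled with `str.format`, so a scraped word containing a single quote (say, "don't") would break the SQL. A minimal sketch of the same word upsert using psycopg2's bound parameters instead (not part of this commit; connection constants mirrored from the module above):

```python
import psycopg2 as pg

# Connection mirrors the module constants above
conn = pg.connect(host='0.0.0.0', port=5432,
                  user='blog_parser', database='lexicount')
cur = conn.cursor()

# Same upsert as _WORD_UPDATE_QUERY, except psycopg2 quotes the
# values itself, so apostrophes can't break the statement
WORD_UPDATE = ("INSERT INTO public.word_details (word, count, part_of_speech) "
               "VALUES (%s, 1, %s) "
               "ON CONFLICT (word) DO UPDATE "
               "SET count = public.word_details.count + 1;")

cur.execute(WORD_UPDATE, ("don't", 'NN'))
conn.commit()
```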
5 changes: 1 addition & 4 deletions src/redis_init.py
@@ -1,8 +1,5 @@
from redis import StrictRedis

# Redis stuff
WORD_DB_ID = 0
NLTK_DB_ID = 1
word_client = StrictRedis(db=WORD_DB_ID)
nltk_client = StrictRedis(db=NLTK_DB_ID)
word_client = StrictRedis()
LINKS_KEY = 'blog_links'
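With the two clients consolidated, `StrictRedis()` just uses its defaults (localhost:6379, db 0), which is exactly what the `docker run` command in the README publishes. A quick sanity check against that instance, using a hypothetical key:

```python
from redis import StrictRedis

client = StrictRedis()          # defaults: localhost:6379, db 0
client.incr('some_word')        # hypothetical counter key
print(client.get('some_word'))  # b'1' on a fresh instance
```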
3 changes: 2 additions & 1 deletion src/utils.py
@@ -22,7 +22,8 @@
else:
    BLOG_FEED_URL = 'http://avagadbro.blogspot.com/feeds/posts/default'

POST_PREFIX_REGEX = '^{uri.scheme}://{uri.netloc}/2'.format(uri=urlparse(BLOG_FEED_URL))
POST_PREFIX_REGEX = '^{uri.scheme}://{uri.netloc}/2'.format(
    uri=urlparse(BLOG_FEED_URL))


def sanitize_blogpost(post):
7 changes: 7 additions & 0 deletions test/mocks/pg_client.py
@@ -0,0 +1,7 @@
class MockPostgresCursor:
    def __init__(self, query_cache=None):
        # Avoid a shared mutable default between instances
        self.query_cache = query_cache if query_cache is not None else set()

    def execute(self, sql):
        print('Received the following sql: %s' % sql)
        self.query_cache.add(sql)

    def fetchall(self):
        # Canned count row so db_utils.execute_query has rows to fetch
        return [(1,)]
8 changes: 5 additions & 3 deletions test/test_blog_parser.py
@@ -2,24 +2,26 @@

from src.blog_parser import get_results, analyze_word
from test.mocks.redis_client import MockRedis
from test.mocks.pg_client import MockPostgresCursor


class TestBlogParser(TestCase):

    @mock.patch('src.blog_parser.word_client', new=MockRedis(cache={'foo': 2, 'bar': 1}))
    @mock.patch('src.blog_parser.blogs_scraped_counter', new=1)
    @mock.patch('src.db_utils._db_cursor', new=MockPostgresCursor())
    def test_result_generator(self):
        """
        Test ensures that stats can be calculated, given a functioning redis client
        """
        get_results()

    @mock.patch('src.blog_parser.word_client', new=MockRedis(cache={'foo': 2, 'bar': 1}))
    @mock.patch('src.db_utils._db_cursor', new=MockPostgresCursor())
    def test_word_analyze(self):
        """
        Check that the analyzer runs and doesn't bark at empty strings
        """
        self.assertEqual(analyze_word('foo', 'https://amifired.today'), ('foo', 'NN'))
        self.assertEqual(analyze_word(
            'foo', 'https://amifired.today'), ('foo', 'NN'))
        self.assertIsNone(analyze_word('', 'https://foo.bar'))
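With `word_client` and `_db_cursor` both patched, the suite runs without a live Redis or Postgres. Assuming the standard unittest runner from the repo root:

```bash
python3 -m unittest test.test_blog_parser
```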

