diff --git a/CHANGELOG.md b/CHANGELOG.md
index 01f6a37..c3b759f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,11 @@
+# v2022.08.14
+
+## Data
+
+- Added clues from puzzles published since previous release date.
+- Consolidated `html`, `json` and `puz` tables into a single `raw` table (with
+  a `content_type` column). This is in preparation for PDF ingestion.
+
 # v2022.02.06
 
 ## Data
diff --git a/cryptics/amuselabs.py b/cryptics/amuselabs.py
index 65c742f..3c4d106 100644
--- a/cryptics/amuselabs.py
+++ b/cryptics/amuselabs.py
@@ -76,7 +76,7 @@ def amuse_b64(e, amuseKey=None):
     with sqlite3.connect(SQLITE_DATABASE) as conn:
         cursor = conn.cursor()
         cursor.execute(
-            f"SELECT EXISTS(SELECT 1 FROM json WHERE url = '{solver_url}')"
+            f"SELECT EXISTS(SELECT 1 FROM raw WHERE location = '{solver_url}')"
         )
         scraped = bool(cursor.fetchone()[0])
 
@@ -115,7 +115,7 @@ def amuse_b64(e, amuseKey=None):
         with sqlite3.connect(SQLITE_DATABASE) as conn:
             cursor = conn.cursor()
             cursor.execute(
-                "INSERT INTO json (source, url, json) VALUES ('new_yorker', ?, ?)",
+                "INSERT INTO raw (source, location, content_type, content) VALUES ('new_yorker', ?, 'json', ?)",
                 (solver_url, json.dumps(puz_json)),
             )
             conn.commit()
diff --git a/cryptics/jsons.py b/cryptics/jsons.py
index 0dc63c4..26708a2 100644
--- a/cryptics/jsons.py
+++ b/cryptics/jsons.py
@@ -47,7 +47,7 @@ def parse_json(puzzle):
 if __name__ == "__main__":
     with sqlite3.connect(SQLITE_DATABASE) as conn:
         cursor = conn.cursor()
-        cursor.execute(f"SELECT DISTINCT url FROM json WHERE NOT is_parsed;")
+        cursor.execute(f"SELECT DISTINCT location FROM raw WHERE content_type = 'json' AND NOT is_parsed;")
         urls_to_parse = cursor.fetchall()
         urls_to_parse = {url[0] for url in urls_to_parse}
 
@@ -55,7 +55,7 @@ def parse_json(puzzle):
         print(f"Parsing {url}")
         with sqlite3.connect(SQLITE_DATABASE) as conn:
             cursor = conn.cursor()
-            cursor.execute(f"SELECT json FROM json WHERE url = '{url}';")
+            cursor.execute(f"SELECT content FROM raw WHERE location = '{url}';")
             puzzle_json = cursor.fetchone()[0]
             puzzle = json.loads(puzzle_json)
 
@@ -68,5 +68,5 @@ def parse_json(puzzle):
         with sqlite3.connect(SQLITE_DATABASE) as conn:
             data.to_sql(f"clues", conn, if_exists="append", index=False)
             cursor = conn.cursor()
-            sql = f"UPDATE json SET is_parsed = TRUE, datetime_parsed = datetime('now') WHERE url = '{url}';"
+            sql = f"UPDATE raw SET is_parsed = TRUE, datetime_parsed = datetime('now') WHERE location = '{url}';"
             cursor.execute(sql)
diff --git a/cryptics/main.py b/cryptics/main.py
index ca5cd0c..c0e2c19 100644
--- a/cryptics/main.py
+++ b/cryptics/main.py
@@ -15,14 +15,14 @@ def parse_unparsed_html(sources: List[str], datetime_requested: str):
         with sqlite3.connect(SQLITE_DATABASE) as conn:
             cursor = conn.cursor()
             cursor.execute(
-                f"SELECT url FROM html WHERE source = '{source}' AND NOT is_parsed AND datetime_requested >= '{datetime_requested}';"
+                f"SELECT location FROM raw WHERE content_type = 'html' AND source = '{source}' AND NOT is_parsed AND datetime_requested >= '{datetime_requested}';"
             )
             urls = [url for url, in cursor.fetchall()]
 
         for i, url in enumerate(urls):
             with sqlite3.connect(SQLITE_DATABASE) as conn:
                 cursor = conn.cursor()
-                cursor.execute(f"SELECT html FROM html WHERE url = '{url}';")
+                cursor.execute(f"SELECT content FROM raw WHERE location = '{url}';")
                 (html,) = cursor.fetchone()
 
             data = None
@@ -43,7 +43,7 @@ def parse_unparsed_html(sources: List[str], datetime_requested: str):
             data.to_sql(f"clues", conn, if_exists="append", index=False)
 
             cursor = conn.cursor()
-            sql = f"UPDATE html SET is_parsed = TRUE, datetime_parsed = datetime('now') WHERE url = '{url}';"
+            sql = f"UPDATE raw SET is_parsed = TRUE, datetime_parsed = datetime('now') WHERE location = '{url}';"
             cursor.execute(sql)
diff --git a/cryptics/puzzes.py b/cryptics/puzzes.py
index 3da27c1..50240b8 100644
--- a/cryptics/puzzes.py
+++ b/cryptics/puzzes.py
@@ -25,7 +25,7 @@ def insert_puz(source, path, puz_filename):
     with sqlite3.connect(SQLITE_DATABASE) as conn:
         cursor = conn.cursor()
         cursor.execute(
-            "INSERT INTO puz (source, path, puz, is_parsed, datetime_parsed) VALUES (?, ?, ?, 1, datetime('now'))",
+            "INSERT INTO raw (source, location, content_type, content) VALUES (?, ?, 'puz', ?)",
             (source, path, puz_blob),
         )
         conn.commit()
diff --git a/cryptics/scrape_blogs.py b/cryptics/scrape_blogs.py
index 2ad3be2..f4b5b5f 100644
--- a/cryptics/scrape_blogs.py
+++ b/cryptics/scrape_blogs.py
@@ -17,7 +17,7 @@ def scrape_blogs(sources, sleep_interval=1):
         logging.info(f"Populating from {source}...")
         with sqlite3.connect(SQLITE_DATABASE) as conn:
             cursor = conn.cursor()
-            cursor.execute(f"SELECT url FROM html WHERE source = '{source}';")
+            cursor.execute(f"SELECT location FROM raw WHERE content_type = 'html' AND source = '{source}';")
             known_urls = {url[0] for url in cursor.fetchall()}
         new_urls = get_new_urls_func(known_urls)
         logging.info(f"Found {len(new_urls)} new urls.")
@@ -33,7 +33,7 @@ def scrape_blogs(sources, sleep_interval=1):
 
                 with sqlite3.connect(SQLITE_DATABASE) as conn:
                     cursor = conn.cursor()
-                    sql = f"INSERT INTO html (source, url, html) VALUES (?, ?, ?)"
+                    sql = f"INSERT INTO raw (source, location, content_type, content) VALUES (?, ?, 'html', ?)"
                     cursor.execute(sql, (source, url, response.text))
                     conn.commit()
             except:
diff --git a/cryptics/summary.py b/cryptics/summary.py
deleted file mode 100644
index 8018d91..0000000
--- a/cryptics/summary.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import sqlite3
-
-from cryptics.config import SQLITE_DATABASE
-
-
-def query_and_print(prompt, sql):
-    with sqlite3.connect(SQLITE_DATABASE) as conn:
-        cursor = conn.cursor()
-        cursor.execute(sql)
-        (output,) = cursor.fetchone()
-        print(f"\t{prompt.ljust(20)}: {str(output).rjust(7)}")
-
-
-if __name__ == "__main__":
-    with sqlite3.connect(SQLITE_DATABASE) as conn:
-        cursor = conn.cursor()
-        cursor.execute("SELECT DISTINCT source FROM clues;")
-        sources = sorted([source for (source,) in cursor.fetchall()])
-
-    queries = {
-        source: [
-            ("# crosswords", f"SELECT count(1) FROM html WHERE source = '{source}';"),
-            (
-                "# crosswords parsed",
-                f"SELECT count(1) FROM html WHERE source = '{source}' AND is_parsed;",
-            ),
-            (
-                "% crosswords parsed",
-                f"SELECT printf('%.1f', 100.0 * (SELECT count(1) FROM html WHERE source = '{source}' AND is_parsed) / (SELECT count(1) FROM html WHERE source = '{source}'));",
-            ),
-            ("# clues", f"SELECT count(1) FROM clues WHERE source = '{source}';"),
-            (
-                "# clues reviewed",
-                f"SELECT count(1) FROM clues WHERE source = '{source}' AND is_reviewed;",
-            ),
-            (
-                "% clues reviewed",
-                f"SELECT printf('%.1f', 100.0 * (SELECT count(1) FROM clues WHERE source = '{source}' AND is_reviewed) / (SELECT count(1) FROM clues WHERE source = '{source}'));",
-            ),
-        ]
-        for source in sources
-    }
-
-    for source, queries_ in queries.items():
-        print(source)
-        for (prompt, query) in queries_:
-            query_and_print(prompt, query)
-        print()
diff --git a/queries/initialize-db.sql b/queries/initialize-db.sql
index 60f2884..2380212 100644
--- a/queries/initialize-db.sql
+++ b/queries/initialize-db.sql
@@ -1,24 +1,9 @@
-CREATE TABLE IF NOT EXISTS html (
+CREATE TABLE IF NOT EXISTS raw (
     source TEXT,
-    url PRIMARY KEY,
+    location PRIMARY KEY,
     datetime_requested TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    html TEXT,
-    is_parsed BOOLEAN DEFAULT FALSE,
-    datetime_parsed TIMESTAMP DEFAULT NULL
-);
-CREATE TABLE IF NOT EXISTS json (
-    source TEXT,
-    url PRIMARY KEY,
-    datetime_requested TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    json TEXT,
-    is_parsed BOOLEAN DEFAULT FALSE,
-    datetime_parsed TIMESTAMP DEFAULT NULL
-);
-CREATE TABLE IF NOT EXISTS puz (
-    source TEXT,
-    path PRIMARY KEY,
-    datetime_requested TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-    puz BLOB,
+    content_type TEXT,
+    content BLOB,
     is_parsed BOOLEAN DEFAULT FALSE,
     datetime_parsed TIMESTAMP DEFAULT NULL
 );
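
Note on migrating an existing database: this changeset switches `queries/initialize-db.sql` to the consolidated `raw` table but does not include a script that carries rows over from the old `html`, `json`, and `puz` tables. The SQL below is a hypothetical one-time backfill sketched from the deleted schemas above, not part of this diff; it assumes the old tables still exist and that their `url`/`path` values are unique across all three, since `location` is the primary key of `raw`.

-- Hypothetical backfill (not included in this changeset): copy each old table
-- into raw with the matching content_type, then drop the old tables.
INSERT INTO raw (source, location, datetime_requested, content_type, content, is_parsed, datetime_parsed)
SELECT source, url, datetime_requested, 'html', html, is_parsed, datetime_parsed FROM html;

INSERT INTO raw (source, location, datetime_requested, content_type, content, is_parsed, datetime_parsed)
SELECT source, url, datetime_requested, 'json', json, is_parsed, datetime_parsed FROM json;

INSERT INTO raw (source, location, datetime_requested, content_type, content, is_parsed, datetime_parsed)
SELECT source, path, datetime_requested, 'puz', puz, is_parsed, datetime_parsed FROM puz;

DROP TABLE html;
DROP TABLE json;
DROP TABLE puz;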