Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# v2022.08.14

## Data

- Added clues from puzzles published since the previous release.
- Consolidated `html`, `json` and `puz` tables into a single `raw` table (with
a `content_type` column). This is in preparation for PDF ingestion.

# v2022.02.06

## Data
Expand Down
4 changes: 2 additions & 2 deletions cryptics/amuselabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def amuse_b64(e, amuseKey=None):
with sqlite3.connect(SQLITE_DATABASE) as conn:
cursor = conn.cursor()
cursor.execute(
f"SELECT EXISTS(SELECT 1 FROM json WHERE url = '{solver_url}')"
f"SELECT EXISTS(SELECT 1 FROM raw WHERE location = '{solver_url}')"
)
scraped = bool(cursor.fetchone()[0])

Expand Down Expand Up @@ -115,7 +115,7 @@ def amuse_b64(e, amuseKey=None):
with sqlite3.connect(SQLITE_DATABASE) as conn:
cursor = conn.cursor()
cursor.execute(
"INSERT INTO json (source, url, json) VALUES ('new_yorker', ?, ?)",
"INSERT INTO raw (source, location, content_type, content) VALUES ('new_yorker', ?, 'json', ?)",
(solver_url, json.dumps(puz_json)),
)
conn.commit()
Expand Down
6 changes: 3 additions & 3 deletions cryptics/jsons.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,15 @@ def parse_json(puzzle):
if __name__ == "__main__":
with sqlite3.connect(SQLITE_DATABASE) as conn:
cursor = conn.cursor()
cursor.execute(f"SELECT DISTINCT url FROM json WHERE NOT is_parsed;")
cursor.execute(f"SELECT DISTINCT location FROM raw WHERE content_type = 'json' AND NOT is_parsed;")
urls_to_parse = cursor.fetchall()
urls_to_parse = {url[0] for url in urls_to_parse}

for url in urls_to_parse:
print(f"Parsing {url}")
with sqlite3.connect(SQLITE_DATABASE) as conn:
cursor = conn.cursor()
cursor.execute(f"SELECT json FROM json WHERE url = '{url}';")
cursor.execute(f"SELECT content FROM raw WHERE location = '{url}';")
puzzle_json = cursor.fetchone()[0]

puzzle = json.loads(puzzle_json)
Expand All @@ -68,5 +68,5 @@ def parse_json(puzzle):
with sqlite3.connect(SQLITE_DATABASE) as conn:
data.to_sql(f"clues", conn, if_exists="append", index=False)
cursor = conn.cursor()
sql = f"UPDATE json SET is_parsed = TRUE, datetime_parsed = datetime('now') WHERE url = '{url}';"
sql = f"UPDATE raw SET is_parsed = TRUE, datetime_parsed = datetime('now') WHERE location = '{url}';"
cursor.execute(sql)
6 changes: 3 additions & 3 deletions cryptics/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ def parse_unparsed_html(sources: List[str], datetime_requested: str):
with sqlite3.connect(SQLITE_DATABASE) as conn:
cursor = conn.cursor()
cursor.execute(
f"SELECT url FROM html WHERE source = '{source}' AND NOT is_parsed AND datetime_requested >= '{datetime_requested}';"
f"SELECT location FROM raw WHERE content_type = 'html' AND source = '{source}' AND NOT is_parsed AND datetime_requested >= '{datetime_requested}';"
)
urls = [url for url, in cursor.fetchall()]

for i, url in enumerate(urls):
with sqlite3.connect(SQLITE_DATABASE) as conn:
cursor = conn.cursor()
cursor.execute(f"SELECT html FROM html WHERE url = '{url}';")
cursor.execute(f"SELECT content FROM raw WHERE location = '{url}';")
(html,) = cursor.fetchone()

data = None
Expand All @@ -43,7 +43,7 @@ def parse_unparsed_html(sources: List[str], datetime_requested: str):
data.to_sql(f"clues", conn, if_exists="append", index=False)

cursor = conn.cursor()
sql = f"UPDATE html SET is_parsed = TRUE, datetime_parsed = datetime('now') WHERE url = '{url}';"
sql = f"UPDATE raw SET is_parsed = TRUE, datetime_parsed = datetime('now') WHERE location = '{url}';"
cursor.execute(sql)


Expand Down
2 changes: 1 addition & 1 deletion cryptics/puzzes.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def insert_puz(source, path, puz_filename):
with sqlite3.connect(SQLITE_DATABASE) as conn:
cursor = conn.cursor()
cursor.execute(
"INSERT INTO puz (source, path, puz, is_parsed, datetime_parsed) VALUES (?, ?, ?, 1, datetime('now'))",
"INSERT INTO raw (source, location, content_type, content) VALUES (?, ?, 'puz', ?)",
(source, path, puz_blob),
)
conn.commit()
Expand Down
4 changes: 2 additions & 2 deletions cryptics/scrape_blogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def scrape_blogs(sources, sleep_interval=1):
logging.info(f"Populating from {source}...")
with sqlite3.connect(SQLITE_DATABASE) as conn:
cursor = conn.cursor()
cursor.execute(f"SELECT url FROM html WHERE source = '{source}';")
cursor.execute(f"SELECT location FROM raw WHERE content_type = 'html' AND source = '{source}';")
known_urls = {url[0] for url in cursor.fetchall()}
new_urls = get_new_urls_func(known_urls)
logging.info(f"Found {len(new_urls)} new urls.")
Expand All @@ -33,7 +33,7 @@ def scrape_blogs(sources, sleep_interval=1):

with sqlite3.connect(SQLITE_DATABASE) as conn:
cursor = conn.cursor()
sql = f"INSERT INTO html (source, url, html) VALUES (?, ?, ?)"
sql = f"INSERT INTO raw (source, location, content_type, content) VALUES (?, ?, 'html', ?)"
cursor.execute(sql, (source, url, response.text))
conn.commit()
except:
Expand Down
48 changes: 0 additions & 48 deletions cryptics/summary.py

This file was deleted.

23 changes: 4 additions & 19 deletions queries/initialize-db.sql
Original file line number Diff line number Diff line change
@@ -1,24 +1,9 @@
CREATE TABLE IF NOT EXISTS html (
CREATE TABLE IF NOT EXISTS raw (
source TEXT,
url PRIMARY KEY,
location PRIMARY KEY,
datetime_requested TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
html TEXT,
is_parsed BOOLEAN DEFAULT FALSE,
datetime_parsed TIMESTAMP DEFAULT NULL
);
CREATE TABLE IF NOT EXISTS json (
source TEXT,
url PRIMARY KEY,
datetime_requested TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
json TEXT,
is_parsed BOOLEAN DEFAULT FALSE,
datetime_parsed TIMESTAMP DEFAULT NULL
);
CREATE TABLE IF NOT EXISTS puz (
source TEXT,
path PRIMARY KEY,
datetime_requested TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
puz BLOB,
content_type TEXT,
content BLOB,
is_parsed BOOLEAN DEFAULT FALSE,
datetime_parsed TIMESTAMP DEFAULT NULL
);
Expand Down