In [1]:
%load_ext nb_black

import traceback
import datetime
import os
import re
import time
import random
import json
from collections import defaultdict

import requests
import bs4
import numpy as np
import pandas as pd

import ipdb

from cryptic_info.parse import try_parse

<IPython.core.display.Javascript object>

In [2]:
from cryptic_info.tables import (
    is_parsable_table_type_1,
    parse_table_type_1,
    is_parsable_table_type_2,
    parse_table_type_2,
)
from cryptic_info.lists import (
    is_parsable_list_type_1,
    parse_list_type_1,
    is_parsable_list_type_2,
    parse_list_type_2,
)
from cryptic_info.utils import extract_puzzle_url

<IPython.core.display.Javascript object>

In [3]:
# HTTP headers passed to every requests.get call in this notebook:
# a browser-style User-Agent and a request for gzip-encoded responses.
headers = dict(
    [
        ("User-Agent", "Mozilla/5.0 (X11; Linux x86_64)"),
        ("Accept-Encoding", "gzip"),
    ]
)

<IPython.core.display.Javascript object>

## Getting URLs

In [4]:
# Create initial metadata JSON
#
# Builds the crawl state (`metadata`) from the site's sitemap index and writes
# it to metadata.json. Keys:
#   last_run       - UTC timestamp of this run
#   unindexed_urls - post URLs still to be fetched and parsed
#   indexed_urls   - post URLs parsed successfully
#   errored_urls   - post URLs that could not be parsed
metadata = {
    "last_run": datetime.datetime.now(datetime.timezone.utc).strftime("%c %Z"),
    "unindexed_urls": [],
    "indexed_urls": [],
    "errored_urls": [],
}

SITEMAP_URL = "https://www.fifteensquared.net/wp-sitemap.xml"
# FIXME: let's just do the most recent sitemap first...
POSTS_SITEMAP_URL = "https://www.fifteensquared.net/wp-sitemap-posts-post-10.xml"


def _loc_text(entry):
    """Return the stripped <loc> text of a sitemap entry.

    Taking the text of the whole entry would concatenate any sibling element
    (e.g. <lastmod>) onto the URL. Falls back to the entry's full text when
    it has no <loc> child.
    """
    loc = entry.find("loc")
    return (loc if loc is not None else entry).get_text(strip=True)


# A timeout keeps the cell from hanging forever; raise_for_status stops us from
# silently writing an empty queue when the server returns an error page.
response = requests.get(SITEMAP_URL, headers=headers, timeout=60)
response.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to silence bs4's "no parser specified"
# warning). "html.parser" ships with Python and handles these simple tags.
soup = bs4.BeautifulSoup(response.text, "html.parser")

# Plain substring match: the original regex left its dots unescaped.
sitemaps = [
    sitemap_url
    for sitemap_url in map(_loc_text, soup.find_all("sitemap"))
    if POSTS_SITEMAP_URL in sitemap_url
]
sitemaps.reverse()

for sitemap in sitemaps:
    response = requests.get(sitemap, headers=headers, timeout=60)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    urls = [_loc_text(url) for url in soup.find_all("url")]
    metadata["unindexed_urls"].extend(urls)

# "w" is enough: the file is only written here, never read back.
with open("metadata.json", "w") as f:
    json.dump(metadata, f)

<IPython.core.display.Javascript object>

In [71]:
# Crawl loop: take one queued post URL at a time, fetch it, try to parse it,
# append the parsed rows to data.csv, and record the outcome in metadata.json
# after every URL so an interrupted run can be inspected or resumed.
while metadata["unindexed_urls"]:
    url = metadata["unindexed_urls"].pop()
    print(url)

    data = None
    try:
        # The request lives inside the try block so a network error or HTTP
        # error status marks this URL as errored instead of aborting the
        # whole crawl after the URL has already been popped from the queue.
        response = requests.get(url, headers=headers, timeout=60)
        response.raise_for_status()
        print("Requested response")
        data = try_parse(response, url)
    except Exception:
        print(traceback.format_exc())

    if data is None:
        print("Failed to parse")
        metadata["errored_urls"].append(url)
    else:
        print("Successfully parsed")
        # Only write the header row when the file is new or empty; appending
        # with the default header=True repeats the column names in the middle
        # of the CSV once per parsed post.
        # NOTE(review): assumes try_parse always yields the same columns - confirm.
        csv_has_rows = os.path.exists("data.csv") and os.path.getsize("data.csv") > 0
        data.to_csv("data.csv", index=False, mode="a", header=not csv_has_rows)
        metadata["indexed_urls"].append(url)

    metadata["last_run"] = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%c %Z"
    )
    with open("metadata.json", "w") as f:
        json.dump(metadata, f)
    print("Wrote metadata")

    # Randomised 20-40 s pause between requests to stay gentle on the site.
    print("Sleeping...")
    sleep_time = random.uniform(20, 40)
    time.sleep(sleep_time)
    print(f"Slept for {sleep_time:.2f}s")
    print(78 * "=")

https://www.fifteensquared.net/2020/09/08/independent-10579-kairos/
Requested response
Failed to parse
Wrote metadata
Sleeping...
Slept for 22.98s
https://www.fifteensquared.net/2020/09/13/everyman-3856/
Requested response
Parsing using parse_table_type_1
Successfully parsed
Wrote metadata
Sleeping...
Slept for 39.41s
https://www.fifteensquared.net/2020/09/07/independent-10578-by-harold/
Requested response
Parsing using parse_table_type_1
Successfully parsed
Wrote metadata
Sleeping...
Slept for 31.42s
https://www.fifteensquared.net/2020/09/17/financial-times-16574-by-julius/
Requested response
Parsing using parse_table_type_1
Successfully parsed
Wrote metadata
Sleeping...
Slept for 27.78s
https://www.fifteensquared.net/2020/09/05/independent-10577-by-tyrus/
Requested response
Failed to parse
Wrote metadata
Sleeping...
Slept for 37.63s


<IPython.core.display.Javascript object>

## Testing out the parser on a single post

In [22]:
# Example posts for each page layout, used to exercise `try_parse` by hand.
# Kept as real lists (not commented-out strings) so the cell runs on a fresh
# kernel and a different example can be chosen by changing one index.
EXAMPLE_URLS = {
    # Tables - type 1
    "table_type_1": [
        "https://www.fifteensquared.net/2021/05/20/financial-times-16790-by-leonidas/",
        "https://www.fifteensquared.net/2021/05/21/financial-times-16791-by-buccaneer/",
        "https://www.fifteensquared.net/2021/05/21/independent-10797-by-phi/",
        "https://www.fifteensquared.net/2021/05/23/azed-no-2553-plain/",
        "https://www.fifteensquared.net/2021/05/23/everyman-3892/",
    ],
    # Tables - type 2
    "table_type_2": [
        "https://www.fifteensquared.net/2021/05/17/guardian-28447-anto/",
    ],
    # List - type 1
    "list_type_1": [
        "https://www.fifteensquared.net/2021/05/22/guardian-saturday-puzzle-28446-tramp/",
        "https://www.fifteensquared.net/2021/05/23/independent-on-sunday-1630-by-raich/",
        "https://www.fifteensquared.net/2021/05/19/guardian-28449-pasquale/",
        "https://www.fifteensquared.net/2021/05/17/guardian-quiptic-1122-hectence/",
        "https://www.fifteensquared.net/2021/05/16/independent-on-sunday-1629-hoskins/",
    ],
    # List - type 2
    "list_type_2": [
        "https://www.fifteensquared.net/2021/05/20/independent-10796-by-tees/",
        "https://www.fifteensquared.net/2021/05/17/financial-times-16787-by-peto/",
    ],
}

# TODO: Tables - type 3???
# Hihoba does the hard puzzles, and formats his posts fairly inconsistently, depending on the theme...
# https://www.fifteensquared.net/author/hihoba/
# source_url = "https://www.fifteensquared.net/2021/05/18/inquisitor-1698-spooky-manifestations-by-kruger/"

# TODO: Lists - type 3???
# source_url = "https://www.fifteensquared.net/2021/05/21/guardian-cryptic-28451-puck/"

# FIXME: why does the extract_definitions fail? Not urgent
# source_url = "https://www.fifteensquared.net/2021/05/16/everyman-3891/"

# Post under test. Previously every assignment was commented out, so this cell
# only worked when `source_url` was left over from an earlier kernel session.
# NOTE(review): a list-type-2 example is chosen because the saved output of the
# next cell reports parse_list_type_2; which of the two URLs was used is unknown.
source_url = EXAMPLE_URLS["list_type_2"][0]

# Time out rather than hang, and fail loudly on an HTTP error page.
response = requests.get(source_url, headers=headers, timeout=60)
response.raise_for_status()

<IPython.core.display.Javascript object>

In [23]:
# Run the parser on the response fetched for `source_url`; the saved cell
# output shows the name of the parser that was used.
data = try_parse(response, source_url)

Parsing using parse_list_type_2


<IPython.core.display.Javascript object>

In [None]:
# Display the parsed result.
# NOTE(review): the crawl loop treats a None return from try_parse as a parse
# failure; in that case this cell shows no output.
data