# Indexing Crosswords

- After running on sitemaps 9 and 10: N indexed, N errored (2959 total). **TODO:** replace the `N` placeholders with the actual counts.

In [1]:
%load_ext nb_black

import traceback
import datetime
import os
import re
import time
import random
import json
from collections import defaultdict

import requests
import bs4
import numpy as np
import pandas as pd

import ipdb

from cryptic_info.parse import try_parse

<IPython.core.display.Javascript object>

In [2]:
from cryptic_info.tables import (
    is_parsable_table_type_1,
    parse_table_type_1,
    is_parsable_table_type_2,
    parse_table_type_2,
)
from cryptic_info.lists import (
    is_parsable_list_type_1,
    parse_list_type_1,
    is_parsable_list_type_2,
    parse_list_type_2,
)
from cryptic_info.utils import extract_puzzle_url

<IPython.core.display.Javascript object>

In [3]:
# Headers passed to the requests.get calls below: a browser-like User-Agent,
# and gzip as the only accepted response encoding.
headers = dict(
    [
        ("User-Agent", "Mozilla/5.0 (X11; Linux x86_64)"),
        ("Accept-Encoding", "gzip"),
    ]
)

<IPython.core.display.Javascript object>

## Getting URLs

In [5]:
# Create initial metadata JSON
#
# Builds a fresh metadata record (run timestamp plus empty indexed/errored
# lists), fills "unindexed_urls" with every post URL listed in the selected
# sitemaps, and writes the result to cryptic_info/metadata.json.
# NOTE: this overwrites any existing cryptic_info/metadata.json.

# Seconds to wait on each HTTP request; without a timeout, requests can block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

metadata = dict()

metadata["last_run"] = datetime.datetime.now(datetime.timezone.utc).strftime("%c %Z")
metadata["unindexed_urls"] = []
metadata["indexed_urls"] = []
metadata["errored_urls"] = []

SITEMAP_URL = "https://www.fifteensquared.net/wp-sitemap.xml"

# Just do sitemaps 9 and 10 for now. Dots are escaped and the pattern is
# applied with fullmatch, so only these two exact sitemap URLs are accepted.
SITEMAP_PATTERN = re.compile(
    r"https://www\.fifteensquared\.net/wp-sitemap-posts-post-(9|10)\.xml"
)

# with open("cryptic_info/metadata.json") as f:
#     metadata = json.load(f)

response = requests.get(SITEMAP_URL, headers=headers, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error rather than parsing an error page into an
# empty URL list and overwriting the metadata file with it.
response.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's guessed-parser warning).
soup = bs4.BeautifulSoup(response.text, "html.parser")

# Read only the <loc> child of each <sitemap> entry: the entry's full text
# would also include any sibling element (e.g. <lastmod>) and whitespace.
sitemap_locs = [
    loc.get_text(strip=True)
    for loc in (entry.find("loc") for entry in soup.find_all("sitemap"))
    if loc is not None
]
sitemaps = list(
    reversed([loc for loc in sitemap_locs if SITEMAP_PATTERN.fullmatch(loc)])
)

for sitemap in sitemaps:
    response = requests.get(sitemap, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    # As above, take the <loc> of each <url> entry rather than its whole text.
    urls = [
        loc.get_text(strip=True)
        for loc in (entry.find("loc") for entry in soup.find_all("url"))
        if loc is not None
    ]
    metadata["unindexed_urls"].extend(urls)

# The file is only written, never read back here, so plain "w" is sufficient.
with open("cryptic_info/metadata.json", "w") as f:
    json.dump(metadata, f)

<IPython.core.display.Javascript object>

## Testing out

In [8]:
# Example posts, grouped by the post layout they exercise. Only one
# `source_url` is active at a time; the others are kept here for reference.
#
# Tables - type 1
#   https://www.fifteensquared.net/2021/05/20/financial-times-16790-by-leonidas/
#   https://www.fifteensquared.net/2021/05/21/financial-times-16791-by-buccaneer/
#   https://www.fifteensquared.net/2021/05/21/independent-10797-by-phi/
#   https://www.fifteensquared.net/2021/05/23/azed-no-2553-plain/
#   https://www.fifteensquared.net/2021/05/23/everyman-3892/
#
# Tables - type 2
#   https://www.fifteensquared.net/2021/05/17/guardian-28447-anto/
#
# List - type 1
#   https://www.fifteensquared.net/2021/05/22/guardian-saturday-puzzle-28446-tramp/
#   https://www.fifteensquared.net/2021/05/23/independent-on-sunday-1630-by-raich/
#   https://www.fifteensquared.net/2021/05/19/guardian-28449-pasquale/
#   https://www.fifteensquared.net/2021/05/17/guardian-quiptic-1122-hectence/
#   https://www.fifteensquared.net/2021/05/16/independent-on-sunday-1629-hoskins/
#
# List - type 2
#   https://www.fifteensquared.net/2021/05/20/independent-10796-by-tees/  (active)
#   https://www.fifteensquared.net/2021/05/17/financial-times-16787-by-peto/
#   https://www.fifteensquared.net/2021/06/02/guardian-28461-imogen/
#
# List - type 3
#   https://www.fifteensquared.net/2021/06/01/financial-times-16800-chalmie/
#   https://www.fifteensquared.net/2021/05/21/guardian-cryptic-28451-puck/
#   https://www.fifteensquared.net/2021/05/24/guardian-quiptic-1123-matilda/
#
# Hihoba does hard themed puzzles, and formats their posts fairly
# inconsistently, depending on the theme
# (https://www.fifteensquared.net/author/hihoba/):
#   https://www.fifteensquared.net/2021/05/18/inquisitor-1698-spooky-manifestations-by-kruger/
#
# RatkojaRiku does not include clues with their posts
# (https://www.fifteensquared.net/author/ratkojariku/):
#   https://www.fifteensquared.net/2021/05/13/independent-10790-serpent/
#
# FIXME: why does the extract_definitions fail? Not urgent
#   https://www.fifteensquared.net/2021/05/16/everyman-3891/

# Active selection: a "List - type 2" post.
source_url = "https://www.fifteensquared.net/2021/05/20/independent-10796-by-tees/"

# Fetch the chosen post; the response is parsed in the next cell.
response = requests.get(url=source_url, headers=headers)

<IPython.core.display.Javascript object>

In [9]:
# Parse the fetched post. Judging by the saved output, try_parse prints which
# parser it selected (here: parse_list_type_2).
data = try_parse(response, source_url)

Parsing using parse_list_type_2


<IPython.core.display.Javascript object>

In [10]:
# Show the parsed result; in the saved output it renders as a table with one
# row per clue.
data

Unnamed: 0,ClueNumber,Clue,Definition,Answer,Annotation,PuzzleURL,SourceURL
0,1a,Alison‘s associate? (4),Alison/associate,ALLY,Double definition,,https://www.fifteensquared.net/2021/05/20/inde...
1,3a,Needs a slap when drunk walks by the sea (10),walks by the sea,ESPLANADES,An anagram (”when drunk’) of NEEDS A SLAP,,https://www.fifteensquared.net/2021/05/20/inde...
2,10a,Shop till you drop? That’s said to be soon! (...,to be soon,BY AND BY,Sounds like (‘that’s said’) BUY AND BUY (‘shop...,,https://www.fifteensquared.net/2021/05/20/inde...
3,11a,Nameless dog at home — one who kept escaping (7),who kept escaping,HOUDINI,HOUnD (dog) missing N (name) or ‘nameless’ + I...,,https://www.fifteensquared.net/2021/05/20/inde...
4,12a,.Ruin everything! (5),everything,TOTAL,Double definition,,https://www.fifteensquared.net/2021/05/20/inde...
5,13a,Untouched as plant in midwinter? (9),Untouched,INVIOLATE,VIOLA (plant) in wINTEr (‘mid’dle letters only),,https://www.fifteensquared.net/2021/05/20/inde...
6,19a,Persevere over each period before play starts...,period /before play starts,PRE-SEASON,PRESS ON (persevere) around or ‘over’ EA (each),,https://www.fifteensquared.net/2021/05/20/inde...
7,21a,Long drink containing peel from plump fruit (9),fruit,PINEAPPLE,PINE (long) ALE (drink) around or ‘containing’...,,https://www.fifteensquared.net/2021/05/20/inde...
8,22a,Tips to help all dodging fifty per cent in ta...,tax,TITHE,TIps To HElp (missing last half of each word o...,,https://www.fifteensquared.net/2021/05/20/inde...
9,24a,Energy movement generates strong feeling (7),strong feeling,EMOTION,E (energy) MOTION (movement),,https://www.fifteensquared.net/2021/05/20/inde...


<IPython.core.display.Javascript object>