# `cryptics.eigenfoo.xyz` - Playground

In [1]:
%load_ext nb_black

import traceback
import datetime
import os
import re
import time
import random
import json
import string
import sqlite3
from collections import defaultdict

import requests
import bs4
import numpy as np
import pandas as pd

import ipdb

from cryptics.parse import try_parse

<IPython.core.display.Javascript object>

In [2]:
from cryptics.tables import *
from cryptics.text import *
from cryptics.lists import *
from cryptics.utils import extract_puzzle_url

<IPython.core.display.Javascript object>

In [3]:
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)",
    "Accept-Encoding": "gzip",
}

<IPython.core.display.Javascript object>

## `natpostcryptic`

In [4]:
DASHES = ["-", "—", "–", "–", "—"]
PUNCTUATION_IN_CLUE = list("/\\")
PUNCTUATION_IN_ANNOTATION = DASHES + list("{}~*/\\")
PUNCTUATION_IN_ANSWERS = DASHES + list("(){}|~*/\\_<'")


def delete_chars(s, chars):
    for char in chars:
        s = s.replace(char, "")
    return s

<IPython.core.display.Javascript object>

In [5]:
def is_parsable_special_type_1(html):
    soup = bs4.BeautifulSoup(html, "html.parser")
    entry_content = soup.find("div", attrs={"class": lambda s: s in ["entry-content"]})
    answers_and_annotations = [
        line for line in entry_content.text.split("\n") if line.strip()
    ]

    phrases = [
        "cox",
        "rathvon",
        "signing off for today",
        "falcon",
        "key to reference sources",
    ]

    return (
        30 - 10
        <= len(
            entry_content.find_all(
                "div", style="background-color: blue; line-height: 200%;"
            )
        )
        and 100 <= len(answers_and_annotations)
        and 3 <= sum([phrase in entry_content.text.lower() for phrase in phrases])
    )

<IPython.core.display.Javascript object>

In [6]:
def parse_special_type_1(html):
    soup = bs4.BeautifulSoup(html, "html.parser")
    entry_content = soup.find("div", attrs={"class": lambda s: s in ["entry-content"]})

    clue_number_and_clues = [
        a.text.strip()
        for a in entry_content.find_all(
            "div", style="background-color: blue; line-height: 200%;"
        )
    ]

    clue_numbers = []
    clues = []
    for line in clue_number_and_clues:
        clue_number = re.search(r"^[0-9]+[a|d]?", line)
        if clue_number is None:
            continue
        clue = line[clue_number.end() :].replace("\n", " ").strip()

        clue_numbers.append(clue_number.group())
        clues.append(delete_chars(clue, PUNCTUATION_IN_CLUE))

    raw_definitions = [
        tag
        for table in entry_content.find_all(
            "div", style="background-color: blue; line-height: 200%;"
        )
        for tag in table.find_all("u")
    ]

    for table in entry_content.find_all("table"):
        table.extract()

    stop_phrases = ["introduction", "epilogue", "signing off for today"]
    answers_and_annotations = [
        line
        for line in entry_content.text.split("\n")
        if line.strip()
        and not any(
            line.lower().startswith(stop_phrase) for stop_phrase in stop_phrases
        )
    ]
    while True:
        try:
            line = answers_and_annotations.pop(0)
        except IndexError:
            return None

        if line.lower().strip() == "across":
            break

    answers = []
    annotations = []
    for line in answers_and_annotations:
        try:
            # Take the first match
            matches = [
                re.search("\s+[" + "|".join(DASHES) + "]\s+", line),
                re.search("\s+[" + "|".join(DASHES) + "]\s?", line),
                re.search("\s?[" + "|".join(DASHES) + "]\s+", line),
            ]
            divider = next(m for m in matches if m is not None)

            answer = line[: divider.start()]
            annotation = line[divider.end() :]
            if (
                not any([c.isalpha() for c in answer])
                or not answer == answer.upper()
                or len(
                    delete_chars(
                        answer, PUNCTUATION_IN_ANSWERS + list(string.whitespace)
                    )
                )
                > 15
            ):
                continue
        except (StopIteration, AttributeError):
            continue

        answers.append(delete_chars(answer, PUNCTUATION_IN_ANSWERS))
        annotations.append(annotation.strip("".join(PUNCTUATION_IN_ANNOTATION + [" "])))

    definitions = extract_definitions(soup, clues, raw_definitions=raw_definitions)

    out = pd.DataFrame(
        data=[clue_numbers, answers, clues, annotations, definitions],
        index=["clue_number", "answer", "clue", "annotation", "definition"],
    ).T

    if out.isna().any(0).any(0):
        return None

    return out

<IPython.core.display.Javascript object>

In [15]:
source_url = (
    # "https://natpostcryptic.blogspot.com/2021/09/saturday-september-4-2020-cox-rathvon.html"
    # "https://natpostcryptic.blogspot.com/2021/08/saturday-august-28-2021-cox-rathvon.html"
    # "https://natpostcryptic.blogspot.com/2021/08/saturday-august-21-2021-cox-rathvon.html"
    # "https://natpostcryptic.blogspot.com/2021/08/saturday-august-14-2021-cox-rathvon.html"
    # "https://natpostcryptic.blogspot.com/2021/08/saturday-august-7-2021-cox-rathvon.html"
    # "https://natpostcryptic.blogspot.com/2021/07/saturday-july-31-2021-cox-rathvon.html"
    # "https://natpostcryptic.blogspot.com/2017/04/saturday-april-1-2017-cox-rathvon.html"
    # "https://natpostcryptic.blogspot.com/2019/06/saturday-june-22-2019-cox-rathvon.html"
    # "https://natpostcryptic.blogspot.com/2019/06/saturday-june-15-2019-cox-rathvon.html"
    # "https://natpostcryptic.blogspot.com/2019/02/saturday-february-16-2019-cox-rathvon.html"
    # "https://natpostcryptic.blogspot.com/2018/09/saturday-september-29-2018-cox-rathvon.html"
    # "https://natpostcryptic.blogspot.com/2019/01/saturday-january-19-2019-cox-rathvon.html"
    # "https://natpostcryptic.blogspot.com/2014/03/saturday-march-1-2014-preliminary.html"
    # "https://natpostcryptic.blogspot.com/2014/07/saturday-july-5-2014-preliminary-posting.html"
    # "https://natpostcryptic.blogspot.com/2014/04/saturday-april-12-2014-preliminary-post.html"
    # "https://natpostcryptic.blogspot.com/2014/12/saturday-december-20-2014-preliminary.html"
    # "https://natpostcryptic.blogspot.com/2015/04/saturday-april-18-2015-cox-rathvon.html"
    "https://natpostcryptic.blogspot.com/2015/06/saturday-june-20-2015-cox-rathvon.html"
)
html = requests.get(source_url, headers=headers)

<IPython.core.display.Javascript object>

In [16]:
is_parsable_special_type_1(html.text)

True

<IPython.core.display.Javascript object>

In [17]:
soup = bs4.BeautifulSoup(html.text, "html.parser")
entry_content = soup.find("div", attrs={"class": lambda s: s in ["entry-content"]})

clue_number_and_clues = [
    a.text.strip()
    for a in entry_content.find_all(
        "div", style=lambda s: "background-color:" in s if s is not None else None
    )
]

clue_numbers = []
clues = []
for line in clue_number_and_clues:
    clue_number = re.search(r"^[0-9]+[a|d]?", line)
    if clue_number is None:
        continue
    clue = line[clue_number.end() :].replace("\n", " ").strip()

    clue_numbers.append(clue_number.group())
    clues.append(delete_chars(clue, PUNCTUATION_IN_CLUE))

# Save this for later - before we extract all the tables.
raw_definitions = [
    tag
    for table in entry_content.find_all(
        "div", style=lambda s: "background-color:" in s if s is not None else None
    )
    for tag in table.find_all("u")
]

for table in entry_content.find_all("table"):
    table.extract()

stop_phrases = ["introduction", "epilogue", "signing off for today"]
answers_and_annotations = [
    line
    for line in entry_content.text.split("\n")
    if line.strip()
    and not any(line.lower().startswith(stop_phrase) for stop_phrase in stop_phrases)
]
while True:
    try:
        line = answers_and_annotations.pop(0)
    except IndexError:
        print("None")

    if line.lower().strip() == "across":
        break

answers = []
annotations = []
for line in answers_and_annotations:
    try:
        # Take the first match
        matches = [
            re.search("\s+[" + "|".join(DASHES) + "]\s+", line),
            re.search("\s+[" + "|".join(DASHES) + "]\s?", line),
            re.search("\s?[" + "|".join(DASHES) + "]\s+", line),
        ]
        divider = next(m for m in matches if m is not None)

        answer = line[: divider.start()]
        stripped_answer = delete_chars(
            answer, PUNCTUATION_IN_ANSWERS + list(string.whitespace)
        )
        annotation = line[divider.end() :]
        if (
            not any([c.isalpha() for c in answer])
            # or not answer == answer.upper()
            or sum([c.isupper() for c in stripped_answer])
            <= len(stripped_answer)
            - 5  # Occasionally there will be an answer like "M(E)ETS or ME(E)TS"
            or len(
                delete_chars(answer, PUNCTUATION_IN_ANSWERS + list(string.whitespace))
            )
            > 15
        ):
            continue
    except (StopIteration, AttributeError):
        continue

    answers.append(delete_chars(answer, PUNCTUATION_IN_ANSWERS))
    annotations.append(annotation.strip("".join(PUNCTUATION_IN_ANNOTATION + [" "])))

definitions = extract_definitions(soup, clues, raw_definitions=raw_definitions)

out = pd.DataFrame(
    data=[clue_numbers, answers, clues, annotations, definitions],
    index=["clue_number", "answer", "clue", "annotation", "definition"],
).T

if out.isna().any(0).any(0):
    print("None")

None


<IPython.core.display.Javascript object>

In [18]:
out

Unnamed: 0,clue_number,answer,clue,annotation,definition
0,1a,ANGLO,"WASP, in part, agitated Logan (5)",anagram (agitated) of LOGAN,"WASP, in part"
1,4a,MASSAGING,"Rubbing Sam the wrong way, spill the beans ab...",reversal (the wrong way) of SAM + SING (spill ...,Rubbing
2,9a,ANTEING,"Stirred neat gin, feeding the kitty (7)",anagram (stirred) of NEAT GIN,feeding the kitty
3,10a,ARCADES,Game parlours are sorry at first about churl (7),ARE (†) + S (sorry at first; initial letter of...,Game parlours
4,11a,ALMA MATER,"Change around Mom’s old school (4,5)",ALTER (change) containing (around) MAMA (Mom),
5,12a,FOLIO,Medium for painting of turning leaf (5),reversal (turning) of {OIL (medium for paintin...,leaf
6,13a,KINGPIN,Big cheese in shocking pink (7),hidden in (in) shocKING PINk,Big cheese
7,15a,WEEKEND,"Saturday and Sunday, Kenneth gets into pot (7)",KEN ([diminutive for] Kenneth) contained in (g...,Saturday and Sunday
8,17a,NATASHA,"War and Peace heroine has a tan, strangely (7)",anagram (strangely) of HAS A TAN,War and Peace heroine
9,20a,REDUCES,Shrinks rescued nuts (7),anagram (nuts) of RESCUED,Shrinks


<IPython.core.display.Javascript object>

In [19]:
answers_and_annotations

['1a\xa0\xa0 WASP, in part, // agitated Logan (5)',
 'ANGLO — anagram (agitated) of LOGAN',
 'WASP[5]  is an acronym for White Anglo-Saxon Protestant,',
 ' a North American expression for an upper- or middle-class American ',
 'white Protestant, considered to be a member of the most powerful group ',
 'in society.',
 '4a\xa0\xa0 Rubbing // Sam the wrong way, spill the beans about a gang’s leader (9)',
 "MAS<|S(A|G)ING* — reversal (the wrong way) of SAM + SING (spill the beans) containing (about) {A (†) + G (gang's leader; initial letter of Gang)}",
 '9a\xa0\xa0 Stirred neat gin, // feeding the kitty (7)',
 'ANTEING* — anagram (stirred) of NEAT GIN',
 '10a\xa0\xa0 Game parlours // are sorry at first',
 'about churl (7)',
 'AR(CAD)E|S — {ARE (†) + S (sorry at first; initial letter of Sorry)} containing (about) CAD (churl)',
 '11a\xa0\xa0 Change around Mom/’s/ old',
 'school (4,5)',
 'AL(MA MA)TER — ALTER (change) containing (around) MAMA (Mom)',
 '12a\xa0\xa0 Medium for painting of turni

<IPython.core.display.Javascript object>

In [20]:
line = "DISC|U(S)SED\xa0 or DISC|US(S)ED — DISC (platter; phonograph record) + USED (consumed) containing (with added) S (salt)"

<IPython.core.display.Javascript object>

In [21]:
# Take the first match
matches = [
    re.search("\s+[" + "|".join(DASHES) + "]\s+", line),
    re.search("\s+[" + "|".join(DASHES) + "]\s?", line),
    re.search("\s?[" + "|".join(DASHES) + "]\s+", line),
]
divider = next(m for m in matches if m is not None)

answer = line[: divider.start()]
stripped_answer = delete_chars(answer, PUNCTUATION_IN_ANSWERS + list(string.whitespace))
annotation = line[divider.end() :]
(
    not any([c.isalpha() for c in answer])
    # or not answer == answer.upper()
    or sum([c.isupper() for c in stripped_answer])
    <= len(stripped_answer)
    - 5  # Occasionally there will be an answer like "M(E)ETS or ME(E)TS"
    or len(delete_chars(answer, PUNCTUATION_IN_ANSWERS + list(string.whitespace)))
    > 15 + 5
)

True

<IPython.core.display.Javascript object>

In [26]:
(len(delete_chars(answer, PUNCTUATION_IN_ANSWERS + list(string.whitespace))) > 15 + 10)

False

<IPython.core.display.Javascript object>

## `thehinducrosswordcorner`

In [40]:
# List type 4 - bold and italicized definitions, bold and underlined ACROSS/DOWN headers
# source_url = (
# "https://thehinducrosswordcorner.blogspot.com/2021/09/no-13350-monday-13-sep-2021-kriskross.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/09/the-sunday-crossword-no-3167-sunday-12.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/09/no-13349-friday-10-sep-2021-afterdark.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/09/no-13348-thursday-09-sep-2021-afterdark.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/06/no-13287-wednesday-30-jun-2021-gussalufz.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/08/no-13319-friday-06-aug-2021-incognito.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/08/no-13330-thursday-19-aug-2021-dr-x.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/08/no-13317-wednesday-04-aug-2021-neyartha.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/08/no-13333-monday-23-aug-2021-avtaar.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/06/the-sunday-crossword-no-3155-sunday-20.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/07/the-sunday-crossword-no-3157-sunday-04.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/07/the-sunday-crossword-no-3160-sunday-25.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/07/no-13311-wednesday-28-jul-2021-avtaar.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/05/no-13260-saturday-29-may-2021-dr-x.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/06/no-13272-saturday-12-jun-2021-incognito.html"
# "https://thehinducrosswordcorner.blogspot.com/2021/07/no-13289-friday-02-jul-2021-arden.html"
# FIXME: instead of h4 ACROSS/DOWN headers, this has nothing... should we support this?
# "https://thehinducrosswordcorner.blogspot.com/2021/08/no-13326-saturday-14-aug-2021-kriskross.html"
# )

# Text type 2 - (only) bold definitions, h4 ACROSS/DOWN headers
source_url = (
    # "https://thehinducrosswordcorner.blogspot.com/2021/07/no-13302-saturday-17-jul-2021-kriskross.html"
    # "https://thehinducrosswordcorner.blogspot.com/2021/06/no-13278-saturday-19-jun-2021-kriskross.html"
    # "https://thehinducrosswordcorner.blogspot.com/2021/08/no-13338-saturday-28-aug-2021-arden.html"
    # "https://thehinducrosswordcorner.blogspot.com/2021/05/no-13254-saturday-22-may-2021-vulcan.html"
    # "https://thehinducrosswordcorner.blogspot.com/2017/09/no-12124-thursday-28-sep-2017-arden.html"
    # "https://thehinducrosswordcorner.blogspot.com/2017/10/no-12129-thursday-05-oct-2017-incognito.html"
    # "https://thehinducrosswordcorner.blogspot.com/2017/08/no-12095-friday-25-aug-2017-gridman.html"
    # "https://thehinducrosswordcorner.blogspot.com/2017/06/the-sunday-crossword-no-2949-sunday-25.html"
    # "https://thehinducrosswordcorner.blogspot.com/2017/08/no-12099-wednesday-30-aug-2017-arden.html"
    # "https://thehinducrosswordcorner.blogspot.com/2018/02/no-12243-saturday-17-feb-2018-vulcan.html"
    "https://thehinducrosswordcorner.blogspot.com/2019/03/no-12570-monday-11-mar-2019-gridman.html"
)

html = requests.get(source_url, headers=headers)

<IPython.core.display.Javascript object>

In [None]:
is_parsable_text_type_2??

In [None]:
is_parsable_text_type_2(html.text)

In [None]:
parse_text_type_2(html.text)

## `fifteensquared`

## `times-xwd-times`

## `bigdave44`

In [5]:
# With buttons
# source_url = "http://bigdave44.com/2021/07/05/dt-29719/"
# source_url = "http://bigdave44.com/2021/07/03/ntspp-595/"
# source_url = "http://bigdave44.com/2021/07/02/toughie-2672/"
# source_url = "http://bigdave44.com/2021/07/02/dt-29717/"
# source_url = "http://bigdave44.com/2021/07/02/dt-29712/"
# source_url = "http://bigdave44.com/2021/07/01/toughie-2671/"
# source_url = "http://bigdave44.com/2021/07/01/dt-29716/"
# source_url = "http://bigdave44.com/2021/06/30/dt-29715/"
# source_url = "http://bigdave44.com/2012/12/21/dt-27050/"

# With white text in { }
# source_url = "http://bigdave44.com/2012/07/30/dt-26931/"
# source_url = "http://bigdave44.com/2011/09/29/toughie-641/"
# source_url = "http://bigdave44.com/2009/08/11/toughie-196/"
# source_url = "http://bigdave44.com/2010/11/03/toughie-452/"
# source_url = "http://bigdave44.com/2010/10/15/toughie-442/"

# TODO: posts that have only whitespace separating answer and annotation, but have the spoiler button...
# source_url = "http://bigdave44.com/2021/02/20/ntspp-576/"
source_url = "http://bigdave44.com/2021/04/08/toughie-2623/"

response = requests.get(source_url, headers=headers)

<IPython.core.display.Javascript object>

In [6]:
# FIXME: look at ALSO RAN... it's been split up across the answer and annotation columns!
%debug
parse_text_type_1(response.text)

> [0;32m<ipython-input-1-7028f67ebff8>[0m(21)[0;36m<module>[0;34m()[0m
[0;32m     17 [0;31m[0;32mimport[0m [0mtqdm[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     18 [0;31m[0;34m[0m[0m
[0m[0;32m     19 [0;31m[0;32mimport[0m [0mipdb[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     20 [0;31m[0;34m[0m[0m
[0m[0;32m---> 21 [0;31m[0;32mfrom[0m [0mcryptic_index[0m[0;34m.[0m[0mparse[0m [0;32mimport[0m [0mtry_parse[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> c


Unnamed: 0,clue_number,clue,definition,answer,annotation
0,1a,One could raise a stink about charging post of...,One could raise a stink,POLECAT T,he Latin abbreviation for about ‘charging’ or ...
1,5a,Father trapped after losing head is feeling an...,feeling anxious,FRAUGHT T,he abbreviation for father and a synonym for t...
2,9a,Teacher‘s conjecture obtaining accomplished na...,Teacher,GOVERNESS R,eplace the U (university) in a synonym with fo...
3,10a,Clear one’s throat but not expect to speak (5),to speak,ORATE A,verb meaning to clear one’s throat by coughing...
4,11a,What helps maintain circulation in a naked man...,What helps maintain circulation,AORTA A,from the clue) and a human being (man) without...
5,12a,Constant change affecting most fads and tastes...,Constant,STEADFAST A,n anagram (change) of most of FADs and TASTES
6,13a,Change answer drained teacher’s put before cla...,Change,TRANSFORM T,he outside letters (drained) of TeacheR put be...
7,16a,Miserable time to spend week in Bury (5),Bury,INTER S,pend or remove the abbreviation for week from ...
8,17a,Smart set will know these guys? (5),guys,ROPES T,hese guys form part of a saying about clever p...
9,18a,Revolutionary new pesticide really is beginnin...,creepy-crawly,CENTIPEDE A,n anagram (revolutionary) of NEw PEsTiCIDE onc...


<IPython.core.display.Javascript object>

### Scratch work - balancing expressions

### Scratch work - running one-time function over all HTMLs