In [33]:
from bs4 import BeautifulSoup
import requests
import re
from collections.abc import Callable

In [2]:
response = requests.get("https://www.builtinla.com/job/engineer/full-stack-software-engineer/132124")

In [3]:
doc = response.text

In [48]:
# /* eslint-env es6:false */
# /*
#  * Copyright (c) 2010 Arc90 Inc
#  *
#  * Licensed under the Apache License, Version 2.0 (the "License");
#  * you may not use this file except in compliance with the License.
#  * You may obtain a copy of the License at
#  *
#  *     http://www.apache.org/licenses/LICENSE-2.0
#  *
#  * Unless required by applicable law or agreed to in writing, software
#  * distributed under the License is distributed on an "AS IS" BASIS,
#  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  * See the License for the specific language governing permissions and
#  * limitations under the License.
#  */
# 
# /*
#  * This code is heavily based on Arc90's readability.js (1.7.1) script
#  * available at: http://code.google.com/p/arc90labs-readability
#  */

REGEXPS = {
    # Note: These two regular expressions are duplicated in
    # Readability.js. Please keep both copies in sync.
    "unlikelyCandidates": re.compile(r"-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote", re.IGNORECASE),
    "okMaybeItsACandidate": re.compile(r"and|article|body|column|content|main|shadow", re.IGNORECASE),
}

def isNodeVisible(node):
    # Have to null-check node.get('style') and node.get('class') to deal with SVG and MathML nodes.
    return (
        (not node.get('style') or node.get('style') != "none")
        and not node.has_attr('hidden')
        # check for "fallback-image" so that wikimedia math images are displayed
        and (not node.has_attr('aria-hidden') 
            or node.get('aria-hidden') != "true" 
            or (node.get('class') and 'fallback-image' in node.get('class')))
    )


# /**
#  * Decides whether or not the document is reader-able without parsing the whole thing.
#  * @param {Object} options Configuration object.
#  * @param {number} [options.minContentLength=140] The minimum node content length used to decide if the document is readerable.
#  * @param {number} [options.minScore=20] The minumum cumulated 'score' used to determine if the document is readerable.
#  * @param {Function} [options.visibilityChecker=isNodeVisible] The function used to determine if a node is visible.
#  * @return {boolean} Whether or not we suspect Readability.parse() will suceeed at returning an article object.
#  */
def isProbablyReaderable(doc, options = {}):
    soup = BeautifulSoup(doc, 'html.parser')
#   // For backward compatibility reasons 'options' can either be a configuration object or the function used
#   // to determine if a node is visible.
    if isinstance(options, Callable):
        options = {"visibilityChecker": options}

    defaultOptions = { "minScore": 20, "minContentLength": 140, "visibilityChecker": isNodeVisible }
    options = {**defaultOptions, **options}

    nodes = soup.select("p, pre, article")

    # Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
    # Some articles' DOM structures might look like
    # <div>
    #   Sentences<br>
    #   <br>
    #   Sentences<br>
    # </div>
    brNodes = soup.select("div > br")
    if brNodes:
        # create a set of nodes and add the parent nodes of <br> elements
        node_set = set(nodes)
        for nodes in brNodes:
            node_set.add(node.parent)
        # convert the set back to a list
        nodes = list(node_set)

    score = 0;
    # This is a little cheeky, we use the accumulator 'score' to decide what to return from
    # this callback:
    for node in nodes:
        if not options["visibilityChecker"](node):
            continue

        matchString = " ".join(node.get("class", [])) + " " + " ".join(node.get("id", []))
        if (re.search(REGEXPS["unlikelyCandidates"], matchString) and
                not re.search(REGEXPS["okMaybeItsACandidate"], matchString)):
            continue

        if node.select_one("li p"):
            continue

        textContentLength = len(node.get_text(strip=True))
        if textContentLength < options["minContentLength"]:
            continue

        score += (textContentLength - options["minContentLength"]) ** 0.5

        if score > options["minScore"]:
            return True

    return False


In [49]:
isProbablyReaderable(response.text)

Score:  76.28237017817419 20


True