# todo
_fixRelativeUris
_simplifyNestedElements
_cleanClasses
_prepArticle
hasSingleTagInsideElement

In [63]:
from bs4 import BeautifulSoup, element
from html import unescape
import re
import requests

from typing import Union

In [84]:
class Readability:
    def __init__(self, doc, options={}):
        self.doc = doc
        # self.FLAG_STRIP_UNLIKELYS= "0x1" # hex
        # self.FLAG_WEIGHT_CLASSES= "0x2" # hex
        # self.FLAG_CLEAN_CONDITIONALLY= "0x4" # hex
        self.FLAG_STRIP_UNLIKELYS = 0x1
        self.FLAG_WEIGHT_CLASSES = 0x2
        self.FLAG_CLEAN_CONDITIONALLY = 0x4

        #   // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
        self.ELEMENT_NODE= 1,
        self.TEXT_NODE= 3,

        #   // Max number of nodes supported by this parser. Default: 0 (no limit)
        self.DEFAULT_MAX_ELEMS_TO_PARSE= 0

        #   // The number of top candidates to consider when analysing how
        #   // tight the competition is among candidates.
        self.DEFAULT_N_TOP_CANDIDATES= 5

        # // Element tags to score by default.
        self.DEFAULT_TAGS_TO_SCORE= "section,h2,h3,h4,h5,h6,p,td,pre".upper().split(","),

        # // The default number of chars an article must have in order to return a result
        self.DEFAULT_CHAR_THRESHOLD= 500

        # // All of the regular expressions in use within readability.
        # // Defined up here so we don't instantiate them repeatedly in loops.
        self.REGEXPS = {
            # NOTE: These two regular expressions are duplicated in
            # Readability-readerable.js. Please keep both copies in sync.
            'unlikelyCandidates': re.compile(r'-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote', re.IGNORECASE),
            'okMaybeItsACandidate': re.compile(r'and|article|body|column|content|main|shadow', re.IGNORECASE),

            'positive': re.compile(r'article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story', re.IGNORECASE),
            'negative': re.compile(r'-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget', re.IGNORECASE),
            'extraneous': re.compile(r'print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility', re.IGNORECASE),
            'byline': re.compile(r'byline|author|dateline|writtenby|p-author', re.IGNORECASE),
            'replaceFonts': re.compile(r'<(\/?)font[^>]*>', re.IGNORECASE),
            'normalize': re.compile(r'\s{2,}'),
            'videos': re.compile(r'\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)', re.IGNORECASE),
            'shareElements': re.compile(r'(\b|_)(share|sharedaddy)(\b|_)', re.IGNORECASE),
            'nextLink': re.compile(r'(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', re.IGNORECASE),
            'prevLink': re.compile(r'(prev|earl|old|new|<|«)', re.IGNORECASE),
            'tokenize': re.compile(r'\W+'),
            'whitespace': re.compile(r'^\s*$'),
            'hasContent': re.compile(r'\S$'),
            'hashUrl': re.compile(r'^#.+'),
            'srcsetUrl': re.compile(r'(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))'),
            'b64DataUrl': re.compile(r'^data:\s*([^\s;,]+)\s*;\s*base64\s*,', re.IGNORECASE),
            # See: https://schema.org/Article
            'jsonLdArticleTypes': re.compile(r'^Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference$'),
        }

        self.UNLIKELY_ROLES = [ "menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog" ]

        self.DIV_TO_P_ELEMS =  {"BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL"}

        self.ALTER_TO_DIV_EXCEPTIONS = ["DIV", "ARTICLE", "SECTION", "P"]

        self.PRESENTATIONAL_ATTRIBUTES = [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ]

        self.DEPRECATED_SIZE_ATTRIBUTE_ELEMS = [ "TABLE", "TH", "TD", "HR", "PRE" ]

        # // The commented out elements qualify as phrasing content but tend to be
        # // removed by readability when put into paragraphs, so we ignore them here.
        self.PHRASING_ELEMS = [
            # // "CANVAS", "IFRAME", "SVG", "VIDEO",
            "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
            "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
            "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
            "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
            "SUP", "TEXTAREA", "TIME", "VAR", "WBR"
        ]
        

        # // These are the classes that readability sets itself.
        self.CLASSES_TO_PRESERVE = [ "page" ]

        # // These are the list of HTML entities that need to be escaped.
        self.HTML_ESCAPE_MAP = {
            "lt": "<",
            "gt": ">",
            "amp": "&",
            "quot": '"',
            "apos": "'",
        }

        self._doc = BeautifulSoup(doc, "html.parser") # doc
        # self._docJSDOMParser = self._doc.firstChild.__JSDOMParser__; # Needs work
        self._articleTitle = None
        self._articleByline = None
        self._articleDir = None
        self._articleSiteName = None
        self._attempts = []

        # // Configurable options
        self._debug = options.get("debug", True) # Does not originally have a default
        
    
        self._maxElemsToParse = options.get("maxElemsToParse", self.DEFAULT_MAX_ELEMS_TO_PARSE)
        self._nbTopCandidates = options.get("nbTopCandidates", self.DEFAULT_N_TOP_CANDIDATES)
        self._charThreshold = options.get("charThreshold", self.DEFAULT_CHAR_THRESHOLD)
        self._classesToPreserve = self.CLASSES_TO_PRESERVE + options.get("classesToPreserve", [])
        self._keepClasses = options.get("keepClasses", True) # Does not originally have a default
        self.serializer = options.get("serializer", lambda el: el["innterHTML"])
        self._disableJSONLD = options.get("disableJSONLD", False) # Does not originally have a default

        # // Start with all flags set
        # self._flags = hex(self.FLAG_STRIP_UNLIKELYS |
        #                 self.FLAG_WEIGHT_CLASSES |
        #                 self.FLAG_CLEAN_CONDITIONALLY)
        self._flags = self.FLAG_STRIP_UNLIKELYS | self.FLAG_WEIGHT_CLASSES | self.FLAG_CLEAN_CONDITIONALLY

#     def Readability(self, doc = None, options = {}):
#         """
#         Does not support passing a URI as the first argument
# 
#         Arguments
#             * options (dict): 
#                 * debug (bool)
#                 * maxElemsToParse (int?) optional
#                 * nbTopCandidates (?) optional
#                 * charThreshold (int?) optional
#                 * classesToPreserve (?) optional
#                 * keepClasses (bool)
#                 * serializer (func) optional
#                 * disableJSONLD
# 
#             The options object accepts a number of properties, all optional:
# 
#             * debug (boolean, default false): whether to enable logging.
#             * maxElemsToParse (number, default 0 i.e. no limit): the maximum number of elements to parse.
#             * nbTopCandidates (number, default 5): the number of top candidates to consider when analysing how tight the competition is among candidates.
#             * charThreshold (number, default 500): the number of characters an article must have in order to return a result.
#             * classesToPreserve (array): a set of classes to preserve on HTML elements when the keepClasses options is set to false.
#             * keepClasses (boolean, default false): whether to preserve all classes on HTML elements. When set to false only classes specified in the classesToPreserve array are kept.
#             * disableJSONLD (boolean, default false): when extracting page metadata, Readability gives precendence to Schema.org fields specified in the JSON-LD format. Set this option to true to skip JSON-LD parsing.
#             * serializer (function, default el => el.innerHTML) controls how the the content property returned by the parse() method is produced from the root DOM element. It may be useful to specify the serializer as the identity function (el => el) to obtain a DOM element instead of a string for content if you plan to process it further.
# 
#         """
#         if (doc is None):
#             raise Exception("First argument to Readability constructor should be a document object.");
#     
#         # Bad code, fix later
#         if "documentElement" not in doc:
#             raise Exception("First argument to Readability constructor should be a document object.");
# 
#         self._doc = doc;
#         self._docJSDOMParser = self._doc.firstChild.__JSDOMParser__; # Needs work
#         self._articleTitle = None;
#         self._articleByline = None;
#         self._articleDir = None;
#         self._articleSiteName = None;
#         self._attempts = [];
# 
#         # // Configurable options
#         self._debug = options["debug"];
#         
#     
#         self._maxElemsToParse = options.get("maxElemsToParse", self.DEFAULT_MAX_ELEMS_TO_PARSE)
#         self._nbTopCandidates = options.get("nbTopCandidates", self.DEFAULT_N_TOP_CANDIDATES)
#         self._charThreshold = options.get("charThreshold", self.DEFAULT_CHAR_THRESHOLD)
#         self._classesToPreserve = this.CLASSES_TO_PRESERVE + options.get("classesToPreserve", [])
#         self._keepClasses = options["keepClasses"];
#         self.serializer = options.get("serializer", lambda el: el["innterHTML"])
#         self._disableJSONLD = options["disableJSONLD"];
# 
#         # // Start with all flags set
#         self._flags = hex(self.FLAG_STRIP_UNLIKELYS |
#                         self.FLAG_WEIGHT_CLASSES |
#                         self.FLAG_CLEAN_CONDITIONALLY)


        # // Control whether log messages are sent to the console
#         if (self._debug) {
#             logNode = lambda node: (
#                 if (node["nodeType"] == node["TEXT_NODE"]) {
#                     return "{0} (\"{1}\")".format(node["nodeName"], node["nodeContext"])
#                 }
#                 
#                 attributes = node.get("attributes", [])
#                 attrPairs = " ".join(["{0}=\"{1}\"".format(attr["name"], attr["value"]), for attr in node.get("attributes", [])])
# 
#                 return "<{0} {1}>".format(node["localName"], attrPairs)
#             )
# 
#             this.log = function () {
#             if (typeof dump !== "undefined") {
#                 var msg = Array.prototype.map.call(arguments, function(x) {
#                 return (x && x.nodeName) ? logNode(x) : x;
#                 }).join(" ");
#                 dump("Reader: (Readability) " + msg + "\n");
#             } else if (typeof console !== "undefined") {
#                 let args = Array.from(arguments, arg => {
#                 if (arg && arg.nodeType == this.ELEMENT_NODE) {
#                     return logNode(arg);
#                 }
#                 return arg;
#                 });
#                 args.unshift("Reader: (Readability)");
#                 console.log.apply(console, args);
#             }
#             };
#         } else {
#             this.log = function () {};
#         }

    def _initialize_node(self, node):
        node.readability = {"contentScore": 0}

        tag_name = node.name.lower()

        if tag_name == "div":
            node.readability["contentScore"] += 5
        elif tag_name in ["pre", "td", "blockquote"]:
            node.readability["contentScore"] += 3
        elif tag_name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]:
            node.readability["contentScore"] -= 3
        elif tag_name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]:
            node.readability["contentScore"] -= 5

        node.readability["contentScore"] += self._getClassWeight(node)

    def _removeAndGetNext(self, node: element.Tag) -> element.Tag:
        nextNode = node.find_next()
        node.extract()
        return nextNode

    def _checkByline(self, node, matchString):
        if (self._articleByline):
            return False

        if hasattr(node, "attrs"):
            rel = node.attrs.get("rel", "")
            itemprop = node.attrs.get("itemprop", "")

        if (rel == "author"
            or "authot" in itemprop
            or re.match(self.REGEXPS['byline'], matchString)):
            self._articleByline = node.get_text()
            return True
        
        return False
    
    def _getNodeAncestors(self, node: element.Tag, maxDepth: int = 0):
        i = 0
        ancestors = []
        while node.parent:
            ancestors.appent(node.parent)
            i += 1
            if maxDepth and i == maxDepth:
                break
            node = node.parent
        return ancestors

    def _grabArticle(self, page):
        """
        /***
        * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
        *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
        *
        * @param page a document to run upon. Needs to be a full document, complete with body.
        * @return Element
        **/
        """
        print("**** grabArticle ****")
        isPaging = page != None
        page = page if page else self._doc.body

        # We can't grab an article if we don't have a page!
        if (not page):
            print("No body found in document. Abort.")
            return
        
        pageCacheHtml = page
        
        while True:
            print("Starting grabArticle loop")
            stripUnlikelyCandidates = self._flagIsActive(self.FLAG_STRIP_UNLIKELYS)

            # First, node prepping. Trash nodes that look cruddy (like ones with the
            # class name "comment", etc), and turn divs into P tags where they have been
            # used inappropriately (as in, where they contain no other block level elements.)
            elementsToScore = []
            node = self._doc.html

            shouldRemoveTitleHeader = True

            while (node):

                if node.name == "html":
                    self._articleLang = node.attrs.get("lang")
                
                matchString = " ".join(node.get("class")) +  " " + node.get("id")

                if not self._isProbablyVisible(node):
                    print("Removing hidden node - " + matchString)
                    node = self.removeAndGetNext(node)
                    continue

                # User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
                if node.attrs.get("aria-modal", "") == "true" and node.attrs.get("role", "") == "dialog":
                    node = self._removeAndGetNext(node)
                    continue

                # Check to see if this node is a byline, and remove it if it is
                if self._checkByLine(node, matchString):
                    node = self._removeAndGetNext(node)
                    continue
            
                if shouldRemoveTitleHeader and self._headerDuplicatesTitle(node):
                    print("Removing header: ", node.get_text().strip(), self._articleTitle.strip())
                    shouldRemoveTitleHeader = false
                    node = self._removeAndGetNext(node)
                    continue

                if stripUnlikelyCandidates:
                    if (re.match(self.REGEXPS["unlikelyCandidates"], matchString) 
                        and not re.match(self.REGEXPS["okMaybeItsACandidate", matchString]) 
                        and not self._hasAncestorTag(node, "table")
                        and not self._hasAncestorTag(node, "code")
                        and node.name != "body"
                        and node.name != "a"
                    ):
                        print("Removing unlikely candidate - ", matchString)
                        node = self._removeAndGetNext(node)
                        continue

                    if node.attrs.get("role") in self.UNLIKELY_ROLES:
                        print("Removing content with role ", node.attrs.get("role"), " - ", matchString)
                        node = self._removeAndGetNext(node)
                        continue

                # Remove div, section, and header nodes without any content (e.g. text, iamge, video, or iframe).
                if (node.name in ["div, section, header, h1, h2, h3, h4, h5, h6"] and self.isElementWithoutContent(node)):
                    node = self._removeAndGetNext(node)
                    continue
                    
                if node.name in self.DEFAULT_TAGS_TO_SCORE:
                    elementsToScore.append(node)

                # Turn all divs that don't have children block level elements into p's
                if node.name == "div":
                    p = None
                    childNode = node.find()
                    while childNode:
                        nextSibling = childNode.next_sibling
                        if self._isPhrasingContent(childNode):
                            if p is not None:
                                p.append(childNode)
                            elif not self._isWhitespace(childNode):
                                p = self._doc.new_tag("p")
                                node.replace_with(p)
                                p.append(childNode)
                        elif p is not None:
                            while p.last_child and self._isWhitespace(p.last_child):
                                p.last_child.extract()
                            p = None
                        childNode = nextSibling

                # Sites like http://mobile.slate.com encloses each paragraph with a DIV
                # element. DIVs with only a P element inside and no text content can be
                # safely converted into plain P elements to avoid confusing the scoring
                # algorithm with DIVs with are, in practice, paragraphs.
                if (self._hasSingleTagInsideElement(node, "p") 
                    and self._getLinkDensity(node) < 0.25):
                    newNode = node.contents[0]
                    node.parent.replace_with(newNode)
                    node = newNode
                    elementsToScore.append(node)

                elif not self._hasChildBlockElement(node):
                    node = self._setNodeTag(node, "p")
                    elementsToScore.append(node)
                node = self._getNextNode(node)
            

            # /**
            # * Loop through all paragraphs, and assign a score to them based on how content-y they look.
            # * Then add their score to their parent node.
            # *
            # * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
            # **/
            candidates = []
            for elementToScore in elementsToScore:
                if (not elementToScore.parent) or (not elementToScore.parent.name):
                    continue

                # If this paragraph is less than 25 characters, don't even count it.
                innerText = self._getInnerText(elementToScore)
                if len(innerText) < 25:
                    continue
                
                # Exclude nodes with no ancestor.
                ancestors = self._getNodeAncestors(elementToScore, 5):
                if len(ancestors) == 0:
                    continue

                contentScore = 0

                # Add a point for the paragraph itself as a base.
                contentScore += 1

                # Add points for any commas within this paragraph.
                contentScore += len(innerText.split(","))

                # For every 100 characters in this paragraph, add another point. Up to 3 points.
                contentScore += min(len(innerText) // 100, 3)

                # Initialize and score ancestors.
                for ancestor, level in zip(ancestors, range(len(ancestors))):
                    if (not ancestor.name) or (not ancestor.parent) or (not ancestor.parent.name):
                        continue

                if not hasattr(ancestor, "readability"):
                    self.initializeNode(ancestor)
                    candidates.append(ancestor)

                # Node score divider:
                # - parent:             1 (no division)
                # - grandparent:        2
                # - great grandparent+: ancestor level * 3
                if level == 0:
                    scoreDivider = 1
                elif level == 1:
                    scoreDivider = 2
                else:
                    scoreDivider = level * 3
                ancestor.readability["contentScore"] += contentScore / scoreDivider

            # After we've calculated scores, loop through all of the possible
            # candidate nodes we found and find the one with the highest score.
            topCandidates = []
            for candidate in candidates:
                # Scale the final candidates score based on link density. Good content
                # should have a relatively small link density (5% or less) and be mostly
                # unaffected by this operation.
                candidateScore = candidate.readability["contentScore"] * (1 - self._getLinkDensity(candidate))
                candidate.readability["contentScore"] = candidateScore

                print("Candidate: ", candidate, " with score ", candidateScore)
                for t in range(self._nbTopCandidates):
                    aTopCandidate = topCandidates[t]
                    if (not aTopCandidate) or (candidateScore > aTopCandidate.readability["contentScore"]): # todo
                        topCandidates.insert(t, candidate)
                        if len(topCandidates) > self._nbTopCandidates:
                            topCandidates.pop()
                        break

            topCandidate = topCandidates.get(0, None)
            neededToCreateTopCandidate = False
            parentOfTopCandidate = None

            # If we still have no top candidate, just use the body as a last resort.
            # We also have to copy the body node so it is something we can modify.
            if (topCandidate is None) or (topCandidate.name == "body"):
                # Move all of the patge's children into topCandidate
                topCandidate = self._doc.new_tag("div")
                neededToCreateTopCandidate = True
                # Move everything (not just elements, also text nodes etc.) into the container
                # so we even include text directly in the body
                while self._doc.contents:
                    child = self._doc.body.contents.pop(0)
                    print("Moving child out: ", child)
                    topCandidate.append(child)

                self._doc.body.append(topCandidate)

                self._initializeNode(topCandidate)
            elif topCandidate:
                # Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
                # and whose scores are quite closed with current `topCandidate` node.
                alternativeCandidateAncestors = []
                for i in range(1, len(topCandidates)):
                    if (topCandidates[i].readability["contentScore"] / topCandidate.readability["contentScore"]) >= 0.75:
                        alternativeCandidateAncestors.append(self._getNodeAncestors(topCandidates[i]))

                MINIMUM_TOPCANDIDATES = 3
                if len(alternativeCandidateAncestors) >= MINIMUM_TOPCANDIDATES:
                    parentOfTopCandidate = topCandidate.parent
                    while parentOfTopCandidate.name != "body":
                        listsContainingThisAncestor = 0
                        for ancestorIndex in range(len(alternativeCandidateAncestors)):
                            listsContainingThisAncestor += int(parentOfTopCandidate in alternativeCandidateAncestors[ancestorIndex])
                        if listsContainingThisAncestor >= MINIMUM_TOPCANDIDATES:
                            topCandidate = parentOfTopCandidate
                            break
                        parentOfTopCandidate = parentOfTopCandidate.parent

                if not topCandidate.readability:
                    self._initializeNode(topCandidate)
                
                # Because of our bonus system, parents of candidates might have scores
                # themselves. They get half of the node. There won't be nodes with higher
                # scores than our topCandidate, but if we see the score going *up* in the first
                # few steps up the tree, that's a decent sign that there might be more content
                # lurking in other places that we want to unify in. The sibling stuff
                # below does some of that - but only if we've looked high enough up the DOM
                # tree.
                parentOfTopCandidate = topCandidate.parent
                lastScore = topCandidate.readability["contentScore"]
                # The scores shouldn't get too low
                scoreThreshold = lastScore / 3
                while parentOfTopCandidate.name != "body":
                    if parentOfTopCandidate.readability:
                        parentOfTopCandidate = parentOfTopCandidate.parent
                        continue

                    parentScore = parentOfTopCandidate.readability["contentScore"]
                    if parentScore < scoreThreshold:
                        break
                    if parentScore > lastScore:
                        # Alright! We found a better parent to use.
                        topCandidate = parentOfTopCandidate
                        break
                    
                    lastScore = parentOfTopCandidate.readability["contentScore"]
                    parentOfTopCandidate = parentOfTopCandidate.parent
                
                # If the top candidate is the only child, use parent instead. This will help sibling
                # joining logic when adjacent content is actually located in parent's sibling node.
                parentOfTopCandidate = topCandidate.parent
                while parentOfTopCandidate.name != "body" and len(parentOfTopCandidate.contents) == 1:
                    topCandidate = parentOfTopCandidate
                    parentOfTopCandidate = topCandidate.parent
                
                if not topCandidate.readability:
                    self._initializeNode(topCandidate)
            
            # Now that we have the top candidate, look through its siblings for content
            # that might also be related. Things like preambles, content split by ads
            # that we removed, etc.
            articleContent = self._doc.new_tag("div")
            if isPaging:
                articleContent["id"] = "readability-content"
            
            siblingScoreThreshold = max(10, topCandidate.readability["contentScore"] * 0.2)
            # Keep potential top candidate's parent node to try to get text direction of it later
            parentOfTopCandidate = topCandidate.parent
            siblings = parentOfTopCandidate.children

            for sibling in siblings:
                append = False

                print("Looking at sibling node: ", sibling, sibling.readability.content_score if sibling.readability else "")
                print("Sibling has score ", sibling.readability.content_score if sibling.readability else "Unknown")

                if sibling == topCandidate:
                    append = True
                else:
                    contentBonus = 0

                # Give a bonus if sibling nodes and top candidates have the example same classname
                if sibling.get("class") == topCandidate.get("class") and topCandidate.get("class"):
                    contentBonus += topCandidate.readability["contentScore"] * 0.2

                if sibling.readability and ((sibling.readability["contentScore"] + contentBonus) >= siblingScoreThreshold):
                    append = True
                elif sibling.name == "p":
                    linkDensity = self._getLinkDensity(sibling)
                    nodeContent = self._getInnerText(sibling)
                    nodeLength = len(nodeContent)

                    if (nodeLength > 80 and linkDensity < 0.25):
                        appent = True
                    elif (nodeLength < 80 and nodeLength > 0 and linkDensity == 0 and re.search(r'\.( |$)', nodeContent)):
                        append = True
                
                if append:
                    print("Appending node: ", sibling)
                    if (sibling.name in self.ALTER_TO_DIV_EXCEPTIONS):
                        # We have a node that isn't a common block level element, like a form or td tag.
                        # Turn it into a div so it doesn't get filtered out later by accident.
                        print("Altering sibling: ", sibling, " to div.")
                        sibling = self._setNodeTag(sibling, "div")

                    articleContent.append(sibling)
                    # Fetch children again to make it compatible
                    # with DOM parsers without live collection support
                    siblings = parentOfTopCandidate.children
                    # todo - there might be errors here. The original code is:
                    #   // siblings is a reference to the children array, and
                    #   // sibling is removed from the array when we call appendChild().
                    #   // As a result, we must revisit this index since the nodes
                    #   // have been shifted.
                    #   s -= 1;
                    #   sl -= 1;
            # We have all the content that we need. Now we clean it up for presentation
            print("Article content pre-prep: " + articleContent)
            self._prepArticle(articleContent)
            print("Article content post-prep: ", articleContent)

            if neededToCreateTopCandidate:
                # We already create a fake div thing, and there wouldn't have been any siblings left
                # for the previous loop, so there's no point trying to create a new div, and then
                # move all the children over. Just assign IDs and class names here. No need to append
                # because that already happened anyway
                topCandidate.attrs["id"] = ["readability-page-1"] # todo - this might need to be a list
                topCandidate.attrs["class"] = ["page"]
            else:
                div = self._doc.new_tag("div")
                div.attrs["id"] = ["readability-page-1"]
                div.attrs["class"] = ["page"]
                while articleContent.find():
                    div.append(articleContent.pop(0))
                articleContent.append(div)
            
            print("Article content after paging: ", articleContent)
            parseSuccesful = True

            # // Now that we've gone through the full algorithm, check to see if
            # // we got any meaningful content. If we didn't, we may need to re-run
            # // grabArticle with different flags set. This gives us a higher likelihood of
            # // finding the content, and the sieve approach gives us a higher likelihood of
            # // finding the -right- content.
            textLength = len(self._getInnerText(articleContent, True))
            if textLength < self._charThreshold:
                parseSuccesful = False
                page = pageCacheHtml

                if self._flagIsActive(self.FLAG_STRIP_UNLIKELYS):
                    self._removeFlag(self.FLAG_STRIP_UNLIKELYS)
                    self._attempts.append({"articleContent": articleContent, "textLength": textLength})
                elif self._flagIsActive(self.FLAG_WEIGHT_CLASSES):
                    self.removeFlag(self.FLAG_WEIGHT_CLASSES)
                    self._attempts.append({"articleContent": articleContent, "textLength": textLength})
                elif self._flagIsActive(self.FLAG_CLEAN_CONDITIONALLY):
                    self.removeFlag(self.FLAG_CLEAN_CONDITIONALLY)
                    self._attempts.append({"articleContent": articleContent, "textLength": textLength})
                else:
                    self._attempts.append({"articleContent": articleContent, "textLength": textLength})
                    # No luck after removing flags, just return the longest text we found during the different loops
                    self.attempts.sort(key=lambda x: x["textLength"], reverse=True)
                    
                    if self.attempts[0].get("textLength", None) is None:
                        return None
                    
                    articleContent = self._attempts[0]["articleContent"]
                    parseSuccesful = True





            





    def _postProcessContent(self, articleContent):
        """
        /**
        * Run any post-process modifications to article content as necessary.
        *
        * @param Element
        * @return void
        **/
        """
        # Readability cannot open relative uris so we convert them to absolute uris.
        articleContent = self._fixRelativeUris(articleContent)
        articleContent = self._simplifyNestedElements(articleContent)
        if not self._keepClasses:
            # Remove classes.
            articleConent = self._cleanClasses(articleContent)
        return articleConent

    def _removeNodes(self, nodeList, filterFn = None):
        """
        /**
        * Iterates over a NodeList, calls `filterFn` for each node and removes node
        * if function returned `true`.
        *
        * If function is not passed, removes all the nodes in node list.
        *
        * @param NodeList nodeList The nodes to operate on
        * @param Function filterFn the function to use as a filter
        * @return void
        */
        """
        # Avoid ever operating on live node lists.
        # if (self._docJSDOMParser and nodeList["_isLiveNodeList"])""
            # raise Exception("Do not pass live node lists to _removeNodes")
        
        for i in range(len(nodeList) - 1, 0, -1):
            node = nodeList[i]
            parentNode = node.parent
            if parentNode:
                if not filterFn or filterFn(node, i, nodeList):
                    node.extract()

    def _replaceNodeTags(self, nodeList, newTagName):
        """
        /**
        * Iterates over a NodeList, and calls _setNodeTag for each node.
        *
        * @param NodeList nodeList The nodes to operate on
        * @param String newTagName the new tag name to use
        * @return void
        */
        """
        # Avoid ever operating on live node lists.
        # if (self._docJSDOMParser and nodeList["_isLiveNodeList"]):
            # raise Exception("Do not pass live node lists to _replaceNodeTags")
        
        for node in nodeList:
            self._setNodeTag(node, newTagName) # todo
  
    def _forEachNode(self, nodeList, fn):
        """
        /**
        * Iterate over a NodeList, which doesn't natively fully implement the Array
        * interface.
        *
        * For convenience, the current object context is applied to the provided
        * iterate function.
        *
        * @param  NodeList nodeList The NodeList.
        * @param  Function fn       The iterate function.
        * @return void
        */
        """
        for n in nodeList:
            fn(n) # todo - maybe should be n = fn(n)

    def _findNode(self, nodeList, fn):
        """
        /**
        * Iterate over a NodeList, and return the first node that passes
        * the supplied test function
        *
        * For convenience, the current object context is applied to the provided
        * test function.
        *
        * @param  NodeList nodeList The NodeList.
        * @param  Function fn       The test function.
        * @return void
        */"""
        for n in nodeList:
            if fn(n):
                return n
            
    def _someNode(self, nodeList, fn):
        """
        /**
        * Iterate over a NodeList, return true if any of the provided iterate
        * function calls returns true, false otherwise.
        *
        * For convenience, the current object context is applied to the
        * provided iterate function.
        *
        * @param  NodeList nodeList The NodeList.
        * @param  Function fn       The iterate function.
        * @return Boolean
        */
        """
        for n in nodeList:
            if fn(n):
                return True
            return False
        
    def _everyNode(self, nodeList, fn):
        """
        /**
        * Iterate over a NodeList, return true if all of the provided iterate
        * function calls return true, false otherwise.
        *
        * For convenience, the current object context is applied to the
        * provided iterate function.
        *
        * @param  NodeList nodeList The NodeList.
        * @param  Function fn       The iterate function.
        * @return Boolean
        */
        """
        if [fn(n) for n in nodeList].all():
            return True
        return False
    
    def _getAllNodesWithTag(self, node, tagNames):
        if node.select:
            selector = ",".join(tagNames)
            return node.select(selector)
        else:
            nodes = []
            for tag in tagNames:
                collection = node.find_all(tag)
                if isinstance(collection, list):
                    nodes.extend(collection)
                else:
                    nodes.append(collection)
        return nodes

    def _unescapeHtmlEntities(self, str):
        """
        /**
        * Converts some of the common HTML entities in string to their corresponding characters.
        *
        * @param str {string} - a string to unescape.
        * @return string without HTML entity.
        */
        """
        if not str:
            return str

        str = unescape(str)
        str = re.sub(r"&(#?[\w\d]+);", lambda match: BeautifulSoup(match.group(0), "html.parser").text, str)
        return str
 
    def _getArticleMetadata(self, jsonld):
        """
        /**
        * Attempts to get excerpt and byline metadata for the article.
        *
        * @param {Object} jsonld — object containing any metadata that
        * could be extracted from JSON-LD object.
        *
        * @return Object with optional "excerpt" and "byline" properties
        */
        """
        metadata = {}
        values = {}
        metaElements = self._doc.find_all("meta")

        # property is a space-separated list of values
        propertyPattern = re.compile(r'\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name)\s*', re.I)
        
        # name is a single value
        namePattern = re.compile(r'^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name)\s*$', re.I)

        # Find description tags
        for element in metaElements:
            elementName = element.get('name')
            elementProperty = element.get('property')
            content = element.get('content')
            if not content:
                continue
            matches = None
            name = None
            if elementProperty:
                matches = propertyPattern.match(elementProperty)
                if matches:
                    # Convert to lowercase, and remove any whitespace
                    # so we can match below.
                    name = matches.group(0).lower().replace(' ', '')
                    # multiple authors
                    values[name] = content.strip()
            if not matches and elementName and namePattern.match(elementName):
                name = elementName
                if content:
                    # Convert to lowercase, remove any whitespace, and convert dots
                    # to colons so we can match below.
                    name = name.lower().replace(' ', '').replace('.', ':')
                    values[name] = content.strip()

        # get title
        metadata['title'] = jsonld.get('title') or \
                            values.get('dc:title') or \
                            values.get('dcterm:title') or \
                            values.get('og:title') or \
                            values.get('weibo:article:title') or \
                            values.get('weibo:webpage:title') or \
                            values.get('title') or \
                            values.get('twitter:title')
        if not metadata['title']:
            metadata['title'] = self._getArticleTitle()

        # get author
        metadata['byline'] = jsonld.get('byline') or \
                            values.get('dc:creator') or \
                            values.get('dcterm:creator') or \
                            values.get('author')

        # get description
        metadata['excerpt'] = jsonld.get('excerpt') or \
                            values.get('dc:description') or \
                            values.get('dcterm:description') or \
                            values.get('og:description') or \
                            values.get('weibo:article:description') or \
                            values.get('weibo:webpage:description') or \
                            values.get('description') or \
                            values.get('twitter:description')

        # get site name
        metadata['siteName'] = jsonld.get('siteName') or \
                                values.get('og:site_name')

        # in many sites the meta value is escaped with HTML entities,
        # so here we need to unescape it
        metadata['title'] = self._unescapeHtmlEntities(metadata['title'])
        metadata['byline'] = self._unescapeHtmlEntities(metadata['byline'])
        metadata['excerpt'] = self._unescapeHtmlEntities(metadata['excerpt'])
        metadata['siteName'] = self._unescapeHtmlEntities(metadata['siteName'])

        return metadata
        
    def _unwrapNoscriptImages(self, doc):
        """
        /**
        * Find all <noscript> that are located after <img> nodes, and which contain only one
        * <img> element. Replace the first image with the image from inside the <noscript> tag,
        * and remove the <noscript> tag. This improves the quality of the images we use on
        * some sites (e.g. Medium).
        *
        * @param Element
        **/
        """
        # Find img without source or attributes that might contains image, and remove it.
        # This is done to prevent a placeholder img is replaced by img from noscript in next step.
        imgs = doc.find_all('img')
        for img in imgs:
            for attr in img.attrs:
                if attr in ['src', 'srcset', 'data-src', 'data-srcset']:
                    break
                if re.search(r'\.(jpg|jpeg|png|webp)$', attr):
                    break
            else:
                img.extract()

        # Next find noscript and try to extract its image
        noscripts = doc.find_all('noscript')
        for noscript in noscripts:
            # Parse content of noscript and make sure it only contains image
            # tmp = BeautifulSoup(noscript.contents[0], 'html.parser')
            if not len(noscript.find_all('img')) == 1:
                continue

            # If noscript has previous sibling and it only contains image,
            # replace it with noscript content. However we also keep old
            # attributes that might contains image.
            prevElement = noscript.previous_sibling
            if prevElement and len(prevElement.find_all('img')) == 1:
                prevImg = prevElement.find('img')
                newImg = noscript.find('img')
                for attr, value in prevImg.attrs.items():
                    if value == "":
                        continue
                    if attr in ['src', 'srcset'] or re.search(r'\.(jpg|jpeg|png|webp)$', value):
                        if newImg.get(attr) == value:
                            continue
                        attrName = attr
                        if newImg.has_attr(attrName):
                            attrName = 'data-old-' + attrName
                        newImg[attrName] = value
                prevElement.replace_with(noscript.contents[0])

    def _removeScripts(doc):
        """
        /**
        * Removes script tags from the document.
        *
        * @param Element
        **/
        """
        for script in doc.find_all(['script', 'noscript']):
            script.extract()

    def _isElementWithoutContent(self, node: element.Tag):
        return (type(node) == element.Tag
                and len(node.get_text().strip()) == 0
                and (len(node.children) == 0
                    or len(node.children) == (len(node.find_all("br")) + len(node.find_all("hr")))))

    def _hasChildBlockElement(self, element):
        """
        /**
        * Determine whether element has any children block level elements.
        *
        * @param Element
        */
        """
        return any(
            [node.name in self.DIV_TO_P_ELEMS or self._hasChildBlockElement(node) for node in element.contents]
        )

    def _isPhrasingContent(self, node: element.Tag) -> bool:
        """
        /***
        * Determine if a node qualifies as phrasing content.
        * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
        **/
        """
        # if node.name is None:  # node is a NavigableString
        if type(node) == element.NavigableString:
            return True
        if node.name.upper() in self.PHRASING_ELEMS:
            return True
        if node.name in ["a", "del", "ins"]:
            return all(self._isPhrasingContent(child) for child in node.contents)
        return False

    def _isWhitespace(self, node: element.Tag) -> bool:
        if type(node) == element.NavigableString:
            return node.strip() == ""
        if node.name == "br":
            return True
        return False

    def _getInnerText(self, e: element.Tag, normalizeSpaces: bool = True) -> str:
        """
        /**
        * Get the inner text of a node - cross browser compatibly.
        * This also strips out any excess whitespace to be found.
        *
        * @param Element
        * @param Boolean normalizeSpaces (default: true)
        * @return string
        **/
        """
        textContent = e.get_text().strip()

        if normalizeSpaces:
            return self.REGEXPS("normalize").sub(" ", textContent)
        return textContent

    def _getCharCount(self, e: element.Tag, s: str = ",") -> int:
        """
        /**
        * Get the number of times a string s appears in the node e.
        *
        * @param Element
        * @param string - what to split on. Default is ","
        * @return number (integer)
        **/
        """
        return len(self._getInnerText(e).split(s)) - 1
    
    def _cleanStyles(self, e: element.Tag) -> None:
        if not e or e.name == "svg":
            return
        
        # Remove `style` and deprecated presentational attributes
        for attr in self.PRESENTATIONAL_ATTRIBUTES:
            e.attrs.pop(attr, None)
        
        if e.name.upper() in self.DEPRECATED_SIZE_ATTRIBUTE_ELEMS:
            e.attr.pop("width")
            e.attr.pop("height")
        
        for child in e.children:
            if isinstance(child, element.Tag):
                self._cleanStyles(child)

    def _getLinkDensity(self, element: element.Tag) -> Union[int, float]:
        """
        /**
        * Get the density of links as a percentage of the content
        * This is the amount of text that is inside a link divided by the total text in the node.
        *
        * @param Element
        * @return number (float)
        **/
        """
        textLength = len(self._getInnerText(element))
        if textLength == 0:
            return 0
        
        linkLength = 0
        for linkNode in element.select("a"):
            href = linkNode.get("href")
            coefficient = 0.3 if href and self.REGEXPS["hashUrl"].match(href) else 1
            linkLength += len(self._getInnerText(linkNode)) * coefficient
        
        return linkLength / textLength
    
    def _getClassWeight(self, e: element.Tag) -> int:
        """
        /**
        * Get an elements class/id weight. Uses regular expressions to tell if this
        * element looks good or bad.
        *
        * @param Element
        * @return number (Integer)
        **/
        """
        if not self.flagIsActive(self.FLAG_WEIGHT_CLASSES):
            return 0
        
        weight = 0

        # Look for a special classname
        class_name = e.get("class")
        if class_name and isinstance(class_name, list):
            class_name = " ".join(class_name)
            if self.REGEXPS["negative"].search(class_name):
                weight -= 25
            if self.REGEXPS["positive"].search(class_name):
                weight += 25
        
        # Look for a special ID
        element_id = e.get("id")
        if element_id:
            if self.REGEXPS["negative"].search(element_id):
                weight -= 25
            if self.REGEXPS["positive"].search(element_id):
                weight += 25

        return weight

    def _getArticleTitle(self):
        curTitle = ""
        origTitle = ""

        try:
            curTitle = origTitle = self._doc.title.string.strip()

            # If they had an element with id "title" in their HTML
            if not isinstance(curTitle, str):
                curTitle = origTitle = self._getInnerText(self._doc.find("title"))
        except:
            # ignore exceptions setting the title.
            pass

        titleHadHierarchicalSeparators = False

        def wordCount(str):
            return len(str.split())

        # If there's a separator in the title, first remove the final part
        if re.search(r' [\|\-\\\/>»] ', curTitle):
            titleHadHierarchicalSeparators = re.search(r' [\\\/>»] ', curTitle)
            curTitle = re.sub(r'(.*)[\|\-\\\/>»] .*', r'\1', origTitle)

            # If the resulting title is too short (3 words or fewer), remove
            # the first part instead:
            if wordCount(curTitle) < 3:
                curTitle = re.sub(r'[^\|\-\\\/>»]*[\|\-\\\/>»](.*)', r'\1', origTitle)
            elif curTitle.find(": ") != -1:
                # Check if we have an heading containing this exact string, so we
                # could assume it's the full title.
                headings = self._concatNodeLists(
                    self._doc.find_all(["h1", "h2"])
                )
                trimmedTitle = curTitle.strip()
                match = any([heading.get_text().strip() == trimmedTitle for heading in headings])

                # If we don't, let's extract the title out of the original title string.
                if not match:
                    curTitle = origTitle[origTitle.rfind(":") + 1:]

                    # If the title is now too short, try the first colon instead:
                    if wordCount(curTitle) < 3:
                        curTitle = origTitle[origTitle.find(":") + 1:]
                        # But if we have too many words before the colon there's something weird
                        # with the titles and the H tags so let's just use the original title instead
                    elif wordCount(origTitle[0:origTitle.find(":")]) > 5:
                        curTitle = origTitle
            elif len(curTitle) > 150 or len(curTitle) < 15:
                hOnes = self._doc.find_all("h1")

                if len(hOnes) == 1:
                    curTitle = self._getInnerText(hOnes[0])

            curTitle = re.sub(self.REGEXPS["normalize"], " ", curTitle.strip())
            # If we now have 4 words or fewer as our title, and either no
            # 'hierarchical' separators (\, /, > or ») were found in the original
            # title or we decreased the number of words by more than 1 word, use
            # the original title.
            curTitleWordCount = wordCount(curTitle)
            if (curTitleWordCount <= 4 and
                (not titleHadHierarchicalSeparators or
                curTitleWordCount != wordCount(re.sub(r'[\|\-\\\/>»]+', '', origTitle)) - 1)):
                curTitle = origTitle
            return curTitle

    def _prepDocument(self):
        """
        /**
        * Prepare the HTML document for readability to scrape it.
        * This includes things like stripping javascript, CSS, and handling terrible markup.
        *
        * @return void
        **/
        """
        doc = self._doc

        # Remove all style tags in head
        self._removeNodes(self._getAllNodesWithTag(doc, ["style"]))

        if doc.body:
            self._replaceBrs(doc.body)

        self._replaceNodeTags(self._getAllNodesWithTag(doc, ["font"]), "span")

    def _nextNode(self, node):
        """
        /**
        * Finds the next node, starting from the given node, and ignoring
        * whitespace in between. If the given node is an element, the same node is
        * returned.
        */
        """
        next_node = node
        while next_node and (type(next_node) == element.Tag) and self.REGEXPS['whitespace'].search(next_node.textContent):
            next_node = next_node.nextSibling
        return next_node

    def _replaceBrs(self, elem):
        """
        /**
        * Replaces 2 or more successive <br> elements with a single <p>.
        * Whitespace between <br> elements are ignored. For example:
        *   <div>foo<br>bar<br> <br><br>abc</div>
        * will become:
        *   <div>foo<br>bar<p>abc</p></div>
        */
        """
        brs = elem.find_all('br')
        for br in brs:
            next = br.next_sibling

            # Whether 2 or more <br> elements have been found and replaced with a
            # <p> block.
            replaced = False

            # If we find a <br> chain, remove the <br>s until we hit another node
            # or non-whitespace. This leaves behind the first <br> in the chain
            # (which will be replaced with a <p> later).
            while next and next.name == 'br':
                replaced = True
                br_sibling = next.next_sibling
                next.extract()
                next = br_sibling

            # If we removed a <br> chain, replace the remaining <br> with a <p>. Add
            # all sibling nodes as children of the <p> until we hit another <br>
            # chain.
            if replaced:
                p = self._doc.new_tag("p")
                br.replace_with(p)

                next = p.next_sibling
                while next:
                    # If we've hit another <br><br>, we're done adding children to this <p>.
                    if next.name == 'br':
                        next_elem = self._nextNode(next.next_sibling)
                        if next_elem and next_elem.name == 'br':
                            break

                    if not self._isPhrasingContent(next):
                        break

                    # Otherwise, make this node a child of the new <p>.
                    sibling = next.next_sibling
                    p.append(next)
                    next = sibling

                while p.last_child and self._is_whitespace(p.last_child):
                    p.last_child.extract()

                if p.parent.name == "p":
                    self._setNodeTag(p.parent, "div")

    def _setNodeTag(self, node, tag):
        print("_setNodeTag", node, tag)

        replacement = self._doc.new_tag(tag)
        while node.contents:
            replacement.append(node.contents[0])
        node.replace_with(replacement)
        
        print("Will this line error?")
        if node.readability:
            replacement.readability = node.readability

        for attr, value in node.attrs.items():
            try:
                replacement[attr] = value
            except Exception as ex:
                """
                It's possible for setAttribute() to throw if the attribute name isn't a valid XML Name.
                Such attributes can however be parsed from source in HTML docs, see
                https://github.com/whatwg/html/issues/4275, so we can hit them here and then throw. We
                don't care about such attributes so we ignore them.
                """
                print("_setNodetag Error - ", ex)
        return replacement

    def _headerDuplicatesTitle(self, node):
        """
        /**
        * Check if this node is an H1 or H2 element whose content is mostly
        * the same as the article title.
        *
        * @param Element  the node to check.
        * @return boolean indicating whether this is a title-like header.
        */
        """
        if (node.name not in ["h1", "h2"]):
            return False
        heading = self._getInnerText(node, False)
        print("Evaluating similarity of header: ", heading, self._articleTitle)
        return self._textSimilarity(self._articleTitle, heading) > 0.75

    def _flagIsActive(self, flag):
        return (self._flags & flag) > 0
    
    def _removeFlag(self, flag):
        self._flags = self._flags & ~flag

    def _isProbablyVisible(self, node: element.Tag):
        # Have to null-check node.style and node.className.indexOf to deal with SVG and MathML nodes.
        return (
            (not node.attrs.get("style", None) or "display: none" not in node.attrs.get("style", ""))
            and not node.attrs.get("hidden", None)
            # check for "fallback-image" so that wikimedia math images are displayed
            and (("aria-hidden" not in node.attrs)
                 or node.attrs.get("aria-hidden", "False") != "true"
                 or ("fallback-image" not in node.name))
        )
    
    def parse(self):
        # Avoid parsing too large documents, as per configuration option
        if self._maxElemsToParse > 0:
            numTags = len(self._doc.find_all())
            if numTags > self._maxElemsToParse:
                raise ValueError(f"Aborting parsing document; {numTags} elements found")

        # Unwrap image from noscript
        self._unwrapNoscriptImages(self._doc)

        # Extract JSON-LD metadata before removing scripts
        jsonLd = {} # if self._disableJSONLD else self._getJSONLD(self._doc)

        # Remove script tags from the document.
        # self._removeScripts(this._doc)
        for script in self._doc(["script", "style"]):
            script.extract()

        self._prepDocument()

        metadata = self._getArticleMetadata(jsonLd)
        self._articleTitle = metadata["title"]
        print(self._articleTitle)

        articleContent = self._grabArticle()
        if not articleContent:
            return None

        # No fancy logging yet
        print(f"Grabbed: {articleContent.text}")

        self._postProcessContent(articleContent)

        # If we haven't found an excerpt in the article's metadata, use the article's
        # first paragraph as the excerpt. This is used for displaying a preview of
        # the article's content.
        if not metadata.excerpt:
            paragraphs = articleContent.find_all("p")
            if paragraphs:
                metadata.excerpt = paragraphs[0].text.strip()

        textContent = articleContent.text
        return {
            "title": self._articleTitle,
            "byline": metadata.byline or self._articleByline,
            "dir": self._articleDir,
            "lang": self._articleLang,
            "content": str(articleContent),
            "textContent": textContent,
            "length": len(textContent),
            "excerpt": metadata.excerpt,
            "siteName": metadata.siteName or self._articleSiteName,
        }
        


        

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 1297)

In [57]:
response = requests.get("https://www.builtinla.com/job/engineer/full-stack-software-engineer/132124")

In [81]:
reader = Readability(response.text)

In [82]:
reader.parse()

_setNodeTag <p><em><span><span><span><span><span><span><span><span><span>*VISA Sponsorship is NOT available for this position*</span></span></span></span></span></span></span></span></span></em><p><span><span>Salary will depend on various factors including applicant's prior relevant job experience, skill set, and geographic location. People Science offers a benefit package for full-time employees which includes: Medical, Dental, Vision, Flexible Spending Account, Life Insurance, DepCare FSA, Flexible Vacation Time Policy, Holidays, and Employee Stock Options. People Science reserves the right to amend, change, alter, and revise pay ranges and benefits offerings at any time. It is at the Company's discretion to determine what pay is provided to a candidate within the range associated with the role.</span></span></p></p> div
Will this line error?
Full-Stack Software Engineer (Greater LA Area, CA or Remote) - People Science


TypeError: _grabArticle() missing 1 required positional argument: 'page'