# fastlinkcheck API
> API for fast local and online link checking

In [1]:
#export
from fastcore.all import *
from html.parser import HTMLParser
from urllib.parse import urlparse,urlunparse

## Find links in an HTML file

In [2]:
#export
class _HTMLParseAttrs(HTMLParser):
    def reset(self):
        super().reset()
        self.found = set()
    def handle_starttag(self, tag, attrs):
        a = first(v for k,v in attrs if k in ("src","href"))
        if a: self.found.add(a)
    handle_startendtag = handle_starttag

In [3]:
#export
def get_links(fn):
    """List of all links in file `fn`

    Parses `fn` as HTML and returns an `L` of the distinct `src`/`href` values
    found. The values are gathered in a set, so their order is arbitrary.
    """
    h = _HTMLParseAttrs()
    # Decode explicitly instead of relying on the platform's locale encoding.
    # Undecodable bytes are replaced so a single oddly-encoded page cannot abort a scan.
    h.feed(Path(fn).read_text(encoding='utf-8', errors='replace'))
    return L(h.found)

In [4]:
# Example site directory and host name reused by the demo cells below
path = Path('example')
host = 'fastlinkcheck.com'
# Links found in the sample page
get_links(path/'test.html')

(#4) ['test.js','http://www.bing.com','//somecdn.com/doesntexist.html','http://fastlinkcheck.com/test.html']

In [5]:
#export
def _local_url(u, root, host, fname):
    "Change url `u` to local path if it is a local link"
    fpath = Path(fname).parent
    islocal=False
    # remove `host` prefix
    for o in 'http://','https://','http://www.','https://www.':
        if u.startswith(o+host): u,islocal = remove_prefix(u, o+host),True
    # remove params, querystring, and fragment
    p = list(urlparse(u))[:5]+['']
    # local prefix, or no protocol or host
    if islocal or (not p[0] and not p[1]):
        u = p[2]
        if u and u[0]=='/': return (root/u[1:]).resolve()
        else: return (fpath/u).resolve()
    # URLs without a protocol are "protocol relative"
    if not p[0]: p[0]='http'
    # mailto etc are not checked
    if p[0] not in ('http','https'): return ''
    return urlunparse(p)

In [6]:
#export
def local_urls(path, host):
    """`dict` mapping each link found in the HTML files under `path` to the list of files containing it

    Keys are the values returned by `_local_url`: `Path` objects for local
    links and strings for online URLs. Each value lists the resolved paths of
    the `.html`/`.htm` files in which that link appears.
    """
    path = Path(path)  # also accept a plain string, which has no `.glob`
    fns = L(path.glob('**/*.html'))+L(path.glob('**/*.htm'))
    found = []
    for fn in fns:
        for link in get_links(fn):
            target = _local_url(link, root=path, host=host, fname=fn)
            # `_local_url` returns '' for links that are not checked (mailto etc); leave those out
            if target != '': found.append((fn.resolve(), target))
    return groupby(found, 1, 0)

Locally resolved links are returned as `Path` objects, and online URLs are strings.

In [7]:
# Resolve every link found under `path`; the result is displayed below
links = local_urls(path, host=host)
links

{Path('/home/jhoward/git/fastlinkcheck/example/test.js'): [Path('/home/jhoward/git/fastlinkcheck/example/test.html')],
 'http://www.bing.com': [Path('/home/jhoward/git/fastlinkcheck/example/test.html')],
 'http://somecdn.com/doesntexist.html': [Path('/home/jhoward/git/fastlinkcheck/example/test.html')],
 Path('/home/jhoward/git/fastlinkcheck/example/test.html'): [Path('/home/jhoward/git/fastlinkcheck/example/test.html')]}

## Finding broken links

In [8]:
def broken_local(links):
    "Keys of `links` that are `Path` objects pointing at files that are missing"
    missing = []
    for target in links:
        # string keys are online URLs and are not looked up on disk
        if not isinstance(target,Path): continue
        if not target.exists(): missing.append(target)
    return L(missing)

In [9]:
# Show each missing local file together with the pages that link to it
for link in broken_local(links): print(f"link: {link}\n - files: {links[link]}\n")

link: /home/jhoward/git/fastlinkcheck/example/test.js
 - files: [Path('/home/jhoward/git/fastlinkcheck/example/test.html')]



In [10]:
def broken_urls(links):
    """List of items in keys of `links` that are URLs that return a failure status code

    Only non-empty string keys are checked. Each one is passed to `urlcheck`,
    run through `parallel` with a pool of 32 threads, and the URLs for which
    it returns a false value are returned.
    """
    # '' is the placeholder `_local_url` returns for unchecked schemes (mailto etc);
    # there is nothing to request for it, so it is left out here
    its = L(o for o in links if isinstance(o,str) and o)
    working_urls = parallel(urlcheck, its, n_workers=32, threadpool=True)
    return L(o for o,p in zip(its,working_urls) if not p)

In [11]:
# Online URLs among the keys of `links` that `urlcheck` reports as failing
broken_urls(links)

(#1) ['http://somecdn.com/doesntexist.html']

In [17]:
@call_parse
def fastlinkcheck(path:Param("Root directory searched recursively for HTML files", str),
                  host:Param("Host and path (without protocol) of web server", str)=''):
    "Check the links in all HTML files under `path`, printing failed URLs and missing local files"
    # `path` is declared as `str`, but `local_urls` calls `.glob` on it, so hand over a `Path`
    links = local_urls(Path(path), host=host)
    print('## Failed URLs\n')
    for link in broken_urls(links): print(f"link: {link}\n - files: {links[link]}\n")
    print('## Local missing files\n')
    for link in broken_local(links): print(f"link: {link}\n - files: {links[link]}\n")

In [18]:
# Run the complete report on the example site
fastlinkcheck(path, host)

## Failed URLs



link: http://somecdn.com/doesntexist.html
 - files: [Path('/home/jhoward/git/fastlinkcheck/example/test.html')]

## Local missing files

link: /home/jhoward/git/fastlinkcheck/example/test.js
 - files: [Path('/home/jhoward/git/fastlinkcheck/example/test.html')]



In [7]:
# Alternative inputs, left commented out (presumably for checking a fast.ai site build):
# path = Path('../fast.ai/_site/')
# host = 'fast.ai'