# fastlinkcheck API
> API for fast local and online link checking

In [1]:
#export
from fastcore.all import *
from html.parser import HTMLParser
from urllib.parse import urlparse,urlunparse

## Find links in an HTML file

In [2]:
#export
class _HTMLParseAttrs(HTMLParser):
    def reset(self):
        super().reset()
        self.found = set()
    def handle_starttag(self, tag, attrs):
        a = first(v for k,v in attrs if k in ("src","href"))
        if a: self.found.add(a)
    handle_startendtag = handle_starttag

In [3]:
#export
def get_links(fn):
    "Return an `L` of the distinct `src`/`href` values found in the HTML file `fn`"
    parser = _HTMLParseAttrs()
    html = Path(fn).read_text()
    parser.feed(html)
    # `found` is a set, so duplicates are already collapsed
    return L(parser.found)

We can use `get_links` to parse an HTML file for different types of links.  For example, these are the contents of `./example/test.html`:

In [4]:
# relative path, so it is looked up from the notebook's working directory
example = Path('example/test.html')
print(example.read_text())

<a href="//somecdn.com/doesntexist.html"></a>
<a href="http://www.bing.com"></a>
<script src="test.js"></script>
<img src="http://fastlinkcheck.com/test.html" />




Calling `get_links` with the above file path will return a list of links:

In [5]:
links = get_links(example)
expected_links = {'test.js', '//somecdn.com/doesntexist.html',
                  'http://www.bing.com', 'http://fastlinkcheck.com/test.html'}
# `get_links` gives no ordering guarantee, so compare as sets
test_eq(set(links), expected_links)

In [6]:
#export
def _local_url(u, root, host, fname):
    "Change url `u` to local path if it is a local link"
    fpath = Path(fname).parent
    islocal=False
    # remove `host` prefix
    for o in 'http://','https://','http://www.','https://www.':
        if u.startswith(o+host): u,islocal = remove_prefix(u, o+host),True
    # remove params, querystring, and fragment
    p = list(urlparse(u))[:5]+['']
    # local prefix, or no protocol or host
    if islocal or (not p[0] and not p[1]):
        u = p[2]
        if u and u[0]=='/': return (root/u[1:]).resolve()
        else: return (fpath/u).resolve()
    # URLs without a protocol are "protocol relative"
    if not p[0]: p[0]='http'
    # mailto etc are not checked
    if p[0] not in ('http','https'): return ''
    return urlunparse(p)

In [7]:
class _LinkMap(dict):
    """A dict that pretty prints Links and their associated locations."""
    def _repr_locs(self, k): return '\n'.join(f'  - {p}' for p in self[k])
    def __repr__(self):
        rstr = L(f'- {k!r} is found in pages:\n{self._repr_locs(k)}' for k in self).concat()
        return '\n\n'.join(rstr)
    _repr_markdown_ = __repr__

In [8]:
#export
def local_urls(path:Path, host:str):
    "returns a `dict` mapping each locally-resolved link to the list of HTML files under `path` that contain it"
    path=Path(path)
    # both HTML extensions, searched recursively
    pages = list(path.glob('**/*.html')) + list(path.glob('**/*.htm'))
    found = []
    for page in pages:
        page_abs = page.resolve()
        for link in get_links(page):
            found.append((page_abs, _local_url(link, root=path, host=host, fname=page)))
    # group on the resolved link (item 1); the grouped values are the pages (item 0)
    return _LinkMap(groupby(found, 1, 0))

The keys of the `dict` returned by `local_urls` are links found in HTML files, and the values of this `dict` are a list of paths that those links are found in.  

Furthermore, local links are returned as `Path` objects, whereas external URLs are strings.  For example, notice how the link:

```html
<img src="http://fastlinkcheck.com/test.html" />
```

is resolved to a local path, because the `host` parameter supplied to `local_urls` (`fastlinkcheck.com`) matches the host of the URL in the link:

In [9]:
# `path` is reused by later cells as the root directory of the example site
path = Path('./example')
# links whose host is `fastlinkcheck.com` are mapped to files under `path`
links = local_urls(path, host='fastlinkcheck.com')
links

- 'http://www.bing.com' is found in pages:
  - /home/jhoward/git/fastlinkcheck/example/test.html

- 'http://somecdn.com/doesntexist.html' is found in pages:
  - /home/jhoward/git/fastlinkcheck/example/test.html

- Path('/home/jhoward/git/fastlinkcheck/example/test.html') is found in pages:
  - /home/jhoward/git/fastlinkcheck/example/test.html

- Path('/home/jhoward/git/fastlinkcheck/example/test.js') is found in pages:
  - /home/jhoward/git/fastlinkcheck/example/test.html

## Finding broken links

In [12]:
def broken_local(links, ignore_paths=None):
    "`Path` keys of `links` that are not in `ignore_paths` and do not exist"
    skip = setify(ignore_paths)
    missing = []
    for link in links:
        # only `Path` keys are local links; everything else is left alone here
        if not isinstance(link, Path): continue
        if link in skip or link.exists(): continue
        missing.append(link)
    return L(missing)

Since `test.js` does not exist in the `example/` directory, `broken_local` returns this path:

In [13]:
broken_local(links)

(#1) [Path('/home/jhoward/git/fastlinkcheck/example/test.js')]

In [14]:
# `broken_local` must report something, and nothing it reports may exist on disk
broken_paths = broken_local(links)
assert len(broken_paths) > 0 and not any(x.exists() for x in broken_paths)

In [15]:
def broken_urls(links, ignore_urls=None):
    "String (URL) keys of `links` that are not in `ignore_urls` and for which `urlcheck` gives a falsy result"
    skip = setify(ignore_urls)
    candidates = L(link for link in links if isinstance(link, str) and link not in skip)
    # run `urlcheck` over the candidates in a pool of 32 threads
    statuses = parallel(urlcheck, candidates, n_workers=32, threadpool=True)
    # pair each candidate with its result and keep the ones that failed
    failed = [url for url,ok in zip(candidates,statuses) if not ok]
    return L(failed)

Similarly, the URL `http://somecdn.com/doesntexist.html` doesn't exist, which is why it is returned by `broken_urls`:

In [16]:
# NOTE(review): `broken_urls` presumably issues live HTTP requests through `urlcheck`,
# so this check would depend on network access and on the remote hosts — confirm
assert broken_urls(links) == ['http://somecdn.com/doesntexist.html']

In [17]:
@call_parse
def fastlinkcheck(path:Param("Root directory searched recursively for HTML files", str),
                  host:Param("Host and path (without protocol) of web server", str)='',
                  config_file:Param("Location of file with urls to ignore", str)=None):
    "Map each broken link in the HTML files under `path` to the pages that contain it, skipping entries listed in `config_file`"
    path = Path(path)
    if config_file:
        assert Path(config_file).is_file(), f"{config_file} is either not a file or doesn't exist."
        raw_lines = Path(config_file).readlines()
    else: raw_lines = ''
    # one entry per line of the config file, surrounding whitespace removed
    ignore = L(x.strip() for x in raw_lines)
    links = local_urls(path, host=host)
    # valid URLs are ignored as-is; anything else is taken as a path relative to `path`
    ignore_urls,ignore_paths = set(),set()
    for entry in ignore:
        if urlvalid(entry): ignore_urls.add(entry)
        else: ignore_paths.add((path/entry).resolve())
    broken = broken_urls(links, ignore_urls) + broken_local(links, ignore_paths)
    return _LinkMap({k:links[k] for k in broken})

In [18]:
fastlinkcheck(path='./example', host='fastlinkcheck.com')

- 'http://somecdn.com/doesntexist.html' is found in pages:
  - /home/jhoward/git/fastlinkcheck/example/test.html

- Path('/home/jhoward/git/fastlinkcheck/example/test.js') is found in pages:
  - /home/jhoward/git/fastlinkcheck/example/test.html

You can choose to ignore links with a plain-text file containing a list of paths and URLs to ignore.  For example, the file `linkcheck.rc` contains a list of links I want to ignore:

In [19]:
print((path/'linkcheck.rc').read_text())

test.js
https://www.google.com



In this case `example/test.js` will be filtered out from the list:

In [20]:
fastlinkcheck(path='./example', host='fastlinkcheck.com', config_file='example/linkcheck.rc')

- 'http://somecdn.com/doesntexist.html' is found in pages:
  - /home/jhoward/git/fastlinkcheck/example/test.html