# Test Internal Links

Scan HTML pages to confirm that all internal links are working.

In [1]:
import os
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
# Locate the generated site relative to the current working directory.
here = Path.cwd()
html_root = here.joinpath('../docs/').resolve()

In [3]:
# Every HTML file under the site root, in sorted order.
html_files = sorted(html_root.rglob('*.html'))

# Add the Mapbox CSVs to test their links as well
html_files.extend(
    (html_root / csv_path).resolve()
    for csv_path in (
        '../uploads/to-mapbox-label-points-2012-data.csv',
        '../uploads/to-mapbox-label-points-2022-data.csv',
    )
)

In [4]:
# Build one row per <a href="..."> found in each scanned file:
#   destination -- the raw href value
#   source      -- the Path of the file that contains the link
# Per-file frames are collected in a list and concatenated once at the end;
# calling pd.concat inside the loop re-copies every accumulated row on each
# iteration.
frames = []

for h in tqdm(html_files):

    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the scanned files are UTF-8 -- confirm.
    html_text = h.read_text(encoding='utf-8')

    soup = BeautifulSoup(html_text, features='html.parser')

    # href=True skips <a> tags that have no href attribute at all; those
    # would otherwise produce None destinations that the later string and
    # Path operations cannot handle. An empty href ('') is still kept.
    links = [link['href'] for link in soup.find_all('a', href=True)]

    # Files without links contribute no rows, so there is nothing to add.
    if links:
        frames.append(
            pd.DataFrame({'destination': links, 'source': [h] * len(links)})
        )

if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    # Keep the expected columns even when nothing was found.
    df = pd.DataFrame(columns=['destination', 'source'])

HBox(children=(FloatProgress(value=0.0, max=2072.0), HTML(value='')))




In [5]:
# Classify each link. A link is local when it has an href and that href does
# not start with http://, https:// or a protocol-relative //.
# str.match anchors at the start of the string, so a local path that merely
# contains the text "http" is no longer treated as external. na=False keeps
# missing hrefs out of the mask as NaN, which .loc/boolean indexing rejects.
is_external = df.destination.str.match(r'(?:https?:)?//', case=False, na=False)
df['is_local'] = df.destination.notna() & ~is_external

# Independent copy so that columns added later do not touch df.
df_local = df[df.is_local].copy()

In [6]:
# Resolve every local href against the directory of the file it appears in,
# giving an absolute path for the link target.
df_local['destination_resolved'] = [
    (Path(src).parent / Path(dest)).resolve()
    for src, dest in zip(df_local['source'], df_local['destination'])
]

In [7]:
# Total number of local links (one row per link)
df_local.shape[0]

25601

In [8]:
# Number of distinct link destinations as written in the href
df_local['destination'].nunique(dropna=False)
# todo: find orphan HTML pages, those not linked by anything else

4264

In [9]:
# Number of distinct link destinations after resolving to absolute paths
df_local['destination_resolved'].nunique(dropna=False)

2070

In [10]:
# Number of distinct files that contain at least one local link
df_local['source'].nunique(dropna=False)

2072

In [11]:
# Check on disk whether each link target exists. This reuses the already
# computed destination_resolved column instead of joining and resolving
# every source/destination pair a second time.
df_local['exists'] = df_local['destination_resolved'].apply(
    lambda target: target.exists()
)

In [12]:
# links_to_check = df[df.is_local].groupby('destination').size()
# links_to_check

In [13]:
# A link is broken when it is local and its target is missing on disk.
df_local['is_broken'] = df_local['is_local'] & ~df_local['exists']

In [14]:
# Bare file name of the page containing each link (directory stripped).
df_local['source_filename'] = [Path(src).name for src in df_local['source']]

In [15]:
# Number of broken local links
df_local['is_broken'].sum()

0

In [16]:
# Files that contain at least one broken local link
df_local.loc[df_local['is_broken'], 'source'].unique().tolist()

[]

In [17]:
# Link targets that were not themselves scanned as a source file.
# (Orphan pages are the opposite direction: sources that nothing links to.)
# The set is built once so each membership test is a hash lookup rather
# than recomputing .unique() and scanning the array for every element.
scanned_sources = set(df_local.source.unique())
[x for x in df_local.destination_resolved.unique() if x not in scanned_sources]

[PosixPath('/Users/devin/Projects/openanc/uploads')]

In [18]:
# Orphan pages: scanned files that no local link points to.
# The set is built once so each membership test is a hash lookup rather
# than recomputing .unique() and scanning the array for every element.
linked_targets = set(df_local.destination_resolved.unique())
[x for x in df_local.source.unique() if x not in linked_targets]

[PosixPath('/Users/devin/Projects/openanc/docs/404.html'),
 PosixPath('/Users/devin/Projects/openanc/uploads/to-mapbox-label-points-2012-data.csv'),
 PosixPath('/Users/devin/Projects/openanc/uploads/to-mapbox-label-points-2022-data.csv')]

In [29]:
# Show the first link whose target resolves to the uploads directory itself.
# The directory is derived from html_root (the same '../uploads' location the
# Mapbox CSVs are loaded from) instead of a machine-specific absolute path.
uploads_dir = (html_root / '../uploads').resolve()
df_local[df_local.destination_resolved == uploads_dir].iloc[0].squeeze()

destination                                                              
source                  /Users/devin/Projects/openanc/uploads/to-mapbo...
is_local                                                             True
destination_resolved                /Users/devin/Projects/openanc/uploads
exists                                                               True
is_broken                                                           False
source_filename                      to-mapbox-label-points-2012-data.csv
Name: 30156, dtype: object

In [19]:
# sorted(list(df_local[df_local.is_broken].destination.unique()))

In [20]:
# df_local[df_local['source_filename'] == '1A.html']

## External Links

In [21]:
# df[~df.is_local].destination.unique()