# Extract links between pages  

Dividing them into three categories:
 - `external`: links to resources outside the wiki
 - `inlinks`: valid links between pages
 - `unknown`: links that need further research

In [1]:
from glob import glob

# Where are all those htmls?
html_route = r"C:\Corpora\zelda-wikia2-clean\\"

source_dir = html_route + "*.html" 
source_dir_template = html_route + "%s" 

files  = list(glob(source_dir))

In [3]:
from bs4 import BeautifulSoup
import os

from urllib.parse import urlparse
from urllib.parse import unquote

def clean_url(url):
    parsed = urlparse(url)
    path = unquote(parsed.path)
    if path.startswith("../"):
        path = path[3:]
    path = path.replace("/", "%2F")
    query = None if parsed.query == '' else parsed.query
    fragment = None if parsed.fragment == '' else parsed.fragment
    return (path, query, fragment)
    
pages = {}

for file in files:
    file_name = os.path.basename(file)
    #print(file_name)
    page:BeautifulSoup = None
    with open(file, "r", encoding="utf8") as r:
        page = BeautifulSoup(r, "lxml")
    wikiaMainContent = page.find('article', {'id':'WikiaMainContent'})
    if file_name not in pages:
        pages[file_name] = {}
    if wikiaMainContent is None:
        print(file)
        continue
    anchors = wikiaMainContent.findAll('a')
    for anchor in anchors:
        items = clean_url(anchor.get('href', ''))
        anchor_text = anchor.text.strip()
        file_url = items[0]
        if "external" in file_url:
            if "external" not in pages[file_name]:
                pages[file_name]["external"] = []
            external = {
                'text':anchor_text,
                'url':None if items[1] is None else items[1][5:]
            }
            if items[1]:
                external['url'] = items[1][5:]
            pages[file_name]["external"].append(external)
        elif os.path.exists(source_dir_template % file_url):
            if file_url not in pages:
                pages[file_url] = {}
            if "inlinks" not in pages[file_url]:
                pages[file_url]["inlinks"] = []
            inlink = {
                'text': anchor_text,
                'from': file_name
            }
            if items[2]:
                inlink['fragment'] = items[2]
            pages[file_url]["inlinks"].append(inlink)
        else:
            if "unknown" not in pages[file_name]:
                pages[file_name]["unknown"] = []
            unknown = {
                'text':anchor_text, 
                'url':file_url
            }
            if items[2]:
                unknown['fragment'] = items[2]
            pages[file_name]["unknown"].append(unknown)
        
print(len(pages))

C:\Corpora\zelda-wikia2-clean\index.html
8715


In [None]:
import json

with open("network.json", "w") as w:
    json.dump(pages, w)

In [8]:
import zipfile

print ('Creating network.zip')
zf = zipfile.ZipFile('network.zip', mode='w')
try:
    print('Adding network.json')
    zf.write('network.json', compress_type = zipfile.ZIP_DEFLATED)
finally:
    zf.close()

Creating network.zip
Adding network.json
