# Title-Link relationship   

This notebook generates the mapping between each page title and its unique HTML page.

It produces two CSV files, `info/entities.wikia.csv` and `info/entities.gamepedia.csv`, with the following structure (the actual column headers are lowercase: `id`, `name`, `page`):

| ID   | Name            | Page                 |
| ---- |:---------------:| --------------------:|
| 3    | Link            | Link.html            |
| ...  | ...             | ...                  |
| 9    | Hyrule Warriors | Hyrule_Warriors.html |

In [None]:
import os
import pandas as pd
from ie_conf import get_htmls_route
from glob import glob
from bs4 import BeautifulSoup

In [None]:
def get_wikia_page_title(file):
    """Return the page title stored in a saved Wikia page, or None.

    The file is read as UTF-8 and parsed with the lxml parser. The title is
    taken from the ``title`` attribute of the
    ``<article id="WikiaMainContent">`` element.

    Returns None when that element is absent, or when its ``title``
    attribute is missing or empty.
    """
    with open(file, "r", encoding="utf8") as handle:
        document = BeautifulSoup(handle, "lxml")

    main_content = document.find('article', {'id': 'WikiaMainContent'})
    if main_content is None:
        return None

    # A missing attribute (None) and an empty one ("") both mean "no title".
    page_title = main_content.get('title', None)
    return page_title if page_title else None

def get_gamepedia_page_title(file):
    """Return the page title stored in a saved Gamepedia page, or None.

    The file is read as UTF-8 and parsed with the lxml parser. The title is
    taken from the ``title`` attribute of the ``<div id="bodyContent">``
    element.

    Returns None when that element is absent, or when its ``title``
    attribute is missing or empty.
    """
    with open(file, "r", encoding="utf8") as handle:
        document = BeautifulSoup(handle, "lxml")

    body_content = document.find('div', {'id': 'bodyContent'})
    if body_content is None:
        return None

    # A missing attribute (None) and an empty one ("") both mean "no title".
    page_title = body_content.get('title', None)
    return page_title if page_title else None

In [None]:
# Each site is paired (by position) with the function that extracts a title
# from one of its saved pages.
sites = ["wikia", "gamepedia"]
title_functions = [get_wikia_page_title, get_gamepedia_page_title]

# Titles and file names collected across all sites.
names = set()
urls = set()

for site, title_function in zip(sites, title_functions):
    root = get_htmls_route(site)

    # `root` is used as a plain string prefix for the glob pattern.
    # Files are visited in sorted path order, so ids follow that order.
    all_files = sorted(glob(root + "*.html"))

    # Ids restart at 1 for every site and are only consumed by pages that
    # yielded a title.
    i = 1
    entities_lst = []
    reverse = {}  # page file name -> id assigned within this site
    for file in all_files:
        node = os.path.basename(file)
        title = title_function(file)
        if title is None:
            # No usable title: report the file and leave it out of the CSV.
            print(file)
            continue

        names.add(title)
        urls.add(node)

        entities_lst.append([i, title, node])
        reverse[node] = i

        i += 1

    # One CSV per site, indexed by the id assigned above.
    entities_df = pd.DataFrame(entities_lst, columns=['id', 'name', 'page'])
    entities_df = entities_df.set_index('id')
    entities_df.to_csv('info/entities.' + site + '.csv', encoding='utf8')
    print(site, "had", len(entities_df), "entities.")

## Get unique names and URLs

In [None]:
# `names` and `urls` are sets of strings, and their iteration order can change
# from one interpreter run to the next (string hash randomization). Sorting
# makes the CSV row order, and the index column pandas writes, reproducible.
unique_names = pd.DataFrame({'name': sorted(names)})
unique_urls = pd.DataFrame({'url': sorted(urls)})

# Encoding is passed explicitly to match the per-site entity CSVs.
unique_names.to_csv("info/entities.csv", encoding='utf8')
unique_urls.to_csv("info/urls.csv", encoding='utf8')

print("Unique names", len(unique_names))
print("Unique urls", len(unique_urls))
