# Infobox extraction from wikia

In [None]:
from bs4 import BeautifulSoup
from slugify import slugify
from glob import glob

from urllib.parse import urlparse
from urllib.parse import unquote

import pandas as pd

import re
import os

# Directory holding the wikia HTML pages; glob patterns elsewhere in this file
# are built by appending "*.html" to this string. A raw string literal cannot
# end in a single backslash, hence the doubled one (the value itself ends with
# two backslash characters).
html_route = r"C:\Corpora\zelda-wikia2-clean\\"

In [None]:
def clean_url(url):
    """
    Normalize a link into the (path, query, fragment) triple used by the dataset.

    The path is percent-decoded, loses one leading "../" if present, and has
    every "/" replaced by "%2F". An empty query or fragment is returned as None.
    """
    parts = urlparse(url)

    decoded = unquote(parts.path)
    if decoded.startswith("../"):
        # Only a single leading "../" is removed; any further ones are kept.
        decoded = decoded[len("../"):]
    flattened = "%2F".join(decoded.split("/"))

    query = parts.query if parts.query != '' else None
    fragment = parts.fragment if parts.fragment != '' else None
    return (flattened, query, fragment)

# Matches a parenthesised span greedily: from the first "(" to the last ")"
# within a single line ("." does not match newlines). Written as a raw string
# so the backslashes reach the regex engine without relying on Python keeping
# unrecognised escape sequences, which triggers a warning in recent versions.
parentheses = re.compile(r"\(.+\)")

def get_relation(label):
    """
    Turn an infobox label into a canonical relation name.

    Text matched by the module-level `parentheses` pattern is removed, the
    remainder is stripped and slugified with "_" as the separator, and the
    result is upper-cased.
    """
    without_parens = parentheses.sub('', label)
    slug = slugify(without_parens.strip(), separator='_')
    return slug.upper()

In [None]:
import json
from bs4.element import NavigableString as string
from bs4.element import Tag as tag

# Maps the repr of a BeautifulSoup node class to the short label stored next to
# each extracted value. Classes not listed here keep their full repr as label.
types = {
    "<class 'bs4.element.NavigableString'>": "string",
    "<class 'bs4.element.Tag'>": "tag",
}

# infoboxes: page filename -> {relation -> [[serialized node, node label], ...]}
# all_properties: every relation name stored for at least one page.
infoboxes = {}
all_properties = set()
all_files = sorted(glob(html_route + "*.html"))

for file in all_files:
    filename = os.path.basename(file)

    with open(file, "r", encoding="utf8") as r:
        soup = BeautifulSoup(r, "lxml")

    # Pages without the main article element or without an infobox are skipped.
    wikiaMainContent = soup.find('article', {'id': 'WikiaMainContent'})
    if wikiaMainContent is None:
        continue

    infobox = wikiaMainContent.find('aside', {'class': 'portable-infobox'})
    if infobox is None:
        continue

    infoboxes[filename] = {}

    items = infobox.find_all('div', {'class': 'pi-item'})
    for item in items:
        # The <h3> carries the label of the infobox row; rows without one are ignored.
        h3 = item.find('h3')
        if h3 is None:
            continue

        # Only a direct child value container counts. A labelled row without
        # one used to crash on `values.contents`; it is now skipped and its
        # label is not recorded as a relation.
        values = item.find('div', {'class': 'pi-data-value'}, recursive=False)
        if values is None:
            continue

        relation = get_relation(h3.text.strip())
        all_properties.add(relation)

        # Keep every non-blank child node, serialized, together with its node label.
        entries = []
        for child in values.contents:
            text = str(child).strip()
            if not text:
                continue
            kind = str(type(child))
            entries.append([text, types.get(kind, kind)])
        # A relation that appears twice in one infobox keeps only the last occurrence.
        infoboxes[filename][relation] = entries

In [None]:
import json
# Make sure the output directory exists before writing into it.
os.makedirs("info", exist_ok=True)
with open("info/infoboxes.wikia.json", "w", encoding="utf8") as w:
    json.dump(infoboxes, w, indent=4)
with open("info/all_properties.wikia.json", "w", encoding="utf8") as w:
    # A set iterates in arbitrary order; sort so the file is reproducible across runs.
    json.dump(sorted(all_properties), w, indent=4)

In [None]:
# Report how many pages yielded an infobox and how many distinct relations were seen.
entity_count = len(infoboxes)
relation_count = len(all_properties)
print("Entities %d" % entity_count)
print("Possible identified relationships %d" % relation_count)
# Filenames of the pages that have an infobox, in insertion order.
keys = list(infoboxes)

In [None]:
def get_page_title(file):
    """
    Read the page title stored in an HTML file.

    Returns the `title` attribute of the <article id="WikiaMainContent">
    element, or None when that element is absent or the attribute is
    missing or empty.
    """
    with open(file, "r", encoding="utf8") as handle:
        document = BeautifulSoup(handle, "lxml")

    main_content = document.find('article', {'id': 'WikiaMainContent'})
    if main_content is None:
        return None

    page_title = main_content.get('title', None)
    return page_title if page_title else None
        
# Every HTML page, sorted, so ids follow filename order.
all_files = sorted(glob(html_route + "*.html"))

# i is the next id to hand out: ids start at 1 and advance only for titled pages.
i = 1
entities_lst = []  # rows of [id, title, page filename]
reverse = {}       # page filename -> id
for file in all_files:
    node = os.path.basename(file)
    title = get_page_title(file)
    if title is not None:
        entities_lst.append([i, title, node])
        reverse[node] = i
        i += 1
    else:
        # Pages without a usable title get no id; print their path for inspection.
        print(file)

entities_df = pd.DataFrame(entities_lst, columns=['id', 'name', 'page'])
entities_df = entities_df.set_index('id')
entities_df.to_csv("info/entities.wikia.csv", encoding="utf8")
entities_df.head()