# Infobox extraction from wikia

Extracts infobox information from the `wikia`-based [Zeldapedia](http://zelda.wikia.com/wiki/Wiki) and generates two files:

### `info/infoboxes.wikia.json`:

```
"Link.html": {
    "FIRST_APPEARANCE": [
      [
        "<i>\n<a href=\"The_Legend_of_Zelda.html\" title=\"The Legend of Zelda\">\n           The Legend of Zelda\n          </a>\n</i>",
        "tag"
      ],
      [
        "(",
        "string"
      ],
      [
        "<a href=\"1987.html\" title=\"1987\">\n          1987\n         </a>",
        "tag"
      ]
    ],
    "APPEARS_...
```

### `info/all_properties.wikia.json`:


```
[
  "KINDRED",
  "COUNTRY",
  "RACE",
  "WEAPON",
  "HOMELAND",...
```


In [None]:
from bs4 import BeautifulSoup
from glob import glob

from ie_conf import get_htmls_route
from functions import infobox_clean_url, infobox_get_canonical_relation

import pandas as pd

import re
import os

# Base location of the HTML pages for the "wikia" source, as reported by
# ie_conf.get_htmls_route. Page file names are appended to it directly later
# on, so it presumably already ends with a path separator — TODO confirm.
html_route = get_htmls_route("wikia")

# Entity table read from CSV (first column used as the index); its 'page'
# column is read later as the HTML file name of each entity.
dataframe = pd.read_csv("info/entities.wikia.csv", index_col=0)

In [None]:
import json
from bs4.element import NavigableString as string
from bs4.element import Tag as tag

# Maps the repr of a BeautifulSoup node class to the short label written to the
# JSON output. Node classes missing here fall back to the raw repr string.
types = {
    "<class 'bs4.element.NavigableString'>":"string",
    "<class 'bs4.element.Tag'>":"tag"
}

# page file name -> {relation -> [[serialized node, node type label], ...]}
infoboxes = {}
# Every relation name that was stored for at least one page.
all_properties = set()

for i, row in dataframe.iterrows():
    filename = row['page']
    # Plain concatenation: html_route is expected to already end with a path
    # separator (NOTE(review): confirm against get_htmls_route).
    file = html_route + filename

    with open(file, "r", encoding="utf8") as r:
        soup = BeautifulSoup(r, "lxml")

    wikiaMainContent = soup.find('article', {'id':'WikiaMainContent'})
    if not wikiaMainContent:
        continue

    infobox = wikiaMainContent.find('aside', {'class':'portable-infobox'})
    if not infobox:
        continue

    # A page with an infobox always gets an entry, even if no item is usable.
    infoboxes[filename] = {}

    # find_all is the supported spelling; findAll is a deprecated alias.
    for item in infobox.find_all('div', {'class': 'pi-item'}):
        h3 = item.find('h3')
        if not h3:
            continue

        values = item.find('div', {'class':'pi-data-value'}, recursive=False)
        if values is None:
            # Labelled item without a direct value container: there is nothing
            # to store, and dereferencing `values.contents` would raise
            # AttributeError and abort the whole extraction.
            continue

        # The label text is normalized by the project helper before being
        # used as the relation key.
        relation = infobox_get_canonical_relation(h3.text.strip())
        all_properties.add(relation)

        entries = []
        for c in values.contents:
            text = str(c).strip()
            if not text:
                # Skip nodes that serialize to nothing but whitespace.
                continue
            type_name = str(type(c))
            entries.append([text, types.get(type_name, type_name)])
        infoboxes[filename][relation] = entries

In [None]:
import json
# Persist the per-page infobox data (page file name -> relation -> values).
with open("info/infoboxes.wikia.json", "w", encoding="utf8") as w:
    json.dump(infoboxes, w, indent=4)
# A set is not JSON serializable and iterates in an arbitrary order, so the
# relation names are written as a sorted list to keep the file identical
# between runs over the same input.
with open("info/all_properties.wikia.json", "w", encoding="utf8") as w:
    json.dump(sorted(all_properties), w, indent=4)

In [None]:
# Report how many pages yielded an infobox and how many distinct relation
# names were collected, then keep the page names around as a list.
entity_total = len(infoboxes)
relation_total = len(all_properties)
print(f"Entities {entity_total}")
print(f"Possible identified relationships {relation_total}")
keys = list(infoboxes)