# Infobox extraction from gamepedia

Extracts infobox data from the locally stored HTML pages of the [Zelda wiki](https://zelda.gamepedia.com/Main_Page) (the `gamepedia` source) and generates two files:

### `info/infoboxes.gamepedia.json`:

```
"Link.html": {
    "FIRST_APPEARANCE": [
      [
        "<i>\n<a href=\"The_Legend_of_Zelda.html\" title=\"The Legend of Zelda\">\n           The Legend of Zelda\n          </a>\n</i>",
        "tag"
      ],
      [
        "(",
        "string"
      ],
      [
        "<a href=\"1987.html\" title=\"1987\">\n          1987\n         </a>",
        "tag"
      ]
    ],
    "APPEARS_...
```

### `info/all_properties.gamepedia.json`:


```
[
  "KINDRED",
  "COUNTRY",
  "RACE",
  "WEAPON",
  "HOMELAND",...
```


In [None]:
from bs4 import BeautifulSoup
from glob import glob

from ie_conf import get_htmls_route
from functions import infobox_clean_url, infobox_get_canonical_relation

import pandas as pd

import re
import os

# Base path that is prefixed to every page filename when the HTML files are opened.
html_route = get_htmls_route("gamepedia")

# Entity table for the gamepedia source; its first CSV column becomes the index.
ENTITIES_CSV = "info/entities.gamepedia.csv"
dataframe = pd.read_csv(ENTITIES_CSV, index_col=0)

In [None]:
import json
from bs4.element import NavigableString as string
from bs4.element import Tag as tag

# Short labels written to the JSON output, keyed by the repr of the bs4 node
# class. Node classes not listed here are stored under their full class repr.
types = {
    "<class 'bs4.element.NavigableString'>": "string",
    "<class 'bs4.element.Tag'>": "tag",
}

# filename -> {relation -> [[markup, kind], ...]}
infoboxes = {}
# Every relation name seen in any infobox.
all_properties = set()

# Only the 'page' column is needed, so iterate it directly instead of
# materialising a full row Series per entity with iterrows().
for filename in dataframe['page']:
    with open(html_route + filename, "r", encoding="utf8") as r:
        soup = BeautifulSoup(r, "lxml")

    bodyContent = soup.find('div', {'id': 'bodyContent'})
    if not bodyContent:
        continue

    # First table inside the page body carrying any of these CSS classes.
    wikitable = bodyContent.find('table', {'class': ['infobox-curse-ad', 'wikitable', 'to']})
    if not wikitable:
        continue

    page_relations = {}
    infoboxes[filename] = page_relations

    # find_all is the supported spelling; findAll is a deprecated alias.
    for tr in wikitable.find_all('tr'):
        th = tr.find('th')
        # Only a <td> that is a direct child of the row counts as its value cell.
        values = tr.find('td', recursive=False)
        if not th or not values:
            continue

        # NOTE(review): presumably normalises the header text to a canonical
        # property name -- confirm against functions.infobox_get_canonical_relation.
        relation = infobox_get_canonical_relation(th.text.strip())
        all_properties.add(relation)

        # Keep each non-blank child node of the cell as [markup, kind].
        # A later row with the same relation replaces the earlier one.
        entries = []
        for c in values.contents:
            markup = str(c).strip()
            if not markup:
                continue
            type_repr = str(type(c))
            entries.append([markup, types.get(type_repr, type_repr)])
        page_relations[relation] = entries

In [None]:
import json
# Persist the per-page infobox contents.
with open("info/infoboxes.gamepedia.json", "w", encoding="utf8") as w:
    json.dump(infoboxes, w, indent=4)

# Persist the relation names. A set has no stable iteration order across
# interpreter runs, so sort it to make the generated file reproducible.
# key=str keeps the sort well-defined should a relation not be a string.
with open("info/all_properties.gamepedia.json", "w", encoding="utf8") as w:
    json.dump(sorted(all_properties, key=str), w, indent=4)

In [None]:
# Report how many pages yielded an infobox and how many distinct relations were seen.
entity_count = len(infoboxes)
relation_count = len(all_properties)
print("Entities %d" % entity_count)
print("Possible identified relationships %d" % relation_count)

# Page filenames that have an infobox, in insertion order.
keys = list(infoboxes)