# Basic infobox extraction

In [73]:
from bs4 import BeautifulSoup
from slugify import slugify
from glob import glob

from urllib.parse import urlparse
from urllib.parse import unquote

import pandas as pd

import re
import os

# Where are all those htmls?
# Set the ZELDA_HTML_ROUTE environment variable to point at another copy of
# the corpus; the original location is kept as the default. os.path.join(..., "")
# guarantees a trailing separator, because the path is later concatenated
# directly with file names and glob patterns.
html_route = os.path.join(
    os.environ.get("ZELDA_HTML_ROUTE",
                   "/Users/antonioferegrino/corpora/zelda-wikia2-clean/"),
    "",
)

In [74]:
def clean_url(url):
    """
    Normalise a link so it meets the structure adopted for the dataset.

    The path is percent-decoded, a single leading "../" is removed, and every
    remaining "/" is re-encoded as "%2F".

    Returns a ``(path, query, fragment)`` tuple in which an empty query or
    fragment is reported as ``None``.
    """
    pieces = urlparse(url)

    decoded = unquote(pieces.path)
    # Only the first "../" is dropped; any further ones stay in the result.
    trimmed = decoded[3:] if decoded.startswith("../") else decoded

    return (
        trimmed.replace("/", "%2F"),
        pieces.query if pieces.query != '' else None,
        pieces.fragment if pieces.fragment != '' else None,
    )

# Matches a parenthesised span. The match is greedy: it runs from the first
# "(" to the last ")" on the line. Written as a raw string so that "\(" and
# "\)" are not treated as (invalid) string escape sequences.
parentheses = re.compile(r"\(.+\)")

def get_relation(label):
    """
    Canonicalize an infobox label into a relationship name.

    Text matched by the ``parentheses`` pattern is removed, the remainder is
    stripped and slugified with "_" as separator, and the result is returned
    in upper case.
    """
    without_parens = parentheses.sub('', label)
    slug = slugify(without_parens.strip(), separator='_')
    return slug.upper()

In [75]:
# page file name -> {relation name -> list of cleaned link paths}
infoboxes = {}
# every relation name seen in any infobox
all_properties = set()

for file in glob(html_route + "*.html"):
    filename = os.path.basename(file)

    with open(file, "r", encoding="utf8") as r:
        soup = BeautifulSoup(r, "lxml")

    # Pages without the main article or without an infobox are ignored.
    wikiaMainContent = soup.find('article', {'id': 'WikiaMainContent'})
    if not wikiaMainContent:
        continue

    infobox = wikiaMainContent.find('aside', {'class': 'portable-infobox'})
    if not infobox:
        continue

    infoboxes[filename] = {}

    for item in infobox.find_all('div', {'class': 'pi-item'}):
        # The <h3> holds the label of the row; rows without one are skipped.
        h3 = item.find('h3')
        if not h3:
            continue

        relation = get_relation(h3.text.strip())
        all_properties.add(relation)

        links = []
        values = item.find('div', {'class': 'pi-data-value'}, recursive=False)
        # A labelled row may lack a direct pi-data-value child; treat it as a
        # relation with no links instead of crashing on `None.contents`.
        if values is not None:
            # Serialise the children, dropping whitespace-only nodes, then
            # re-parse every <br/>-separated chunk as its own fragment.
            # NOTE(review): only anchors are collected below, so
            # values.find_all('a') may be enough -- confirm before simplifying.
            markup = ''.join(
                text
                for text in (str(child).strip() for child in values.contents)
                if text
            )
            elements = [BeautifulSoup(chunk, "html5lib").body
                        for chunk in markup.split('<br/>')]

            for element in elements:
                # href=True skips anchors that carry no href attribute.
                for anchor in element.find_all('a', href=True):
                    path, query, fragment = clean_url(anchor['href'])
                    # clean_url removes one leading "../"; a path that still
                    # starts with "..%2" is not kept.
                    if not path.startswith("..%2"):
                        links.append(path)

        # A label repeated inside one infobox overwrites the earlier entry.
        infoboxes[filename][relation] = links

In [76]:
import json

# Create the output directory if needed so the cell also works on a fresh
# checkout instead of failing with FileNotFoundError.
os.makedirs("basic", exist_ok=True)

with open("basic/infoboxes.json", "w", encoding="utf8") as w:
    json.dump(infoboxes, w)
with open("basic/all_properties.json", "w", encoding="utf8") as w:
    # A set has no stable order; sorting keeps the file identical between runs.
    json.dump(sorted(all_properties), w)

# Transform the data into something that can be loaded into neo4j 

The data is exported to CSV files, since I don't know how to import it from other sources **yet**.

In [77]:
# Report how many pages had an infobox and how many distinct relation names
# were found, then keep the page file names for the next steps.
print(f"Entities {len(infoboxes):d}")
print(f"Possible identified relationships {len(all_properties):d}")
keys = [*infoboxes]

Entities 7146
Possible identified relationships 113


In [78]:
def get_page_title(file):
    """
    Read the page title from an HTML file.

    The title is taken from the ``title`` attribute of the
    ``<article id="WikiaMainContent">`` element.

    Parameters
    ----------
    file : path of the HTML file to read (opened as UTF-8).

    Returns
    -------
    The title string, or ``None`` when the article element is missing or
    carries no (or an empty) ``title`` attribute.
    """
    with open(file, "r", encoding="utf8") as r:
        soup = BeautifulSoup(r, "lxml")

    wikiaMainContent = soup.find('article', {'id': 'WikiaMainContent'})
    # Check for a missing article first: calling .get() on None would raise
    # AttributeError instead of returning None.
    if wikiaMainContent is None:
        return None

    title = wikiaMainContent.get('title', None)
    if not title:
        return None
    return title
        
# Give every page that has a title a sequential id (starting at 1) and
# remember the page -> id mapping for building the relationships later.
entities_lst = []
reverse = {}
i = 1
for node in keys:
    file = html_route + node
    title = get_page_title(file)
    if title is not None:
        reverse[node] = i
        entities_lst.append([i, title, node])
        i += 1
    else:
        # No title: show the infobox that was extracted for it and move on.
        print(infoboxes[node])

entities_df = pd.DataFrame(entities_lst, columns=['id', 'name', 'page'])
entities_df = entities_df.set_index('id')
entities_df.to_csv("basic/entities.csv", encoding="utf8")
entities_df.head()

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Veil Springs,Veil_Springs.html
2,Scorching Naginata,Scorching_Naginata.html
3,Rito Harp,Rito_Harp.html
4,Strade,Strade.html
5,Rem's Shoe Shop,Rem's_Shoe_Shop.html


In [79]:
# Print a summary of entities_df: index range, columns, non-null counts and dtypes.
entities_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7146 entries, 1 to 7146
Data columns (total 2 columns):
name    7146 non-null object
page    7146 non-null object
dtypes: object(2)
memory usage: 167.5+ KB


In [80]:
# Turn the per-page link lists into (from id, relation, to id) rows.
keys = list(infoboxes.keys())
relationships_lst = []
for node in keys:
    # Pages that were skipped while building the entity table (no title) have
    # no id; indexing reverse[node] for them would raise KeyError.
    from_id = reverse.get(node)
    if from_id is None:
        continue

    for relationship, targets in infoboxes[node].items():
        for entity in targets:
            # Links to pages that are not known entities are dropped.
            id_ = reverse.get(entity, -1)
            if id_ < 0:
                continue
            relationships_lst.append([from_id, relationship, id_])

relationships_df = pd.DataFrame(relationships_lst, columns=['from','relationship','to'])
relationships_df.to_csv("basic/relationships.csv", encoding="utf8")
relationships_df.head()

Unnamed: 0,from,relationship,to
0,1,FIRST_APPEARANCE,4476
1,1,COUNTRY,3068
2,1,REGION,5784
3,1,POINT_OF_INTEREST,5095
4,1,POINT_OF_INTEREST,4441


In [81]:
# Print a summary of relationships_df: index range, columns, non-null counts and dtypes.
relationships_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40664 entries, 0 to 40663
Data columns (total 3 columns):
from            40664 non-null int64
relationship    40664 non-null object
to              40664 non-null int64
dtypes: int64(2), object(1)
memory usage: 953.1+ KB


## Loading into neo4j...

```
LOAD CSV WITH HEADERS FROM "file:///entities.csv"
AS csvLine
CREATE(e:Entity {id:toInteger(csvLine.id), name:csvLine.name, page: csvLine.page})
```  

Results in:  

```
Added 7146 labels, created 7146 nodes, set 21438 properties, completed after 902 ms.
```

Now, let's create an index:

```
CREATE INDEX ON :Entity(id)
```

Now, let's add the relationships:

```
LOAD CSV WITH HEADERS FROM "file:///relationships.csv"
AS csvLine
MATCH (f:Entity {id:toInteger(csvLine.from)}),(to:Entity{id:toInteger(csvLine.to)})
CREATE (f)-[:RELATED {relation:csvLine.relationship}]->(to)
```

Results in:

```
Set 40664 properties, created 40664 relationships, completed after 2078 ms.
```

## Issuing simple queries

 > Who are the members of the Poe Sisters?
 
```
MATCH (e:Entity{name:'Poe Sisters'})-[r:RELATED{relation:'MEMBERS'}]->(e2:Entity)
RETURN e,r,e2
```

<img src="images/poe_sisters.png" />


 > Who are the Dekus in TLOZ?

```
MATCH (e:Entity)-[r:RELATED{relation:'RACE'}]->(e2:Entity{name:'Deku'})
RETURN e,r,e2
```

<img src="images/dekus.png" />