In [2]:
from bs4 import BeautifulSoup
from slugify import slugify
from glob import glob

from urllib.parse import urlparse
from urllib.parse import unquote

import pandas as pd

import re
import os

import spacy

# Where are all those htmls?
html_route = "/Users/antonioferegrino/corpora/zelda-wikia2-clean/"

In [3]:
nlp = spacy.load('en_core_web_lg')

In [4]:
# Returns true if entity should be kept
def filter_entity(e):
    #1. Check entity is non-empty
    if e.ent_type_ is None or e.ent_type_ == '':
        return False
  
    #1. Filter out punctuation only tokens - tok.is_punct
    if e.is_punct:
        return False
  
    #1. Filter out URL entities that start with http
    if e.text.startswith("http"):
        return False
    return True

def extract_dependency_features(text):
    clean_text = text.replace("\n"," ")
    doc = nlp(clean_text)

    relations =[]
    for ent in doc.ents:
        ent.merge(tag=ent.root.tag_, lemma=ent.text, ent_type=ent.label_)
  
    for word in doc:
        if word.dep_ in ('attr', 'dobj'):
            if not (filter_entity(word)):
                continue
            subjects = [w for w in word.head.lefts if filter_entity(w)]
            for subject in subjects:
                relations.append((subject, word.head, word))
        elif word.dep_ == 'pobj':
            relations.append((word.head.head, word.head, word))
    return relations

extract_dependency_features("Veil Springs is a location from The Legend of Zelda: The Minish Cap. " + 
                            "The summit of Veil Falls, Veil Springs serves as the road to the Cloud Tops, " + 
                            "the new home of the Wind Tribe and the Palace of Winds, via a giant updraft.")

[(location, from, The Legend of Zelda),
 (summit, of, Veil Falls),
 (serves, as, road),
 (road, to, the Cloud Tops),
 (home, of, the Wind Tribe),
 (serves, via, updraft)]

In [5]:
from collections import Counter

file_list = list(glob(html_route + "*.html"))

spac_s = re.compile("\s+[\,\.\?\!]{1}")
spaces = re.compile("\s+")
pharag = re.compile("\(.+\)")

def clean_string(label):
    """
    Canonicalize the relationship
    """
    st = label
    st = re.sub(spac_s, '', st)
    st = re.sub(spaces, ' ', st)
    st = re.sub(pharag, '', st)
    return st.strip()

additional_info = {}

all_dependencies = []

for file in file_list:
    filename = os.path.basename(file)
    page:BeautifulSoup = None
    with open(file, "r", encoding="utf8") as r:
        page = BeautifulSoup(r, "lxml")
    wikiaMainContent = page.find('article', {'id':'WikiaMainContent'})
    if wikiaMainContent is None:
        print(file)
        continue
    textual_content = wikiaMainContent.find('div', {'id':'mw-content-text'})
    aside = textual_content.find('aside')
    if aside:
        aside.decompose()
    paragraphs = textual_content.findAll('p', recursive=False)
    for paragraph in paragraphs[:1]:
        links = paragraph.findAll('a')
        bolds = paragraph.find('b')

        desc = clean_string(paragraph.text)
        dependencies = extract_dependency_features(desc)
        for dep in dependencies:
            all_dependencies.append(dep[0].text.strip())
        additional_info[filename] = {
            'dependencies': dependencies,
            'links': links,
            'bolds': bolds
        }

/Users/antonioferegrino/corpora/zelda-wikia2-clean/index.html


In [15]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def get_canonical_dep(dep):
    return stemmer.stem(dep)

def convert(s):
    return slugify(s)

stems = [get_canonical_dep(dep.lower()) for dep in all_dependencies]
dependency_counter = Counter(stems)
common = dependency_counter.most_common(200)
print(common)

[('locat', 2454), ('breath', 2115), ('found', 1444), ('charact', 1387), ('item', 1342), ('link', 845), ('appear', 759), ('use', 732), ('in', 626), ('is', 613), ('region', 606), ('one', 563), ('enemi', 511), ('obtain', 411), ('boss', 348), ('part', 344), ('live', 323), ('known', 321), ('given', 316), ('legend', 312), ('dungeon', 310), ('ocarina', 283), ('cook', 272), ('quest', 254), ('releas', 242), ('resid', 228), ('serv', 216), ('are', 208), ('drop', 192), ('similar', 191), ('travel', 188), ('type', 177), ('piec', 172), ('complet', 170), ('restor', 170), ('refer', 166), ('area', 163), ('place', 157), ('base', 157), ('shrine', 154), ('return', 150), ('lead', 149), ('purchas', 148), ('version', 146), ('play', 138), ('power', 135), ('game', 135), ('race', 134), ('worn', 134), ('wield', 133), ('home', 130), ('out', 130), ('object', 130), ('shop', 128), ('access', 126), ('group', 125), ('along', 121), ('give', 120), ('run', 118), ('event', 117), ('set', 113), ('back', 112), ('take', 111), 

In [16]:
from functions import levenshtein

relationship_properties = {
    "obtain": "ITEM",
    "item": "ITEM",
    "given": "ITEM",
    "worn": "ITEM",
    "mask": "ITEM",
    "object":"ITEM",
    "weapon": "ITEM,WEAPON",
    "sword": "ITEM,WEAPON",
    "live": "CHARACTER",
    "charact": "CHARACTER",
    "enemi":"CHARACTER,ENEMY",
    "boss":"CHARACTER,ENEMY",
    "wife":"CHARACTER",
    "defeat": "CHARACTER",
    "locat" : "LOCATION",
    "region": "LOCATION",
    "inhabit": "LOCATION",
    "shrine": "LOCATION,SHRINE",
    "dungeon": "LOCATION,DUNGEON",
    "island": "LOCATION,ISLAND",
    "shop": "LOCATION,SHOP",
    "store": "LOCATION,STORE",
    "quest": "QUEST",
    "releas": "VIDEOGAME",
    "publish": "VIDEOGAME",
    "race":"RACE"
}

keys = sorted(list(additional_info.keys()))
new_info = {}
for k in keys:
    if len(additional_info[k]['dependencies']) > 0:
        new_info[k] = {
            'relations':[]
        }
        for dep in additional_info[k]['dependencies']:
            cannonical_dep = get_canonical_dep(dep[0].text)
            if cannonical_dep in relationship_properties:
                related = convert(dep[2].text)
                found_entity_id:str = None
                found_entity_name:str = dep[2].text

                # Search for any "hard" link to any of the mentioned entities
                for link in additional_info[k]['links']:
                    title_ = convert(link.get('title',""))
                    content_ = convert(link.text.strip())
                    title_distance = levenshtein(related, title_)
                    content_distance = levenshtein(related, content_)
                    if title_distance <= 2 or content_distance <= 2:
                        found_entity_id = link.get('href',"")

                new_info[k]['relations'].append({
                    'type': relationship_properties[cannonical_dep],
                    'relation': cannonical_dep + "_"+ dep[1].text,
                    'entity': dep[2].text,
                    'id': found_entity_id
                })

In [17]:
import json
from urllib.parse import urlparse
from urllib.parse import unquote

entities_df = pd.read_csv("basic/entities.csv", encoding="utf8", index_col='page')

def clean_url(url):
    if url is None:
        return (None, None, None)
    parsed = urlparse(url)
    path = unquote(parsed.path)
    if path.startswith("../"):
        path = path[3:]
    path = path.replace("/", "%2F")
    query = None if parsed.query == '' else parsed.query
    fragment = None if parsed.fragment == '' else parsed.fragment
    return (path, query, fragment)

types_lst =[]
relations_lst = []
for k in list(new_info.keys()):
    types = set()
    if 'relations' in new_info[k]:
        for relation in new_info[k]['relations']:
            rel = relation['relation']
            ent = relation['entity']
            id_ = relation.get('id',None)
            p, q, f = clean_url(id_)
            identified_id = None
            try:
                identified_id = None if id_ is None else entities_df.loc[p]['id']
            except:
                identified_id = None
                
            relations_lst.append([entities_df.loc[k]['id'], k, rel, ent, id_, identified_id])
            types.update(relation['type'].split(","))
            
    for t in types:
        types_lst.append([k, t, entities_df.loc[k]['id']])
        
            
        

types_df = pd.DataFrame(types_lst, columns=['page', 'type', 'id']).set_index('id')

In [18]:
# Assign year type

year_files = list(glob(html_route + "[0-9][0-9][0-9][0-9].html"))
years_lst = []
for year in year_files:
    filename = os.path.basename(year)
    years_lst.append([
        filename,
        "YEAR",
        entities_df.loc[filename]['id']
    ])
years_df = pd.DataFrame(years_lst, columns=['page', 'type', 'id']).set_index('id')
types_df = types_df.append(years_df)
types_df.to_csv("basic/types.csv", encoding="utf8")
types_df.head()

Unnamed: 0_level_0,page,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1-up_Doll.html,CHARACTER
1,1-up_Doll.html,ITEM
2,100th_Ring.html,ITEM
3,15-second_Game.html,LOCATION
36,2nd_Potion.html,ITEM


In [19]:
types_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8441 entries, 1 to 7
Data columns (total 2 columns):
page    8441 non-null object
type    8441 non-null object
dtypes: object(2)
memory usage: 197.8+ KB


In [20]:
relations_df = pd.DataFrame(relations_lst, columns=['source','source_str', 'relation', 'name','dst_str','dst'])
relations_df.to_csv("basic/adv_relationships.csv", encoding="utf8")
relations_df.head()

Unnamed: 0,source,source_str,relation,name,dst_str,dst
0,1,1-up_Doll.html,item_from,Zelda II,,
1,1,1-up_Doll.html,live_in,which,,
2,2,100th_Ring.html,obtain_from,Vasu,Vasu.html,8270.0
3,2,100th_Ring.html,obtain_by,end,,
4,3,15-second_Game.html,locat_from,The Legend of Zelda,,


In [21]:
relations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9731 entries, 0 to 9730
Data columns (total 6 columns):
source        9731 non-null int64
source_str    9731 non-null object
relation      9731 non-null object
name          9731 non-null object
dst_str       1874 non-null object
dst           1870 non-null float64
dtypes: float64(1), int64(1), object(4)
memory usage: 456.2+ KB
