In [1]:
import pandas as pd

entities = pd.read_csv("basic/entities.csv").set_index("id")
types = pd.read_csv("basic/types.csv").set_index("id")
relationships = pd.read_csv("basic/relationships.csv", index_col = 0)
adv_relationships = pd.read_csv("basic/adv_relationships.csv", index_col = 0)

In [2]:
entities.head()

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1-up Doll,1-up_Doll.html
2,100th Ring,100th_Ring.html
3,15-second Game,15-second_Game.html
4,1986,1986.html
5,1987,1987.html


In [3]:
types.head()

Unnamed: 0_level_0,page,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1-up_Doll.html,CHARACTER
1,1-up_Doll.html,ITEM
2,100th_Ring.html,ITEM
3,15-second_Game.html,LOCATION
36,2nd_Potion.html,ITEM


In [4]:
relationships.head()

Unnamed: 0,from,relationship,to
0,1,FIRST_APPEARANCE,8636
1,1,FIRST_APPEARANCE,6
2,1,FOUND,3766
3,1,USE,4550
4,2,FIRST_APPEARANCE,7747


In [5]:
adv_relationships.head()

Unnamed: 0,source,source_str,relation,name,dst_str,dst
0,1,1-up_Doll.html,item_from,Zelda II,,
1,1,1-up_Doll.html,live_in,which,,
2,2,100th_Ring.html,obtain_from,Vasu,Vasu.html,8270.0
3,2,100th_Ring.html,obtain_by,end,,
4,3,15-second_Game.html,locat_from,The Legend of Zelda,,


## Infering type of entity from the relationship extracted from the Infobox

In [64]:
list_of_relationships = list(relationships.relationship.unique())
print(sorted(list_of_relationships))

['AFFILIATION', 'ALLIES', 'ALTERNATE_FORM', 'ALTERNATE_FORM_OF', 'APPEARANCES', 'APPEARS_IN', 'ARTIST', 'ATTACK_METHOD', 'BOSS', 'CAPITAL', 'CHARACTERISTICS', 'CLIMATE', 'COUNTRY', 'CURRENCY', 'DEBUT', 'DEMONYM', 'DESIGNER', 'DEVELOPER', 'DIED', 'DISTINCTIONS', 'DISTRIBUTOR', 'DOMINANT_RACE', 'EFFECTIVE_AGAINST', 'EFFECTIVE_WEAPON', 'EMPLOYEES', 'EMPLOYER', 'ENEMIES', 'ENGINE', 'FIRST_APPEARANCE', 'FOUND', 'FOUNDED', 'FOUNDED_IN', 'GAME', 'GIVER', 'GOALS', 'HOMELAND', 'HOMETOWN', 'INHABITANTS', 'INPUTS', 'INPUT_METHODS', 'ITEM_USED', 'KEY_PEOPLE', 'KINDRED', 'KNOWN_FOR', 'LANGUAGE', 'LEADER', 'LEARNED', 'LIFE_RATING', 'LOCATION', 'MAIN_ITEM', 'MANUFACTURER', 'MEDIA', 'MEMBERS', 'MINI_BOSS', 'NOTABLE_MEMBERS', 'OBJECTIVE', 'OCCURRENCE', 'OWNER', 'PLATFORM', 'POINT_OF_INTEREST', 'POWER', 'PRECEDED_BY', 'PREREQUISITES', 'PRIZE', 'PRODUCTS', 'PROPRIETOR', 'PUBLISHED', 'PUBLISHER', 'PURPOSE', 'RACE', 'REGION', 'REGIONS', 'RELEASED', 'REQUIREMENTS', 'REWARDS', 'SERIES', 'SPOILS', 'SUCCEEDED_

In [60]:
def get_elements(relation, column):
    selected = relationships[relationships.relationship == relation]
    return selected[column].values

def list_elements(relations, column):
    r = []
    if isinstance(relations, str):
        r.append(relations)
    else:
        r.extend(relations)
    s_t = set()
    for rel in r:
        s_t.update(get_elements(rel, column))
    return list(s_t)

import random

def sample_elements(elements, num=5):
    return random.choices(population=elements, k=num)

### Sources of characters [SOURCE]

In [160]:
games = list_elements(["APPEARANCES", "APPEARS_IN", "FIRST_APPEARANCE"], "to")
sources = entities.loc[games]
true_sources = sources[sources.name.str.len() != 4].index.values
entities.loc[sample_elements(true_sources)].head()

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
7670,The Legend of Neil,The_Legend_of_Neil.html
308,BS The Legend of Zelda: Ancient Stone Tablets,BS_The_Legend_of_Zelda__Ancient_Stone_Tablets....
7748,The Legend of Zelda: Oracle of Ages (manga),The_Legend_of_Zelda__Oracle_of_Ages_(manga).html
7673,The Legend of Zelda,The_Legend_of_Zelda.html
8093,The Legend of Zelda: Tri Force Heroes,Tri_Force_Heroes.html


### Locations [LOCATION]

In [250]:
from_text = set(types[types.type == "LOCATION"].index.values)
locations = set(list_elements(["LOCATION", "COUNTRY", "HOMETOWN", "HOMELAND"], "to"))
regions =   set(list_elements(["REGION","REGIONS"], "to"))
true_locations = list( from_text | locations |regions)
entities.loc[sample_elements(true_locations)]

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
259,Armos Knights,Armos_Knights.html
4481,Level 8 (First Quest),Level_8_(First_Quest).html
7187,Spirit Island,Spirit_Island.html
4757,Madorna Mountain,Madorna_Mountain.html
3782,Hyrule Cathedral,Hyrule_Cathedral.html


### Platforms [PLATFORM]

In [257]:
true_platforms = list_elements(["PLATFORM"], "to")
entities.loc[sample_elements(true_platforms)]

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5461,Nintendo Switch,Nintendo_Switch.html
5451,Nintendo 3DS,Nintendo_3DS.html
5452,Nintendo 64,Nintendo_64.html
8451,Wii U,Wii_U.html
8305,Virtual Console,Virtual_Console.html


### Games developers/publishers [COMPANY]

In [258]:
companies_creators = set(list_elements(["PUBLISHER","DEVELOPER"], "to"))
print("Companies %d" % len(companies_creators))
true_companies_creators = companies_creators - set(platform)
print("Companies %d" % len(true_companies_creators))

entities.loc[true_companies_creators]

Companies 5
Companies 4


Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5450,Nintendo,Nintendo.html
8267,Vanpool,Vanpool.html
2653,Flagship,Flagship.html
5455,Nintendo Entertainment Analysis and Development,Nintendo_Entertainment_Analysis_and_Developmen...


In [259]:
key_people = list_elements("KEY_PEOPLE", "to")
sample = sample_elements(key_people)
entities.loc[sample]

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6854,Shigeru Miyamoto,Shigeru_Miyamoto.html
6606,Satoru Iwata,Satoru_Iwata.html
6854,Shigeru Miyamoto,Shigeru_Miyamoto.html
6606,Satoru Iwata,Satoru_Iwata.html
6854,Shigeru Miyamoto,Shigeru_Miyamoto.html


### Weapons [WEAPON]

In [260]:
from_text = set(types[types.type == "WEAPON"].index.values)
weapons = set(list_elements(["EFFECTIVE_WEAPON", "WEAPON"], "to"))
effective_against_weapons = set(list_elements("EFFECTIVE_AGAINST", "from"))
infobox_weapons = weapons | effective_against_weapons
true_weapons = infobox_weapons
print(len(true_weapons))
entities.loc[sample_elements(list(true_weapons))]

387


Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
4520,Like Like Ring,Like_Like_Ring.html
7305,Steel Lizal Bow,Steel_Lizal_Bow.html
4940,Master Sword (Hyrule Warriors),Master_Sword_(Hyrule_Warriors).html
5181,Moblin Ring,Moblin_Ring.html
374,Bari,Bari.html


### Characters [CHARACTERS]

In [261]:
enemies = list_elements(["ENEMIES","BOSS","MINI_BOSS"], "to")
spoils_things = list_elements(["SPOILS","EFFECTIVE_WEAPON","WEAPON"], "from")
entities.loc[sample_elements(enemies)]

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
3454,Gyorg Pair,Gyorg_Pair.html
5960,Prince Sidon,Prince_Sidon.html
6226,Revali,Revali.html
2134,Dongorongo,Dongorongo.html
6760,Shadow Beast,Shadow_Beast.html


In [262]:
characters_to = list_elements(["INHABITANTS","OWNER"], "to")
characters_from = list_elements(["RACE","KINDRED","HOMELAND"],"from")
sample = sample_elements(characters_from)
entities.loc[sample]

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
874,Captain Viscen,Captain_Viscen.html
6717,Seldon,Seldon.html
2426,Evan,Evan.html
6467,Rusta,Rusta.html
1996,Great Deku Tree,Deku_Tree.html


In [263]:
from_text = set(types[types.type == "CHARACTER"].index.values)
from_info_box = set(characters_to) | set(characters_from) | set(enemies) | set(spoils_things)
true_characters = list(from_text & from_info_box)
entities.loc[sample_elements(true_characters)]

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5129,Minister Potho,Minister_Potho.html
7321,Stone Arrghus,Stone_Arrghus.html
1807,Cyclos,Cyclos.html
376,Baris,Baris.html
457,Bertri,Bertri.html


### Enemies [ENEMY]

In [264]:
from_text = set(types[types.type == "ENEMY"].index.values)
true_enemies = set(enemies) | from_text
entities.loc[true_enemies].head()

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
8192,Twilit Vermin,Twilit_Vermin.html
8195,Twinmold,Twinmold.html
8197,Twinrova (Ocarina of Time),Twinrova_(Ocarina_of_Time).html
8198,Twinrova (Oracle of Ages/Oracle of Seasons),Twinrova_(Oracle_of_Ages%2FOracle_of_Seasons)....
4109,Keaton (The Minish Cap),Keaton_(The_Minish_Cap).html


### Items

In [265]:
items = list_elements(["ITEM_USED","MAIN_ITEM"], "to")
entities.loc[sample_elements(items)]

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6224,Revali's Gale,Revali's_Gale.html
5930,Power Glove,Power_Glove.html
4934,Master Ore,Master_Ore.html
6405,Royal Guard's Sword,Royal_Guard's_Sword.html
7179,Spirit's Spring,Spirit's_Spring.html


In [266]:
from_text = set(types[types.type == "ITEM"].index.values)
from_info = set(items)
true_items = list(from_text | from_info)
entities.loc[sample_elements(true_items)].head()

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
4363,Lana's Cloak,Lana's_Cloak.html
4822,Majora's Mask: The Abridged Series,Majora's_Mask__The_Abridged_Series.html
2634,Fisherman's Shield,Fisherman's_Shield.html
6824,Sheikah Slate,Sheikah_Sensor+.html
5998,Pumpkin Harvest,Pumpkin_Harvest.html


### Races [RACE]

In [267]:
races = list_elements(["DOMINANT_RACE","RACE"], "to")
sample = sample_elements(races)
entities.loc[sample]

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2798,Frog,Frog.html
3693,Horse,Horse.html
7464,Sword,Sword.html
4639,Lokomo,Lokomo.html
7323,Stone Blin,Stone_Blin.html


In [268]:
# types are the types inferred from the text
from_text = set(types[types.type == "RACE"].index.values)
from_info = set(races)
true_races = list(from_text | from_info)
entities.loc[sample_elements(true_races)].head()

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2968,Gerudo,Gerudo_Pirates.html
1234,The Legend of Zelda: Breath of the Wild races,Category_The_Legend_of_Zelda__Breath_of_the_Wi...
8366,Cow,Water_Buffalo.html
6548,Salona,Salona.html
1065,Kokiri,Category_Kokiri.html


In [269]:
old_types = types.copy()
print(len(old_types))

# SOURCE
wo_ = types[types["type"]!='SOURCE']
print(len(wo_source))

# LOCATION
wo_ = wo_[wo_["type"]!='LOCATION']
print(len(wo_))

# PLATFORM
wo_ = wo_[wo_["type"]!='PLATFORM']
print(len(wo_))

# COMPANY
wo_ = wo_[wo_["type"]!='COMPANY']
print(len(wo_))

# CHARACTER
wo_ = wo_[wo_["type"]!='CHARACTER']
print(len(wo_))

# ITEM
wo_ = wo_[wo_["type"]!='ITEM']
print(len(wo_))

# RACE
wo_ = wo_[wo_["type"]!='RACE']
print(len(wo_))

# RACE
wo_ = wo_[wo_["type"]!='ENEMY']
print(len(wo_))


# WEAPON
wo_ = wo_[wo_["type"]!='WEAPON']
print(len(wo_))


# WEAPON
wo_ = wo_[wo_["type"]!='VIDEOGAME']
print(len(wo_))

print(wo_["type"].unique())
wo_.head()

8441
8441
6283
6283
6283
4038
2070
1962
1201
1061
931
['QUEST' 'SHRINE' 'SHOP' 'STORE' 'DUNGEON' 'ISLAND' 'YEAR']


Unnamed: 0_level_0,page,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1
40,A_Brother's_Roast.html,QUEST
41,A_Curry_for_What_Ails_You.html,QUEST
42,A_Fragmented_Monument.html,QUEST
43,A_Freezing_Rod.html,QUEST
44,A_Gift_for_My_Beloved.html,QUEST


In [275]:
new_types = {
    "SOURCE":true_sources,
    "LOCATION":true_locations,
    "PLATFORM":true_platforms,
    "COMPANY":true_companies_creators,
    "CHARACTER": true_characters,
    "ITEM":true_items,
    "RACE": true_races,
    "ENEMY":true_enemies,
    "WEAPON":true_weapons
}

new_types_list = []
for new_type in new_types:
    for ent_id in new_types[new_type]:
        new_types_list.append([ent_id, entities.loc[ent_id]['page'], new_type])
types_df = pd.DataFrame(new_types_list, columns=['id', 'page', 'type']).set_index('id')

types_df = types.append(types_df)

In [280]:
types_df.to_csv("basic/types.csv", encoding="utf8")
types_df.head(20)

Unnamed: 0_level_0,page,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1-up_Doll.html,CHARACTER
1,1-up_Doll.html,ITEM
2,100th_Ring.html,ITEM
3,15-second_Game.html,LOCATION
36,2nd_Potion.html,ITEM
40,A_Brother's_Roast.html,QUEST
40,A_Brother's_Roast.html,ITEM
41,A_Curry_for_What_Ails_You.html,QUEST
41,A_Curry_for_What_Ails_You.html,ITEM
42,A_Fragmented_Monument.html,QUEST
