# Text extraction processing    

This notebook depends on the file `info/text_extraction.json` generated by [text_extraction](text_extraction.ipynb). It takes the relationships extracted for each `Resource`, tries to match them with *"hard"* links to other `Resource`s, and writes the results to a file called `info/hard_relationships.csv`:

| url                     | relation1   | attribute   | relation2   | related_url                              |
|:------------------------|:------------|:------------|:------------|:-----------------------------------------|
| Link's_Mother.html      | is          | Character   | in          | The_Legend_of_Zelda_(Series).html        |
| Link's_Uncle.html       | is          | Character   | from        | The_Legend_of_Zelda_series.html          |
| Link's_House.html       | is          | Location    | in          | The_Legend_of_Zelda_(Series).html        |
| Link's_House.html       | is          | Location    | from        | The_Legend_of_Zelda_series.html          |
| Link_Doll.html          | are         | Item        | in          | Zelda_II__The_Adventure_of_Link.html     |
| Link's_Grandmother.html | is          | Character   | in          | The_Legend_of_Zelda__The_Wind_Waker.html |
| Link's_Cabana.html      | is          | Location    | from        | The_Legend_of_Zelda__The_Wind_Waker.html |

In [1]:
import json
import pandas as pd

from nltk.stem import PorterStemmer
from extracted_entities import Relation, RelationDetails, ParsedParagraph

# Read the extraction results as UTF-8 explicitly so that parsing does not
# depend on the platform's default text encoding.
with open("info/text_extraction.json", "r", encoding="utf-8") as json_file:
    extracted_info = json.load(json_file)

In [2]:
from slugify import slugify

def dumb_compare(str1_, str2_):
    """Loosely compare two strings by substring containment of their slugs.

    Both inputs are passed through ``slugify`` first. The comparison is
    rejected (``False``) when either slug is empty or when the slug of the
    second argument is shorter than 3 characters; only the second argument
    is length-checked. Otherwise the result is ``True`` if one slug is
    contained in the other.
    """
    left, right = slugify(str1_), slugify(str2_)
    if not (left and right) or len(right) < 3:
        return False
    return left in right or right in left

In [3]:
stemmer = PorterStemmer()

# Flatten the extraction results into one row per extracted relation:
# [url, source, name, subject, relation, stemmed attribute,
#  related relation, related subject, related subject url].
rels = []
sources = ["wikia", "gamepedia"]
for entity, entity_info in extracted_info.items():
    for source in sources:
        if source not in entity_info:
            continue
        source_info = entity_info[source]
        paragraphs = source_info["paragraphs"]
        if not paragraphs:
            continue
        for paragraph in paragraphs:
            paragraph_relations = paragraph["relations"]
            if not paragraph_relations:
                continue
            details = paragraph["details"]
            links = paragraph["links"]
            for rel in paragraph_relations:
                # Stem once; the same stemmed value is used both for matching
                # against the details and for the stored "attribute" column.
                attribute = stemmer.stem(rel["attribute"])
                related_entity = None
                related_relationship = None
                related_entity_link = None

                # Find hard links to entities: a detail with the same stemmed
                # attribute supplies the related subject/relation, and a link
                # whose href or text loosely matches that subject supplies
                # the url.
                # NOTE(review): the link is not reset when a later detail
                # matches, so the stored url may come from an earlier matching
                # detail than the stored subject/relation. Confirm whether
                # that is intended before changing it.
                if links and details:
                    for det in details:
                        if stemmer.stem(det["attribute"]) != attribute:
                            continue
                        related_entity = det["subject"]
                        related_relationship = det["relation"]
                        for link in links:
                            if dumb_compare(related_entity, link["href"]) \
                                or dumb_compare(related_entity, link["text"]):
                                # No break: the last matching link wins.
                                related_entity_link = link["href"]

                rels.append([entity,
                             source,
                             source_info["name"],
                             rel["subject"],
                             rel["relation"],
                             attribute,
                             related_relationship,
                             related_entity,
                             related_entity_link
                            ])

relations = pd.DataFrame(rels, columns=["url", "source","name", "subject", 
                                        "relation", "attribute",
                                        "related_relationship","related_subject","related_subject_url"]).set_index("url")
relations.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27282 entries, 1-up_Doll.html to ___.html
Data columns (total 8 columns):
source                  27282 non-null object
name                    27282 non-null object
subject                 27282 non-null object
relation                27282 non-null object
attribute               27282 non-null object
related_relationship    18306 non-null object
related_subject         18306 non-null object
related_subject_url     14457 non-null object
dtypes: object(8)
memory usage: 1.9+ MB


In [4]:
def is_self_mention(row):
    """Return whether a relation's subject loosely matches its page name.

    ``row`` is one row of ``relations``; its ``name`` and ``subject`` values
    are compared with ``dumb_compare``.
    """
    return dumb_compare(row["name"], row["subject"])

# The row predicate has its own name so that ``mentions_itself`` only ever
# refers to the filtered DataFrame below (previously the function and the
# DataFrame shared that name).
relations["mention"] = relations.apply(is_self_mention, axis=1)

mentions_itself = relations[relations.mention]
print("Self referencing relationships found", len(mentions_itself)) 

Self referencing relationships found 14537


In [5]:
# A self-referencing relation counts as "hard" when its related subject was
# matched to a link, i.e. when related_subject_url is not null.
has_related_url = mentions_itself["related_subject_url"].notna()
hard_relations = mentions_itself.loc[has_related_url]
print("\"Hard relationships found\"", len(hard_relations))
hard_relations.sample(6)

"Hard relationships found" 10291


Unnamed: 0_level_0,source,name,subject,relation,attribute,related_relationship,related_subject,related_subject_url,mention
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Toh_Yahsa.html,wikia,Toh Yahsa,Toh Yahsa,is,charact,from,The Legend of Zelda:,The_Legend_of_Zelda__Breath_of_the_Wild.html,True
Yammo.html,wikia,Yammo,Yammo,is,charact,in,The Legend of Zelda:,The_Legend_of_Zelda__Breath_of_the_Wild.html,True
Moldorm_(A_Link_Between_Worlds).html,wikia,Moldorm (A Link Between Worlds),Moldorm,is,boss,of,Tower of Hera,Tower_of_Hera_(A_Link_Between_Worlds).html,True
The_Legend_of_Zelda_Phantom_Hourglass.html,gamepedia,The Legend of Zelda: Phantom Hourglass,The Legend of Zelda:,is,instal,of,The Legend of Zelda,The_Legend_of_Zelda__The_Wind_Waker.html,True
Ivan.html,wikia,Ivan,Ivan,is,charact,from,The Legend of Zelda,The_Legend_of_Zelda__The_Wind_Waker.html,True
Jitan_Sa'mi.html,wikia,Jitan Sa'mi,Jitan Sa'mi,is,charact,from,The Legend of Zelda:,The_Legend_of_Zelda__Breath_of_the_Wild.html,True


In [7]:
# Frequency of each stemmed attribute among the hard relationships, keeping
# only the attributes that occur more than 6 times.
counts = hard_relations["attribute"].value_counts()
to_consider = counts.loc[counts.gt(6)]
to_consider

charact          2184
item             2072
locat            1645
enemi             711
boss              342
object            340
type              258
dungeon           254
quest             192
shrine            144
instal             85
shop               81
race               72
group              72
song               69
level              63
one                58
option             54
area               51
stage              48
sword              46
abil               42
articl             42
modif              38
anim               33
mode               32
skill              31
weapon             29
shield             29
minigam            28
                 ... 
element            23
speci              21
place              20
event              20
antagonist         20
episod             20
featur             20
book               19
part               19
soundtrack         15
materi             14
off                14
quest item         13
version            13
drop      

### Categories for "hard relationships"

In [8]:
# Stemmed attribute -> category label written to the output file.
# Only attributes listed here are kept when the final relationships are built.
category_mapping = dict(
    charact="Character",
    enemi="Enemy",
    boss="Boss",
    item="Item",
    locat="Location",
    object="Object",
    dungeon="Dungeon",
    quest="Quest",
    group="Group",
    song="Song",
    shrine="Shrine",
    mask="Mask",
    shop="Shop",
    sword="Sword",
    weapon="Weapon",
    stage="Stage",
    sequel="Sequel",
    shield="Shield",
)

In [9]:
# Keep only the hard relations whose stemmed attribute has a known category,
# preserving their original order.
categorised = hard_relations[hard_relations["attribute"].isin(list(category_mapping))]

# Build the output frame in one step instead of accumulating rows with
# iterrows(); the page url is the index of ``hard_relations``.
true_relations = pd.DataFrame({
    "url": categorised.index.to_numpy(),
    "relation1": categorised["relation"].to_numpy(),
    "attribute": categorised["attribute"].map(category_mapping).to_numpy(),
    "relation2": categorised["related_relationship"].to_numpy(),
    "related_url": categorised["related_subject_url"].to_numpy(),
})
print("Hard relationships processed", len(true_relations))
# NOTE(review): to_csv with default arguments also writes the positional
# index as an unnamed first column; confirm downstream readers expect it
# before switching to index=False.
true_relations.to_csv("info/hard_relationships.csv")
true_relations.sample(18)

Hard relationships processed 8290


Unnamed: 0,url,relation1,attribute,relation2,related_url
1743,East_Akkala_Plains.html,are,Location,in,The_Legend_of_Zelda__Breath_of_the_Wild.html
3718,King_Rhoam's_Journal.html,is,Item,from,The_Legend_of_Zelda__Breath_of_the_Wild.html
742,Bound_Chest.html,is,Object,in,The_Legend_of_Zelda__The_Minish_Cap.html
2254,Freezor.html,are,Enemy,from,The_Legend_of_Zelda__A_Link_to_the_Past.html
189,Anouki_General_Store.html,is,Shop,from,The_Legend_of_Zelda__Spirit_Tracks.html
4519,Mayor_Plen.html,is,Character,in,The_Legend_of_Zelda__Oracle_of_Seasons.html
4009,Level-8_(BS_The_Legend_of_Zelda).html,is,Dungeon,in,BS_The_Legend_of_Zelda.html
177,Anju's_Grandmother.html,is,Character,from,The_Legend_of_Zelda__Majora%27s_Mask.html
594,Blue_ChuChu.html,are,Enemy,in,ChuChu.html#The_Legend_of_Zelda:_Breath_of_the...
5474,Pikit.html,are,Enemy,from,The_Legend_of_Zelda__A_Link_to_the_Past.html
