In [None]:
import json
import pandas as pd

from nltk.stem import PorterStemmer
from extracted_entities import Relation, RelationDetails, ParsedParagraph

# Load the extraction results. Later cells index this object as
# extracted_info[entity][source]["paragraphs"].
# The encoding is pinned so decoding does not depend on the platform's locale
# default (JSON text is UTF-8 by specification).
with open("info/text_extraction.json", "r", encoding="utf-8") as extraction_file:
    extracted_info = json.load(extraction_file)

In [None]:
from slugify import slugify

def dumb_compare(str1_, str2_):
    """Loosely decide whether two strings refer to the same thing.

    Both inputs are slugified and the slugs are then tested for substring
    containment in either direction.

    Parameters:
        str1_: first string to compare.
        str2_: second string to compare.

    Returns:
        False when either input is None, when either slug is empty, or when
        the slug of ``str2_`` is shorter than 3 characters; otherwise True if
        one slug contains the other.

    Note:
        The minimum-length check is applied to the second argument only, so
        the function is not symmetric for very short strings.
    """
    # Missing values (e.g. JSON nulls) can never match; bail out before
    # slugify, which presumably does not accept None -- TODO confirm.
    if str1_ is None or str2_ is None:
        return False
    str1 = slugify(str1_)
    str2 = slugify(str2_)
    if not str1 or not str2:
        return False
    # Very short slugs would be contained in almost anything.
    if len(str2) < 3:
        return False
    return str1 in str2 or str2 in str1

In [None]:
stemmer = PorterStemmer()


def _find_entity_link(subject, links):
    """Return the href of the last link whose href or text loosely matches ``subject``.

    Matching is delegated to ``dumb_compare``. Returns None when no link matches.
    """
    matched_href = None
    for link in links:
        if dumb_compare(subject, link["href"]) or dumb_compare(subject, link["text"]):
            matched_href = link["href"]
    return matched_href


def _resolve_related(attribute, details, links):
    """Find the detail sharing the stemmed ``attribute`` and its hard link.

    Returns a ``(relation, subject, href)`` triple. All three are None when
    the paragraph has no links, no details, or no detail whose stemmed
    attribute equals ``attribute``.
    """
    related_relationship = related_entity = related_entity_link = None
    if not (links and details):
        return related_relationship, related_entity, related_entity_link
    for det in details:
        if stemmer.stem(det["attribute"]) != attribute:
            continue
        href = _find_entity_link(det["subject"], links)
        # Always update the three values together so they come from the same
        # detail. A later matching detail without a link must not be paired
        # with the link that was resolved for an earlier detail's subject.
        if href is not None or related_entity_link is None:
            related_relationship = det["relation"]
            related_entity = det["subject"]
            related_entity_link = href
    return related_relationship, related_entity, related_entity_link


# One output row per relation found in any paragraph of any known source.
rels = []
sources = ["wikia", "gamepedia"]
for entity, per_source in extracted_info.items():
    for source in sources:
        if source not in per_source:
            continue
        source_info = per_source[source]
        paragraphs = source_info["paragraphs"]
        if not paragraphs:
            continue
        for paragraph in paragraphs:
            paragraph_relations = paragraph["relations"]
            if not paragraph_relations:
                continue
            # Details and links are per paragraph, so read them once here
            # rather than once per relation.
            details = paragraph["details"]
            links = paragraph["links"]
            for rel in paragraph_relations:
                # Stem once and reuse for both the lookup and the output row.
                attribute = stemmer.stem(rel["attribute"])
                related_relationship, related_entity, related_entity_link = \
                    _resolve_related(attribute, details, links)
                rels.append([entity,
                             source,
                             source_info["name"],
                             rel["subject"],
                             rel["relation"],
                             attribute,
                             related_relationship,
                             related_entity,
                             related_entity_link
                            ])

relations = pd.DataFrame(rels, columns=["url", "source", "name", "subject",
                                        "relation", "attribute",
                                        "related_relationship", "related_subject", "related_subject_url"]).set_index("url")
relations.info()

In [None]:
def mentions_itself(r):
    """Tell whether a relation row's subject loosely matches the row's own name."""
    own_name, subject = r["name"], r["subject"]
    return dumb_compare(own_name, subject)

# Flag every row (adds a "mention" column to `relations` in place).
relations["mention"] = relations.apply(mentions_itself, axis=1)

# NOTE: from here on `mentions_itself` is the filtered DataFrame, no longer
# the function defined above; later cells rely on this name.
mentions_itself = relations[relations["mention"]]
print("Self referencing relationships found", len(mentions_itself))

In [None]:
# Keep only the self-referencing rows for which a URL of the related subject
# was resolved.
hard_relations = mentions_itself[(pd.notna(mentions_itself.related_subject_url))]
print("\"Hard relationships found\"", len(hard_relations))
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# cap the preview size; the fixed seed keeps the preview stable across re-runs.
hard_relations.sample(min(6, len(hard_relations)), random_state=0)

In [None]:
# Frequency of each (stemmed) attribute among the hard relations; only
# attributes occurring more than 6 times are kept for inspection.
counts = hard_relations["attribute"].value_counts()
is_frequent = counts > 6
to_consider = counts[is_frequent]
to_consider

### Categories for "hard relationships"

In [None]:
# Stemmed attribute -> category label. Keys must match the values found in the
# "attribute" column (which holds stemmer output); insertion order is kept
# because a later cell joins the values in this order.
category_mapping = {
    "charact": "Character",
    "enemi": "Enemy",
    "boss": "Boss",
    "item": "Item",
    "locat": "Location",
    "object": "Object",
    "dungeon": "Dungeon",
    "quest": "Quest",
    "group": "Group",
    "song": "Song",
    "shrine": "Shrine",
    "mask": "Mask",
    "shop": "Shop",
    "sword": "Sword",
    "weapon": "Weapon",
    "stage": "Stage",
    "shield": "Shield",
}

In [None]:
# Build one record per hard relation whose attribute has a known category.
# `hard_relations` is indexed by url, so the index label yielded by iterrows()
# is the page url.
true_relations = pd.DataFrame(
    [
        [
            url,
            row["relation"],
            category_mapping[row["attribute"]],
            row["related_relationship"],
            row["related_subject_url"],
        ]
        for url, row in hard_relations.iterrows()
        if row["attribute"] in category_mapping
    ],
    columns=["url", "relation1", "attribute", "relation2", "related_url"],
)
print("Hard relationships processed", len(true_relations))
# Written with the default settings, i.e. the positional index is included as
# an unnamed first column.
true_relations.to_csv("info/hard_relationships.csv")
true_relations.head()

```
LOAD CSV WITH HEADERS FROM 'file:///hard_relationships.csv' as line
MATCH (p1:Page{url:line.url})
MATCH (p2:Page{url:line.related_url})
MERGE (p1)-[:IsA{attribute:line.attribute, relation:line.relation2}]->(p2)
```

In [None]:
# Only the last expression of a cell is rendered automatically, so the counts
# have to be displayed explicitly or they are silently discarded.
display(true_relations.relation2.value_counts())
true_relations[true_relations.relation2=="by"]

## Processing for soft relationships

#### Experimental: I have no idea what I'm doing here
(generate one node per category label for the "soft" relationships)

In [None]:
# Render one "(:Label)" node pattern per category label and join them into a
# single CREATE statement (displayed as the cell output).
node_template = "(:%s)"
node_patterns = [node_template % label for label in category_mapping.values()]
"CREATE " + ", ".join(node_patterns)

```
CREATE (:Character), (:Enemy), (:Boss), (:Item), (:Location), (:Object), (:Dungeon), (:Quest), (:Group), (:Song), (:Shrine), (:Mask), (:Shop), (:Sword), (:Weapon), (:Stage), (:Shield)
```