In [None]:
import json
import os
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

## Load data  
Load the infobox data into a DataFrame to identify all the relationships that exist in the dataset.

In [None]:
def load_infoboxes(info_dir="info"):
    """Load the wikia and gamepedia infobox dumps into one long-format DataFrame.

    Each JSON file is read as a mapping ``page -> relation -> list of values``.
    Every value is indexed positionally: ``value[0]`` goes to the ``value``
    column and ``value[1]`` goes to the ``type`` column.

    Parameters
    ----------
    info_dir : str, optional
        Directory containing ``infoboxes.wikia.json`` and
        ``infoboxes.gamepedia.json``. Defaults to ``"info"``.

    Returns
    -------
    pandas.DataFrame
        One row per value with columns ``page``, ``relation``, ``type``,
        ``value`` and ``source``; wikia rows come first, then gamepedia rows.
    """
    rows = []
    for source in ("wikia", "gamepedia"):
        path = os.path.join(info_dir, "infoboxes.{}.json".format(source))
        # JSON is UTF-8 by specification; do not rely on the platform default encoding.
        with open(path, "r", encoding="utf-8") as handle:
            pages = json.load(handle)

        for page, relations in pages.items():
            for relation, values in relations.items():
                for value in values:
                    rows.append([page, relation, value[1], value[0], source])

    return pd.DataFrame(rows, columns=["page", "relation", "type", "value", "source"])

# Build the combined infobox table, then summarise how often each relation occurs.
infobox = load_infoboxes()
relation_counts = infobox["relation"].value_counts()
relation_counts.describe()

### Filter out rows whose property occurs fewer than `counts` times

In [None]:
# Minimum number of rows a relation needs in order to be kept.
counts = 10
# Attach to every row the total number of rows sharing its relation.
relation_sizes = infobox["relation"].map(infobox["relation"].value_counts())
filtered = infobox[relation_sizes >= counts]
filtered["relation"].value_counts().head(5)

## Genders  
Let's start with genders

In [None]:
# Keep only the GENDER rows, indexed by page; the raw "value" column becomes "gender".
is_gender = infobox["relation"] == "GENDER"
genders = (
    infobox.loc[is_gender]
    .drop(columns=["relation"])
    .set_index("page")
    .rename(columns={"value": "gender"})
)

def get_tag(r):
    """Return the name of the first node inside the parsed gender markup.

    Parameters
    ----------
    r : mapping
        Row exposing a ``"type"`` entry and a ``"gender"`` entry with the raw value.

    Returns
    -------
    str or None
        ``None`` when the row type is ``"string"``, or when the parsed markup
        has no ``<body>`` or no node after it; otherwise the ``name`` of the
        first node that follows ``<body>``.
    """
    if r["type"] == "string":
        return None
    soup = BeautifulSoup(r["gender"], "lxml")
    body = soup.body
    # Markup that parses to an empty document has no <body>; return None
    # instead of failing with AttributeError on `None.next`.
    if body is None:
        return None
    # `next_element` is the documented spelling of the legacy `.next` alias.
    first = body.next_element
    return None if first is None else first.name

# Record, for every row, the tag name reported by get_tag (None for plain strings).
genders["tag"] = genders.apply(get_tag, axis=1)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
genders.info()
genders.sample(5)

Keep only the properties without `sup` or `br` tags, and inspect the remaining ones to see whether they are valuable.

In [None]:
# Discard rows whose leading tag is <sup> or <br>, then tally the tags that are left.
unwanted_tags = ["sup", "br"]
genders = genders[~genders["tag"].isin(unwanted_tags)]
genders["tag"].value_counts()

It seems there is nothing valuable among them, so let's use only the string properties; we'll need to clean them a bit, though.

In [None]:
# Keep only the plain-string gender values.
genders = genders[genders["type"] == "string"]
# DataFrame.info() prints its report itself and returns None; print() around it
# only added a stray "None" line.
genders.info()

def get_gender(values):
    """Reduce a page's collected gender values to a single label.

    Returns the sole value when ``values`` holds exactly one element and that
    element is ``'Male'`` or ``'Female'``; every other input maps to
    ``'Undefined'``.
    """
    if len(values) != 1:
        return 'Undefined'
    (only,) = values
    return only if only in ('Male', 'Female') else 'Undefined'

# Collapse each page's gender values into a set and reduce it to a single label.
merged_genders = genders.groupby(genders.index)['gender'].apply(set).apply(get_gender)
# The Cypher import for this file is a headerless `LOAD CSV FROM` that addresses
# columns as line[0] / line[1]. Series.to_csv writes a header row by default on
# pandas >= 0.24, so disable it explicitly to keep the file header-free on
# every pandas version.
merged_genders.to_csv("info/genders.csv", header=False)

```
CREATE (:Gender {value:'Male'}), (:Gender {value:'Female'}), (:Gender {value:'Undefined'})
```  

```
LOAD CSV FROM 'file:///genders.csv' AS line WITH line
MATCH (p:Page{url:line[0]}) 
MATCH (g:Gender{value:line[1]})
MERGE (p)-[:IsA{property:'gender'}]->(g)
```

## First appearance

In [None]:
# Call the loader again so `infobox` is rebuilt from the JSON files for this section.
infobox = load_infoboxes()

In [None]:
# Keep only the FIRST_APPEARANCE rows, indexed by page; the raw "value" column becomes "first".
is_first_appearance = infobox["relation"] == "FIRST_APPEARANCE"
first_appereance = (
    infobox.loc[is_first_appearance]
    .drop(columns=["relation"])
    .set_index("page")
    .rename(columns={"value": "first"})
)

Get links

In [None]:
def get_link(r):
    """Extract the target of the first hyperlink found in a row's markup.

    Parameters
    ----------
    r : mapping
        Row exposing a ``"type"`` entry and a ``"first"`` entry with the raw value.

    Returns
    -------
    str or float
        The ``href`` of the first ``<a>`` element with any ``#fragment``
        removed. ``np.nan`` when the row type is ``"string"``, when there is no
        anchor, when the anchor has no (or an empty) ``href``, or when the
        ``href`` consists of a fragment only.
    """
    if r["type"] == "string":
        return np.nan
    anchor = BeautifulSoup(r["first"], "lxml").find('a')
    if anchor is None:
        return np.nan
    href = anchor.get('href')
    if not href:
        return np.nan
    target = href.split("#", 1)[0]
    # A fragment-only href (e.g. "#cite_note-1") leaves an empty target. Report
    # it as missing so it is not mistaken for a real link downstream.
    return target if target else np.nan

# Apply get_link row by row; rows for which it returns NaN get a missing "link".
first_appereance["link"] = first_appereance.apply(get_link, axis=1)

In [None]:
# Matches links that start (after an optional "../") with four digits and
# contain a literal ".html"; group 2 is the link without the "../" prefix.
# The earlier pattern ended in the character class `[\.html]`, which accepts a
# single '.', 'h', 't', 'm' or 'l' and therefore matched or truncated unrelated
# links (e.g. "2000_AD_comic" was captured as "2000_AD_com").
year_re = re.compile(r"^(\.\./)?([0-9]{4}.*\.html)")

def get_first_appereance(values):
    """Split a page's links into a target page and a year link.

    Parameters
    ----------
    values : iterable of str
        Links collected for one page.

    Returns
    -------
    list
        ``[page, year]``. ``year`` is group 2 of ``year_re`` for the first
        matching link; ``page`` is the first non-matching link with a leading
        ``"../"`` removed. Either entry is ``None`` when no such link exists.

    Links are visited in sorted order so the result does not depend on the
    iteration order of the input (set ordering of strings varies between
    interpreter runs). Empty links are ignored.
    """
    page = None
    year = None
    for link in sorted(values):
        if not link:
            # An empty link carries no target; never let it become the page.
            continue
        match = year_re.search(link)
        if match:
            if year is None:
                year = match.group(2)
        elif page is None:
            page = link[3:] if link.startswith("../") else link
    return [page, year]
# Only rows that produced a link are kept; each page's distinct links are then
# gathered into a set and resolved into a [page, year] pair.
first_appereance = first_appereance.dropna(subset=["link"])
links_by_page = first_appereance.groupby(first_appereance.index)['link'].apply(set)
appereances = links_by_page.apply(get_first_appereance)

In [None]:
# Expand the [entity, year] pairs into two columns, keeping the page index.
appereances = pd.DataFrame(
    appereances.tolist(),
    index=appereances.index,
    columns=["entity", "year"],
)

# Pages without a resolved entity are left out of the export.
has_entity = appereances["entity"].notna()
appereances[has_entity].to_csv("info/first_appereance.csv")

```
LOAD CSV WITH HEADERS FROM 'file:///first_appereance.csv' AS line WITH line
MATCH (p1:Page{url:line.page}) 
MATCH (p2:Page{url:line.entity}) 
MERGE (p1)-[:AppearsIn{first:true}]->(p2)
```

## Other appearances

In [None]:

#def get_release_years(years):
#    year_set = set([y for y in years if not pd.isna(y)])
#    return year_set