In [None]:
import json
import os
import pandas as pd
from bs4 import BeautifulSoup

## Load data  
Load the infobox data into a DataFrame to identify all the relationships that exist in the dataset.

In [None]:
# Read both infobox dumps. The encoding is given explicitly because open()
# otherwise falls back to the platform default, which is not UTF-8 everywhere.
with open("info/infoboxes.wikia.json", "r", encoding="utf-8") as r:
    infobox_wikia = json.load(r)

with open("info/infoboxes.gamepedia.json", "r", encoding="utf-8") as r:
    infobox_gamepedia = json.load(r)

# Flatten both infobox mappings (page -> relation -> list of values) into one
# row per value. Each value is indexed positionally: element 1 goes to the
# "type" column and element 0 to the "value" column. Wikia rows come first,
# then gamepedia rows, in the iteration order of the loaded dictionaries.
types = [
    [page, relation, value[1], value[0], source]
    for source, pages in (("wikia", infobox_wikia), ("gamepedia", infobox_gamepedia))
    for page, relations in pages.items()
    for relation, values in relations.items()
    for value in values
]

infobox = pd.DataFrame(types, columns=["page", "relation", "type", "value", "source"])

# Summary statistics of how many rows each relation has.
infobox.relation.value_counts().describe()

### Filter out rows whose property occurs fewer than `counts` times

In [None]:
# Minimum number of rows a relation must have to be kept.
counts = 10
# Keep only the rows belonging to relation groups with at least `counts` rows.
filtered = infobox.groupby('relation').filter(lambda group: len(group) >= counts)
# The five most frequent relations among the rows that were kept.
filtered['relation'].value_counts().head(5)

## Genders  
Let's start with genders

In [None]:
# Take the GENDER rows, drop the "relation" column (constant after the
# selection) and index the result by page.
genders = (
    infobox.loc[infobox["relation"] == "GENDER"]
    .copy()
    .drop(columns=["relation"])
    .set_index("page")
)
# Rename the three remaining columns positionally.
genders.columns = ["type", "gender", "source"]

def get_tag(r):
    """Return the tag name of the first node parsed from a non-string gender value.

    Parameters
    ----------
    r : mapping (e.g. a DataFrame row)
        Must provide the ``"type"`` and ``"gender"`` entries.

    Returns
    -------
    str or None
        ``None`` when ``r["type"]`` is ``"string"``, when the parsed markup
        has no ``<body>`` (for instance an empty value), or when the first
        node has no tag name; otherwise the name of the first node that
        follows ``<body>``.
    """
    if r["type"] == "string":
        return None
    soup = BeautifulSoup(r["gender"], "lxml")
    body = soup.body
    if body is None:
        # Nothing was parsed, so there is no tag to report.
        return None
    # `next_element` is the node parsed immediately after <body>.
    first = body.next_element
    # Nodes without a tag name (or no node at all) yield None instead of raising.
    return getattr(first, "name", None)

# Store, for every row, the tag name computed by get_tag (None for string rows).
genders["tag"] = genders.apply(get_tag, axis=1)
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
genders.info()
# Cap the sample size so a frame with fewer than five rows does not raise.
genders.sample(min(5, len(genders)))

Keep only the properties whose tag is neither `sup` nor `br`, and inspect the remaining ones to see whether they are valuable.

In [None]:
# Discard every row tagged "sup" or "br", then count the tags that are left.
genders = genders.loc[~genders["tag"].isin(["sup", "br"])]
genders["tag"].value_counts()

It seems there is nothing valuable there, so let's use only the string properties; we'll need to clean them a bit, though.

In [None]:
# From here on only the plain-string values are used.
genders = genders[genders["type"] == "string"]
# DataFrame.info() prints the summary itself and returns None; the former
# print() wrapper just emitted an extra "None" line.
genders.info()

def get_gender(values):
    """Reduce a collection of gender values to a single label.

    Returns the sole element when ``values`` holds exactly one item and that
    item is ``'Male'`` or ``'Female'``; returns ``'Undefined'`` in every other
    case (empty, several items, or any other single value).
    """
    if len(values) != 1:
        return 'Undefined'
    (only,) = values
    return only if only in ('Male', 'Female') else 'Undefined'

# Collect each page's gender values into a set and reduce it to one label.
merged_genders = genders.groupby(genders.index)['gender'].apply(set).apply(get_gender)
# Write data rows only. The LOAD CSV statement in this notebook reads the file
# positionally (line[0], line[1]) without headers, while Series.to_csv emits a
# header line by default in current pandas versions.
merged_genders.to_csv("info/genders.csv", header=False)

```
CREATE (:Gender {value:'Male'}), (:Gender {value:'Female'}), (:Gender {value:'Undefined'})
```  

```
LOAD CSV FROM 'file:///genders.csv' AS line WITH line
MATCH (p:Page{url:line[0]}) 
MATCH (g:Gender{value:line[1]})
MERGE (p)-[:IsA{property:'gender'}]->(g)
```