In [11]:
from neo4j.v1 import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "one"))

In [18]:
import pandas as pd
import json

entities = pd.read_csv("../relation_extraction/basic/entities.csv").set_index("id")
types = pd.read_csv("../relation_extraction/basic/types.csv").set_index("id")
relationships = pd.read_csv("../relation_extraction/basic/relationships.csv", index_col = 0)
adv_relationships = pd.read_csv("../relation_extraction/basic/adv_relationships.csv", index_col = 0)

In [19]:
entities.head()

Unnamed: 0_level_0,name,page
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1-up Doll,1-up_Doll.html
2,100th Ring,100th_Ring.html
3,15-second Game,15-second_Game.html
4,1986,1986.html
5,1987,1987.html


In [20]:
relationships.head()

Unnamed: 0,from,relationship,to
0,1,FIRST_APPEARANCE,8636
1,1,FIRST_APPEARANCE,6
2,1,FOUND,3766
3,1,USE,4550
4,2,FIRST_APPEARANCE,7747


In [21]:
adv_relationships.head()

Unnamed: 0,source,source_str,relation,name,dst_str,dst
0,1,1-up_Doll.html,item_from,Zelda II,,
1,1,1-up_Doll.html,live_in,which,,
2,2,100th_Ring.html,obtain_from,Vasu,Vasu.html,8270.0
3,2,100th_Ring.html,obtain_by,end,,
4,3,15-second_Game.html,locat_from,The Legend of Zelda,,


In [22]:
types.head()

Unnamed: 0_level_0,page,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1-up_Doll.html,CHARACTER
1,1-up_Doll.html,ITEM
2,100th_Ring.html,ITEM
3,15-second_Game.html,LOCATION
36,2nd_Potion.html,ITEM


In [23]:
grouped_types = types.groupby(types.index)["type"].apply(set).to_frame()

In [24]:
import numpy as np
joint = pd.merge(entities, grouped_types, left_index=True, right_index=True, how='outer')
joint.tail()

Unnamed: 0_level_0,name,page,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8709,Zunari,Zunari.html,{CHARACTER}
8710,Zuta,Zuta.html,"{LOCATION, CHARACTER}"
8711,Zyle,Zyle.html,{CHARACTER}
8712,?,_.html,
8713,???,___.html,"{LOCATION, CHARACTER, ITEM}"


In [25]:
joint.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8713 entries, 1 to 8713
Data columns (total 3 columns):
name    8713 non-null object
page    8713 non-null object
type    6348 non-null object
dtypes: object(3)
memory usage: 272.3+ KB


In [28]:
def escape(string):
    return json.dumps(string)[1:-1]

In [29]:
print(joint.loc[8713]['name'])
print(escape(joint.loc[8713]['name']))
":".join(joint.loc[8713]["type"])

???
???


'LOCATION:CHARACTER:ITEM'

# Node creation

In [None]:
create_template = "CREATE (a:%s {id:%d, name:\"%s\", page:\"%s\"})"
with driver.session() as session:
    for i, row in joint.iterrows():
        type_ = 'OBJECT' if pd.isnull(row['type']) else "OBJECT:"+":".join(row['type'])
        id_ = row.name
        name = escape(row['name'])
        page = row['page']
        insert_stmt = create_template % (type_, id_, name, page) 
        
        result = session.run(insert_stmt)
    session.run("CREATE INDEX ON :OBJECT(id)")

# Relationship creation

In [None]:
relationship_template = """MATCH (from:OBJECT{id:%d}),(to:OBJECT{id:%d})
CREATE (from)-[:%s]->(to)"""

with driver.session() as session:
    for i, row in relationships.iterrows():
        if i % 500 == 0:
            print(i)
        try:
            relationship_stmt = relationship_template % (row["from"], row["to"], row["relationship"])
            session.run(relationship_stmt)
        except Exception as inst:
            print("Error", i)
            print(inst)
            break

# Known as relationship insertion   

In [8]:
import json

known_as:dict = None 
    
with open("../link_analysis/known_as.json", "r") as r:
    known_as = json.load(r)

In [13]:
aliases = set()
for page in known_as:
    aliases.update(known_as[page])

In [53]:
create_template = "CREATE (s:ALIAS {value:\"%s\"})"
with driver.session() as session:
    for alias in aliases:
        insert_stmt = create_template % (escape(alias)) 
        result = session.run(insert_stmt)
    session.run("CREATE INDEX ON :ALIAS(value)")

In [55]:
relationship_template = """MATCH (alias:ALIAS{value:\"%s\"}),(object:OBJECT{id:%d})
CREATE (object)-[:KNOWN_AS]->(alias)"""

def get_id_from_page(page):
    return entities[entities['page'] == page].index[0]



ii = 0
with driver.session() as session:
    for page in known_as:
        for alias in known_as[page]:
            if ii % 500 == 0:
                print(ii)
            try:
                relationship_stmt = relationship_template % (escape(alias), get_id_from_page(page))
                session.run(relationship_stmt)
            except Exception as inst:
                print("Error", ii)
                print(inst)
            ii += 1

0
500
Error 764
index 0 is out of bounds for axis 0 with size 0
1000
1500
Error 1908
index 0 is out of bounds for axis 0 with size 0
2000
2500
Error 2971
index 0 is out of bounds for axis 0 with size 0
3000
3500
Error 3766
index 0 is out of bounds for axis 0 with size 0
4000
4500
Error 4623
index 0 is out of bounds for axis 0 with size 0
5000
Error 5045
index 0 is out of bounds for axis 0 with size 0
5500
Error 5616
index 0 is out of bounds for axis 0 with size 0
Error 5830
index 0 is out of bounds for axis 0 with size 0
6000
6500
Error 6613
index 0 is out of bounds for axis 0 with size 0
7000
7500
Error 7710
index 0 is out of bounds for axis 0 with size 0
8000
8500
9000
Error 9403
index 0 is out of bounds for axis 0 with size 0
9500
10000
