In [44]:
import kuzu
from yfiles_jupyter_graphs_for_kuzu import KuzuGraphWidget
import lancedb
import pandas as pd
import hashlib


In [45]:
# Connect to the LanceDB store at the relative path ../lancedb.
db = lancedb.connect("../lancedb")

# Pull each table into pandas. `to_pandas()` already returns a DataFrame,
# so no extra pd.DataFrame(...) wrapper is needed.
claims_df = db.open_table("claims").to_pandas()
entities_df = db.open_table("entities").to_pandas()
sources_df = db.open_table("sources").to_pandas()

In [65]:
# List the column names of each table, with a rule between consecutive tables.
column_rule = "---------------------------------"
print(sources_df.columns.values)
for later_frame in (claims_df, entities_df):
    print(column_rule)
    print(later_frame.columns.values)

['filename' 'id' 'location' 'markdown']
---------------------------------
['filename' 'node_name' 'index' 'text' 'composite_id']
---------------------------------
['start' 'end' 'text' 'label' 'score' 'filename' 'nodename' 'index'
 'text_md5' 'composite_id']


Based on the columns listed above, I could add "from" and "to" columns to each of these tables, and then also give each of them a `node_id` and a `name`.

In [46]:
# Display the sources table to inspect its contents.
sources_df

Unnamed: 0,filename,id,location,markdown
0,2202.05901v2.pdf,2c494b74cd387935028898cf4d998e1c,file://stores/inputs/2202.05901v2.pdf,## Page 1\n\nIdentiﬁcation of Flux Rope Orient...
1,2409.09230v1.pdf,824512f309bf431858125747565b05f6,file://stores/inputs/2409.09230v1.pdf,## Page 1\n\nSolar Physics\nDOI: 10.1007/ ••••...
2,source3,08260e29029420a6ba81dcc7cd7dc033,https://gc.copernicus.org/articles/2/55/2019/,![](https://contentmanager.copernicus.org/8009...


In [47]:
# Composite key for each claim row: "<filename>_<node_name>_<index>".
claim_index_text = claims_df['index'].astype(str)
claim_key_prefix = claims_df['filename'] + '_' + claims_df['node_name']
claims_df['composite_id'] = claim_key_prefix + '_' + claim_index_text

In [48]:
# Display the claims table, now including the composite_id column.
claims_df

Unnamed: 0,filename,node_name,index,text,composite_id
0,2202.05901v2.pdf,Cardinal,0,This study demonstrates that convolutional neu...,2202.05901v2.pdf_Cardinal_0
1,2202.05901v2.pdf,Cardinal,1,The neural network trained on full duration sy...,2202.05901v2.pdf_Cardinal_1
2,2202.05901v2.pdf,Cardinal,2,Our work shows that neural networks struggle t...,2202.05901v2.pdf_Cardinal_2
3,2202.05901v2.pdf,Supporting,0,This research uses CNNs trained with magnetic ...,2202.05901v2.pdf_Supporting_0
4,2202.05901v2.pdf,Supporting,1,The study uses a physics-based flux rope model...,2202.05901v2.pdf_Supporting_1
...,...,...,...,...,...
96,source3,limitations,0,The inherent uncertainty due to the challenge ...,source3_limitations_0
97,source3,futureDirections,0,Development of open tools that leverage progra...,source3_futureDirections_0
98,source3,keyFindings,0,The rate at which new people are attending AGU...,source3_keyFindings_0
99,source3,references,0,"Narock, T., Hasnain, S., and Stephan, R.: Iden...",source3_references_0


In [49]:
def _md5_hex(value):
    """Return the hex MD5 digest of ``str(value)`` encoded with the default codec."""
    return hashlib.md5(str(value).encode()).hexdigest()


# Fingerprint each entity's text so equal strings share the same hash.
entities_df['text_md5'] = entities_df['text'].apply(_md5_hex)

In [50]:
# Composite key for each entity row: "<filename>_<nodename>_<index>".
entity_index_text = entities_df['index'].astype(str)
entity_key_prefix = entities_df['filename'] + '_' + entities_df['nodename']
entities_df['composite_id'] = entity_key_prefix + '_' + entity_index_text

In [51]:
# Display the entities table, now including the text_md5 and composite_id columns.
entities_df

Unnamed: 0,start,end,text,label,score,filename,nodename,index,text_md5,composite_id
0,29,58,convolutional neural networks,Technology,0.803726,2202.05901v2.pdf,Cardinal,0,d19a182265bb19e67de7745fa0777821,2202.05901v2.pdf_Cardinal_0
1,60,64,CNNs,Technology,0.593342,2202.05901v2.pdf,Cardinal,0,b2ee264f68cd2a4f9487fd2b0b985aac,2202.05901v2.pdf_Cardinal_0
2,118,154,interplanetary coronal mass ejection,Event,0.875200,2202.05901v2.pdf,Cardinal,0,2fd510e00fe9f53428cb353738a472e9,2202.05901v2.pdf_Cardinal_0
3,156,160,ICME,Event,0.855814,2202.05901v2.pdf,Cardinal,0,ef547ab69a1e98b2b518ff05ae1420ee,2202.05901v2.pdf_Cardinal_0
4,4,18,neural network,Model,0.773247,2202.05901v2.pdf,Cardinal,1,510bf9286c4642c7ac7fa6dc0e15f94f,2202.05901v2.pdf_Cardinal_1
...,...,...,...,...,...,...,...,...,...,...
252,139,151,Fall Meeting,Event,0.524235,source3,references,1,c2871c7df3434a05af55d285999fd189,source3_references_1
253,153,163,EarthArXiv,Journal,0.894782,source3,references,1,96882fe399475578c855366a5b5ebac0,source3_references_1
254,182,189,doi.org,Publisher,0.736378,source3,references,1,0a75e822c6f3334851117f8769a30e1c,source3_references_1
255,190,198,10.17605,DOI,0.772052,source3,references,1,83891a209f4432bbcf0e49ec8414d04b,source3_references_1


In [52]:
# Build the edge list for the graph by stacking two kinds of edges:
#   claims_df:   filename     -> composite_id
#   entities_df: composite_id -> text
source_to_claim = pd.DataFrame({
    'from': claims_df['filename'],
    'to': claims_df['composite_id'],
})
claim_to_entity = pd.DataFrame({
    'from': entities_df['composite_id'],
    'to': entities_df['text'],
})

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating the row labels of each input frame.
# NOTE(review): rows are not de-duplicated, so an entity text that occurs more
# than once under the same composite_id yields repeated edges; confirm intended.
relations_df = pd.concat([source_to_claim, claim_to_entity], ignore_index=True)


In [53]:
# Every distinct source filename, claim composite_id and entity text becomes one node id.
node_id_columns = [
    sources_df['filename'],
    claims_df['composite_id'],
    entities_df['text'],
]
all_node_ids = pd.concat([column.rename('id') for column in node_id_columns])
nodes = pd.DataFrame(all_node_ids.unique(), columns=['id'])


In [54]:
# Open a new in-memory Kuzu database and a connection used to run queries against it.
# NOTE(review): this rebinds `db`, which held the LanceDB connection created when the
# tables were loaded; that connection is no longer reachable under this name.
db = kuzu.Database()
conn = kuzu.Connection(db)

In [55]:
# Create the node table `Claim`, keyed by the string column `id`.
# IF NOT EXISTS keeps the statement from failing if the table is already present,
# matching how the relationship table is created.
conn.execute("CREATE NODE TABLE IF NOT EXISTS Claim(id STRING PRIMARY KEY)")
# Load the rows of the `nodes` DataFrame, which the query references by name.
# `ignore_errors=true` asks Kuzu to skip rows it cannot load instead of aborting the COPY.
conn.execute("COPY Claim FROM nodes (ignore_errors=true)")

<kuzu.query_result.QueryResult at 0x662166b14110>

In [56]:
# Create the relationship table `rels` (if it is not there yet); its edges run from Claim nodes to Claim nodes.
conn.execute("CREATE REL TABLE IF NOT EXISTS rels( FROM Claim TO Claim)")

<kuzu.query_result.QueryResult at 0x662166b141d0>

In [57]:
# Bulk-load the edges into `rels` from the `relations_df` DataFrame, which the query references by name.
res = conn.execute(" COPY rels FROM relations_df")

In [58]:
# Cypher pattern that matches every relationship together with both of its endpoints.
whole_graph_query = "MATCH (a)-[b]->(c) RETURN *"

# Reuse the open connection for the widget, then render the query result.
g = KuzuGraphWidget(conn)
g.show_cypher(whole_graph_query)

GraphWidget(layout=Layout(height='800px', width='100%'))