# Simple visual

Read the claims, entities, and sources tables from LanceDB, reshape them into node and relationship dataframes, and load those into KuzuDB.  Then run a few simple graph visualizations to inspect the relations.

Next steps:
- Try the Kuzu vector extension: https://docs.kuzudb.com/extensions/vector/
- Try the Kuzu full-text search extension: https://docs.kuzudb.com/extensions/full-text-search/


In [1]:
import kuzu
from yfiles_jupyter_graphs_for_kuzu import KuzuGraphWidget
import lancedb
import pandas as pd
import hashlib
from sentence_transformers import SentenceTransformer


In [2]:
# Connect to the LanceDB store that sits one directory above this notebook.
db = lancedb.connect("../lancedb")

# Table.to_pandas() already returns a pandas DataFrame, so the tables are used
# directly instead of being wrapped in another pd.DataFrame(...) call.
claims_df = db.open_table("claims").to_pandas()
entities_df = db.open_table("entities").to_pandas()
sources_df = db.open_table("sources").to_pandas()

In [3]:
# Print the column names of each table, with a divider line between tables,
# so the three schemas can be compared side by side.
for frame_position, frame in enumerate((sources_df, claims_df, entities_df)):
    if frame_position > 0:
        print("---------------------------------")
    print(frame.columns.values)

['filename' 'id' 'location' 'markdown']
---------------------------------
['filename' 'node_name' 'index' 'text']
---------------------------------
['start' 'end' 'text' 'label' 'score' 'filename' 'nodename' 'index']


Based on the columns above, I could add "from" and "to" columns to each of these tables, and also give each of them a node_id and a name.

In [4]:
# Display the sources table (columns: filename, id, location, markdown).
sources_df

Unnamed: 0,filename,id,location,markdown
0,2202.05901v2.pdf,2c494b74cd387935028898cf4d998e1c,file://stores/inputs/2202.05901v2.pdf,## Page 1\n\nIdentiﬁcation of Flux Rope Orient...
1,2409.09230v1.pdf,824512f309bf431858125747565b05f6,file://stores/inputs/2409.09230v1.pdf,## Page 1\n\nSolar Physics\nDOI: 10.1007/ ••••...
2,source3,08260e29029420a6ba81dcc7cd7dc033,https://gc.copernicus.org/articles/2/55/2019/,![](https://contentmanager.copernicus.org/8009...


In [5]:
# Give every claim a key of the form "<filename>_<node_name>_<index>".
# The key is added in place as a new 'composite_id' column on claims_df.
claims_df['composite_id'] = (
    claims_df['filename']
    .add('_')
    .add(claims_df['node_name'])
    .add('_')
    .add(claims_df['index'].astype(str))
)

In [6]:
# Display the claims table, now including the derived composite_id column.
claims_df

Unnamed: 0,filename,node_name,index,text,composite_id
0,2202.05901v2.pdf,Cardinal,0,This study demonstrates that convolutional neu...,2202.05901v2.pdf_Cardinal_0
1,2202.05901v2.pdf,Cardinal,1,The neural network trained on full duration sy...,2202.05901v2.pdf_Cardinal_1
2,2202.05901v2.pdf,Cardinal,2,Our work shows that neural networks struggle t...,2202.05901v2.pdf_Cardinal_2
3,2202.05901v2.pdf,Supporting,0,This research uses CNNs trained with magnetic ...,2202.05901v2.pdf_Supporting_0
4,2202.05901v2.pdf,Supporting,1,The study uses a physics-based flux rope model...,2202.05901v2.pdf_Supporting_1
...,...,...,...,...,...
96,source3,limitations,0,The inherent uncertainty due to the challenge ...,source3_limitations_0
97,source3,futureDirections,0,Development of open tools that leverage progra...,source3_futureDirections_0
98,source3,keyFindings,0,The rate at which new people are attending AGU...,source3_keyFindings_0
99,source3,references,0,"Narock, T., Hasnain, S., and Stephan, R.: Iden...",source3_references_0


In [7]:
# Add a 'text_md5' column holding the hex MD5 digest of each entity's text
# (the value is stringified first, then encoded with the default encoding).
entities_df['text_md5'] = [
    hashlib.md5(str(entity_text).encode()).hexdigest()
    for entity_text in entities_df['text']
]

In [8]:
# Build "<filename>_<nodename>_<index>" for every entity row. This has the same shape as
# the claims' composite_id, so an entity row can be linked to the claim it came from.
entities_df['composite_id'] = (
    entities_df['filename']
    .add('_')
    .add(entities_df['nodename'])
    .add('_')
    .add(entities_df['index'].astype(str))
)

In [9]:
# Display the entities table, now including the derived text_md5 and composite_id columns.
entities_df

Unnamed: 0,start,end,text,label,score,filename,nodename,index,text_md5,composite_id
0,29,58,convolutional neural networks,Technology,0.803726,2202.05901v2.pdf,Cardinal,0,d19a182265bb19e67de7745fa0777821,2202.05901v2.pdf_Cardinal_0
1,60,64,CNNs,Technology,0.593342,2202.05901v2.pdf,Cardinal,0,b2ee264f68cd2a4f9487fd2b0b985aac,2202.05901v2.pdf_Cardinal_0
2,118,154,interplanetary coronal mass ejection,Event,0.875200,2202.05901v2.pdf,Cardinal,0,2fd510e00fe9f53428cb353738a472e9,2202.05901v2.pdf_Cardinal_0
3,156,160,ICME,Event,0.855814,2202.05901v2.pdf,Cardinal,0,ef547ab69a1e98b2b518ff05ae1420ee,2202.05901v2.pdf_Cardinal_0
4,4,18,neural network,Model,0.773247,2202.05901v2.pdf,Cardinal,1,510bf9286c4642c7ac7fa6dc0e15f94f,2202.05901v2.pdf_Cardinal_1
...,...,...,...,...,...,...,...,...,...,...
252,139,151,Fall Meeting,Event,0.524235,source3,references,1,c2871c7df3434a05af55d285999fd189,source3_references_1
253,153,163,EarthArXiv,Journal,0.894782,source3,references,1,96882fe399475578c855366a5b5ebac0,source3_references_1
254,182,189,doi.org,Publisher,0.736378,source3,references,1,0a75e822c6f3334851117f8769a30e1c,source3_references_1
255,190,198,10.17605,DOI,0.772052,source3,references,1,83891a209f4432bbcf0e49ec8414d04b,source3_references_1


In [10]:
# Build the edge list for the graph: one row per directed edge, with the
# endpoint ids in the 'from' and 'to' columns. Two kinds of edges are stacked:
#   source filename    -> claim composite_id   (from claims_df)
#   claim composite_id -> entity text          (from entities_df)
relations_df = pd.concat(
    [
        claims_df[['filename', 'composite_id']].rename(
            columns={'filename': 'from', 'composite_id': 'to'}
        ),
        entities_df[['composite_id', 'text']].rename(
            columns={'composite_id': 'from', 'text': 'to'}
        ),
    ],
    # Renumber the rows so the stacked frame does not carry duplicate index
    # labels inherited from the two input frames.
    ignore_index=True,
)


In [22]:
# TODO: bring in the "text" from claims and add "description" and "desc_embedding" columns.
#
# Build the node list: one row per node with its 'id' and a 'type' tag
# ('source', 'claim' or 'entities'). The type is attached with .assign() so it is
# paired with each id row by row; concatenating a separately built Series along
# axis=1 would align on index labels and mis-pair rows whenever an input frame
# does not have a default 0..n-1 index.
nodes = pd.concat(
    [
        sources_df['filename'].to_frame('id').assign(type='source'),
        claims_df['composite_id'].to_frame('id').assign(type='claim'),
        entities_df['text'].to_frame('id').assign(type='entities'),
    ]
    # 'id' is the node table's primary key, so keep only the first row per id
    # (sources take precedence over claims, and claims over entities).
).drop_duplicates('id')


In [12]:
# Load the "all-MiniLM-L6-v2" SentenceTransformer model; this can take a while the first time
# (presumably because the weights are downloaded and cached — TODO confirm).
# NOTE(review): `model` is not referenced by any later cell shown in this notebook; it looks
# like groundwork for the planned desc_embedding column.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [13]:
# Open a new in-memory Kuzu database and a connection to it.
# NOTE: this rebinds `db`, which held the LanceDB connection until this cell; after it runs,
# `db` refers to the Kuzu database instead.
db = kuzu.Database()
conn = kuzu.Connection(db)
# Install and load Kuzu's `vector` extension in one call (two statements, so the
# call returns a list of two QueryResult objects).
conn.execute("INSTALL vector; LOAD vector;")


[<kuzu.query_result.QueryResult at 0x7ec070734d70>,
 <kuzu.query_result.QueryResult at 0x7ec067c1ce10>]

In [14]:
# Create the single node table used by this graph. Despite its name, "Claim" holds every
# node kind built in `nodes` (source, claim and entities): `id` is the primary key and
# `type` records the kind. IF NOT EXISTS keeps the cell re-runnable, matching how the
# `rels` relationship table is created.
conn.execute("CREATE NODE TABLE IF NOT EXISTS Claim(id STRING PRIMARY KEY, type string)")  # TODO: add description and desc_embedding
# Bulk-load the rows of the `nodes` DataFrame. `ignore_errors=true` makes COPY skip
# erroneous rows instead of aborting the whole load.
conn.execute("COPY Claim FROM nodes (ignore_errors=true)")

<kuzu.query_result.QueryResult at 0x7ec067d13950>

In [15]:
# Create the relationship table `rels`: directed edges from one Claim-table node to another.
# IF NOT EXISTS makes this statement safe to re-run.
conn.execute("CREATE REL TABLE IF NOT EXISTS rels( FROM Claim TO Claim)")

<kuzu.query_result.QueryResult at 0x7ec067d13bb0>

In [16]:
# Bulk-load the edges from the `relations_df` DataFrame, whose 'from' and 'to' columns hold node ids.
# NOTE(review): `res` is not read by any later cell shown in this notebook.
res = conn.execute(" COPY rels FROM relations_df")

In [17]:
# Create the graph widget on top of the existing Kuzu connection `conn`.
g = KuzuGraphWidget(conn)

def get_node_color(node):
    """Return the display colour for a graph node based on its ``type`` property.

    Args:
        node: Mapping with a ``"properties"`` mapping that contains a ``"type"`` entry.

    Returns:
        ``"blue"`` for ``"source"``, ``"green"`` for ``"claim"``, ``"purple"`` for
        ``"entities"``, and ``"gray"`` for any other type value.

    Raises:
        KeyError: If ``"properties"`` or ``"type"`` is missing from ``node``.
    """
    colour_by_type = {
        "source": "blue",
        "claim": "green",
        "entities": "purple",
    }
    # Any type without its own colour falls back to gray.
    return colour_by_type.get(node["properties"]["type"], "gray")

# Configure how nodes of the "Claim" table are drawn. All nodes in this graph live in that
# one table, so this applies to sources, claims and entities alike: the colour comes from
# get_node_color, and the node's `type` is shown as a label below it.
g.add_node_configuration(
    "Claim",
    color=get_node_color,  # type: ignore
    text=lambda node: {"text": node["properties"]["type"], "position": "south"},  # type: ignore
)
# set up configuration for the graph
# Custom configuration for nodes
# g.add_node_configuration(
#     "Claim",
#     color="red",   # type: ignore
#      text= lambda node : {   # type: ignore
#          "text": node["properties"]["type"],
#          "position": "south",
#     }
# )

# Display the entire graph

In [26]:
# Display the entire graph: match every (a)-[b]->(c) pattern and return all bound variables.
g.show_cypher("MATCH (a)-[b]->(c) RETURN *")


GraphWidget(layout=Layout(height='800px', width='100%'))

In [19]:
# g.show_cypher("MATCH (a)-[]->(intermediate_node {type: 'claim'})-[]->(c) RETURN *")
# g.show_cypher("MATCH conn_path = (a)-[]->(intermediate_node {type: 'claim'})-[]->(c) RETURN a, c, conn_path")
# g.show_cypher("MATCH conn_path = (a)-[]->(intermediate_node {type: 'claim'})-[]->(c) RETURN a, c, conn_path")
# MATCH p = (s {type: 'source'})-[*]-(e {type: 'entities'})
# RETURN p

In [32]:
# Show two-hop paths that start at a node of type 'source' and end at a node of type
# 'entities' (source -> intermediate node -> entity), returning both endpoints with the path.
# Earlier variant that returned only the path:
# g.show_cypher("MATCH p = (start_node {type: 'source'})-[]->(intermediate_node)-[]->(end_node {type: 'entities'}) RETURN p")
g.show_cypher("MATCH p = (start_node {type: 'source'})-[]->(intermediate_node)-[]->(end_node {type: 'entities'}) RETURN start_node, end_node, p")

GraphWidget(layout=Layout(height='800px', width='100%'))