In [85]:
# import the pandas and py2neo libraries
# this demo uses py2neo; the official Neo4j Python driver could be used instead, with some code changes.
from __future__ import print_function, division

import os

import pandas as pd
from py2neo.database import Graph

In [86]:
# initiate graph python connection context
# Connection settings are read from environment variables so real credentials
# never have to be committed with the notebook. When a variable is unset, the
# local demo value that was previously hard-coded is used as the fallback.
graph = Graph(
    os.environ.get('NEO4J_URI', 'bolt://127.0.0.1:7687'),
    auth=(
        os.environ.get('NEO4J_USER', 'community'),
        os.environ.get('NEO4J_PASSWORD', 'neo4j'),
    ),
    name=os.environ.get('NEO4J_DATABASE', 'neo4j'),
)

In [89]:
# Enforce uniqueness of col_id on :data nodes before any data is merged.
# IF NOT EXISTS lets this cell be re-run without failing on an existing constraint.
# NOTE(review): ON ... ASSERT is the older constraint syntax; newer Neo4j releases
# spell this FOR ... REQUIRE -- confirm against the server version in use.
create_constraint = graph.run(
    "CREATE CONSTRAINT data_col_id IF NOT EXISTS ON (col:data) ASSERT col.col_id IS UNIQUE"
)

In [90]:
# Load demo.csv (resolved relative to the notebook's working directory)
# into the main dataframe, main_df.
main_df = pd.read_csv(filepath_or_buffer="demo.csv")

In [91]:
# Inspect the raw frame as loaded from the CSV.
# col1-col3 show up as floats because they contain missing values (NaN);
# they are forward-filled and converted to int in a later cell.
main_df

Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,2.0,3.0,4,7
1,,,,5,8
2,,,,6,9
3,10.0,11.0,12.0,13,14
4,,,,15,16
5,,,,17,18


In [92]:
# Forward-fill: every NaN takes the value from the nearest non-missing row above it.
# DataFrame.ffill() is used instead of fillna(method='ffill'), which newer pandas deprecates.
# Documentation link -> https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ffill.html
# After filling, every column is converted from float to int.
# NOTE: a NaN in the very first row has nothing above it to copy, so it stays NaN
# and astype(int) will raise -- the first row of the CSV must be fully populated.
main_df = main_df.ffill().astype(int)

In [93]:
# Verify the dataframe dtypes: after the cast every column should report an
# integer dtype with no missing entries.
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   col1    6 non-null      int32
 1   col2    6 non-null      int32
 2   col3    6 non-null      int32
 3   col4    6 non-null      int32
 4   col5    6 non-null      int32
dtypes: int32(5)
memory usage: 248.0 bytes


In [94]:
# Inspect the cleaned main dataframe: the blanks in col1-col3 now repeat the
# value from the row above, and all values are integers.
main_df

Unnamed: 0,col1,col2,col3,col4,col5
0,1,2,3,4,7
1,1,2,3,5,8
2,1,2,3,6,9
3,10,11,12,13,14
4,10,11,12,15,16
5,10,11,12,17,18


In [103]:
# main_df could be loaded into neo4j as-is, but to keep the footprint small
# only the first three columns are kept and reduced to their distinct rows.
# Fewer rows means fewer MERGE round-trips, since the same root chain is not
# re-merged for every leaf row. This frame serves as the root of the data.
unique_root_node_df = (
    main_df.loc[:, ['col1', 'col2', 'col3']]
    .drop_duplicates()
)

In [104]:
# Inspect the root frame: one row per distinct (col1, col2, col3) combination.
unique_root_node_df

Unnamed: 0,col1,col2,col3
0,1,2,3
3,10,11,12


In [105]:
# Load the root rows into neo4j.
# For each row, MERGE the three :data nodes keyed by col_id and chain them
# col1 -> col2 -> col3 with LINKS relationships. The Cypher text is identical
# for every row, so it is defined once; only the parameters change.
create_root_nodes="merge (c1:data{col_id:toInteger($col1)}) merge (c2:data{col_id:toInteger($col2)}) merge (c3:data{col_id:toInteger($col3)}) merge (c1)-[:LINKS]->(c2) merge (c2)-[:LINKS]->(c3)         "
for root_row in unique_root_node_df.itertuples(index=False):
    exec_create_root_nodes = graph.run(
        create_root_nodes,
        col1=int(root_row.col1),
        col2=int(root_row.col2),
        col3=int(root_row.col3),
    )

In [106]:
# col4 and col5 are connected to each other, so they get their own
# two-column frame to keep the footprint small and the load loop simple.
# These nodes are the leaves of the chain, hence the name leaf_df.
leaf_df = main_df.loc[:, ['col4', 'col5']]

In [98]:
# Inspect the leaf dataframe (col4/col5 pairs, one per source row).
leaf_df

Unnamed: 0,col4,col5
0,4,7
1,5,8
2,6,9
3,13,14
4,15,16
5,17,18


In [99]:
# Load the leaf rows into neo4j.
# For each row, MERGE the col4 and col5 :data nodes keyed by col_id and
# connect them col4 -> col5 with a LINKS relationship.
create_leaf_nodes="merge (c4:data{col_id:toInteger($col4)}) merge (c5:data{col_id:toInteger($col5)}) merge (c4)-[:LINKS]->(c5)  "
for leaf_row in leaf_df.itertuples(index=False):
    exec_create_leaf_nodes = graph.run(
        create_leaf_nodes,
        col4=int(leaf_row.col4),
        col5=int(leaf_row.col5),
    )

In [100]:
# The last step is to connect col3 to col4.
# As before, a two-column frame (col3, col4) keeps the footprint small.
link_df = main_df.loc[:, ['col3', 'col4']]

In [101]:
# Inspect the link dataframe (col3/col4 pairs, one per source row).
link_df

Unnamed: 0,col3,col4
0,3,4
1,3,5
2,3,6
3,12,13
4,12,15
5,12,17


In [102]:
# Finally, create the col3 -> col4 LINKS relationships.
# The query uses MATCH for both endpoints, so it only connects nodes that
# already exist in the graph; the root and leaf load cells must run first.
create_link_nodes="match (c3:data{col_id:$col3}) match (c4:data{col_id:$col4}) WITH c3,c4 merge (c3)-[:LINKS]->(c4)"
for link_row in link_df.itertuples(index=False):
    exec_create_link_nodes = graph.run(
        create_link_nodes,
        col3=int(link_row.col3),
        col4=int(link_row.col4),
    )

In [None]:
# note: splitting the dataframe is optional.
# the data can also be loaded into neo4j without splitting it first.