# Development for NeonPandas

In [2]:
import numpy as np
import pandas as pd 
import neonpandas as npd

from utils import cypher
from utils import df_tools
from utils import node

## Load Pets Dataset

In [3]:
data = pd.read_csv('pets.csv')
data

Unnamed: 0,name,species,color,age,behavior
0,Ralph,Dog,black,10.0,
1,Pip,Cat,yellow,6.0,good
2,Babe,Pig,,3.0,
3,Bubbles,Fish,red,,acceptable
4,Freckles,Horse,brown,,


## NeonPandas NodeFrame
Look behind the scenes (e.g. `pets`) to see the _neo_node_ index that makes this all work.

In [4]:
pets = npd.NodeFrame(data, id_col='name', lbl_col='species', labels='Pet')
pets.show()

labels,name,color,age,behavior
"('Pet', 'Dog')",Ralph,black,10.0,
"('Pet', 'Cat')",Pip,yellow,6.0,good
"('Pet', 'Pig')",Babe,,3.0,
"('Pet', 'Fish')",Bubbles,red,,acceptable
"('Pet', 'Horse')",Freckles,brown,,


In [5]:
pets

Unnamed: 0_level_0,labels,name,color,age,behavior
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(n:Pet:Dog {name: ""Ralph""})","(Pet, Dog)",Ralph,black,10.0,
"(n:Pet:Cat {name: ""Pip""})","(Pet, Cat)",Pip,yellow,6.0,good
"(n:Pet:Pig {name: ""Babe""})","(Pet, Pig)",Babe,,3.0,
"(n:Pet:Fish {name: ""Bubbles""})","(Pet, Fish)",Bubbles,red,,acceptable
"(n:Pet:Horse {name: ""Freckles""})","(Pet, Horse)",Freckles,brown,,


## Set up Graph

In [None]:
graph = npd.Graph(uri='bolt://localhost:7687', auth=('neo4j', 'neonpandas'))

### Creates Nodes

In [None]:
graph.create_nodes(pets)

## NeonPandas EdgeFrame

In [56]:
edges = pd.DataFrame([
    {'src': 'Ralph', 'rel_type': 'FRIENDLY_WITH', 'dest': 'Bubbles', 'reporter': 'Jenny'},
    {'src': 'Pip', 'rel_type': 'MEAN_TO', 'dest': 'Babe', 'reporter': 'Frank'},
    {'src': 'Ralph', 'rel_type': 'MEAN_TO', 'dest': 'Freckles', 'reporter': 'Frank'},
    {'src': 'Freckles', 'rel_type': 'FRIENDLY_WITH', 'dest': 'Babe', 'reporter': 'Tommy'},
    {'src': 'Pip', 'rel_type': 'JUST_MET', 'dest': 'Barney', 'reporter': 'Jenny'}
])
edges

Unnamed: 0,src,rel_type,dest,reporter
0,Ralph,FRIENDLY_WITH,Bubbles,Jenny
1,Pip,MEAN_TO,Babe,Frank
2,Ralph,MEAN_TO,Freckles,Frank
3,Freckles,FRIENDLY_WITH,Babe,Tommy
4,Pip,JUST_MET,Barney,Jenny


#### Requirements for EdgeFrame
- Designate source, relationship-type, and dest columns (i.e. src, dest, rel_type)
- Ability to join **src** and **dest** cols with NodeFrame to get ID info (e.g. labels, id_props, etc.)
- `create_edges()` method with option to set **src** and **dest** (individually) to `MATCH` or `MERGE`
- Assumes all columns in EdgeFrame beyond **src**, **dest**, and **rel-type** are edge properties

In [57]:
edges = npd.EdgeFrame(edges, rel_col='rel_type')
#edges = npd.EdgeFrame(edges[['src', 'dest']], rel_type='SHARES_OWNER')
edges.show()

rel_type,src,dest,reporter
FRIENDLY_WITH,Ralph,Bubbles,Jenny
MEAN_TO,Pip,Babe,Frank
MEAN_TO,Ralph,Freckles,Frank
FRIENDLY_WITH,Freckles,Babe,Tommy
JUST_MET,Pip,Barney,Jenny


In [29]:
a = pets.index.to_series()

In [30]:
b = owners.index.to_series()

In [36]:
c = pd.concat([a, b])

In [38]:
c.to_list()

[(n:Pet:Dog {name: "Ralph"}),
 (n:Pet:Cat {name: "Pip"}),
 (n:Pet:Pig {name: "Babe"}),
 (n:Pet:Fish {name: "Bubbles"}),
 (n:Pet:Horse {name: "Freckles"}),
 (n:Owner {name: "Dan"}),
 (n:Owner {name: "Barbara"}),
 (n:Owner {name: "John"})]

In [55]:
def get_nodeframe_index(nf):
    if isinstance(nf, list):
        return pd.concat([n.index.to_series() for n in nf]).to_list()
    elif isinstance(nf, npd.NodeFrame):
        return nf.index.to_list()
    else:
        raise ValueError("Input not recognized.")

def enrich_edgeframe(edges, nodes, src_default:str=None, dest_default:str=None):
    src_col, dest_col = edges.src_col, edges.dest_col
    # handle nodes
    node_idx = get_nodeframe_index(nodes)
    # convert src and dest cols to Node type
    
    ## ** problem here: how to address .id_col when multiple nodeframes are provided (and even smashed into one)??
    ## maybe will just have to assign nodeframes to src and dest columns
    ## but then perhaps need to re-evaluate entire approach ????
    edges[src_col] = edges[src_col].apply(lambda x: node.Node(src_default, nodes.id_col, x))
    edges[dest_col] = edges[dest_col].apply(lambda x: node.Node(dest_default, nodes.id_col, x))
    
    # match to nodes in NodeFrame
    edges[src_col] = edges[src_col].apply(lambda x: node.find_match(x, node_idx))
    edges[dest_col] = edges[dest_col].apply(lambda x: node.find_match(x, node_idx))
    return edges

In [58]:
test = enrich_edgeframe(edges, pets, src_default='Pet', dest_default='Pet')
test

Unnamed: 0,rel_type,src,dest,reporter
0,FRIENDLY_WITH,"(n:Pet:Dog {name: ""Ralph""})","(n:Pet:Fish {name: ""Bubbles""})",Jenny
1,MEAN_TO,"(n:Pet:Cat {name: ""Pip""})","(n:Pet:Pig {name: ""Babe""})",Frank
2,MEAN_TO,"(n:Pet:Dog {name: ""Ralph""})","(n:Pet:Horse {name: ""Freckles""})",Frank
3,FRIENDLY_WITH,"(n:Pet:Horse {name: ""Freckles""})","(n:Pet:Pig {name: ""Babe""})",Tommy
4,JUST_MET,"(n:Pet:Cat {name: ""Pip""})","(n:Pet {name: ""Barney""})",Jenny


In [59]:
owner_data = [
    {'name': 'Dan', 'mood': 'good'},
    {'name': 'Barbara', 'mood': 'okay'},
    {'name': 'John', 'mood': 'bad'}
]

pet_owner_data = [
    {'src': 'Dan', 'rel_type': 'OWNS', 'dest': 'Freckles'},
    {'src': 'Barbara', 'rel_type': 'OWNS', 'dest': 'Ralph'},
    {'src': 'John', 'rel_type': 'TRAINED', 'dest': 'Babe'},
    {'src': 'Barbara', 'rel_type': 'FEEDS', 'dest': 'Pip'}
]

In [60]:
owners = npd.NodeFrame(owner_data, id_col='name', labels=('Owner'))
owners

Unnamed: 0_level_0,labels,name,mood
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(n:Owner {name: ""Dan""})","(Owner,)",Dan,good
"(n:Owner {name: ""Barbara""})","(Owner,)",Barbara,okay
"(n:Owner {name: ""John""})","(Owner,)",John,bad


In [61]:
pet_owner_edges = npd.EdgeFrame(pet_owner_data, rel_col='rel_type')
pet_owner_edges.show()

rel_type,src,dest
OWNS,Dan,Freckles
OWNS,Barbara,Ralph
TRAINED,John,Babe
FEEDS,Barbara,Pip


In [62]:
test = enrich_edgeframe(pet_owner_edges, nodes=[pets, owners], src_default='Owner', dest_default='Pet')
test

AttributeError: 'list' object has no attribute 'id_col'

## EdgeFrame Joins to NodeFrame
There are 2 proposed approaches:
1. Join via the Node-Index created already in the NodeFrame (via the Node class)
    - Allows for more dynamic/flexible joining (e.g. variable labels)
    - Incorporates node labels into join; not exclusively relying on column names
    - Provides a kind of hidden link between specific nodes across node and edge frames
    

2. Join via designated id columns provided within NodeFrame object(s)
    - More explicit to user; but also requires a little more attention

In [None]:
def enrich_edgeframe(edges, nodes, default=None):
    src_default, dest_default = _parse_default_param(default)
    src_lbls = '{}_labels'.format(edges.src_col)
    dest_lbls = '{}_labels'.format(edges.dest_col)
    # src join
    test = edges.merge(nodes[[nodes.id_col, 'labels']],
                      left_on=edges.src_col, right_on=nodes.id_col,
                      how='left').rename(columns={'labels': src_lbls})
    # dest join
    test = test.merge(nodes[[nodes.id_col, 'labels']],
                     left_on=edges.dest_col, right_on=nodes.id_col,
                     how='left', suffixes=('_src', '_dest')).rename(columns={'labels': dest_lbls})
    # drop extraneous columns
    test.drop(test.filter(regex='(_src|_dest)').columns, axis=1, inplace=True)
    # if some nodes from edgeframe were not matched to nodeframe
    # fill in default labels (src & dest)
    test = handle_nan_nodes(test, src_lbls, src_default)
    test = handle_nan_nodes(test, dest_lbls, dest_default)
    # convert src field and labels to Node object
    test[edges.src_col] = df_tools._generate_node_idx(test, key=nodes.id_col, 
                                                      value_col=edges.src_col, 
                                                      lbls_col=src_lbls, var='s')
    test[edges.dest_col] = df_tools._generate_node_idx(test, key=nodes.id_col, 
                                                       value_col=edges.dest_col, 
                                                       lbls_col=dest_lbls, var='d')
    test.drop(test.filter(regex='_labels').columns, axis=1, inplace=True)
    return test

def _parse_default_param(default):
    if isinstance(default, tuple):
        src_default, dest_default = default
    elif isinstance(default, str):
        src_default = default
        dest_default = default
    else:
        src_default = None
        dest_default = None
    return src_default, dest_default

def handle_nan_nodes(edges, col, col_default):
    _nans = edges[col].isnull().values
    if _nans.any():
        if col_default:
            # get indices of nan values
            nan_idx = np.where(_nans == True)[0]
            edges[col][nan_idx] = df_tools.conform_to_tuple(col_default,)
        else:
            error_msg = """{s} column contains {n} nan values after merge with NodeFrame.
            This may be because a node in the {s} column is not present in
            the NodeFrame. Use the 'src_default' parameter to provide a
            default Node Label for {s} nodes not found in NodeFrame.""".format(s=col, n=_nans.shape[0])
            raise ValueError(error_msg)
    return edges

In [None]:
test = enrich_edgeframe(edges, pets, default='Pet')
test

In [None]:
## left to do ----
## 1. Merge with multiple NodeFrames (ie an array)
##    this is essential for EdgeFrame that contain edges
##    going from one node class to another (e.g. Pets --> Owners)
## 2. [DONE] how to insert EdgeFrame into cypher query
## 3. Revisit joining/merging between EdgeFrame & NodeFrame
##.   consider how to use Node Match feature

### Node Joining
Perform _join_ operations with an input DataFrame against nodes in Neo4j.

In [None]:
new_pets = pd.DataFrame([
    {'name': 'Betsy', 'age': 2, 'species': 'Cow'},
    {'name': 'Carrie', 'species': 'Rabbit'}
])
all_pets = pd.concat([data, new_pets], sort=False).reset_index(drop=True)

In [None]:
all_pets = npd.NodeFrame(all_pets, id_col='name', lbl_col='species', labels={'Pet'})
all_pets

### Semi-Join
Check which nodes in DataFrame exist in Neo4j.

In [None]:
graph.semi_join(all_pets, on='name', labels='Pet')

### Anti-Join
Check which nodes in DataFrame do not exist in Neo4j.

In [None]:
graph.anti_join(all_pets, on='name', labels='Pet')

## Match Nodes
Search for nodes via `MATCH` statement

In [None]:
graph.match_nodes(labels={'Pet'}, limit=3)

In [None]:
## add properties to search
graph.match_nodes(labels={'Pet'}, properties={'name': 'Ralph'})

## Dynamic Relationship Merge Via APOC
Holy cow this works!

In [None]:
test

In [None]:
# try to recreate below loop via pandas dataframe
test['src_lbls'] = test[edges.src_col].apply(lambda x: x.labels.tolist())
test['src_id'] = test[edges.src_col].apply(lambda x: x._get_id())
test['dest_lbls'] = test[edges.dest_col].apply(lambda x: x.labels.tolist())
test['dest_id'] = test[edges.dest_col].apply(lambda x: x._get_id())
test.drop(columns=[edges.src_col, edges.dest_col], inplace=True)
test

In [None]:
properties= test[[col for col in test.columns 
                  if col not in ['rel_type', 'src_lbls', 'dest_lbls', 'src_id', 'dest_id']]]
properties = df_tools.convert_to_records(properties)
test['properties'] = properties
test

In [None]:
apoc_edges = test.to_dict('records')
apoc_edges[0]

In [None]:
query = """UNWIND $edges AS edge
            CALL apoc.merge.node(edge.src_lbls, edge.src_id) YIELD node AS src
            WITH src, edge
            CALL apoc.merge.node(edge.dest_lbls, edge.dest_id) YIELD node AS dest
            WITH src, dest, edge
            CALL apoc.merge.relationship(src, edge.rel_type, edge.properties, {}, dest) YIELD rel
            RETURN COUNT(rel)"""

In [None]:
graph.run(query, {'edges': apoc_edges})