# Development for NeonPandas

In [1]:
import numpy as np
import pandas as pd 
import neonpandas as npd

from utils import cypher
from utils import df_tools
from utils.node_tools import Node, find_match
from utils import edge_tools

## Load Pets Dataset

In [2]:
# Load the raw pets table from a CSV file in the working directory.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which usually means the
# CSV was written with its index; `index_col=0` may be wanted here — confirm against the file.
data = pd.read_csv('pets.csv')
data

Unnamed: 0,name,species,color,age,behavior
0,Ralph,Dog,black,10.0,
1,Pip,Cat,yellow,6.0,good
2,Babe,Pig,,3.0,
3,Bubbles,Fish,red,,acceptable
4,Freckles,Horse,brown,,


## NeonPandas NodeFrame
Look behind the scenes (e.g. `pets`) to see the _neo_node_ index that makes this all work.

In [3]:
# Wrap the DataFrame in a NodeFrame: `name` is passed as the id column, `species` as the
# per-row label column, and {'Pet'} as a label set supplied for the whole frame.
pets = npd.NodeFrame(data, id_col='name', lbl_col='species', labels={'Pet'})
pets

Unnamed: 0,name,species,color,age,behavior
0,Ralph,Dog,black,10.0,
1,Pip,Cat,yellow,6.0,good
2,Babe,Pig,,3.0,
3,Bubbles,Fish,red,,acceptable
4,Freckles,Horse,brown,,


In [5]:
# Build a `labels` column from the `species` value plus the {'Pet'} label set
# (the output below shows e.g. {Pet, Dog} for Ralph).
pets['labels'] = df_tools._merge_labels(pets, column='species', labels={'Pet'})
pets

Unnamed: 0,name,species,color,age,behavior,labels
0,Ralph,Dog,black,10.0,,"{Pet, Dog}"
1,Pip,Cat,yellow,6.0,good,"{Cat, Pet}"
2,Babe,Pig,,3.0,,"{Pet, Pig}"
3,Bubbles,Fish,red,,acceptable,"{Fish, Pet}"
4,Freckles,Horse,brown,,,"{Horse, Pet}"


In [6]:
# Build one Node per row from that row's label set, the frame's id column name, and the
# row's id value; the output below shows the cypher-style rendering of each Node.
pets['node'] = pets.apply(lambda x: Node(x.labels, pets.id_col, x[pets.id_col]), axis=1)
pets

Unnamed: 0,name,species,color,age,behavior,labels,node
0,Ralph,Dog,black,10.0,,"{Pet, Dog}","(n:Pet:Dog {name: ""Ralph""})"
1,Pip,Cat,yellow,6.0,good,"{Cat, Pet}","(n:Cat:Pet {name: ""Pip""})"
2,Babe,Pig,,3.0,,"{Pet, Pig}","(n:Pet:Pig {name: ""Babe""})"
3,Bubbles,Fish,red,,acceptable,"{Fish, Pet}","(n:Fish:Pet {name: ""Bubbles""})"
4,Freckles,Horse,brown,,,"{Horse, Pet}","(n:Horse:Pet {name: ""Freckles""})"


In [9]:
# `species` is now carried inside `labels`, so drop the redundant column in place.
pets.drop(columns=['species'], inplace=True)
pets

Unnamed: 0,name,color,age,behavior,labels,node
0,Ralph,black,10.0,,"{Pet, Dog}","(n:Pet:Dog {name: ""Ralph""})"
1,Pip,yellow,6.0,good,"{Cat, Pet}","(n:Cat:Pet {name: ""Pip""})"
2,Babe,,3.0,,"{Pet, Pig}","(n:Pet:Pig {name: ""Babe""})"
3,Bubbles,red,,acceptable,"{Fish, Pet}","(n:Fish:Pet {name: ""Bubbles""})"
4,Freckles,brown,,,"{Horse, Pet}","(n:Horse:Pet {name: ""Freckles""})"


In [10]:
# Check that the frame is still a NodeFrame after the column operations above.
type(pets)

neonpandas.NodeFrame

## Set up Graph

In [None]:
# Create the Graph handle for a Bolt endpoint on localhost.
# NOTE(review): credentials are hard-coded here; acceptable for a local dev database only.
graph = npd.Graph(uri='bolt://localhost:7687', auth=('neo4j', 'neonpandas'))

### Creates Nodes

In [None]:
# Send the pets NodeFrame to the graph (presumably one node per row — confirm in npd.Graph).
graph.create_nodes(pets)

## NeonPandas EdgeFrame

In [None]:
# Sample relationships between pets, keyed by pet name. `reporter` is an extra column
# beyond src / rel_type / dest.
edge_data = pd.DataFrame([
    {'src': 'Ralph', 'rel_type': 'FRIENDLY_WITH', 'dest': 'Bubbles', 'reporter': 'Jenny'},
    {'src': 'Pip', 'rel_type': 'MEAN_TO', 'dest': 'Babe', 'reporter': 'Frank'},
    {'src': 'Ralph', 'rel_type': 'MEAN_TO', 'dest': 'Freckles', 'reporter': 'Frank'},
    {'src': 'Freckles', 'rel_type': 'FRIENDLY_WITH', 'dest': 'Babe', 'reporter': 'Tommy'},
    # 'Barney' does not appear in the pets data, so this row exercises the unmatched-node case
    {'src': 'Pip', 'rel_type': 'JUST_MET', 'dest': 'Barney', 'reporter': 'Jenny'}
])
edge_data

### Initialize a Standalone EdgeFrame from Edge Data

In [None]:
# Build an EdgeFrame from the raw edge table; `rel_type` names the relationship-type
# column and {'Pet'} is passed as the label set (read back below via `edges.lbls`).
edges = npd.EdgeFrame(edge_data, rel_col='rel_type', lbls={'Pet'})
edges

In [None]:
# Inspect the label set stored on the EdgeFrame.
edges.lbls

In [None]:
def create_node_column(col, name, lbls:set=None, var:str=None):
    """Convert a column of raw id values into a column of Node objects.

    Parameters
    ----------
    col : column (e.g. a pandas Series) of id values or Node objects
    name : property key passed to each Node as the id property name
    lbls : labels passed to every Node; defaults to a fresh empty set
    var : optional variable name passed through to Node

    Returns
    -------
    `col` unchanged when `edge_tools._already_contains_nodes` reports that it
    already holds nodes, otherwise a new column of Node objects.
    """
    # The previous default `{}` was an empty *dict* shared across calls;
    # build a new empty set per call instead.
    if lbls is None:
        lbls = set()
    if edge_tools._already_contains_nodes(col):
        return col
    return col.apply(lambda x: Node(lbls, name, x, var=var))

In [None]:
# Replace the src/dest columns with Node objects keyed on 'name', using the EdgeFrame's
# own label set and the column names as the Node variable names.
edges['src'] = create_node_column(edges[edges.src_col], 'name', lbls=edges.lbls, var=edges.src_col)
edges['dest'] = create_node_column(edges[edges.dest_col], 'name', lbls=edges.lbls, var=edges.dest_col)
edges

#### Requirements for EdgeFrame
- Designate source, relationship-type, and destination columns (e.g. `src`, `rel_type`, `dest`)
- Ability to join **src** and **dest** cols with NodeFrame to get ID info (e.g. labels, id_props, etc.)
- `create_edges()` method with option to set **src** and **dest** (individually) to `MATCH` or `MERGE`
- Assumes all columns in EdgeFrame beyond **src**, **dest**, and **rel_type** are edge properties

In [None]:
# Left-join the pets' labels onto each edge's source endpoint. The join adds the
# `labels` column and the NodeFrame's id column to the edge table.
edges = edges.merge(pets[['labels', pets.id_col]], 
                    left_on='src', right_on=pets.id_col, 
                    how='left', suffixes=('', '_src'))

# Same join for the destination endpoint. The columns added by the first join now
# collide with the incoming ones, so they are disambiguated with _src / _dest suffixes.
edges = edges.merge(pets[['labels', pets.id_col]], 
                    left_on='dest', right_on=pets.id_col, 
                    how='left', suffixes=('_src', '_dest'))
# Normalize the label column names and drop the duplicated id columns.
# NOTE(review): 'name_src' / 'name_dest' assume pets.id_col == 'name'.
edges.rename(columns={'labels_src': 'src_labels', 'labels_dest': 'dest_labels'}, inplace=True)
edges.drop(columns=['name_src', 'name_dest'], inplace=True)

## TODO: address any NaN values (endpoints missing from `pets` get NaN labels from the left joins)

edges

In [None]:
# Series.fillna() only accepts a scalar, dict, Series or DataFrame as `value`, so a set
# cannot be passed to it. Fill the missing label sets element-wise instead; the set
# literal inside the lambda gives every filled row its own set object.
edges['dest_labels'] = edges['dest_labels'].apply(
    lambda lbls: {'Pet'} if pd.isnull(lbls) else lbls)
edges

In [None]:
# `Node` is imported directly at the top of the notebook (`from utils.node_tools import Node`);
# the module name `node_tools` is never bound, so `node_tools.Node` raised NameError.
#edges['src_node'] = edges.apply(lambda x: Node(x.src_labels, pets.id_col, x.src), axis=1)
## ** this errors while the dest_labels column still contains NaN
edges['dest_node'] = edges.apply(lambda x: Node(x.dest_labels, pets.id_col, x.dest), axis=1)
edges

In [None]:
# Scratch check: round-trip a list through a set (element order is not guaranteed).
list(set(['dog', 'pet']))

In [None]:
# Scratch check: size of a two-element label set.
test = {'pet', 'dog'}
len(test)

In [None]:
# Build a single Node by hand. `Node` is imported directly at the top of the notebook;
# `node_tools` itself is never imported, so `node_tools.Node` raised NameError.
test = Node({'Pet', 'Dog'}, 'name', 'Ralph')

In [None]:
# Display the value built in the previous cell.
test

In [None]:
# Rebuild `edges` from the raw edge data, replacing the manually merged frame from the
# cells above; no label set is passed this time.
edges = npd.EdgeFrame(edge_data, rel_col='rel_type')
#edges = npd.EdgeFrame(edges[['src', 'dest']], rel_type='SHARES_OWNER')
edges.show()

In [None]:
# Join both endpoints of the EdgeFrame against the pets NodeFrame.
# NOTE(review): whether this mutates `edges` in place or returns a new frame is not visible here.
edges.join_to_nodeframe(pets)

In [None]:
# Display `edges` after the join call above.
edges

In [None]:
# Owner nodes, keyed by name.
owner_data = [
    {'name': 'Dan', 'mood': 'good'},
    {'name': 'Barbara', 'mood': 'okay'},
    {'name': 'John', 'mood': 'bad'}
]

# Owner -> pet relationships: `src` values are owner names, `dest` values are pet names,
# so the two endpoints belong to different node classes.
pet_owner_data = [
    {'src': 'Dan', 'rel_type': 'OWNS', 'dest': 'Freckles'},
    {'src': 'Barbara', 'rel_type': 'OWNS', 'dest': 'Ralph'},
    {'src': 'John', 'rel_type': 'TRAINED', 'dest': 'Babe'},
    {'src': 'Barbara', 'rel_type': 'FEEDS', 'dest': 'Pip'}
]

In [None]:
# `('Owner')` is just the string 'Owner' (parentheses alone do not make a tuple).
# Pass a label set, matching the other NodeFrame calls in this notebook.
owners = npd.NodeFrame(owner_data, id_col='name', labels={'Owner'})
owners

In [None]:
# EdgeFrame for the owner -> pet relationships; `rel_type` names the relationship-type column.
pet_owner_edges = npd.EdgeFrame(pet_owner_data, rel_col='rel_type')
pet_owner_edges.show()

In [None]:
# NOTE(review): `what` is not assigned anywhere in the visible notebook, so this cell
# raises NameError on a fresh run. It looks like leftover scratch code — confirm or remove.
what = set(what)
what

In [None]:
# Join each endpoint against its own NodeFrame: sources against owners, destinations against pets.
pet_owner_edges.join_to_nodeframe(src_nodes=owners, dest_nodes=pets)
pet_owner_edges

In [None]:
def _already_contains_nodes(col:pd.Series, num:int=3):
    """Return True when the first `num` values of `col` are all Node instances.

    Only a prefix of the column is sampled, so this is a heuristic rather than
    a full check. An empty column yields True.
    """
    # `Node` is imported directly at the top of the notebook; no module named
    # `node` is ever bound, so the previous `node.Node` raised NameError.
    return all(isinstance(x, Node) for x in col[:num])


def enrich_edgeframe(edges, nodes:npd.NodeFrame=None, src_nodes:npd.NodeFrame=None, dest_nodes:npd.NodeFrame=None,
                    src_labels=None, dest_labels=None):
    """Convert the src/dest columns of an EdgeFrame into Node objects matched to a NodeFrame.

    Parameters
    ----------
    edges : EdgeFrame exposing `src_col` and `dest_col`; modified in place
    nodes : NodeFrame used for both endpoints; takes precedence over
        `src_nodes` / `dest_nodes` when given
    src_nodes, dest_nodes : NodeFrames used for the respective endpoint when
        `nodes` is None
    src_labels, dest_labels : default labels per endpoint (accepted but not
        used yet)

    Returns
    -------
    The same `edges` object with its endpoint columns converted.
    """
    # The loop previously referenced `src_lbls` / `dest_lbls`, which do not exist;
    # the parameters are named `src_labels` / `dest_labels`.
    endpoints = [(edges.src_col, src_nodes, src_labels),
                 (edges.dest_col, dest_nodes, dest_labels)]
    for _col, _nf, _default in endpoints:
        if nodes is None and src_nodes is None and dest_nodes is None:
            # NOTE(review): placeholder nodes that discard the original value;
            # `_default` is presumably meant to feed this branch — TODO confirm.
            edges[_col] = edges[_col].apply(lambda x: Node())
        # a shared NodeFrame wins over the per-endpoint one
        nf = nodes if nodes is not None else _nf
        # convert (src|dest) node column to Node type
        if not _already_contains_nodes(edges[_col]):
            edges[_col] = edges[_col].apply(lambda x: Node(nf.default_lbls, nf.id_col, x))
            # match to respective NodeFrame; build the index Series once, not per row
            node_index = nf.index.to_series()
            edges[_col] = edges[_col].apply(lambda x: find_match(x, node_index))
    return edges

In [None]:
# Enrich the pet-to-pet edges using the pets NodeFrame for both endpoints.
test = enrich_edgeframe(edges, pets)
test

### Node Joining
Perform _join_ operations with an input DataFrame against nodes in Neo4j.

In [None]:
# Two extra pets, with some columns left out; Carrie has no age.
new_pets = pd.DataFrame([
    {'name': 'Betsy', 'age': 2, 'species': 'Cow'},
    {'name': 'Carrie', 'species': 'Rabbit'}
])
# Append them to the original data; columns absent from new_pets become NaN,
# and the index is renumbered from 0.
all_pets = pd.concat([data, new_pets], sort=False).reset_index(drop=True)

In [None]:
# Wrap the combined table in a NodeFrame with the same id / label settings used for `pets`.
all_pets = npd.NodeFrame(all_pets, id_col='name', lbl_col='species', labels={'Pet'})
all_pets

In [None]:
# List the Python type of every src value. Use a separate name: rebinding `test`
# to this list would destroy the enriched frame that the next cell reads via `test.src`.
src_types = [type(n) for n in test.src]
src_types

In [None]:
# Scratch check on the distinct types among the first five src values.
# NOTE(review): `>= 1` holds for any non-empty column; if the intent is "all sampled
# values share one type", this should probably be `== 1` — confirm.
if len(set([type(n) for n in test.src[:5]])) >= 1:
    print('yes')

### Semi-Join
Check which nodes in DataFrame exist in Neo4j.

In [None]:
# Enrich the owner -> pet edges with a separate NodeFrame per endpoint.
test = enrich_edgeframe(pet_owner_edges, src_nodes=owners, dest_nodes=pets)
test

## EdgeFrame Joins to NodeFrame
There are 2 proposed approaches:
1. Join via the Node-Index created already in the NodeFrame (via the Node class)
    - Allows for more dynamic/flexible joining (e.g. variable labels)
    - Incorporates node labels into join; not exclusively relying on column names
    - Provides a kind of hidden link between specific nodes across node and edge frames
    

2. Join via designated id columns provided within NodeFrame object(s)
    - More explicit to user; but also requires a little more attention

In [None]:
## left to do ----
## 1. Merge with multiple NodeFrames (i.e. an array).
##    This is essential for EdgeFrames that contain edges
##    going from one node class to another (e.g. Pets --> Owners).
## 2. [DONE] How to insert an EdgeFrame into a cypher query.
## 3. Revisit joining/merging between EdgeFrame & NodeFrame;
##    consider how to use the Node Match feature.

In [None]:
# Semi-join: per the section text, check which rows of all_pets exist in Neo4j, matching on `name`.
# NOTE(review): `labels` is a plain string here but a set in match_nodes(); confirm both forms are accepted.
graph.semi_join(all_pets, on='name', labels='Pet')

### Anti-Join
Check which nodes in DataFrame do not exist in Neo4j.

In [None]:
# Anti-join: per the section text, check which rows of all_pets do not exist in Neo4j, matching on `name`.
# NOTE(review): `labels` is a plain string here but a set in match_nodes(); confirm both forms are accepted.
graph.anti_join(all_pets, on='name', labels='Pet')

## Match Nodes
Search for nodes via `MATCH` statement

In [None]:
# Match nodes carrying the Pet label, passing a limit of 3.
graph.match_nodes(labels={'Pet'}, limit=3)

In [None]:
## Narrow the match with a property filter in addition to the label
graph.match_nodes(labels={'Pet'}, properties={'name': 'Ralph'})

## Dynamic Relationship Merge Via APOC
Holy cow this works!

In [None]:
# Display the enriched edge frame that the APOC cells below consume.
test

In [None]:
# try to recreate below loop via pandas dataframe
# Flatten each endpoint Node into separate label and id columns, then drop the Node columns.
# NOTE(review): column names come from `edges` while the data is `test`; this only works
#   if both frames use the same src/dest column names.
# NOTE(review): assumes Node.labels exposes `.tolist()` and that `_get_id()` returns the
#   id properties expected by apoc.merge.node — confirm in utils.node_tools.
test['src_lbls'] = test[edges.src_col].apply(lambda x: x.labels.tolist())
test['src_id'] = test[edges.src_col].apply(lambda x: x._get_id())
test['dest_lbls'] = test[edges.dest_col].apply(lambda x: x.labels.tolist())
test['dest_id'] = test[edges.dest_col].apply(lambda x: x._get_id())
test.drop(columns=[edges.src_col, edges.dest_col], inplace=True)
test

In [None]:
# Every column that is not the relationship type or one of the flattened endpoint
# columns is treated as an edge property.
properties= test[[col for col in test.columns 
                  if col not in ['rel_type', 'src_lbls', 'dest_lbls', 'src_id', 'dest_id']]]
# Convert the property columns to per-row records and store them in a single column.
# NOTE(review): the property columns also remain in `test` as separate columns.
properties = df_tools.convert_to_records(properties)
test['properties'] = properties
test

In [None]:
# One dict per edge row, used as the `$edges` query parameter below; show the first one.
apoc_edges = test.to_dict('records')
apoc_edges[0]

In [None]:
# For each edge map in $edges: merge the source node from its labels and id map, merge
# the destination node the same way, then merge a relationship of the row's type between
# them. `edge.properties` is passed as the relationship's identifying properties and `{}`
# as the extra on-create properties. Returns the number of relationships.
query = """UNWIND $edges AS edge
            CALL apoc.merge.node(edge.src_lbls, edge.src_id) YIELD node AS src
            WITH src, edge
            CALL apoc.merge.node(edge.dest_lbls, edge.dest_id) YIELD node AS dest
            WITH src, dest, edge
            CALL apoc.merge.relationship(src, edge.rel_type, edge.properties, {}, dest) YIELD rel
            RETURN COUNT(rel)"""

In [None]:
# Run the query with the edge records bound to the `$edges` parameter.
graph.run(query, {'edges': apoc_edges})

## Re-use this old code: how to handle NaN values with sets

In [None]:
def enrich_edgeframe(edges, nodes, default=None):
    """Join NodeFrame labels onto both endpoints of an EdgeFrame and build Node columns.

    Parameters
    ----------
    edges : EdgeFrame exposing `src_col` and `dest_col`
    nodes : NodeFrame exposing `id_col` and a `labels` column
    default : fallback label(s) for endpoints not found in `nodes`; a
        (src, dest) tuple or a single string applied to both sides

    Returns
    -------
    A new frame whose src/dest columns hold the values produced by
    `df_tools._generate_node_idx`, with the helper label columns removed.

    Raises
    ------
    ValueError
        Propagated from `handle_nan_nodes` when an endpoint has no match in
        `nodes` and no default was supplied for that side.
    """
    src_default, dest_default = _parse_default_param(default)
    src_lbls = f'{edges.src_col}_labels'
    dest_lbls = f'{edges.dest_col}_labels'
    node_lookup = nodes[[nodes.id_col, 'labels']]

    # labels for the source endpoint
    enriched = edges.merge(node_lookup, left_on=edges.src_col,
                           right_on=nodes.id_col, how='left')
    enriched = enriched.rename(columns={'labels': src_lbls})

    # labels for the destination endpoint; the id column brought in by the first
    # join collides here and picks up the _src / _dest suffixes
    enriched = enriched.merge(node_lookup, left_on=edges.dest_col,
                              right_on=nodes.id_col, how='left',
                              suffixes=('_src', '_dest'))
    enriched = enriched.rename(columns={'labels': dest_lbls})

    # remove the suffixed columns left over from the joins
    leftover = enriched.filter(regex='(_src|_dest)').columns
    enriched.drop(leftover, axis=1, inplace=True)

    # endpoints missing from the NodeFrame get the default labels (or raise)
    enriched = handle_nan_nodes(enriched, src_lbls, src_default)
    enriched = handle_nan_nodes(enriched, dest_lbls, dest_default)

    # replace the raw endpoint values with Node objects
    enriched[edges.src_col] = df_tools._generate_node_idx(
        enriched, key=nodes.id_col, value_col=edges.src_col,
        lbls_col=src_lbls, var='s')
    enriched[edges.dest_col] = df_tools._generate_node_idx(
        enriched, key=nodes.id_col, value_col=edges.dest_col,
        lbls_col=dest_lbls, var='d')

    # the label helper columns are no longer needed
    label_cols = enriched.filter(regex='_labels').columns
    enriched.drop(label_cols, axis=1, inplace=True)
    return enriched

def _parse_default_param(default):
    """Split the `default` argument into a (src_default, dest_default) pair.

    A tuple is unpacked into the two sides (a tuple that is not of length two
    raises ValueError from the unpacking), a string is used for both sides,
    and any other value yields (None, None).
    """
    if isinstance(default, tuple):
        src_default, dest_default = default
        return src_default, dest_default
    if isinstance(default, str):
        return default, default
    return None, None

def handle_nan_nodes(edges, col, col_default):
    """Fill missing label values in `edges[col]` with a default, or raise if none is given.

    Parameters
    ----------
    edges : DataFrame-like edge table; modified in place
    col : name of the label column to check
    col_default : default label(s) for rows whose value is null; passed
        through `df_tools.conform_to_tuple` before being written

    Returns
    -------
    The same `edges` object.

    Raises
    ------
    ValueError
        If `col` contains null values and `col_default` is empty or None.
    """
    missing = edges[col].isnull()
    if not missing.any():
        return edges
    if not col_default:
        # report the number of missing values, not the total number of rows
        error_msg = """{s} column contains {n} nan values after merge with NodeFrame.
            This may be because a node in the {s} column is not present in
            the NodeFrame. Use the 'default' parameter to provide a
            default Node Label for {s} nodes not found in NodeFrame.""".format(s=col, n=int(missing.sum()))
        raise ValueError(error_msg)
    fill_value = df_tools.conform_to_tuple(col_default,)
    # Rebuild the column in one assignment. The previous `edges[col][nan_idx] = ...`
    # was a chained assignment that used positional indices as labels and tried to
    # broadcast the fill value across rows.
    filled = [fill_value if is_missing else value
              for value, is_missing in zip(edges[col], missing)]
    edges[col] = pd.Series(filled, index=edges.index, dtype=object)
    return edges