# Development for NeonPandas

In [1]:
import os

import numpy as np
import pandas as pd
import neonpandas as npd

from utils import cypher
from utils import df_tools
from utils.node import Node

## Load Pets Dataset

In [2]:
# Read the sample pets table; `data` is the raw input reused by later cells.
data = pd.read_csv('pets.csv')
data

Unnamed: 0,name,species,color,age,behavior
0,Ralph,Dog,black,10.0,
1,Pip,Cat,yellow,6.0,good
2,Babe,Pig,,3.0,
3,Bubbles,Fish,red,,acceptable
4,Freckles,Horse,brown,,


## NeonPandas NodeFrame
Look behind the scenes (i.e. display `pets` directly) to see the `neo_idx` index that makes this all work.

In [3]:
# Wrap the raw table as a NodeFrame: `name` identifies each node, `species` supplies a
# per-row label, and 'Pet' is added as a label shared by every row (see the labels column below).
pets = npd.NodeFrame(data, id_col='name', lbl_col='species', labels='Pet')
# show() renders the frame without the neo_idx index that the bare repr in the next cell displays.
pets.show()

labels,name,color,age,behavior
"('Pet', 'Dog')",Ralph,black,10.0,
"('Pet', 'Cat')",Pip,yellow,6.0,good
"('Pet', 'Pig')",Babe,,3.0,
"('Pet', 'Fish')",Bubbles,red,,acceptable
"('Pet', 'Horse')",Freckles,brown,,


In [4]:
# Bare repr (unlike show()) includes the neo_idx index.
pets

Unnamed: 0_level_0,labels,name,color,age,behavior
neo_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"(n:Pet:Dog {name: ""Ralph""})","(Pet, Dog)",Ralph,black,10.0,
"(n:Pet:Cat {name: ""Pip""})","(Pet, Cat)",Pip,yellow,6.0,good
"(n:Pet:Pig {name: ""Babe""})","(Pet, Pig)",Babe,,3.0,
"(n:Pet:Fish {name: ""Bubbles""})","(Pet, Fish)",Bubbles,red,,acceptable
"(n:Pet:Horse {name: ""Freckles""})","(Pet, Horse)",Freckles,brown,,


## Set up Graph

In [5]:
# Connection settings can be overridden through the environment (NEO4J_URI, NEO4J_USER,
# NEO4J_PASSWORD) so that real credentials do not have to be typed into the notebook.
# The fallbacks are the local development values this notebook originally hard-coded.
graph = npd.Graph(
    uri=os.environ.get('NEO4J_URI', 'bolt://localhost:7687'),
    auth=(os.environ.get('NEO4J_USER', 'neo4j'),
          os.environ.get('NEO4J_PASSWORD', 'neonpandas')),
)

### Create Nodes

In [None]:
# Send the NodeFrame to Neo4j.
# NOTE(review): this cell is unexecuted in the saved notebook (In [None]); confirm whether
# re-running it against a populated database duplicates nodes.
graph.create_nodes(pets)

## NeonPandas EdgeFrame

In [6]:
# Sample edge list: one dict per relationship (source, type, destination, plus a property).
# 'Barney' in the last row is not in the pets table, so that edge ends on an unknown node.
edges = pd.DataFrame([
    {'src': 'Ralph', 'rel_type': 'FRIENDLY_WITH', 'dest': 'Bubbles', 'reporter': 'Jenny'},
    {'src': 'Pip', 'rel_type': 'MEAN_TO', 'dest': 'Babe', 'reporter': 'Frank'},
    {'src': 'Ralph', 'rel_type': 'MEAN_TO', 'dest': 'Freckles', 'reporter': 'Frank'},
    {'src': 'Freckles', 'rel_type': 'FRIENDLY_WITH', 'dest': 'Babe', 'reporter': 'Tommy'},
    {'src': 'Pip', 'rel_type': 'JUST_MET', 'dest': 'Barney', 'reporter': 'Jenny'}
])
edges

Unnamed: 0,src,rel_type,dest,reporter
0,Ralph,FRIENDLY_WITH,Bubbles,Jenny
1,Pip,MEAN_TO,Babe,Frank
2,Ralph,MEAN_TO,Freckles,Frank
3,Freckles,FRIENDLY_WITH,Babe,Tommy
4,Pip,JUST_MET,Barney,Jenny


#### Requirements for EdgeFrame
- Designate source, relationship-type, and destination columns (e.g. `src`, `rel_type`, `dest`)
- Ability to join **src** and **dest** cols with NodeFrame to get ID info (e.g. labels, id_props, etc.)
- `create_edges()` method with option to set **src** and **dest** (individually) to `MATCH` or `MERGE`
- Assumes all columns in EdgeFrame beyond **src**, **dest**, and **rel_type** are edge properties

In [7]:
# Wrap the edge list as an EdgeFrame, taking the relationship type from the 'rel_type' column.
# NOTE(review): this rebinds `edges`, so re-running the cell feeds an EdgeFrame back into EdgeFrame.
edges = npd.EdgeFrame(edges, rel_col='rel_type')
# Alternative kept for reference: a single constant relationship type for every row.
#edges = npd.EdgeFrame(edges[['src', 'dest']], rel_type='SHARES_OWNER')
edges

Unnamed: 0,rel_type,src,dest,reporter
0,FRIENDLY_WITH,Ralph,Bubbles,Jenny
1,MEAN_TO,Pip,Babe,Frank
2,MEAN_TO,Ralph,Freckles,Frank
3,FRIENDLY_WITH,Freckles,Babe,Tommy
4,JUST_MET,Pip,Barney,Jenny


In [15]:
def enrich_edgeframe(edges, nodes, default=None):
    """Replace the src/dest values of an EdgeFrame with node indexes built from a NodeFrame.

    Each endpoint column of ``edges`` is left-joined against ``nodes`` on the
    node id column to look up that node's labels. Endpoints that are missing
    from ``nodes`` receive the fallback label(s) from ``default``. The endpoint
    value and its labels are then combined by ``df_tools._generate_node_idx``
    (rendered in the notebook like ``(s:Pet:Dog {name: "Ralph"})``).

    Parameters
    ----------
    edges : EdgeFrame
        Must expose ``src_col`` and ``dest_col`` naming its endpoint columns.
    nodes : NodeFrame
        Must expose ``id_col`` and contain a ``labels`` column.
    default : tuple, str or None
        Fallback label(s) for endpoints not found in ``nodes``; see
        ``_parse_default_param`` for the accepted forms.

    Returns
    -------
    The merged frame, with the src/dest columns replaced and no helper columns left.

    Raises
    ------
    ValueError
        Propagated from ``handle_nan_nodes`` when an endpoint is missing from
        ``nodes`` and no fallback label was supplied for that side.
    """
    src_default, dest_default = _parse_default_param(default)
    src_lbls = '{}_labels'.format(edges.src_col)
    dest_lbls = '{}_labels'.format(edges.dest_col)
    # Private names for the node-id column brought in by each join. Giving each
    # join its own key means the helper columns can be dropped by exact name
    # rather than by a substring regex, which would also delete user columns
    # that merely contain '_src', '_dest' or '_labels'.
    src_key = '__src_node_id__'
    dest_key = '__dest_node_id__'
    node_labels = nodes[[nodes.id_col, 'labels']]

    # src join
    enriched = edges.merge(
        node_labels.rename(columns={nodes.id_col: src_key, 'labels': src_lbls}),
        left_on=edges.src_col, right_on=src_key, how='left')
    # dest join
    enriched = enriched.merge(
        node_labels.rename(columns={nodes.id_col: dest_key, 'labels': dest_lbls}),
        left_on=edges.dest_col, right_on=dest_key, how='left')
    # drop the join keys; only the looked-up labels are needed from here on
    enriched.drop(columns=[src_key, dest_key], inplace=True)

    # if some nodes from edgeframe were not matched to nodeframe
    # fill in default labels (src & dest)
    enriched = handle_nan_nodes(enriched, src_lbls, src_default)
    enriched = handle_nan_nodes(enriched, dest_lbls, dest_default)

    # convert src/dest values and their labels to node indexes
    enriched[edges.src_col] = df_tools._generate_node_idx(enriched, key=nodes.id_col,
                                                          value_col=edges.src_col,
                                                          lbls_col=src_lbls, var='s')
    enriched[edges.dest_col] = df_tools._generate_node_idx(enriched, key=nodes.id_col,
                                                           value_col=edges.dest_col,
                                                           lbls_col=dest_lbls, var='d')
    # the label helper columns are now folded into the node indexes
    enriched.drop(columns=[src_lbls, dest_lbls], inplace=True)
    return enriched

def _parse_default_param(default):
    if isinstance(default, tuple):
        src_default, dest_default = default
    elif isinstance(default, str):
        src_default = default
        dest_default = default
    else:
        src_default = None
        dest_default = None
    return src_default, dest_default

def handle_nan_nodes(edges, col, col_default):
    """Fill missing label entries in ``edges[col]`` with a default, or fail loudly.

    Parameters
    ----------
    edges : pandas.DataFrame
        Frame holding the label column; it is modified in place.
    col : str
        Name of the label column to check.
    col_default : str, tuple or None
        Fallback label(s). A falsy value (``None``, ``''``, ``()``) means
        "no default available".

    Returns
    -------
    pandas.DataFrame
        The same ``edges`` object that was passed in.

    Raises
    ------
    ValueError
        If ``col`` contains missing values and no default was supplied.
    """
    missing = edges[col].isnull().to_numpy()
    if not missing.any():
        return edges

    if not col_default:
        error_msg = (
            "{s} column contains {n} nan values after merge with NodeFrame. "
            "This may be because a node in the {s} column is not present in "
            "the NodeFrame. Use the 'default' parameter to provide a "
            "default Node Label for nodes not found in NodeFrame."
        ).format(s=col, n=int(missing.sum()))
        raise ValueError(error_msg)

    fill_value = df_tools.conform_to_tuple(col_default)
    # Rebuild the column element by element: assigning a tuple through
    # ``edges[col][positions] = tuple`` is chained indexing (it may write to a
    # temporary copy), treats positions as index labels, and lets pandas spread
    # the tuple's items across rows instead of storing the tuple in each row.
    filled = [fill_value if is_missing else current
              for current, is_missing in zip(edges[col], missing)]
    edges[col] = pd.Series(filled, index=edges.index, dtype=object)
    return edges

In [16]:
# 'Pet' is the fallback label for src and dest nodes that are missing from `pets`
# (here: 'Barney', which appears below with only the Pet label).
test = enrich_edgeframe(edges, pets, default='Pet')
test

Unnamed: 0,rel_type,src,dest,reporter
0,FRIENDLY_WITH,"(s:Pet:Dog {name: ""Ralph""})","(d:Pet:Fish {name: ""Bubbles""})",Jenny
1,MEAN_TO,"(s:Pet:Cat {name: ""Pip""})","(d:Pet:Pig {name: ""Babe""})",Frank
2,MEAN_TO,"(s:Pet:Dog {name: ""Ralph""})","(d:Pet:Horse {name: ""Freckles""})",Frank
3,FRIENDLY_WITH,"(s:Pet:Horse {name: ""Freckles""})","(d:Pet:Pig {name: ""Babe""})",Tommy
4,JUST_MET,"(s:Pet:Cat {name: ""Pip""})","(d:Pet {name: ""Barney""})",Jenny


In [None]:
## left to do ----
## work out how to merge with multiple NodeFrames (i.e. a list of them)
## this is essential for EdgeFrames that contain edges
## going from one node class to another (e.g. Pets --> Owners)

In [None]:
# Scratch: boolean mask of rows whose dest labels are missing.
# NOTE(review): enrich_edgeframe drops its *_labels helper columns before returning, so
# `test.dest_labels` is not available on the frame built above — presumably this cell
# predates that step; confirm before relying on it.
results = test.dest_labels.isnull().values
results

In [None]:
# Positions of the True entries in `results`.
nan_idx = np.where(results == True)[0]
nan_idx

In [None]:
# Scratch: fill the missing dest labels by position.
# NOTE(review): chained indexing (`frame.col[idx] = ...`) may write to a temporary copy in pandas.
test.dest_labels[nan_idx] = df_tools.conform_to_tuple(('ieurue',))

In [10]:
# Unpacking a 2-tuple into two names works.
a, b = ('dog', 'cat')
a, b

('dog', 'cat')

In [11]:
# A 3-character str cannot be unpacked into two names: this raises the ValueError shown below.
a, b = 'dog'
a, b

ValueError: too many values to unpack (expected 2)

In [None]:
# NOTE(review): relies on `test` exposing a `match` method; unexecuted in the saved notebook —
# confirm the enriched frame actually provides it.
test.match(n_lbls=1)

In [None]:
# Convert the enriched edges for APOC, passing 'rel_type' as the label column, then peek at the
# last entry. Presumably the result is a sequence of per-edge records — confirm in df_tools.
apoc_records = df_tools.prepare_df_for_apoc(test, lbls_col='rel_type')
apoc_records[-1]

In [None]:
# First entry, for comparison with the last one shown above.
apoc_records[0]

In [None]:
## current issues
## how to maintain identity in the nodes list
## with a hidden link between nodes across NodeFrame & EdgeFrame
## how to submit EdgeFrame to CREATE RELATIONSHIP query w/ variable Node-Labels

In [None]:
# Cypher template: for each entry of $edges, match both endpoints by name, merge one
# relationship of type edge.rel_type between them, and return the relationship count.
# NOTE(review): both MATCH clauses hard-code the :Pet label, and the query references only
# edge.src, edge.dest and edge.rel_type — the two `{}` maps are empty, so other edge columns
# (e.g. `reporter`) are not used.
query = """UNWIND $edges AS edge
        MATCH (s:Pet {name: edge.src})
        MATCH (d:Pet {name: edge.dest})
        WITH s,d,edge
        CALL apoc.merge.relationship(s, edge.rel_type, {}, {}, d) YIELD rel
        RETURN COUNT(rel)"""

In [None]:
# Run the template against the graph.
# NOTE(review): presumably `edges` is bound to the query's $edges parameter — confirm in
# Graph.create_relationships.
graph.create_relationships(edges, query)

### Node Joining
Perform _join_ operations with an input DataFrame against nodes in Neo4j.

In [None]:
new_pets = pd.DataFrame([
    {'name': 'Betsy', 'age': 2, 'species': 'Cow'},
    {'name': 'Carrie', 'species': 'Rabbit'}
])
all_pets = pd.concat([data, new_pets], sort=False).reset_index(drop=True)

In [None]:
all_pets = npd.NodeFrame(all_pets, id_col='name', lbl_col='species', labels={'Pet'})
all_pets

### Semi-Join
Check which nodes in DataFrame exist in Neo4j.

In [None]:
# Compare on the `name` property, restricted to the 'Pet' label.
graph.semi_join(all_pets, on='name', labels='Pet')

### Anti-Join
Check which nodes in DataFrame do not exist in Neo4j.

In [None]:
# Same comparison key and label as the semi-join above; the markdown describes the inverse result.
graph.anti_join(all_pets, on='name', labels='Pet')

## Match Nodes
Search for nodes via `MATCH` statement

In [None]:
# Search by label only, asking for at most three results.
graph.match_nodes(labels={'Pet'}, limit=3)

In [None]:
## narrow the search with property filters (here: name == 'Ralph')
graph.match_nodes(labels={'Pet'}, properties={'name': 'Ralph'})

## DataFrame Object

In [None]:
class Test(pd.DataFrame):
    """Experimental DataFrame subclass that carries a leading ``labels`` column.

    Parameters
    ----------
    data
        Anything accepted by the ``pandas.DataFrame`` constructor.
    column : str, optional
        Name of an existing column whose values become per-row labels. The
        column is removed once its values have been moved into ``labels``.
    labels : optional
        Label(s) applied to every row, normalised with
        ``df_tools.conform_to_set``.

    At least one of ``column`` / ``labels`` is required; when both are given,
    each row's labels are the union of its column value and ``labels``.
    """

    def __init__(self, data, column:str=None, labels=None):
        super().__init__(data)
        self.whatami = "I am a NeonPandas DataFrame"
        self._set_labels(column=column, labels=labels)

    def _set_labels(self, column:str=None, labels:set=None) -> None:
        """Build the ``labels`` column and insert it as the first column.

        Raises
        ------
        ValueError
            If neither ``column`` nor ``labels`` is provided.
        """
        if column is None and labels is None:
            raise ValueError("Must provide either 'labels' or 'column' as input for attribute type.")

        if column is not None and labels is None:
            assert column in self.columns
            _lbls = self[column].apply(lambda x: df_tools.conform_to_set(x))
        elif column is not None:
            # normalise the shared labels once instead of once per row
            shared = df_tools.conform_to_set(labels)
            _lbls = self[column].apply(lambda x: {x}.union(shared))
        else:
            shared = df_tools.conform_to_set(labels)
            # one independent set per row, so editing one row's labels
            # cannot silently change every other row
            _lbls = [set(shared) for _ in range(len(self))]

        # the source column is only dropped when one was actually used;
        # with labels alone there is nothing to remove
        if column is not None:
            self.drop(columns=[column], inplace=True)
        # set labels as first column
        self.insert(0, 'labels', _lbls)

In [None]:
# Convert the NodeFrame into records and inspect the first one.
# NOTE(review): presumably a list of per-row dicts — confirm in df_tools.convert_to_records.
pets_data = df_tools.convert_to_records(all_pets)
pets_data[0]

In [None]:
# Build a Test frame from those records, using their existing 'labels' field as the label column.
pets_test = Test(pets_data, column='labels')
pets_test

In [None]:
pets = pd.read_csv('pets.csv')
pets

In [None]:
pets_test = Test(pets, column='species', labels={'Animal', 'Pet'})
pets_test