<a href="https://colab.research.google.com/github/danb-neo4j/NODES2022_GraphEDA/blob/main/NODES2022_Load_Graph_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NODES 2022 GraphEDA Workshop: 
## Notebook Two -- Generate and Load Graph Data


# Import Libraries

In [1]:
import configparser
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# show every column/row and full cell contents when a DataFrame is displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Configure seaborn in ONE call. sns.set() re-applies its default style and
# palette, so calling it after set_style()/set_palette() (as before) silently
# discarded the 'colorblind' palette.
sns.set(style='darkgrid', palette='colorblind', rc={'figure.figsize': (11, 6)})

In [2]:
# install or import Neo4j GraphDataScience library
try: 
  from graphdatascience import GraphDataScience
  print('Successfully imported GraphDataScience')
except ModuleNotFoundError:
  !pip install graphdatascience
  from graphdatascience import GraphDataScience
  print('installed and imported GraphDataScience')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting graphdatascience
  Downloading graphdatascience-1.4-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 5.2 MB/s 
Collecting multimethod<2.0,>=1.0
  Downloading multimethod-1.9-py3-none-any.whl (10 kB)
Collecting neo4j<6.0,>=4.4.2
  Downloading neo4j-5.2.0.tar.gz (173 kB)
[K     |████████████████████████████████| 173 kB 42.0 MB/s 
Building wheels for collected packages: neo4j
  Building wheel for neo4j (setup.py) ... [?25l[?25hdone
  Created wheel for neo4j: filename=neo4j-5.2.0-py3-none-any.whl size=248023 sha256=d2268bf64d342bd44d3fbb51ba18ab148e6566815897d9ce97fb1dbed73091ae
  Stored in directory: /root/.cache/pip/wheels/5a/07/16/4d845d69ef310660c14b7148848c95da3ef3950c7b58daec42
Successfully built neo4j
Installing collected packages: neo4j, multimethod, graphdatascience
Successfully installed graphdatascience-1.4 multimethod-1.9 neo4j-5.2.0
instal

# Custom Functions

In [3]:
# function adapted from Neo4j GDS Fraud Demo Notebook (h/t Zach B.)
def read_neo4j_properties(NEO4J_PROPERTIES_FILE: str=None) -> tuple:
  '''Parses Neo4j database or Aura connection details from provided .ini filepath.
  Requirements:
    configparser

  Args:
    NEO4J_PROPERTIES_FILE: path to a .ini file

  Returns:
    A (HOST, USERNAME, PASSWORD) tuple of strings:
      HOST: link to Neo4j or Aura host
      USERNAME: login username
      PASSWORD: login password

  Raises:
    KeyError: the file exists but has no [NEO4J] section or is missing
      one of the HOST / USERNAME / PASSWORD keys.

  Note: The .ini file should use the following syntax
    [NEO4J]
    PASSWORD=<password>
    USERNAME=<username>
    HOST=<host uri>

  If no path is passed, or the path does not exist, the function returns
  (and prints) the defaults:
    HOST = 'neo4j://localhost'
    USERNAME = 'neo4j'
    PASSWORD = 'password'
  '''
  if NEO4J_PROPERTIES_FILE is not None and os.path.exists(NEO4J_PROPERTIES_FILE):
    config = configparser.RawConfigParser()
    config.read(NEO4J_PROPERTIES_FILE)
    neo4j_section = config['NEO4J']
    HOST = neo4j_section['HOST']
    USERNAME = neo4j_section['USERNAME']
    PASSWORD = neo4j_section['PASSWORD']
    # credentials read from a file are deliberately NOT echoed
    print('Using HOST, USERNAME, PASSWORD from .ini file')
    return HOST, USERNAME, PASSWORD

  # fall back to the local-development defaults and show them to the user
  print('Could not find database properties file, using defaults:')
  HOST = 'neo4j://localhost'
  USERNAME = 'neo4j'
  PASSWORD = 'password'
  print(f'HOST: {HOST} \nUSERNAME: {USERNAME} \nPASSWORD: {PASSWORD}')
  return HOST, USERNAME, PASSWORD

In [4]:
def gen_passenger_amenity_edges(df: pd.DataFrame, col: str) -> pd.DataFrame:
  '''Generate an edgelist from passenger to amenity with spend amount

  Only passengers with a strictly positive value in `col` produce an edge.

  Args:
    df: DataFrame containing a 'PassengerId' column and the `col` column
    col: column containing the spend amount; its name is also used as the
      edge target (the amenity id)

  Returns:
    DataFrame with columns ['source', 'target', 'amount'] that keeps the
    index labels of the selected rows of `df`. `df` itself is not modified.
  '''
  # single .loc selection instead of chained df[...][...] indexing, and
  # .assign instead of item assignment, so no SettingWithCopyWarning is raised
  spenders = df.loc[df[col] > 0, ['PassengerId', col]]
  edge_list = spenders.assign(Amenity=col)[['PassengerId', 'Amenity', col]]
  edge_list.columns = ['source', 'target', 'amount']

  return edge_list

In [5]:
def load_passenger_amenity_edges(df: pd.DataFrame) -> None:
  '''Upload (p:Passenger)-[:SPEND]->(a:Amenity) relationships to the database.

  Uses the module-level `gds` client. Each row is matched to an existing
  Passenger and Amenity node by `id`; the relationship is MERGEd and its
  `amount` property is set only when the relationship is newly created.

  Args:
    df: dataframe with 'source' (passenger id), 'target' (amenity id)
      and 'amount' columns
  '''
  spend_query = '''
               UNWIND $rels AS rel 
               MATCH (p:Passenger{id:rel.source})
               MATCH (a:Amenity{id:rel.target})
               MERGE (p)-[r:SPEND]->(a)
               ON CREATE SET 
                  r.amount = toFloat(rel.amount)
              '''
  query_params = {'rels': df.to_dict('records')}
  gds.run_cypher(spend_query, query_params)


In [6]:
def gen_agg_spend_df(df: pd.DataFrame, agg_col: str) -> pd.DataFrame:
  '''generate aggregate amenity spend for a specific column
  inputs:
    df: dataframe containing the agg column and the five amenity spend
      columns (RoomService, FoodCourt, ShoppingMall, Spa, VRDeck)
    agg_col: specific column to aggregate on 
  
  outputs:
    agg_spend_df: dataframe with one row per distinct `agg_col` value and
      the summed spend for each amenity column
  '''
  spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
  agg_spend_df = (
    df[[agg_col] + spend_cols]
    .groupby(agg_col)
    .sum()
    .reset_index()   # turn the group key back into a regular column
  )

  return agg_spend_df


In [7]:
def gen_agg_amenity_edges(df: pd.DataFrame, 
                          aggcol: str, 
                          amenity: str) -> pd.DataFrame:
  '''gen edgelist for aggregate cols to specified amenity 

  inputs:
    df: dataframe holding the aggregate column and the amenity spend column
    aggcol: column to aggregate on (becomes the edge 'source')
    amenity: amenity to generate edges for; the column supplies 'amount'
      and its name is used as the edge 'target'
  
  outputs:
    edgelist: dataframe with columns ['source', 'target', 'amount'], one row
      per distinct (aggcol, amenity value) pair. `df` is not modified.
  '''
  # drop_duplicates()/assign() each return a new frame, so the caller's df
  # is never touched and no inplace mutation is needed
  pairs = df[[aggcol, amenity]].drop_duplicates().assign(target=amenity)
  pairs.columns = ['source', 'amount', 'target']
  edgelist = pairs[['source', 'target', 'amount']]
  
  return edgelist 

# Connect to Colab

In [8]:
# Mount Google Drive at /content/drive so files stored there can be read by this notebook
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Absolute path (Drive is mounted at /content/drive) so this cell can be re-run:
# a relative path fails the second time because the working directory has already moved.
os.chdir('/content/drive/MyDrive/Colab Notebooks/NODES_2022_Workshop/')

# Import Data

In [10]:
# load the pre-processed training data; the path is relative to the current working directory
train_df = pd.read_csv("source_data/spaceshipTitanic_train_processed.csv")

# report (rows, columns) as a quick sanity check on the load
print('train dataframe dimensions:', train_df.shape)

train dataframe dimensions: (8693, 21)


In [11]:
# preview the first five rows of the loaded data
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Passenger_Group,Group_Size,Surname,Cabin_Deck,Cabin_Num,Cabin_Side,Total_Spend
0,0001_01,Europa,False,B0P,TRAPPIST1e,39,False,0,0,0,0,0,Maham Ofracculy,False,1,1,Ofracculy,B,0,P,0
1,0002_01,Earth,False,F0S,TRAPPIST1e,24,False,109,9,25,549,44,Juanna Vines,True,2,1,Vines,F,0,S,736
2,0003_01,Europa,False,A0S,TRAPPIST1e,58,True,43,3576,0,6715,49,Altark Susent,False,3,2,Susent,A,0,S,10383
3,0003_02,Europa,False,A0S,TRAPPIST1e,33,False,0,1283,371,3329,193,Solam Susent,False,3,2,Susent,A,0,S,5176
4,0004_01,Earth,False,F1S,TRAPPIST1e,16,False,303,70,151,565,2,Willy Santantines,True,4,1,Santantines,F,1,S,1091


# Connect to AuraDS
If using a persistent AuraDS instance, it is recommended to store authentication credentials in a separate file and read them into the notebook as variables. This code assumes the notebook is running in Colab and the credentials file is stored in an `auth` directory inside the working directory.

In [12]:
# path (relative to the working directory) of the .ini file holding the connection details
NEO4J_PROPERTIES_FILE = 'auth/nodes_titanic_auth.ini'
# falls back to the local defaults if the file cannot be found
HOST, USERNAME, PASSWORD = read_neo4j_properties(NEO4J_PROPERTIES_FILE=NEO4J_PROPERTIES_FILE)

Using HOST, USERNAME, PASSWORD from .ini file


In [13]:
# connect to the AuraDS instance; `gds` is the client object used by every query below
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), aura_ds=True)

In [14]:
# confirm the connection works by asking the server for its GDS library version
gds.version()

'2.2.2'

## Clear Database Contents

In [15]:
# if necessary, clear out the database
# NOTE: this deletes EVERY node (and, via DETACH, every attached relationship)
# in the connected database -- skip this cell if existing data must be kept
gds.run_cypher('''
                MATCH (n)
                DETACH DELETE n
                ''')

# Create Database Constraints
Create constraints to enforce uniqueness of the `id` property for each node label.

In [16]:
# One uniqueness constraint on `id` per node label.
# Uses the `FOR ... REQUIRE` form: the older `ON ... ASSERT` form is deprecated
# in Neo4j 4.4 and rejected by Neo4j 5.
constraint_labels = ['Passenger', 'Surname', 'Group', 'Cabin',
                     'CabinDeck', 'CabinSide', 'Planet', 'Amenity']
for label in constraint_labels:
  # labels cannot be passed as query parameters, so they are formatted in;
  # the values come only from the fixed list above
  gds.run_cypher(f'CREATE CONSTRAINT IF NOT EXISTS FOR (n:{label}) REQUIRE n.id IS UNIQUE')


In [17]:
# list the constraints now present in the database to confirm they were created
gds.run_cypher('''SHOW CONSTRAINTS''')

Unnamed: 0,id,name,type,entityType,labelsOrTypes,properties,ownedIndexId
0,16,constraint_2173d8e2,UNIQUENESS,NODE,[CabinDeck],[id],15
1,18,constraint_2e968b1d,UNIQUENESS,NODE,[CabinSide],[id],17
2,20,constraint_69922a96,UNIQUENESS,NODE,[Planet],[id],19
3,4,constraint_6f46daf2,UNIQUENESS,NODE,[Passenger],[id],3
4,12,constraint_a154dbc5,UNIQUENESS,NODE,[Group],[id],11
5,22,constraint_a1637b42,UNIQUENESS,NODE,[Amenity],[id],21
6,10,constraint_e622b4ad,UNIQUENESS,NODE,[Surname],[id],9
7,14,constraint_f70bc517,UNIQUENESS,NODE,[Cabin],[id],13


# Load Nodes
Reference: [Create a graph database in Neo4j using Python](https://towardsdatascience.com/create-a-graph-database-in-neo4j-using-python-4172d40f89c4)

## Import Passenger Nodes 

In [18]:
# subset dataframe to just the columns we want 
# (PassengerId becomes the node id; the others become node properties in the import query)
passenger_node_features = ['PassengerId', 'Age', 'CryoSleep', 'Total_Spend', 'Transported']
train_df[passenger_node_features].head()

Unnamed: 0,PassengerId,Age,CryoSleep,Total_Spend,Transported
0,0001_01,39,False,0,False
1,0002_01,24,False,736,True
2,0003_01,58,False,10383,False
3,0003_02,33,False,5176,False
4,0004_01,16,False,1091,True


In [19]:
# preview the list-of-dictionaries ('records') format that is passed to the
# import query as a parameter: one dict per row, keyed by column name
train_df[passenger_node_features].head().to_dict(orient='records')

[{'PassengerId': '0001_01',
  'Age': 39,
  'CryoSleep': False,
  'Total_Spend': 0,
  'Transported': False},
 {'PassengerId': '0002_01',
  'Age': 24,
  'CryoSleep': False,
  'Total_Spend': 736,
  'Transported': True},
 {'PassengerId': '0003_01',
  'Age': 58,
  'CryoSleep': False,
  'Total_Spend': 10383,
  'Transported': False},
 {'PassengerId': '0003_02',
  'Age': 33,
  'CryoSleep': False,
  'Total_Spend': 5176,
  'Transported': False},
 {'PassengerId': '0004_01',
  'Age': 16,
  'CryoSleep': False,
  'Total_Spend': 1091,
  'Transported': True}]

In [20]:
# import dataframe as list of dictionaries 
# MERGE creates one Passenger node per id; the properties are written only
# when the node is first created (ON CREATE), so re-running does not overwrite them
gds.run_cypher('''
               UNWIND $passenger_list AS item
               MERGE (p:Passenger{id: item.PassengerId})
               ON CREATE SET p.age = toFloat(item.Age),
                             p.cryosleep = toBoolean(item.CryoSleep),
                             p.totalspend = toFloat(item.Total_Spend),
                             p.transported = toBoolean(item.Transported)
              ''', {'passenger_list': train_df[passenger_node_features].to_dict('records')})

## Import Planet Nodes

In [21]:
# Home planets followed by destinations, de-duplicated while preserving order.
# A planet that is both a home and a destination would otherwise appear twice
# and make the later `len(planet_list)` node-count check fail.
planet_list = list(dict.fromkeys(
    train_df['HomePlanet'].unique().tolist() + train_df['Destination'].unique().tolist()
))
planet_list

['Europa', 'Earth', 'Mars', 'TRAPPIST1e', 'PSOJ318522', '55Cancrie']

In [22]:
# pass our list of planets generated above; MERGE creates one Planet node per name
gds.run_cypher('''
              UNWIND $planet_list AS item 
              MERGE (pl:Planet{id:item})
              ''', {'planet_list': planet_list}
               )

## Import Surname Nodes

In [23]:
# number of distinct surnames, i.e. how many Surname nodes to expect
print('count of unique surnames:', train_df['Surname'].nunique())

count of unique surnames: 2218


In [24]:
# create one Surname node per distinct surname in the data
gds.run_cypher('''
              UNWIND $surname_list AS item 
              MERGE (s:Surname{id:item})
              ''', {'surname_list': train_df['Surname'].unique().tolist()}
               )

## Import Group Nodes
Passengers in groups of two or more will be considered to be traveling in a group.

In [25]:
# one row per travel group that has at least two members
group_nodes = (
    train_df.loc[train_df['Group_Size'] >= 2, ['Passenger_Group', 'Group_Size']]
    .drop_duplicates()
)
group_nodes.shape

(1412, 2)

In [26]:
# create one Group node per group id; Group_Size is stored (as a float) only on creation
gds.run_cypher('''
              UNWIND $group_list AS item 
              MERGE (g:Group{id:item.Passenger_Group})
              ON CREATE 
                SET g.Group_Size = toFloat(item.Group_Size)
              ''', {'group_list': group_nodes.to_dict('records')}
               )

## Import Cabin Nodes

In [29]:
# create one Cabin node per distinct cabin value
gds.run_cypher('''
              UNWIND $cabin_list AS item 
              MERGE (c:Cabin{id:item})
              ''', {'cabin_list': train_df['Cabin'].unique().tolist()}
               )

## Import Cabin Deck Nodes

In [30]:
# create one CabinDeck node per distinct deck value
gds.run_cypher('''
              UNWIND $deck_list AS item 
              MERGE (cd:CabinDeck{id:item})
              ''', {'deck_list': train_df['Cabin_Deck'].unique().tolist()}
               )

## Import Cabin Side Nodes

In [31]:
# create one CabinSide node per distinct side value
gds.run_cypher('''
              UNWIND $side_list AS item 
              MERGE (cs:CabinSide{id:item})
              ''', {'side_list': train_df['Cabin_Side'].unique().tolist()}
               )

## Import Amenities Nodes

In [32]:
# amenity ids; these are the names of the spend columns in train_df
amenities_nodes = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [33]:
# create one Amenity node per entry in amenities_nodes
gds.run_cypher('''
              UNWIND $amenities_list AS item 
              MERGE (a:Amenity{id:item})
              ''', {'amenities_list': amenities_nodes}
               )

# Load Relationships

In [30]:
# # if necessary, clear out relationships
# gds.run_cypher('''
#                 MATCH ()-[r]->()
#                 DETACH DELETE r
#                 ''')

## Passenger

### Passenger ORIGINATED from Planet 
`(home:Planet)<-[:ORIGINATED]-(p:Passenger)-[:TRAVELING]->(dest:Planet)`

In [34]:
# edge list: one (passenger -> home planet) pair per row
passenger_originates_planet = (
    train_df[['PassengerId', 'HomePlanet']]
    .drop_duplicates()
    .rename(columns={'PassengerId': 'source', 'HomePlanet': 'target'})
)
passenger_originates_planet.shape

(8693, 2)

In [35]:
# preview the source/target edge list before uploading it
passenger_originates_planet.head()

Unnamed: 0,source,target
0,0001_01,Europa
1,0002_01,Earth
2,0003_01,Europa
3,0003_02,Europa
4,0004_01,Earth


In [36]:
# match each passenger and home planet by id, then create the ORIGINATED relationship
gds.run_cypher('''
               UNWIND $rels AS rel 
               MATCH (p:Passenger{id:rel.source})
               MATCH (pl:Planet{id:rel.target})
               MERGE (p)-[:ORIGINATED]->(pl)
              ''', {'rels': passenger_originates_planet.to_dict('records')})

### Passenger TRAVELING to Destination

In [37]:
# edge list: one (passenger -> destination planet) pair per row
passenger_travelingTo_planet = (
    train_df[['PassengerId', 'Destination']]
    .drop_duplicates()
    .rename(columns={'PassengerId': 'source', 'Destination': 'target'})
)
passenger_travelingTo_planet.shape

(8693, 2)

In [38]:
# match each passenger and destination planet by id, then create the TRAVELING relationship
gds.run_cypher('''
               UNWIND $rels AS rel 
               MATCH (p:Passenger{id:rel.source})
               MATCH (pl:Planet{id:rel.target})
               MERGE (p)-[:TRAVELING]->(pl)
              ''', {'rels': passenger_travelingTo_planet.to_dict('records')})

### Passenger HAS Surname
`(p:Passenger)-[:HAS_SURNAME]->(s:Surname)`

In [40]:
# generate the (passenger -> surname) edge list as a dataframe
passenger_has_surname = (
    train_df[['PassengerId', 'Surname']]
    .rename(columns={'PassengerId': 'source', 'Surname': 'target'})
    .drop_duplicates()
)
passenger_has_surname.shape

(8693, 2)

In [41]:
# match each passenger and surname by id, then create the HAS_SURNAME relationship
gds.run_cypher('''
               UNWIND $rels AS rel 
               MATCH (p:Passenger{id:rel.source})
               MATCH (s:Surname{id:rel.target})
               MERGE (p)-[:HAS_SURNAME]->(s)
              ''', {'rels': passenger_has_surname.to_dict('records')})

### Passenger is MEMBER of Group 
*Note: The relationship is only created for passengers traveling as part of groups with two or more members.*

`(p:Passenger)-[:MEMBER]->(g:Group)`

In [42]:
# edge list: (passenger -> group), restricted to groups of two or more members
passenger_isMember_group = (
    train_df.loc[train_df['Group_Size'] >= 2, ['PassengerId', 'Passenger_Group']]
    .rename(columns={'PassengerId': 'source', 'Passenger_Group': 'target'})
    .drop_duplicates()
)
passenger_isMember_group.shape

(3888, 2)

In [43]:
# match each passenger and group by id, then create the MEMBER relationship
gds.run_cypher('''
               UNWIND $rels AS rel 
               MATCH (p:Passenger{id:rel.source})
               MATCH (g:Group{id:rel.target})
               MERGE (p)-[:MEMBER]->(g)
              ''', {'rels': passenger_isMember_group.to_dict('records')})

### Passenger ASSIGNED to Cabin

`(p:Passenger)-[:ASSIGNED]->(c:Cabin)`

In [44]:
# edge list: one (passenger -> cabin) pair per row
passenger_booked_cabin = (
    train_df[['PassengerId', 'Cabin']]
    .rename(columns={'PassengerId': 'source', 'Cabin': 'target'})
    .drop_duplicates()
)
passenger_booked_cabin.shape

(8693, 2)

In [45]:
# match each passenger and cabin by id, then create the ASSIGNED relationship
gds.run_cypher('''
               UNWIND $rels AS rel 
               MATCH (p:Passenger{id:rel.source})
               MATCH (c:Cabin{id:rel.target})
               MERGE (p)-[:ASSIGNED]->(c)
              ''', {'rels': passenger_booked_cabin.to_dict('records')})

### Passenger SPEND on Amenity 
`(p:Passenger)-[:SPEND]->(a:Amenity)`

In [46]:
# generate passenger spend edges: one edge list per amenity column
(passenger_spentOn_RoomService,
 passenger_spentOn_FoodCourt,
 passenger_spentOn_ShoppingMall,
 passenger_spentOn_Spa,
 passenger_spentOn_VRDeck) = (
    gen_passenger_amenity_edges(df=train_df, col=amenity)
    for amenity in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')
)


In [47]:
# preview one of the generated edge lists (source, target, amount)
passenger_spentOn_RoomService.head()

Unnamed: 0,source,target,amount
1,0002_01,RoomService,109
2,0003_01,RoomService,43
4,0004_01,RoomService,303
6,0006_01,RoomService,42
11,0008_03,RoomService,39


In [48]:
# combine edge lists 
# (stacked row-wise; the original row index of each per-amenity frame is kept)
all_passenger_spend = pd.concat([passenger_spentOn_RoomService,
                                 passenger_spentOn_FoodCourt,
                                 passenger_spentOn_ShoppingMall,
                                 passenger_spentOn_Spa,
                                 passenger_spentOn_VRDeck], axis=0)
all_passenger_spend.shape

(15083, 3)

In [49]:
# load passenger spend edges (creates the SPEND relationships with their amount)
load_passenger_amenity_edges(all_passenger_spend)

In [50]:
# confirm uploaded relcount 
# count the SPEND relationships that now exist between Passenger and Amenity nodes
passenger_spend_relCount = gds.run_cypher('''MATCH (p:Passenger)-[r:SPEND]->(a:Amenity)
                                             RETURN COUNT(r) AS relCount''')

# every row of the combined edge list should have produced exactly one relationship
assert all_passenger_spend.shape[0] == passenger_spend_relCount['relCount'][0]

## Cabin

### Cabin ON Deck 
`(c:Cabin)-[:ON_DECK]->(cd:CabinDeck)`

In [51]:
# edge list: one (cabin -> deck) pair per distinct cabin/deck combination
cabin_on_deck = (
    train_df[['Cabin', 'Cabin_Deck']]
    .rename(columns={'Cabin': 'source', 'Cabin_Deck': 'target'})
    .drop_duplicates()
)
cabin_on_deck.shape

(6560, 2)

In [52]:
# match each cabin and deck by id, then create the ON_DECK relationship
gds.run_cypher('''
               UNWIND $rels AS rel 
               MATCH (c:Cabin{id:rel.source})
               MATCH (cd:CabinDeck{id:rel.target})
               MERGE (c)-[:ON_DECK]->(cd)
              ''', {'rels': cabin_on_deck.to_dict('records')})

### Cabin ON Side
`(c:Cabin)-[:ON_SIDE]->(cs:CabinSide)`

In [53]:
# edge list: one (cabin -> side) pair per distinct cabin/side combination
cabin_on_side = (
    train_df[['Cabin', 'Cabin_Side']]
    .rename(columns={'Cabin': 'source', 'Cabin_Side': 'target'})
    .drop_duplicates()
)
cabin_on_side.shape

(6560, 2)

In [54]:
# match each cabin and side by id, then create the ON_SIDE relationship
gds.run_cypher('''
               UNWIND $rels AS rel 
               MATCH (c:Cabin{id:rel.source})
               MATCH (cs:CabinSide{id:rel.target})
               MERGE (c)-[:ON_SIDE]->(cs)
              ''', {'rels': cabin_on_side.to_dict('records')})

## Surname

### Surname IN Group
`(s:Surname)-[:IN_GROUP]->(g:Group)`

In [56]:
# create edge list: (surname -> group) for groups of two or more members
surname_in_group = (
    train_df.loc[train_df['Group_Size'] >= 2, ['Surname', 'Passenger_Group']]
    .drop_duplicates()
    .rename(columns={'Surname': 'source', 'Passenger_Group': 'target'})
)
surname_in_group.shape

(1782, 2)

In [57]:
# upload to cypher 
# match each surname and group by id, then create the IN_GROUP relationship
gds.run_cypher('''
               UNWIND $rels AS rel 
               MATCH (s:Surname{id:rel.source})
               MATCH (g:Group{id:rel.target})
               MERGE (s)-[:IN_GROUP]->(g)
              ''', {'rels': surname_in_group.to_dict('records')})

# Run Tests on Graph Upload
*Could be considered the start of the Graph EDA section.*

In [59]:
# fetch database-wide statistics from APOC; the checks below read the
# 'labels' and 'relTypesCount' columns of this result
graph_summary_df = gds.run_cypher('''CALL apoc.meta.stats()
                                     YIELD *
                                  ''')

# set a named index, rather than the default 0
# (so values are addressed as graph_summary_df[<column>]['values'])
graph_summary_df.index = ['values']


## Confirm Node Counts  

In [60]:
# Expected number of nodes per label, derived from the source data.
expected_node_counts = {
  'Passenger': train_df['PassengerId'].nunique(),
  'Surname': train_df['Surname'].nunique(),
  'Group': train_df[train_df['Group_Size'] >= 2]['Passenger_Group'].nunique(),
  'Cabin': train_df['Cabin'].nunique(),
  'CabinDeck': train_df['Cabin_Deck'].nunique(),
  'CabinSide': train_df['Cabin_Side'].nunique(),
  'Planet': len(planet_list),
  'Amenity': len(amenities_nodes),
}

# label -> node count mapping reported by apoc.meta.stats()
db_label_counts = graph_summary_df['labels']['values']
for label, expected in expected_node_counts.items():
  # a label with no nodes is absent from the mapping; treat it as 0 so the
  # failure message names the label instead of raising a bare KeyError
  actual = db_label_counts.get(label, 0)
  assert actual == expected, f'{label}: expected {expected} nodes, found {actual}'


## Test Selected Relationship Counts

In [61]:
# Expected number of relationships per type, derived from the source data.
passenger_count = train_df['PassengerId'].nunique()
cabin_count = train_df['Cabin'].nunique()
expected_rel_counts = {
  'ORIGINATED': passenger_count,
  'TRAVELING': passenger_count,
  'HAS_SURNAME': passenger_count,
  'MEMBER': train_df[train_df['Group_Size'] >= 2]['PassengerId'].nunique(),
  'ASSIGNED': passenger_count,
  'ON_DECK': cabin_count,
  'ON_SIDE': cabin_count,
}

# relationship type -> count mapping reported by apoc.meta.stats()
db_rel_counts = graph_summary_df['relTypesCount']['values']
for rel_type, expected in expected_rel_counts.items():
  # a type with no relationships is absent from the mapping; treat it as 0 so
  # the failure message names the type instead of raising a bare KeyError
  actual = db_rel_counts.get(rel_type, 0)
  assert actual == expected, f'{rel_type}: expected {expected} relationships, found {actual}'


In [62]:
# confirm spend relcounts: one SPEND relationship per row of the combined edge list
assert graph_summary_df['relTypesCount']['values']['SPEND'] == all_passenger_spend.shape[0]