<a href="https://colab.research.google.com/github/danb-neo4j/patient_journey/blob/main/patientJourney_dataLoad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neo4j GDS Patient Journey Demo: Data Loading
This notebook walks through loading synthetic patient data generated by [Synthea](https://synthea.mitre.org/) into Neo4j. It loads the following data, which is used for the full demo:
* Patients
* Encounters
* Procedures
* Medications (Drugs)

The code below is adapted from the code provided in [Graph Data Processing with Cypher](https://github.com/PacktPublishing/Cypher-Querying). 

*Last updated: 1 April 2023*

# Import Libraries

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

import matplotlib.pyplot as plt 
import seaborn as sns
# Configure seaborn in a single call. sns.set() is an alias of sns.set_theme(),
# and each call re-applies seaborn's default style and palette ('deep') unless
# they are passed explicitly. Setting the palette separately and then calling
# sns.set(rc=...) would therefore silently discard the 'colorblind' palette.
sns.set_theme(style='darkgrid', palette='colorblind', rc={'figure.figsize': (12, 7)})

import os
import configparser
from IPython.display import Image

In [2]:
# install or import Neo4j GraphDataScience library
try: 
  from graphdatascience import GraphDataScience
  print('Successfully imported GraphDataScience')
except ModuleNotFoundError:
  !pip install graphdatascience
  from graphdatascience import GraphDataScience
  print('installed and imported GraphDataScience')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting graphdatascience
  Downloading graphdatascience-1.6-py3-none-any.whl (918 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m919.0/919.0 KB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting neo4j<6.0,>=4.4.2
  Downloading neo4j-5.7.0.tar.gz (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.3/176.3 KB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: neo4j
  Building wheel for neo4j (pyproject.toml) ... [?25l[?25hdone
  Created wheel for neo4j: filename=neo4j-5.7.0-py3-none-any.whl size=244275 sha256=5faaacc69f0f76f4cc50c798913d087c86e0019c6c6bb13c8eb564e6edac7509
  Store

# Mount Google Drive

In [3]:
# Colab-only: mount Google Drive at /content/drive so that the project files
# stored on Drive can be read like local files.
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


# Connect to Neo4j Database

In [4]:
# Change into the project directory on the mounted Drive.
# The path is absolute (Drive is mounted at /content/drive) so this cell can be
# re-run safely; a relative path only resolves the first time, before the
# working directory has moved.
os.chdir('/content/drive/MyDrive/Colab Notebooks/gds_demos/patient_journey/')

In [5]:
# set data directory: folder (relative to the current working directory)
# that holds the CSV files read by the load cells
DATA_DIRECTORY = 'data/'

In [6]:
# import utility function to connect to Neo4j DB
from neoUtils import read_neo4j_properties

In [7]:
# read in Neo4j host and authentication
# read_neo4j_properties is imported from the project-local neoUtils module; it is
# given the path to an .ini file and its result is unpacked into HOST, USERNAME
# and PASSWORD, so no credentials are written in the notebook itself.
NEO4J_PROPERTIES_FILE = 'auth/patientJourney_auth.ini'
HOST, USERNAME, PASSWORD = read_neo4j_properties(NEO4J_PROPERTIES_FILE=NEO4J_PROPERTIES_FILE)

Using HOST, USERNAME, PASSWORD from .ini file


In [8]:
# connect to and instantiate GDS
# `gds` is the client object every later cell uses to run Cypher.
# NOTE(review): aura_ds=True presumably marks the target as a Neo4j AuraDS
# instance -- confirm against the graphdatascience client documentation.
gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), aura_ds=True)

# confirm connection with gds version (this call requires a working connection)
print('Neo4j GDS Version:', gds.version())

Neo4j GDS Version: 2.3.2+14


In [9]:
# # code to delete database, if necessary
# gds.run_cypher('''
# MATCH (n) 
# CALL { WITH n 
# DETACH DELETE n 
# } IN TRANSACTIONS OF 50000 ROWS
# ''')

# Load Patient Data

In [10]:
# Read the patient export. This is the 'updated' version of the file,
# which includes an additional 'AGE' feature.
patient_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'patients_updated.csv'))
patient_df.shape

(5885, 29)

## Patient Constraints

In [11]:
# source file patient_indexes.cql
# Uniqueness constraints on the two keys the patient load MERGEs on:
# Patient.id and Race.type. IF NOT EXISTS makes the cell safe to re-run.
gds.run_cypher('''CREATE CONSTRAINT patient_id IF NOT EXISTS FOR (n:Patient) REQUIRE n.id IS UNIQUE''')
gds.run_cypher('''CREATE CONSTRAINT race_id IF NOT EXISTS FOR (n:Race) REQUIRE n.type IS UNIQUE''')

## Patient Data and Relationships

In [12]:
# Load patients, committing every 1000 rows:
#  - MERGE one :Patient node per row on its id and (re)set its properties,
#    so re-running the cell updates nodes rather than duplicating them
#  - MERGE a shared :Race node per distinct RACE value and link it via HAS_RACE
#  - the ZipCode section inside the query is commented out and does not run
# The whole DataFrame is sent as a single list-of-dicts query parameter.
# NOTE(review): patient_df is passed without fillna(), unlike the procedures and
# medications frames -- missing CSV values presumably reach Neo4j as NaN; confirm
# this is intended.
gds.run_cypher('''
CYPHER runtime=slotted
UNWIND $patient_data AS row

CALL {
WITH row
MERGE (p:Patient {id: row.Id})
SET
    p.firstName = row.FIRST,
    p.lastName = row.LAST,
    p.suffix = row.SUFFIX,
    p.ssn = row.SSN,
    p.gender = row.GENDER,
    p.birthDate = DATE(row.BIRTHDATE),
    p.age = toFloat(row.AGE),
    p.marital = row.MARITAL,
    p.expenses = row.HEALTHCARE_EXPENSES,
    p.income = row.INCOME

WITH row, p
MERGE (r:Race {type: row.RACE})
MERGE (p)-[:HAS_RACE]->(r)

//WITH row, p
//WHERE row.ZIP IS NOT NULL
//MERGE (z:ZipCode {zip: row.ZIP})
//MERGE (p)-[:HAS_ZIPCODE]->(z)
} IN TRANSACTIONS OF 1000 ROWS
''', {'patient_data': patient_df.to_dict('records')})

# Load Encounters Data

In [13]:
# Read the encounters export and report its (rows, columns) shape.
encounters_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'encounters.csv'))
encounters_df.shape

(497415, 15)

In [14]:
# inspect column dtypes and non-null counts before loading
encounters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497415 entries, 0 to 497414
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Id                   497415 non-null  object 
 1   START                497415 non-null  object 
 2   STOP                 497415 non-null  object 
 3   PATIENT              497415 non-null  object 
 4   ORGANIZATION         497415 non-null  object 
 5   PROVIDER             497415 non-null  object 
 6   PAYER                497415 non-null  object 
 7   ENCOUNTERCLASS       497415 non-null  object 
 8   CODE                 497415 non-null  int64  
 9   DESCRIPTION          497415 non-null  object 
 10  BASE_ENCOUNTER_COST  497415 non-null  float64
 11  TOTAL_CLAIM_COST     497415 non-null  float64
 12  PAYER_COVERAGE       497415 non-null  float64
 13  REASONCODE           218995 non-null  float64
 14  REASONDESCRIPTION    218995 non-null  object 
dtypes: float64(4), in

## Create Encounter Index

In [15]:
# Index on Encounter.id, the property the load queries MERGE encounters on.
# A plain index rather than a uniqueness constraint: the medications load creates
# a second Encounter node with the same id to mark the end of a prescription.
gds.run_cypher('''CREATE INDEX encounter_id IF NOT EXISTS FOR (n:Encounter) ON n.id''')

## Create Encounter Constraints

In [16]:
# Uniqueness constraint on SNOMED_CT.code; the load queries MERGE SNOMED_CT nodes on this property.
gds.run_cypher('''CREATE CONSTRAINT snomed_id IF NOT EXISTS FOR (n:SNOMED_CT) REQUIRE n.code IS UNIQUE''')

## Encounter Data and Relationships

In [17]:
# Load encounters, committing every 1000 rows. For each row:
#  - MERGE one :Encounter on its id and set date, description, isEnd=false and totalCost
#  - set e.end only when STOP is present (FOREACH over a one- or zero-element
#    list is used as a conditional)
#  - when CODE is present, MERGE the :SNOMED_CT node for it and link it via OF_TYPE
#  - set the node's labels to 'Encounter' plus the capitalised ENCOUNTERCLASS
#    (apoc.create.setLabels -- requires the APOC library on the server)
#  - MERGE the :Patient by id and link it via HAS_ENCOUNTER
# The Provider / Organization section inside the query is commented out and does not run.
gds.run_cypher('''
CYPHER runtime=slotted

UNWIND $encounters_data AS row
CALL {
WITH row
MERGE(e:Encounter {id: row.Id})
SET
    e.date=datetime(row.START),
    e.description=row.DESCRIPTION,
    e.isEnd = false,
    e.totalCost = row.TOTAL_CLAIM_COST

FOREACH (ignore in CASE WHEN row.STOP IS NOT NULL AND row.STOP <> '' THEN [1] ELSE [] END |
         SET e.end=datetime(row.STOP)
    )
FOREACH (ignore in CASE WHEN row.CODE IS NOT NULL AND row.CODE <> '' THEN [1] ELSE [] END |
         MERGE(s:SNOMED_CT {code:row.CODE})
         MERGE(e)-[:OF_TYPE]->(s)
    )
WITH row,e
// CALL apoc.create.setLabels( e, [ 'Encounter', row.ENCOUNTERCLASS ] ) YIELD node
CALL apoc.create.setLabels( e, [ 'Encounter', toUpper(left(row.ENCOUNTERCLASS, 1)) + right(row.ENCOUNTERCLASS, size(row.ENCOUNTERCLASS) - 1) ] ) 
YIELD node
    
WITH row,e
MERGE(p:Patient {id: row.PATIENT})
MERGE (p)-[:HAS_ENCOUNTER]->(e)

//WITH row,e
//MERGE (provider:Provider {id:row.PROVIDER})
//MERGE(e)-[:HAS_PROVIDER]->(provider)
//FOREACH (ignore in CASE WHEN row.ORGANIZATION IS NOT
//    NULL AND row.ORGANIZATION <> '' THEN [1] ELSE [] END |
//      MERGE (o:Organization {id: row.ORGANIZATION})
//      MERGE (e)-[:HAS_ORGANIZATION]->(o))
} IN TRANSACTIONS OF 1000 ROWS
''', {'encounters_data': encounters_df.to_dict('records')})

# Load Procedures Data

In [18]:
# Read the procedures export, replacing missing values with empty strings,
# then report its (rows, columns) shape.
procedures_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'procedures.csv')).fillna('')
procedures_df.shape

(785604, 9)

In [19]:
# Load procedures, committing every 1000 rows. For each row:
#  - MATCH the existing :Patient by id (a row whose patient is not in the graph
#    yields no match, so nothing is written for it)
#  - MERGE the :SNOMED_CT node for the procedure code, add the :Procedure label
#    and set its description
#  - MERGE the :Encounter with that id and isEnd=false, setting its date only
#    when this query creates the node
#  - MERGE the HAS_ENCOUNTER and HAS_PROCEDURE relationships
gds.run_cypher('''
CYPHER runtime=slotted

UNWIND $procedures_data AS row
CALL {
WITH row
MATCH (p:Patient {id:row.PATIENT})
MERGE (c:SNOMED_CT {code:row.CODE})
    SET c.description=row.DESCRIPTION, c:Procedure

MERGE (cs:Encounter {id:row.ENCOUNTER, isEnd: false})
  ON CREATE
  SET cs.date=datetime(row.START)

MERGE (p)-[:HAS_ENCOUNTER]->(cs)
MERGE (cs)-[:HAS_PROCEDURE]->(c)
} IN TRANSACTIONS OF 1000 ROWS
''', {'procedures_data': procedures_df.to_dict('records')})

# Load Medications (Drugs) Data

In [20]:
# Read the medications export, replacing missing values with empty strings,
# then report its (rows, columns) shape.
medications_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'medications.csv')).fillna('')
medications_df.shape

(604103, 13)

In [21]:
# Load medications, committing every 1000 rows. For each row:
#  - MERGE the :Patient, the :Drug (by code) and the start :Encounter (isEnd=false),
#    and link them via HAS_DRUG / HAS_ENCOUNTER
#  - when a reason is recorded, MERGE the :SNOMED_CT node for the REASONCODE,
#    label it :Diagnosis and link it via HAS_DIAGNOSIS
#  - when the prescription has a STOP date, CREATE a separate end :Encounter
#    (isEnd=true) linked to the patient, the drug and the start encounter
# Fixes relative to the earlier version of this query:
#  - the diagnosis node is keyed by REASONCODE (not the drug's CODE); toInteger()
#    is applied because pandas reads a column with missing values as float, while
#    the other SNOMED_CT codes in this notebook are loaded as integers
#  - `d` is carried through the WITH clause; without it the final HAS_DRUG
#    CREATE would bind `d` to a brand-new empty node instead of the Drug
# NOTE: the end encounters use CREATE, so re-running this cell adds duplicates of them.
gds.run_cypher('''
CYPHER runtime=slotted

UNWIND $medications_data AS row
CALL {
WITH row
MERGE (p:Patient {id: row.PATIENT})
MERGE (d:Drug {code: row.CODE})
    SET d.description = row.DESCRIPTION
MERGE (ps:Encounter {id: row.ENCOUNTER, isEnd: false})
MERGE (ps)-[:HAS_DRUG]->(d)
MERGE (p)-[:HAS_ENCOUNTER]->(ps)

FOREACH (ignore in CASE WHEN
                        row.REASONCODE IS NOT NULL AND
                        row.REASONCODE <> '' THEN [1] ELSE [] END |
        MERGE (s:SNOMED_CT {code: toInteger(row.REASONCODE)})
        SET s:Diagnosis, s.description = row.REASONDESCRIPTION
        MERGE (ps)-[:HAS_DIAGNOSIS]->(s)
)

WITH row, ps, p, d
  WHERE row.STOP IS NOT NULL and row.STOP <> ''
CREATE (pe:Encounter {id:row.ENCOUNTER, date:datetime(row.STOP)})
SET pe.isEnd=true
CREATE (p)-[:HAS_ENCOUNTER]->(pe)
CREATE (pe)-[:HAS_DRUG]->(d)
CREATE (ps)-[:HAS_END]->(pe)
} IN TRANSACTIONS OF 1000 ROWS
''', {'medications_data': medications_df.to_dict('records')})

In [22]:
# completion marker: when running all cells, this prints once every load cell above has finished
print('data load complete!')

data load complete!
