<a href="https://colab.research.google.com/github/danb-neo4j/patient_journey/blob/main/patientJourney_tabularEDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Patient Journey Demo: Tabular EDA

# Import Libraries

In [1]:
# Standard library
import configparser
import os

# Third-party
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import Image

# Show every column, every row and the full cell contents when a frame is displayed.
# Caution: with max_rows unset, displaying a large frame directly prints all of its rows,
# so keep using .head()/.info()/.shape on the big tables.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Configure seaborn in a single call. Calling sns.set(rc=...) after set_style/set_palette
# re-applies the theme defaults (palette="deep"), which silently discarded the
# colorblind palette; passing everything to set_theme at once keeps all three settings.
sns.set_theme(style='darkgrid', palette='colorblind', rc={'figure.figsize': (12, 7)})

In [2]:
# NOTE: this whole cell is commented out, so nothing is installed or imported here.
# If it is re-enabled in Colab, prefer `%pip install graphdatascience==<version>` over `!pip`
# so the package is installed into the running kernel's environment with a pinned version.
# # install or import Neo4j GraphDataScience library
# try: 
#   from graphdatascience import GraphDataScience
#   print('Successfully imported GraphDataScience')
# except ModuleNotFoundError:
#   !pip install graphdatascience
#   from graphdatascience import GraphDataScience
#   print('installed and imported GraphDataScience')

# Mount Google Drive

In [3]:
# Mount the user's Google Drive at /content/drive so the project files stored there are reachable.
# google.colab is only importable inside a Colab runtime, so this cell is Colab-specific.
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


# Connect to Neo4j Database

In [4]:
# Move into the project folder on the mounted Drive so relative paths such as 'data/' resolve.
# The path is absolute (Drive is mounted at /content/drive): a relative path would only
# resolve while the working directory is still /content and would raise FileNotFoundError
# if this cell were run a second time. With an absolute path, re-running is a no-op.
os.chdir('/content/drive/MyDrive/Colab Notebooks/gds_demos/patient_journey/')

In [5]:
# NOTE: disabled cell — the neoUtils helper is not imported, so read_neo4j_properties is undefined.
# # import utility function to connect to Neo4j DB
# from neoUtils import read_neo4j_properties

In [6]:
# NOTE: disabled cell — HOST, USERNAME and PASSWORD are never defined in this notebook run.
# When enabled, credentials are read from an .ini file rather than hard-coded in the notebook.
# # read in Neo4j host and authentication 
# NEO4J_PROPERTIES_FILE = 'auth/patientJourney_auth.ini'
# # NEO4J_PROPERTIES_FILE = 'patientJourney_auth.ini'
# HOST, USERNAME, PASSWORD = read_neo4j_properties(NEO4J_PROPERTIES_FILE=NEO4J_PROPERTIES_FILE)

In [7]:
# NOTE: disabled cell — no `gds` client is created, so the rest of the notebook works on CSVs only.
# Re-enabling it also requires the GraphDataScience import and the auth cell above it.
# # connect to and instantiate GDS
# gds = GraphDataScience(HOST, auth=(USERNAME, PASSWORD), aura_ds=True)
# # gds.set_database('neo4j')
# # confirm connection with gds version 
# print('Neo4j GDS Version:', gds.version())

# Patient Data

In [8]:
# Folder containing the CSV extracts loaded below. The path is relative, so it is resolved
# against the current working directory (note the trailing slash).
DATA_DIRECTORY = 'data/'

## Explore Full Patient Data

In [9]:
# Read the patient table from the data folder and report its (rows, columns) size.
# NOTE(review): the saved preview shows ZIP without a leading zero (e.g. 2720) and 0 for
# some rows — presumably ZIP is being inferred as an integer; consider dtype={'ZIP': str}
# if ZIP is later used as an identifier. Confirm against downstream cells first.
patient_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'patients.csv'))
patient_df.shape

(5885, 27)

In [10]:
# Print the schema summary (dtype and non-null count per column) to spot sparsely populated fields.
patient_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5885 entries, 0 to 5884
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   5885 non-null   object 
 1   BIRTHDATE            5885 non-null   object 
 2   DEATHDATE            885 non-null    object 
 3   SSN                  5885 non-null   object 
 4   DRIVERS              4937 non-null   object 
 5   PASSPORT             4641 non-null   object 
 6   PREFIX               4792 non-null   object 
 7   FIRST                5885 non-null   object 
 8   LAST                 5885 non-null   object 
 9   SUFFIX               76 non-null     object 
 10  MAIDEN               1572 non-null   object 
 11  MARITAL              4066 non-null   object 
 12  RACE                 5885 non-null   object 
 13  ETHNICITY            5885 non-null   object 
 14  GENDER               5885 non-null   object 
 15  BIRTHPLACE           5885 non-null   o

In [11]:
# Preview the first five patient rows to sanity-check how the values were parsed.
# NOTE(review): this output includes SSN, driver's licence, passport and address columns.
# The values look synthetic (names carry numeric suffixes) — confirm before sharing outputs.
patient_df.head()

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,MAIDEN,MARITAL,RACE,ETHNICITY,GENDER,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
0,75bef938-819a-ce94-8dc9-17dc50077a86,1991-11-22,,999-68-3502,S99951806,X84544248X,Mrs.,Joi660,Barrows492,,Schinner682,M,white,nonhispanic,F,Methuen Massachusetts US,811 Little Orchard Suite 57,Fall River,Massachusetts,Bristol County,25005.0,2720,41.724408,-71.203884,36108.19,219006.29,144427
1,faf277d8-c5c7-406c-f139-9143c35ff4e8,1987-09-28,,999-51-6719,S99958687,X29099987X,Mr.,Billy698,McLaughlin530,,,M,white,nonhispanic,M,Boston Massachusetts US,871 Klein Stravenue Suite 70,North Lakeville,Massachusetts,Plymouth County,,0,41.86911,-70.904529,6685.55,68732.89,10147
2,4f01341c-4946-31cd-9f2e-7844867f1606,1979-07-30,,999-82-1028,S99967942,X56759551X,Mr.,Dominic463,Ward668,,,M,white,nonhispanic,M,Agawam Massachusetts US,209 Cassin Brook Apt 47,Tyngsborough,Massachusetts,Middlesex County,,0,42.639541,-71.417188,37041.63,6793.2,109151
3,63cd703f-3077-613f-adf0-85fc32f3ae2a,1971-01-22,,999-72-1131,S99976682,X71554510X,Mr.,Ethan766,Nolan344,,,M,white,nonhispanic,M,New Bedford Massachusetts US,231 Champlin Mill,Westfield,Massachusetts,Hampden County,25013.0,1086,42.10606,-72.722163,16736.57,216336.28,78360
4,61b942b2-8324-0208-7e2f-5309af2be4f4,1968-04-06,,999-12-8574,S99954876,X22547263X,Mr.,Arron144,Bashirian201,,,D,asian,nonhispanic,M,Easthampton Massachusetts US,422 Russel Branch Suite 52,Boston,Massachusetts,Suffolk County,25025.0,2210,42.38592,-71.056706,57242.58,195281.25,416323


# Encounters Data

## Explore Full Encounters Data

In [12]:
# Read the encounters table from the data folder and report its (rows, columns) size.
encounters_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'encounters.csv'))
encounters_df.shape

(497415, 15)

In [13]:
# Print the schema summary (dtype and non-null count per column) for the encounters table.
encounters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497415 entries, 0 to 497414
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Id                   497415 non-null  object 
 1   START                497415 non-null  object 
 2   STOP                 497415 non-null  object 
 3   PATIENT              497415 non-null  object 
 4   ORGANIZATION         497415 non-null  object 
 5   PROVIDER             497415 non-null  object 
 6   PAYER                497415 non-null  object 
 7   ENCOUNTERCLASS       497415 non-null  object 
 8   CODE                 497415 non-null  int64  
 9   DESCRIPTION          497415 non-null  object 
 10  BASE_ENCOUNTER_COST  497415 non-null  float64
 11  TOTAL_CLAIM_COST     497415 non-null  float64
 12  PAYER_COVERAGE       497415 non-null  float64
 13  REASONCODE           218995 non-null  float64
 14  REASONDESCRIPTION    218995 non-null  object 
dtypes: float64(4), in

In [15]:
# Preview the first five encounter rows to sanity-check how the values were parsed.
encounters_df.head()

Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION
0,b7d4cb84-84b8-55a6-186c-fc2a9baa9e12,2010-01-15T11:35:37Z,2010-01-15T12:28:52Z,75bef938-819a-ce94-8dc9-17dc50077a86,fb4a08b8-57ee-382b-b3cb-43d94f1bb4c0,ca383a2c-61e9-319e-8afa-97bc71679361,0133f751-9229-3cfd-815f-b6d4979bdd6a,wellness,162673000,General examination of patient (procedure),136.8,1233.93,1233.93,,
1,896d8330-07e5-9c95-3f71-a9033c13f837,2005-11-21T11:16:28Z,2005-11-21T12:03:47Z,faf277d8-c5c7-406c-f139-9143c35ff4e8,071a898d-3535-39fd-bf61-d33301aacd04,c7efc940-0b81-3a2b-8d26-c3fcad47abcb,df166300-5a78-3502-a46a-832842197811,wellness,162673000,General examination of patient (procedure),136.8,1302.7,1152.7,,
2,4c55d50d-930c-2db6-27e7-2ec206adc81a,2006-11-27T11:16:28Z,2006-11-27T12:13:43Z,faf277d8-c5c7-406c-f139-9143c35ff4e8,071a898d-3535-39fd-bf61-d33301aacd04,c7efc940-0b81-3a2b-8d26-c3fcad47abcb,df166300-5a78-3502-a46a-832842197811,wellness,162673000,General examination of patient (procedure),136.8,704.2,704.2,,
3,fc37d497-489e-fafd-4641-bcab2211611b,2009-11-30T11:16:28Z,2009-11-30T12:04:00Z,faf277d8-c5c7-406c-f139-9143c35ff4e8,071a898d-3535-39fd-bf61-d33301aacd04,c7efc940-0b81-3a2b-8d26-c3fcad47abcb,df166300-5a78-3502-a46a-832842197811,wellness,162673000,General examination of patient (procedure),136.8,1186.78,1086.78,,
4,21f8b4bc-8f2d-18f2-60bc-6e6e2e1edbaf,2012-12-03T11:16:28Z,2012-12-03T11:52:22Z,faf277d8-c5c7-406c-f139-9143c35ff4e8,071a898d-3535-39fd-bf61-d33301aacd04,c7efc940-0b81-3a2b-8d26-c3fcad47abcb,df166300-5a78-3502-a46a-832842197811,wellness,162673000,General examination of patient (procedure),136.8,704.2,604.2,,


In [20]:
# List each distinct (ENCOUNTERCLASS, DESCRIPTION) pair once, ordered by class, then description.
# NOTE(review): the saved output contains labels that differ only in case or punctuation
# (e.g. "Encounter for Problem" vs "Encounter for problem") — these may need normalising.
encounter_label_cols = ['ENCOUNTERCLASS', 'DESCRIPTION']
encounters_df.loc[:, encounter_label_cols].drop_duplicates().sort_values(encounter_label_cols)

Unnamed: 0,ENCOUNTERCLASS,DESCRIPTION
5473,ambulatory,Allergic disorder follow-up assessment
349,ambulatory,Allergic disorder initial assessment
3754,ambulatory,Asthma follow-up
522,ambulatory,Discussion about treatment (procedure)
5489,ambulatory,Domiciliary or rest home patient evaluation and management
10,ambulatory,Encounter for 'check-up'
20440,ambulatory,Encounter for Problem
241223,ambulatory,Encounter for check up
60,ambulatory,Encounter for problem
128,ambulatory,Encounter for problem (procedure)


# Provider Data

## Explore Full Provider Data

In [None]:
# Read the provider table from the data folder and report its (rows, columns) size.
# NOTE(review): ZIP is inferred as int64 in the saved schema and the preview shows values such
# as 1420 and 10201314 — leading zeros (and presumably ZIP+4 formatting) are lost; consider
# dtype={'ZIP': str} if ZIP is used as an identifier. Confirm against downstream cells first.
provider_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'providers.csv'))
provider_df.shape

(1088, 13)

In [None]:
# Print the schema summary (dtype and non-null count per column) for the provider table.
provider_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1088 entries, 0 to 1087
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            1088 non-null   object 
 1   ORGANIZATION  1088 non-null   object 
 2   NAME          1088 non-null   object 
 3   GENDER        1088 non-null   object 
 4   SPECIALITY    1088 non-null   object 
 5   ADDRESS       1088 non-null   object 
 6   CITY          1088 non-null   object 
 7   STATE         1088 non-null   object 
 8   ZIP           1088 non-null   int64  
 9   LAT           1088 non-null   float64
 10  LON           1088 non-null   float64
 11  ENCOUNTERS    1088 non-null   int64  
 12  PROCEDURES    1088 non-null   int64  
dtypes: float64(2), int64(3), object(8)
memory usage: 110.6+ KB


In [None]:
# Preview the first five provider rows to sanity-check how the values were parsed.
provider_df.head()

Unnamed: 0,Id,ORGANIZATION,NAME,GENDER,SPECIALITY,ADDRESS,CITY,STATE,ZIP,LAT,LON,ENCOUNTERS,PROCEDURES
0,404167a6-b48b-3399-bc52-7172d276d812,74ab949d-17ac-3309-83a0-13b4405c66aa,Gabriel934 Reilly981,F,GENERAL PRACTICE,881 Main Street,Fitchburg,MA,1420,42.586487,-71.80521,14532,0
1,371fd834-696c-30b4-b01a-00a764128a93,da92d3fc-5445-3825-a937-043ef0d6ecd0,Neal874 Cruickshank494,M,GENERAL PRACTICE,336 GRATTAN ST,CHICOPEE,MA,10201314,42.166298,-72.590941,31,0
2,21442dfe-cb15-341e-9279-916b85a28f9d,588f6ce6-b8db-3588-8189-29db2680a313,Terence292 Brakus656,M,GENERAL PRACTICE,461 WALNUT AVE,JAMAICA PLAIN,MA,21302331,42.311588,-71.098001,527,0
3,9d158f38-019a-33c6-9243-ec52ce8272e5,324b4137-57a0-3ae0-89db-1c33f57ae0c1,Herma433 Satterfield305,F,GENERAL PRACTICE,134 NORTH ST,NORTH READING,MA,18641315,42.589234,-71.105465,9,0
4,c07adb29-c7b6-318c-94fc-861993ce5406,b6398e07-4967-31a5-807f-380039a1f303,Shane235 Lueilwitz711,M,GENERAL PRACTICE,19 TACOMA ST,WORCESTER,MA,16053516,42.302305,-71.766147,117,0


# Organization Data

## Explore Organization Data

In [None]:
# Read the organization table from the data folder (loaded only; nothing is displayed here).
organization_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'organizations.csv'))

# Medications Data

In [None]:
# Read the medications table from the data folder and report its (rows, columns) size.
medications_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'medications.csv'))
medications_df.shape

(604103, 13)

In [None]:
# Print the schema summary (dtype and non-null count per column) for the medications table.
medications_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 604103 entries, 0 to 604102
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   START              604103 non-null  object 
 1   STOP               586915 non-null  object 
 2   PATIENT            604103 non-null  object 
 3   PAYER              604103 non-null  object 
 4   ENCOUNTER          604103 non-null  object 
 5   CODE               604103 non-null  int64  
 6   DESCRIPTION        604103 non-null  object 
 7   BASE_COST          604103 non-null  float64
 8   PAYER_COVERAGE     604103 non-null  float64
 9   DISPENSES          604103 non-null  int64  
 10  TOTALCOST          604103 non-null  float64
 11  REASONCODE         550098 non-null  float64
 12  REASONDESCRIPTION  550098 non-null  object 
dtypes: float64(4), int64(2), object(7)
memory usage: 59.9+ MB


In [None]:
# Replace missing REASONCODE / REASONDESCRIPTION values with empty strings.
# Only these two columns are filled: a blanket fillna('') would also turn missing STOP
# timestamps into '' and hide the fact that they are absent.
# Side effect: REASONCODE is float64 on load, so after this fill it becomes an object
# column mixing floats and ''.
# TODO: follow-up handling of the reason fields continues below this cell.
medications_df = medications_df.fillna({'REASONCODE': '', 'REASONDESCRIPTION': ''})

In [None]:
# Preview the first five medication rows to sanity-check how the values were parsed.
medications_df.head()

Unnamed: 0,START,STOP,PATIENT,PAYER,ENCOUNTER,CODE,DESCRIPTION,BASE_COST,PAYER_COVERAGE,DISPENSES,TOTALCOST,REASONCODE,REASONDESCRIPTION
0,2013-07-24T12:53:37Z,2013-08-20T11:16:28Z,faf277d8-c5c7-406c-f139-9143c35ff4e8,df166300-5a78-3502-a46a-832842197811,eebc17d4-d133-24f2-3502-a3640d3feaba,861467,Meperidine Hydrochloride 50 MG Oral Tablet,53.88,0.0,1,53.88,,
1,2013-07-24T12:53:37Z,2013-10-01T12:53:37Z,faf277d8-c5c7-406c-f139-9143c35ff4e8,df166300-5a78-3502-a46a-832842197811,eebc17d4-d133-24f2-3502-a3640d3feaba,313782,Acetaminophen 325 MG Oral Tablet,124.87,0.0,2,249.74,,
2,2014-10-05T11:54:48Z,2015-09-30T11:54:48Z,75bef938-819a-ce94-8dc9-17dc50077a86,0133f751-9229-3cfd-815f-b6d4979bdd6a,90b93931-eec7-ff2b-f759-0381562bc7fa,831533,Errin 28 Day Pack,441.21,411.21,12,5294.52,,
3,2014-10-31T11:35:37Z,2014-11-21T11:35:37Z,75bef938-819a-ce94-8dc9-17dc50077a86,0133f751-9229-3cfd-815f-b6d4979bdd6a,c6fcb2f9-a4b5-fb21-2db3-36ae3114cbee,310965,Ibuprofen 200 MG Oral Tablet,90.68,60.68,1,90.68,,
4,2015-09-30T11:54:48Z,2016-04-01T11:35:37Z,75bef938-819a-ce94-8dc9-17dc50077a86,0133f751-9229-3cfd-815f-b6d4979bdd6a,f90f36bc-72a9-8458-839a-68c3ef53695a,831533,Errin 28 Day Pack,680.38,650.38,6,4082.28,,


# Conditions Data

In [None]:
# Read the conditions table from the data folder and report its (rows, columns) size.
conditions_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'conditions.csv'))
conditions_df.shape

(254432, 6)

In [None]:
# Print the schema summary (dtype and non-null count per column) for the conditions table.
conditions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254432 entries, 0 to 254431
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   START        254432 non-null  object
 1   STOP         201506 non-null  object
 2   PATIENT      254432 non-null  object
 3   ENCOUNTER    254432 non-null  object
 4   CODE         254432 non-null  int64 
 5   DESCRIPTION  254432 non-null  object
dtypes: int64(1), object(5)
memory usage: 11.6+ MB


# Procedures Data

In [None]:
# Read the procedures table from the data folder and report its (rows, columns) size.
procedures_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'procedures.csv'))
procedures_df.shape

(785604, 9)

In [None]:
# Print the schema summary (dtype and non-null count per column) for the procedures table.
procedures_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785604 entries, 0 to 785603
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   START              785604 non-null  object 
 1   STOP               785604 non-null  object 
 2   PATIENT            785604 non-null  object 
 3   ENCOUNTER          785604 non-null  object 
 4   CODE               785604 non-null  int64  
 5   DESCRIPTION        785604 non-null  object 
 6   BASE_COST          785604 non-null  float64
 7   REASONCODE         249782 non-null  float64
 8   REASONDESCRIPTION  249782 non-null  object 
dtypes: float64(2), int64(1), object(6)
memory usage: 53.9+ MB


# Allergies Data

In [None]:
# Read the allergies table from the data folder and report its (rows, columns) size.
allergies_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'allergies.csv'))
allergies_df.shape

(4062, 15)

In [None]:
# Print the schema summary (dtype and non-null count per column) for the allergies table.
allergies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4062 entries, 0 to 4061
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   START         4062 non-null   object 
 1   STOP          0 non-null      float64
 2   PATIENT       4062 non-null   object 
 3   ENCOUNTER     4062 non-null   object 
 4   CODE          4062 non-null   int64  
 5   SYSTEM        4062 non-null   object 
 6   DESCRIPTION   4062 non-null   object 
 7   TYPE          4062 non-null   object 
 8   CATEGORY      4062 non-null   object 
 9   REACTION1     2035 non-null   float64
 10  DESCRIPTION1  2035 non-null   object 
 11  SEVERITY1     2035 non-null   object 
 12  REACTION2     1246 non-null   float64
 13  DESCRIPTION2  1246 non-null   object 
 14  SEVERITY2     1246 non-null   object 
dtypes: float64(3), int64(1), object(11)
memory usage: 476.1+ KB


In [None]:
# Swap every missing value in the allergies table for an empty string, then preview five rows.
# NOTE(review): this rebinds allergies_df, so the schema summary printed earlier no longer
# describes it; the float REACTION1/REACTION2 and STOP columns become object columns holding ''.
allergies_df = allergies_df.fillna(value='')
allergies_df.head()

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,SYSTEM,DESCRIPTION,TYPE,CATEGORY,REACTION1,DESCRIPTION1,SEVERITY1,REACTION2,DESCRIPTION2,SEVERITY2
0,1982-11-26,,4f01341c-4946-31cd-9f2e-7844867f1606,6cf7b46d-f474-77ed-3090-bb3426145d69,84489001,Unknown,Mold (organism),allergy,environment,,,,,,
1,1982-11-26,,4f01341c-4946-31cd-9f2e-7844867f1606,6cf7b46d-f474-77ed-3090-bb3426145d69,29046,Unknown,Lisinopril,intolerance,medication,,,,,,
2,1951-04-21,,2c962d54-191c-2810-b42d-a6b68e5cb540,38977bb0-20ea-0f72-3034-a6252dd8bf47,1191,Unknown,Aspirin,allergy,medication,247472004.0,Wheal (finding),MODERATE,,,
3,1970-12-14,,bc87d4d2-66e0-a893-4bc6-32bbd9ca334e,db7f8530-6d11-ae13-b15f-45772755e922,111088007,Unknown,Latex (substance),allergy,environment,247472004.0,Wheal (finding),MILD,,,
4,1970-12-14,,bc87d4d2-66e0-a893-4bc6-32bbd9ca334e,db7f8530-6d11-ae13-b15f-45772755e922,288328004,Unknown,Bee venom (substance),allergy,environment,271807003.0,Eruption of skin (disorder),MODERATE,,,


# Care Plan Data

In [None]:
# Read the care plan table from the data folder and report its (rows, columns) size.
careplans_df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'careplans.csv'))
careplans_df.shape

(22293, 9)

In [None]:
# Print the schema summary (dtype and non-null count per column) for the care plan table.
careplans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22293 entries, 0 to 22292
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Id                 22293 non-null  object 
 1   START              22293 non-null  object 
 2   STOP               11410 non-null  object 
 3   PATIENT            22293 non-null  object 
 4   ENCOUNTER          22293 non-null  object 
 5   CODE               22293 non-null  int64  
 6   DESCRIPTION        22293 non-null  object 
 7   REASONCODE         11531 non-null  float64
 8   REASONDESCRIPTION  11531 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 1.5+ MB
