In [1]:
# %pip install openpyxl  # uncomment if openpyxl is missing; pandas needs it to read the .xlsx file

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from collections import Counter

import graphistry
from graphistry.feature_utils import group_columns_by_dtypes, get_dataframe_by_column_dtype

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Authenticate against Graphistry Hub.
# Credentials are read from the GRAPHISTRY_USERNAME / GRAPHISTRY_PASSWORD environment
# variables so they never have to be typed into (and saved with) the notebook.
# The "..." fallbacks preserve the original placeholder behaviour when the variables are unset.
graphistry.register(
    api=3,
    protocol="https",
    server="hub.graphistry.com",
    username=os.environ.get("GRAPHISTRY_USERNAME", "..."),
    password=os.environ.get("GRAPHISTRY_PASSWORD", "..."),
)

In [4]:
# Base graphistry object; the node table is attached to it further down via g.nodes(...).
g = graphistry.bind()

In [5]:
# Expand `~/` to the current user's home directory; used as the base path for the data file.
res = os.path.expanduser('~/')

In [6]:
# Load the dataset from the user's Downloads folder.
# os.path.join builds the path with exactly one separator between parts, whether or
# not `res` ends with a slash (plain string concatenation produced a doubled one).
df = pd.read_excel(os.path.join(res, 'Downloads', 'mass-vax-dataset.xlsx'))

In [7]:
# Add a running integer node identifier `n` (0 .. len(df)-1); it lines up with the
# node ids used in the emergent edges dataframe.
df = df.assign(n=range(len(df)))

In [8]:
# Inspect the column names of the loaded table (including the added `n` id column).
df.columns

Index(['Date', 'Time of event', 'Day of Week', 'SYMPTOMS',
       'Patient observation site', 'Dispottion to', 'Gender',
       'Observation length of stay min', 'Age', 'Intervention', 'Notes',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'n'],
      dtype='object')

In [9]:
# Remove columns that are NaN or uninteresting (expected to be uniformly distributed
# and not correlative).
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so on a second
# run the columns are already gone and a plain drop would raise KeyError.
df = df.drop(
    columns=['Unnamed: 12', 'Unnamed: 13', 'Unnamed: 11', 'Time of event', 'Day of Week'],
    errors='ignore',
)
# Group the remaining columns by dtype; the groups are used below to fill missing values.
gtypes = group_columns_by_dtypes(df)

[feature_utils.py:323 - group_columns_by_dtypes() ] datetime64[ns] has 1 members
[feature_utils.py:323 - group_columns_by_dtypes() ] int64 has 1 members
[feature_utils.py:323 - group_columns_by_dtypes() ] float64 has 2 members
[feature_utils.py:323 - group_columns_by_dtypes() ] object has 6 members


In [10]:
# Data cleaning: missing values in the float64 columns become the sentinel -1.0.
float_cols = gtypes['float64']
df[float_cols] = df[float_cols].fillna(-1.0)

In [11]:
# Missing values in the object (string) columns become the empty string.
object_cols = gtypes['object']
df[object_cols] = df[object_cols].fillna('')

In [12]:
# Date is not very interesting for this analysis, but missing dates are still
# filled with a fixed placeholder timestamp so the column has no gaps.
placeholder_date = pd.Timestamp('20210206')
df['Date'] = df['Date'].fillna(placeholder_date)

# Let's look at how often each value occurs in the different columns

In [13]:
# Frequency of each reported symptom, most common first.
Counter(df['SYMPTOMS']).most_common()

[('dizzy', 815),
 ('Syncope', 243),
 ("throat sx's", 159),
 ('Allergy - Skin', 121),
 ('cardiac', 100),
 ('anxiety', 80),
 ('N / V', 73),
 ('Other', 67),
 ('Local', 37),
 ('SOB', 29),
 ('Falls', 11),
 ('Allergy', 2)]

In [14]:
# Frequency of each intervention, most common first ('' marks rows that had no value).
Counter(df['Intervention']).most_common()

[('Rn Obs Only', 799),
 ('MD Exam & Reassurance', 526),
 ('PO Challenge', 260),
 ('OTC meds', 82),
 ('Prescription meds', 54),
 ('', 9),
 ('Wound Care', 7)]

In [15]:
# Frequency of each disposition, most common first
# ('Dispottion to' is the column name exactly as spelled in the dataset).
Counter(df['Dispottion to']).most_common()

[('home with monitoring', 903),
 ('DC home alone', 531),
 ('Urgent PMD follow up', 259),
 ('Ambulance (AMR)', 31),
 ('Private car to ER', 8),
 ('AMA', 3),
 ('', 2)]

In [16]:
# Frequency of each free-text note, most common first. The values are left uncleaned,
# so case and spelling variants (juice / Juice / JUICE, zyrtec / zytrec) are counted separately.
Counter(df['Notes']).most_common()

[('', 1064),
 ('juice', 143),
 ('zyrtec', 55),
 ('water', 53),
 ('Juice', 27),
 ('diaphoretic', 20),
 ('zofran', 13),
 ('Zyrtec', 12),
 ('headache', 9),
 ('hx of syncope', 6),
 ('hx of fainting', 5),
 ('palpitations', 5),
 ('employee', 4),
 ('ambulance', 4),
 ('JUICE', 4),
 ('Zofran', 3),
 ('po challenge', 3),
 ('juice, diaphoretic', 3),
 ('JANSSEN', 3),
 ('Water', 3),
 ('afib with first vaccination dose', 2),
 ('vaccine declined', 2),
 ('blurred vision', 2),
 ('zytrec', 2),
 ('15 yo accidently vaccinated', 2),
 ('syringe malfunction, Re-vaccinated, needle retacted immediately, vaccine leaked on skin',
  2),
 ('juice and crackers  ', 2),
 ('juice and crackers', 2),
 ('tingling', 2),
 ('lip tingling', 2),
 ('venishing point, Medical device failure, second full dose administered', 2),
 ('WATER', 2),
 ('JANSSEN?', 2),
 ('ZYRTEC', 2),
 ('crackers', 2),
 ('diaphoretic, juice', 2),
 ('juice/crackers', 2),
 ('anticoag, anaphylaxis', 1),
 ('Went home, then ED next day', 1),
 ('hand laceration,

In [18]:
# Columns remaining after the uninteresting ones were dropped.
df.columns

Index(['Date', 'SYMPTOMS', 'Patient observation site', 'Dispottion to',
       'Gender', 'Observation length of stay min', 'Age', 'Intervention',
       'Notes', 'n'],
      dtype='object')

In [29]:
# What's nice about .featurize is that no data munging (JUICE -> juice, etc.) is
# needed beforehand; the textual encoder is relied on to cope with such variants.
# Only the columns listed below are featurized (do not add `Date` unless it is
# first coerced to string). This selection produces an interesting visual journey
# through the data.
g2 = g.nodes(df, 'n').featurize(
    use_columns=[
        'SYMPTOMS',
        'Dispottion to',
        'Gender',
        'Intervention',
        # 'Patient observation site',        # not interesting
        # 'Observation length of stay min',  # not that interesting
        # 'Age',                             # interesting, but for outside analysis
        'Notes',
    ]
)

[feature_utils.py:164 - remove_node_column_from_ndf_and_return_ndf_from_res() ] removing node column `n` so we do not featurize it
[feature_utils.py:802 - get_dataframe_columns() ] returning DataFrame with columns `['SYMPTOMS', 'Dispottion to', 'Gender', 'Intervention', 'Notes']`
[feature_utils.py:434 - check_if_textual_column() ] 
	Column `Dispottion to` looks textual with mean number of words 3.13
[feature_utils.py:434 - check_if_textual_column() ] 
	Column `Intervention` looks textual with mean number of words 3.06
[feature_utils.py:484 -       encode_textual() ] -Calculating Embeddings for column `Dispottion to`
[feature_utils.py:484 -       encode_textual() ] -Calculating Embeddings for column `Intervention`
[feature_utils.py:487 -       encode_textual() ] Encoded Textual data at 3989.94 rows per column minute
[feature_utils.py:611 - process_dirty_dataframes() ] Encoding DataFrame might take a few minutes --------
[feature_utils.py:616 - process_dirty_dataframes() ] -Shape of data

In [30]:
# Run UMAP on the featurized nodes. A low `scale` is used because the edge weights
# are highly non-Gaussian (a Box-Cox transform would help).
g3 = g2.umap(scale=0.3) # needs low scale as edge weights are highly non-gaussian (box-cox would help)

[feature_utils.py:1045 -                 umap() ] There are repeat entities in node table, we will not relabel nodes
[feature_utils.py:920 - _featurize_or_get_nodes_data_if_X_is_None() ] Found Node features in `res`
[feature_utils.py:932 - _featurize_or_get_nodes_data_if_X_is_None() ] Fetching `node_target` in `res`. Target is type <class 'NoneType'>
[umap_utils.py:92 -                  fit() ] Starting UMAP-ing data of shape (1737, 825)
[umap_utils.py:105 -      _edge_influence() ] Calculating weighted adjacency (edge) DataFrame
[umap_utils.py:96 -                  fit() ] -UMAP-ing took 0.18 minutes total
[umap_utils.py:97 -                  fit() ]  - or 9752.97 rows per minute
[feature_utils.py:778 - prune_weighted_edges_df() ] Pruning weighted edge DataFrame from 31356 to 24978 edges


In [22]:
# Display the weighted edge table (_src, _dst, weight) that was derived from the nodes.
g3.weighted_edges_df_from_nodes  # implicit edges from featurization and umap

Unnamed: 0,_src,_dst,weight
6,0,998,1.000000
13,1,25,1.000000
29,2,173,1.000000
35,2,1527,0.932380
36,3,11,0.637480
...,...,...,...
25801,1735,1298,0.686397
25805,1735,1619,0.754393
25817,1736,568,1.000000
25818,1736,608,1.000000


In [31]:
# Plot the graph, using each node's `Notes` value as its point title.
g3.bind(point_title='Notes').plot()

In [32]:
# Same plot, but with each node's `SYMPTOMS` value as its point title.
g3.bind(point_title='SYMPTOMS').plot()