In [1]:
def get_var_category(series):
    unique_count = series.nunique(dropna=False)
    total_count = len(series)
    if pd.api.types.is_numeric_dtype(series):
        return 'Numerical'
    elif pd.api.types.is_datetime64_dtype(series):
        return 'Date'
    elif unique_count==total_count:
        return 'Text (Unique)'
    else:
        return 'Categorical'

def print_categories(df):
    for column_name in df.columns:
        print(column_name, ": ", get_var_category(df[column_name]))

In [2]:
import numpy as np
import pandas as pd

# Setup HTML display
from IPython.core.display import display, HTML
# Notebook cell width adjustment
display(HTML('<style>.container { width:80% !important; }</style>'))

In [8]:
all_dfs = pd.read_excel(r'./data/20210225-ems-raw-v04.xlsx', 
                        sheet_name=None, 
                        na_values=['NA'])
print("All dataset names: ", all_dfs.keys)
df_pat = all_dfs['Patients']
print("Patients dataset shape: ", df_pat.shape)
df_proc = all_dfs['Procedures']
print("Procedures  dataset shape: ", df_proc.shape)
df_med = all_dfs['Medications']
print("Medications dataset shape: ", df_med.shape)

# get accurate row count for Venn Diagram later
varPatCount = len(df_pat.index)
varProcCount = len(df_proc.index)
varMadCount = len(df_med.index)

All dataset names:  <built-in method keys of dict object at 0x000001E73063DD18>
Patients dataset shape:  (543774, 12)
Procedures  dataset shape:  (170808, 6)
Medications dataset shape:  (63168, 6)


In [9]:
print_categories(df_pat)

PatientId :  Numerical
FRDPersonnelID :  Categorical
Shift :  Categorical
UnitId :  Categorical
FireStation :  Numerical
Battalion :  Numerical
PatientOutcome :  Categorical
PatientGender :  Categorical
CrewMemberRoles :  Categorical
DispatchTime :  Date
FRDPersonnelGender :  Categorical
FRDPersonnelStartDate :  Date


In [10]:
print_categories(df_proc)

Dim_Procedure_PK :  Numerical
PatientId :  Numerical
Procedure_Performed_Code :  Numerical
Procedure_Performed_Description :  Categorical
FRDPersonnelID :  Categorical
Procedure_Performed_Date_Time :  Date


In [11]:
print_categories(df_med)

Dim_Medication_PK :  Numerical
PatientId :  Numerical
Medication_Given_RXCUI_Code :  Numerical
Medication_Given_Description :  Categorical
FRDPersonnelID :  Categorical
Medication_Administered_Date_Time :  Date


In [12]:
# Focus Question 1.1

# Inner join on Patients and Procedures example and display first 5 rows
df_pat_proc = df_pat.merge(df_proc, 
                        on=('PatientId','FRDPersonnelID'))
df_pat_proc.shape

(147868, 16)

In [14]:
# get accurate row count for Venn Diagram later
varPatProcCount = len(df_pat_proc.index)

varPatProcDiff = varPatCount - varPatProcCount
varPatProcPct = varPatProcCount / varPatCount 
varProcDiff = varProcCount - varPatProcCount
varProcPct = varPatProcCount / varProcCount
print(varPatProcPct)
print(varProcPct)

0.27192914703534926
0.8656971570418247


In [15]:
df_proc['PK'] = df_proc['PatientId'].map(str) + '_' + df_proc['FRDPersonnelID']
df_pat_proc['_PK'] = df_pat_proc['PatientId'].map(str) + '_' + df_pat_proc['FRDPersonnelID']

In [16]:
df_proc.shape

(170808, 7)

In [17]:
df_pat_proc.shape

(147868, 17)

In [18]:
df_pat_proc.columns

Index(['PatientId', 'FRDPersonnelID', 'Shift', 'UnitId', 'FireStation',
       'Battalion', 'PatientOutcome', 'PatientGender', 'CrewMemberRoles',
       'DispatchTime', 'FRDPersonnelGender', 'FRDPersonnelStartDate',
       'Dim_Procedure_PK', 'Procedure_Performed_Code',
       'Procedure_Performed_Description', 'Procedure_Performed_Date_Time',
       '_PK'],
      dtype='object')

In [28]:
#add back in later
#df_proc['PK'] = df_proc['PatientId'].map(str) + '|' + df_proc['FRDPersonnelID']
#df_pat_proc['_PK'] = df_pat_proc['PatientId'].map(str) + '|' + df_pat_proc['FRDPersonnelID']

procPKi = df_proc.set_index('PK')
patProcPKi = df_pat_proc.set_index('_PK')

df_proc_left_outer = df_proc.merge(patProcPKi, 
                             how = 'left', 
                             left_on = ('PK'), 
                             right_on = ('_PK'), 
                             indicator = 'i').query('i == "left_only"')

In [29]:
df_proc_left_outer.shape

(22948, 24)

In [30]:
df_proc_left_outer.head(5)

Unnamed: 0,Dim_Procedure_PK_x,PatientId_x,Procedure_Performed_Code_x,Procedure_Performed_Description_x,FRDPersonnelID_x,Procedure_Performed_Date_Time_x,PK,PatientId_y,FRDPersonnelID_y,Shift,...,PatientGender,CrewMemberRoles,DispatchTime,FRDPersonnelGender,FRDPersonnelStartDate,Dim_Procedure_PK_y,Procedure_Performed_Code_y,Procedure_Performed_Description_y,Procedure_Performed_Date_Time_y,i
27,180516,479885,392230005,IV Start - Extremity Vein (arm or leg),17D3C99E-9E01-E211-B5F5-78E7D18CFD3C,2018-01-01 01:36:56,479885_17D3C99E-9E01-E211-B5F5-78E7D18CFD3C,,,,...,,,NaT,,NaT,,,,NaT,left_only
30,180519,479920,392230005,IV Start - Extremity Vein (arm or leg),C387923A-F613-E411-A585-F305C4522FCB,2018-01-01 01:53:00,479920_C387923A-F613-E411-A585-F305C4522FCB,,,,...,,,NaT,,NaT,,,,NaT,left_only
31,180532,480011,392230005,IV Start - Extremity Vein (arm or leg),9BD2C99E-9E01-E211-B5F5-78E7D18CFD3C,2018-01-01 03:20:15,480011_9BD2C99E-9E01-E211-B5F5-78E7D18CFD3C,,,,...,,,NaT,,NaT,,,,NaT,left_only
35,180549,480042,392230005,IV Start - Extremity Vein (arm or leg),F1D2C99E-9E01-E211-B5F5-78E7D18CFD3C,2018-01-01 04:37:35,480042_F1D2C99E-9E01-E211-B5F5-78E7D18CFD3C,,,,...,,,NaT,,NaT,,,,NaT,left_only
36,180557,480049,392230005,IV Start - Extremity Vein (arm or leg),72D2C99E-9E01-E211-B5F5-78E7D18CFD3C,2018-01-01 05:30:30,480049_72D2C99E-9E01-E211-B5F5-78E7D18CFD3C,,,,...,,,NaT,,NaT,,,,NaT,left_only


In [31]:
df_proc_left_outer.columns

Index(['Dim_Procedure_PK_x', 'PatientId_x', 'Procedure_Performed_Code_x',
       'Procedure_Performed_Description_x', 'FRDPersonnelID_x',
       'Procedure_Performed_Date_Time_x', 'PK', 'PatientId_y',
       'FRDPersonnelID_y', 'Shift', 'UnitId', 'FireStation', 'Battalion',
       'PatientOutcome', 'PatientGender', 'CrewMemberRoles', 'DispatchTime',
       'FRDPersonnelGender', 'FRDPersonnelStartDate', 'Dim_Procedure_PK_y',
       'Procedure_Performed_Code_y', 'Procedure_Performed_Description_y',
       'Procedure_Performed_Date_Time_y', 'i'],
      dtype='object')

In [32]:
# create new dataframe without _x
# Rename columns with _x
df_proc_left_outer.rename(columns = {'Dim_Procedure_PK_x':'Dim_Procedure_PK',
                                     'PatientId_x':'PatientId',
                                     'Procedure_Performed_Code_x':'Procedure_Performed_Code',
                                     'Procedure_Performed_Description_x':'Procedure_Performed_Description',
                                     'Procedure_Performed_Date_Time_x':'Procedure_Performed_Date_Time',
                                     'FRDPersonnelID_x':'FRDPersonnelID',
                                     'Procedure_Performed_Date_Time_x':'Procedure_Performed_Date_Time'}, inplace = True) 
print("\nAfter modifying first column:\n", df_proc_left_outer.columns)


After modifying first column:
 Index(['Dim_Procedure_PK', 'PatientId', 'Procedure_Performed_Code',
       'Procedure_Performed_Description', 'FRDPersonnelID',
       'Procedure_Performed_Date_Time', 'PK', 'PatientId_y',
       'FRDPersonnelID_y', 'Shift', 'UnitId', 'FireStation', 'Battalion',
       'PatientOutcome', 'PatientGender', 'CrewMemberRoles', 'DispatchTime',
       'FRDPersonnelGender', 'FRDPersonnelStartDate', 'Dim_Procedure_PK_y',
       'Procedure_Performed_Code_y', 'Procedure_Performed_Description_y',
       'Procedure_Performed_Date_Time_y', 'i'],
      dtype='object')


In [33]:
# Drop all NaN, _y columns, don't need them anymore
df_proc_left_outer = df_proc_left_outer.drop(['PatientId_y', 'FRDPersonnelID_y', 'Shift', 'UnitId', 'FireStation',
                                             'Battalion', 'PatientOutcome', 'PatientGender', 'CrewMemberRoles',
                                             'DispatchTime', 'FRDPersonnelGender', 'FRDPersonnelStartDate',
                                             'Dim_Procedure_PK_y', 'Procedure_Performed_Code_y', 'Procedure_Performed_Description_y',
                                             'Procedure_Performed_Date_Time_y', 'i'],axis=1)
print("\nAfter modifying first column:\n", df_proc_left_outer.columns)


After modifying first column:
 Index(['Dim_Procedure_PK', 'PatientId', 'Procedure_Performed_Code',
       'Procedure_Performed_Description', 'FRDPersonnelID',
       'Procedure_Performed_Date_Time', 'PK'],
      dtype='object')


In [34]:
# Running example
df_pat.loc[df_pat['PatientId'] == 481501]

Unnamed: 0,PatientId,FRDPersonnelID,Shift,UnitId,FireStation,Battalion,PatientOutcome,PatientGender,CrewMemberRoles,DispatchTime,FRDPersonnelGender,FRDPersonnelStartDate
870,481501,6CD5C99E-9E01-E211-B5F5-78E7D18CFD3C,B - Shift,M440,40,403,Treated & Transported,Female,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-02 17:39:10,Female,2006-01-09
871,481501,5AD3C99E-9E01-E211-B5F5-78E7D18CFD3C,B - Shift,M440,40,403,Treated & Transported,Female,"Driver-Response,Driver-Transport",2018-01-02 17:39:10,Male,2007-06-11


In [35]:
df_proc.loc[df_proc['PatientId'] == 481501]

Unnamed: 0,Dim_Procedure_PK,PatientId,Procedure_Performed_Code,Procedure_Performed_Description,FRDPersonnelID,Procedure_Performed_Date_Time,PK
251,181448,481501,230040009,Airway - Clear/Suction,78C0656B-4FD3-4AFD-8907-05258241F20A,2018-01-02 18:35:27,481501_78C0656B-4FD3-4AFD-8907-05258241F20A
252,181449,481501,425447009,Resp - Assist Ventilation - BVM Via Mask,6CD5C99E-9E01-E211-B5F5-78E7D18CFD3C,2018-01-02 18:35:56,481501_6CD5C99E-9E01-E211-B5F5-78E7D18CFD3C
253,181450,481501,392230005,IV Start - Extremity Vein (arm or leg),78C0656B-4FD3-4AFD-8907-05258241F20A,2018-01-02 18:36:36,481501_78C0656B-4FD3-4AFD-8907-05258241F20A


In [36]:
df_pat_proc.loc[df_pat_proc['PatientId'] == 481501]
# of the 3 rows in df_proc, this matched correctly with the inner join for provider 6CD5C99E-9E01-E211-B5F5-78E7D18CFD3C

Unnamed: 0,PatientId,FRDPersonnelID,Shift,UnitId,FireStation,Battalion,PatientOutcome,PatientGender,CrewMemberRoles,DispatchTime,FRDPersonnelGender,FRDPersonnelStartDate,Dim_Procedure_PK,Procedure_Performed_Code,Procedure_Performed_Description,Procedure_Performed_Date_Time,_PK
218,481501,6CD5C99E-9E01-E211-B5F5-78E7D18CFD3C,B - Shift,M440,40,403,Treated & Transported,Female,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-02 17:39:10,Female,2006-01-09,181449,425447009,Resp - Assist Ventilation - BVM Via Mask,2018-01-02 18:35:56,481501_6CD5C99E-9E01-E211-B5F5-78E7D18CFD3C


In [38]:
df_proc_left_outer.loc[df_proc_left_outer['PatientId'] == 481501]
# This correctly separates out the 2 rows for provider 78C0656B-4FD3-4AFD-8907-05258241F20A

Unnamed: 0,Dim_Procedure_PK,PatientId,Procedure_Performed_Code,Procedure_Performed_Description,FRDPersonnelID,Procedure_Performed_Date_Time,PK
491,181448,481501,230040009,Airway - Clear/Suction,78C0656B-4FD3-4AFD-8907-05258241F20A,2018-01-02 18:35:27,481501_78C0656B-4FD3-4AFD-8907-05258241F20A
493,181450,481501,392230005,IV Start - Extremity Vein (arm or leg),78C0656B-4FD3-4AFD-8907-05258241F20A,2018-01-02 18:36:36,481501_78C0656B-4FD3-4AFD-8907-05258241F20A


In [39]:
# Test to find most pared-down unique list
df_pat.loc[df_pat['FRDPersonnelID'] == '78C0656B-4FD3-4AFD-8907-05258241F20A'] \
    .groupby(['FRDPersonnelID','FRDPersonnelGender','FRDPersonnelStartDate',
              'Shift','UnitId','FireStation','Battalion','CrewMemberRoles']).size().reset_index(name='count')

Unnamed: 0,FRDPersonnelID,FRDPersonnelGender,FRDPersonnelStartDate,Shift,UnitId,FireStation,Battalion,CrewMemberRoles,count
0,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,A - Shift,E427,27,405,Primary Patient Caregiver-At Scene,1
1,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,A - Shift,E440,40,403,Primary Patient Caregiver-At Scene,1
2,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,A - Shift,M401,1,402,"Primary Patient Caregiver-At Scene,Primary Pat...",7
3,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,A - Shift,M401,1,402,"Primary Patient Caregiver-At Scene,Primary Pat...",2
4,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,A - Shift,M402,2,402,"Primary Patient Caregiver-At Scene,Primary Pat...",2
...,...,...,...,...,...,...,...,...,...
74,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,C - Shift,M427,27,405,"Primary Patient Caregiver-At Scene,Primary Pat...",3
75,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,C - Shift,M431,31,401,"Primary Patient Caregiver-At Scene,Primary Pat...",11
76,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,C - Shift,M438,38,403,"Primary Patient Caregiver-At Scene,Primary Pat...",6
77,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,C - Shift,M440,40,403,"Primary Patient Caregiver-At Scene,Primary Pat...",32


In [40]:
# Test to find most pared-down unique list
df_pat.loc[df_pat['FRDPersonnelID'] == '78C0656B-4FD3-4AFD-8907-05258241F20A'] \
    .groupby(['FRDPersonnelID','FRDPersonnelGender','FRDPersonnelStartDate',
              'Shift']).size().reset_index(name='count')

Unnamed: 0,FRDPersonnelID,FRDPersonnelGender,FRDPersonnelStartDate,Shift,count
0,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,A - Shift,171
1,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,B - Shift,689
2,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,C - Shift,113


In [42]:
# Test to find most pared-down unique list
df_pat.loc[df_pat['FRDPersonnelID'] == '78C0656B-4FD3-4AFD-8907-05258241F20A'] \
    .groupby(['FRDPersonnelID','FRDPersonnelGender','FRDPersonnelStartDate',
              'Battalion']).size().reset_index(name='count')

Unnamed: 0,FRDPersonnelID,FRDPersonnelGender,FRDPersonnelStartDate,Battalion,count
0,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,401,27
1,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,402,29
2,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,403,705
3,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,404,23
4,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,405,31
5,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,406,138
6,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,407,20


In [43]:
# Test to find most pared-down unique list
df_pat.loc[df_pat['FRDPersonnelID'] == '78C0656B-4FD3-4AFD-8907-05258241F20A'] \
             .groupby(['FRDPersonnelID','FRDPersonnelGender','PatientId']) \
             .size() \
             .reset_index(name='count')

Unnamed: 0,FRDPersonnelID,FRDPersonnelGender,PatientId,count
0,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,480280,1
1,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,480444,1
2,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,480505,1
3,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,481488,1
4,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,481881,1
...,...,...,...,...
968,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,1459802,1
969,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,1460001,1
970,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,1460086,1
971,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,1460118,1


In [44]:
# Test to find most pared-down unique list
# Winner, winner, chicken dinner!
df_pat.loc[df_pat['FRDPersonnelID'] == '78C0656B-4FD3-4AFD-8907-05258241F20A'] \
             .groupby(['FRDPersonnelID','FRDPersonnelGender','FRDPersonnelStartDate']) \
             .size() \
             .reset_index(name='count')

Unnamed: 0,FRDPersonnelID,FRDPersonnelGender,FRDPersonnelStartDate,count
0,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,973


In [80]:
# Based on info below, need to group by FRDPersonnelID, FRDPersonnelGender, and FRDPersonnelStartDate
# See how large the dataset is, mayby just do for the orphan rows only
# Then join lookup dataframe df_prov_gend_lu with df_proc_left_outer on PatientId only 
# and need to pull in 
# Shift
# UnitId
# FireStation
# Battalion
# PatientOutcome
# PatientGender
# CrewMemberRoles
# DispatchTime

# Create Lookup table for Providers
df_prov_gend_lu = df_pat.groupby(['FRDPersonnelID','FRDPersonnelGender', 'FRDPersonnelStartDate'])\
             .size() \
             .reset_index(name='count')
df_prov_gend_lu.head(5)

Unnamed: 0,FRDPersonnelID,FRDPersonnelGender,FRDPersonnelStartDate,count
0,00D5C99E-9E01-E211-B5F5-78E7D18CFD3C,Male,2008-04-28,1200
1,00D6C99E-9E01-E211-B5F5-78E7D18CFD3C,Male,2007-09-17,388
2,00D7C99E-9E01-E211-B5F5-78E7D18CFD3C,Male,1993-09-20,21
3,00D8C99E-9E01-E211-B5F5-78E7D18CFD3C,Male,1991-06-03,5
4,00D9C99E-9E01-E211-B5F5-78E7D18CFD3C,Male,2003-04-19,249


In [81]:
df_prov_gend_lu.columns

Index(['FRDPersonnelID', 'FRDPersonnelGender', 'FRDPersonnelStartDate',
       'count'],
      dtype='object')

In [82]:
df_prov_gend_lu = df_prov_gend_lu.drop(['count'],axis=1)
df_prov_gend_lu.columns

Index(['FRDPersonnelID', 'FRDPersonnelGender', 'FRDPersonnelStartDate'], dtype='object')

In [83]:
# Join lookup dataframe df_prov_gend_lu with df_proc_left_outer on PatientId only
df_pat_proc_orphans = df_proc_left_outer.merge(df_prov_gend_lu, 
                        on=('FRDPersonnelID'))
df_pat_proc_orphans.shape

(22937, 9)

In [84]:
df_pat_proc_orphans.head()

Unnamed: 0,Dim_Procedure_PK,PatientId,Procedure_Performed_Code,Procedure_Performed_Description,FRDPersonnelID,Procedure_Performed_Date_Time,PK,FRDPersonnelGender,FRDPersonnelStartDate
0,180516,479885,392230005,IV Start - Extremity Vein (arm or leg),17D3C99E-9E01-E211-B5F5-78E7D18CFD3C,2018-01-01 01:36:56,479885_17D3C99E-9E01-E211-B5F5-78E7D18CFD3C,Male,2005-07-25
1,185426,486284,392230005,IV Start - Extremity Vein (arm or leg),17D3C99E-9E01-E211-B5F5-78E7D18CFD3C,2018-01-07 15:00:00,486284_17D3C99E-9E01-E211-B5F5-78E7D18CFD3C,Male,2005-07-25
2,186800,488083,392230005,IV Start - Extremity Vein (arm or leg),17D3C99E-9E01-E211-B5F5-78E7D18CFD3C,2018-01-09 07:08:00,488083_17D3C99E-9E01-E211-B5F5-78E7D18CFD3C,Male,2005-07-25
3,209301,514333,392230005,IV Start - Extremity Vein (arm or leg),17D3C99E-9E01-E211-B5F5-78E7D18CFD3C,2018-02-07 19:42:00,514333_17D3C99E-9E01-E211-B5F5-78E7D18CFD3C,Male,2005-07-25
4,214483,522281,392230005,IV Start - Extremity Vein (arm or leg),17D3C99E-9E01-E211-B5F5-78E7D18CFD3C,2018-02-16 16:02:00,522281_17D3C99E-9E01-E211-B5F5-78E7D18CFD3C,Male,2005-07-25


In [85]:
df_prov_gend_lu.loc[df_prov_gend_lu['FRDPersonnelID'] == '78C0656B-4FD3-4AFD-8907-05258241F20A']

Unnamed: 0,FRDPersonnelID,FRDPersonnelGender,FRDPersonnelStartDate
857,78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21


In [86]:
df_pat_proc_orphans.loc[df_pat_proc_orphans['FRDPersonnelID'] == '78C0656B-4FD3-4AFD-8907-05258241F20A'].count()

Dim_Procedure_PK                   46
PatientId                          46
Procedure_Performed_Code           46
Procedure_Performed_Description    46
FRDPersonnelID                     46
Procedure_Performed_Date_Time      46
PK                                 46
FRDPersonnelGender                 46
FRDPersonnelStartDate              46
dtype: int64

In [87]:
df_pat_proc_orphans.loc[df_pat_proc_orphans['PatientId'] == 481501]

Unnamed: 0,Dim_Procedure_PK,PatientId,Procedure_Performed_Code,Procedure_Performed_Description,FRDPersonnelID,Procedure_Performed_Date_Time,PK,FRDPersonnelGender,FRDPersonnelStartDate
919,181448,481501,230040009,Airway - Clear/Suction,78C0656B-4FD3-4AFD-8907-05258241F20A,2018-01-02 18:35:27,481501_78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21
920,181450,481501,392230005,IV Start - Extremity Vein (arm or leg),78C0656B-4FD3-4AFD-8907-05258241F20A,2018-01-02 18:36:36,481501_78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21


In [88]:
df_pat.columns

Index(['PatientId', 'FRDPersonnelID', 'Shift', 'UnitId', 'FireStation',
       'Battalion', 'PatientOutcome', 'PatientGender', 'CrewMemberRoles',
       'DispatchTime', 'FRDPersonnelGender', 'FRDPersonnelStartDate'],
      dtype='object')

In [118]:
# Join df_pat_proc_orphans dataframe back with df_pat to bring back in
# Shift
# UnitId
# FireStation
# Battalion
# PatientOutcome
# PatientGender
# CrewMemberRoles
# DispatchTime

#df_pat_proc_new = df_pat_proc_orphans.merge(df_pat.iloc[:, [2:9]],
df_pat_proc_new = df_pat_proc_orphans.merge(df_pat.loc[df_pat['CrewMemberRoles'] \
                                     .str.contains('Primary')][['PatientId','Shift','UnitId','FireStation','Battalion', 
                                                                'PatientOutcome','PatientGender','CrewMemberRoles','DispatchTime']],
                                     on=('PatientId'))
df_pat_proc_new.shape

(22870, 17)

In [119]:
df_pat_proc_new.loc[df_pat_proc_new['PatientId'] == 481501]

Unnamed: 0,Dim_Procedure_PK,PatientId,Procedure_Performed_Code,Procedure_Performed_Description,FRDPersonnelID,Procedure_Performed_Date_Time,PK,FRDPersonnelGender,FRDPersonnelStartDate,Shift,UnitId,FireStation,Battalion,PatientOutcome,PatientGender,CrewMemberRoles,DispatchTime
995,181448,481501,230040009,Airway - Clear/Suction,78C0656B-4FD3-4AFD-8907-05258241F20A,2018-01-02 18:35:27,481501_78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,B - Shift,M440,40,403,Treated & Transported,Female,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-02 17:39:10
996,181450,481501,392230005,IV Start - Extremity Vein (arm or leg),78C0656B-4FD3-4AFD-8907-05258241F20A,2018-01-02 18:36:36,481501_78C0656B-4FD3-4AFD-8907-05258241F20A,Male,2015-09-21,B - Shift,M440,40,403,Treated & Transported,Female,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-02 17:39:10


In [120]:
# Append df_pat_proc_new back to df_pat_proc and do not reindex
df_pat_proc_append = df_pat_proc.append(df_pat_proc_new, ignore_index=True)

In [121]:
df_pat_proc_append.shape

(170738, 18)

In [122]:
df_pat_proc_append.columns

Index(['PatientId', 'FRDPersonnelID', 'Shift', 'UnitId', 'FireStation',
       'Battalion', 'PatientOutcome', 'PatientGender', 'CrewMemberRoles',
       'DispatchTime', 'FRDPersonnelGender', 'FRDPersonnelStartDate',
       'Dim_Procedure_PK', 'Procedure_Performed_Code',
       'Procedure_Performed_Description', 'Procedure_Performed_Date_Time',
       '_PK', 'PK'],
      dtype='object')

In [123]:
df_pat_proc_append.loc[df_pat_proc_append['PatientId'] == 481501]

Unnamed: 0,PatientId,FRDPersonnelID,Shift,UnitId,FireStation,Battalion,PatientOutcome,PatientGender,CrewMemberRoles,DispatchTime,FRDPersonnelGender,FRDPersonnelStartDate,Dim_Procedure_PK,Procedure_Performed_Code,Procedure_Performed_Description,Procedure_Performed_Date_Time,_PK,PK
218,481501,6CD5C99E-9E01-E211-B5F5-78E7D18CFD3C,B - Shift,M440,40,403,Treated & Transported,Female,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-02 17:39:10,Female,2006-01-09,181449,425447009,Resp - Assist Ventilation - BVM Via Mask,2018-01-02 18:35:56,481501_6CD5C99E-9E01-E211-B5F5-78E7D18CFD3C,
148863,481501,78C0656B-4FD3-4AFD-8907-05258241F20A,B - Shift,M440,40,403,Treated & Transported,Female,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-02 17:39:10,Male,2015-09-21,181448,230040009,Airway - Clear/Suction,2018-01-02 18:35:27,,481501_78C0656B-4FD3-4AFD-8907-05258241F20A
148864,481501,78C0656B-4FD3-4AFD-8907-05258241F20A,B - Shift,M440,40,403,Treated & Transported,Female,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-02 17:39:10,Male,2015-09-21,181450,392230005,IV Start - Extremity Vein (arm or leg),2018-01-02 18:36:36,,481501_78C0656B-4FD3-4AFD-8907-05258241F20A


In [124]:
# This shows the merged and appended datset contains the same three procedures with all the info from Patients and Procedures 
df_proc.loc[df_proc['PatientId'] == 481501]

Unnamed: 0,Dim_Procedure_PK,PatientId,Procedure_Performed_Code,Procedure_Performed_Description,FRDPersonnelID,Procedure_Performed_Date_Time,PK
251,181448,481501,230040009,Airway - Clear/Suction,78C0656B-4FD3-4AFD-8907-05258241F20A,2018-01-02 18:35:27,481501_78C0656B-4FD3-4AFD-8907-05258241F20A
252,181449,481501,425447009,Resp - Assist Ventilation - BVM Via Mask,6CD5C99E-9E01-E211-B5F5-78E7D18CFD3C,2018-01-02 18:35:56,481501_6CD5C99E-9E01-E211-B5F5-78E7D18CFD3C
253,181450,481501,392230005,IV Start - Extremity Vein (arm or leg),78C0656B-4FD3-4AFD-8907-05258241F20A,2018-01-02 18:36:36,481501_78C0656B-4FD3-4AFD-8907-05258241F20A


In [125]:
df_pat_proc_append.to_csv(r'./data/dfPatProcAppend.csv', index = True)

In [None]:
# Focus Question 1.2