### Importing Key Libraries

In [1]:
import numpy as np

import pandas as pd
from pandas import Series, DataFrame
# Display configuration: cap the number of printed rows and the float precision.
# Fully qualified option keys are used on purpose: the short "precision" pattern
# is ambiguous on recent pandas releases (it also matches
# "styler.format.precision") and raises an OptionError there.
pd.set_option("display.max_rows", 60)
pd.set_option("display.precision", 5)

import pyarrow as pa
import pyarrow.parquet as pq

### Loading Patients Data from Raw File

In [2]:
# Read the 'Patients' worksheet of the raw EMS workbook into a DataFrame.
df = pd.read_excel(
    io='../data/01_raw/20210214-ems-raw-v03.xlsx',
    sheet_name='Patients',
)

# Preview the first rows of the loaded sheet.
df.head()

Unnamed: 0,PatientId,FRDPersonnelID,Shift,UnitId,FireStation,Battalion,PatientOutcome,PatientGender,CrewMemberRoles,DispatchTime,FRDPersonnelGender,FRDPersonnelStartDate
0,479838,6ED7C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M404,4,401,Standby (No Services Performed),,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-01 00:29:02,Male,1997-08-18
1,479838,F39A55F0-C619-E511-80CA-001DD8B71D38,A - Shift,M404,4,401,Standby (No Services Performed),,"Driver-Transport,Other Patient Caregiver-At Scene",2018-01-01 00:29:02,Male,2007-02-19
2,479838,D6D3C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M404,4,401,Standby (No Services Performed),,"Intern/Student,Other Patient Caregiver-At Scen...",2018-01-01 00:29:02,Male,2008-08-18
3,479839,71D4C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M401,1,402,No Treatment/Transport Required,,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-01 00:40:54,Male,2011-10-24
4,479839,F9D4C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M401,1,402,No Treatment/Transport Required,,"Driver-Response,Driver-Transport",2018-01-01 00:40:54,Male,2012-02-27


### Setting Data Types

In [3]:
# Cast every column to its intended dtype in a single astype() call.
# The two datetime columns use the explicit 'datetime64[ns]' unit because
# pandas 2.x rejects the unit-less 'datetime64' alias in astype().
df = df.astype(dtype={
    # Identifiers are kept as (nullable) strings.
    'PatientId': 'string',
    'FRDPersonnelID': 'string',
    # Columns stored as categoricals.
    'Shift': 'category',
    'FireStation': 'category',
    'Battalion': 'category',
    'PatientOutcome': 'category',
    'PatientGender': 'category',
    'FRDPersonnelGender': 'category',
    'UnitId': 'category',              # Could be changed to string
    'CrewMemberRoles': 'category',     # Could be changed to string
    # Timestamps with an explicit nanosecond unit.
    'DispatchTime': 'datetime64[ns]',
    'FRDPersonnelStartDate': 'datetime64[ns]',
})

### Creating Index Column
At this stage the index will be stored as a new column, until all duplicates are resolved.

In [4]:
# Composite key of the form "<PatientId>_<FRDPersonnelID>", one value per row.
patient_personnel_key = df['PatientId'] + "_" + df['FRDPersonnelID']
comp_index = patient_personnel_key.to_frame(name='comp_index')

# Append the key as an ordinary column (it is not set as the index here).
df_con = pd.concat([df, comp_index], axis=1)

# Preview the frame with the new 'comp_index' column.
df_con.head()

Unnamed: 0,PatientId,FRDPersonnelID,Shift,UnitId,FireStation,Battalion,PatientOutcome,PatientGender,CrewMemberRoles,DispatchTime,FRDPersonnelGender,FRDPersonnelStartDate,comp_index
0,479838,6ED7C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M404,4,401,Standby (No Services Performed),,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-01 00:29:02,Male,1997-08-18,479838_6ED7C99E-9E01-E211-B5F5-78E7D18CFD3C
1,479838,F39A55F0-C619-E511-80CA-001DD8B71D38,A - Shift,M404,4,401,Standby (No Services Performed),,"Driver-Transport,Other Patient Caregiver-At Scene",2018-01-01 00:29:02,Male,2007-02-19,479838_F39A55F0-C619-E511-80CA-001DD8B71D38
2,479838,D6D3C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M404,4,401,Standby (No Services Performed),,"Intern/Student,Other Patient Caregiver-At Scen...",2018-01-01 00:29:02,Male,2008-08-18,479838_D6D3C99E-9E01-E211-B5F5-78E7D18CFD3C
3,479839,71D4C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M401,1,402,No Treatment/Transport Required,,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-01 00:40:54,Male,2011-10-24,479839_71D4C99E-9E01-E211-B5F5-78E7D18CFD3C
4,479839,F9D4C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M401,1,402,No Treatment/Transport Required,,"Driver-Response,Driver-Transport",2018-01-01 00:40:54,Male,2012-02-27,479839_F9D4C99E-9E01-E211-B5F5-78E7D18CFD3C


### DataFrame to Parquet
The patient dataset will be stored as a Parquet file to optimize I/O and storage of the DataFrame.

In [5]:
# Parquet file in the raw data folder; used for both the write and the read-back.
parquet_path = '../data/01_raw/20210214_v3_patients.parquet'

# Write path: pandas DataFrame -> PyArrow Table -> parquet file.
table = pa.Table.from_pandas(df_con)
pq.write_table(table, parquet_path)

# Read validation: parquet file -> new PyArrow Table.
table2 = pq.read_table(parquet_path)

# Convert the Table back to a pandas DataFrame.
# Note: PyArrow <-> Pandas have problems converting categories that have
#       numeric strings only, so FireStation and Battalion are passed
#       explicitly as categories for the conversion.
pa_df = table2.to_pandas(categories=['FireStation', 'Battalion'])

# Display a sample of the round-tripped DataFrame.
pa_df.head()

Unnamed: 0,PatientId,FRDPersonnelID,Shift,UnitId,FireStation,Battalion,PatientOutcome,PatientGender,CrewMemberRoles,DispatchTime,FRDPersonnelGender,FRDPersonnelStartDate,comp_index
0,479838,6ED7C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M404,4,401,Standby (No Services Performed),,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-01 00:29:02,Male,1997-08-18,479838_6ED7C99E-9E01-E211-B5F5-78E7D18CFD3C
1,479838,F39A55F0-C619-E511-80CA-001DD8B71D38,A - Shift,M404,4,401,Standby (No Services Performed),,"Driver-Transport,Other Patient Caregiver-At Scene",2018-01-01 00:29:02,Male,2007-02-19,479838_F39A55F0-C619-E511-80CA-001DD8B71D38
2,479838,D6D3C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M404,4,401,Standby (No Services Performed),,"Intern/Student,Other Patient Caregiver-At Scen...",2018-01-01 00:29:02,Male,2008-08-18,479838_D6D3C99E-9E01-E211-B5F5-78E7D18CFD3C
3,479839,71D4C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M401,1,402,No Treatment/Transport Required,,"Primary Patient Caregiver-At Scene,Primary Pat...",2018-01-01 00:40:54,Male,2011-10-24,479839_71D4C99E-9E01-E211-B5F5-78E7D18CFD3C
4,479839,F9D4C99E-9E01-E211-B5F5-78E7D18CFD3C,A - Shift,M401,1,402,No Treatment/Transport Required,,"Driver-Response,Driver-Transport",2018-01-01 00:40:54,Male,2012-02-27,479839_F9D4C99E-9E01-E211-B5F5-78E7D18CFD3C


In [6]:
# List each column's dtype in the DataFrame read back from the parquet file.
pa_df.dtypes

PatientId                        string
FRDPersonnelID                   string
Shift                          category
UnitId                         category
FireStation                    category
Battalion                      category
PatientOutcome                 category
PatientGender                  category
CrewMemberRoles                category
DispatchTime             datetime64[ns]
FRDPersonnelGender             category
FRDPersonnelStartDate    datetime64[ns]
comp_index                       string
dtype: object

### Conclusion

Loading times between notebooks are now faster when we read the data from the Parquet file into a PyArrow Table and then convert it to a pandas DataFrame.

#### Loading Patient Data Set in Two Steps
~~~
table2 = pq.read_table('../data/01_raw/20210214_v3_patients.parquet')
pa_df = table2.to_pandas(categories=['FireStation','Battalion'])
~~~

#### Loading Patient Data Set in One Step
~~~
df = pq.read_table('../data/01_raw/20210214_v3_patients.parquet').to_pandas(categories=['FireStation','Battalion'])
~~~

### Benefits
1. Loading the dataset in any new/old notebook is significantly faster
2. No need to re-define datatypes for the DataFrame as they are loaded with the right schema