In [230]:
import pandas as pd

In [250]:
# Load the raw sample records from the CSV file into `df` and preview the first rows.
df = pd.read_csv('SampleData.csv')
df.head()

Unnamed: 0,ID Único,Data,Unidade,Grupo EFR,Grupo Rúbrica,Tipo Rúbrica,Sexo,Data de Nascimento
0,1,20-02-2017,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,1961
1,1,26-07-2017,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,1961
2,1,04-08-2017,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,1961
3,1,15-09-2017,HCIS,ADSE,RECOBRO,,F,1961
4,2,12-01-2017,HCS,ADSE,NEURO-CIRURGIA,URGÊNCIAS,F,1971


## Data Cleaning

In [251]:
# Give the columns English names. The names are assigned by position, so this
# list must stay in the same order as the columns of the loaded CSV.
columns = [
    'Unique ID',
    'Date',
    'Hospital',
    'Payer',
    'Specific_Service',
    'Category_of_Service',
    'Sex',
    'Birth_Year',
]
df.columns = columns
df.head(3)

# drop virtual client
# NOTE(review): disabled filter; it refers to `df.UniqueID`, but the column is
# named 'Unique ID' after the rename above - confirm before re-enabling.
#df = df[df.UniqueID != '6bb61e3b7bce0931da574d19d1d82c88'] 

Unnamed: 0,Unique ID,Date,Hospital,Payer,Specific_Service,Category_of_Service,Sex,Birth_Year
0,1,20-02-2017,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,1961
1,1,26-07-2017,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,1961
2,1,04-08-2017,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,1961


### Basic Processing

In [252]:
import datetime

# Derive each patient's approximate age at the time of the visit.
#
# The raw dates are written day-first (e.g. 20-02-2017), so they are parsed with
# dayfirst=True. Without it, ambiguous values such as 04-08-2017 are read
# month-first, and depending on the pandas version, values that do not fit the
# guessed format are silently coerced to NaT. Values that cannot be parsed at
# all still become NaT because of errors="coerce".
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True, errors="coerce")

# Age is approximate: visit year minus birth year (month/day are not available).
df["Age"] = df["Date"].dt.year - df["Birth_Year"]

# From here on only the year of the visit is kept in the Date column.
df["Date"] = df["Date"].dt.year

# Birth_Year is redundant once Age has been computed.
df = df.drop(columns=["Birth_Year"])

df.head(3)

Unnamed: 0,Unique ID,Date,Hospital,Payer,Specific_Service,Category_of_Service,Sex,Age
0,1,2017,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,56
1,1,2017,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,56
2,1,2017,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,56


#### Missing Values and Duplicates

In [253]:
# Replace missing values in the text columns with the literal "missing".
#
# The result is assigned back to `df` instead of calling
# `df.<column>.fillna(..., inplace=True)`: in-place fillna on a column selected
# from the frame is chained assignment, which pandas deprecates and which
# leaves `df` unchanged when Copy-on-Write is enabled.
df = df.fillna({
    column: 'missing'
    for column in ['Sex', 'Hospital', 'Specific_Service', 'Payer', 'Category_of_Service']
})

# Rows that still contain a null in any of the remaining columns.
null_data = df[df.isnull().any(axis=1)]

df.head(3)

Unnamed: 0,Unique ID,Date,Hospital,Payer,Specific_Service,Category_of_Service,Sex,Age
0,1,2017,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,56
1,1,2017,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,56
2,1,2017,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,56


In [254]:
# Drop rows that are identical in every column, then list the column dtypes.
# NOTE(review): by this point Date has been reduced to the visit year, so two
# rows for the same client that differ only in day/month are now identical and
# are collapsed into one here - confirm that this is intended, since the visit
# frequency computed later is based on the remaining rows.
df = df.drop_duplicates()
df.dtypes

Unique ID               int64
Date                    int64
Hospital               object
Payer                  object
Specific_Service       object
Category_of_Service    object
Sex                    object
Age                     int64
dtype: object

#### Reduce File Size

In [255]:
# Store the low-cardinality text columns as pandas categoricals to save memory,
# then show the resulting dtypes.
df = df.astype({
    column: 'category'
    for column in ["Hospital", "Payer", "Specific_Service", "Category_of_Service", "Sex"]
})

df.dtypes

Unique ID                 int64
Date                      int64
Hospital               category
Payer                  category
Specific_Service       category
Category_of_Service    category
Sex                    category
Age                       int64
dtype: object

### Categorical Variable Grouping

#### Hospitals

In [256]:
# List the distinct hospital codes present in the data.
df['Hospital'].unique()

[HCIS, CCA, HCS, CCB]
Categories (4, object): [HCIS, CCA, HCS, CCB]

In [257]:
# Classify every hospital by size and store the label in a new column, HType.

# hospital codes belonging to each size class
large_hospital = ['HCD','HCIS','HCP']
medium_hospital = ['CCC','CCTV','HCS','HCV','CUFC']
clinic = ['ICDT','CCA','CCB','CCAL','CCSDR','CCMF','CCS','CCM','CCSJM','CLA']

# Label the rows one size class at a time; hospitals that appear in none of
# the lists keep a missing HType.
for size_label, size_members in (
    ('Large', large_hospital),
    ('Medium', medium_hospital),
    ('Clinic', clinic),
):
    df.loc[df['Hospital'].isin(size_members), 'HType'] = size_label

df.head(3)

Unnamed: 0,Unique ID,Date,Hospital,Payer,Specific_Service,Category_of_Service,Sex,Age,HType
0,1,2017,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,56,Large
1,1,2017,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,56,Clinic
2,1,2017,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,56,Medium


In [258]:
# Classify every hospital by region and store the label in a new column, HRegion.

# Hospital codes belonging to each region.
# 'CCTV' and 'CCAL' must be separate entries: without the comma between them
# Python concatenates the two literals into 'CCTVCCAL', which matches neither
# hospital and leaves both without a region.
lisbon = ['HCD', 'HCIS', 'CCC', 'CCA', 'CCB', 'CCTV', 'CCAL', 'CCSDR', 'CCMF', 'CCS', 'CCM']
porto = ['HCP', 'ICDT', 'CCSJM']
santarem = ['HCS']
setubal = ['CLA']
coimbra = ['CUFC']
viseu = ['HCV']

# Label the rows one region at a time; hospitals that appear in none of the
# lists keep a missing HRegion. The Setubal label was previously misspelled
# as 'Sentubal'.
for region_label, region_members in (
    ('Lisbon', lisbon),
    ('Porto', porto),
    ('Santarem', santarem),
    ('Setubal', setubal),
    ('Coimbra', coimbra),
    ('Viseu', viseu),
):
    df.loc[df['Hospital'].isin(region_members), 'HRegion'] = region_label

df.head(3)

Unnamed: 0,Unique ID,Date,Hospital,Payer,Specific_Service,Category_of_Service,Sex,Age,HType,HRegion
0,1,2017,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,56,Large,Lisbon
1,1,2017,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,56,Clinic,Lisbon
2,1,2017,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,56,Medium,Santarem


#### Specific Service

In [259]:
# Load the service metadata workbook into its own dataframe; the next cell reads its
# 'Specific Service', 'Situational Category' and 'Severity' columns.
specificservice = pd.read_excel('iXperience Metadata.xlsx')

In [260]:
# Build two lookup tables keyed by service name and use them to add the
# Severity and Situational_Category columns to the visit data.
service_names = specificservice['Specific Service']

# service name -> severity
severity_dict = dict(zip(service_names, specificservice['Severity']))

# service name -> situational category
situational_dict = dict(zip(service_names, specificservice['Situational Category']))

# Services without an entry in the lookup tables get a missing value.
df['Severity'] = df['Specific_Service'].map(severity_dict)
df['Situational_Category'] = df['Specific_Service'].map(situational_dict)
df.head(3)

Unnamed: 0,Unique ID,Date,Hospital,Payer,Specific_Service,Category_of_Service,Sex,Age,HType,HRegion,Severity,Situational_Category
0,1,2017,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,56,Large,Lisbon,Medium,Testing
1,1,2017,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,56,Clinic,Lisbon,Emergency,Operations
2,1,2017,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,56,Medium,Santarem,Medium,Specialization


#### Payer

In [261]:
# Load the payer mapping workbook into its own dataframe; the next cell reads its
# 'Grupo EFR (payer)' and 'Type' columns.
payer = pd.read_excel('payer mapping.xlsx')

In [262]:
# Lookup table: payer group ('Grupo EFR (payer)') -> insurance type ('Type').
payer_dict = {
    payer_group: insurance_type
    for payer_group, insurance_type in zip(payer['Grupo EFR (payer)'], payer['Type'])
}

# Add the Insurance_Type column; payers without an entry get a missing value.
df['Insurance_Type'] = df['Payer'].map(payer_dict)
df.head()

Unnamed: 0,Unique ID,Date,Hospital,Payer,Specific_Service,Category_of_Service,Sex,Age,HType,HRegion,Severity,Situational_Category,Insurance_Type
0,1,2017,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,56,Large,Lisbon,Medium,Testing,Out-of-Pocket
1,1,2017,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,56,Clinic,Lisbon,Emergency,Operations,Out-of-Pocket
2,1,2017,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,56,Medium,Santarem,Medium,Specialization,State
3,1,2017,HCIS,ADSE,RECOBRO,missing,F,56,Large,Lisbon,Medium,Treatment,State
4,2,2017,HCS,ADSE,NEURO-CIRURGIA,URGÊNCIAS,F,46,Medium,Santarem,Severe,Operations,State


### Group By: sortdf

In [263]:
def _most_common(values):
    """Return the most frequent non-null value in ``values``.

    Returns NaN when the group has no non-null value at all (for example a
    client whose hospital or payer was never mapped). Indexing
    ``value_counts().index[0]`` directly raises IndexError in that case.
    """
    counts = values.value_counts()
    # For categorical columns value_counts() also lists unused categories with
    # a count of 0, so an "empty" group shows up as a zero top count.
    if counts.empty or counts.iloc[0] == 0:
        return float('nan')
    return counts.index[0]


# Collapse the visit-level data to one row per client:
#   Date  -> number of rows with a non-null Date (renamed to Freq below)
#   Sex   -> first value seen for the client
#   Age   -> mean age over the client's visits
#   other -> most frequent value for the client
aggregation_functions = {
    'Date': 'count',
    'Hospital': _most_common,
    'Sex': 'first',
    'Age': 'mean',
    'Insurance_Type': _most_common,
    'HType': _most_common,
    #'Severity': _most_common,
    'Situational_Category': _most_common,
    'HRegion': _most_common,
}
sortdf = df.groupby('Unique ID').aggregate(aggregation_functions)

# rename Date as Freq, signifying the number of visits per client (each client = 1 row)
sortdf = sortdf.rename(columns={'Date': 'Freq'})
sortdf.head(3)

Unnamed: 0_level_0,Freq,Hospital,Sex,Age,Insurance_Type,HType,Situational_Category,HRegion
Unique ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,4,HCIS,F,56,Out-of-Pocket,Large,Specialization,Lisbon
2,2,HCS,F,46,State,Medium,Specialization,Lisbon
3,1,HCIS,F,39,Private,Large,Other,Lisbon


### Categorical Grouping for Age

In [264]:
# Create the 'Age_Group' column filled with empty strings as a placeholder;
# rows that are not given a label later keep the empty string.
sortdf['Age_Group']= ''

In [265]:
# group clients into age groups categories and fill 'Age_Group' column
# age group classifications taken from: https://www.cia.gov/library/publications/the-world-factbook/geos/po.html
#
# The labels are written with a single .loc[rows, column] assignment. The
# chained form sortdf['Age_Group'][mask] = ... assigns into a temporary object
# and does not update sortdf when Copy-on-Write is enabled, so the global
# chained-assignment warning no longer needs to be silenced here.
# Rows with a negative or missing Age match none of the conditions and keep
# whatever value the column already holds.
sortdf.loc[(sortdf["Age"] >= 0) & (sortdf["Age"] < 15), 'Age_Group'] = "Child"
sortdf.loc[(sortdf["Age"] >= 15) & (sortdf["Age"] < 25), 'Age_Group'] = "Early Working"
sortdf.loc[(sortdf["Age"] >= 25) & (sortdf["Age"] < 55), 'Age_Group'] = "Prime Working"
sortdf.loc[(sortdf["Age"] >= 55) & (sortdf["Age"] < 65), 'Age_Group'] = "Mature Working"
sortdf.loc[sortdf["Age"] >= 65, 'Age_Group'] = "Elderly"

sortdf.head(3)

Unnamed: 0_level_0,Freq,Hospital,Sex,Age,Insurance_Type,HType,Situational_Category,HRegion,Age_Group
Unique ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,4,HCIS,F,56,Out-of-Pocket,Large,Specialization,Lisbon,Mature Working
2,2,HCS,F,46,State,Medium,Specialization,Lisbon,Prime Working
3,1,HCIS,F,39,Private,Large,Other,Lisbon,Prime Working


In [266]:
# Put the columns into their presentation order.
column_order = [
    'Freq', 'Sex', 'Age', 'Age_Group',
    'Hospital', 'HType', 'HRegion',
    'Insurance_Type', 'Situational_Category',
]
sortdf = sortdf.loc[:, column_order]

# Preview with the client id shown as a regular column; the reset is only for
# display and is not stored back into sortdf.
sortdf.reset_index().head(3)

Unnamed: 0,Unique ID,Freq,Sex,Age,Age_Group,Hospital,HType,HRegion,Insurance_Type,Situational_Category
0,1,4,F,56,Mature Working,HCIS,Large,Lisbon,Out-of-Pocket,Specialization
1,2,2,F,46,Prime Working,HCS,Medium,Lisbon,State,Specialization
2,3,1,F,39,Prime Working,HCIS,Large,Lisbon,Private,Other


#### Replace Null Values and Reduce File Size

In [267]:
# Replace missing values in the descriptive (text) columns with "missing".
#
# The result is assigned back to `sortdf` instead of calling
# `sortdf.<column>.fillna(..., inplace=True)`: in-place fillna on a column
# selected from the frame is chained assignment, which pandas deprecates and
# which leaves `sortdf` unchanged when Copy-on-Write is enabled.
#
# Freq and Age are deliberately not filled: writing the string 'missing' into
# a numeric column turns the whole column into object dtype. A missing Age
# stays NaN.
sortdf = sortdf.fillna({
    column: 'missing'
    for column in ['Sex', 'Age_Group', 'Hospital', 'HType', 'HRegion',
                   'Insurance_Type', 'Situational_Category']
})

# NOTE(review): this inspects the visit-level `df`, not `sortdf` - confirm
# which frame is meant to be checked for remaining nulls here.
null_data = df[df.isnull().any(axis=1)]

In [272]:
# Store every descriptive column as a pandas categorical to save memory.
# Each column is converted from its own values; previously
# 'Situational_Category' was overwritten with a copy of 'Age_Group'.
sortdf = sortdf.astype({
    column: 'category'
    for column in ['Sex', 'Age_Group', 'Hospital', 'HType', 'HRegion',
                   'Insurance_Type', 'Situational_Category']
})

# show the resulting dtypes
sortdf.dtypes

Freq                       int64
Sex                     category
Age                        int64
Age_Group               category
Hospital                category
HType                   category
HRegion                 category
Insurance_Type          category
Situational_Category    category
dtype: object