# Cleaning the Data

## Data Prep 

### Load Packages

In [23]:
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import Image
from ipywidgets import interact, fixed

In [24]:
df = pd.read_csv("SampleData.csv")
df.head()

Unnamed: 0,ID Único,Data,Unidade,Grupo EFR,Grupo Rúbrica,Tipo Rúbrica,Sexo,Data de Nascimento,Unnamed: 8,Unnamed: 9
0,1,20-02-2017,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,1961,,
1,1,26-07-2017,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,1961,,This is just some randomly generated data. It ...
2,1,04-08-2017,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,1961,,
3,1,15-09-2017,HCIS,ADSE,RECOBRO,,F,1961,,
4,2,12-01-2017,HCS,ADSE,NEURO-CIRURGIA,URGÊNCIAS,F,1971,,


### Manipulate Columns

In [25]:
#Get rid of blank columns
df_col= df.columns[0:8]
df_col
df = df[df_col]
df.head()

Unnamed: 0,ID Único,Data,Unidade,Grupo EFR,Grupo Rúbrica,Tipo Rúbrica,Sexo,Data de Nascimento
0,1,20-02-2017,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,1961
1,1,26-07-2017,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,1961
2,1,04-08-2017,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,1961
3,1,15-09-2017,HCIS,ADSE,RECOBRO,,F,1961
4,2,12-01-2017,HCS,ADSE,NEURO-CIRURGIA,URGÊNCIAS,F,1971


In [26]:
#Translate column names into english
data = df.rename(columns={
    "ID Único":"UniqueID",
    "Data":"Date", 
    "Unidade":"Facility",
    "Grupo EFR":"Payer",
    "Grupo Rúbrica":"SpecificService", 
    "Tipo Rúbrica":"CategoryofService",
    "Sexo":"Sex",
    "Data de Nascimento":"BirthYear",
})

data.head(4)

Unnamed: 0,UniqueID,Date,Facility,Payer,SpecificService,CategoryofService,Sex,BirthYear
0,1,20-02-2017,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,1961
1,1,26-07-2017,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,1961
2,1,04-08-2017,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,1961
3,1,15-09-2017,HCIS,ADSE,RECOBRO,,F,1961


In [27]:
import datetime
# Calculate Approximate Age
data["Date"] = pd.to_datetime(data.Date, errors="coerce") #converting to date time 
data["Age"] = data["Date"].dt.year - data["BirthYear"]

In [28]:
# Delete Uneeded Columns
del data["BirthYear"]
del data["Date"]

### Variable Grouping - Age

In [29]:
infant = list(range(0,5))
child = list(range(5,19))
youngAdult = list(range(19,45))
Adult = list(range(45,65))
senior = list(range(65,82))
elderly = list(range(82,data.Age.max()))

In [30]:
def check(list1,val):
    flag= False
    for x in list1: 
        if val== x: 
            flag = True 
    return flag

In [31]:
if data['Age']
data.loc[check(data['Age'],infant), 'age_type'] = 'Infant'

data.loc[check(data['Age'],child), 'age_type'] = 'Child'

data.loc[check(data['Age'],youngAdult), 'age_type'] = 'Young Adult'

data.loc[check(data['Age'],Adult), 'age_type'] = 'Adult'

data.loc[check(data['Age'],senior), 'age_type'] = 'Senior'

data.loc[check(data['Age'],elderly), 'age_type'] = 'Elderly'

SyntaxError: invalid syntax (<ipython-input-31-867dcee36bc4>, line 1)

In [32]:
age_types = ['infant', 'child', 'youngAdult', 'adult', 'senior','elderly']

data['age_category'] = pd.cut(data['Age'], 6, labels=age_types)

data.age_category.value_counts()

youngAdult    13
senior        12
adult          9
infant         6
elderly        4
child          0
Name: age_category, dtype: int64

In [33]:
ages = pd.DataFrame(list(data.Age), columns=['age'])
bins = [0, 4, 18, 44, 64, 81, 120]
labels = ['infant', 'child', 'young adult', 'adult', 'senior', 'elderly']
ages['age_range'] = pd.cut(ages.age, bins, labels = labels,include_lowest = True)

In [34]:
#0-4 infant 
#5-18 child 
#19-44 youngAdult
#45-64 Adult
#65-81 Senior
#82+
ages = pd.DataFrame(list(data.Age), columns=['age'])
bins = [0, 4, 18, 44, 64, 81, 120]
labels = ['infant', 'child', 'young adult', 'adult', 'senior', 'elderly']
ages['age_range'] = pd.cut(ages, bins, labels = labels,include_lowest = True)

print(ages)

ValueError: Input array must be 1 dimensional

### Variable Grouping - Facility 

In [35]:

data["hospital_type"] = "Other"

data["hospital_type"][data.Facility.isin([
    "HCIS", "HCD", "HCP"
])] = "Large_Hospitals"

data["hospital_type"][data.Facility.isin([
    "CCC", "CCTV", "HCS", "HCV", "CUFC"
])] = "Medium_Hospitals"

data["hospital_type"][data.Facility.isin([
    "CCA", "CCB", "CCAL"
])] = "Clinic"

data["hospital_type"][data.Facility.isin([
    "CCSDR", "CCMF", "CCS", "CCM", "CCSJM", "CLA"
])] = "Small_Clinic"

data["hospital_type"][data.Facility.isin([
    "ICDT"
])] = "Large_Clinic"


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-

In [36]:
data.head(3)

Unnamed: 0,UniqueID,Facility,Payer,SpecificService,CategoryofService,Sex,Age,age_category,hospital_type
0,1,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,56,senior,Large_Hospitals
1,1,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,56,senior,Clinic
2,1,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,56,senior,Medium_Hospitals


## Data Cleaning

In [37]:
# Add when get real data

## Export Dataset

In [38]:
#Use pickle format so that df reads into notebook faster 
data.to_pickle("cleaned_data.pkl")