# Cleaning the Data

## Data Prep 

### Load Packages

In [19]:
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")
from IPython.display import Image
from ipywidgets import interact, fixed

In [20]:
df = pd.read_csv("Data/2017.csv")
df.head()

Unnamed: 0,Data,Unidade,Grupo EFR,Grupo Rúbrica,Tipo Rúbrica,Data Nascimento,Sexo,hashed_ID_Único
0,14/09/2017,HCIS,ADSE,SERVIÇOS ESPECIAIS CARDIOLOGIA,EXAMES ESPECIAIS,1980.0,Feminino,91e9d2627cd4cc24958988333c4220a3
1,21/08/2017,CCSJM,PARTICULARES,SERVIÇOS E TÉCNICAS GERAIS,EXAMES ESPECIAIS,1980.0,Feminino,91e9d2627cd4cc24958988333c4220a3
2,21/08/2017,CCSJM,PARTICULARES,SERVIÇOS E TÉCNICAS GERAIS,SERVIÇOS E TÉCNICAS GERAIS,1980.0,Feminino,91e9d2627cd4cc24958988333c4220a3
3,18/12/2017,CCTV,ADSE,SERVIÇOS ESPECIAIS UROLOGIA,EXAMES ESPECIAIS,1980.0,Feminino,91e9d2627cd4cc24958988333c4220a3
4,24/04/2017,CCSJM,PARTICULARES,SERVIÇOS ESPECIAIS CARDIOLOGIA,EXAMES ESPECIAIS,1980.0,Feminino,91e9d2627cd4cc24958988333c4220a3


In [21]:
df.shape

(6227655, 8)

### Manipulate Columns

In [22]:
# Delete Uneeded Columns
#del df["hashed_ID_Único"]

In [23]:
#Translate column names into english
data = df.rename(columns={
    "hashed_ID_Único":"UniqueID",
    "Data":"Date", 
    "Unidade":"Facility",
    "Grupo EFR":"Payer",
    "Grupo Rúbrica":"SpecificService", 
    "Tipo Rúbrica":"CategoryofService",
    "Sexo":"Sex",
    "Data Nascimento":"BirthYear",
})

In [24]:
data.head()

Unnamed: 0,Date,Facility,Payer,SpecificService,CategoryofService,BirthYear,Sex,UniqueID
0,14/09/2017,HCIS,ADSE,SERVIÇOS ESPECIAIS CARDIOLOGIA,EXAMES ESPECIAIS,1980.0,Feminino,91e9d2627cd4cc24958988333c4220a3
1,21/08/2017,CCSJM,PARTICULARES,SERVIÇOS E TÉCNICAS GERAIS,EXAMES ESPECIAIS,1980.0,Feminino,91e9d2627cd4cc24958988333c4220a3
2,21/08/2017,CCSJM,PARTICULARES,SERVIÇOS E TÉCNICAS GERAIS,SERVIÇOS E TÉCNICAS GERAIS,1980.0,Feminino,91e9d2627cd4cc24958988333c4220a3
3,18/12/2017,CCTV,ADSE,SERVIÇOS ESPECIAIS UROLOGIA,EXAMES ESPECIAIS,1980.0,Feminino,91e9d2627cd4cc24958988333c4220a3
4,24/04/2017,CCSJM,PARTICULARES,SERVIÇOS ESPECIAIS CARDIOLOGIA,EXAMES ESPECIAIS,1980.0,Feminino,91e9d2627cd4cc24958988333c4220a3


In [None]:
import datetime
# Calculate Approximate Age
data["Date"] = pd.to_datetime(data.Date, errors="coerce") #converting to date time 
data["Age"] = data["Date"].dt.year - data["BirthYear"]

In [14]:
# Delete Uneeded Columns
del data["BirthYear"]
del data["Date"]

data.head(2)

Unnamed: 0,Facility,Payer,SpecificService,CategoryofService,Sex,UniqueID,Age
0,HCIS,ADSE,SERVIÇOS ESPECIAIS CARDIOLOGIA,EXAMES ESPECIAIS,Feminino,91e9d2627cd4cc24958988333c4220a3,37.0
1,CCSJM,PARTICULARES,SERVIÇOS E TÉCNICAS GERAIS,EXAMES ESPECIAIS,Feminino,91e9d2627cd4cc24958988333c4220a3,37.0


In [16]:
data.head()

Unnamed: 0,Facility,Payer,SpecificService,CategoryofService,Sex,UniqueID,Age
0,HCIS,ADSE,SERVIÇOS ESPECIAIS CARDIOLOGIA,EXAMES ESPECIAIS,Feminino,91e9d2627cd4cc24958988333c4220a3,37.0
1,CCSJM,PARTICULARES,SERVIÇOS E TÉCNICAS GERAIS,EXAMES ESPECIAIS,Feminino,91e9d2627cd4cc24958988333c4220a3,37.0
2,CCSJM,PARTICULARES,SERVIÇOS E TÉCNICAS GERAIS,SERVIÇOS E TÉCNICAS GERAIS,Feminino,91e9d2627cd4cc24958988333c4220a3,37.0
3,CCTV,ADSE,SERVIÇOS ESPECIAIS UROLOGIA,EXAMES ESPECIAIS,Feminino,91e9d2627cd4cc24958988333c4220a3,37.0
4,CCSJM,PARTICULARES,SERVIÇOS ESPECIAIS CARDIOLOGIA,EXAMES ESPECIAIS,Feminino,91e9d2627cd4cc24958988333c4220a3,37.0


### Create a customer dataframe 

In [15]:
patient_visits = data.UniqueID.value_counts(sort=False)

In [8]:
customers = patient_visits.rename_axis('UniqueID').reset_index(name='FrequencyofVisits')

In [9]:
customers.head()

Unnamed: 0,UniqueID,FrequencyofVisits
0,1,4
1,2,2
2,3,1
3,4,1
4,5,2


In [10]:
customers[["Age","Sex"]]= data[["Age","Sex"]]

In [11]:
customers.head()

Unnamed: 0,UniqueID,FrequencyofVisits,Age,Sex
0,1,4,56,F
1,2,2,56,F
2,3,1,56,F
3,4,1,56,F
4,5,2,46,F


### Variable Grouping by Age

In [12]:
data['Age_Group']= 0

In [13]:
data['Age_Group'][(data["Age"]<18) & (data["Age"]>=5)] = "Child"
data['Age_Group'][(data["Age"]>=18) & (data["Age"]<44)] = "YoungAdult"
data['Age_Group'][(data["Age"]>=44) & (data["Age"]<64)] = "Adult"
data['Age_Group'][(data["Age"]>=64) & (data["Age"]<81)] = "Senior"
data['Age_Group'][data["Age"]>=81] = "Elderly"

In [14]:
data.head()

Unnamed: 0,UniqueID,Facility,Payer,SpecificService,CategoryofService,Sex,Age,Age_Group
0,1,HCIS,PARTICULARES,RX CONVENCIONAL,CONSULTA EXTERNA,F,56,Adult
1,1,CCA,PARTICULARES,URGÊNCIA GERAL,URGÊNCIAS,F,56,Adult
2,1,HCS,ADSE,GASTROENTEROLOGIA,IMAGIOLOGIA,F,56,Adult
3,1,HCIS,ADSE,RECOBRO,,F,56,Adult
4,2,HCS,ADSE,NEURO-CIRURGIA,URGÊNCIAS,F,46,Adult


### Data Cleaning

In [15]:
# Add when get real data

### Export Dataset
Use pickle format so that df reads into notebook faster 

In [16]:
# This is the long format, the transaction table 
data.to_pickle("cleaned_data.pkl")

In [1]:
# This is the wide format, the customer table 
customers.to_pickle("customer_data.pkl")

NameError: name 'customers' is not defined