In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [14]:
# Load the transaction-level table (one row per transaction, see the shape below).
# Presumably this pickle is the output of an earlier cleaning step — confirm.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
transaction = pd.read_pickle('last outputs/CleanedTransactionData.pkl')

In [3]:
transaction.isnull().any()

Date                   False
Hospital               False
Payer                  False
Specific_Service       False
Category_of_Service    False
Sex                    False
UniqueID               False
Age                    False
HType                  False
HRegion                False
Insurance_Type         False
General_Service        False
dtype: bool

In [4]:
transaction.shape

(21921171, 12)

## Group by customers: customerdf

In [5]:
# One row per customer: most frequent value of every categorical column,
# mean of Age, and the number of distinct visit dates.
def _most_frequent(values):
    """Return the value that occurs most often in ``values``."""
    # value_counts() orders by descending count, so the first label is the mode
    return values.value_counts().index[0]


# the key order fixes the column order of the aggregated frame
aggregation_functions = {
    column: _most_frequent
    for column in ['Date', 'Hospital', 'Sex', 'Age', 'Insurance_Type', 'HType',
                   'Specific_Service', 'Category_of_Service', 'General_Service',
                   'Payer', 'HRegion']
}
aggregation_functions['Date'] = 'nunique'
aggregation_functions['Age'] = 'mean'

customer_groups = transaction.groupby(transaction['UniqueID'])
customerdf = customer_groups.aggregate(aggregation_functions)

# Date now holds the number of distinct visit dates per client, so call it Freq
customerdf = customerdf.rename(columns={'Date': 'Freq'})
customerdf.head(3)

Unnamed: 0_level_0,Freq,Hospital,Sex,Age,Insurance_Type,HType,Specific_Service,Category_of_Service,General_Service,Payer,HRegion
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00000f7264c27ba6fea0c837ed6aa0aa,4,HCP,Masculino,47.4,Private,Large,PATOLOGIA CLINICA,PATOLOGIA CLINICA,Examination,MÉDIS,Porto
00001e984eba85527fd3122056451279,8,HCIS,Feminino,63.8,Private,Large,SERVIÇOS ESPECIAIS DERMATOLOGIA,EXAMES ESPECIAIS,Examination,ALLIANZ,Lisbon
0000219e4b37d2504fb6b8c28e24a2d4,1,ICDT,Feminino,10.0,Private,Clinic,SERVIÇOS ESPECIAIS OFTALMOLOGIA,CONSULTA EXTERNA,Examination,MÉDIS,Porto


In [6]:
# keep an independent copy of the aggregated frame as it is at this point
# NOTE(review): customer1 is not referenced again in the visible part of the notebook
customer1 = customerdf.copy()

### Categorical Grouping for Age

In [7]:
# create a new column ('Age_Group'), initialised to the empty string (not NaN) for every customer
customerdf['Age_Group']= ''

In [8]:
# group clients into age group categories and fill the 'Age_Group' column
# age group classifications taken from: https://www.cia.gov/library/publications/the-world-factbook/geos/po.html

# Build the whole column in one vectorised step instead of chained indexing
# (customerdf['Age_Group'][mask] = ...), which may write to a temporary copy and
# therefore needed the chained-assignment warning to be switched off.
age = customerdf['Age']
age_conditions = [
    (age >= 0) & (age < 15),
    (age >= 15) & (age < 25),
    (age >= 25) & (age < 55),
    (age >= 55) & (age < 65),
    age >= 65,
]
age_labels = ['Child', 'Early Working', 'Prime Working', 'Mature Working', 'Elderly']

# Ages that match no band (NaN or negative) are labelled 'missing'. The previous
# fillna('missing') could never do this because the column was initialised to ''
# rather than NaN.
customerdf['Age_Group'] = np.select(age_conditions, age_labels, default='missing')

customerdf.head(3)

Unnamed: 0_level_0,Freq,Hospital,Sex,Age,Insurance_Type,HType,Specific_Service,Category_of_Service,General_Service,Payer,HRegion,Age_Group
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00000f7264c27ba6fea0c837ed6aa0aa,4,HCP,Masculino,47.4,Private,Large,PATOLOGIA CLINICA,PATOLOGIA CLINICA,Examination,MÉDIS,Porto,Prime Working
00001e984eba85527fd3122056451279,8,HCIS,Feminino,63.8,Private,Large,SERVIÇOS ESPECIAIS DERMATOLOGIA,EXAMES ESPECIAIS,Examination,ALLIANZ,Lisbon,Mature Working
0000219e4b37d2504fb6b8c28e24a2d4,1,ICDT,Feminino,10.0,Private,Clinic,SERVIÇOS ESPECIAIS OFTALMOLOGIA,CONSULTA EXTERNA,Examination,MÉDIS,Porto,Child


In [9]:
# put the columns in a fixed, readable order
column_order = ['Freq', 'Sex', 'Age', 'Age_Group', 'Hospital', 'HType', 'HRegion',
                'Payer', 'Insurance_Type', 'Specific_Service', 'Category_of_Service',
                'General_Service']
customerdf = customerdf.loc[:, column_order]

customerdf.head(3)

Unnamed: 0_level_0,Freq,Sex,Age,Age_Group,Hospital,HType,HRegion,Payer,Insurance_Type,Specific_Service,Category_of_Service,General_Service
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
00000f7264c27ba6fea0c837ed6aa0aa,4,Masculino,47.4,Prime Working,HCP,Large,Porto,MÉDIS,Private,PATOLOGIA CLINICA,PATOLOGIA CLINICA,Examination
00001e984eba85527fd3122056451279,8,Feminino,63.8,Mature Working,HCIS,Large,Lisbon,ALLIANZ,Private,SERVIÇOS ESPECIAIS DERMATOLOGIA,EXAMES ESPECIAIS,Examination
0000219e4b37d2504fb6b8c28e24a2d4,1,Feminino,10.0,Child,ICDT,Clinic,Porto,MÉDIS,Private,SERVIÇOS ESPECIAIS OFTALMOLOGIA,CONSULTA EXTERNA,Examination


In [10]:
# check for missing values; nothing is dropped here (the dropna call below is left disabled)
#customerdf.dropna(inplace=True)
customerdf.isnull().any()

Freq                   False
Sex                    False
Age                    False
Age_Group              False
Hospital               False
HType                  False
HRegion                False
Payer                  False
Insurance_Type         False
Specific_Service       False
Category_of_Service    False
General_Service        False
dtype: bool

## Convert categorical variables to numerical

### Get percentage for each hospital type

In [21]:
# number of transaction rows per customer and hospital type
visits_by_htype = transaction.groupby(['UniqueID', 'HType'])
htype_count = visits_by_htype['HType'].count()
htype_count.head(5)

UniqueID                          HType 
00000f7264c27ba6fea0c837ed6aa0aa  Clinic     2
                                  Large      3
00001e984eba85527fd3122056451279  Large     10
0000219e4b37d2504fb6b8c28e24a2d4  Clinic     2
000026c67a83fa72aec14512887bb173  Medium     8
Name: HType, dtype: int64

In [22]:
# share (0-1) of each customer's transactions at every hospital type:
# each (customer, type) count divided by that customer's total count
total_per_customer = htype_count.groupby(level=0).sum()
htype_perc = htype_count / total_per_customer

In [23]:
# one column per hospital type; a type the customer never used gets a share of 0
htype_wide = htype_perc.unstack(level=-1)
htype_df = htype_wide.fillna(0)
htype_df.head(3)

HType,Clinic,Large,Medium
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00000f7264c27ba6fea0c837ed6aa0aa,0.4,0.6,0.0
00001e984eba85527fd3122056451279,0.0,1.0,0.0
0000219e4b37d2504fb6b8c28e24a2d4,1.0,0.0,0.0


In [25]:
# relabel the columns by position; the order has to match the unstacked
# categories displayed before this cell (Clinic, Large, Medium)
htype_labels = ['Clinic', 'Large_Hospital', 'Medium_Hospital']
htype_df.columns = htype_labels
htype_df.head(3)

Unnamed: 0_level_0,Clinic,Large_Hospital,Medium_Hospital
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00000f7264c27ba6fea0c837ed6aa0aa,0.4,0.6,0.0
00001e984eba85527fd3122056451279,0.0,1.0,0.0
0000219e4b37d2504fb6b8c28e24a2d4,1.0,0.0,0.0


In [26]:
# append the hospital-type shares to the customer dataframe (rows are matched on the UniqueID index)
customerdf = customerdf.join(htype_df, how='left')
customerdf.head(3)

Unnamed: 0_level_0,Freq,Sex,Age,Age_Group,Hospital,HType,HRegion,Payer,Insurance_Type,Specific_Service,Category_of_Service,General_Service,Clinic,Large_Hospital,Medium_Hospital
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
00000f7264c27ba6fea0c837ed6aa0aa,4,Masculino,47.4,Prime Working,HCP,Large,Porto,MÉDIS,Private,PATOLOGIA CLINICA,PATOLOGIA CLINICA,Examination,0.4,0.6,0.0
00001e984eba85527fd3122056451279,8,Feminino,63.8,Mature Working,HCIS,Large,Lisbon,ALLIANZ,Private,SERVIÇOS ESPECIAIS DERMATOLOGIA,EXAMES ESPECIAIS,Examination,0.0,1.0,0.0
0000219e4b37d2504fb6b8c28e24a2d4,1,Feminino,10.0,Child,ICDT,Clinic,Porto,MÉDIS,Private,SERVIÇOS ESPECIAIS OFTALMOLOGIA,CONSULTA EXTERNA,Examination,1.0,0.0,0.0


### Get percentage for each insurance type

In [27]:
# same procedure as for hospital type, applied to insurance type:
# share (0-1) of each customer's transactions that used every insurance type
IType_count = transaction.groupby(['UniqueID', 'Insurance_Type'])['Insurance_Type'].count()
IType_total = IType_count.groupby(level=0).sum()
IType_perc = IType_count / IType_total
IType_df = IType_perc.unstack(level=-1).fillna(0)
IType_df.head(3)

Insurance_Type,Other,Out-of-Pocket,Private,Special Program,State
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00000f7264c27ba6fea0c837ed6aa0aa,0.0,0.0,1.0,0.0,0.0
00001e984eba85527fd3122056451279,0.0,0.0,0.8,0.0,0.2
0000219e4b37d2504fb6b8c28e24a2d4,0.0,0.0,1.0,0.0,0.0


In [28]:
# relabel by position; the order has to match the unstacked categories displayed
# before this cell (Other, Out-of-Pocket, Private, Special Program, State)
IType_labels = ['Payer_' + suffix for suffix in
                ('Other', 'Out_of_Pocket', 'Private', 'Special_Program', 'State')]
IType_df.columns = IType_labels
IType_df.head(3)

Unnamed: 0_level_0,Payer_Other,Payer_Out_of_Pocket,Payer_Private,Payer_Special_Program,Payer_State
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00000f7264c27ba6fea0c837ed6aa0aa,0.0,0.0,1.0,0.0,0.0
00001e984eba85527fd3122056451279,0.0,0.0,0.8,0.0,0.2
0000219e4b37d2504fb6b8c28e24a2d4,0.0,0.0,1.0,0.0,0.0


In [29]:
# append the insurance-type shares (rows are matched on the UniqueID index)
customerdf = customerdf.join(IType_df, how='left')
customerdf.head(3)

Unnamed: 0_level_0,Freq,Sex,Age,Age_Group,Hospital,HType,HRegion,Payer,Insurance_Type,Specific_Service,Category_of_Service,General_Service,Clinic,Large_Hospital,Medium_Hospital,Payer_Other,Payer_Out_of_Pocket,Payer_Private,Payer_Special_Program,Payer_State
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
00000f7264c27ba6fea0c837ed6aa0aa,4,Masculino,47.4,Prime Working,HCP,Large,Porto,MÉDIS,Private,PATOLOGIA CLINICA,PATOLOGIA CLINICA,Examination,0.4,0.6,0.0,0.0,0.0,1.0,0.0,0.0
00001e984eba85527fd3122056451279,8,Feminino,63.8,Mature Working,HCIS,Large,Lisbon,ALLIANZ,Private,SERVIÇOS ESPECIAIS DERMATOLOGIA,EXAMES ESPECIAIS,Examination,0.0,1.0,0.0,0.0,0.0,0.8,0.0,0.2
0000219e4b37d2504fb6b8c28e24a2d4,1,Feminino,10.0,Child,ICDT,Clinic,Porto,MÉDIS,Private,SERVIÇOS ESPECIAIS OFTALMOLOGIA,CONSULTA EXTERNA,Examination,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Get percentage for each region

In [30]:
# same procedure, applied to hospital region:
# share (0-1) of each customer's transactions that took place in every region
region_count = transaction.groupby(['UniqueID', 'HRegion'])['HRegion'].count()
region_total = region_count.groupby(level=0).sum()
region_perc = region_count / region_total
region_df = region_perc.unstack(level=-1).fillna(0)

In [32]:
# relabel by position — assumes the unstacked region columns come out in this
# order (Lisbon, Porto, Santarem, Viseu); TODO confirm against region_df before renaming
region_labels = ['region_' + name for name in ('Lisbon', 'Porto', 'Santarem', 'Viseu')]
region_df.columns = region_labels
region_df.head(5)

Unnamed: 0_level_0,region_Lisbon,region_Porto,region_Santarem,region_Viseu
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00000f7264c27ba6fea0c837ed6aa0aa,0.0,1.0,0.0,0.0
00001e984eba85527fd3122056451279,1.0,0.0,0.0,0.0
0000219e4b37d2504fb6b8c28e24a2d4,0.0,1.0,0.0,0.0
000026c67a83fa72aec14512887bb173,1.0,0.0,0.0,0.0
000028899fe7782862d40bb1b87807ee,1.0,0.0,0.0,0.0


In [33]:
# append the region shares (rows are matched on the UniqueID index)
customerdf = customerdf.join(region_df, how='left')
customerdf.head(3)

Unnamed: 0_level_0,Freq,Sex,Age,Age_Group,Hospital,HType,HRegion,Payer,Insurance_Type,Specific_Service,...,Medium_Hospital,Payer_Other,Payer_Out_of_Pocket,Payer_Private,Payer_Special_Program,Payer_State,region_Lisbon,region_Porto,region_Santarem,region_Viseu
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000f7264c27ba6fea0c837ed6aa0aa,4,Masculino,47.4,Prime Working,HCP,Large,Porto,MÉDIS,Private,PATOLOGIA CLINICA,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
00001e984eba85527fd3122056451279,8,Feminino,63.8,Mature Working,HCIS,Large,Lisbon,ALLIANZ,Private,SERVIÇOS ESPECIAIS DERMATOLOGIA,...,0.0,0.0,0.0,0.8,0.0,0.2,1.0,0.0,0.0,0.0
0000219e4b37d2504fb6b8c28e24a2d4,1,Feminino,10.0,Child,ICDT,Clinic,Porto,MÉDIS,Private,SERVIÇOS ESPECIAIS OFTALMOLOGIA,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


### Get percentage for each type of general service

In [34]:
# same procedure, applied to general service:
# share (0-1) of each customer's transactions that fall under every general service
service_count = transaction.groupby(['UniqueID', 'General_Service'])['General_Service'].count()
service_total = service_count.groupby(level=0).sum()
service_perc = service_count / service_total
service_df = service_perc.unstack(level=-1).fillna(0)

service_df.head(5)

General_Service,Appointment,Emergency,Examination,Others,Surgery,Treatment
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00000f7264c27ba6fea0c837ed6aa0aa,0.2,0.2,0.6,0.0,0.0,0.0
00001e984eba85527fd3122056451279,0.3,0.0,0.7,0.0,0.0,0.0
0000219e4b37d2504fb6b8c28e24a2d4,0.5,0.0,0.5,0.0,0.0,0.0
000026c67a83fa72aec14512887bb173,0.0,0.0,0.75,0.0,0.25,0.0
000028899fe7782862d40bb1b87807ee,0.0,1.0,0.0,0.0,0.0,0.0


In [35]:
# relabel by position; the order has to match the unstacked categories displayed
# before this cell (Appointment, Emergency, Examination, Others, Surgery, Treatment)
service_labels = ['General_Service_' + kind for kind in
                  ('Appointment', 'Emergency', 'Examination', 'Others', 'Surgery', 'Treatment')]
service_df.columns = service_labels

In [36]:
# append the general-service shares (rows are matched on the UniqueID index)
customerdf = customerdf.join(service_df, how='left')
customerdf.head(3)

Unnamed: 0_level_0,Freq,Sex,Age,Age_Group,Hospital,HType,HRegion,Payer,Insurance_Type,Specific_Service,...,region_Lisbon,region_Porto,region_Santarem,region_Viseu,General_Service_Appointment,General_Service_Emergency,General_Service_Examination,General_Service_Others,General_Service_Surgery,General_Service_Treatment
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000f7264c27ba6fea0c837ed6aa0aa,4,Masculino,47.4,Prime Working,HCP,Large,Porto,MÉDIS,Private,PATOLOGIA CLINICA,...,0.0,1.0,0.0,0.0,0.2,0.2,0.6,0.0,0.0,0.0
00001e984eba85527fd3122056451279,8,Feminino,63.8,Mature Working,HCIS,Large,Lisbon,ALLIANZ,Private,SERVIÇOS ESPECIAIS DERMATOLOGIA,...,1.0,0.0,0.0,0.0,0.3,0.0,0.7,0.0,0.0,0.0
0000219e4b37d2504fb6b8c28e24a2d4,1,Feminino,10.0,Child,ICDT,Clinic,Porto,MÉDIS,Private,SERVIÇOS ESPECIAIS OFTALMOLOGIA,...,0.0,1.0,0.0,0.0,0.5,0.0,0.5,0.0,0.0,0.0


### Get dummy variables for sex

In [37]:
# one indicator column per distinct value of the customer's Sex
sex_values = customerdf['Sex']
dummies = pd.get_dummies(sex_values)
dummies.head(3)

Unnamed: 0_level_0,Feminino,Masculino,missing
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00000f7264c27ba6fea0c837ed6aa0aa,0,1,0
00001e984eba85527fd3122056451279,1,0,0
0000219e4b37d2504fb6b8c28e24a2d4,1,0,0


In [38]:
# relabel by position; the order has to match the dummy columns displayed
# before this cell (Feminino, Masculino, missing)
sex_labels = ['Sex_Feminino', 'Sex_Masculino', 'Sex_Missing']
dummies.columns = sex_labels
dummies.head(3)

Unnamed: 0_level_0,Sex_Feminino,Sex_Masculino,Sex_Missing
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00000f7264c27ba6fea0c837ed6aa0aa,0,1,0
00001e984eba85527fd3122056451279,1,0,0
0000219e4b37d2504fb6b8c28e24a2d4,1,0,0


In [39]:
# append the sex indicator columns (rows are matched on the UniqueID index)
customerdf = customerdf.join(dummies, how='left')
customerdf.head(3)

Unnamed: 0_level_0,Freq,Sex,Age,Age_Group,Hospital,HType,HRegion,Payer,Insurance_Type,Specific_Service,...,region_Viseu,General_Service_Appointment,General_Service_Emergency,General_Service_Examination,General_Service_Others,General_Service_Surgery,General_Service_Treatment,Sex_Feminino,Sex_Masculino,Sex_Missing
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000f7264c27ba6fea0c837ed6aa0aa,4,Masculino,47.4,Prime Working,HCP,Large,Porto,MÉDIS,Private,PATOLOGIA CLINICA,...,0.0,0.2,0.2,0.6,0.0,0.0,0.0,0,1,0
00001e984eba85527fd3122056451279,8,Feminino,63.8,Mature Working,HCIS,Large,Lisbon,ALLIANZ,Private,SERVIÇOS ESPECIAIS DERMATOLOGIA,...,0.0,0.3,0.0,0.7,0.0,0.0,0.0,1,0,0
0000219e4b37d2504fb6b8c28e24a2d4,1,Feminino,10.0,Child,ICDT,Clinic,Porto,MÉDIS,Private,SERVIÇOS ESPECIAIS OFTALMOLOGIA,...,0.0,0.5,0.0,0.5,0.0,0.0,0.0,1,0,0


In [40]:
customerdf.columns

Index(['Freq', 'Sex', 'Age', 'Age_Group', 'Hospital', 'HType', 'HRegion',
       'Payer', 'Insurance_Type', 'Specific_Service', 'Category_of_Service',
       'General_Service', 'Clinic', 'Large_Hospital', 'Medium_Hospital',
       'Payer_Other', 'Payer_Out_of_Pocket', 'Payer_Private',
       'Payer_Special_Program', 'Payer_State', 'region_Lisbon', 'region_Porto',
       'region_Santarem', 'region_Viseu', 'General_Service_Appointment',
       'General_Service_Emergency', 'General_Service_Examination',
       'General_Service_Others', 'General_Service_Surgery',
       'General_Service_Treatment', 'Sex_Feminino', 'Sex_Masculino',
       'Sex_Missing'],
      dtype='object')

In [41]:
# check for null value
customerdf.isnull().any()

Freq                           False
Sex                            False
Age                            False
Age_Group                      False
Hospital                       False
HType                          False
HRegion                        False
Payer                          False
Insurance_Type                 False
Specific_Service               False
Category_of_Service            False
General_Service                False
Clinic                         False
Large_Hospital                 False
Medium_Hospital                False
Payer_Other                    False
Payer_Out_of_Pocket            False
Payer_Private                  False
Payer_Special_Program          False
Payer_State                    False
region_Lisbon                  False
region_Porto                   False
region_Santarem                False
region_Viseu                   False
General_Service_Appointment    False
General_Service_Emergency      False
General_Service_Examination    False
G

In [43]:
customerdf.dtypes

Freq                             int64
Sex                             object
Age                            float64
Age_Group                       object
Hospital                        object
HType                           object
HRegion                         object
Payer                           object
Insurance_Type                  object
Specific_Service                object
Category_of_Service             object
General_Service                 object
Clinic                         float64
Large_Hospital                 float64
Medium_Hospital                float64
Payer_Other                    float64
Payer_Out_of_Pocket            float64
Payer_Private                  float64
Payer_Special_Program          float64
Payer_State                    float64
region_Lisbon                  float64
region_Porto                   float64
region_Santarem                float64
region_Viseu                   float64
General_Service_Appointment    float64
General_Service_Emergency

## Export the dataframe for FAMD

In [82]:
# select the variables needed for FAMD
famd_columns = ['Freq', 'Sex', 'Age', 'Hospital', 'HType', 'HRegion', 'Payer', 'Insurance_Type',
                'Specific_Service', 'Category_of_Service']
# .copy() makes famd_df an independent frame: later cells reset its index and
# overwrite Freq/Age in place, which should not be done on a selection of customerdf
famd_df = customerdf[famd_columns].copy()

In [71]:
# get numerical columns
# select_dtypes is the public replacement for the private _get_numeric_data();
# 'bool' is listed as well because _get_numeric_data() also keeps boolean columns
famd_num = famd_df.select_dtypes(include=['number', 'bool'])
famd_num.head(3)

Unnamed: 0_level_0,Freq,Age
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1
00000f7264c27ba6fea0c837ed6aa0aa,4,47.4
00001e984eba85527fd3122056451279,8,63.8
0000219e4b37d2504fb6b8c28e24a2d4,1,10.0


In [72]:
from sklearn.preprocessing import StandardScaler

# standardize the numerical FAMD columns (mean = 0, std = 1)
famd_scaler = StandardScaler(with_std = True)
famd_scaler.fit(famd_num)
standardized_famd = famd_scaler.transform(famd_num)

# remember the column labels so they can be put back on the scaled values
num_cols = famd_num.columns

In [73]:
# convert array into dataframe
# No index or column labels are passed, so rows and columns get default integer
# labels (see the output); the UniqueID labels of famd_num are not carried over.
standardized_famd = pd.DataFrame(standardized_famd)
standardized_famd.head(5)

Unnamed: 0,0,1
0,-0.279657,0.297376
1,0.02368,1.016946
2,-0.50716,-1.343594
3,-0.431325,-0.158936
4,-0.431325,-1.277779


In [74]:
# change column names
standardized_famd.columns = num_cols
standardized_famd.head(3)

Unnamed: 0,Freq,Age
0,-0.279657,0.297376
1,0.02368,1.016946
2,-0.50716,-1.343594


In [83]:
# move UniqueID out of the index into a regular column, leaving default integer row labels
famd_df = famd_df.reset_index()
famd_df.head(5)

Unnamed: 0,UniqueID,Freq,Sex,Age,Hospital,HType,HRegion,Payer,Insurance_Type,Specific_Service,Category_of_Service
0,00000f7264c27ba6fea0c837ed6aa0aa,4,Masculino,47.4,HCP,Large,Porto,MÉDIS,Private,PATOLOGIA CLINICA,PATOLOGIA CLINICA
1,00001e984eba85527fd3122056451279,8,Feminino,63.8,HCIS,Large,Lisbon,ALLIANZ,Private,SERVIÇOS ESPECIAIS DERMATOLOGIA,EXAMES ESPECIAIS
2,0000219e4b37d2504fb6b8c28e24a2d4,1,Feminino,10.0,ICDT,Clinic,Porto,MÉDIS,Private,SERVIÇOS ESPECIAIS OFTALMOLOGIA,CONSULTA EXTERNA
3,000026c67a83fa72aec14512887bb173,2,Masculino,37.0,CCC,Medium,Lisbon,MÉDIS,Private,SERVIÇOS ESPECIAIS MEDICINA DENTÁRIA,EXAMES ESPECIAIS
4,000028899fe7782862d40bb1b87807ee,2,Masculino,11.5,HCD,Large,Lisbon,ADSE,State,URGÊNCIA PEDIATRIA,URGÊNCIAS


In [84]:
# replace the numerical columns with the standardized values
# (column assignment aligns on the row labels of the two frames)
for numeric_column in ('Freq', 'Age'):
    famd_df[numeric_column] = standardized_famd[numeric_column]

In [85]:
famd_df.head(5)

Unnamed: 0,UniqueID,Freq,Sex,Age,Hospital,HType,HRegion,Payer,Insurance_Type,Specific_Service,Category_of_Service
0,00000f7264c27ba6fea0c837ed6aa0aa,-0.279657,Masculino,0.297376,HCP,Large,Porto,MÉDIS,Private,PATOLOGIA CLINICA,PATOLOGIA CLINICA
1,00001e984eba85527fd3122056451279,0.02368,Feminino,1.016946,HCIS,Large,Lisbon,ALLIANZ,Private,SERVIÇOS ESPECIAIS DERMATOLOGIA,EXAMES ESPECIAIS
2,0000219e4b37d2504fb6b8c28e24a2d4,-0.50716,Feminino,-1.343594,ICDT,Clinic,Porto,MÉDIS,Private,SERVIÇOS ESPECIAIS OFTALMOLOGIA,CONSULTA EXTERNA
3,000026c67a83fa72aec14512887bb173,-0.431325,Masculino,-0.158936,CCC,Medium,Lisbon,MÉDIS,Private,SERVIÇOS ESPECIAIS MEDICINA DENTÁRIA,EXAMES ESPECIAIS
4,000028899fe7782862d40bb1b87807ee,-0.431325,Masculino,-1.277779,HCD,Large,Lisbon,ADSE,State,URGÊNCIA PEDIATRIA,URGÊNCIAS


In [86]:
# put UniqueID back as the row index
famd_df = famd_df.set_index('UniqueID')

In [87]:
famd_df.isnull().any()

Freq                   False
Sex                    False
Age                    False
Hospital               False
HType                  False
HRegion                False
Payer                  False
Insurance_Type         False
Specific_Service       False
Category_of_Service    False
dtype: bool

In [88]:
famd_df.head(3)

Unnamed: 0_level_0,Freq,Sex,Age,Hospital,HType,HRegion,Payer,Insurance_Type,Specific_Service,Category_of_Service
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
00000f7264c27ba6fea0c837ed6aa0aa,-0.279657,Masculino,0.297376,HCP,Large,Porto,MÉDIS,Private,PATOLOGIA CLINICA,PATOLOGIA CLINICA
00001e984eba85527fd3122056451279,0.02368,Feminino,1.016946,HCIS,Large,Lisbon,ALLIANZ,Private,SERVIÇOS ESPECIAIS DERMATOLOGIA,EXAMES ESPECIAIS
0000219e4b37d2504fb6b8c28e24a2d4,-0.50716,Feminino,-1.343594,ICDT,Clinic,Porto,MÉDIS,Private,SERVIÇOS ESPECIAIS OFTALMOLOGIA,CONSULTA EXTERNA


In [89]:
# save the FAMD input frame as a pickle in the current working directory
famd_df.to_pickle('pre_famd_df.pkl')

## Standardize numerical columns

In [55]:
# get numerical columns
# select_dtypes is the public replacement for the private _get_numeric_data();
# 'bool' is listed as well because _get_numeric_data() also keeps boolean columns
# (the Sex_* indicator columns are boolean in pandas versions where get_dummies returns bool)
num_df = customerdf.select_dtypes(include=['number', 'bool'])
num_df.head()

Unnamed: 0_level_0,Freq,Age,Clinic,Large_Hospital,Medium_Hospital,Payer_Other,Payer_Out_of_Pocket,Payer_Private,Payer_Special_Program,Payer_State,...,region_Viseu,General_Service_Appointment,General_Service_Emergency,General_Service_Examination,General_Service_Others,General_Service_Surgery,General_Service_Treatment,Sex_Feminino,Sex_Masculino,Sex_Missing
UniqueID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000f7264c27ba6fea0c837ed6aa0aa,4,47.4,0.4,0.6,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.2,0.2,0.6,0.0,0.0,0.0,0,1,0
00001e984eba85527fd3122056451279,8,63.8,0.0,1.0,0.0,0.0,0.0,0.8,0.0,0.2,...,0.0,0.3,0.0,0.7,0.0,0.0,0.0,1,0,0
0000219e4b37d2504fb6b8c28e24a2d4,1,10.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.5,0.0,0.5,0.0,0.0,0.0,1,0,0
000026c67a83fa72aec14512887bb173,2,37.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.75,0.0,0.25,0.0,0,1,0
000028899fe7782862d40bb1b87807ee,2,11.5,0.0,1.0,0.0,0.0,0.0,0.5,0.0,0.5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,1,0


In [60]:
# save column names
cols = num_df.columns

In [56]:
# standardize every numerical column (mean = 0, std = 1)
scaler = StandardScaler(with_std = True)
scaler.fit(num_df)
standardized = scaler.transform(num_df)

In [57]:
# convert array into dataframe
# NOTE(review): no index is passed, so the frame gets default integer row labels
# (see the output) and the UniqueID labels of num_df are not carried over. The
# frame exported further down is derived from this one — confirm that its
# consumers match rows to customers by position.
standardized = pd.DataFrame(standardized)
standardized.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,-0.279657,0.297376,0.353768,0.171623,-0.555808,-0.243716,-0.493714,0.977708,-0.189121,-0.538143,...,-0.164676,-0.592757,0.563824,0.583638,-0.134845,-0.336768,-0.384018,-1.109188,1.116113,-0.055554
1,0.02368,1.016946,-0.6552,1.036258,-0.555808,-0.243716,-0.493714,0.55902,-0.189121,-0.028397,...,-0.164676,-0.309423,-0.43676,0.898848,-0.134845,-0.336768,-0.384018,0.90156,-0.895966,-0.055554
2,-0.50716,-1.343594,1.867219,-1.125331,-0.555808,-0.243716,-0.493714,0.977708,-0.189121,-0.538143,...,-0.164676,0.257245,-0.43676,0.268427,-0.134845,-0.336768,-0.384018,0.90156,-0.895966,-0.055554
3,-0.431325,-0.158936,-0.6552,-1.125331,1.974664,-0.243716,-0.493714,0.977708,-0.189121,-0.538143,...,-0.164676,-1.159425,-0.43676,1.056454,-0.134845,1.815089,-0.384018,-1.109188,1.116113,-0.055554
4,-0.431325,-1.277779,-0.6552,1.036258,-0.555808,-0.243716,-0.493714,-0.069014,-0.189121,0.736222,...,-0.164676,-1.159425,4.566161,-1.307628,-0.134845,-0.336768,-0.384018,-1.109188,1.116113,-0.055554


In [63]:
# change column names
standardized.columns = cols
standardized.head(3)

Unnamed: 0,Freq,Age,Clinic,Large_Hospital,Medium_Hospital,Payer_Other,Payer_Out_of_Pocket,Payer_Private,Payer_Special_Program,Payer_State,...,region_Viseu,General_Service_Appointment,General_Service_Emergency,General_Service_Examination,General_Service_Others,General_Service_Surgery,General_Service_Treatment,Sex_Feminino,Sex_Masculino,Sex_Missing
0,-0.279657,0.297376,0.353768,0.171623,-0.555808,-0.243716,-0.493714,0.977708,-0.189121,-0.538143,...,-0.164676,-0.592757,0.563824,0.583638,-0.134845,-0.336768,-0.384018,-1.109188,1.116113,-0.055554
1,0.02368,1.016946,-0.6552,1.036258,-0.555808,-0.243716,-0.493714,0.55902,-0.189121,-0.028397,...,-0.164676,-0.309423,-0.43676,0.898848,-0.134845,-0.336768,-0.384018,0.90156,-0.895966,-0.055554
2,-0.50716,-1.343594,1.867219,-1.125331,-0.555808,-0.243716,-0.493714,0.977708,-0.189121,-0.538143,...,-0.164676,0.257245,-0.43676,0.268427,-0.134845,-0.336768,-0.384018,0.90156,-0.895966,-0.055554


## Check for Multicollinearity of numerical variables

In [64]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [65]:
def calculate_vif_(X, thresh=5.0):
    """Iteratively drop the column with the highest variance inflation factor.

    On every pass the VIF of each remaining column is computed with
    ``variance_inflation_factor``; if the largest value exceeds ``thresh`` that
    column is removed and the pass is repeated. The loop ends once no VIF
    exceeds ``thresh`` or fewer than two columns remain.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are screened; only read, never modified.
    thresh : float, default 5.0
        A column is dropped while the largest VIF is strictly greater than this.

    Returns
    -------
    pandas.DataFrame
        ``X`` restricted to the surviving columns, in their original order.

    Notes
    -----
    Progress is reported with ``print``: one line per dropped column, then the
    labels of the remaining columns. The "index" in the message is the
    position among the columns still present on that pass, not the position
    in ``X``.
    """
    variables = list(range(X.shape[1]))
    dropped = True
    # With fewer than two columns there is nothing to regress a column on, and
    # an empty frame would otherwise fail on max([]).
    while dropped and len(variables) > 1:
        dropped = False
        # slice once per pass instead of once per use
        current = X.iloc[:, variables]
        values = current.values
        vif = [variance_inflation_factor(values, ix)
               for ix in range(values.shape[1])]

        max_vif = max(vif)
        maxloc = vif.index(max_vif)
        if max_vif > thresh:
            # format() rather than '+' so non-string column labels (e.g. the
            # integer labels of pd.DataFrame(array)) do not raise
            print("dropping '{}' at index: {}".format(current.columns[maxloc], maxloc))
            del variables[maxloc]
            dropped = True

    print('Remaining variables:')
    print(X.columns[variables])
    return X.iloc[:, variables]

In [66]:
num_final = calculate_vif_(standardized)

dropping 'Clinic' at index: 2
dropping 'Payer_Other' at index: 4
dropping 'region_Lisbon' at index: 8
dropping 'General_Service_Appointment' at index: 11
dropping 'Sex_Feminino' at index: 16
dropping 'Payer_Private' at index: 5
Remaining variables:
Index(['Freq', 'Age', 'Large_Hospital', 'Medium_Hospital',
       'Payer_Out_of_Pocket', 'Payer_Special_Program', 'Payer_State',
       'region_Porto', 'region_Santarem', 'region_Viseu',
       'General_Service_Emergency', 'General_Service_Examination',
       'General_Service_Others', 'General_Service_Surgery',
       'General_Service_Treatment', 'Sex_Masculino', 'Sex_Missing'],
      dtype='object')


In [67]:
num_final.isnull().any()

Freq                           False
Age                            False
Large_Hospital                 False
Medium_Hospital                False
Payer_Out_of_Pocket            False
Payer_Special_Program          False
Payer_State                    False
region_Porto                   False
region_Santarem                False
region_Viseu                   False
General_Service_Emergency      False
General_Service_Examination    False
General_Service_Others         False
General_Service_Surgery        False
General_Service_Treatment      False
Sex_Masculino                  False
Sex_Missing                    False
dtype: bool

In [68]:
num_final.shape

(1468101, 17)

In [69]:
# export numerical variables
num_final.to_pickle('customer_numerical_standardized.pkl')

In [70]:
# export customer data with all variables
customerdf.to_pickle('customer_all_variables.pkl')

## Variables description for customerdf
- Freq: the number of unique days that the customer visited hospitals, numerical
- Sex: gender of the customer, categorical
- Age: average age of the customer over all past transactions, numerical
- Age_Group: age group of the customer according to the average age, categorical
- Hospital: the most frequently visited hospital, categorical
- HType: the most frequently visited type of hospital, categorical
- HRegion: the most frequently visited region, categorical
- Payer: the most frequently used payment method, categorical
- Insurance_Type: the most frequently used type of payment, categorical
- Specific_Service: the most frequently used specific service, categorical
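- Category_of_Service: the most frequently used category of service, categorical
- General_Service: the most frequently used general service, categorical
- Clinic, Large_Hospital, Medium_Hospital: share (0–1) of the customer's transactions at each type of hospital, numerical
- Payer_Other, Payer_Out_of_Pocket, Payer_Private, Payer_Special_Program, Payer_State: share (0–1) of the customer's transactions using each insurance type, numerical
- region_Lisbon, region_Porto, region_Santarem, region_Viseu: share (0–1) of the customer's transactions in each hospital region, numerical
- General_Service_Appointment, General_Service_Emergency, General_Service_Examination, General_Service_Others, General_Service_Surgery, General_Service_Treatment: share (0–1) of the customer's transactions for each general service, numerical
- Sex_Feminino, Sex_Masculino, Sex_Missing: dummy variables to indicate the gender of each customer, numerical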