# 4. Feature Engineering

In [8]:
%pip install ace_tools

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [4]:
original_data = pd.read_csv("C:/Research/Msc/CMM709/CAUSALITY-EXPLORE/data/raw/medical_appointment_no_show.csv")

In [5]:
df_fe = pd.read_csv("C:/Research/Msc/CMM709/CAUSALITY-EXPLORE/data/processed/medical_appointment_no_show_processed.csv")
df_fe

Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show,lead_time,appointment_day_of_week
0,95985133231274,5626772,F,2016-04-27,2016-04-29,76,REPÚBLICA,False,True,False,False,0,False,False,2,Friday
1,733688164476661,5630279,F,2016-04-27,2016-04-29,23,GOIABEIRAS,False,False,False,False,0,False,True,2,Friday
2,3449833394123,5630575,F,2016-04-27,2016-04-29,39,GOIABEIRAS,False,False,False,False,0,False,True,2,Friday
3,78124564369297,5629123,F,2016-04-27,2016-04-29,19,CONQUISTA,False,False,False,False,0,False,False,2,Friday
4,734536231958495,5630213,F,2016-04-27,2016-04-29,30,NOVA PALESTINA,False,False,False,False,0,False,False,2,Friday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71954,2572134369293,5651768,F,2016-05-03,2016-06-07,56,MARIA ORTIZ,False,False,False,False,0,True,False,35,Tuesday
71955,3596266328735,5650093,F,2016-05-03,2016-06-07,51,MARIA ORTIZ,False,False,False,False,0,True,False,35,Tuesday
71956,15576631729893,5630692,F,2016-04-27,2016-06-07,21,MARIA ORTIZ,False,False,False,False,0,True,False,41,Tuesday
71957,92134931435557,5630323,F,2016-04-27,2016-06-07,38,MARIA ORTIZ,False,False,False,False,0,True,False,41,Tuesday


In [170]:
df_fe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71959 entries, 0 to 71958
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   patient_id               71959 non-null  int64 
 1   appointment_id           71959 non-null  int64 
 2   gender                   71959 non-null  object
 3   scheduled_day            71959 non-null  object
 4   appointment_day          71959 non-null  object
 5   age                      71959 non-null  int64 
 6   neighbourhood            71959 non-null  object
 7   scholarship              71959 non-null  bool  
 8   hypertension             71959 non-null  bool  
 9   diabetes                 71959 non-null  bool  
 10  alcoholism               71959 non-null  bool  
 11  handicap                 71959 non-null  int64 
 12  sms_received             71959 non-null  bool  
 13  no_show                  71959 non-null  bool  
 14  lead_time                71959 non-nul

In [171]:
# Convert `gender`, `neighbourhood`, `handicap`, to categorical type
for col in ['gender', 'neighbourhood', 'handicap', 'appointment_day_of_week']:
    df_fe[col] = df_fe[col].astype('category')

# Convert `scheduled_day` and `appointment_day` to datetime
for col in ['scheduled_day', 'appointment_day']:
    df_fe[col] = pd.to_datetime(df_fe[col]).dt.date.astype('datetime64[ns]')
    
df_fe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71959 entries, 0 to 71958
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   patient_id               71959 non-null  int64         
 1   appointment_id           71959 non-null  int64         
 2   gender                   71959 non-null  category      
 3   scheduled_day            71959 non-null  datetime64[ns]
 4   appointment_day          71959 non-null  datetime64[ns]
 5   age                      71959 non-null  int64         
 6   neighbourhood            71959 non-null  category      
 7   scholarship              71959 non-null  bool          
 8   hypertension             71959 non-null  bool          
 9   diabetes                 71959 non-null  bool          
 10  alcoholism               71959 non-null  bool          
 11  handicap                 71959 non-null  category      
 12  sms_received             71959 n

In [172]:
df_fe.columns

Index(['patient_id', 'appointment_id', 'gender', 'scheduled_day',
       'appointment_day', 'age', 'neighbourhood', 'scholarship',
       'hypertension', 'diabetes', 'alcoholism', 'handicap', 'sms_received',
       'no_show', 'lead_time', 'appointment_day_of_week'],
      dtype='object')

### Create new features

1. Extract the hour of the day when the appointment was scheduled

In [173]:
df_fe['scheduled_hours'] = pd.to_datetime(original_data['ScheduledDay']).dt.hour.astype('int64')
df_fe

Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show,lead_time,appointment_day_of_week,scheduled_hours
0,95985133231274,5626772,F,2016-04-27,2016-04-29,76,REPÚBLICA,False,True,False,False,0,False,False,2,Friday,18
1,733688164476661,5630279,F,2016-04-27,2016-04-29,23,GOIABEIRAS,False,False,False,False,0,False,True,2,Friday,16
2,3449833394123,5630575,F,2016-04-27,2016-04-29,39,GOIABEIRAS,False,False,False,False,0,False,True,2,Friday,16
3,78124564369297,5629123,F,2016-04-27,2016-04-29,19,CONQUISTA,False,False,False,False,0,False,False,2,Friday,17
4,734536231958495,5630213,F,2016-04-27,2016-04-29,30,NOVA PALESTINA,False,False,False,False,0,False,False,2,Friday,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71954,2572134369293,5651768,F,2016-05-03,2016-06-07,56,MARIA ORTIZ,False,False,False,False,0,True,False,35,Tuesday,13
71955,3596266328735,5650093,F,2016-05-03,2016-06-07,51,MARIA ORTIZ,False,False,False,False,0,True,False,35,Tuesday,16
71956,15576631729893,5630692,F,2016-04-27,2016-06-07,21,MARIA ORTIZ,False,False,False,False,0,True,False,41,Tuesday,7
71957,92134931435557,5630323,F,2016-04-27,2016-06-07,38,MARIA ORTIZ,False,False,False,False,0,True,False,41,Tuesday,7


2. Create Age Groups

In [174]:
df_fe['age_group'] = pd.cut(df_fe['age'], bins=[0, 18, 35, 50, 65, 100], labels=['0-18', '19-35', '36-60', '51-65', '66+'])
df_fe

Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show,lead_time,appointment_day_of_week,scheduled_hours,age_group
0,95985133231274,5626772,F,2016-04-27,2016-04-29,76,REPÚBLICA,False,True,False,False,0,False,False,2,Friday,18,66+
1,733688164476661,5630279,F,2016-04-27,2016-04-29,23,GOIABEIRAS,False,False,False,False,0,False,True,2,Friday,16,19-35
2,3449833394123,5630575,F,2016-04-27,2016-04-29,39,GOIABEIRAS,False,False,False,False,0,False,True,2,Friday,16,36-60
3,78124564369297,5629123,F,2016-04-27,2016-04-29,19,CONQUISTA,False,False,False,False,0,False,False,2,Friday,17,19-35
4,734536231958495,5630213,F,2016-04-27,2016-04-29,30,NOVA PALESTINA,False,False,False,False,0,False,False,2,Friday,16,19-35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71954,2572134369293,5651768,F,2016-05-03,2016-06-07,56,MARIA ORTIZ,False,False,False,False,0,True,False,35,Tuesday,13,51-65
71955,3596266328735,5650093,F,2016-05-03,2016-06-07,51,MARIA ORTIZ,False,False,False,False,0,True,False,35,Tuesday,16,51-65
71956,15576631729893,5630692,F,2016-04-27,2016-06-07,21,MARIA ORTIZ,False,False,False,False,0,True,False,41,Tuesday,7,19-35
71957,92134931435557,5630323,F,2016-04-27,2016-06-07,38,MARIA ORTIZ,False,False,False,False,0,True,False,41,Tuesday,7,36-60


3. Create a new column for the total number of chronic conditions

In [175]:
# df_fe['hypertension'] = df_fe['hypertension'].replace({0: 'False', 1: 'True'}).infer_objects(copy=False)
# df_fe['hypertension'].astype('int64')

# df_fe['diabetes'] = df_fe['diabetes'].replace({0: 'False', 1: 'True'}).infer_objects(copy=False)
# df_fe['diabetes'].astype('int64')

# df_fe['alcoholism'] = df_fe['alcoholism'].replace({0: 'False', 1: 'True'}).infer_objects(copy=False)
# df_fe['diabetes'].astype('int64')

# df_fe['handicap'] = df_fe['handicap'].replace({0: 'False', 1: 'True'}).infer_objects(copy=False)
# df_fe['diabetes'].astype('int64')

# df_fe['total_chronic_conditions'] = df_fe[['hypertension', 'diabetes', 'alcoholism', 'handicap']].sum(axis=1)
# df_fe

4. Create a new column to indicate if the appointment is on a weekend

In [176]:
df_fe['is_weekend'] = pd.to_datetime(df_fe['appointment_day']).dt.dayofweek.isin([5, 6]).astype('int64')
df_fe

Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show,lead_time,appointment_day_of_week,scheduled_hours,age_group,is_weekend
0,95985133231274,5626772,F,2016-04-27,2016-04-29,76,REPÚBLICA,False,True,False,False,0,False,False,2,Friday,18,66+,0
1,733688164476661,5630279,F,2016-04-27,2016-04-29,23,GOIABEIRAS,False,False,False,False,0,False,True,2,Friday,16,19-35,0
2,3449833394123,5630575,F,2016-04-27,2016-04-29,39,GOIABEIRAS,False,False,False,False,0,False,True,2,Friday,16,36-60,0
3,78124564369297,5629123,F,2016-04-27,2016-04-29,19,CONQUISTA,False,False,False,False,0,False,False,2,Friday,17,19-35,0
4,734536231958495,5630213,F,2016-04-27,2016-04-29,30,NOVA PALESTINA,False,False,False,False,0,False,False,2,Friday,16,19-35,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71954,2572134369293,5651768,F,2016-05-03,2016-06-07,56,MARIA ORTIZ,False,False,False,False,0,True,False,35,Tuesday,13,51-65,0
71955,3596266328735,5650093,F,2016-05-03,2016-06-07,51,MARIA ORTIZ,False,False,False,False,0,True,False,35,Tuesday,16,51-65,0
71956,15576631729893,5630692,F,2016-04-27,2016-06-07,21,MARIA ORTIZ,False,False,False,False,0,True,False,41,Tuesday,7,19-35,0
71957,92134931435557,5630323,F,2016-04-27,2016-06-07,38,MARIA ORTIZ,False,False,False,False,0,True,False,41,Tuesday,7,36-60,0


5. Create a new feature for SMS Reminders

In [177]:
df_fe['received_sms'] = df_fe['sms_received'].map({False: 'No', True: 'Yes'})
df_fe

Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show,lead_time,appointment_day_of_week,scheduled_hours,age_group,is_weekend,received_sms
0,95985133231274,5626772,F,2016-04-27,2016-04-29,76,REPÚBLICA,False,True,False,False,0,False,False,2,Friday,18,66+,0,No
1,733688164476661,5630279,F,2016-04-27,2016-04-29,23,GOIABEIRAS,False,False,False,False,0,False,True,2,Friday,16,19-35,0,No
2,3449833394123,5630575,F,2016-04-27,2016-04-29,39,GOIABEIRAS,False,False,False,False,0,False,True,2,Friday,16,36-60,0,No
3,78124564369297,5629123,F,2016-04-27,2016-04-29,19,CONQUISTA,False,False,False,False,0,False,False,2,Friday,17,19-35,0,No
4,734536231958495,5630213,F,2016-04-27,2016-04-29,30,NOVA PALESTINA,False,False,False,False,0,False,False,2,Friday,16,19-35,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71954,2572134369293,5651768,F,2016-05-03,2016-06-07,56,MARIA ORTIZ,False,False,False,False,0,True,False,35,Tuesday,13,51-65,0,Yes
71955,3596266328735,5650093,F,2016-05-03,2016-06-07,51,MARIA ORTIZ,False,False,False,False,0,True,False,35,Tuesday,16,51-65,0,Yes
71956,15576631729893,5630692,F,2016-04-27,2016-06-07,21,MARIA ORTIZ,False,False,False,False,0,True,False,41,Tuesday,7,19-35,0,Yes
71957,92134931435557,5630323,F,2016-04-27,2016-06-07,38,MARIA ORTIZ,False,False,False,False,0,True,False,41,Tuesday,7,36-60,0,Yes


6. Create a feature for the number of day until the next appointment.

In [178]:
df_fe['days_until_next_appointment'] = (df_fe.groupby('patient_id')['appointment_day'].shift(-1) - df_fe['appointment_day']).dt.days.fillna(0).astype('int64')
df_fe

Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighbourhood,scholarship,hypertension,diabetes,...,handicap,sms_received,no_show,lead_time,appointment_day_of_week,scheduled_hours,age_group,is_weekend,received_sms,days_until_next_appointment
0,95985133231274,5626772,F,2016-04-27,2016-04-29,76,REPÚBLICA,False,True,False,...,0,False,False,2,Friday,18,66+,0,No,33
1,733688164476661,5630279,F,2016-04-27,2016-04-29,23,GOIABEIRAS,False,False,False,...,0,False,True,2,Friday,16,19-35,0,No,0
2,3449833394123,5630575,F,2016-04-27,2016-04-29,39,GOIABEIRAS,False,False,False,...,0,False,True,2,Friday,16,36-60,0,No,20
3,78124564369297,5629123,F,2016-04-27,2016-04-29,19,CONQUISTA,False,False,False,...,0,False,False,2,Friday,17,19-35,0,No,0
4,734536231958495,5630213,F,2016-04-27,2016-04-29,30,NOVA PALESTINA,False,False,False,...,0,False,False,2,Friday,16,19-35,0,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71954,2572134369293,5651768,F,2016-05-03,2016-06-07,56,MARIA ORTIZ,False,False,False,...,0,True,False,35,Tuesday,13,51-65,0,Yes,0
71955,3596266328735,5650093,F,2016-05-03,2016-06-07,51,MARIA ORTIZ,False,False,False,...,0,True,False,35,Tuesday,16,51-65,0,Yes,0
71956,15576631729893,5630692,F,2016-04-27,2016-06-07,21,MARIA ORTIZ,False,False,False,...,0,True,False,41,Tuesday,7,19-35,0,Yes,0
71957,92134931435557,5630323,F,2016-04-27,2016-06-07,38,MARIA ORTIZ,False,False,False,...,0,True,False,41,Tuesday,7,36-60,0,Yes,0


7. Create a feature for the number of previous no-show by the patient

In [179]:
df_fe['previous_no_show'] = df_fe.groupby('patient_id')['no_show'].cumsum() - df_fe['no_show']
df_fe

Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighbourhood,scholarship,hypertension,diabetes,...,sms_received,no_show,lead_time,appointment_day_of_week,scheduled_hours,age_group,is_weekend,received_sms,days_until_next_appointment,previous_no_show
0,95985133231274,5626772,F,2016-04-27,2016-04-29,76,REPÚBLICA,False,True,False,...,False,False,2,Friday,18,66+,0,No,33,0
1,733688164476661,5630279,F,2016-04-27,2016-04-29,23,GOIABEIRAS,False,False,False,...,False,True,2,Friday,16,19-35,0,No,0,0
2,3449833394123,5630575,F,2016-04-27,2016-04-29,39,GOIABEIRAS,False,False,False,...,False,True,2,Friday,16,36-60,0,No,20,0
3,78124564369297,5629123,F,2016-04-27,2016-04-29,19,CONQUISTA,False,False,False,...,False,False,2,Friday,17,19-35,0,No,0,0
4,734536231958495,5630213,F,2016-04-27,2016-04-29,30,NOVA PALESTINA,False,False,False,...,False,False,2,Friday,16,19-35,0,No,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71954,2572134369293,5651768,F,2016-05-03,2016-06-07,56,MARIA ORTIZ,False,False,False,...,True,False,35,Tuesday,13,51-65,0,Yes,0,1
71955,3596266328735,5650093,F,2016-05-03,2016-06-07,51,MARIA ORTIZ,False,False,False,...,True,False,35,Tuesday,16,51-65,0,Yes,0,1
71956,15576631729893,5630692,F,2016-04-27,2016-06-07,21,MARIA ORTIZ,False,False,False,...,True,False,41,Tuesday,7,19-35,0,Yes,0,0
71957,92134931435557,5630323,F,2016-04-27,2016-06-07,38,MARIA ORTIZ,False,False,False,...,True,False,41,Tuesday,7,36-60,0,Yes,0,0


8. Create a feature for the total number of appointments per patient

In [180]:
df_fe['total_appointments'] = df_fe.groupby('patient_id')['appointment_id'].transform('count')
df_fe

Unnamed: 0,patient_id,appointment_id,gender,scheduled_day,appointment_day,age,neighbourhood,scholarship,hypertension,diabetes,...,no_show,lead_time,appointment_day_of_week,scheduled_hours,age_group,is_weekend,received_sms,days_until_next_appointment,previous_no_show,total_appointments
0,95985133231274,5626772,F,2016-04-27,2016-04-29,76,REPÚBLICA,False,True,False,...,False,2,Friday,18,66+,0,No,33,0,2
1,733688164476661,5630279,F,2016-04-27,2016-04-29,23,GOIABEIRAS,False,False,False,...,True,2,Friday,16,19-35,0,No,0,0,1
2,3449833394123,5630575,F,2016-04-27,2016-04-29,39,GOIABEIRAS,False,False,False,...,True,2,Friday,16,36-60,0,No,20,0,2
3,78124564369297,5629123,F,2016-04-27,2016-04-29,19,CONQUISTA,False,False,False,...,False,2,Friday,17,19-35,0,No,0,0,1
4,734536231958495,5630213,F,2016-04-27,2016-04-29,30,NOVA PALESTINA,False,False,False,...,False,2,Friday,16,19-35,0,No,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71954,2572134369293,5651768,F,2016-05-03,2016-06-07,56,MARIA ORTIZ,False,False,False,...,False,35,Tuesday,13,51-65,0,Yes,0,1,2
71955,3596266328735,5650093,F,2016-05-03,2016-06-07,51,MARIA ORTIZ,False,False,False,...,False,35,Tuesday,16,51-65,0,Yes,0,1,3
71956,15576631729893,5630692,F,2016-04-27,2016-06-07,21,MARIA ORTIZ,False,False,False,...,False,41,Tuesday,7,19-35,0,Yes,0,0,1
71957,92134931435557,5630323,F,2016-04-27,2016-06-07,38,MARIA ORTIZ,False,False,False,...,False,41,Tuesday,7,36-60,0,Yes,0,0,2


9. Encode categorical variables as needed for modeling

In [181]:
df_fe = pd.get_dummies(df_fe, columns=['gender','age_group','received_sms','appointment_day_of_week'], drop_first=True)
df_fe

Unnamed: 0,patient_id,appointment_id,scheduled_day,appointment_day,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,...,age_group_19-35,age_group_36-60,age_group_51-65,age_group_66+,received_sms_Yes,appointment_day_of_week_Monday,appointment_day_of_week_Saturday,appointment_day_of_week_Thursday,appointment_day_of_week_Tuesday,appointment_day_of_week_Wednesday
0,95985133231274,5626772,2016-04-27,2016-04-29,76,REPÚBLICA,False,True,False,False,...,False,False,False,True,False,False,False,False,False,False
1,733688164476661,5630279,2016-04-27,2016-04-29,23,GOIABEIRAS,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,3449833394123,5630575,2016-04-27,2016-04-29,39,GOIABEIRAS,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,78124564369297,5629123,2016-04-27,2016-04-29,19,CONQUISTA,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,734536231958495,5630213,2016-04-27,2016-04-29,30,NOVA PALESTINA,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71954,2572134369293,5651768,2016-05-03,2016-06-07,56,MARIA ORTIZ,False,False,False,False,...,False,False,True,False,True,False,False,False,True,False
71955,3596266328735,5650093,2016-05-03,2016-06-07,51,MARIA ORTIZ,False,False,False,False,...,False,False,True,False,True,False,False,False,True,False
71956,15576631729893,5630692,2016-04-27,2016-06-07,21,MARIA ORTIZ,False,False,False,False,...,True,False,False,False,True,False,False,False,True,False
71957,92134931435557,5630323,2016-04-27,2016-06-07,38,MARIA ORTIZ,False,False,False,False,...,False,True,False,False,True,False,False,False,True,False


10. Drop columns irrelevant columns

In [182]:
df_fe = df_fe.drop(columns=['appointment_id', 'patient_id', 'scheduled_day', 'appointment_day'], axis=1, inplace=False)
df_fe

Unnamed: 0,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,no_show,lead_time,...,age_group_19-35,age_group_36-60,age_group_51-65,age_group_66+,received_sms_Yes,appointment_day_of_week_Monday,appointment_day_of_week_Saturday,appointment_day_of_week_Thursday,appointment_day_of_week_Tuesday,appointment_day_of_week_Wednesday
0,76,REPÚBLICA,False,True,False,False,0,False,False,2,...,False,False,False,True,False,False,False,False,False,False
1,23,GOIABEIRAS,False,False,False,False,0,False,True,2,...,True,False,False,False,False,False,False,False,False,False
2,39,GOIABEIRAS,False,False,False,False,0,False,True,2,...,False,True,False,False,False,False,False,False,False,False
3,19,CONQUISTA,False,False,False,False,0,False,False,2,...,True,False,False,False,False,False,False,False,False,False
4,30,NOVA PALESTINA,False,False,False,False,0,False,False,2,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71954,56,MARIA ORTIZ,False,False,False,False,0,True,False,35,...,False,False,True,False,True,False,False,False,True,False
71955,51,MARIA ORTIZ,False,False,False,False,0,True,False,35,...,False,False,True,False,True,False,False,False,True,False
71956,21,MARIA ORTIZ,False,False,False,False,0,True,False,41,...,True,False,False,False,True,False,False,False,True,False
71957,38,MARIA ORTIZ,False,False,False,False,0,True,False,41,...,False,True,False,False,True,False,False,False,True,False


In [183]:
df_fe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71959 entries, 0 to 71958
Data columns (total 26 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   age                                71959 non-null  int64   
 1   neighbourhood                      71959 non-null  category
 2   scholarship                        71959 non-null  bool    
 3   hypertension                       71959 non-null  bool    
 4   diabetes                           71959 non-null  bool    
 5   alcoholism                         71959 non-null  bool    
 6   handicap                           71959 non-null  category
 7   sms_received                       71959 non-null  bool    
 8   no_show                            71959 non-null  bool    
 9   lead_time                          71959 non-null  int64   
 10  scheduled_hours                    71959 non-null  int64   
 11  is_weekend                         71959 

In [184]:
df_fe.to_csv("C:/Research/Msc/CMM709/CAUSALITY-EXPLORE/data/processed/medical_appointment_no_show_final.csv", index=False)