In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Load the medical-appointment no-show records.
# Data Source: https://data.mendeley.com/datasets/wm6w2fvkfj/1
path = 'medical-appointments-no-show-en.csv'
dt = pd.read_csv(filepath_or_buffer=path)

# Report the (rows, columns) size, then preview the first five records.
print(dt.shape)
dt.head(n=5)

(49593, 26)


Unnamed: 0,specialty,appointment_time,gender,appointment_date,no_show,no_show_reason,disability,date_of_birth,entry_service_date,city,...,over_60_years_old,patient_needs_companion,average_temp_day,average_rain_day,max_temp_day,max_rain_day,rainy_day_before,storm_day_before,rain_intensity,heat_intensity
0,physiotherapy,13:20,M,09/09/2021,yes,surto,,,,,...,0,0,20.75,0.01,23.7,0.2,1,1,no_rain,mild
1,psychotherapy,13:20,M,09/09/2021,no,,,,,,...,0,0,20.75,0.01,23.7,0.2,1,1,no_rain,mild
2,speech therapy,13:20,F,09/09/2021,no,,,,,,...,0,0,20.75,0.01,23.7,0.2,1,1,no_rain,mild
3,physiotherapy,13:20,F,09/09/2021,no,,,,,,...,0,0,20.75,0.01,23.7,0.2,1,1,no_rain,mild
4,physiotherapy,14:00,M,09/09/2021,no,,motor,10/10/1954,5/2/2020,B. CAMBORIU,...,1,1,20.75,0.01,23.7,0.2,1,1,no_rain,mild


In [None]:
# Removing duplicate rows
# NOTE(review): the columns listed for this dataset include no patient or
# appointment identifier, so rows that match on every column may still be
# distinct visits - confirm that dropping them is intended.
rows_before = len(dt)
duplicate_count = int(dt.duplicated().sum())
dt = dt.drop_duplicates()

# Report explicit counts: value_counts() on the duplicate mask omits the True
# row when nothing is duplicated and never states how many rows were dropped,
# so re-running the cell after `dt` was already overwritten looked like a
# meaningful result. With explicit counts a re-run is clearly a no-op.
print(f'{duplicate_count} duplicate rows removed: {rows_before} -> {len(dt)} rows')
assert len(dt) == rows_before - duplicate_count

False    46672
Name: count, dtype: int64


In [12]:
# Drop the 'icd' and 'no_show_reason' columns: both are missing in most rows.
#
# icd: missing in 78.1% of rows, and the values that are present are too vague for this analysis.
#   - F84 (pervasive developmental disorders: autism, Asperger's, Rett, etc.) makes up 52.1% of the non-missing values.
#   - R68 (other general symptoms and signs) makes up 23.2% of the non-missing values.
#   - Every other code accounts for less than 3%.
#
# no_show_reason: only 35% of patients who missed an appointment gave a reason. It appears to be a
# free-text field, with 329 unique responses, all in Portuguese.
# The most common reasons seem to be sickness (flu, covid, fever), lack of transportation, and cancelling.

# Fraction of missing values per column, shown before the two columns are removed.
print(dt.isna().mean())
dt = dt.drop(columns=['icd', 'no_show_reason'])
dt.head()

specialty                  0.127443
appointment_time           0.000000
gender                     0.000000
appointment_date           0.000000
no_show                    0.000000
no_show_reason             0.963640
disability                 0.081805
date_of_birth              0.185936
entry_service_date         0.082191
city                       0.082748
icd                        0.781089
appointment_month          0.000000
appointment_year           0.000000
appointment_shift          0.000000
age                        0.186493
under_12_years_old         0.000000
over_60_years_old          0.000000
patient_needs_companion    0.000000
average_temp_day           0.015148
average_rain_day           0.015148
max_temp_day               0.015148
max_rain_day               0.015148
rainy_day_before           0.000000
storm_day_before           0.000000
rain_intensity             0.000000
heat_intensity             0.000000
dtype: float64


Unnamed: 0,specialty,appointment_time,gender,appointment_date,no_show,disability,date_of_birth,entry_service_date,city,appointment_month,...,over_60_years_old,patient_needs_companion,average_temp_day,average_rain_day,max_temp_day,max_rain_day,rainy_day_before,storm_day_before,rain_intensity,heat_intensity
0,physiotherapy,13:20,M,09/09/2021,yes,,,,,sept,...,0,0,20.75,0.01,23.7,0.2,1,1,no_rain,mild
1,psychotherapy,13:20,M,09/09/2021,no,,,,,sept,...,0,0,20.75,0.01,23.7,0.2,1,1,no_rain,mild
2,speech therapy,13:20,F,09/09/2021,no,,,,,sept,...,0,0,20.75,0.01,23.7,0.2,1,1,no_rain,mild
3,physiotherapy,13:20,F,09/09/2021,no,,,,,sept,...,0,0,20.75,0.01,23.7,0.2,1,1,no_rain,mild
4,physiotherapy,14:00,M,09/09/2021,no,motor,10/10/1954,5/2/2020,B. CAMBORIU,sept,...,1,1,20.75,0.01,23.7,0.2,1,1,no_rain,mild
