In [None]:
import datetime
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Clinical data CSV; loaded into `clinical_df` via read_clinical_df().
# Both paths are relative to the notebook's working directory.
CLINICAL_DATA = './../DATA/Clean_data/IMPALA_Clinical_data_raw.csv'
# Data dictionary CSV; read_clinical_dtype_df() uses its 'Variable' and
# 'Field Type' columns.
CLINICAL_DTYPES = './../DATA/IMPALA_Clinical_dictionary.csv'

#### Load data

In [None]:

def read_clinical_df(path):
    """
    Read the clinical CSV at `path` and return it as a Pandas DataFrame.

    The file is parsed in a single pass (low_memory=False) so that each
    column's dtype is inferred from the whole column.
    """
    clinical_frame = pd.read_csv(path, low_memory=False)
    return clinical_frame


def read_clinical_dtype_df(path):
    """
    Load the clinical data dictionary (variable names and their properties).

    Only rows whose 'Variable' starts with 'recru', 'dly' or 'record_id' are
    kept. Rows with an empty 'Variable' cell are skipped instead of making the
    boolean filter fail.

    Returns
    -------
    df : pd.DataFrame
        The filtered dictionary rows.
    dtype_dict : defaultdict(list)
        Maps each 'Field Type' value to the list of variable names of that
        type, in file order. Unknown keys yield an empty list.
    """

    df = pd.read_csv(path)

    # na=False: a missing variable name counts as "no match". Without it the
    # mask contains NaN and the boolean indexing below raises.
    wanted = df['Variable'].str.startswith(('recru', 'dly', 'record_id'), na=False)
    df = df[wanted]

    dtype_dict = defaultdict(list)
    for field_type, variable in zip(df['Field Type'], df['Variable']):
        dtype_dict[field_type].append(variable)

    return df, dtype_dict


# Load the clinical records and the data dictionary.
clinical_df = read_clinical_df(CLINICAL_DATA)
dtype_df, dtype_dict = read_clinical_dtype_df(CLINICAL_DTYPES)

# Checkbox variable names in the dictionary do not map 1-to-1 onto columns of
# the clinical data, so the matching data columns are listed per variable:
# one list is hard-coded, the other two are found by their '<name>___' prefix.
all_columns = clinical_df.columns
checkbox_dict = {
    'recru_hdu_admission_reason': [
        'recru_resp', 'recru_circu', 'recru_neuro',
        'recru_nurse', 'recru_unclear', 'recru_hdu_other',
    ],
    'recru_medication_specfy': all_columns[
        all_columns.str.startswith('recru_medication_specfy___')
    ].to_list(),
    'dly_new_drug': all_columns[
        all_columns.str.startswith('dly_new_drug___')
    ].to_list(),
}

# Replace the dictionary's checkbox entry with the flattened list of actual
# data columns.
checkbox_columns = []
for expanded_columns in checkbox_dict.values():
    checkbox_columns.extend(expanded_columns)
# NOTE(review): 'dly_time' is added to the checkbox list as well; the reason
# is not visible in this notebook - confirm it is intended.
checkbox_columns.append('dly_time')
dtype_dict['checkbox'] = checkbox_columns


#### Data stats

In [None]:

# Per-patient age, sex and length of stay (LOS); one entry per record_id.
ages, sexes, stays = [], [], []

for _, patient_rows in clinical_df.groupby('record_id'):
    # Admission fields are read from the patient's first row.
    ages.append(patient_rows['recru_age_months'].iloc[0])
    sexes.append(patient_rows['recru_sex'].iloc[0])

    # LOS needs at least two rows for this patient.
    if patient_rows.shape[0] <= 1:
        continue

    admitted_at = pd.to_datetime(
        patient_rows['recru_hospital_admission_date'].iloc[0], format='mixed')
    # The second-to-last row is used as the end point - presumably because the
    # last row carries no dly_time; TODO confirm.
    last_seen_at = pd.to_datetime(patient_rows['dly_time'].iloc[-2], format='mixed')
    stays.append(last_seen_at - admitted_at)

# stats[0] = ages, stats[1] = sexes, stats[2] = LOS timedeltas
stats = [ages, sexes, stays]


In [None]:
# Summary statistics over the per-patient lists collected in `stats`
# (stats[0] = ages, stats[1] = sexes, stats[2] = LOS timedeltas).
# `datetime` is imported in the notebook's import cell.

# Age
ages = stats[0]
print('Max age:', np.max(ages), 'months')
print('Min age:', np.min(ages), 'months')
print('Mean age:', round(np.mean(ages), 2), 'months')
print('Std age:', round(np.std(ages), 2))
print('Median age:', round(np.median(ages), 2), 'months\n')

# Sex
# np.unique returns the codes sorted ascending with their counts.
# NOTE(review): this assumes exactly two codes are present and that the
# smaller one means male - confirm against the data dictionary.
sex_codes, sex_counts = np.unique(stats[1], return_counts=True)
print(f'Male patients(%): {round(sex_counts[0] / sum(sex_counts) * 100, 2)}')
print(f'Female patients(%): {round(sex_counts[1] / sum(sex_counts) * 100, 2)}\n')

# LOS
length_of_stay = stats[2]
print('Max LOS:', np.max(length_of_stay))
print('Min LOS:', np.min(length_of_stay))
print('Mean LOS:', np.mean(length_of_stay))
# The standard deviation is computed on seconds and converted back to a
# timedelta for printing.
los_std_seconds = np.std([stay.total_seconds() for stay in length_of_stay])
print('Std LOS:', datetime.timedelta(seconds=los_std_seconds))
print('Median LOS:', np.median(length_of_stay))


#### Plot recru, dly, dis ratio

In [None]:

# Number of columns per name prefix: admission ('recru'), daily ('dly') and
# discharge ('dis').
category_prefixes = ['recru', 'dly', 'dis']
y = [sum(clinical_df.columns.str.startswith(s)) for s in category_prefixes]

for column_count in y:
    print(column_count)

plt.bar(range(3), y)

# Write each count just above its bar; the x positions are hand-tuned.
label_x_positions = [-0.1, 0.93, 1.93]
for label_x, column_count in zip(label_x_positions, y):
    plt.text(label_x, column_count+3, column_count)

plt.xticks(range(3), category_prefixes)
plt.ylabel('Number of columns')
plt.title('Number of columns per category')
plt.tight_layout()
plt.show()


#### Plot patient survival rate

In [None]:

# Count patients per discharge outcome. The outcome code of each patient's
# first row is used as a 1-based index into the four categories below.
outcome_labels = ['Alive', 'Died', 'Absconded', 'Withdraw consent']
survival_rates = [0] * 4

for _, patient_rows in clinical_df.groupby('record_id'):
    outcome = patient_rows['dis_outcome'].iloc[0]
    if pd.isna(outcome):
        # Patients without a recorded outcome are not counted.
        continue
    survival_rates[int(outcome) - 1] += 1

print(survival_rates)

plt.bar(range(4), survival_rates)

# Write each count just above its bar; the x positions are hand-tuned.
label_x_positions = [-0.1, 0.95, 1.94, 2.94]
for label_x, n_patients in zip(label_x_positions, survival_rates):
    plt.text(label_x, n_patients+5, n_patients)

plt.xticks(range(4), outcome_labels)
plt.xlabel('Discharge outcome')
plt.ylabel('Number of patients')
plt.title(f'Hospital outcome for {sum(survival_rates)} patients')
plt.tight_layout()
plt.show()


#### Plot missing values

In [None]:

def plot_missing_values(df,
                        title='Ratio of missing data in the admission and daily variables',
                        save_path=None):
    """
    Plot a histogram of the ratio of missing values per column.

    A value counts as missing when it equals -1, so callers are expected to
    have replaced their missing markers / NaNs with -1 beforehand.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are inspected.
    title : str, optional
        Figure title. The default is the title this function always used.
    save_path : str or None, optional
        When given, the figure is written to this path before it is shown.
        Nothing is saved by default.

    Returns
    -------
    columns_missing : np.ndarray
        Column names sorted by descending missing ratio.
    percentage_missing : np.ndarray
        The missing ratios (0..1) in the same order.
    """

    percentage_missing = []
    # Columns that are completely filled (0) or completely missing (1).
    percentage_binary = []

    for col in df:
        value_ratios = df[col].value_counts(normalize=True, dropna=False).to_dict()
        # The ratio of -1 entries; 0 when the column contains no -1 at all.
        missing_ratio = value_ratios.get(-1, 0)
        percentage_missing.append(missing_ratio)

        if missing_ratio in (0, 1):
            percentage_binary.append(int(missing_ratio))

    # Sort columns from most to least missing.
    percentage_missing = np.array(percentage_missing)
    sorted_idx = np.argsort(-percentage_missing)
    percentage_missing = percentage_missing[sorted_idx]
    columns_missing = df.columns.to_numpy()[sorted_idx]

    # The all-or-nothing columns are drawn in red on top of the full histogram.
    plt.hist(percentage_missing, bins=100)
    plt.hist(percentage_binary, bins=100, color='red')
    plt.xlim(-0.05, 1.05)
    plt.xlabel('Ratio of missing data')
    plt.ylabel('Number of columns')
    plt.title(title)
    plt.legend(['Contain both data and missing data', 'Contain either no missing data or no data at all'], loc=9)
    plt.tight_layout()
    if save_path is not None:
        # Save before show(): showing may leave nothing to save afterwards.
        plt.savefig(save_path)
    plt.show()

    return columns_missing, percentage_missing


clinical_df = read_clinical_df(CLINICAL_DATA)

# Keep only rows that have a daily time (rows without one are duplicates of
# the row above), then encode every missing marker as -1.
# NOTE(review): 99 is replaced in every column, so a genuine measurement of 99
# is also counted as missing - confirm 99 is always a "missing" code.
has_daily_time = clinical_df['dly_time'].notna()
df = clinical_df[has_daily_time].replace(to_replace=99, value=-1).fillna(-1)

# Restrict to the record id plus the admission and daily variables.
df = df.loc[:, df.columns.str.startswith(('record_id', 'recru', 'dly'))]

columns_missing, percentage_missing = plot_missing_values(df)
del df


In [None]:

# `columns_missing` and `percentage_missing` come from the preceding
# "Plot missing values" cell, which loads the same CSV and applies the same
# cleaning. They are reused here instead of re-reading the file and drawing
# the identical histogram a second time.

# Thresholds are built from integer percentages. np.arange(0, 1.05, 0.05)
# accumulates float error (e.g. 3 * 0.05 == 0.15000000000000002), which would
# exclude a column whose missing ratio sits exactly on a threshold.
threshold_percentages = np.arange(0, 105, 5)

# Number of columns whose missing ratio is at or above each threshold.
data = [percentage_missing[percentage_missing >= pct / 100].size
        for pct in threshold_percentages]

plt.bar(threshold_percentages, data, width=4)
plt.xlabel('Percentage of missing values')
plt.ylabel('Number of columns in dataset')
plt.title('Number of columns remaining after thresholding')
plt.tight_layout()
plt.show()


In [None]:

# `columns_missing` and `percentage_missing` come from the earlier
# "Plot missing values" cell (same CSV, same cleaning). They are reused here
# instead of re-reading the file and re-drawing the same histogram.

t = 0.8
print(len(columns_missing))

# Columns whose missing ratio is at least t.
high_missing_columns = columns_missing[percentage_missing >= t]

# Name-based flags; dtype=bool keeps the masks boolean even when no column
# reaches the threshold.
num_cols = np.array([any(c.isdigit() for c in name) for name in high_missing_columns], dtype=bool)
other_cols = np.array([('other' in name) or ('spec' in name) for name in high_missing_columns], dtype=bool)

# Columns matching either pattern. The original `num_cols + other_cols` on
# boolean arrays is a logical OR, i.e. a union, so the last bar is labelled
# 'Union' rather than 'Intersection'.
combined = num_cols | other_cols

x = range(4)
y = [len(combined), num_cols.sum(), other_cols.sum(), combined.sum()]
plt.bar(x, y)

# Write each count just above its bar.
for i, v in enumerate(y):
    plt.text(x[i]-0.1, v + 3, str(v))

plt.xticks(range(4), ['All columns', 'Column name\ncontains number', 'Column name\ncontains\n["other", "spec"]', 'Union'])
plt.ylabel('Number of columns')
plt.title(f'Columns where NaN ratio >= {t}')
plt.tight_layout()
plt.show()
plt.close()


In [None]:
# For each missing-ratio threshold, count the columns at or above it and how
# many of those have a name containing a digit and/or 'other' / 'spec'.
# Uses `columns_missing` / `percentage_missing` from the cells above.

plt.figure(figsize=(8, 5))

# Built from integers so the thresholds are the nearest floats to 0.0 ... 1.0.
# np.arange(0, 1.1, 0.1) yields e.g. 0.30000000000000004, which would exclude
# a column whose missing ratio is exactly 0.3.
thresholds = np.arange(0, 11) / 10

for t in thresholds:
    over_threshold = columns_missing[percentage_missing >= t]

    # dtype=bool keeps the masks boolean even when no column qualifies.
    num_cols = np.array([any(c.isdigit() for c in name) for name in over_threshold], dtype=bool)
    other_cols = np.array([('other' in name) or ('spec' in name) for name in over_threshold], dtype=bool)

    # Either pattern matches (logical OR): a union, labelled as such below.
    combined = num_cols | other_cols

    # Shift each threshold's bars sideways so the groups sit next to each other.
    offset = (t*0.8) - 0.35
    x = [0 + offset, 1 + offset, 2 + offset, 3 + offset]
    y = [len(combined), num_cols.sum(), other_cols.sum(), combined.sum()]

    plt.bar(x, y, width=0.08, align='center')


plt.xticks(range(4), ['All columns', 'Column name\ncontains number', 'Column name\ncontains\n["other", "spec"]', 'Union'])
plt.ylabel('Number of columns')
plt.legend(thresholds.round(1))
plt.title('Columns where NaN ratio >= threshold')
plt.tight_layout()
plt.show()
plt.close()


#### Plot Blantyre Coma Score

In [None]:

fig = plt.figure()

# One panel per Blantyre Coma Score column. For each column only the values
# 0 .. n_levels-1 are plotted; NaN and any other value are left out.
bcs_panels = [
    ('dly_bcs_e', 2),
    ('dly_bcs_m', 3),
    ('dly_bcs_v', 3),
    ('dly_bcs_total', 6),
]

for panel_number, (bcs_column, n_levels) in enumerate(bcs_panels, start=1):
    plt.subplot(2, 2, panel_number)
    value_counts = clinical_df[bcs_column].value_counts(dropna=True).to_dict()
    data = {k: v for k, v in value_counts.items() if k in range(n_levels)}
    plt.bar(data.keys(), data.values())
    plt.title(bcs_column)

fig.suptitle('Blantyre Coma Scores')
plt.tight_layout()
plt.show()


#### Plot AVPU

In [None]:

# --- Admission AVPU: one score per patient (first row of each record) ---
# Missing scores are filled with 0 and code 99 is folded into 0 below, so 0
# becomes the "Missing/Unknown" bucket.
avpu_recru = dict.fromkeys((0, 1, 2, 3, 4, 99), 0)
data_recru = clinical_df[['record_id', 'recru_avpu_score']].fillna(value=0)
for _, patient_rows in data_recru.groupby('record_id'):
    avpu_recru[patient_rows.iloc[0, 1]] += 1

avpu_recru[0] += avpu_recru.pop(99)

print(avpu_recru)

# Codes and counts as arrays, ordered by code.
recru_codes = sorted(avpu_recru)
data_recru_x = np.array(recru_codes)
data_recru_y = np.array([avpu_recru[code] for code in recru_codes])

# --- Daily AVPU: every daily row counts; NaN and 99 both become 0 ---
data_dly = (
    clinical_df['dly_avpu']
    .fillna(value=0)
    .replace(99, 0)
    .value_counts(dropna=False)
    .to_dict()
)

dly_codes = sorted(data_dly)
data_dly_x = np.array(dly_codes)
data_dly_y = np.array([data_dly[code] for code in dly_codes])

# Admission and daily counts are added per code.
# NOTE(review): this only lines up if the daily data contains exactly the
# same five codes (0-4) - confirm.
data_y = data_recru_y + data_dly_y

plt.bar(['Missing/\nUnknown', 'Alert', 'Voice', 'Pain', 'Unresponsive'], data_y)
plt.ylabel('N occurences')

plt.title('AVPU scores in the admission and daily data')
plt.savefig('./avpu_stats')
plt.show()


#### Plot Capillary Refill Time

In [None]:

# --- Admission capillary refill time: first row of each patient ---
crt_recru = defaultdict(int)
data_recru = clinical_df[['record_id', 'recru_cr_time_result']].fillna(value=0)
for _, v in data_recru.groupby('record_id'):
    crt_recru[v.iloc[0, 1]] += 1


print(crt_recru)

# --- Daily capillary refill time ---
# NaN values are seen as 0. fillna() returns a new Series here; the previous
# in-place fill on clinical_df['dly_crt'] could silently overwrite the NaNs in
# clinical_df itself, depending on the pandas version.
crt_data = clinical_df['dly_crt'].fillna(value=0)

# Remove biologically impossible values (12 s and above).
crt_data = crt_data[crt_data < 12]

crt_counts = crt_data.value_counts(dropna=True)
x = crt_counts.keys().to_numpy()
y = crt_counts.values

data_recru_x = np.array(list(crt_recru.keys()))
data_recru_y = np.array(list(crt_recru.values()))

# Order both series by refill time.
idx = np.argsort(data_recru_x)
data_recru_x = data_recru_x[idx]
data_recru_y = data_recru_y[idx]

idx = np.argsort(x)
data_dly_x = x[idx]
data_dly_y = y[idx]

# Pad both series with zero counts so they cover the same refill times.
# NOTE(review): the padding is hard-coded (7, 8, 9 appended to the admission
# data; 7 and 8 inserted before the last daily value), so it presumably
# matches the values present in this particular dataset - confirm.
data_recru_x = np.append(data_recru_x, 7)
data_recru_x = np.append(data_recru_x, 8)
data_recru_x = np.append(data_recru_x, 9)
data_recru_y = np.append(data_recru_y, 0)
data_recru_y = np.append(data_recru_y, 0)
data_recru_y = np.append(data_recru_y, 0)

data_dly_x = np.insert(data_dly_x, -1, 7)
data_dly_x = np.insert(data_dly_x, -1, 8)
data_dly_y = np.insert(data_dly_y, -1, 0)
data_dly_y = np.insert(data_dly_y, -1, 0)

print(data_recru_x, data_dly_x)
print(data_recru_y, data_dly_y)

# Combined admission + daily counts; computed but not plotted below.
data_y = data_recru_y + data_dly_y


print(x)
print(y)


# Only the daily counts (x, y) are drawn.
plt.xlim(-1, 10)
plt.xticks(range(10), ['Missing/\nUnknown', 1, 2, 3, 4, 5, 6, 7, 8, 9])
plt.bar(x, y)

plt.xlabel('Capillary refill time (s)')
plt.ylabel('N occurences')
plt.title('CRT')
plt.tight_layout()
plt.savefig('./crt_stats')
plt.show()


#### Plot NaN columns without time

In [None]:


# Missing values of the admission ('recru') and discharge ('dis') variables,
# counted once per patient rather than once per daily row.

# Remove rows that have no daily time (as those are duplicates of the row above)
df = clinical_df[clinical_df['dly_time'].notna()]
df = df.loc[:, df.columns.str.startswith(('recru', 'dis', 'record_id'))]

# Replace missing/NaN entries with -1.
# NOTE(review): 99 is replaced in every column, including record_id and
# genuine measurements equal to 99 - confirm 99 is always a "missing" code.
df = df.replace(to_replace=99, value=-1).fillna(-1)

# Keep only the first row of every patient. drop_duplicates needs no
# reset_index(), so no extra 'index' column is added; that column was
# previously counted as a fully populated variable in the histogram.
df = df.drop_duplicates(subset='record_id', keep='first')

plot_missing_values(df)
del df


#### Plot missing values of yesno columns

In [None]:

def intersection(l1, l2):
    """
    Return the elements of `l1` that also occur in `l2`.

    The order (and any duplicates) of `l1` is preserved.
    """
    shared = []
    for candidate in l1:
        if candidate in l2:
            shared.append(candidate)
    return shared

clinical_df = read_clinical_df(CLINICAL_DATA)

# Remove rows that have no daily time (as those are duplicates of the row
# above), encode missing/NaN entries as -1 and renumber the rows.
df = (
    clinical_df[clinical_df['dly_time'].notna()]
    .replace(to_replace=99, value=-1)
    .fillna(-1)
    .reset_index()
)

# Overview: number of variables per field type in the data dictionary.
for field_type, variables in dtype_dict.items():
    print(field_type, len(variables))

# Only dictionary variables that actually exist as data columns can be plotted.
# NOTE(review): the section header mentions 'yesno' columns, but the 'text'
# field type is selected here - confirm which one is intended.
valid_columns = intersection(dtype_dict['text'], clinical_df.columns.to_list())
plot_missing_values(df[valid_columns])
del df


#### Plot text variables

In [None]:

clinical_df = read_clinical_df(CLINICAL_DATA)

# Keep rows that have a daily time, then only the record id plus the
# admission ('recru') and daily ('dly') columns.
has_daily_time = clinical_df['dly_time'].notna()
df = clinical_df[has_daily_time]
df = df.loc[:, df.columns.str.startswith(('recru', 'dly', 'record_id'))]

print(df.shape)
# Keep a hand-picked list of columns.
# NOTE(review): presumably these are the variables of field type 'text' in
# the data dictionary - TODO confirm and, if so, derive the list from
# dtype_dict instead of hard-coding it.
df = df[['record_id', 'recru_impala_id', 'recru_staff_initials', 'recru_interview_date_', 'recru_hospital_admission_date', 'recru_hdu_admission_date', 'recru_admitted_from_specify', 'recru_dob', 'recru_age_months', 'recru_weight_kg', 'recru_length_cm', 'recru_muac_cm', 'recru_chronic_disability_yes', 'recru_mother_age_years', 'recru_father_age_in_years', 'recru_siblings_alive_num', 'recru_siblings_died_num', 'recru_child_residence', 'recru_hospital_travel_distance', 'recru_cough_num', 'recru_difficult_in_b_v_3', 'recru_fastbreathing_days', 'recru_fever_days', 'recru_vomitting_days', 'recru_diarrhoea_days', 'recru_weight_loss_days', 'recru_not_eating_dys', 'recru_pallor_days', 'recru_jaundice_days', 'recru_trauma_days', 'recru_poisoning_into_v_4', 'recru_other_specify', 'recru_respiratory_rate_min', 'recru_oxygen_saturation', 'recru_heart_rate_result', 'recru_temperature_result', 'recru_liver_palpable_cm', 'recru_spleen_palatable_cm', 'recru_abnormal_specify', 'recru_miscellaneous_specify', 'recru_haemato_cardial_specify', 'recru_neurology_specify', 'recru_skin_ent_specify', 'recru_miscellaneous_specify_1', 'recru_neurology_specify_1', 'recru_skin_ent_specify_1', 'recru_miscellaneous_specify_2', 'recru_neurology_specify_2', 'recru_miscellaneous_specify_3', 'recru_miscellaneous_specify_4', 'recru_neurology_specify_4', 'recru_miscellaneous_specify_5', 'recru_miscellaneous_specify_6', 'recru_pcv', 'recru_pcv_reason', 'recru_malaria_micr_reason', 'recru_malariardt_3', 'recru_hb', 'recru_wbc', 'recru_platelets', 'recru_fbc_reason', 'recru_blood_glucose', 'recru_blood_culture_reasn', 'recru_medications_specify', 'recru_cardiovascular_other', 'recru_git_other', 'recru_haematological_other', 'recru_musculoskeletal_other', 'recru_neurological_other', 'recru_pathogen_syndromes_other', 'recru_sepsis_syndromes_other', 'recru_surgical_other', 'recru_urinar_tract_other', 'recru_urti_ent_other', 'recru_other_infections_other', 'recru_other_conditions_other', 
'recru_cardiovascular_other_2', 'recru_git_other_2', 'recru_inflammatory_other_2', 'recru_neurological_other_2', 'recru_sepsis_syndromes_other_2', 'recru_surgical_other_2', 'recru_urinar_tract_other_2', 'recru_urti_ent_other_2', 'recru_other_infections_other_2', 'recru_other_conditions_other_2', 'recru_cardiovascular_other_3', 'recru_git_other_3', 'recru_neurological_other_3', 'recru_sepsis_syndromes_other_3', 'recru_surgical_other_3', 'recru_urti_ent_other_3', 'recru_other_infections_other_3', 'recru_cardiovascular_other_4', 'recru_soft_tissue_other_4', 'recru_other_infections_other_4', 'recru_git_other_5', 'recru_bloodculture_time', 'recru_bloodculture_reason', 'recru_lactate_sample_time', 'recru_study_lactate', 'recru_lactate_reason', 'recru_storage_spec_time', 'recru_storage_spec_reas', 'recru_nasal_swab_time', 'recru_nasal_swab_reas', 'recru_urine_time', 'recru_urine_reas', 'dly_impala_id', 'dly_time', 'dly_staff_id', 'dly_temp', 'dly_daily_weight', 'dly_monitor_num', 'dly_reason_nt_attached', 'dly_time_new_cie1a', 'dly_time_new_cie2', 'dly_time_new_cie3', 'dly_time_new_cie4', 'dly_time_new_cie5', 'dly_time_new_cie6', 'dly_blood_culture_results', 'dly_lactate_results', 'dly_blood_glucose', 'dly_glucose_test', 'dly_main_result', 'dly_main_result2', 'dly_cardiovascular_other', 'dly_git_other', 'dly_musculoskeletal_other', 'dly_neurological_other', 'dly_sepsis_syndromes_other', 'dly_urti_ent_other', 'dly_other_infections_other', 'dly_other_conditions_other', 'dly_time_drug', 'dly_other_conditions_other_1', 'dly_other_conditions_other_2']]

print(df.shape)

# Drop a second hand-picked list from the selection above.
# NOTE(review): judging by their names these look like numeric measurements
# (ages, counts, vital signs, lab values) - TODO confirm that "numeric" is the
# intended criterion.
df = df.drop(['recru_age_months', 'recru_weight_kg', 'recru_length_cm', 'recru_muac_cm', 'recru_mother_age_years', 'recru_father_age_in_years', 'recru_siblings_alive_num', 'recru_siblings_died_num', 'recru_hospital_travel_distance', 'recru_cough_num', 'recru_difficult_in_b_v_3', 'recru_fastbreathing_days', 'recru_fever_days', 'recru_vomitting_days', 'recru_diarrhoea_days', 'recru_weight_loss_days', 'recru_not_eating_dys', 'recru_pallor_days', 'recru_jaundice_days', 'recru_trauma_days', 'recru_poisoning_into_v_4', 'recru_respiratory_rate_min', 'recru_oxygen_saturation', 'recru_heart_rate_result', 'recru_temperature_result', 'recru_liver_palpable_cm', 'recru_spleen_palatable_cm', 'recru_neurology_specify_4', 'recru_pcv', 'recru_hb', 'recru_wbc', 'recru_platelets', 'recru_blood_glucose', 'recru_soft_tissue_other_4', 'recru_study_lactate', 'dly_temp', 'dly_daily_weight', 'dly_monitor_num', 'dly_lactate_results', 'dly_blood_glucose']
, axis=1)

print(df.shape)

# Drop every column whose name contains 'date', 'time' or 'dob'.
is_temporal_column = df.columns.str.contains('date|time|dob')
df = df.loc[:, ~is_temporal_column]

print(df.shape)

# Encode missing/NaN entries as -1.
df = df.replace(to_replace=99, value=-1).fillna(-1)

# drop=True: renumber the rows without adding the old index as a column. A
# plain reset_index() adds an 'index' column, which the histogram loop below
# would count as one extra, fully populated variable.
df = df.reset_index(drop=True)

# Ratio of missing (-1) entries for every column of `df`.
percentage_missing = []
# Columns that are completely filled (0) or completely missing (1).
percentage_binary = []

for col in df:
    value_ratios = df[col].value_counts(normalize=True, dropna=False).to_dict()

    if -1 not in value_ratios:
        # No missing entries at all in this column.
        percentage_missing.append(0)
        percentage_binary.append(0)
        continue

    percentage_missing.append(value_ratios[-1])
    if value_ratios[-1] == 1:
        percentage_binary.append(1)

# Sort columns from most to least missing.
percentage_missing = np.array(percentage_missing)
sorted_idx = np.argsort(-percentage_missing)
percentage_missing = percentage_missing[sorted_idx]
columns_missing = df.columns.to_numpy()[sorted_idx]

# The all-or-nothing columns are drawn in red on top of the full histogram.
plt.hist(percentage_missing, bins=100)
plt.hist(percentage_binary, bins=100, color='red')
plt.xlim(-0.05, 1.05)
plt.xlabel('Ratio of missing data')
plt.ylabel('Number of columns')
plt.title('Ratio of missing data in the text variables of admission and daily data')
plt.legend(
    ['Contain both data and missing data', 'Contain either no missing data or no data at all'],
    loc=9,
)
plt.tight_layout()
plt.savefig('./missing_data.png')
plt.show()



#### Plot missing values for recru, dly and dis

In [None]:
# Uses the `clinical_df` loaded by an earlier cell.
# Remove rows that have no daily time (as those are duplicates of the row
# above), encode missing/NaN entries as -1 and renumber the rows.
df = (
    clinical_df[clinical_df['dly_time'].notna()]
    .replace(to_replace=99, value=-1)
    .fillna(-1)
    .reset_index()
)

# Record id plus admission and daily variables. The discharge ('dis')
# variables are not plotted here.
selected_prefixes = ('recru', 'record_id', 'dly')
df_recru = df.loc[:, df.columns.str.startswith(selected_prefixes)]

print(df_recru.shape)

_, _ = plot_missing_values(df_recru)
