In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

# Path of the clinical data CSV; passed to read_clinical_df below
CLINICAL_DATA = './DATA/Clean Data/IMPALA_Clinical_Data_202308211019_Raw.csv'
# Path of the data dictionary CSV (read for its 'Variable' and 'Field Type'
# columns); passed to read_clinical_dtype_df below
CLINICAL_DTYPES = './DATA/IMPALAclinicalstudy_Dictionary_2023-09-25.csv'

#### Load data

In [None]:

def read_clinical_df(path):
    """
    Read the clinical CSV at `path` and return its contents as a DataFrame.

    The file is read with `low_memory=False`.
    """
    clinical_frame = pd.read_csv(path, low_memory=False)
    return clinical_frame


def read_clinical_dtype_df(path):
    """
    Load the clinical data dictionary and group its variables by field type.

    Only rows whose 'Variable' starts with 'recru', 'dly' or 'record_id' are
    kept; rows with an empty 'Variable' cell are skipped.

    Parameters
    ----------
    path : str or file-like
        Location of the dictionary CSV. It must contain the columns
        'Variable' and 'Field Type'.

    Returns
    -------
    df : pandas.DataFrame
        The dictionary rows that passed the prefix filter.
    dtype_dict : collections.defaultdict of list
        Maps every 'Field Type' value to the list of 'Variable' names of that
        type, in file order. Looking up an unknown type yields an empty list.
    """
    df = pd.read_csv(path)

    # na=False: an empty 'Variable' cell would otherwise put NaN into the
    # mask, and pandas refuses to index with a mask that contains NaN.
    wanted = df['Variable'].str.startswith(('recru', 'dly', 'record_id'), na=False)
    df = df[wanted]

    dtype_dict = defaultdict(list)

    for field_type, variable in zip(df['Field Type'], df['Variable']):
        dtype_dict[field_type].append(variable)

    return df, dtype_dict


# Load the clinical data and the dictionary that describes its variables
clinical_df = read_clinical_df(CLINICAL_DATA)
dtype_df, dtype_dict = read_clinical_dtype_df(CLINICAL_DTYPES)

# The checkbox variables named in dtype_df do not map one-to-one onto columns
# of the clinical data, so the matching data columns are listed here, either
# by name or by the '<variable>___' prefix they share.
checkbox_dict = {
    'recru_hdu_admission_reason': [
        'recru_resp',
        'recru_circu',
        'recru_neuro',
        'recru_nurse',
        'recru_unclear',
        'recru_hdu_other',
    ],
    'recru_medication_specfy': [
        name for name in clinical_df.columns
        if name.startswith('recru_medication_specfy___')
    ],
    'dly_new_drug': [
        name for name in clinical_df.columns
        if name.startswith('dly_new_drug___')
    ],
}

# Flatten the per-variable lists into the single 'checkbox' entry
dtype_dict['checkbox'] = [
    column for columns in checkbox_dict.values() for column in columns
]


#### Plot missing values

In [None]:

def plot_missing_values(df):
    """
    Plot a histogram of the fraction of missing values per column.

    A cell counts as missing when it equals -1, so callers must encode their
    missing entries as -1 before calling. Columns that are partly missing
    (0 < x < 1) are drawn in the default colour; columns with no or only
    missing values (x = 0 or x = 1) are drawn in red. Both histograms use the
    same 100 bins on [0, 1] so their bars line up.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame in which missing entries are encoded as -1.

    Returns
    -------
    columns_missing : numpy.ndarray
        Column labels of `df`, ordered from most to least missing.
    percentage_missing : numpy.ndarray
        Fraction (0..1) of missing values per column, in the same order.
    """
    # Share of cells per column that equal -1. A frame without rows gives NaN
    # means; such columns are reported as having nothing missing.
    fraction_missing = (df == -1).mean().fillna(0).to_numpy(dtype=float)

    # Descending order; a stable sort keeps tied columns in their frame order
    sorted_idx = np.argsort(-fraction_missing, kind='stable')
    percentage_missing = fraction_missing[sorted_idx]
    columns_missing = df.columns.to_numpy()[sorted_idx]

    # Split the columns so that each histogram shows what its legend entry says
    at_extreme = (percentage_missing == 0) | (percentage_missing == 1)

    plt.hist(percentage_missing[~at_extreme], bins=100, range=(0, 1))
    plt.hist(percentage_missing[at_extreme], bins=100, range=(0, 1), color='red')
    plt.xlim(-0.05, 1.05)
    plt.xlabel('Percentage of missing values')
    plt.ylabel('Number of columns')
    plt.title('Percentages of missing values of the data')
    plt.legend(['0 < x < 1', 'x = (0 or 1)'], loc=9)
    plt.tight_layout()
    plt.show()

    return columns_missing, percentage_missing


clinical_df = read_clinical_df(CLINICAL_DATA)

# Keep only the rows that have a daily time (rows without one are duplicates
# of the row above), then encode the value 99 and NaN as -1, the value that
# plot_missing_values counts as missing.
df = (
    clinical_df[clinical_df['dly_time'].notna()]
    .replace(to_replace=99, value=-1)
    .fillna(-1)
)

# Restrict to the record id plus the 'recru' and 'dly' columns
df = df.loc[:, df.columns.str.startswith(('record_id', 'recru', 'dly'))]

columns_missing, percentage_missing = plot_missing_values(df)
del df


In [None]:

clinical_df = read_clinical_df(CLINICAL_DATA)

# Remove rows that have no daily time (as those are duplicates of the row
# above) and encode the value 99 and NaN as -1
df = clinical_df[clinical_df['dly_time'].notna()]
df = df.replace(to_replace=99, value=-1).fillna(-1)

df = df[df.columns[df.columns.str.startswith(('record_id', 'recru', 'dly'))]]
columns_missing, percentage_missing = plot_missing_values(df)
del df

# Thresholds 0 %, 5 %, ..., 100 % are built from whole percents. Stepping a
# float (np.arange(0, 1.05, 0.05)) produces values such as
# 0.15000000000000002, so a column with exactly 15 % missing would wrongly
# fail the `>=` comparison.
threshold_percents = np.arange(0, 105, 5)
data = [int(np.count_nonzero(percentage_missing >= percent / 100))
        for percent in threshold_percents]

plt.bar(threshold_percents, data, width=4)
plt.xlabel('Percentage of missing values')
plt.ylabel('Number of columns in dataset')
plt.title('Number of columns remaining after thresholding')
plt.tight_layout()
plt.show()


In [None]:

t = 0.8

# Names of the columns whose fraction of missing values is at least t
sparse_columns = columns_missing[percentage_missing >= t]

# dtype=bool keeps the masks boolean (and the counts integer) even when no
# column passes the threshold and the lists are empty.
num_cols = np.array([any(c.isdigit() for c in name) for name in sparse_columns],
                    dtype=bool)
other_cols = np.array([('other' in name) or ('spec' in name) for name in sparse_columns],
                      dtype=bool)

# Columns matching at least one of the two name patterns
combined = num_cols | other_cols

x = range(4)
y = [len(combined), num_cols.sum(), other_cols.sum(), combined.sum()]
plt.bar(x, y)

# Print the count above every bar
for i, v in enumerate(y):
    plt.text(x[i]-0.1, v + 3, str(v))

# The last bar counts columns matching either pattern, i.e. the union
plt.xticks(range(4), ['All columns', 'Column name\ncontains number', 'Column name\ncontains\n["other", "spec"]', 'Union'])
plt.ylabel('Number of columns')
plt.title(f'Columns where NaN ratio >= {t}')
plt.tight_layout()
plt.show()
plt.close()


In [None]:
# For a range of thresholds, count how many of the columns with at least that
# fraction of missing values have a digit, or 'other' / 'spec', in their name.

plt.figure(figsize=(8, 5))

# Thresholds 0.0, 0.1, ..., 1.0 built from integers: stepping a float
# (np.arange(0, 1.1, 0.1)) gives values such as 0.30000000000000004, which
# makes the `>=` comparison miss columns sitting exactly on a threshold.
thresholds = np.arange(0, 11) / 10

for t in thresholds:
    sparse_columns = columns_missing[percentage_missing >= t]

    # dtype=bool keeps the masks boolean when no column passes the threshold
    num_cols = np.array([any(c.isdigit() for c in name) for name in sparse_columns],
                        dtype=bool)
    other_cols = np.array([('other' in name) or ('spec' in name) for name in sparse_columns],
                          dtype=bool)

    # Columns matching at least one of the two name patterns
    combined = num_cols | other_cols

    # Shift the bars of each threshold sideways so they sit next to each other
    offset = (t*0.8) - 0.35
    x = [0 + offset, 1 + offset, 2 + offset, 3 + offset]
    y = [len(combined), num_cols.sum(), other_cols.sum(), combined.sum()]

    plt.bar(x, y, width=0.08, align='center')


# The last group counts columns matching either pattern, i.e. the union
plt.xticks(range(4), ['All columns', 'Column name\ncontains number', 'Column name\ncontains\n["other", "spec"]', 'Union'])
plt.ylabel('Number of columns')
plt.legend(thresholds)
plt.title('Columns where NaN ratio >= threshold')
plt.tight_layout()
plt.show()
plt.close()


#### Plot Blantyre Coma Score

In [None]:

fig = plt.figure()

# One panel per Blantyre Coma Score column. Only the whole-number scores
# 0 .. (number of levels - 1) are plotted; any other recorded value is left out.
bcs_panels = [
    ('dly_bcs_e', 2),
    ('dly_bcs_m', 3),
    ('dly_bcs_v', 3),
    ('dly_bcs_total', 6),
]

for panel, (column, n_levels) in enumerate(bcs_panels, start=1):
    plt.subplot(2, 2, panel)
    counts = clinical_df[column].value_counts(dropna=True).to_dict()
    data = {score: n for score, n in counts.items() if score in range(n_levels)}
    plt.bar(data.keys(), data.values())
    plt.title(column)

fig.suptitle('Blantyre Coma Scores')
plt.tight_layout()
plt.show()


#### Plot AVPU

In [None]:

# Plot the bar heights as log10 of the counts when True, raw counts otherwise
LOG_AVPU = True

# recru_avpu_score: NaN and the value 99 are both counted under 0, the
# "Missing/Unknown" tick, in the same way as for dly_avpu below. (Previously
# the NaN-filled series was discarded and a missing 99 raised a KeyError.)
data_recru = clinical_df['recru_avpu_score'].fillna(value=0)
data_recru = data_recru.replace(99, 0)
data_recru = data_recru.value_counts(dropna=False).to_dict()

data_recru_x = np.array(list(data_recru.keys()))
data_recru_y = np.array(list(data_recru.values()))

# dly_avpu: NaN and 99 are counted under 0 as well
data_dly = clinical_df['dly_avpu'].fillna(value=0)
data_dly = data_dly.replace(99, 0)
data_dly = data_dly.value_counts(dropna=False).to_dict()

data_dly_x = np.array(list(data_dly.keys()))
data_dly_y = np.array(list(data_dly.values()))


if LOG_AVPU:
    data_recru_y = np.log10(data_recru_y)
    data_dly_y = np.log10(data_dly_y)
    plt.ylabel('Log( N occurences )')

else:
    plt.ylabel('N occurences')


# Draw the two series side by side around every integer score
plt.bar(data_recru_x-0.2, data_recru_y, width=0.4)
plt.bar(data_dly_x+0.2, data_dly_y, width=0.4)

plt.xticks(range(5), labels=['Missing/\nUnknown', 'Alert', 'Voice', 'Pain', 'Unresponsive'])
plt.legend(['recru_avpu_score', 'dly_avpu'])
plt.title('AVPU')
plt.show()


#### Plot Capillary Refill Time

In [None]:

# NaN values are counted as 0. fillna returns a new Series here, so the
# 'dly_crt' column of clinical_df keeps its NaN entries for later cells
# (an in-place fill on the column could write through to clinical_df).
crt_data = clinical_df['dly_crt'].fillna(value=0)

# Remove biologically impossible values
crt_data = crt_data[crt_data < 12]

# Count every distinct refill time once and reuse the result for both axes
crt_counts = crt_data.value_counts(dropna=True)
x = crt_counts.keys().to_numpy()
y = crt_counts.values

plt.xlim(-1, 13)
plt.xticks(range(13))
plt.bar(x, y)

plt.xlabel('Capillary refill time (s)')
plt.ylabel('N occurences')
plt.title('CRT')
plt.legend(['dly_crt'])
plt.tight_layout()
plt.show()


#### Plot NaN columns without time

In [None]:


# Uses the clinical_df loaded by an earlier cell.

# Remove rows that have no daily time (as those are duplicates of the row above)
df = clinical_df[clinical_df['dly_time'].notna()]
df = df[df.columns[df.columns.str.startswith(('recru', 'dis', 'record_id'))]]

# Replace missing/NaN entries
# NOTE(review): the replacement also applies to 'record_id', so a record whose
# id equals 99 would become -1 - confirm that no such id exists.
df = df.replace(to_replace=99, value=-1).fillna(-1)

# Keep only the first row of every record. No index column is added to the
# frame, so the histogram counts data columns only.
df = df.drop_duplicates(subset='record_id', keep='first')

_, _ = plot_missing_values(df)
del df


#### Plot missing values of yesno columns

In [None]:

def intersection(l1, l2):
    """ Return the items of l1 that also occur in l2, in the order of l1. """
    shared = []
    for candidate in l1:
        if candidate in l2:
            shared.append(candidate)
    return shared

clinical_df = read_clinical_df(CLINICAL_DATA)

# Keep only the rows that have a daily time (rows without one are duplicates
# of the row above), encode the value 99 and NaN as -1, and renumber the rows.
df = (
    clinical_df[clinical_df['dly_time'].notna()]
    .replace(to_replace=99, value=-1)
    .fillna(-1)
    .reset_index()
)

# Show how many variables the dictionary lists per field type
for field_type, variables in dtype_dict.items():
    print(field_type, len(variables))

# Only the 'yesno' variables that exist as a column in the clinical data
valid_columns = intersection(dtype_dict['yesno'], clinical_df.columns.to_list())
plot_missing_values(df[valid_columns])
del df


#### Plot missing values for recru, dly and dis

In [None]:
# clinical_df = read_clinical_df(CLINICAL_DATA)

# Keep only the rows that have a daily time (rows without one are duplicates
# of the row above), encode the value 99 and NaN as -1, and renumber the rows.
df = (
    clinical_df[clinical_df['dly_time'].notna()]
    .replace(to_replace=99, value=-1)
    .fillna(-1)
    .reset_index()
)

# Split the columns by name prefix
df_recru = df.loc[:, df.columns.str.startswith('recru')]
df_dly = df.loc[:, df.columns.str.startswith('dly')]
df_dis = df.loc[:, df.columns.str.startswith('dis')]

for subset in (df_recru, df_dly, df_dis):
    print(subset.shape)

# One histogram of missing values per group of columns
for subset in (df_recru, df_dly, df_dis):
    _, _ = plot_missing_values(subset)
