In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import pickle
from tqdm import tqdm

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Path of the clinical data CSV; read by read_clinical_df.
CLINICAL_DATA = './IMPALA_Clinical_Data/Clean Data/IMPALA_Clinical_Data_202308211019_Raw.csv'
# Path of the data-dictionary CSV (variable names and their field types);
# read by read_clinical_dtype_df.
CLINICAL_DTYPES = './IMPALA_Clinical_Data/IMPALAclinicalstudy_Dictionary_2023-09-25.csv'

#### Load data

In [None]:

def read_clinical_df(path):
    """
    Read the clinical data CSV stored at `path`.

    :param path: location of the CSV file.
    :return: a Pandas DataFrame holding the file's contents.
    """
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk.
    clinical_frame = pd.read_csv(path, low_memory=False)
    return clinical_frame


def read_clinical_dtype_df(path):
    """
    Load the clinical data dictionary and group its variable names by field type.

    Only rows whose 'Variable' starts with 'recru', 'dly' or 'record_id' are
    kept.

    :param path: location of the dictionary CSV; it must contain the columns
                 'Variable' and 'Field Type'.
    :return df: the filtered dictionary as a Pandas DataFrame.
    :return dtype_dict: a defaultdict(list) mapping every 'Field Type' value to
                        the variable names of that type, in file order.
    """

    df = pd.read_csv(path)

    # na=False: a row with an empty 'Variable' cell counts as "no match".
    # Without it the mask contains NaN and the boolean indexing below raises.
    wanted = df['Variable'].str.startswith(('recru', 'dly', 'record_id'), na=False)
    df = df[wanted]

    dtype_dict = defaultdict(list)

    for field_type, variable in zip(df['Field Type'], df['Variable']):
        dtype_dict[field_type].append(variable)

    return df, dtype_dict


In [None]:

# Load the clinical records and the data dictionary describing their variables
clinical_df = read_clinical_df(CLINICAL_DATA)
dtype_df, dtype_dict = read_clinical_dtype_df(CLINICAL_DTYPES)

# The checkbox names listed in dtype_df do not match the clinical data columns
# one-to-one, so each checkbox field is mapped to its clinical columns by hand
# (explicit list) or by column-name prefix.
checkbox_dict = {
    'recru_hdu_admission_reason': [
        'recru_resp', 'recru_circu', 'recru_neuro',
        'recru_nurse', 'recru_unclear', 'recru_hdu_other',
    ],
    'recru_medication_specfy': [
        col for col in clinical_df.columns
        if col.startswith('recru_medication_specfy___')
    ],
    'dly_new_drug': [
        col for col in clinical_df.columns
        if col.startswith('dly_new_drug___')
    ],
}

# Register the flattened list of checkbox columns under the 'checkbox' type
dtype_dict['checkbox'] = sum(checkbox_dict.values(), [])


#### Count CIE (CPR, PICU and death)

In [None]:

def get_cie_columns(df):
    """
    Collect the names of the columns that contain information about CIEs.

    :param df: DataFrame whose column names are inspected.
    :return cie_columns: a dictionary with the keys 'static' and 'temporal',
                         each holding a list of column names selected by
                         prefix. 'dly_time' is always appended to the
                         temporal list.
    """

    static_prefixes = ('recru_cie_other', 'recru_circulatory',
                       'dis_outcome', 'dis_hospital_discharge',
                       'dis_date_of_intervw',
                       'recru_hospital_admission_date')
    temporal_prefixes = ('dly_cie_other', 'dly_circulatory')

    names = df.columns

    static_columns = names[names.str.startswith(static_prefixes)].to_list()
    temporal_columns = names[names.str.startswith(temporal_prefixes)].to_list()

    # The daily timestamp belongs to the temporal data
    temporal_columns.append('dly_time')

    return {'static': static_columns, 'temporal': temporal_columns}


def split_per_patient(df, cie_columns):
    """
    Group the rows of `df` by 'record_id' and, for every patient, select the
    column sets listed in `cie_columns`.

    :param df: DataFrame containing a 'record_id' column.
    :param cie_columns: a dictionary mapping a name (e.g. 'static') to a list
                        of column names.
    :return patient_dict: a dictionary keyed by record id; each value maps the
                          names of cie_columns to that patient's DataFrame
                          restricted to the corresponding columns.
    """

    # Grouping on a one-element list yields one-element tuple keys, hence [0]
    return {
        group_key[0]: {kind: group[names] for kind, names in cie_columns.items()}
        for group_key, group in df.groupby(['record_id'], observed=True)
    }


def clean_cie_dataframes(patient_dict, freq='T'):
    """
    Clean the CIE information captured in the patient_dict. Since some variables
    are temporal and others static, they need to be processed differently.

    Static data: the first row is turned into a plain dictionary, the
    admission and discharge dates are parsed and rounded, and remaining
    missing values become 0.
    Temporal data: 'dly_time' becomes a rounded datetime index, rows without
    a daily time are dropped and missing values become 0.

    The 'static' and 'temporal' entries of patient_dict are replaced with the
    cleaned versions; the DataFrames that were passed in are left untouched.

    :param patient_dict: a dictionary containing DataFrames with static
                         and temporal information regarding CIEs, per patient.
    :param freq: time unit to which dates are rounded (T = minutes, H = hours;
                 any pandas frequency string is accepted).
    :return patient_dict: the cleaned input dictionary.
    """

    # The bare 'T' and 'H' aliases are deprecated in recent pandas releases;
    # translate them to the equivalent 'min' / 'h' so the default keeps working.
    freq = {'T': 'min', 'H': 'h'}.get(freq, freq)

    for record_id, patient_df in patient_dict.items():

        ### Clean static DataFrame ###
        # Static values are taken from the patient's first row
        static = patient_df['static'].iloc[0, :].to_dict()

        # If no exact discharge time is given, use the interview date instead.
        # The interview date entry is only removed in this fallback case.
        if pd.isna(static['dis_hospital_discharge']):
            static['dis_hospital_discharge'] = static['dis_date_of_intervw']
            del static['dis_date_of_intervw']

        # Turn string dates into Pandas Timestamps
        for date_key in ('dis_hospital_discharge', 'recru_hospital_admission_date'):
            static[date_key] = pd.to_datetime(
                static[date_key], format='mixed').round(freq=freq)

        # Replace NaN values with 0. This also turns a date that could not be
        # determined (NaT) into 0.
        static = {k: (0 if pd.isna(v) else v) for k, v in static.items()}

        patient_dict[record_id]['static'] = static

        ### Clean temporal DataFrame ###
        # Set daily time as index. set_index returns a new DataFrame, so the
        # caller's DataFrame keeps its 'dly_time' column.
        temporal = patient_df['temporal'].set_index('dly_time')
        temporal = temporal[temporal.index.notnull()]
        temporal.index = pd.to_datetime(temporal.index, format='mixed').round(freq=freq)

        # Replace NaN values with 0
        temporal = temporal.fillna(value=0)

        patient_dict[record_id]['temporal'] = temporal

    return patient_dict


def process_cpr(dfs):
    """
    Collect the timepoints at which a CPR was recorded for a patient.

    :param dfs: a dictionary with a 'static' dictionary and a 'temporal'
                DataFrame of one patient.
    :return: a list with the index value of every temporal row that has a 4
             in a 'dly_circulatory' column, followed by the admission date if
             any static 'circulatory' entry equals 4.
    """

    static = dfs['static']
    temporal = dfs['temporal']
    admission_time = static['recru_hospital_admission_date']

    # Daily
    is_cpr_column = temporal.columns.str.startswith('dly_circulatory')
    has_cpr = (temporal.loc[:, is_cpr_column] == 4).any(axis=1).to_numpy()
    timepoints = temporal.index[has_cpr].tolist()

    # Recruitment
    cpr_at_recruitment = any('circulatory' in key and value == 4
                             for key, value in static.items())
    if cpr_at_recruitment:
        timepoints.append(admission_time)

    return timepoints


def process_picu(dfs):
    """
    Collect the timepoints at which a PICU event was recorded for a patient.

    :param dfs: a dictionary with a 'static' dictionary and a 'temporal'
                DataFrame of one patient.
    :return: a list with the index value of every temporal row that has a 2
             in a 'dly_cie_other' column, followed by the admission date if
             any static 'cie_other' entry equals 2.
    """

    static = dfs['static']
    temporal = dfs['temporal']
    admission_time = static['recru_hospital_admission_date']

    # Daily
    is_cie_column = temporal.columns.str.startswith('dly_cie_other')
    has_picu = (temporal.loc[:, is_cie_column] == 2).any(axis=1).to_numpy()
    timepoints = temporal.index[has_picu].tolist()

    # Recruitment
    picu_at_recruitment = any('cie_other' in key and value == 2
                              for key, value in static.items())
    if picu_at_recruitment:
        timepoints.append(admission_time)

    return timepoints


def process_death(dfs):
    """
    Determine the timepoint at which a patient's death was recorded.

    Death can be reported at recruitment (a static 'cie_other' entry equal
    to 4), in the daily records (a 4 in a 'dly_cie_other' column) or at
    discharge ('dis_outcome' equal to 2).

    :param dfs: a dictionary with a 'static' dictionary and a 'temporal'
                DataFrame of one patient.
    :return: an empty list if no death was recorded, otherwise a list with
             one element: the earliest recorded timepoint.
    """

    static = dfs['static']
    temporal = dfs['temporal']
    admission_time = static['recru_hospital_admission_date']
    discharge_time = static['dis_hospital_discharge']

    # Daily
    is_cie_column = temporal.columns.str.startswith('dly_cie_other')
    has_death = (temporal.loc[:, is_cie_column] == 4).any(axis=1).to_numpy()
    candidates = temporal.index[has_death].tolist()

    # Recruitment
    if any('cie_other' in key and value == 4 for key, value in static.items()):
        candidates.append(admission_time)

    # Discharge
    if static['dis_outcome'] == 2:
        candidates.append(discharge_time)

    if not candidates:
        return candidates

    # Someone can only die once, so choose the earliest date. A date that was
    # missing in the data may be present as the placeholder 0, which cannot be
    # compared with a Timestamp; ignore placeholders whenever a real
    # timestamp is available.
    dated = [t for t in candidates if isinstance(t, pd.Timestamp)]
    return [min(dated or candidates)]


def get_cie_timepoints(patient_dict, save_file=False):
    """
    Build, for each patient, a dictionary with the timepoints of the CIEs
    (CPR, PICU and death) that were recorded.

    :param patient_dict: a dictionary containing DataFrames with static and
                         temporal information regarding CIEs, per patient.
    :param save_file: flag whether the resulting CIE dictionary should be
                      pickled to a file named 'CIE_dict' in the working
                      directory.
    :return cie_dict: a dictionary containing timepoints of CIEs per patient,
                      under the keys 'cpr', 'picu' and 'death'.
    """

    extractors = (
        ('cpr', process_cpr),
        ('picu', process_picu),
        ('death', process_death),
    )

    cie_dict = defaultdict(dict)

    for record_id, patient_df in patient_dict.items():
        for event_name, extract in extractors:
            cie_dict[record_id][event_name] = extract(patient_df)

    if save_file:
        with open('CIE_dict', 'wb') as file:
            pickle.dump(cie_dict, file, protocol=pickle.HIGHEST_PROTOCOL)

    return cie_dict


def get_all_cie(df, freq='T', save_file=False, show_count=True):
    """
    Create a dictionary containing the timepoints of all CIEs per patient.

    :param df: the clinical DataFrame.
    :param freq: time unit to which dates are rounded; passed on to
                 clean_cie_dataframes.
    :param save_file: passed on to get_cie_timepoints.
    :param show_count: flag whether the number of CPR, PICU and death events
                       (and their total) should be printed.
    :return cie_dict: a dictionary containing timepoints of CIEs per patient.
    """

    # Select CIE columns -> split per patient -> clean -> extract timepoints
    per_patient = split_per_patient(df, get_cie_columns(df))
    per_patient = clean_cie_dataframes(per_patient, freq=freq)
    cie_dict = get_cie_timepoints(per_patient, save_file=save_file)

    if show_count:
        events = list(cie_dict.values())
        n_cpr = sum(len(patient['cpr']) for patient in events)
        n_picu = sum(len(patient['picu']) for patient in events)
        n_death = sum(len(patient['death']) for patient in events)

        print("=== Critical Illness Events ===")
        print(f"- CPR  : {n_cpr}")
        print(f"- PICU : {n_picu}")
        print(f"- Death: {n_death}")
        print(f"- Total: {n_cpr + n_picu + n_death}")

    return cie_dict


In [None]:

# Extract the CIE timepoints of every patient (dates rounded with freq='T'),
# print the event counts and do not write the result to disk.
cie_dict = get_all_cie(clinical_df, freq='T', save_file=False, show_count=True)


#### Prepare DataType data

In [None]:

def intersection(l1, l2):
    """
    Create the intersection of two lists.

    :param l1: list whose elements are filtered; order and duplicates are kept.
    :param l2: list of elements that are allowed in the result.
    :return: a new list with the elements of l1 that also occur in l2.
    """
    try:
        # A set makes each membership test constant time instead of a scan of l2
        lookup = set(l2)
        return [v1 for v1 in l1 if v1 in lookup]
    except TypeError:
        # Unhashable elements cannot be used with a set; compare against the list
        return [v1 for v1 in l1 if v1 in l2]


def replace_missing_values(df, threshold=None, freq='T'):
    """
    Clean the missing/NaN values by replacing them with -1. The value 99 is
    treated as a missing-value code and is replaced with -1 as well. If a
    threshold is set, remove columns where the ratio of -1 entries exceeds
    this threshold.

    :param df: DataFrame whose columns can all be converted to float.
    :param threshold: maximum allowed ratio (0-1) of missing entries per
                      column, or None to keep all columns. A threshold of 0
                      removes every column that has any missing entry.
    :param freq: currently unused; kept so existing calls remain valid.
    :return: a new float DataFrame; the input DataFrame is not modified.
    """

    # Convert numbers to float and replace missing/NaN entries
    cleaned = df.astype(float).replace(to_replace=99, value=-1).fillna(-1)

    # Remove columns where missing/NaN values exceed threshold
    if threshold is not None:
        missing_ratio = cleaned.eq(-1).mean()
        # Written as "not greater than" so that columns without a defined
        # ratio (DataFrame without rows) are kept.
        cleaned = cleaned.loc[:, ~(missing_ratio > threshold)]

    return cleaned


def one_hot_encoding(df):
    """
    Convert every (categorical) column of the DataFrame to multiple binary
    columns. The original columns are not part of the result.

    For a column that contains the missing-value marker -1, the binary column
    of the first category is dropped. NOTE(review): this removes the -1
    column only as long as -1 is the smallest value in the column.

    :param df: DataFrame in which every column is categorical.
    :return: a new DataFrame with sparse float binary columns named
             '<column>_<value>', in the column order of the input.
    """

    encoded_parts = []

    for col in tqdm(df.columns.to_list()):
        column = df[col]
        has_missing = bool((column == -1).any())

        encoded_parts.append(pd.get_dummies(column,
                                            dummy_na=False,
                                            prefix=col,
                                            sparse=True,
                                            drop_first=has_missing,
                                            dtype=float))

    if not encoded_parts:
        # Nothing to encode (DataFrame without columns)
        return df.copy()

    # Combine all encoded columns in one step instead of growing the
    # DataFrame column by column; clashing column names raise a ValueError.
    return pd.concat(encoded_parts, axis=1, verify_integrity=True)


In [None]:

# Reload the clinical data and mark every missing value with -1.
# NOTE(review): this overwrites the `clinical_df` used by the cells above, so
# re-running get_all_cie after this cell would see -1 instead of NaN.
clinical_df = read_clinical_df(CLINICAL_DATA)
clinical_df.fillna(-1, inplace=True)

# Per field type: the dictionary variables that are present as clinical columns
valid_radio_columns = intersection(dtype_dict['radio'], clinical_df.columns.to_list())
valid_yesno_columns = intersection(dtype_dict['yesno'], clinical_df.columns.to_list())
valid_checkbox_columns = intersection(dtype_dict['checkbox'], clinical_df.columns.to_list())
valid_text_columns = intersection(dtype_dict['text'], clinical_df.columns.to_list())

radio_df = clinical_df[valid_radio_columns]
yesno_df = clinical_df[valid_yesno_columns]
checkbox_df = clinical_df[valid_checkbox_columns]

# Split the 'text' fields: columns with a float64/int64 dtype go to num_df,
# the remaining columns stay in text_df.
text_df = clinical_df[valid_text_columns]
num_df = text_df.select_dtypes(include=['float64', 'int64'])
valid_num_columns = num_df.columns.to_list()
text_df = text_df.drop(valid_num_columns, axis=1)
valid_text_columns = [col for col in valid_text_columns if col not in valid_num_columns]

# print(radio_df.memory_usage().sum())

# Store the categorical codes as 16-bit integers
radio_df = radio_df.astype(np.int16)
yesno_df = yesno_df.astype(np.int16)
checkbox_df = checkbox_df.astype(np.int16)

# Only the radio fields are cleaned and one-hot encoded in this cell
# (replace_missing_values converts them to float again).
radio_df = replace_missing_values(radio_df)
radio_df = one_hot_encoding(radio_df)



#### Perform Principal Component Analysis

In [None]:

def perform_PCA(df, visualize=True):
    """
    Standardize the data and perform PCA to reduce dimensionality. The number
    of components is chosen by scikit-learn (n_components='mle').
    If visualize is True, plot the explained variance per PC.

    NOTE(review): scikit-learn documents that n_components='mle' is only
    supported when there are at least as many samples as features.

    :param df: DataFrame with numeric features (rows are samples).
    :param visualize: flag whether the explained variance ratio per
                      principal component should be plotted.
    :return new_data: the standardized data projected onto the principal
                      components.
    :return pca: the fitted PCA object.
    """

    # Standardize the data
    data = StandardScaler().fit_transform(df.values)

    # Perform Principal Component Analysis
    pca = PCA(n_components='mle')
    pca.fit(data)
    new_data = pca.transform(data)

    if visualize:
        # Number the components from 1 so the x-axis matches its label
        component_numbers = range(1, pca.n_components_ + 1)

        plt.plot(component_numbers, pca.explained_variance_ratio_,
                 label='Explained Variance')
        plt.title('Explained variance ratio per principal component')
        plt.xlabel('Number of components')
        plt.ylabel('Ratio of explained variance')
        # Only the explained-variance curve is drawn, so it is the only
        # legend entry.
        plt.legend()
        plt.tight_layout()
        plt.show()

    return new_data, pca


# PCA visualization
def plot_contribution(components, idx, columns, top_n=50):
    """
    Plot the features of one principal component as a bar chart, ordered
    from the largest to the smallest component value.

    NOTE(review): the ordering uses the signed values, so features with a
    large negative value end up last - confirm this is intended.

    :param components: 2D array with one row per principal component and one
                       column per feature (e.g. pca.components_).
    :param idx: zero-based index of the component to plot.
    :param columns: NumPy array with the feature names, aligned with the
                    columns of `components`.
    :param top_n: number of features to plot; None or 0 plots all features.
    :return sorted_components: all values of the component, sorted descending.
    :return sorted_columns: the feature names in the same order.
    """

    loadings = components[idx, :]
    order = np.argsort(-loadings)

    sorted_components = loadings[order]
    sorted_columns = columns[order]

    # Without a limit, show every feature
    shown = slice(None, top_n) if top_n else slice(None)
    plt.bar(sorted_columns[shown], sorted_components[shown])

    plt.xticks(rotation=-90, ha='center')
    plt.xlabel('Features')
    plt.ylabel('Contribution in component')
    plt.title(f'Contributions for principal component {idx+1}')
    plt.tight_layout()
    plt.show()

    return sorted_components, sorted_columns


def calculate_most_contribution_component(components, columns, top_n=10, top_m=10):
    """
    For each component, take its top_n features (largest component values)
    and add their values to a running total per feature. Plot the top_m
    features with the highest totals as a bar chart.

    :param components: 2D array with one row per principal component and one
                       column per feature (e.g. pca.components_).
    :param columns: NumPy array with the feature names, aligned with the
                    columns of `components`.
    :param top_n: number of features taken from each component; values larger
                  than the number of features use all features.
    :param top_m: number of features shown in the plot; None shows all.
    """

    totals = defaultdict(float)
    per_component = max(top_n, 0)

    for component in components:

        # Indices of the largest values of this component, largest first
        order = np.argsort(-component)[:per_component]

        for name, value in zip(columns[order], component[order]):
            totals[name] += value

    # Rank the features by their accumulated value, highest first
    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)[:top_m]
    names = [name for name, _ in ranked]
    values = [value for _, value in ranked]

    plt.bar(names, values)

    plt.xticks(rotation=-90, ha='center')
    plt.xlabel('Features')
    plt.ylabel('Contribution in component')
    plt.title('Most contributing features over all components')
    plt.tight_layout()
    plt.show()


In [None]:

# Reduce the one-hot encoded radio fields with PCA (plots the explained variance)
new_data, pca = perform_PCA(radio_df)

# Accumulate the 100 largest features of every component and plot the 25
# features with the highest totals.
calculate_most_contribution_component(pca.components_, radio_df.columns.to_numpy(), top_n=100, top_m=25)


#### Sliding window

In [None]:

def sliding_window_old(patient_dict, cie_dict, sample_window_hours,
                       predictive_window_hours, max_patients=11):
    """
    Apply a sliding window to the patient's data. Iterate over the timepoints:
    apply a sliding window (sample window) starting with the current timepoint,
    see what other timepoints fit into that window and finally append another
    window (predictive window) after the sample window to see if a CIE occurs
    in the future. Both windows include their start and end timepoints.

    :param patient_dict: a dictionary containing DataFrames with static and
                         temporal information regarding CIEs, per patient.
    :param cie_dict: a dictionary containing timepoints of CIEs per patient.
    :param sample_window_hours: an integer indicating the size/hours of the
                                sliding window.
    :param predictive_window_hours: an integer indicating the size/hours of the
                                predictive window.
    :param max_patients: number of patients (in dictionary order) that are
                         processed, as a non-negative integer; None processes
                         all patients. The default of 11 matches the limit
                         that used to be fixed in the code.
    :return X: a list with, per sample, an array of the temporal rows that
               fall inside the sample window.
    :return y: a list with, per sample, the number of events inside the
               predictive window for each CIE type of the patient.
    """

    X = []
    y = []

    sample_span = pd.to_timedelta(sample_window_hours, unit='h')
    predictive_span = pd.to_timedelta(predictive_window_hours, unit='h')

    patients = list(patient_dict.items())
    if max_patients is not None:
        patients = patients[:max_patients]

    for record_id, patient_df in patients:

        temporal = patient_df['temporal']
        timepoints = temporal.index

        for start in timepoints:

            # Create sample window (data sample)
            sample_end = start + sample_span
            in_sample_window = (timepoints >= start) & (timepoints <= sample_end)

            # Check if CIE occurs during predictive window (output)
            predictive_end = sample_end + predictive_span
            all_cie = [sum(1 for t in events if sample_end <= t <= predictive_end)
                       for events in cie_dict[record_id].values()]

            X.append(temporal[in_sample_window].values)
            y.append(all_cie)

    return X, y



In [None]:

def sliding_window(df, cie_dict, sample_window_hours, predictive_window_hours):
    """
    Placeholder for a sliding window that works on a DataFrame.

    TODO: not implemented yet; the function currently does nothing and
    returns None.
    """
    pass


# Debug output: show the column names registered under the 'checkbox' type
print(dtype_dict['checkbox'])

# display(radio_df['dly_time'])
