# Apply Standard Score On all Series with Same Mean and Standard Deviation

This pre-processing step scales the series with a manually written standard score (z-score) transformation: within each window, all train series are scaled with one shared mean and standard deviation, and all prediction series are scaled with one shared mean and standard deviation. The [StandardScaler by scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) can perform the same calculations, but fails to process series of different lengths. The standard score-transformed series allow the prediction scripts ending with "s1" to be executed. These runs should be compared with the individual scaling of each series performed with the default Scaler of Darts (a.k.a. [MinMaxScaler by scikit-learn](https://scikit-learn.org/stable/modules/preprocessing.html#scaling-features-to-a-range)), which can be found in the prediction scripts ending with "s2". Data for all windows (train and prediction data for each required resampling method) of all parameters are exported.

In [None]:
def apply_standard_score(series, mean, std):
    """Return the standard score of every value in ``series``.

    Each value is transformed as ``(value - mean) / std`` and the results
    are collected, in iteration order, in a newly created list.

    Args:
        series: Iterable of values to transform.
        mean: Value subtracted from every element.
        std: Value every centered element is divided by.

    Returns:
        A list holding one transformed entry per element of ``series``.
    """
    return [(value - mean) / std for value in series]

In [None]:
from darts import TimeSeries
from darts.dataprocessing.transformers import MissingValuesFiller
import itertools
import numpy as np
import pandas as pd
import pickle5 as pickle


n_chunks = 2000
filler = MissingValuesFiller()
means, stds = dict(), dict()


def _scale_chunk(frame, median_values, min_values, max_values, mean, std):
    """Standard-score one chunk and return its (median, min, max) TimeSeries.

    The scaled values are attached to a copy of ``frame`` as the columns
    ``SCALED_MEDIAN``, ``SCALED_MIN`` and ``SCALED_MAX``; each column is then
    converted into an hourly TimeSeries indexed by ``CHARTTIME`` and passed
    through the module-level ``filler``.
    """
    # Work on an explicit copy: ``frame`` is a filtered slice of the resampled
    # data, and adding columns to such a slice raises SettingWithCopyWarning.
    frame = frame.copy()
    frame['SCALED_MEDIAN'] = np.array(apply_standard_score(median_values, mean, std))
    frame['SCALED_MIN'] = np.array(apply_standard_score(min_values, mean, std))
    frame['SCALED_MAX'] = np.array(apply_standard_score(max_values, mean, std))

    return tuple(
        filler.transform(TimeSeries.from_dataframe(
            df=frame,
            time_col='CHARTTIME',
            value_cols=[scaled_col],
            freq='H'))
        for scaled_col in ('SCALED_MEDIAN', 'SCALED_MIN', 'SCALED_MAX'))


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``; the file is closed even if dumping fails."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


for parameter in ['bp', 'hr', 'o2']:

    resampled = pd.read_parquet(f'../../data/resampling/resample_output_{parameter}_first{n_chunks}.parquet',
                                engine='pyarrow')

    # Collect rows and values of all series with minimal length (more than 12 rows)
    relevant_frames = dict()
    relevant_series_median, relevant_series_min, relevant_series_max = dict(), dict(), dict()

    for chunk_id in pd.unique(resampled.CHUNK_ID_FILLED_TH):
        current_series = resampled[resampled['CHUNK_ID_FILLED_TH'] == chunk_id]

        if len(current_series) > 12:
            # Keep the filtered rows so they need not be re-selected in every window
            relevant_frames[chunk_id] = current_series
            relevant_series_median[chunk_id] = current_series['VITAL_PARAMTER_VALUE_MEDIAN_RESAMPLING'].tolist()
            relevant_series_min[chunk_id] = current_series['VITAL_PARAMTER_VALUE_MIN_RESAMPLING'].tolist()
            relevant_series_max[chunk_id] = current_series['VITAL_PARAMTER_VALUE_MAX_RESAMPLING'].tolist()

    # Calculate number of chunks corresponding to 20% of chunks
    relevant_chunk_ids = list(relevant_series_median.keys())
    twenty_percent = int((20 * len(relevant_chunk_ids)) / 100)

    # Iterate five times different 20% of the chunks (= 5 windows)
    for window_idx in range(5):

        # Extract 20% of series for prediction; the last window runs to the end
        # of the list so that chunks lost to integer rounding are not ignored
        window_start = twenty_percent * window_idx
        window_stop = None if window_idx == 4 else twenty_percent * (window_idx + 1)
        pred_chunk_ids = relevant_chunk_ids[window_start:window_stop]
        pred_chunk_lookup = set(pred_chunk_ids)  # O(1) membership tests below

        pred_median = {chunk_id: relevant_series_median[chunk_id] for chunk_id in pred_chunk_ids}
        pred_min = {chunk_id: relevant_series_min[chunk_id] for chunk_id in pred_chunk_ids}
        pred_max = {chunk_id: relevant_series_max[chunk_id] for chunk_id in pred_chunk_ids}

        # Extract the remaining 80% of series for training
        train_chunk_ids = [chunk_id for chunk_id in relevant_chunk_ids
                           if chunk_id not in pred_chunk_lookup]
        train_median = {chunk_id: relevant_series_median[chunk_id] for chunk_id in train_chunk_ids}
        train_min = {chunk_id: relevant_series_min[chunk_id] for chunk_id in train_chunk_ids}
        train_max = {chunk_id: relevant_series_max[chunk_id] for chunk_id in train_chunk_ids}

        # Collect all values (median, min and max together) to calculate the
        # overall mean and standard deviation of each split
        train_values = pd.DataFrame(list(itertools.chain.from_iterable(list(train_median.values()) +
                                                                       list(train_min.values()) +
                                                                       list(train_max.values()))))
        pred_values = pd.DataFrame(list(itertools.chain.from_iterable(list(pred_median.values()) +
                                                                      list(pred_min.values()) +
                                                                      list(pred_max.values()))))

        # The statistics do not depend on the chunk, so compute them once per
        # split. ``DataFrame.mean()``/``.std()`` return a pandas Series here.
        train_mean, train_std = train_values.mean(), train_values.std()
        pred_mean, pred_std = pred_values.mean(), pred_values.std()

        # Only record statistics for splits that actually contain chunks
        if train_median:
            means[f'{parameter}_{window_idx}_train'] = train_mean
            stds[f'{parameter}_{window_idx}_train'] = train_std
        if pred_median:
            means[f'{parameter}_{window_idx}_pred'] = pred_mean
            stds[f'{parameter}_{window_idx}_pred'] = pred_std

        # Scale values and merge with timestamps
        train_median_scaled, train_min_scaled, train_max_scaled = dict(), dict(), dict()
        pred_median_scaled, pred_min_scaled, pred_max_scaled = dict(), dict(), dict()

        for chunk_id in train_median:
            (train_median_scaled[chunk_id],
             train_min_scaled[chunk_id],
             train_max_scaled[chunk_id]) = _scale_chunk(relevant_frames[chunk_id],
                                                        train_median[chunk_id],
                                                        train_min[chunk_id],
                                                        train_max[chunk_id],
                                                        train_mean,
                                                        train_std)

        # NOTE(review): prediction series are standardized with the statistics
        # of the prediction split itself, not with the train statistics -
        # confirm that this is intended.
        for chunk_id in pred_median:
            (pred_median_scaled[chunk_id],
             pred_min_scaled[chunk_id],
             pred_max_scaled[chunk_id]) = _scale_chunk(relevant_frames[chunk_id],
                                                       pred_median[chunk_id],
                                                       pred_min[chunk_id],
                                                       pred_max[chunk_id],
                                                       pred_mean,
                                                       pred_std)

        # Export dicts containing chunk ID and its scaled TimeSeries
        scaled_exports = {
            'train_median': train_median_scaled,
            'train_min': train_min_scaled,
            'train_max': train_max_scaled,
            'pred_median': pred_median_scaled,
            'pred_min': pred_min_scaled,
            'pred_max': pred_max_scaled,
        }
        for export_name, scaled_series in scaled_exports.items():
            _dump_pickle(scaled_series, f'../../data/z_scaled/{parameter}_{window_idx}_{export_name}.pickle')

# Export dicts containing means and standard deviations
_dump_pickle(means, '../../data/z_scaled/means_z_scaling.pickle')
_dump_pickle(stds, '../../data/z_scaled/stds_z_scaling.pickle')