# Check Baseline Variance

Prework to determine whether baseline correction should be done within trials or not.

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import neuropsy as npsy
import neuropsy.analysis as npsya
import pickle
import json
import pandas
import time
from scipy import signal
from scipy.fftpack import next_fast_len

In [3]:
#********** PARAMETERS **********#
path_data                   = '/mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/data/preprocessed'
path_results                = f'/mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline'
subject_ids                 = npsy.utils.get_subject_ids_from_path(path_data)

postfix_load                = 'preprocessed'

# analysis parameters
baseline                    = (-.2, 0.)              # time period from before stimulus to use for baseline correction
baseline_method             = 'mean'                # basline correction method (see neuropsy.preprocessing)

# wavelet parameters
fs                          = 512
t_wl                        = np.arange(-4, 4+1/fs, 1/fs) # long enough time to capture the wavelet for given f and n
# 2-12 Hz, 3-8 cycles
frequencies                 = np.arange(2, 13, 1)
cycles                      = [8, 7, 6, 6, 5, 5, 5, 4, 4, 3, 3]
n_half_wavelet              = len(t_wl) // 2

# set up directory for saving results
if not os.path.exists(path_results):
    os.makedirs(path_results)
    os.makedirs(f"{path_results}/data")

# 1. Compute Power of Baseline Periods Individually

In [None]:
# save general analysis info (will be saved as json file in the end)
dict_analysis_info = {}
dict_analysis_info['general'] = {}
dict_analysis_info['general']['baseline'] = baseline
dict_analysis_info['general']['baseline_method'] = baseline_method
dict_analysis_info['general']['wavelet'] = {'frequencies': frequencies.tolist(), 'cycles': cycles.tolist() if isinstance(cycles, np.ndarray) else cycles}

# time-frequency result dictionary {condition: {subject_id: {channel: {trial: np.2darray}}}}
dict_tfr_power_results = {s: {} for s in subject_ids}

#********** START ANALYSIS **********#
for subject_id in subject_ids:
    
    # initialise variables
    trial_names_outliers                = None
    labels_outliers                     = None
    dict_outliers                       = {}
    trial_names_too_short               = None
    trial_names_too_close               = None
    labels_too_short                    = None
    labels_too_close                    = None
    dict_too_short                      = {}
    dict_too_close                      = {}
    
    
    start_time_sub = time.time()
    print(f"Starting subject {subject_id}...")

    #********** LOAD SUBJECT DATA **********# 
    data = npsy.DataHandler(path=path_data, subject_id=subject_id, exp_phase=2, fs=512, verbose=False)
    data.load(load_saved=True, postfix=postfix_load)
    
    # subtract total mean from each channel to remove DC bias
    print("Removing DC bias from iEEG data...")
    total_mean = np.mean(data.ieeg)
    for i in range(data.ieeg.shape[0]):
        data.ieeg[i, :] = data.ieeg[i, :] - total_mean
        data.ieeg[i, :] = signal.detrend(data.ieeg[i, :])
        
    #********** SELECT CHANNELS TO INCLUDE IN ANALYSIS **********# 
    print(f"Selecting channels in hippocampus...")
    ch_names = data.df_chan.loc[data.df_chan['DK_Subfields'].str.contains('HP_head|HP_body|HP_tail', case=True)].name.to_list()
    # check that there are channels in the hippocampal subfield for this subject, otherwise skip
    if len(ch_names) > 0:
        data.select_channels(ch_names=np.ravel(ch_names), inplace=True)
        print(f"Selected {len(ch_names)} channels in hippocampus.")
    else:
        print(f"No channels in hippocampus.")
    
    #********** REMOVE OUTLIERS (TRIALS WITH TOO HIGH REACTION TIME)  **********# 
    print("Removing outliers from experiment dataframe...")
    idx_outliers = data.df_exp[data.df_exp['outlier'] == True]['outlier'].index.to_list()
    if len(idx_outliers) > 0:
        trial_names_outliers    = data.df_exp[data.df_exp['outlier'] == True]['Trial Identifier'].to_list()
        labels_outliers         = data.df_exp.loc[idx_outliers, cond_column].to_list()
        for i, label in zip(idx_outliers, labels_outliers):
            if label not in dict_outliers.keys():
                dict_outliers[label] = 1
            else:
                dict_outliers[label] += 1
        for key, value in dict_outliers.items():
            print(f"{value} outliers in condition {repr(key)}.")
        data.df_exp = data.df_exp.drop(idx_outliers).reset_index(drop=True)
    else:
        print("No outliers in experiment dataframe.")

    #********** CREATE MNE RAW OBJECT WITH iEEG DATA **********#
    data.create_raw()
    

    #********** GET TRIAL TIME POINTS (INDICES) AND TRIAL IDENTIFIERS (NAMES) **********#
    # - need trial indices from experiment dataframe to extract the baseline period for each trial
    # - also need to extract the trial identifiers (names) to keep track of trials in the resulting time-frequency power data
    print("Getting trial indices...")
    dict_trial_indices = {**{f'baseline': None}, **{f'names': None}}
    # get the trial indices
    idx_baseline                    = data.df_exp['Mark for Picture Shown'].to_numpy().astype(int)
    # save indices
    dict_trial_indices["baseline"]  = idx_baseline
    # get the trial identifiers (names) for the condition
    trial_names                     = data.df_exp['Trial Identifier'].to_numpy().astype(str)
    # save trial identifiers
    dict_trial_indices["names"]     = trial_names


    #********** TIME-FREQUENCY ANALYSIS **********#
    # info
    dict_n_trials_kept = None
    dict_n_trials_ieds = None
    
    if len(ch_names) > 0:
        
        # info
        dict_n_trials_kept = {**{f'{ch}': {'trial_identifiers': [], 'count': None} for ch in ch_names}, **{'total': 0}}
        dict_n_trials_ieds = {**{f'{ch}': {'trial_identifiers': [], 'count': None} for ch in ch_names}, **{'total': 0}}
        
        # CHANNEL:
        #  - FFT of entire channel signal
        #  - CWT for each frequency and cycle
        #  - store power in dB for each frequency
        #  - this is done once per channel, then afterwards each trial is considered independently during CONDITION loop
        #    - this is done to save time as the CWT is the most time-consuming part
        for ch in ch_names:
            start_time_ch = time.time()
            print(f"\tStarting channel {ch}...")
            
            # use the whole channel signal for computing time-frequency representation
            ch_signal = data.raw._data[ch_names.index(ch), ...]
            
            # ********* FFT of CHANNEL ********* #
            n_conv      = len(t_wl) + len(ch_signal) - 1
            n_conv_fast = next_fast_len(n_conv)
            signal_fft  = np.fft.fft(ch_signal, n_conv_fast)
            
            # initialise output data for continuous wavelet transform
            tf_data = np.zeros((len(frequencies), len(ch_signal)))
            
            # ********* CWT ********* #
            for i, (f, n) in enumerate(zip(frequencies, cycles)):
                
                # [INFO] - print current wavelet parameters
                # print(f"creating wavelet with parameters: f = {f}, n = {n}...")
                
                # create wavelet
                wavelet = npsya.morlet(f, n, t_wl)
                
                # fft of wavelet
                # note:
                #   output must match the length of the signal_fft in order to multiply in the frequency domain
                wavelet_fft = np.fft.fft(wavelet, n=n_conv_fast)
                
                # convolution
                coefficients = np.fft.ifft(signal_fft * wavelet_fft, n=n_conv_fast)
                coefficients = coefficients[:n_conv] # remove padding from next_fast_len
                coefficients = coefficients[n_half_wavelet:-n_half_wavelet]
                
                # convert to power in dB
                tf_power = 10 * np.log10(np.abs(coefficients)**2)
                
                # store result for frequency
                tf_data[i, :] = tf_power
                
            # clean up
            del ch_signal, signal_fft, wavelet, wavelet_fft, coefficients, tf_power
                
            # CONDITION:
            #  - continuous wavelet transform result is stored in tf_data (in dB)
            #  - now we need to consider each trial independently
            #  - loop over conditions
            #  - loop over trials in condition
            #  - baseline correction is done for each trial separately
            #  - the window between onset and offset is divided into bins
            
            dict_tfr_power_results[subject_id][ch] = {}
            
            for (i_baseline, trial_id) in zip(dict_trial_indices[f'baseline'], 
                                                dict_trial_indices[f'names']):
                start_time_trial = time.time()
                
                # get start and stop timepoints for baseline period
                b_tmin = int(i_baseline + int(baseline[0] * fs))
                b_tmax = int(i_baseline + int(baseline[1] * fs))
                
                #********** CHECK TRIAL FOR IEDs **********# 
                # - if there are IEDs in the baseline, skip it and exclude it from the analysis
                if npsya.check_period_for_ieds(ch, data.df_ied, b_tmin, b_tmax):
                    print(f"\t\t\tSkipping trial {trial_id} due to IEDs in baseline period.")
                    dict_n_trials_ieds[ch]['trial_identifiers'].append(trial_id)
                    dict_n_trials_ieds[ch]['count'] += 1
                    dict_n_trials_ieds['total']     += 1
                    continue
                
                # get power for baseline period
                power_baseline = tf_data[:, b_tmin:b_tmax]
                
                # calculate mean of baseline across time, vector output = [freq_dim,]
                power_baseline_mean = np.mean(power_baseline, axis=1)
                
                # store time-frequency power result for trial
                dict_tfr_power_results[subject_id][ch][trial_id] = power_baseline_mean
                
                # info
                dict_n_trials_kept[ch]['trial_identifiers'].append(trial_id)
                dict_n_trials_kept[ch]['count'] += 1
                dict_n_trials_kept['total']     += 1
                
            # clean up
            del power_baseline
            print(f"\tChannel {ch} done - {time.time() - start_time_ch:.2f} seconds")
    # no channels
    else:
        # remove subject from results if there are no channels in the hippocampus
        dict_tfr_power_results.pop(subject_id)
        print(f"No channels in hippocampus for subject {subject_id}.")
        

    # save analysis info for subject
    tmp_dict_outliers           = {**dict_outliers, **{'count': len(idx_outliers)}, **{'indices': idx_outliers}, **{'trial_identifiers': trial_names_outliers}}
    
    dict_analysis_info[f'subject {subject_id}'] = {
        'channels': ch_names if len(ch_names) > 0 else None,
        'trials_kept': dict_n_trials_kept,
        'trials_dropped': {
            'outliers': tmp_dict_outliers,
            'ieds': dict_n_trials_ieds
        } if len(ch_names) > 0 else None
    }
    
    # clean up
    del dict_n_trials_kept, dict_n_trials_ieds, tmp_dict_outliers
    
    print(f"Subject {subject_id} done - {time.time() - start_time_sub:.2f} seconds")
    
    # break
        

#********** SAVE RESULTS **********#
filename = f"{path_results}/data/baseline_mean_power.pkl"
with open(filename, 'wb') as f:
    print(f"Saving results as {repr(filename)}...")
    pickle.dump(dict_tfr_power_results, f)
    print("Done")
#********** SAVE ANALYSIS INFO **********#
filename = f"{path_results}/data/baseline_analysis_info.json"
with open(filename, 'w') as f:
    print(f"Saving analysis info as {repr(filename)}...")
    json.dump(dict_analysis_info, f)
    print("Done")
    
# clean up
del dict_tfr_power_results, dict_analysis_info

NameError: name 'cond_column' is not defined

# 2. Plot Baseline Variance Across Trials

- Each subject is processed independently
- Each frequency is processed indpendently

In [4]:
# [INFO] load data in this cell once to avoid re-running the heavy loading process. 
filename = f"{path_results}/data/baseline_mean_power.pkl"

# load previously computed time-frequency power for all subjects
with open(filename, 'rb') as f:
    dict_baseline_mean_power = pickle.load(f)
    
# get conditions and subject ids from the loaded results dictionary
subject_ids = list(dict_baseline_mean_power.keys())

In [None]:
# [INFO] NOT USED!!!


# get conditions and subject ids from the loaded results dictionary
subject_ids = list(dict_baseline_mean_power.keys())

list_baseline_power = []
for i, sub_id in enumerate(subject_ids):
    list_baseline_power.append([])
    
    for j, channel in enumerate(dict_baseline_mean_power[sub_id].keys()):
        list_baseline_power[i].append([])
        
        print(f"Subject {sub_id}, channel {channel}: {len(dict_baseline_mean_power[sub_id][channel].keys())} trials.")
        
        for f in range(n_freqs):
            list_baseline_power[i][j].append([])
        
            for trial in dict_baseline_mean_power[sub_id][channel].keys():
                
                list_baseline_power[i][j][f].append(dict_baseline_mean_power[sub_id][channel][trial][f])
    break

Subject 03, channel B' 02: 193 trials.
Subject 03, channel B' 03: 192 trials.
Subject 03, channel B' 04: 190 trials.
Subject 03, channel B' 05: 192 trials.
Subject 03, channel C' 02: 194 trials.


### 2.1 Boxplot

In [None]:
for sub_id in subject_ids:
    fig, ax = plt.subplots(nrows=3, ncols=4, figsize=(25, 15))
    ax = ax.flatten()
    
    for f, freq in enumerate(frequencies):
        channel_baseline_power = []
        channel_names = []
        
        for ch, channel in enumerate(dict_baseline_mean_power[sub_id].keys()):
            trials_baseline_power = []
            channel_names.append(channel)
            
            for trial in dict_baseline_mean_power[sub_id][channel].keys():
                trials_baseline_power.append(dict_baseline_mean_power[sub_id][channel][trial][f])
                
            channel_baseline_power.append(trials_baseline_power)
        
        # plot
        channel_baseline_power = np.array(channel_baseline_power, dtype=object)
        ax[f].boxplot([ch for ch in channel_baseline_power], labels=channel_names)
        ax[f].set_title(f"Frequency: {freq} Hz")
        # ax[f].grid(axis='y')
    
    ax[11].axis('off')
    
    fig.suptitle(f"Subject {sub_id} - Baseline Variance", fontsize=14, y=.92)
    
    # save figure
    full_save_path = f"{path_results}/2.1 boxplot_sub{sub_id}"
    fig.savefig(full_save_path, dpi=300, bbox_inches='tight')
    print(f"Saved figure as {full_save_path}.png")
    plt.close(fig)
                
    # break
    

Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/boxplot_sub03.png
Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/boxplot_sub04.png
Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/boxplot_sub05.png
Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/boxplot_sub07.png
Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/boxplot_sub09.png
Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/boxplot_sub10.png
Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/boxplot_sub11.png
Saved figure 

### 2.2 Lineplot

In [7]:
def normalize(data, new_min=0, new_max=1):
    min_val = np.min(data)
    max_val = np.max(data)
    normalized = (data - min_val) / (max_val - min_val) * (new_max - new_min) + new_min
    return normalized


for sub_id in subject_ids:        
    for channel in dict_baseline_mean_power[sub_id].keys():
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(25, 10))
        
        trials_baseline_power = []
        for trial in dict_baseline_mean_power[sub_id][channel].keys():
            trials_baseline_power.append(dict_baseline_mean_power[sub_id][channel][trial])
            
        trials_baseline_power = np.array(trials_baseline_power)    
        
        for f, freq in enumerate(frequencies):
            trials_baseline_power[:, f] = normalize(trials_baseline_power[:, f], new_min=freq-1, new_max=freq+1)
        
        ax.plot(trials_baseline_power)
        ax.set_title(f"Subject {sub_id} - channel {channel} - Baseline Variance", fontsize=14)
        ax.set_ylabel("Frequency (Hz)", fontsize=12)
        ax.set_xlabel("Trial", fontsize=12)
        
        ax.set_yticks(frequencies, frequencies)
        ax.set_xlim((0, len(trials_baseline_power)-1))
        ax.grid(axis='y')
        
        # save figure
        full_save_path = f"{path_results}/2.2 lineplot_sub{sub_id}_channel_{channel}"
        fig.savefig(full_save_path, dpi=300, bbox_inches='tight')
        print(f"Saved figure as {full_save_path}.png")
        plt.close(fig)
        
    #     break
    # break    

Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/lineplot_sub15_channel_B' 02.png
Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/lineplot_sub15_channel_B' 03.png
Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/lineplot_sub15_channel_B' 04.png
Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/lineplot_sub15_channel_B' 05.png
Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/lineplot_sub15_channel_B' 06.png
Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/MasterThesis/master-project/results/preprocessing/check baseline/lineplot_sub15_channel_C' 02.png
Saved figure as /mnt/c/Users/matti/OneDrive/Education/SDC/Master