# EEGs: correlations

Considering only unanimous consensus.

- Multiple plots for visual inspection.
- Determine correlations between electrodes.

In [12]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.signal import sosfiltfilt, butter

# base_dir = "../../kaggle_data/hms"
base_dir = "../../data/hms"
# base_dir = "/kaggle/input/hms-harmful-brain-activity-classification"

fs = 200  # Sample rate.

df_traincsv = pd.read_csv(f'{base_dir}/train.csv')
df_traincsv.head()

# Indices of sub eegs where:
# 2 or less NaN rows in 50 s sample (to interpolate).
# No change of votes in the different sub eegs for the same eeg_id.
#
idxs = np.load("../data/02_idx_constant_votes.npy")

df = df_traincsv.iloc[idxs]
print(len(df))

83893


In [13]:
def banana(eeg_absolute):
    '''Returns pandas dataframe with a banana montage.
    '''
    # Filtering between 8 Hz and 30 Hz.
    sos = butter(5, [8,30], btype='bandpass', fs=fs, output='sos')
    filtered_data = sosfiltfilt(sos, eeg_absolute)

    eeg = pd.DataFrame(data={
        'Fp1-F7' : filtered_data.Fp1 - filtered_data.F7,
        'Fp7-T3' : filtered_data.F7 - filtered_data.T3,
        'T3-T5' : filtered_data.T3 - filtered_data.T5,
        'T5-O1' : filtered_data.T5 - filtered_data.O1,

        'Fp2-F8' : filtered_data.Fp2 - filtered_data.F8,
        'F8-T4' : filtered_data.F8 - filtered_data.T4,
        'T4-T6' : filtered_data.T4 - filtered_data.T6,
        'T6-O2' : filtered_data.T6 - filtered_data.O2,

        'Fp1-F3' : filtered_data.Fp1 - filtered_data.F3,
        'F3-C3' : filtered_data.F3 - filtered_data.C3,
        'C3-P3' : filtered_data.C3 - filtered_data.P3,
        'P3-O1' : filtered_data.P3 - filtered_data.O1,

        'Fp2-F4' : filtered_data.Fp2 - filtered_data.F4,
        'F4-C4' : filtered_data.F4 - filtered_data.C4,
        'C4-P4' : filtered_data.C4 - filtered_data.P4,
        'P4-O2' : filtered_data.P4 - filtered_data.O2,

        'Fz-Cz' : filtered_data.Fz - filtered_data.Cz,
        'Cz-Pz' : filtered_data.Cz - filtered_data.Pz,

        'EKG' : filtered_data.EKG
        })
    return eeg

## Creation of various graphs


In [3]:
eeg_ids = np.unique(df['eeg_id'].values)

vote = np.array([])
qty_sub_ids = np.array([], dtype=int)
for eeg_id in eeg_ids:
    items = df.loc[df.eeg_id==eeg_id]
    vote = np.append(vote, items.iloc[0].expert_consensus)
    qty_sub_ids = np.append(qty_sub_ids, len(items))
d = {
    'eeg_id' : eeg_ids,
    'vote' : vote,
    'qty_sub_ids' : qty_sub_ids
}
df_vote = pd.DataFrame(data=d)
df_vote

Unnamed: 0,eeg_id,vote,qty_sub_ids
0,568657,Other,4
1,582999,LPD,11
2,642382,Other,2
3,751790,GPD,1
4,778705,Other,1
...,...,...,...
15456,4293354003,GRDA,1
15457,4293843368,GRDA,1
15458,4294455489,Other,1
15459,4294858825,Other,5


Search for the longest running eegs: 

In [4]:
df_vote.sort_values(by='qty_sub_ids')[-100:]

Unnamed: 0,eeg_id,vote,qty_sub_ids
6012,1654580421,GRDA,61
12240,3394076749,GRDA,61
12913,3575372862,GRDA,61
13772,3825216091,GPD,61
12244,3395187963,GPD,62
...,...,...,...
6229,1712056492,LRDA,433
1867,525664301,LRDA,531
5951,1641054670,GPD,562
8810,2428433259,GRDA,664


In [5]:
df_vote.sort_values(by='qty_sub_ids')[-20:].groupby(by='vote').size()


vote
GPD      7
GRDA     4
LRDA     8
Other    1
dtype: int64

In [6]:
df_vote.sort_values(by='qty_sub_ids')[-150:].groupby(by='vote').size()

vote
GPD        33
GRDA       57
LPD         1
LRDA       55
Other       1
Seizure     3
dtype: int64

<div class="alert alert-block alert-info">&rdsh; The longest running eegs (the ones with lots of sub eegs) are GPD, GRDA and LRDA.</div>


In [7]:
df_vote.sort_values(by='qty_sub_ids')[-150:].groupby(by='vote').max()

Unnamed: 0_level_0,eeg_id,qty_sub_ids
vote,Unnamed: 1_level_1,Unnamed: 2_level_1
GPD,4007115462,743
GRDA,4203450228,664
LPD,699093875,88
LRDA,4044323427,531
Other,3123865097,206
Seizure,4098737417,73


In [8]:
def plot_eeg(ax, eeg, title, sep):
    srate = 200 # Sample rate.
    nx = eeg.shape[0]
    totaltime = nx/srate
    X, Y = np.linspace(0, totaltime, nx), np.zeros(nx)
    yticklabels = eeg.columns[::-1]  # Reversed.

    for i, label in enumerate(yticklabels):
        Y = eeg[label]
        ax.plot(X, Y + (i * sep), linewidth=0.5, color='black')

    ax.set_title(title)
    ax.set(ylim=(-0.5*sep, (len(yticklabels)-0.5)*sep),
           yticks=np.arange(len(yticklabels))*sep,
           yticklabels=yticklabels)
    ax.set_xlabel('time [s]')

Generating plots of eegs with over 20 sub eegs. Only 5 plots in each eeg, distributed uniformly.

In [11]:
#
# ref_all
#

n = 3  # eegs
N = 5  # sub eegs

# Not including 'Other' because theere too few longer than 15.
votes = ['LPD', 'GPD', 'Seizure', 'GRDA', 'LRDA']
for vote in votes:
    _eeg_ids = df_vote.loc[(df_vote.vote == vote) & (df_vote.qty_sub_ids > 20), 'eeg_id'].sample(n).values
    for eeg_id in _eeg_ids:
        eeg_absolute = pd.read_parquet(f'{base_dir}/train_eegs/{eeg_id}.parquet')
        eeg_absolute = eeg_absolute.interpolate(limit_direction='both') # <<<<< Interpolation
        eeg = banana(eeg_absolute)
        items = df.loc[df.eeg_id == eeg_id]
        chunks = int(len(items)/N)

        for i in np.arange(chunks):
          item = items.iloc[i*N]
          subid = item.eeg_sub_id
          offset = int(item.eeg_label_offset_seconds)
          start = (offset + 20) * fs
          end = (offset + 30) * fs
          eeg_sub_10 = eeg[start:end]
          fig, ax = plt.subplots(1, 1, figsize=(10, 15))
          plot_eeg(ax, eeg_sub_10, title='10 seconds sample - eeg: ' + str(item.eeg_id)
            + '/' + str(item.eeg_sub_id) + ' ' + item.expert_consensus, sep = 400)

          plt.tight_layout()
          plt.savefig(f'../results/05_plots/ref_all_{vote}_{eeg_id}_{subid}.png')
          plt.close()

In [17]:
# This eeg has very high values.

eeg_id = 320837057
eeg = pd.read_parquet(f'{base_dir}/train_eegs/{eeg_id}.parquet')


In [21]:
df.loc[df.eeg_id == 320837057].iloc[55]

eeg_id                               320837057
eeg_sub_id                                  55
eeg_label_offset_seconds                 266.0
spectrogram_id                       109868772
spectrogram_sub_id                          55
spectrogram_label_offset_seconds         266.0
label_id                            3085135040
patient_id                               55803
expert_consensus                          GRDA
seizure_vote                                 0
lpd_vote                                     0
gpd_vote                                     0
lrda_vote                                    0
grda_vote                                    3
other_vote                                   0
Name: 6023, dtype: object

In [23]:
eeg[286*200:296*200].max()

Fp1    694.260010
F3     683.200012
C3     682.770020
P3     680.159973
F7     689.140015
T3     701.289978
T5     697.960022
O1     706.770020
Fz     311.399994
Cz     676.789978
Pz     619.770020
Fp2    676.809998
F4     657.280029
C4     675.460022
P4     660.570007
F8     620.219971
T4     681.580017
T6     684.960022
O2     675.450012
EKG    497.380005
dtype: float32