In this notebook, we will import and apply some preprocessing to the EEG data, and store it for quick access in the future.  
Preprocessing would involve:
1. Filtering out sub-1000 Hz noise
1. Downsampling
1. Annotating time intervals that show motion artefacts

Where should this go in the pipeline? Should the preprocessing be part of the lfp class or EEGexp class?

Downsample then filter, or filter then downsample?  
Downsampling first leads to high frequency noise staying back in the result due to aliasing. So, first filter, then downsample.

In [1]:
%load_ext autoreload
%autoreload 2

import os
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from scipy import signal
from pandarallel import pandarallel

from tbd_eeg.data_analysis.eegutils import EEGexp
from tbd_eeg.data_analysis.Utilities import filters

pandarallel.initialize(progress_bar=True)
%matplotlib widget

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Raw data

In [2]:
data_folder = r"/allen/programs/braintv/workgroups/nc-ophys/Leslie/eeg_pilot/mouse507190/pilot1_2020-02-28_10-33-11/recording1/"
exp = EEGexp(data_folder)
eegdata = exp.memmap_EEGdata()
timestamps = np.load(exp.eegtimestamps_file)
eegdata = pd.DataFrame(data=eegdata, index=timestamps)

Loading /allen/programs/braintv/workgroups/nc-ophys/Leslie/eeg_pilot/mouse507190/pilot1_2020-02-28_10-33-11/recording1/continuous/Rhythm_FPGA-111.0/continuous.dat


In [3]:
# # get back to working on reshaping data without loading into memory
# datafiles = sorted(glob(exp.data_folder + '/**/*.dat', recursive=True))[0]
# data = np.memmap(datafiles)
# data.reshape(int(data.size/exp.num_chs), exp.num_chs)[:, exp.intanNNmap]

In [4]:
ch = 3
f, ax = plt.subplots(1, 1, figsize=(12, 2))
eegdata[ch].plot(ax=ax)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7fdb64a01f90>

# Low-pass filter

In [4]:
eegdata_lp = eegdata.apply(
    lambda x: filters.butter_filter(
        np.expand_dims(x, 1),
        sampling_frequency=exp.sample_rate,
        cutoff_frequency=1000,
        filter_order=2,
        ftype='low'
    )[:, 0], raw=True, axis=0
)

In [6]:
f, ax = plt.subplots(1, 1, figsize=(12, 2))
eegdata_lp[3].plot(ax=ax)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7fdb9e760990>

# Downsample to 2000Hz

In [5]:
eegdata_lp_ds = eegdata_lp[::int(exp.sample_rate/2000)]
del eegdata_lp

In [8]:
ch = 3
f, ax = plt.subplots(1, 1, figsize=(12, 2))
eegdata_lp_ds[ch].plot(ax=ax)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7fdb4cbd94d0>

# Annotate artefacts due to motion

## Identify unconnected channels
They are 30 and 31, plus others in some cases

In [6]:
def median_amplitude(data):
    peaks, _ = signal.find_peaks(data.abs().values[:2000*300], distance=2000*0.01)
    return np.median(data.abs().values[peaks][:2000*300])

In [7]:
median_amplitude = eegdata_lp_ds.apply(median_amplitude, axis=0)
active_channels = median_amplitude.index[median_amplitude < 10000]
median_amplitude

0       633.834355
1       626.275328
2      2119.047536
3       775.881038
4       784.596710
5       782.644316
6       838.050795
7       876.906502
8       873.140652
9       862.727444
10      878.053731
11      895.834787
12      867.247162
13      876.933570
14      862.764540
15      864.279731
16      886.047749
17      868.032536
18      909.164696
19      903.872068
20      891.194270
21      880.964463
22      873.301196
23      888.245156
24      731.085156
25      832.293945
26      843.218898
27      689.931287
28      747.177158
29      782.900895
30    33062.658410
31    29473.115976
dtype: float64

## Can we use running data to identify the artefacts?
Not explored much. Looks unlikely.

In [8]:
linear_velocity, runtime = exp.load_running(
    exp.load_sync_dataset()
)
running_speed = pd.DataFrame(index=runtime, data=linear_velocity, columns=['speed'])
running_speed = (running_speed - running_speed.mean())*200/running_speed.std()
# running_speed is rescaled so as to look good on the plots

  return eval(self.dfile['analog_meta'].value)


In [12]:
# look at 580:582 as an example of artefact
f, ax = plt.subplots(1, 1, figsize=(12, 2), sharex=True, tight_layout=True)
running_speed.plot(ax=ax, c=cm.Greys(0.5, 0.5), label='Running speed')
# eegdata_ds[30].plot(ax=ax2, c=cm.Blues(0.6, 0.4))
eegdata_lp_ds[2].plot(ax=ax, c=cm.Reds(0.6, 0.4), label='2')
eegdata_lp_ds[8].plot(ax=ax, c=cm.Greens(0.6, 0.4), label='8')
ax.legend(loc=1, ncol=2);

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Identifying artefacts using the spectrogram

In [9]:
ch = 5
_data = eegdata_lp_ds[ch]
f, t, Sxx = signal.spectrogram(_data, 2000, nperseg=512)
Sxx_min = Sxx.min(axis=1)
Sxx_max = Sxx.max(axis=1)
Sxx_norm = (Sxx - Sxx_min[:, np.newaxis]) / (Sxx_max[:, np.newaxis] - Sxx_min[:, np.newaxis])
frinds = f < 5000
frhinds = f > 100
t = t + _data.index[0]
fig, (ax, ax2) = plt.subplots(2, 1, figsize=(12, 4), sharex=True)
# plot spectrogram
spgm = ax.pcolormesh(t, f[frinds], Sxx_norm[frinds,:], cmap='YlOrRd', vmin=0, vmax=1)
ax.set_ylabel('Frequency (Hz)')
ax.set_xlabel('Time (s)')
_data.plot(ax=ax2, c=cm.Reds(0.6, 0.4))
running_speed.plot(ax=ax2, c=cm.Greys(0.5, 0.5))
# hfp = (Sxx[frhinds, :]>100).sum(axis=0)
# ax.twinx().plot(t, hfp, c='k')
# ax2.set_xlim(560, 620)
# ax2.set_ylim(-5000, 5000)

# f2, t2, Sxx2 = signal.spectrogram(eegdata[30], 2000, nperseg=2048)
# t2 = t2 + _data.index[0]
# spgm = ax3.pcolormesh(t2, f2[frinds], Sxx2[frinds,:], cmap='YlOrRd', vmin=0, vmax=40)
# ax3.set_ylabel('Frequency (Hz)')
# ax3.set_xlabel('Time (s)')
# eegdata_ds[30].plot(ax=ax4, c=cm.Blues(0.6, 0.4))
# hfp2 = (Sxx2[frhinds, :]>100).sum(axis=0)
# ax3.twinx().plot(t2, hfp2, c='k')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f31a6e58b90>

## Filter specteegdata_lp_dsll channels to find artefacts

In [14]:
def find_hf_windows(data, sample_rate=exp.sample_rate, nperseg=2048):
    f, t, Sxx = signal.spectrogram(data, sample_rate, nperseg=nperseg)
    frhinds = f > 500
    t = t + data.index[0]
    return pd.Series(data=(Sxx[frhinds, :]>100).sum(axis=0), index=t)

In [15]:
# hf_windows = eegdata[active_channels].apply(find_hf_windows, axis=0)

f, ax = plt.subplots(1, 1, figsize=(12, 3))
# hf_windows.plot(ax=ax, legend=False)
hf_windows.mean(axis=1).plot(ax=ax.twinx(), c='k')
eegdata_ds[0].plot(ax=ax, c=cm.Reds(0.6, 0.3))
# eegdata_ds[1].plot(ax=ax, c=cm.Blues(0.6, 0.3))
running_speed.plot(ax=ax, c=cm.Greys(0.6, 0.3))
ax.set_ylim(-3000, 3000)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

NameError: name 'hf_windows' is not defined