### Configuration

In [1]:
import os
import re
import numpy as np
import pandas as pd
from scipy import io
import hdf5storage
from tqdm import tqdm

import utils__config

In [2]:
# Switch to the project working directory defined in utils__config so that
# the relative paths used in the rest of the notebook resolve against it.
os.chdir(utils__config.working_directory)
# Echo the resulting working directory as the cell output.
os.getcwd()

'Z:\\Layton\\Sleep_083023'

### Parameters

This notebook takes the output directory from Combinato and extracts spike times and associated metadata from the files contained in that directory. The files are loaded below as MAT files (MAT v7.3 files are HDF5-based).

In [3]:
# Directory whose files are loaded one by one as MAT files in the munging loop
# (relative to the working directory set above).
root_dir = 'Cache/Subject05/Jul13/S05_Jul13_cnato__4___yemi_to_mat'
# Excel dictionary read with pd.read_excel; its 'number', 'laterality' and
# 'region' columns are merged onto the spikes by channel number.
dict_dir = 'Data/Subject05/S05_dictionary.xlsx'
# Destination CSV for the per-unit quality-control metrics.
metric_out_path = 'Cache/Subject05/Jul13/S05_spike_metrics.csv'
# Destination CSV for the spikes of the units that pass quality control.
spike_out_path = 'Cache/Subject05/Jul13/S05_spikes.csv'
# Recording length in hours (multiplied by 60 * 60 when deriving the minimum
# spike count).
recording_length = 10.40

In [4]:
# Recording windows per subject/session (start - end = duration in hours);
# recording_length should be set to the entry for the session being processed.
# S02, Apr26: 01:59:20:000 UTC - 07:37:20:000 UTC = 5hr 38min = 5.63 hours
# S02, Apr27: 03:37:50:000 UTC - 08:13:50:000 UTC = 4hr 36min = 4.60 hours
# S05, Jul11: 02:24:20:624 UTC - 12:05:10:624 UTC = 9hr 41min = 9.68 hours
# S05, Jul12: 01:22:51:432 UTC - 11:55:41:432 UTC = 10hr 33min = 10.55 hours
# S05, Jul13: 01:01:00:000 UTC - 11:25:00:000 UTC = 10hr 24min = 10.40 hours

# Only units whose dictionary region is in this list are kept during quality control.
selected_regions = ['CLA', 'AMY', 'ACC']
MAT_version = '7.3' # MAT file version (SciPy reads < 7.3, hdf5storage reads >= 7.3)
min_firing_rate = 1 # minimum firing rate in Hz
# Spike count a unit needs to reach min_firing_rate averaged over the whole
# recording: rate (Hz) * recording length (hours) * 60 * 60 (seconds per hour).
min_spike_count = min_firing_rate * recording_length * 60 * 60

### Munging

In [5]:
# Read the channel dictionary and keep only the columns used downstream:
# the channel number (merge key) plus the laterality and region labels.
dict_columns = ['number', 'laterality', 'region']
micro_dict = pd.read_excel(dict_dir)[dict_columns]

In [6]:
# Build one long table of spikes (one row per spike) from every file in root_dir.
#
# Frames are collected in lists and concatenated once per channel and once at
# the end; growing a DataFrame with pd.concat on every iteration re-copies all
# previously accumulated rows each time.
channel_frames = []

# os.listdir returns entries in arbitrary order; sort so the row order of the
# resulting table is reproducible between runs.
for channel in tqdm(sorted(os.listdir(root_dir))):

    mat_path = os.path.join(root_dir, channel)

    if MAT_version == '7.3':
        raw_data = hdf5storage.loadmat(mat_path)
    else:
        raw_data = io.loadmat(mat_path)

    unit_frames = []

    # Extract unit type and spike times
    for unit in range(len(raw_data['sp_types'])):

        # Indexed as a 2-D array below; presumably (n_spikes, 1) -- TODO confirm
        spike_times = raw_data['sp_times'][unit][0]

        # Skip this unit if it has no spikes
        if spike_times.shape[1] == 0:
            continue

        # First column holds the spike times
        # (.squeeze() won't work if you only have one spike time)
        unit_data = pd.DataFrame(spike_times[:, 0], columns = ['ms'])
        unit_data['unit_type'] = raw_data['sp_types'][unit][0]
        unit_data['unit_num'] = unit + 1

        unit_frames.append(unit_data)

    # A file without any spiking unit contributes no rows; skipping it here
    # avoids parsing metadata onto an empty frame.
    if not unit_frames:
        continue

    chan_data = pd.concat(unit_frames)

    # Extract channel meta-data from the file name, which is split as
    # <subject>_<text ending in 'l'><channel number>_<sign>.<extension>.
    # The values are constant for the whole file, so parse them once as
    # scalars instead of running string operations on every spike row.
    name_parts = channel.split('_')

    chan_data['subject'] = name_parts[0]
    chan_data['channel'] = np.int64(name_parts[1].split('l')[1]) # int64 to merge with micro_dict
    chan_data['sign'] = name_parts[2].split('.')[0]

    channel_frames.append(chan_data)

# Single concatenation of all channels (empty frame if nothing was loaded)
data = pd.concat(channel_frames) if channel_frames else pd.DataFrame()

  0%|          | 0/81 [00:00<?, ?it/s]

100%|██████████| 81/81 [03:09<00:00,  2.34s/it]


In [7]:
# Enrich the spike table in a single chain:
#   1. merge with the dictionary meta-data on the channel number
#   2. add spike times in seconds (converted from milliseconds)
#   3. shift unit_num by -1 to account for the offset in unit number between
#      Combinato and MATLAB, so that units can be compared between the
#      Combinato GUI and this analysis (optional)
#   4. build a unique unit ID from subject, channel, sign and unit number
#   5. rename laterality/region to make clear that they describe the unit
data = (
    data
    .merge(micro_dict, left_on = 'channel', right_on = 'number')
    .assign(
        seconds = lambda df: df['ms'] / 1000,
        unit_num = lambda df: df['unit_num'] - 1,
        # Evaluated after unit_num above, so the ID uses the shifted number
        unit_id = lambda df: (
            df['subject']
            + '_Ch' + df['channel'].astype('str')
            + '_' + df['sign']
            + '_Unit' + df['unit_num'].astype('str')
        ),
    )
    .rename(columns = {'laterality' : 'unit_laterality', 'region' : 'unit_region'})
)

### Quality Control

In [8]:
# Quality-control thresholds
isi_threshold_s = 0.003        # ISIs shorter than this (3 ms) count as violations
max_isi_violation_frac = 0.05  # units at or above this fraction of short ISIs are rejected

# Remove artifactual units
# (artifact = -1 | unassigned = 0 | MUA = 1 | SUA = 2)
#data = data[data['unit_type'] == 2] # SUA only
data = data[~data['unit_type'].isin([-1, 0])] # MUA + SUA

# Keep only units from certain regions.
# .copy() makes the filtered table an independent frame, so the column
# assignments below do not act on a slice of the previous frame
# (which triggers pandas' chained-assignment warning).
data = data[data['unit_region'].isin(selected_regions)].copy()

# Calculate ISI's between each spike by unit and mark those < 3 ms.
# NOTE: diff() follows the current row order, so this relies on spike times
# being in ascending order within each unit (assumed, not checked here);
# an out-of-order spike gives a negative diff, which is also counted as short.
data['diff'] = data.groupby('unit_id')['seconds'].diff()
data['short_isi'] = np.where(data['diff'] < isi_threshold_s, 1, 0)

# Calculate total spike count and number of ISI's < 3 ms per unit
spike_metrics = (
    data
    .groupby('unit_id')
    .agg(num_count = ('seconds', 'count'), num_isi = ('short_isi', 'sum'))
    .reset_index()
)

# Mark units whose fraction of short ISI's (relative to the spike count)
# is 5% or more
spike_metrics['perc_isi_violations'] = spike_metrics['num_isi'] / spike_metrics['num_count']
spike_metrics['isi_violator'] = np.where(
    spike_metrics['perc_isi_violations'] >= max_isi_violation_frac, 1, 0
)

# Remove units that violate ISI or have too few spikes
passes_qc = (spike_metrics['isi_violator'] == 0) & (spike_metrics['num_count'] >= min_spike_count)
spike_metrics = spike_metrics[passes_qc]
spike_metrics.to_csv(metric_out_path, index = False)

# Keep only the spikes that belong to units which passed quality control
data = data[data['unit_id'].isin(spike_metrics['unit_id'])]

# Export (without the merge key and the raw millisecond column)
data = data.drop(columns = ['number', 'ms'])
data.to_csv(spike_out_path, index = False)

In [9]:
# Display the per-unit metrics of the units that passed quality control.
spike_metrics

Unnamed: 0,unit_id,num_count,num_isi,perc_isi_violations,isi_violator
0,S05_Ch193_neg_Unit2,116897,605,0.005175,0
1,S05_Ch193_neg_Unit4,113273,364,0.003213,0
4,S05_Ch194_neg_Unit2,113955,647,0.005678,0
5,S05_Ch194_neg_Unit4,98036,337,0.003438,0
7,S05_Ch195_neg_Unit2,96054,487,0.00507,0
8,S05_Ch195_neg_Unit5,61184,219,0.003579,0
9,S05_Ch197_neg_Unit2,136297,633,0.004644,0
10,S05_Ch198_neg_Unit4,160025,1501,0.00938,0
11,S05_Ch198_neg_Unit5,125590,709,0.005645,0
13,S05_Ch199_neg_Unit2,195023,1677,0.008599,0
