### Configuration

In [6]:
import os
import numpy as np
import pandas as pd

import neo
import quantities as pq
from tqdm import tqdm
from joblib import Parallel, delayed

from elephant.conversion import BinnedSpikeTrain as elephant_bst
from elephant.spike_train_correlation import cross_correlation_histogram as elephant_cch
import elephant.spike_train_surrogates as sgt

from itertools import combinations, product

import utils__config

In [7]:
# Switch to the project working directory defined in utils__config so that
# the relative Data/ and Cache/ paths used below resolve against it
os.chdir(utils__config.working_directory)
# Last expression of the cell: displays the directory now in effect
os.getcwd()

'Z:\\Layton\\Sleep_051324'

### Parameters

This cross-correlation pipeline is implemented via what I call "double permutation": One permutation to convert the raw cross-correlograms into z-scored (or t valued) cross-correlograms, and one permutation to convert TFCE-valued cross-correlograms into p-valued cross-correlograms. My steps are below:

- First, for each unit pair (two spike trains), I compute the "original" cross-correlogram.

- Second, to pair with the above "original" cross-correlogram (independently for each unit pair), I compute a surrogate cross-correlogram by starting with the same data, but every spike time in the second spike train has been jittered randomly (you only need to jitter one, not both). This is repeated 1000 times.

- Third, I can then transform the value of each bin in the "original" cross-correlogram into a z-score by using the mean/SD from the corresponding bins of the surrogate cross-correlograms.

- Fourth, I perform TFCE for each unit pair's cross-correlogram independently, generating a single scalar TFCE value for each bin (thus, a TFCE-valued cross-correlogram). Save only the maximum TFCE bin value from each original cross-correlogram.

- Fifth, to pair with the above TFCE-valued cross-correlogram (independently for each unit pair), I generate a surrogate TFCE cross-correlogram by shuffling the bins randomly and calculating TFCE values for each bin. I save only the maximum TFCE value from the whole cross-correlogram (the highest bin value amongst all bins). This is repeated 1000 times (in order to generate a distribution of maximum TFCE values).

- Sixth, for each original cross-correlogram, its p-value is the proportion of the surrogate maximum TFCE values that exceed its own maximum TFCE value. Thus, it is significant (at α = 0.05) if its maximum TFCE value is larger than 95% of the surrogate maximum TFCE values.*

- Seventh, each unit pair's p-value will have FDR correction applied based on how many unit pairs underwent this pipeline.

*If you saved all TFCE bin values from the original cross-correlogram in Step 4, you could individually compare each original TFCE bin value to the surrogate max TFCE value distribution in Step 6. This would allow you to highlight individual bins/clusters that were significant rather than just whole cross-correlograms. However, I believe it would increase the number of comparisons and force you to calculate the FDR correction using (unit pairs x bins per correlogram) rather than just (unit pairs).

Finally, we can visualize the results with two histograms, both stratified by region pairing (e.g. CLA-ACC or CLA-AMY):

- In the first histogram, we plot the maximum TFCE bin value of the original histogram along the x-axis and the count of unit pairs along the y-axis. Coloring indicates significance. This demonstrates the total number of unit pair interactions that were analyzed with significant pairs highlighted.

- In the second histogram, for significant unit pairs, we plot the bin number (i.e. time lag) of the maximum TFCE bin value of the original histogram.

In [8]:
# Active recording session. Exactly one group of these five settings should be
# uncommented at a time; the alternatives for other sessions are kept
# commented out below.
# Spike-time table (CSV) read into the `spike_times` DataFrame
input_path = 'Data/S01_Feb02_spike_times.csv'
# Hypnogram (CSV, no header row) used to label each spike with a sleep stage
hypno_path = 'Data/S01_Feb02_hypnogram.csv' 
# Destination for the z-scored correlograms
output_path_zscore = 'Cache/S01_Feb02_zcorrelogram_b5_w10_n1000_nrem_allpairs.csv'
# Destination for the original (raw count) correlograms
output_path_original = 'Cache/S01_Feb02_ocorrelogram_b5_w10_n1000_nrem_allpairs.csv'
# Converted to seconds (x 3600) to set the stop time of every SpikeTrain
recording_length = 2.01 # hours

# input_path = 'Data/S05_Jul11_spike_times.csv'
# hypno_path = 'Data/S05_Jul11_hypnogram.csv' 
# output_path_zscore = 'Cache/S05_Jul11_zcorrelogram_b5_w10_n1000_nrem_allpairs.csv'
# output_path_original = 'Cache/S05_Jul11_ocorrelogram_b5_w10_n1000_nrem_allpairs.csv'
# recording_length = 9.69 # hours

# input_path = 'Data/S05_Jul12_spike_times.csv'
# hypno_path = 'Data/S05_Jul12_hypnogram.csv' 
# output_path_zscore = 'Cache/S05_Jul12_zcorrelogram_b5_w10_n1000_nrem_allpairs.csv'
# output_path_original = 'Cache/S05_Jul12_ocorrelogram_b5_w10_n1000_nrem_allpairs.csv'
# recording_length = 10.55 # hours

# input_path = 'Data/S05_Jul13_spike_times.csv'
# hypno_path = 'Data/S05_Jul13_hypnogram.csv' 
# output_path_zscore = 'Cache/S05_Jul13_zcorrelogram_b5_w10_n1000_nrem_allpairs.csv'
# output_path_original = 'Cache/S05_Jul13_ocorrelogram_b5_w10_n1000_nrem_allpairs.csv'
# recording_length = 10.4 # hours

In [9]:
# True: keep only spikes whose nearest hypnogram stage is 2 or 3 ('NREM')
nrem_spikes_only = True
# True: analyse every unique pair of units; False: only CLA x (AMY or ACC) pairs
all_region_pairs = True
# Number of dithered surrogate spike trains generated per unit pair
surrogate_num = 1000

# Hypnogram samples per second; sample index / this value gives time in seconds
hypno_sampling_freq = 256
# Bin width used when binning spike trains for the cross-correlogram
bin_size = 5 * pq.ms
# Lag window handed to elephant's cross_correlation_histogram
cc_window = [-10, 10] # in number of bins
# Passed as `dt` to the 'dither_spikes' surrogate method
dither = 20 * pq.ms
# Passed as n_jobs to joblib.Parallel; per the joblib docs a negative value
# means (n_cpus + 1 + n_jobs) workers, i.e. -4 leaves three CPUs free
n_cores = -4

### Format spike times into Neo SpikeTrains

In [10]:
# Read the spike table and keep only the columns needed downstream
spike_times = pd.read_csv(input_path)[['unit_laterality', 'unit_region', 'unit_id', 'seconds']]

# Recording boundaries, in seconds (recording_length is given in hours)
t_start = 0
t_stop = recording_length * 3600

# Hypnogram stage codes:
#   -2 = Unassigned
#   -1 = Artifact
#    0 = Awake
#    1 = N1
#    2 = N2
#    3 = N3
#    4 = REM

# Read the header-less hypnogram; the row index becomes the sample number
hypno = pd.read_csv(hypno_path, header=None).reset_index()
hypno.columns = ['sample', 'stage']

# Convert sample number to time in seconds
hypno['time'] = hypno['sample'] / hypno_sampling_freq

# Collapse the stage codes into two labels: N2/N3 -> 'NREM', everything else -> 'WREM'
is_nrem = hypno['stage'].isin([2, 3])
hypno['stage'] = np.where(is_nrem, 'NREM', 'WREM')

# Label every spike with the stage of the hypnogram sample nearest in time,
# then discard the helper columns brought in by the merge
spike_times = pd.merge_asof(
    spike_times.sort_values('seconds'),
    hypno.sort_values('time'),
    left_on='seconds',
    right_on='time',
    direction='nearest',
).drop(columns=['sample', 'time'])

In [11]:
# Optionally restrict the analysis to spikes labelled 'NREM'
if nrem_spikes_only:
    spike_times = spike_times.loc[spike_times['stage'] == 'NREM']

# Look-up table unit_id -> unit_region (the first row seen for each unit is used)
unit_region_map = (
    spike_times
    .drop_duplicates(subset=['unit_id'])
    .set_index('unit_id')['unit_region']
    .to_dict()
)

# Build one annotated Neo SpikeTrain per unit, in order of first appearance
spike_trains = []
for unit_id in spike_times['unit_id'].unique():
    # Spike times (seconds) belonging to this unit
    belongs_to_unit = spike_times['unit_id'] == unit_id
    unit_times = spike_times.loc[belongs_to_unit, 'seconds'].to_numpy()

    # Every train spans the whole recording, regardless of which spikes were kept
    st = neo.SpikeTrain(unit_times * pq.s, t_start=t_start * pq.s, t_stop=t_stop * pq.s)

    # Attach the metadata read back later when pairing units and naming results
    st.annotate(unit_id=unit_id, unit_region=unit_region_map[unit_id])

    spike_trains.append(st)

### Define functions

In [12]:
# Define function to compute cross-correlation histograms
def compute_cross_correlation(spike_train_1, spike_train_2, bin_size, window=None):
    """Return the cross-correlation histogram values for two spike trains.

    Both trains are binned with ``bin_size`` and handed to elephant's
    ``cross_correlation_histogram`` with border correction, binarisation and
    kernel smoothing all disabled.

    Parameters
    ----------
    spike_train_1, spike_train_2
        Spike trains accepted by elephant's ``BinnedSpikeTrain``.
    bin_size
        Bin width forwarded to ``BinnedSpikeTrain``.
    window : optional
        Lag window forwarded to ``cross_correlation_histogram``. When omitted
        (``None``), the notebook-level ``cc_window`` setting is used, which
        is the behaviour existing callers rely on.

    Returns
    -------
    numpy.ndarray
        The histogram magnitudes flattened to one dimension.
    """
    # Fall back to the notebook-wide setting so three-argument calls keep working
    if window is None:
        window = cc_window

    binned_st1 = elephant_bst(spike_train_1, bin_size=bin_size)
    binned_st2 = elephant_bst(spike_train_2, bin_size=bin_size)

    # The second return value (the lags) is not needed by any caller
    cc_hist, _ = elephant_cch(binned_st1, binned_st2, window=window,
                              border_correction=False, binary=False, kernel=None)

    return cc_hist.magnitude.flatten()

In [13]:
def compute_and_annotate_correlations(spike_train_1, spike_train_2, bin_size, surrogate_num, dither):
    """Z-score one unit pair's cross-correlogram against dithered surrogates.

    The original cross-correlogram of the pair is compared, lag by lag, with
    the cross-correlograms obtained after replacing ``spike_train_2`` by each
    of ``surrogate_num`` spike-dithered surrogates.

    Parameters
    ----------
    spike_train_1, spike_train_2
        Spike trains carrying a ``unit_id`` annotation.
    bin_size
        Bin width forwarded to ``compute_cross_correlation``.
    surrogate_num
        Number of surrogates to generate from ``spike_train_2``.
    dither
        Dither amount passed as ``dt`` to the ``'dither_spikes'`` method.

    Returns
    -------
    tuple of (dict, dict)
        ``(z_scored_result, original_result)``. Both dicts hold ``unit_1``,
        ``unit_2`` and one ``lag_<k>`` entry per lag (k starts at 1), with
        z-scores in the first and the original histogram values in the second.
        A lag whose surrogate standard deviation is zero has an undefined
        z-score and is reported as NaN rather than +/-inf.
    """
    # Generate surrogate spike trains for spike_train_2 only
    surrogate_spike_trains = sgt.surrogates(spiketrain=spike_train_2, n_surrogates=surrogate_num,
                                            method='dither_spikes', dt=dither)

    # Cross-correlogram of the original pair
    original_hist_values = compute_cross_correlation(spike_train_1, spike_train_2, bin_size)

    # One row per surrogate, one column per lag
    surrogate_values = np.zeros((surrogate_num, len(original_hist_values)))
    for i, surrogate in enumerate(surrogate_spike_trains):
        surrogate_values[i, :] = compute_cross_correlation(spike_train_1, surrogate, bin_size)

    # Per-lag mean and standard deviation of the surrogate distribution
    surrogate_mean = np.mean(surrogate_values, axis=0)
    surrogate_std = np.std(surrogate_values, axis=0)

    # Z-score each lag. Where every surrogate produced the same count the
    # standard deviation is zero and the z-score is undefined: leave those
    # lags as NaN instead of dividing by zero (which would yield inf/NaN plus
    # RuntimeWarnings and silently contaminate later steps).
    z_scores = np.full(surrogate_std.shape, np.nan)
    np.divide(original_hist_values - surrogate_mean, surrogate_std,
              out=z_scores, where=surrogate_std > 0)

    # Unit IDs identify the pair in both output rows
    unit_id_1 = spike_train_1.annotations['unit_id']
    unit_id_2 = spike_train_2.annotations['unit_id']

    z_scored_result = {"unit_1": unit_id_1, "unit_2": unit_id_2}
    original_result = {"unit_1": unit_id_1, "unit_2": unit_id_2}

    # Name lag columns by ordinal position (lag_1 is the first lag)
    for ordinal, z_score in enumerate(z_scores, start=1):
        z_scored_result[f"lag_{ordinal}"] = z_score

    for ordinal, value in enumerate(original_hist_values, start=1):
        original_result[f"lag_{ordinal}"] = value

    return z_scored_result, original_result

### Compute cross-correlation with original and surrogate spike trains

In [14]:
if not all_region_pairs:
    # Split the units by annotated region
    cla_spike_trains = [
        train for train in spike_trains
        if train.annotations['unit_region'] == 'CLA'
    ]
    amy_acc_spike_trains = [
        train for train in spike_trains
        if train.annotations['unit_region'] in ['AMY', 'ACC']
    ]

    # Pair every CLA unit with every AMY or ACC unit
    spike_train_pairs = list(product(cla_spike_trains, amy_acc_spike_trains))

else:
    # Every unordered pair of distinct units, regardless of region
    spike_train_pairs = list(combinations(spike_trains, 2))

In [15]:
# Run every unit pair through the surrogate pipeline using joblib workers.
# Each job returns a (z-scored row, original row) tuple for one pair.
results = Parallel(n_jobs=n_cores)(
    delayed(compute_and_annotate_correlations)(first_train, second_train, bin_size, surrogate_num, dither)
    for first_train, second_train in tqdm(spike_train_pairs)
)

# Split the per-pair tuples into two parallel lists of row dictionaries
zscored_correlograms = [z_row for z_row, _ in results]
original_correlograms = [raw_row for _, raw_row in results]

# One DataFrame row per unit pair: unit_1, unit_2, then the lag_<k> columns
zrellogram = pd.DataFrame(zscored_correlograms)
orellogram = pd.DataFrame(original_correlograms)

 44%|████▎     | 290/666 [7:23:49<10:30:36, 100.63s/it]

In [None]:
# Make sure the destination folders exist before writing: a missing folder
# would otherwise raise here and discard the results of the long computation
for _output_path in (output_path_zscore, output_path_original):
    _output_dir = os.path.dirname(_output_path)
    if _output_dir:  # empty when the path has no directory component
        os.makedirs(_output_dir, exist_ok=True)

# Write the z-scored and the original correlogram tables, without the row index
zrellogram.to_csv(output_path_zscore, index = False)
orellogram.to_csv(output_path_original, index = False)