recoding.ipynb

Recode data for ACN5314 Group Project, CRCNS pfc-6 data.

Last edit:  23 MAR 2022, CDR ||
Created:    08 MAR 2022, CDR

In [1]:
import pandas as pd
import numpy as np
from scipy import signal

In [2]:
# DECLARE GLOBALS


# Folder containing all session data
data_folder = '../pfc-6/mPFC_Data'

# column headers (see pfc-6 documentation for details)
df_headers = {
    "Behavior"  :    ['trialStart', 'trialEnd', 'rule', 'correct', 'direction','lightPos'],
    "SpikeData" :   ['spikeTime','cellID'],
    "CellType"  :   ['cellType'],
    "WakeEpoch" :   ['behaviorStart','behaviorEnd'],
    "SwsPost"   :   ['behaviorStart','behaviorEnd'],
    "SwsPre"    :   ['behaviorStart','behaviorEnd'],
    "Pos"       :   ['time','x','y']
}

df_column_specs = { # pd.read_fwf() incorrectly infers spacing of the long time numbers from the default 100 first rows. Times range to 9 spaces (8 digits + decimal), but first 100 rows do not represent this.
    "Behavior"  :   [(0,9),(10,20),(26,33),(36,41),(46,52),(56,61)],
    "SpikeData" :   [(0,9),(10,20)]
    # ,
    # "CellType"  :   'infer',
    # "WakeEpoch" :   'infer',
    # "SwsPost"   :   'infer',
    # "SwsPre"    :   'infer',
    # "Pos"       :   'infer'
}

In [3]:
# get all the files in data_folder
dir = os.listdir(data_folder)

# alphabetical sort
dir = sorted(dir)

# remove os files like ds_store; all the data folders are numeric (session ID: IDMMDD - ID: animal number, MM: month, DD: day)
for q in dir:
    if not q.isnumeric():
        dir.remove(q)

# dir

In [4]:
def get_df_from_file(df_id, session_id):
    """
    Read in specified .dat file
    df_id: String; which df type to read in. DF types, described in dataset documentation, can be:
        "Behavior"
        "SpikeData"
        "CellType"
        "WakeEpoch"
        "SwsPost"
        "SwsPre"
        "Pos"
    session_id: numeric rat/session. 6 digits: ID|MM|DD
    """
    name = "%s_%s"%(session_id,df_id)
    file_name = "%s/%s/%s_%s.dat"%(data_folder,session_id,session_id,df_id)

    cols = df_headers.get(df_id)
    assert cols != None, "Invalid dataframe Id"
    
    colspecs = df_column_specs.get(df_id)
    if colspecs is None:
        colspecs='infer' #read_fwf should have no trouble inferring these cols

    df = pd.read_fwf(file_name, name=cols, header=None, colspecs=colspecs)
    df.name = name
    df.columns = cols

    if df_id == "SpikeData":
        df['cellID'] = df['cellID'].astype('int8')

    return df

In [5]:
# todo: cycle through all sessions
session_id = '150628' # 6dig; IDMMDD - animal ID, month, day

# get all the dataframes we have for this session
# ~30s
beh_df       = get_df_from_file("Behavior", session_id)
spike_df     = get_df_from_file("SpikeData", session_id)
cell_df      = get_df_from_file("CellType", session_id)
wake_df      = get_df_from_file("WakeEpoch", session_id)
sws_pre_df   = get_df_from_file("SwsPre", session_id)
sws_post_df  = get_df_from_file("SwsPost", session_id)
pos_df       = get_df_from_file("Pos", session_id)

In [None]:
# show head of each

# print("---BEHAVIOR---")
# print(beh_df.head())

# print("\n---SPIKE DATA---")
# print(spike_df.head())

# print("\n---CELL TYPE---")
# print(cell_df.head())

# print("\n---WAKE---")
# print(wake_df.head())

# print("\n---SWS PRE TIMES---")
# print(sws_pre_df.head())

# print("\n---SWS POST TIMES---")
# print(sws_post_df.head())

# print("\n---POSITION---")
# print(pos_df.head())

In [6]:
# empty_channel: ensure that there is a "sample" for each timepoint
# takes a long time; easier to make once and drop rows to match max for each

# max time: 30pre + 40beh + 30post = 100 minutes * 6e4 ms/min * 10 samples/ms
# (~30s, one time)
max_time_length = int(100 * 6e4 * 10)

empty_channel = pd.DataFrame()
empty_channel['spikeTime'] = list(range(0,max_time_length+1))
empty_channel['cellID'] = np.int8(-1)


In [12]:
def spike_to_time_domain(spike_sd):
    """
    Converts spike-domain data to time-domain
    spike_sd: spike domain spike data
    
    return time_sd (time domain spike data)
    (~100s per session)
    """

    #---add correct length empty channel for continuity (~7s)---#
    time_length = (spike_df.max()['spikeTime']*10).astype(int)
    spike_sd_ext = pd.concat(objs=[spike_sd,empty_channel.drop(list(range(time_length,max_time_length+1)))])
    
    #---pivot to time domain (~90s)---#
    spike_sd_ext['spike']=True
    time_sd = spike_sd_ext.pivot(index="cellID", columns="spikeTime",values="spike")
    
    #---drop empty channel, first row (short)---#
    time_sd = time_sd.tail(-1)

    return time_sd

In [13]:
time_sd = spike_to_time_domain(spike_df)
time_sd

spikeTime,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,45366087.0,45366088.0,45366089.0,45366090.0,45366091.0,45366092.0,45366093.0,45366094.0,45366095.0,45366096.0
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [None]:
def freq_transform(time_domain_spike_data, epochs):    
    """
    weighted average (by time length) of spectrograms for a given behavior state
    spectrogram for each neuron
    
    format:
    neuron |    10Hz    100Hz   1000Hz
    1       |    7       0       4
    2       |    1       1       1
    3       |    ...etc
    4       |
    5       |
    """
    

    return

def freq_correlation(time_domain_spike_data,swsPre_data,wake_df):
    """
    # todo
    
    for each neuron, correlation = (sws_pre_spectrogram) * (wake_spectrogram)' {dot product}

    todo: cut sessions with >1 rule
    """
    return

In [None]:
# FINAL DF:
# features:
#   - session rule          R/L/light/dark (1 hot coded); check order
#   - light-based rule?     T if rule=light/dark, false if rule=L/R (direction-based)
#   - frequency corr        by neuron; see function freq_correlation
#   - cell types            by neuron