recoding.ipynb

Recoding data for ACN5314 Group Project, CRCNS pfc-6 data.

Last edit:  07 APR 2022, CDR ||
Created:    08 MAR 2022, CDR

In [1]:
import os

import numpy as np
import pandas as pd
from scipy import fft
# import matplotlib.pyplot as plt

In [2]:
# GLOBAL CONFIGURATION


# Root folder that holds one sub-folder per recording session
data_folder = '../pfc-6/mPFC_Data'

# Column names for each .dat table type (see pfc-6 documentation for details)
df_headers = dict(
    Behavior=['trialStart', 'trialEnd', 'rule', 'correct', 'direction', 'lightPos'],
    SpikeData=['spikeTime', 'cellID'],
    CellType=['cellType'],
    WakeEpoch=['behaviorStart', 'behaviorEnd'],
    SwsPost=['behaviorStart', 'behaviorEnd'],
    SwsPre=['behaviorStart', 'behaviorEnd'],
    Pos=['time', 'x', 'y'],
)

# Explicit (start, end) character spans handed to pd.read_fwf() as `colspecs`.
# By default read_fwf infers the field widths from the first 100 rows only, and
# those rows do not contain the widest time stamps (up to 9 characters: 8 digits
# plus the decimal point), so the two tables below are pinned by hand.
# Table types that are not listed here have their layout inferred.
df_column_specs = dict(
    Behavior=[(0, 9), (10, 20), (26, 33), (36, 41), (46, 52), (56, 61)],
    SpikeData=[(0, 9), (10, 20)],
)

In [3]:
# List the session folders inside data_folder, sorted alphabetically.
# Session folders are named by a numeric session ID (IDMMDD - ID: animal number,
# MM: month, DD: day); every other entry (OS files like ds_store) is left out.
# The filter builds a new list on purpose: removing entries from a list while
# iterating over it skips the element that follows each removed entry, so two
# consecutive non-numeric names would leave the second one in place.
# NOTE: the name `dir` shadows the Python builtin; it is kept unchanged so that
# any other cell referring to it keeps working.
dir = sorted(q for q in os.listdir(data_folder) if q.isnumeric())

# dir

In [4]:
def get_df_from_file(df_id, session_id):
    """
    Read one fixed-width .dat table of a recording session into a DataFrame.

    The file is read from <data_folder>/<session_id>/<session_id>_<df_id>.dat.

    df_id: String; which df type to read in. DF types, described in dataset documentation, can be:
        "Behavior"
        "SpikeData"
        "CellType"
        "WakeEpoch"
        "SwsPost"
        "SwsPre"
        "Pos"
    session_id: numeric rat/session. 6 digits: ID|MM|DD

    Returns: pandas DataFrame whose columns are named after df_headers[df_id]
        and whose `name` attribute is "<session_id>_<df_id>". For "SpikeData"
        the cellID column is cast to int8.

    Raises: AssertionError if df_id is not a key of df_headers.
    """
    # Validate the table type before touching the file system
    cols = df_headers.get(df_id)
    assert cols is not None, "Invalid dataframe Id"

    name = "%s_%s" % (session_id, df_id)
    file_name = "%s/%s/%s.dat" % (data_folder, session_id, name)

    # Use the hand-specified column spans when they exist; otherwise
    # read_fwf should have no trouble inferring these cols
    colspecs = df_column_specs.get(df_id)
    if colspecs is None:
        colspecs = 'infer'

    # header=None: the first line of the file is read as data, not as column names
    df = pd.read_fwf(file_name, header=None, colspecs=colspecs)
    df.name = name
    # Assigning the names after parsing raises if the number of parsed columns
    # differs from the expected header list
    df.columns = cols

    if df_id == "SpikeData":
        # NOTE(review): int8 only holds -128..127; assumes cell IDs fit - confirm
        df['cellID'] = df['cellID'].astype('int8')

    return df

In [5]:
# Session to recode. 6dig; IDMMDD - animal ID, month, day
session_id = '201222'

# Load all the dataframes we have for this session, in one pass (~30s).
# The "Pos" table is currently not loaded:
# pos_df       = get_df_from_file("Pos", session_id)
beh_df, spike_df, cell_df, wake_df, sws_pre_df, sws_post_df = (
    get_df_from_file(table_id, session_id)
    for table_id in ("Behavior", "SpikeData", "CellType", "WakeEpoch", "SwsPre", "SwsPost")
)

In [6]:
# show head of each dataframe

# print("---BEHAVIOR---")
# print(beh_df.head())

# print("\n---SPIKE DATA---")
# print(spike_df.head())

# print("\n---CELL TYPE---")
# print(cell_df.head())

# print("\n---WAKE---")
# print(wake_df.head())

# print("\n---SWS PRE TIMES---")
# print(sws_pre_df.head())

# print("\n---SWS POST TIMES---")
# print(sws_post_df.head())

# print("\n---POSITION---")
# print(pos_df.head())

In [7]:
def make_bins(spike_df, timing_df, bin_length, behavior=False, n_cells=31):
    '''
    Count spikes per cell in consecutive fixed-length time bins.

    Every row of timing_df describes one interval. Each interval is cut into bins
    that start at np.arange(start, end, bin_length); a bin covers the half-open
    span [bin_start, bin_start + bin_length), so the last bin of an interval may
    reach past the interval's end.

    spike_df:               pd dataframe storing spike data (columns 'spikeTime' and 'cellID')
    timing_df:              pd dataframe storing session times (sws_pre_df, sws_post_df, or beh_df)
    bin_length (Δt):        length of bins, in the same units as the time columns
                            (noted as ms by the original author - TODO confirm against the dataset docs)

    behavior (def=False):   if True, timing_df is the behavior dataframe (columns 'trialStart',
                            'trialEnd', 'correct' instead of 'behaviorStart', 'behaviorEnd').
                            Output has one extra column, "correct", which stores the 'correct'
                            value of the trial this bin came from
    n_cells (def=31):       number of cells; output has one count column per cell ID 1..n_cells.
                            Spikes with any other cell ID are not counted

    returns:                pd dataframe of int64, indexed by bin start time, one row per bin.
                            If the same bin start occurs more than once, the last occurrence wins.
    '''
    cell_ids = pd.RangeIndex(start=1, stop=n_cells + 1)
    if behavior:
        start_col, end_col = 'trialStart', 'trialEnd'
    else:
        start_col, end_col = 'behaviorStart', 'behaviorEnd'

    # Sort the spikes once so that every bin is located with two binary searches
    # instead of a boolean mask over the whole spike table. The input is not modified.
    spike_times = spike_df['spikeTime'].to_numpy()
    order = np.argsort(spike_times, kind='stable')
    spike_times = spike_times[order]
    spike_cells = spike_df['cellID'].to_numpy()[order]

    # Rows are collected in dicts keyed by bin start and turned into a DataFrame
    # once at the end; growing a DataFrame row by row re-allocates it every time.
    counts_by_bin = {}
    correct_by_bin = {}
    for _, interval in timing_df.iterrows():
        start = interval[start_col]
        end = interval[end_col]
        trial_correct = interval['correct'] if behavior else None

        for bin_start in np.arange(start, end, bin_length):
            # Spikes with bin_start <= spikeTime < bin_start + bin_length
            first = np.searchsorted(spike_times, bin_start, side='left')
            last = np.searchsorted(spike_times, bin_start + bin_length, side='left')

            # Number of spikes by cell ID (position 0 of bincount is dropped: IDs start at 1)
            ids = spike_cells[first:last]
            ids = ids[(ids >= 1) & (ids <= n_cells)].astype(np.int64)
            counts_by_bin[bin_start] = np.bincount(ids, minlength=n_cells + 1)[1:]
            correct_by_bin[bin_start] = trial_correct

    # reshape keeps the (0, n_cells) shape when there are no bins at all
    counts = np.array(list(counts_by_bin.values()), dtype=np.int64).reshape(len(counts_by_bin), n_cells)
    df = pd.DataFrame(counts, index=list(counts_by_bin.keys()), columns=cell_ids)
    if behavior:
        # Assigned as a whole column: writing through df.loc[row]['correct'] only
        # changes a temporary copy and leaves the frame untouched.
        df['correct'] = list(correct_by_bin.values())
    return df.fillna(0).astype(np.int64)

    

In [8]:
# Bin the spikes that fall inside the SwsPre epochs and write the counts to CSV.
# The file name is built from session_id so the output always carries the ID of
# the session that was actually loaded.
# NOTE(review): bin_length is in the units of the time stamps - confirm that 100 is the intended bin width.
os.makedirs('./recoded_data', exist_ok=True)  # to_csv does not create missing folders
pre_sws_bins = make_bins(spike_df, timing_df=sws_pre_df, bin_length=100)
pre_sws_bins.to_csv('./recoded_data/%s_pre_sws_binned' % session_id)

In [9]:
# Bin the spikes that fall inside the SwsPost epochs and write the counts to CSV.
# The file name is built from session_id so the output always carries the ID of
# the session that was actually loaded.
# NOTE(review): bin_length is in the units of the time stamps - confirm that 100 is the intended bin width.
os.makedirs('./recoded_data', exist_ok=True)  # to_csv does not create missing folders
post_sws_bins = make_bins(spike_df, timing_df=sws_post_df, bin_length=100)
post_sws_bins.to_csv('./recoded_data/%s_post_sws_binned' % session_id)

In [10]:
# Bin the spikes that fall inside the behavior trials (with the per-trial
# "correct" column) and write the counts to CSV.
# The file name is built from session_id so the output always carries the ID of
# the session that was actually loaded.
# NOTE(review): bin_length is in the units of the time stamps - confirm that 100 is the intended bin width.
os.makedirs('./recoded_data', exist_ok=True)  # to_csv does not create missing folders
beh_bins = make_bins(spike_df, timing_df=beh_df, bin_length=100, behavior=True)
beh_bins.to_csv('./recoded_data/%s_beh_binned' % session_id)