recoding_all.ipynb

Recode data for ACN5314 Group Project, CRCNS pfc-6 data.

Last edit:  03 APR 2022, CDR ||
Created:    03 APR 2022, CDR

In [2]:
import os

import numpy as np
import pandas as pd
from scipy import fft

In [3]:
# DECLARE GLOBALS

# Root directory that holds all of the session data
data_folder = '../pfc-6/mPFC_Data'

# Column names for each .dat table type (see pfc-6 documentation for details).
# Keys are the table-type IDs used to build the file names.
df_headers = dict(
    Behavior=['trialStart', 'trialEnd', 'rule', 'correct', 'direction', 'lightPos'],
    SpikeData=['spikeTime', 'cellID'],
    CellType=['cellType'],
    WakeEpoch=['behaviorStart', 'behaviorEnd'],
    SwsPost=['behaviorStart', 'behaviorEnd'],
    SwsPre=['behaviorStart', 'behaviorEnd'],
    Pos=['time', 'x', 'y'],
)

# Explicit fixed-width column boundaries for the table types where pd.read_fwf() gets them wrong.
# read_fwf() infers the spacing from the first 100 rows by default, and those rows do not show the
# full width of the long time values (up to 9 characters: 8 digits + decimal point).
# Table types without an entry here ("CellType", "WakeEpoch", "SwsPost", "SwsPre", "Pos")
# are left for read_fwf() to infer.
df_column_specs = dict(
    Behavior=[(0, 9), (10, 20), (26, 33), (36, 41), (46, 52), (56, 61)],
    SpikeData=[(0, 9), (10, 20)],
)

In [4]:
# List the session folders inside data_folder, sorted alphabetically.
# Session folders are all-numeric (session ID: IDMMDD - ID: animal number, MM: month, DD: day);
# anything else (OS files like ds_store) is dropped.
# The filter builds a new list instead of calling .remove() while looping over the same list:
# removing during iteration shifts the remaining items and skips the entry right after each
# removed one, so two adjacent non-numeric names would leave one behind.
# NOTE: the name `dir` shadows the builtin dir(); it is kept because the loading loop iterates over it.
dir = sorted(entry for entry in os.listdir(data_folder) if entry.isnumeric())

# dir

In [5]:
def get_df_from_file(df_id, session_id):
    """
    Read in specified .dat file as a DataFrame.

    The file is read from "<data_folder>/<session_id>/<session_id>_<df_id>.dat" with
    pd.read_fwf (fixed-width columns, no header row).

    df_id: String; which df type to read in. DF types, described in dataset documentation, can be:
        "Behavior"
        "SpikeData"
        "CellType"
        "WakeEpoch"
        "SwsPost"
        "SwsPre"
        "Pos"
    session_id: numeric rat/session. 6 digits: ID|MM|DD

    Returns: DataFrame whose columns are df_headers[df_id]. A `name` attribute
        "<session_id>_<df_id>" is set on the returned object.
    Raises: AssertionError if df_id is not a key of df_headers; ValueError if the number of
        parsed columns differs from the number of expected headers; whatever pd.read_fwf
        raises for a missing or unparsable file.
    """
    name = "%s_%s"%(session_id,df_id)
    file_name = "%s/%s/%s_%s.dat"%(data_folder,session_id,session_id,df_id)

    cols = df_headers.get(df_id)
    assert cols is not None, "Invalid dataframe Id"

    # Use the hand-written column boundaries when there are any; otherwise let read_fwf infer them
    # (read_fwf should have no trouble inferring the remaining table types).
    colspecs = df_column_specs.get(df_id, 'infer')

    # The previous call also passed `name=cols`, which is not a read_fwf parameter (the real one is
    # `names`) and had no effect on the result. Column names are assigned explicitly below instead,
    # which also checks that the parsed column count matches the expected headers.
    df = pd.read_fwf(file_name, header=None, colspecs=colspecs)
    df.columns = cols
    # Plain instance attribute used as a label; NOTE(review): presumably not carried over to
    # DataFrames derived from this one - confirm before relying on it downstream.
    df.name = name

    if df_id == "SpikeData":
        # NOTE(review): int8 assumes every cellID fits in -128..127 - confirm against the data,
        # larger IDs would not survive this cast intact.
        df['cellID'] = df['cellID'].astype('int8')

    return df

def get_all_df(session_id):
    """
    Load every table type for the specified session.
    (~30s)

    session_id: session folder ID, passed unchanged to get_df_from_file.

    Returns a 7-tuple of DataFrames, in this order:
        (Behavior, SpikeData, CellType, WakeEpoch, SwsPre, SwsPost, Pos)
    """
    # Files are read in the same order as they appear in the returned tuple.
    table_order = ("Behavior", "SpikeData", "CellType", "WakeEpoch", "SwsPre", "SwsPost", "Pos")

    loaded = []
    for table_id in table_order:
        loaded.append(get_df_from_file(table_id, session_id))

    return tuple(loaded)

In [6]:
# 4min 20sec to load all. only issue is with 150704. no pre-behavior SWS (empty sws_pre df)
sessions_with_error = []      # session IDs whose files could not be loaded
session_error_details = {}    # session ID -> repr() of the exception that was raised

for session_id in dir:
    try:
        beh_df, spike_df, cell_df, wake_df, sws_pre_df, sws_post_df, pos_df = get_all_df(session_id)
        # todo: make refactored data table from these. return csv.
    except Exception as err:
        # `except Exception` rather than a bare `except:` so that KeyboardInterrupt / SystemExit
        # still stop this long-running loop instead of being logged as a failed session.
        sessions_with_error.append(session_id)
        # Keep only the repr: storing the exception object itself would keep its traceback
        # (and the frames it references) alive.
        session_error_details[session_id] = repr(err)

print(sessions_with_error)

['150704']
