recoding.ipynb

Scratch file for recoding data for ACN5314 Group Project, CRCNS pfc-6 data.

Last edit:  28 MAR 2022, CDR ||
Created:    08 MAR 2022, CDR

In [4]:
import os

import numpy as np
import pandas as pd
from scipy import fft
# import matplotlib.pyplot as plt

In [5]:
# DECLARE GLOBALS

# Root folder that holds all session data.
data_folder = '../pfc-6/mPFC_Data'

# Column names for every table type (see pfc-6 documentation for details).
df_headers = dict(
    Behavior=['trialStart', 'trialEnd', 'rule', 'correct', 'direction', 'lightPos'],
    SpikeData=['spikeTime', 'cellID'],
    CellType=['cellType'],
    WakeEpoch=['behaviorStart', 'behaviorEnd'],
    SwsPost=['behaviorStart', 'behaviorEnd'],
    SwsPre=['behaviorStart', 'behaviorEnd'],
    Pos=['time', 'x', 'y'],
)

# Explicit fixed-width column boundaries, given as (start, end) character offsets.
# pd.read_fwf() infers column spacing from the first 100 rows by default, and for
# the long time values that inference is wrong: times run up to 9 characters
# (8 digits + decimal point), which the first 100 rows do not show.
# Table types that are not listed here have no explicit spec.
df_column_specs = dict(
    Behavior=[(0, 9), (10, 20), (26, 33), (36, 41), (46, 52), (56, 61)],
    SpikeData=[(0, 9), (10, 20)],
)

In [6]:
# List everything in data_folder and keep only the session folders, sorted
# alphabetically. Session folders are purely numeric (session ID: IDMMDD -
# ID: animal number, MM: month, DD: day); OS files like .DS_Store are not.
#
# The filter builds a new list instead of calling .remove() inside a loop over
# the same list: removing while iterating skips the element right after each
# removal, so two consecutive non-numeric entries would leave one behind.
#
# NOTE: the variable name `dir` shadows the builtin; it is kept so that any
# existing reference to it keeps working.
dir = sorted(entry for entry in os.listdir(data_folder) if entry.isnumeric())

# dir

In [7]:
def get_df_from_file(df_id, session_id):
    """
    Read one session's fixed-width .dat table into a DataFrame.

    df_id: String; which df type to read in. Must be a key of df_headers.
        DF types, described in dataset documentation, can be:
        "Behavior"
        "SpikeData"
        "CellType"
        "WakeEpoch"
        "SwsPost"
        "SwsPre"
        "Pos"
    session_id: numeric rat/session. 6 digits: ID|MM|DD

    Returns:
        DataFrame whose columns are df_headers[df_id]. The frame also gets a
        plain `name` attribute, "<session_id>_<df_id>".

    Raises:
        AssertionError if df_id is not a key of df_headers.
    """
    cols = df_headers.get(df_id)
    assert cols is not None, "Invalid dataframe Id"

    name = "%s_%s" % (session_id, df_id)
    file_name = "%s/%s/%s.dat" % (data_folder, session_id, name)

    # Use the hand-written column boundaries when we have them; otherwise let
    # read_fwf infer them from the file.
    colspecs = df_column_specs.get(df_id)
    if colspecs is None:
        colspecs = 'infer'

    # The file has no header row. Column names are assigned right after the
    # read, so a mismatch between the parsed column count and `cols` fails
    # loudly. (The previous `name=cols` keyword is not a read_fwf parameter and
    # had no effect.)
    df = pd.read_fwf(file_name, header=None, colspecs=colspecs)
    df.name = name  # plain Python attribute on this object, not a column
    df.columns = cols

    if df_id == "SpikeData":
        # NOTE(review): int8 tops out at 127 - confirm no session has more cell IDs than that.
        df['cellID'] = df['cellID'].astype('int8')

    return df

In [38]:
# Session IDs in which the animal learned the rule (see pfc-6 documentation),
# grouped by animal.
learned_sessions = (
    ['150628', '150630', '150707']      # A15
    + ['181012', '181020']              # A18
    + ['190214', '190228']              # A19
    + ['201222', '201227', '201229']    # A20
)


def session_summary(session_id, print_summary=True):
    '''
    Collect the three per-session summaries in one call.

    Runs, in this order:
        - behavior_summary(session_id, print_summary)
        - sleep_summary(session_id, 'SwsPre', print_summary)
        - sleep_summary(session_id, 'SwsPost', print_summary)

    print_summary (default=True) is forwarded unchanged to each of them.

    Returns:
        tuple (behavior summary, pre-SWS summary, post-SWS summary), each
        element being whatever the corresponding function returned.
    '''
    return (
        behavior_summary(session_id, print_summary),
        sleep_summary(session_id, 'SwsPre', print_summary),
        sleep_summary(session_id, 'SwsPost', print_summary),
    )


def behavior_summary(session_id, print_summary=False):
    '''
    Return summary of behavior for a session.

    session_id: session whose "Behavior" table is loaded via get_df_from_file.
    print_summary (default=False): if True, print summary to console before
        returning values. else, simply return values.
        Print format:
            - Number correct trials/total (% correct)
            - Total time (average per trial)

    Returns:
        tuple (correct_count, total_count, total_time)
            correct_count: int, number of trials whose 'correct' value equals 1
            total_count:   int, number of rows in the table
            total_time:    float, sum over trials of trialEnd - trialStart

    An empty table yields (0, 0, 0.0); the printed percentage and average are
    then shown as nan instead of raising ZeroDivisionError.
    '''
    beh_df = get_df_from_file("Behavior", session_id)
    total_count = len(beh_df)

    # Count trials flagged correct. A session with no correct trials simply
    # gives 0; int() keeps the return type the same in every case.
    correct_count = int((beh_df['correct'] == 1).sum())

    # Column-wise subtraction does not depend on the row labels being 0..n-1,
    # unlike a per-row .loc[i, ...] loop.
    total_time = float((beh_df['trialEnd'] - beh_df['trialStart']).sum())

    # Report/return
    if print_summary:
        if total_count:
            percent_correct = 100 * correct_count / total_count
            average_time = total_time / total_count
        else:
            # nothing to average over
            percent_correct = average_time = float('nan')
        print(
            "%s: Behavior Summary:\n"
            "            -%i trials correct/%i total (%.1f%%) \n"
            "            -%.1fms total (%.1f ms/trial avg)"
            % (
                session_id,
                correct_count, total_count, percent_correct,
                total_time, average_time,
            )
        )
    return (correct_count, total_count, total_time)


def sleep_summary(session_id, pre_post, print_summary=False):
    '''
    For a 'sleep' period in a session (either pre or post), return summary of SWS.

    session_id: session whose table is loaded via get_df_from_file.
    pre_post: which table to summarize; must be "SwsPre" or "SwsPost".
    print_summary (default=False): if True, print the summary before returning.
        Print format:
            - Epoch count
            - Total time (average per epoch)

    Returns:
        tuple (epoch_count, total_time)
            epoch_count: int, number of rows in the table
            total_time:  float, sum over epochs of behaviorEnd - behaviorStart

    Raises:
        KeyError if pre_post is neither "SwsPre" nor "SwsPost".

    With zero epochs the printed average is shown as nan instead of raising
    ZeroDivisionError.
    '''
    if pre_post not in ("SwsPre", "SwsPost"):
        # %-formatting (rather than string concatenation) so that a non-string
        # argument still produces this KeyError and not a TypeError.
        raise KeyError("'%s' is not a valid dataframe title." % (pre_post,))

    sleep_df = get_df_from_file(pre_post, session_id)
    epoch_count = len(sleep_df)

    # Column-wise subtraction does not depend on the row labels being 0..n-1,
    # unlike a per-row .loc[i, ...] loop.
    total_time = float((sleep_df['behaviorEnd'] - sleep_df['behaviorStart']).sum())

    # Report/return
    if print_summary:
        average_time = total_time / epoch_count if epoch_count else float('nan')
        print(
            "%s: %s Summary:\n"
            "            -%i SWS epochs \n"
            "            -%.1fms total (%.1f ms/epoch avg)"
            % (
                session_id, pre_post,
                epoch_count,
                total_time, average_time,
            )
        )
    return (epoch_count, total_time)

In [39]:
# Column order matches one flattened session_summary() result, prefixed by the session ID.
summary_headings = ('sessionID', 'beh_correct_count', 'beh_total_count', 'beh_total_time', 'pre_epoch_count', 'pre_sws_time', 'post_epoch_count', 'post_sws_time')


def _summary_row(session_id):
    """Flatten one session's (behavior, pre-SWS, post-SWS) summaries into a single row."""
    beh, pre, post = session_summary(session_id, False)
    return [session_id, *beh, *pre, *post]


# Collect every row first and build the frame once, so pandas infers the
# column dtypes from the complete data instead of the frame being grown one
# row at a time from an empty table.
summary_df = pd.DataFrame(
    [_summary_row(learned_id) for learned_id in learned_sessions],
    columns=list(summary_headings),
)

summary_df

Unnamed: 0,sessionID,beh_correct_count,beh_total_count,beh_total_time,pre_epoch_count,pre_sws_time,post_epoch_count,post_sws_time
0,150628,14,23,125527.9,3,724008.2,3,660965.2
1,150630,14,18,137832.1,3,703985.7,3,829958.8
2,150707,25,33,153017.5,3,866011.6,3,532979.8
3,181012,11,13,228557.2,2,481980.1,5,923932.0
4,181020,24,29,125887.6,4,1117011.1,3,644992.0
5,190214,22,29,155905.9,7,775999.4,4,1137015.0
6,190228,11,15,202622.2,5,742017.0,4,907981.8
7,201222,32,48,217274.0,4,561993.5,2,386996.5
8,201227,25,42,236810.1,1,130012.5,2,331033.3
9,201229,19,26,122978.8,3,540020.0,2,198973.2


In [31]:
# Session to inspect. 6 digits, IDMMDD: animal ID, month, day.
session_id = '150628'

# Load every table available for this session, one DataFrame per table type.
# Takes roughly 30 s.
beh_df = get_df_from_file(df_id="Behavior", session_id=session_id)
spike_df = get_df_from_file(df_id="SpikeData", session_id=session_id)
cell_df = get_df_from_file(df_id="CellType", session_id=session_id)
wake_df = get_df_from_file(df_id="WakeEpoch", session_id=session_id)
sws_pre_df = get_df_from_file(df_id="SwsPre", session_id=session_id)
sws_post_df = get_df_from_file(df_id="SwsPost", session_id=session_id)
pos_df = get_df_from_file(df_id="Pos", session_id=session_id)

In [17]:
# Preview the first rows of the behavior table.
# The other frames loaded for this session (spike_df, cell_df, wake_df,
# sws_pre_df, sws_post_df, pos_df) can be previewed the same way with .head().
print("---BEHAVIOR---", beh_df.head(), sep="\n")

---BEHAVIOR---
   trialStart   trialEnd  rule  correct  direction  lightPos
0   1512078.7  1516415.3   1.0      0.0        1.0       1.0
1   1541400.8  1558246.9   1.0      0.0        1.0       0.0
2   1606616.7  1613722.0   1.0      0.0        1.0       1.0
3   1662625.6  1667729.4   1.0      1.0        0.0       0.0
4   1728441.9  1732078.0   1.0      0.0        1.0       0.0
trialStart    1512078.7
trialEnd      1516415.3
rule                1.0
correct             0.0
direction           1.0
lightPos            1.0
Name: 0, dtype: float64


1512078.7

In [None]:
# FINAL DF (planned):
# features:
#   - session rule              R/L/light/dark (one-hot encoded); TODO: check order
#   - light-based rule?         True if rule is light/dark, False if rule is L/R (direction-based)
#   - pairwise correlations     by neuron
#   - cell types                by neuron
#   ?- rat ID                   one-hot encoded? (ask golden)