### Imports

In [4]:
import pandas as pd
import numpy as np
import warnings
import sys
import os
import csv
import pickle
import re
import datetime
warnings.filterwarnings("ignore")

In [5]:
#establishes file paths
# NOTE(review): `cur` is not referenced by any other cell visible in this notebook.
cur = os.getcwd()

#path to all completed annotations in csv form
# NOTE(review): machine-specific absolute path. It must end with "/" because
# clean_ants builds each file name by plain string concatenation.
absolute = "/Users/graciezhang/Documents/Hengen-Lab/annotations/completed/csv/"

#path to the original optical flow dataframe
# Relative path, so it is resolved against the kernel's working directory.
# The file is opened in binary mode and unpickled in the next cell.
df_path = 'Files/optical_flow'

In [6]:
#i pickled the OF dataframe for efficiency since it's used so often
#- this cell is just opening and storing the OF dataframe to of_df
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load a pickle that you (or a trusted collaborator) produced.
with open(df_path, 'rb') as infile:
    of_df = pickle.load(infile)

### Cleaning Annotations

This is a multistep process. The original CSV comes with multiple lines of information before supplying the annotations and their corresponding timestamps. To retrieve the appropriate data, the following steps should be followed:

In [7]:
#ALL MICE (the ones whose annotations are merged into the optical flow dataframe by the add_beh calls below)
# CAF26 (top + side)
# CAF34
# CAF42
# CAF69
# CAF99
# CAF106

In [8]:
#encodes all categorical behaviors to numerical
# Codes 1-10 are the location-constrained behaviors; 11 and above are regular behaviors.
# Two spellings exist for digging (12) and for locomotion (13).
_BEHAVIOR_CODES = {
    'Nesting': 1,
    'Playing w/ Ball': 2,
    'Playing w/ Box': 3,
    'Riding the Ball': 4,
    'Riding the Box': 5,
    'Drinking': 6,
    'Chewing': 7,
    'Porthole Interaction': 8,
    'Scaling Porthole': 9,
    'Scaling Door': 10,
    'Grooming': 11,
    'Hollowing/Digging': 12,
    'Digging': 12,
    'Locomotion': 13,
    'Locomotive Movement': 13,
    'Minor Postural Movement': 14,
    'Rearing': 15,
    'Stretching': 16,
    'Twitching': 17,
    'Slipping': 18,
}


def cat_num(df):
    """Encode the text labels in ``df['Behavior']`` as integer codes.

    Rows whose label is not a known behavior (and is not already numeric) are
    dropped. Values that are already numeric are kept, so calling the function
    on an already-encoded frame is harmless.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation rows with a ``Behavior`` column. The input is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the behavior rows (original index labels
        preserved) with ``Behavior`` stored as integers.
    """
    # One vectorised lookup instead of one chained assignment per label.
    # Chained assignment (df.Behavior[mask] = n) mutates the caller's frame and
    # is silently ignored under pandas Copy-on-Write.
    encoded = df['Behavior'].map(lambda label: _BEHAVIOR_CODES.get(label, label))

    #drop all nonbehaviors
    is_behavior = encoded.map(lambda value: str(value).isnumeric()).astype(bool)

    out = df.loc[is_behavior].copy()
    out['Behavior'] = encoded[is_behavior].astype(int)
    return out

In [9]:
#reads each line of the CSV file until the video name is hit
#returns video name or None should it not be found
def get_video_path(file_name):
    """Return the media path stored on the "Player #1" header row of a CSV.

    Parameters
    ----------
    file_name : str
        Path of the annotation CSV to scan.

    Returns
    -------
    str or None
        The second cell of the first row whose first cell is "Player #1",
        or None when no such row exists.
    """
    with open(file_name, newline='') as f:
        for row in csv.reader(f):
            # csv.reader yields [] for blank lines, so check the length first;
            # a "Player #1" row without a second cell carries no path either.
            if len(row) > 1 and row[0] == "Player #1":
                return row[1]
    return None

In [10]:
#reads each line of the CSV file to determine when annotations begin
def get_skip_rows(file_name):
    """Return the 0-based index of the CSV row whose first cell is "Time".

    That row is the header of the annotation table, so the returned index is
    the number of rows that precede it.

    Parameters
    ----------
    file_name : str
        Path of the annotation CSV to scan.

    Returns
    -------
    int or None
        Index of the header row, or None when no row starts with "Time".
    """
    with open(file_name, newline='') as f:
        for i, row in enumerate(csv.reader(f)):
            # csv.reader yields [] for blank lines; skip them instead of
            # failing on row[0].
            if row and row[0] == "Time":
                return i
    return None

In [11]:
#adds Frames column to the generated CSV dataframe - converts all timestamps into corresponding frames
def add_frames_column(df, fps=15):
    """Add a ``Frames`` column holding each timestamp converted to a frame number.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation rows with a ``Time`` column in seconds. The column is added
        to this object in place.
    fps : int or float, optional
        Frames per second used for the conversion. Defaults to 15, the value
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``Frames = round(Time * fps)``.
        Row order is left untouched.
    """
    df['Frames'] = df['Time'].apply(lambda seconds: round(seconds * fps))
    # The previous `df.sort_values(by=['Frames', 'Status'])` call discarded its
    # result, so rows were never reordered. It is removed rather than applied:
    # sorting on Status would place a START before a STOP that shares its
    # frame, which would break code that pairs consecutive START/STOP rows.
    return df

*clean_ants(file_path, mouse_name)* consolidates all the cleaning steps:
1. The video that was annotated is extracted using the *get_video_path(file_name)* function. 
2. The number of rows to skip in order to only get the annotations is read in using the *get_skip_rows(file_name)* function.
3. The csv is read in using pandas, with the appropriate number of rows to skip supplied.
4. The video_path needs to be cleaned to get the file name of the video. 
5. The "Time" column (in seconds) is converted to an `H:MM:SS` timestamp string and added as a new column named "Minutes".
6. Unnecessary columns are dropped
7. Behaviors are encoded
8. Frames column is added
9. Cleaned dataframe and video_name are returned

In [18]:
def clean_ants(file_path, mouse_name):
    """Load one annotation CSV and return its cleaned rows plus the video name.

    The file_name should be standardized. The path I set for each annotation file was the directory,
    which was the same for each csv, and the mouse name. This will change should there be multiple annotations
    for each mouse.

    Parameters
    ----------
    file_path : str
        Directory holding the CSVs. It is concatenated as-is, so it must end
        with a path separator.
    mouse_name : str
        File name without the ".csv" extension.

    Returns
    -------
    (pandas.DataFrame, str)
        The cleaned annotations (behaviors encoded, ``Minutes`` and ``Frames``
        columns added) and the annotated video's file name.

    Raises
    ------
    ValueError
        If the CSV has no "Player #1" row or no "Time" header row.
    """
    file_name = file_path + mouse_name + ".csv"

    video_path = get_video_path(file_name)
    if video_path is None:
        raise ValueError('no "Player #1" row found in ' + file_name)

    skip = get_skip_rows(file_name)
    if skip is None:
        # Without this check read_csv would treat the first metadata line as the header.
        raise ValueError('no "Time" header row found in ' + file_name)

    df = pd.read_csv(file_name, skiprows = skip)

    # Keep only the last path component (the video's file name).
    video_name = video_path.split("/")[-1]

    #add timestamps column
    # "Time" is in seconds; store it as an H:MM:SS string.
    df['Minutes'] = df['Time'].apply(lambda seconds: str(datetime.timedelta(seconds = seconds)))

    #drop unnecessary columns
    # errors="ignore" skips any of these columns that the CSV does not have.
    df = df.drop(
        columns = ["Media file path", "Subject", "Behavioral category", "Comment"],
        errors = "ignore",
    )

    #encode all categorical behaviors
    df = cat_num(df)

    #add frames columns
    df = add_frames_column(df)

    return df, video_name

All annotated video names are stored in a dictionary, with the mouse's name as the key. Again, should this be used when there are multiple annotations for one mouse, this structure should change. *clean_ants(file_path, mouse_name)* is called for each CSV annotation file.

In [1]:
#stores all annotated video names into dictionary
# Keys are annotation/mouse names; behavior_arr looks video names up here by mouse name.
video_names = {}

In [11]:
# Clean the CAF34 annotation CSV and record the video file it annotates.
CAF34_df, vid34 = clean_ants(absolute, "CAF34")
video_names["CAF34"] = vid34

In [12]:
# Clean the CAF42 annotation CSV and record the video file it annotates.
CAF42_df, vid42 = clean_ants(absolute, "CAF42")
video_names["CAF42"] = vid42

In [13]:
# Clean the CAF69 annotation CSV and record the video file it annotates.
CAF69_df, vid69 = clean_ants(absolute, "CAF69")
video_names["CAF69"] = vid69

In [14]:
# Clean the CAF77 annotation CSV and record the video file it annotates.
# NOTE(review): CAF77 is converted to arrays later, but no add_beh call visible in
# this notebook writes it into modof_df - confirm whether that is intended.
CAF77_df, vid77 = clean_ants(absolute, "CAF77")
video_names["CAF77"] = vid77

In [15]:
# Clean the CAF99 annotation CSV and record the video file it annotates.
CAF99_df, vid99 = clean_ants(absolute, "CAF99")
video_names["CAF99"] = vid99

In [16]:
# Clean the CAF106 annotation CSV and record the video file it annotates.
CAF106_df, vid106 = clean_ants(absolute, "CAF106")
video_names["CAF106"] = vid106

In [17]:
# Clean the EAB50 annotation CSV (file "EAB50_5state.csv"); the dictionary key is
# the full file stem "EAB50_5state", not just the mouse name.
# NOTE(review): no add_beh call visible in this notebook writes EAB50 into
# modof_df - confirm whether that is intended.
EAB50_df, vid50 = clean_ants(absolute, "EAB50_5state")
video_names["EAB50_5state"] = vid50

### Separating into Location and Regular Behaviors

In [19]:
#separates encoded behaviors based on location-constrained or not depending on value
def sep_beh(df):
    """Split encoded annotations into location-constrained and regular behaviors.

    Rows with a Behavior code below 11 go into the first frame; rows with a
    code of 11 or above go into the second. Index labels are preserved.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(location_rows, regular_rows)``
    """
    codes = df.Behavior

    location_rows = df.loc[codes < 11]
    regular_rows = df.loc[codes >= 11]

    return location_rows, regular_rows

In [20]:
#called on each mouse's dataframe
# Each call yields (location-constrained rows, regular rows) for that mouse.
CAF34_loc_df, CAF34_reg_df = sep_beh(CAF34_df)
CAF42_loc_df, CAF42_reg_df = sep_beh(CAF42_df)
CAF69_loc_df, CAF69_reg_df = sep_beh(CAF69_df)
CAF99_loc_df, CAF99_reg_df = sep_beh(CAF99_df)
CAF106_loc_df, CAF106_reg_df = sep_beh(CAF106_df)

In [21]:
# Split the CAF77 annotations into location-constrained and regular behaviors.
CAF77_loc_df, CAF77_reg_df = sep_beh(CAF77_df)

In [22]:
# Split the EAB50 annotations into location-constrained and regular behaviors.
EAB50_loc_df, EAB50_reg_df = sep_beh(EAB50_df)

### Behavior Column

In [19]:
def behavior_arr(df, mouse_name):
    """Expand START/STOP annotation rows into a per-frame array of behavior codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation rows with ``Frames`` and ``Behavior`` columns, laid out as
        alternating START, STOP, START, STOP ... rows.
    mouse_name : str
        Key into the module-level ``video_names`` dictionary.

    Returns
    -------
    numpy.ndarray
        One float per row of that video in the module-level ``of_df``;
        0 where no behavior was annotated.
    """
    # The array length is the number of optical flow rows for this video; the
    # CSV's own "Total length" / "FPS" columns are not used. Frame numbers
    # were computed upstream with 15 fps assumed for every video.
    video_rows = of_df["video_filename"] == video_names[mouse_name]
    total_frames = len(of_df[video_rows])
    labels = np.zeros(total_frames)

    frames = df['Frames'].to_numpy()

    # The rows are assumed to hold no nested START/STOP pairs once location and
    # regular behaviors have been separated, so every even position is a START
    # and the odd position right after it is the matching STOP. A trailing
    # unpaired row is ignored.
    n_rows = len(df)
    for start_idx in range(0, n_rows - 1, 2):
        stop_idx = start_idx + 1
        # The half-open slice leaves the STOP frame itself unlabelled.
        labels[frames[start_idx]:frames[stop_idx]] = df.iloc[start_idx].Behavior

    return labels

In [102]:
# Per-frame location and regular behavior arrays for CAF34.
CAF34_loc_arr = behavior_arr(CAF34_loc_df, "CAF34")
CAF34_reg_arr = behavior_arr(CAF34_reg_df, "CAF34")

In [103]:
# Per-frame location and regular behavior arrays for CAF42.
CAF42_loc_arr = behavior_arr(CAF42_loc_df, "CAF42")
CAF42_reg_arr = behavior_arr(CAF42_reg_df, "CAF42")

In [104]:
# Per-frame location and regular behavior arrays for CAF69.
CAF69_loc_arr = behavior_arr(CAF69_loc_df, "CAF69")
CAF69_reg_arr = behavior_arr(CAF69_reg_df, "CAF69")

In [105]:
# Per-frame location and regular behavior arrays for CAF99.
CAF99_loc_arr = behavior_arr(CAF99_loc_df, "CAF99")
CAF99_reg_arr = behavior_arr(CAF99_reg_df, "CAF99")

In [106]:
# Per-frame location and regular behavior arrays for CAF106.
CAF106_loc_arr = behavior_arr(CAF106_loc_df, "CAF106")
CAF106_reg_arr = behavior_arr(CAF106_reg_df, "CAF106")

In [107]:
# Per-frame location and regular behavior arrays for CAF77.
CAF77_loc_arr = behavior_arr(CAF77_loc_df, "CAF77")
CAF77_reg_arr = behavior_arr(CAF77_reg_df, "CAF77")

In [108]:
# Per-frame location and regular behavior arrays for EAB50.
# The second line previously recomputed EAB50_loc_arr, so EAB50_reg_arr was never defined.
EAB50_loc_arr = behavior_arr(EAB50_loc_df, "EAB50_5state")
EAB50_reg_arr = behavior_arr(EAB50_reg_df, "EAB50_5state")

### Consolidating Top and Side Annotations

Applicable to CAF26 and KDR48, which both have top and side annotations for the same timestamp

In [206]:
# Clean the CAF26 top-view annotations; the video name from the top-view CSV is
# the one stored under the key "CAF26".
CAF26_top_df, vid26 = clean_ants(absolute, "CAF26_top")
video_names["CAF26"] = vid26

In [208]:
# Clean the CAF26 side-view annotations; the video name it reports is discarded.
# NOTE(review): the side arrays below are sized from the "CAF26" (top-view) video - confirm both CSVs cover the same frames.
CAF26_side_df, __ = clean_ants(absolute, "CAF26_side")

In [167]:
# Split each angle (t = top, s = side) into location-constrained and regular behaviors.
CAF26t_loc_df, CAF26t_reg_df = sep_beh(CAF26_top_df)
CAF26s_loc_df, CAF26s_reg_df = sep_beh(CAF26_side_df)

In [168]:
# Per-frame arrays for the top-view annotations.
CAF26t_loc_arr = behavior_arr(CAF26t_loc_df, "CAF26")
CAF26t_reg_arr = behavior_arr(CAF26t_reg_df, "CAF26")

In [169]:
# Per-frame arrays for the side-view annotations (same "CAF26" video key, so same length as the top-view arrays).
CAF26s_loc_arr = behavior_arr(CAF26s_loc_df, "CAF26")
CAF26s_reg_arr = behavior_arr(CAF26s_reg_df, "CAF26")

To resolve the two angles, a priority angle is chosen. Then, the two annotations are compared by frame. If the nonpriority angle has a behavior annotation where the priority angle does not, this annotation is included in the resolved array. The priority angle is used as the basis of the resolution array.

In [170]:
#combined function
#top acting as priority
def resolution(top_arr, side_arr):
    """Merge two per-frame annotation arrays, giving priority to ``top_arr``.

    The result starts as a copy of ``top_arr``. Wherever ``top_arr`` is 0 and
    ``side_arr`` is non-zero, the ``side_arr`` value is used instead.
    Neither input is modified.
    """
    top_is_empty = top_arr == 0
    side_has_label = side_arr != 0
    fill_from_side = np.logical_and(side_has_label, top_is_empty)

    resolved = np.copy(top_arr)
    resolved[fill_from_side] = side_arr[fill_from_side]

    return resolved

In [171]:
#resolution array for location + regular behaviors
# The top view is passed first, so it is the priority angle; side-view labels only fill frames the top view left at 0.
CAF26r_loc_arr = resolution(CAF26t_loc_arr, CAF26s_loc_arr)
CAF26r_reg_arr = resolution(CAF26t_reg_arr, CAF26s_reg_arr)

### Adding New Column to Dataframe

In [172]:
#dictionary with video names for each annotation
# Displayed so the keys passed to add_beh below can be checked against the stored file names.
video_names

{'CAF34': 'e3v819c-20200828T011409-021410.mp4',
 'CAF42': 'e3v81a6-20200915T022243-032244.mp4',
 'CAF69': 'e3v81a6-20201230T155846-165847.mp4',
 'CAF77': 'e3v819c-20210202T164844-174844.mp4',
 'CAF99': 'CAF00099-20210608T040846-050846.mp4',
 'CAF106': 'CAF00106-20210607T110838-120839.mp4',
 'EAB50_5state': 'e3v817b-20190705T2055-2155.mp4',
 'CAF26': 'e3v819c-20200808T0609-0709.mp4'}

In [43]:
# Work on a copy so the loaded optical flow dataframe (of_df) is not the object that add_beh writes into.
modof_df = of_df.copy()

In [46]:
#creates loc_behaviors and reg_behaviors columns
#adds annotations into given dataframe
def add_beh(loc_arr, reg_arr, video, df):
    """Write one video's per-frame annotation arrays into ``df``.

    Parameters
    ----------
    loc_arr, reg_arr : array-like
        Location and regular behavior codes, one value per row of ``video``
        in ``df`` (in row order).
    video : str
        Value of ``df["video_filename"]`` identifying the annotated video.
    df : pandas.DataFrame
        Frame to annotate; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. ``loc_behaviors`` / ``reg_behaviors`` are
        created (filled with NaN) if they do not exist yet, so rows of videos
        that were never annotated stay NaN rather than 0.

    Raises
    ------
    ValueError
        If no row of ``df`` belongs to ``video`` or an array's length does not
        match the number of rows for that video.
    """
    #selects rows that correspond to annotated video
    # A boolean mask is used instead of index labels as slice bounds, so the
    # result does not depend on df having a default 0..n-1 index.
    video_rows = (df["video_filename"] == video).to_numpy()
    n_rows = int(video_rows.sum())
    if n_rows == 0:
        raise ValueError("no rows in df for video " + repr(video))

    # Validate both arrays before writing anything so a failure leaves df untouched.
    columns = {"loc_behaviors": np.asarray(loc_arr), "reg_behaviors": np.asarray(reg_arr)}
    for column, values in columns.items():
        # 0-d values are allowed and broadcast to every row of the video.
        if values.ndim and len(values) != n_rows:
            raise ValueError(
                column + " has " + str(len(values)) + " values but video "
                + repr(video) + " has " + str(n_rows) + " rows"
            )

    #add annotations to those rows
    # Single-step .loc assignment: chained assignment (df[col][a:b] = arr) is
    # not guaranteed to write through to df and is a no-op under Copy-on-Write.
    for column, values in columns.items():
        if column not in df.columns:
            df[column] = np.nan
        df.loc[video_rows, column] = values

    return df

In [47]:
#must call function for each mouse - definitely a better way to do this, 
#maybe iteration, either through the function or outside of it
# Each call writes one mouse's location and regular arrays into the rows of its video.
# NOTE(review): CAF77 and EAB50 arrays are computed above but are not added here - confirm that is intended.
modof_df = add_beh(CAF34_loc_arr, CAF34_reg_arr, video_names["CAF34"], modof_df)
modof_df = add_beh(CAF42_loc_arr, CAF42_reg_arr, video_names["CAF42"], modof_df)
modof_df = add_beh(CAF69_loc_arr, CAF69_reg_arr, video_names["CAF69"], modof_df)
modof_df = add_beh(CAF99_loc_arr, CAF99_reg_arr, video_names["CAF99"], modof_df)
modof_df = add_beh(CAF106_loc_arr, CAF106_reg_arr, video_names["CAF106"], modof_df)

In [173]:
# CAF26 uses the resolved arrays (top view as priority, side view filling the gaps).
modof_df = add_beh(CAF26r_loc_arr, CAF26r_reg_arr, video_names["CAF26"], modof_df)

In [437]:
#csv
# Writes the annotated dataframe without its index column.
# NOTE(review): machine-specific absolute output path.
modof_df.to_csv("/Users/graciezhang/Documents/Hengen-Lab/annotations_df.csv", encoding='utf-8', index=False)

In [179]:
#feather
# Same dataframe saved in Feather format.
# NOTE(review): machine-specific absolute output path.
modof_df.to_feather("/Users/graciezhang/Documents/Hengen-Lab/annotations_df-feather")

### Verifying Correctness

In [65]:
# Regular-behavior labels stored for the CAF69 video, as a plain array.
check_arr = modof_df[modof_df["video_filename"] == video_names["CAF69"]]["reg_behaviors"].values
# check_arr
# Positions within that video whose label is 11 (the code cat_num assigns to 'Grooming').
np.where(check_arr == 11)

(array([ 1317,  1318,  1319, ..., 16357, 16358, 16359]),)

In [180]:
# A few cleaned CAF69 annotation rows, to compare their START/STOP frames with the positions found above.
CAF69_df[8:15]

Unnamed: 0,Time,Total length,FPS,Behavior,Status,Minutes,Frames
8,87.833,3600.96,30,11,START,0:01:27.833000,1317
9,88.233,3600.96,30,11,STOP,0:01:28.233000,1323
10,104.067,3600.96,30,12,START,0:01:44.067000,1561
11,116.8,3600.96,30,12,STOP,0:01:56.800000,1752
12,122.167,3600.96,30,13,START,0:02:02.167000,1833
13,122.733,3600.96,30,13,STOP,0:02:02.733000,1841
14,132.2,3600.96,30,13,START,0:02:12.200000,1983


In [182]:
# Every second row (positions 1, 3, 5, ...) should be a STOP if START/STOP rows strictly alternate.
CAF69_df["Status"].iloc[1::2].unique()

array(['STOP'], dtype=object)

In [402]:
# Video file name recorded for the CAF69 annotations.
video_names["CAF69"]

'e3v81a6-20201230T155846-165847.mp4'