In [2]:
## Import functions

import numpy as np  
import pandas as pd
np.set_printoptions(suppress=True) # Supress scientific notation when printing
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
import seaborn as sns
from scipy.ndimage import gaussian_filter1d
import re # Regular expressions
import networkx as nx # Package for graph represenations 
from datetime import datetime, time 
import pygraphviz as gv


In [30]:
# def import_file(rat_filename, file_path = '../data/raw/', path2data = '../data/')
# """
#  Reads Rat HexMaze behavioural data from experiment logs.
#  Performs basic cleaning and sanity checks.
#  Re-organises in a pandas-friendly format, and returns pandas dataframe.
#  Import logs are also generated and saved at path2results path.
 
#  Parameters
#  ----------
#  arg1 : str
#      Filename of txt data with experimental logs for single rat
#  arg2 : str, optional
#      Path to where rat_filename is stored
#  arg3 : str, optional
#      Path to directory where import logs and re-organized csv should be saved

# Returns
# -------
# pandas dataframe with clean data

# """

## Set file names and paths
# Raw data directory
path2data = '../data/'
# Results directory
path2results = '../results/'
# Raw data file
rat_filename = '20210629_Rat5.txt'
# I usually have a directory for each project, with subfolders at least for
# for data (data), scripts (src), plots and other results (results), hence the paths specified below


# Set name for data import log file
today = datetime.now()
log_filename = 'import_log_' + today.strftime("%Y%m%d") + '.txt'
# Create/Overwrite log file (A new logfile will be created each day
# the script is run)   
with open(path2results + log_filename, 'w') as log_f:
    log_f.write('Import date: ' + today.strftime("%d-%m-%y") + '\n')
    log_f.write(list_all[0])

# Extract exp. date and Rat Id from file name (format: YYYYMMDD_RatX.txt)
filename_info = rat_filename.split('_')
exp_date = pd.to_datetime(filename_info[0]).date()
rat_id = filename_info[1].split('.')[0]

info_msg = 'Data from ' + rat_id + ' on ' + exp_date.strftime('%Y-%m-%d') + ' ('+ rat_filename + ')'
print('Loading ' + info_msg + ':')


# Open logfile again to start appending import logs
log_f = open(path2results + log_filename, 'a')
log_f.write('\n\n=========================================================================\n')
log_f.write(info_msg)
log_f.write('\n=========================================================================\n')

# Extract fileinfo to list_all
with open(path2data + rat_filename, 'r') as f:
    # Load all lines from file
    list_all = [x for k,x in enumerate(f.readlines())]
        
# Locate trial line boundaries and first trial line
list_tr_bb = [n for n, x in enumerate(list_all) if re.match(r'^Summary Trial', x)]
trial_headers = [x for n, x in enumerate(list_all) if re.match(r'^Summary Trial', x)]

# Add a final entry to list_tr_bb to mark end boundary of last trial
list_tr_bb.append(len(list_all))

info_msg = str(len(list_tr_bb)) + ' trials found initially\n'
log_f.write(info_msg)
print(info_msg)


### DATA CLENANING
# Data files contain duplicate lines and/or duplicate trials
# Find and remove them

# Look for duplicate trials to write info to log
dupe_trials = [x for n, x in enumerate(trial_headers) if x in trial_headers[:n]] # Get duplicate trial headers

info_msg = str(len(dupe_trials)) + ' duplicate trials found\n'

log_f.write(info_msg)
[log_f.write('Trial ' + x.split()[2]) for x in dupe_trials]
print(info_msg)
[print('Trial ' + x.split()[2]) for x in dupe_trials]


# Look for duplicate lines in general and remove them from list_all
dupe_lines = [n for n, x in enumerate(list_all) if x in list_all[:n]]

# Remove duplicate lines from list_all
info_msg = str(len(dupe_lines)) + ' duplicate lines found\n'
print(info_msg)
log_f.write(info_msg)
for x in sorted(dupe_lines, reverse=True):
    del list_all[x]

# Update trial boundaries after deletions
list_tr_bb = [n for n, x in enumerate(list_all) if re.match(r'^Summary Trial', x)]
list_tr_bb.append(len(list_all))

# Strip extraneous strings from file lines, and read data within each line into list of lists of strings
list_sess = []
for l,tr in enumerate(list_tr_bb[:-1]): 
    # Find all lines starting with ( between trial boundaries
    list_sing = [x for x in list_all[list_tr_bb[l]:list_tr_bb[l+1]] if x[0]=='('] # Lines starting with ( 
    # Remove parenthesis, commas and '' from string
    list_sing = [x.replace("(", "") for x in list_sing]
    list_sing = [x.replace(")", "") for x in list_sing]
    list_sing = [x.replace(",", "") for x in list_sing]
    list_sing = [x.replace("'", "") for x in list_sing]
    list_sing = [x.replace("\n", "") for x in list_sing]
    # Now separate data in each line within the trial
    list_sing = [x.split(sep=' ') for x in list_sing]
    list_sess.append(list_sing)
    
info_msg = 'Final number of trials loaded: ' + str(len(list_sess)) + '\n'
print(info_msg)
log_f.write(info_msg)

log_f.close()

## TRANSFORM TO 'LONG' FORMAT AND SAVE IN PANDAS DATA FRAME
# Transform current 'wide' data format into 'long' format, and store as a pandas 
# dataframe.
# Edge trajectory data will be re-ordered into a successive list of nodes and times.
# Additional data, such as seconds, distance, and speed refers to the trajectory from
# node[i-1] to node[i], being zero for the inital row.

# First re-order and store as single lists for each variable
# Variables with _ff suffix  ("from file") can be computed from other primary variables in the file. 
# Suffix is included in case these vars need to be re-computed 

size = sum([len(x) for x in list_sess]) + len(list_sess) # List size: total no. of lines (across all trials), 
                                                # with one additional line per trial (25 trials in total) to flatten structure
trial_no = [None]*size 
distance = [None]*size 
seconds_ff = [None]*size # seconds from file 
nodes = [None]*size #np.zeros(size, dtype=int) # nodes the rat passes through (flat)
speed_ff = [None]*size #np.zeros(size)
times = [None]*size #np.zeros(shape) # all timestamps (flat)
rat_id_col = [rat_id]*size
exp_date_coil = [exp_date]*size


line_no = 0 # initialize counter

for tr, trial in enumerate(list_sess):

    # Add inital trial line for nodes and timestamps array with starting node (first value of first line in the trial)
    trial_no[line_no] = tr+1
    nodes[line_no] = trial[0][0] # First value from first row in that trial: start node
    times[line_no] = trial[0][2] # Third value from first row in that trial: start time
    # times[:, cum_line_no] = [float(x) for x in time_list]
    # Initialize distance and seconds
    distance[line_no] = 0
    seconds_ff[line_no] = 0
    speed_ff[line_no] = 0
    line_no +=1
    
    for row in trial:
        
        trial_no[line_no] = tr+1
        nodes[line_no] = row[1]
        times[line_no] = row[3]
        seconds_ff[line_no] = row[4]
        distance[line_no] = row[5]
        speed_ff[line_no] = row[6]     
        line_no += 1


## BUILD PD DATA FRAME

data = pd.DataFrame(list(zip(rat_id_col, exp_date_col, trial_no, nodes, times, distance, seconds_ff, speed_ff)),
                   columns = ['rat_id', 'date', 'trial_no', 'node', 'time_ff', 'distance', \
                              'seconds_ff', 'speed_ff']) 

return data

Loading Data from Rat5 on 2021-06-29 (20210629_Rat5.txt):
47 trials found initially

18 duplicate trials found

Trial 1
Trial 1
Trial 1
Trial 2
Trial 3
Trial 4
Trial 5
Trial 6
Trial 7
Trial 8
Trial 9
Trial 10
Trial 11
Trial 12
Trial 13
Trial 14
Trial 14
Trial 28
1147 duplicate lines found

29 unique trials left after duplicate removal

Final number of trials loaded: 28

