This code was adapted from Marc Vidal De Palol.

__Data Saved:__

- __recordings_village_old.csv__ --> the ids + lengths of all recordings are saved in one CSV (here, subject 8 is saved in two separate files)

__Dependencies__

In [None]:
import datetime  # datetime operations
import json  # read/write from/into json format
import os  # OS operations (read/write files/folders)
import uuid

# process parallelization
from multiprocessing import Manager, Pool, cpu_count

import pandas as pd  # operate with dataframes
import pyxdf  # read XDF files (LSL streams recordings)

# from IPython.display import display  # print nicely
from tqdm.notebook import tqdm  # mother of progressbars

__Options__

In [None]:
# folder that holds the raw XDF recordings
PATH_RAW = "./data/raw"

# custom tqdm bar layout: counters and bar on the first line,
# percentage / timing / rate on the second line
B_FORMAT = (
    "📄 {n_fmt} of {total_fmt} {desc} processed: {bar} \n"
    "            {percentage:3.0f}% ⏱️{elapsed} ⏳{remaining} ⚙️{rate_fmt}{postfix}"
)

# show floats in pandas tables with three decimals
pd.options.display.float_format = "{:.3f}".format

# number of cpu threads available for multiprocessing
CORES = cpu_count()
print(f"Total CPU threads: {CORES}")

__Helper functions__

In [None]:
def pbar_fork_hack():
    """
    Workaround that lets forked worker processes display their progress bars
    in IPython apps such as Jupyter Notebooks.

    It silences the message
    "[IPKernelApp] WARNING | WARNING: attempted to send message from fork".

    Usage: hand this function to the ``initializer`` parameter when creating
    a multiprocessing pool, e.g.:

    pool = Pool(processes=N_CORES, initializer=pbar_fork_hack)

    Source:
     - https://github.com/ipython/ipython/issues/11049#issue-306086846
     - https://github.com/tqdm/tqdm/issues/485#issuecomment-473338308
    """
    # write one blank character (no newline) and flush it right away
    print(end=" ", flush=True)


def seconds_to_minutes_seconds(seconds_decimal):
    """
    Given a time length in seconds, transform it to minutes' seconds".

    The fractional part of a second is discarded (truncated toward zero) and
    minutes are not wrapped into hours (3600 s -> 60' 0").

    Parameters:
        seconds_decimal (float): Time length in seconds to convert.

    Returns:
        str: Time length in minutes plus seconds format -> m' s".
    """
    # Use integer arithmetic on whole seconds. Deriving the seconds from the
    # fractional part of a float division loses precision and can come out
    # one second short (e.g. 61 s was rendered as 1' 0").
    whole_seconds = int(seconds_decimal)
    minutes, seconds = divmod(abs(whole_seconds), 60)
    if whole_seconds < 0:
        # keep truncation toward zero for negative inputs
        minutes, seconds = -minutes, -seconds

    return f"{minutes}' {seconds}\""

__Files selection__

In [None]:
# list of filenames to be excluded from processing
EXCLUDE = [
    "pilot",
    "02_v_010621_old1.xdf",  # cable issues (immediately recorded a new file)
    "03_v_010621.xdf",  # easy calibration (eye tracking most likely not usable)
    "06_v1_080621.xdf",  # recording was messy and timing written down in participant information do not match timing of recordings
    "06_v2_080621.xdf",  # same as 06_v1
    "09_v_110621.xdf",  # easy validation (eye tracking most likely not usable)
    "09_v2_110621.xdf",  # easy validation (eye tracking most likely not usable)
    "13_v_010721.xdf",  # could not pass calibration after 10 minutes anymore
    "23_v_200721.xdf",  # got motion sick and did not finish the experiment
    "25_v_230721.xdf",  # asked to stop after 15 minutes
    "28_v_030821_old1.xdf",  # got motion sick
    "28_v_030821.xdf",  # got motion sick
    "31_v_171121_old1.xdf",  # recording was started over
    "35_v_291121.xdf",  # got motion sick
    "39_v1_191022.xdf",  # the movement speed was incorrect
    "40_v1_191022.xdf",  # the movement speed was incorrect
]
# NOTE(review): entries are compared by exact name, so "pilot" only matches a
# directory entry called exactly "pilot" (it is dropped by the extension check
# below in any case) -- confirm that no "pilot*.xdf" files need excluding.

# Select the recordings to process: every ".xdf" entry of the raw data folder
# (this skips hidden/config files and folders) that is not listed in EXCLUDE.
# The list is built in a single pass instead of calling files.remove() while
# iterating over the same list, which skips the element following each removal
# and could leave excluded or non-XDF entries behind.
files = [
    f
    for f in os.listdir(PATH_RAW)
    if f.endswith(".xdf") and f not in EXCLUDE
]


__Recordings info__

In [None]:
# create a shared-memory dict to fill and then transform to df
# Sequential (single-process) pass over the selected recordings.
# Nothing is shared between processes in this cell, so a plain dict of lists
# is enough (no multiprocessing.Manager proxies needed).
recordings = {"id": [], "file": [], "created": [], "length": [], "start": []}

# iterate over each selected file
for fname in files:
    # load XDF file using pyxdf
    data, header = pyxdf.load_xdf(f"{PATH_RAW}/{fname}")
    # extract participant id from the first stream of the XDF file
    uid = data[0]["info"]["source_id"][0]
    # extract recording datetime (e.g. '2020-12-08T16:16:05+0100');
    # %z parses the UTC offset, whereas the previous '+%f' read it as
    # microseconds and rejected negative offsets
    timestamp = header["info"]["datetime"][0]
    created = datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S%z")
    created = created.strftime("%m.%d.%Y %H:%M")  # format it

    # (first, last) timestamp of every stream that contains timestamps
    spans = [
        (s["time_stamps"][0], s["time_stamps"][-1])
        for s in data
        if len(s["time_stamps"]) > 0
    ]
    if not spans:
        # fail with the filename instead of an anonymous max() error
        raise ValueError(f"{fname}: no stream contains any timestamps")

    # get the longest stream length and transform it to minutes and seconds (' ")
    length = seconds_to_minutes_seconds(max(end - start for start, end in spans))
    # get first (smallest) timestamp
    start = min(first for first, _ in spans)

    # add into our dict of lists
    recordings["id"].append(uid)
    recordings["file"].append(fname)
    recordings["created"].append(created)
    recordings["length"].append(length)
    recordings["start"].append(start)

    # release the loaded recording before reading the next file
    del data, header, spans

# convert dict to df, set id column as index
recs = pd.DataFrame(recordings).set_index("id")
# convert the 'created' column to datetime; the explicit format avoids
# day/month guessing and the unit-less astype("datetime64") cast
recs["created"] = pd.to_datetime(recs["created"], format="%m.%d.%Y %H:%M")
# order by creation date
recs = recs.sort_values(by="created")

# store recordings info as CSV
recs.to_csv("./recordings_village_old.csv", index=True)

# display the df
recs

In [None]:
def recording_info(fname):
    """
    Process recording file information: participant id, filename, creation
    date, length (duration), and start (first) timestamp.

    The values are appended to the lists of the module-level ``recordings``
    mapping; nothing is returned.

    NOTE(review): the five appends are separate operations. If ``recordings``
    holds lists shared between several worker processes, appends coming from
    different files may interleave and misalign the columns -- confirm how
    the caller guards against that.

    Parameters:
        fname (str): Recording filename (looked up inside PATH_RAW).

    Raises:
        ValueError: If no stream of the recording contains timestamps.
    """
    # get participant id (taken from the first stream)
    data, header = pyxdf.load_xdf(f"{PATH_RAW}/{fname}")
    uid = data[0]["info"]["source_id"][0]

    # get recording datetime
    timestamp = header["info"]["datetime"][0]
    del header
    # string timestamp ('2020-12-08T16:16:05+0100') to datetime object;
    # %z parses the UTC offset, whereas the previous '+%f' read it as
    # microseconds and rejected negative offsets
    created = datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S%z")
    created = created.strftime("%m.%d.%Y %H:%M")  # format it

    # initialize a tqdm progress bar for streams
    streams_pbar = tqdm(
        iterable=data,
        desc=f"🧻 streams from participant {uid}",
        dynamic_ncols=True,
        bar_format=B_FORMAT,
    )

    # get start timestamp and recording length
    lengths = []  # to collect all stream lengths
    starts = []  # to collect all stream start timestamps
    for s in streams_pbar:  # for each stream
        stamps = s["time_stamps"]
        # ensure there's timestamps
        if len(stamps) > 0:
            starts.append(stamps[0])  # start time
            lengths.append(stamps[-1] - stamps[0])  # stream length in seconds
    del data

    if not lengths:
        # fail with the filename instead of an anonymous max() error
        raise ValueError(f"{fname}: no stream contains any timestamps")

    # get longest length and transform it to minutes and seconds (' ")
    length = seconds_to_minutes_seconds(max(lengths))
    # get first (smallest) timestamp
    start = min(starts)

    # add into our dict of lists
    recordings["id"].append(uid)
    recordings["file"].append(fname)
    recordings["created"].append(created)
    recordings["length"].append(length)
    recordings["start"].append(start)

In [None]:
# create a shared-memory dict to fill and then transform to df
# column names of the recordings table
RECORDING_COLUMNS = ("id", "file", "created", "length", "start")


def _recording_row(fname):
    """
    Run recording_info() for one file and return its values as a single row.

    recording_info() appends to the module-level ``recordings`` lists with
    five separate calls. With lists shared between worker processes, appends
    from different files can interleave and misalign the columns. Here every
    call gets a fresh, process-local ``recordings`` instead, and the complete
    row is handed back to the parent through the pool.

    Only meant to be called inside pool worker processes: it rebinds the
    global ``recordings`` of the process it runs in.

    Parameters:
        fname (str): Recording filename.

    Returns:
        dict: One value per column in RECORDING_COLUMNS.
    """
    global recordings
    recordings = {column: [] for column in RECORDING_COLUMNS}
    recording_info(fname)
    return {column: values[0] for column, values in recordings.items()}


# copy the list of files to a new list
fil = files[:]

# initialize pool of processes according to the available cpu core threads;
# the with-block cleans the workers up even if processing a file fails.
# NOTE(review): as before, this relies on the workers seeing the notebook's
# functions and globals (fork start method) -- confirm on other platforms.
with Pool(processes=CORES, initializer=pbar_fork_hack) as pool:
    # tqdm progress bar for recordings
    files_pbar = tqdm(
        iterable=pool.imap(func=_recording_row, iterable=fil),
        total=len(fil),
        desc="📼 recordings",
        dynamic_ncols=True,
        bar_format=B_FORMAT,
    )

    # consuming the iterator drives the progressbar with multiprocessing
    # (source: https://stackoverflow.com/a/40133278); each item is one row
    rows = list(files_pbar)

    # close pool instance, no more work to submit
    pool.close()
    # wait for the worker processes to terminate
    pool.join()

# gather the rows into a dict of lists (one list per column)
recordings = {
    column: [row[column] for row in rows] for column in RECORDING_COLUMNS
}

# convert dict to df, set id column as index
recs = pd.DataFrame(recordings).set_index("id")
# set 'created' column (series) to datetime; the explicit format avoids
# day/month guessing and the unit-less astype("datetime64") cast
recs["created"] = pd.to_datetime(recs["created"], format="%m.%d.%Y %H:%M")
# order by creation date
recs = recs.sort_values(by="created")

# store recordings info as CSV
recs.to_csv("./recordings_village_old.csv", index=True)

# display the results
recs

With this, we can retrieve the recordings info from the CSV without having to reread (load into memory and process) the recordings again. This is very convenient since we're working with large recordings (20 or more minutes, GBs of data).

__Generate new uids for the ones that are missing --> only run this after running the previous cell__

In [None]:
# ids that were already replaced in "./recordings_village.csv" (if that file
# exists): old id -> id used as index there. Stays empty when the file is
# missing, so the lookup below never touches an undefined DataFrame.
known_ids = {}
if os.path.exists("./recordings_village.csv"):
    recs_corr = pd.read_csv("./recordings_village.csv", index_col=0)
    for corrected_id, old_id in zip(recs_corr.index, recs_corr["old_id"]):
        # first occurrence wins when an old id is listed more than once
        known_ids.setdefault(old_id, corrected_id)

# read the "./recordings_village_old.csv" file into a DataFrame named recs
recs = pd.read_csv("./recordings_village_old.csv", index_col=0)
idd = recs.index.tolist()  # ids currently used as index

recording = recs
# keep old ids; if the CSV already carries an old_id column (this cell was
# run before and the index already holds the new ids) leave it untouched
if "old_id" not in recording.columns:
    recording["old_id"] = idd

# build the new ids in a separate list: ids starting with "(" are replaced,
# all others are carried over unchanged
new_id = []
for it in idd:
    if not str(it).startswith("("):
        new_id.append(it)
    elif it == "(0xc6cf9db3, 0x7459a68f)":
        # handle special case for vp 8_v2
        new_id.append("ee9dac3c-a7e5-48b7-8187-6d9038651352")
    elif it in known_ids:
        # reuse the id already assigned in recs_corr
        new_id.append(known_ids[it])
    else:
        # not seen before: generate a new UUID
        new_id.append(str(uuid.uuid4()))

# add a new column "new_id" and use it as index
recording["new_id"] = new_id
recording = recording.set_index("new_id")  # set new_id as index
recording.to_csv("./recordings_village_old.csv", index=True)  # save it as a .csv
recording  # display the result