### This jupyter notebook is used to calculate the split's label distribution for each class.

In [14]:
dataset_args    = {}
# VTS
dataset_args['VTS'] = {}
dataset_args['VTS']['video_hz'] = 30
dataset_args['VTS']['label_hz'] = 30
dataset_args['VTS']['gestures'] = {}
dataset_args['VTS']['gestures']['num_class'] = 10
# JIGSAWS
dataset_args['JIGSAWS'] = {}
dataset_args['JIGSAWS']['video_hz'] = 30
dataset_args['JIGSAWS']['label_hz'] = 30
dataset_args['JIGSAWS']['gestures'] = {}
dataset_args['JIGSAWS']['gestures']['num_class'] = 10
# MultiBypass140
dataset_args['MultiBypass140'] = {}
dataset_args['MultiBypass140']['video_hz'] = 25
dataset_args['MultiBypass140']['label_hz'] = 25
dataset_args['MultiBypass140']['steps'] = {}
dataset_args['MultiBypass140']['phases'] = {}
dataset_args['MultiBypass140']['steps']['num_class'] = 46
dataset_args['MultiBypass140']['phases']['num_class'] = 12
# SAR_RARP50
dataset_args['SAR_RARP50'] = {}
dataset_args['SAR_RARP50']['video_hz'] = 60
dataset_args['SAR_RARP50']['label_hz'] = 10
dataset_args['SAR_RARP50']['gestures'] = {}
dataset_args['SAR_RARP50']['gestures']['num_class'] = 8

In [15]:
import os
DATASET='MultiBypass140'
TASK='steps'
CLASS_N = 12 if TASK == 'phases' else 46
FPS = dataset_args[DATASET]['video_hz']
LABEL_FPS = dataset_args[DATASET]['label_hz']

In [16]:
# Function to check if a video directory exists
def video_exists(video_name):
    return os.path.exists(video_name)

# **LABEL DISTRIBUTION**

In [17]:
def count_labels_in_video(video_list, split_type, split_num, video_name, TASK):
    with open(os.path.join("/data/home/gabrielg/BoundedFuture++/Bounded_Future_from_GIT/data/MultiBypass140/transcriptions", TASK, video_name + ".txt"), "r") as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            if line == "":
                continue
            line = line.split(" ")
            if len(line) < 2:
                continue
            label = line[2]
            last_label = line[1]
            first_label = line[0]
            if label in video_list[split_type][f"Fold {split_num}"][video_name]:
                video_list[split_type][f"Fold {split_num}"][video_name][label] += int(last_label) - int(first_label) + 1
    # assert if sum of labels equals to total number of frames
    sum_labels = sum([video_list[split_type][f"Fold {split_num}"][video_name][label] for label in video_list[split_type][f"Fold {split_num}"][video_name] if label != 'total frames'])
    total_frames = video_list[split_type][f"Fold {split_num}"][video_name]['total frames']
    assert sum_labels <= total_frames , f"Sum of labels ({sum_labels}) is greater than total frames ({total_frames}) in {video_name}"
    return video_list
    

In [18]:
video_list = {}
for file in os.listdir('/data/home/gabrielg/BoundedFuture++/Bounded_Future_from_GIT/data/MultiBypass140/Splits'):
    if file.startswith('data') and file.endswith('.csv'):
        split_type = file.split('_')[1]
        split_num = file.split('_')[2].split('.')[0]
        with open(os.path.join('/data/home/gabrielg/BoundedFuture++/Bounded_Future_from_GIT/data/MultiBypass140/Splits', file)) as f:
            video_lines = f.read().splitlines()
        for video in video_lines:
            video_name = video.split(',')[0]
            total_frames = video.split(',')[1]
            if split_type not in video_list:
                video_list[split_type] = {}
            if f"Fold {split_num}" not in video_list[split_type]:
                video_list[split_type][f"Fold {split_num}"] = {}
            video_list[split_type][f"Fold {split_num}"][video_name] = {}
            video_list[split_type][f"Fold {split_num}"][video_name]['total frames'] = int(total_frames)
            for i in range(0, CLASS_N):
                first_letter = TASK[0].upper()
                video_list[split_type][f"Fold {split_num}"][video_name][f"{first_letter}{i}"] = 0
            video_list = count_labels_in_video(video_list, split_type, split_num, video_name, TASK)

In [19]:
import pandas as pd
import plotly.express as px

# Prepare data for Plotly
data = []

# Aggregate frame counts for each split_type and fold
for split_type, folds in video_list.items():
    for fold_name, videos in folds.items():
        label_counts = {f"{TASK[0].upper()}{i}": 0 for i in range(CLASS_N)}
        for video_name, labels in videos.items():
            for label, count in labels.items():
                if label != 'total frames':
                    label_counts[label] += count
        for label, count in label_counts.items():
            data.append({'Split': split_type, 'Fold': fold_name, 'Label': label, 'Count': count})

df = pd.DataFrame(data)

# Create the plot using Plotly for each split_type
for split_type in df['Split'].unique():
    df_split = df[df['Split'] == split_type]
    fig = px.line(df_split, x='Label', y='Count', color='Fold', markers=True, title=f'Label Distribution Across Folds for {split_type}')
    fig.show()

# Keep the data in a CSV file for future reference
df.to_csv(f"label_distribution_{TASK}.csv", index=False)

# **Label Duration Average and STD**

In [20]:
import numpy as np

def labels_duration_in_video(video_list, durations, split_type, split_num, video_name, TASK):
    with open(os.path.join("/data/home/gabrielg/BoundedFuture++/Bounded_Future_from_GIT/data/MultiBypass140/transcriptions", TASK, video_name + ".txt"), "r") as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            if line == "":
                continue
            line = line.split(" ")
            if len(line) < 2:
                continue
            label = line[2]
            last_label = line[1]
            first_label = line[0]
            duration = int(last_label) - int(first_label) + 1
            if label in video_list[split_type][f"Fold {split_num}"][video_name]:
                durations[split_type][label].append(duration)
    return video_list, durations
    

In [21]:
video_list = {}
durations = {}
durations['train'] = {}
durations['val'] = {}
durations['test'] = {}
for i in range(CLASS_N):
    durations['train'][f"{TASK[0].upper()}{i}"] = []
    durations['val'][f"{TASK[0].upper()}{i}"] = []
    durations['test'][f"{TASK[0].upper()}{i}"] = []
for file in os.listdir('/data/home/gabrielg/BoundedFuture++/Bounded_Future_from_GIT/data/MultiBypass140/Splits'):
    if file.startswith('data') and file.endswith('.csv'):
        split_type = file.split('_')[1]
        split_num = file.split('_')[2].split('.')[0]
        with open(os.path.join('/data/home/gabrielg/BoundedFuture++/Bounded_Future_from_GIT/data/MultiBypass140/Splits', file)) as f:
            video_lines = f.read().splitlines()
        for video in video_lines:
            video_name = video.split(',')[0]
            total_frames = video.split(',')[1]
            if split_type not in video_list:
                video_list[split_type] = {}
                # durations[split_type] = {}
            if f"Fold {split_num}" not in video_list[split_type]:
                video_list[split_type][f"Fold {split_num}"] = {}
            video_list[split_type][f"Fold {split_num}"][video_name] = {}
            video_list[split_type][f"Fold {split_num}"][video_name]['total frames'] = int(total_frames)
            for i in range(0, CLASS_N):
                first_letter = TASK[0].upper()
                video_list[split_type][f"Fold {split_num}"][video_name][f"{first_letter}{i}"] = {}
                # durations[split_type][f"{first_letter}{i}"] = []
            video_list, durations = labels_duration_in_video(video_list, durations, split_type, split_num, video_name, TASK)
            
# Calculate average and std for each label
avg_std_durations = {}
for split_type in durations:
   avg_std_durations[split_type] = {} 
for split_type in durations:   
    for label, duration_list in durations[split_type].items():
        if duration_list:
            avg_std_durations[split_type][label] = {
                'average': np.mean(duration_list),
                'std': np.std(duration_list)
            }
        else:
            avg_std_durations[split_type][label] = {
                'average': 0,
                'std': 0
            }

In [22]:
import pandas as pd
import plotly.express as px

# Prepare data for Plotly
data = []

# Aggregate duration statistics for each split_type and label
for split_type, labels in avg_std_durations.items():
    for label, stats in labels.items():
        data.append({'Split': split_type, 'Label': label, 'Average Duration': stats['average'], 'STD Duration': stats['std']})

df = pd.DataFrame(data)

# Create the plot using Plotly for each split_type
for split_type in df['Split'].unique():
    df_split = df[df['Split'] == split_type]
    fig = px.bar(df_split, x='Label', y='Average Duration', error_y='STD Duration', title=f'Average and STD of Label Durations for {split_type}')
    fig.show()

# Keep the data in a CSV file for future reference
df.to_csv(f"label_duration_{TASK}.csv", index=False)

# **Most Common Label after each label**

In [23]:
# Most Common Label after each label


In [24]:
# chmod u+x for data_*.csv files
os.system('chmod u+x /data/home/gabrielg/Bounded_Future_from_GIT/data/NULL/Splits/*.csv')

chmod: cannot access '/data/home/gabrielg/Bounded_Future_from_GIT/data/NULL/Splits/*.csv': No such file or directory


256