In [1]:
import ast
import csv
import json
import os
import sys
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [2]:


# Load the CSV file
def eval_set(csv_file_path):
    df = pd.read_csv(csv_file_path, header=None, names=['id', 'videos', 'unknown', 'time_range', 'empty_list', 'annotations'], skiprows=10)
    print(df.head())    
    # Extract the label "close box"
    df['annotations'] = df['annotations'].apply(eval)  # Convert string representation of dictionary to actual dictionary

    # Collect all unique entries from the 'annotations' column
    unique_annotations = set()
    df['annotations'].apply(lambda x: unique_annotations.update(x.values()))
    # Count the occurrences of each unique annotation
    annotation_counts = {annotation: 0 for annotation in unique_annotations}
    df['annotations'].apply(lambda x: [annotation_counts.update({v: annotation_counts[v] + 1}) for v in x.values()])

    # Plot the counts
  
    # total num of annotations
    total_num_annotations = sum(annotation_counts.values())
    
    return annotation_counts

In [3]:
csv_base = '/Users/dennisbaumann/cars_paper/data/Arctic_Annotations/CSV/'

csv_files = []

for file in sorted(os.listdir(csv_base)):
    if file.endswith(".csv"):
        csv_files.append(csv_base + file)
csv_dict = {}

for i,csv_file in enumerate(csv_files):
    csv_dict[f'S{i+1:02d}'] = eval_set(csv_file)
print(csv_dict)



           id               videos  unknown         time_range empty_list  \
0  1_M2Sgz7U3  ["box_grab_01.mp4"]        0   [1.7105,2.48134]         []   
1  1_S9ZmzwSQ  ["box_grab_01.mp4"]        0  [2.98134,3.91884]         []   
2  1_UyF46fth  ["box_grab_01.mp4"]        0   [4.62717,5.5855]         []   
3  1_P7LkutCY  ["box_grab_01.mp4"]        0   [5.60634,6.4605]         []   
4  1_iCKtqfX3  ["box_grab_01.mp4"]        0    [7.273,8.12717]         []   

         annotations  
0   {"1":"grab box"}  
1  {"1":"place box"}  
2   {"1":"grab box"}  
3  {"1":"place box"}  
4   {"1":"grab box"}  
           id               videos  unknown           time_range empty_list  \
0  1_A05aAnqX  ["box_grab_01.mp4"]        0    [1.23142,2.62726]         []   
1  1_8vFqmtsY  ["box_grab_01.mp4"]        0    [3.52309,5.02309]         []   
2  1_IZx2aGBn  ["box_grab_01.mp4"]        0    [6.23142,7.48142]         []   
3  1_KHRcEgcX  ["box_grab_01.mp4"]        0   [8.81476,10.10642]         []   
4  1

In [4]:
unique_labels = set()
for subset in csv_dict.values():
    unique_labels.update(subset.keys())

# Initialize a dictionary to store the counts
counts_dict = {label: {'total': 0} for label in unique_labels}
for subset_name, subset in csv_dict.items():
    for label in unique_labels:
        counts_dict[label][subset_name] = subset.get(label, 0)
        counts_dict[label]['total'] += subset.get(label, 0)

# Convert the counts dictionary to a DataFrame
df = pd.DataFrame.from_dict(counts_dict, orient='index').reset_index()
df.rename(columns={'index': 'label'}, inplace=True)

# Save the DataFrame to a CSV file
output_csv_path = '/Users/dennisbaumann/cars_paper/data/Arctic_Annotations/eval.csv'
df.to_csv(output_csv_path, index=False)


In [5]:
reduced_dict = {}
for label, counts in counts_dict.items():
    if counts['total'] > 20:
        reduced_dict[label] = {k: v for k, v in counts.items() if k != 'total'}
        # Calculate the percentage of the total occurrences for each label
        for subset_name in reduced_dict[label].keys():
            reduced_dict[label][subset_name] = int((counts[subset_name] / counts['total']) * 100)
df = pd.DataFrame.from_dict(reduced_dict, orient='index').reset_index()
df.rename(columns={'index': 'label'}, inplace=True)

# Save the DataFrame to a CSV file
output_csv_path = '/Users/dennisbaumann/cars_paper/data/Arctic_Annotations/reduced_eval.csv'
df.to_csv(output_csv_path, index=False)

In [23]:
from mpl_toolkits.mplot3d import Axes3D
from itertools import combinations

def generate_train_test_splits(data_dict):
    """
    Generate all possible 7-3 train-test splits for a dictionary containing label distributions.

    Parameters:
        data_dict (dict): A dictionary where each key is a label and the value is another dictionary
                          with keys 's1', 's2', ..., 's10' representing counts or percentages.

    Returns:
        list: A list of dictionaries, each containing 'train' and 'test' keys with the corresponding splits.
    """
    # Generate all combinations of 7 subsets out of 10
    # Remove 'total' entry from the dictionary
    
    subsets = list(range(1, 11))  # s1 to s10 as integers
    combinations_7_3 = list(combinations(subsets, 7))

    # Prepare train-test splits
    splits = []
    for train_combination in combinations_7_3:
        test_combination = [s for s in subsets if s not in train_combination]

        # Generate the train and test data for each label
        train_test_split = {
            'train': {},
            'test': {}
        }

        for label, subset_values in data_dict.items():
            # Sum the values for train and test subsets
            train_sum = sum(subset_values[f'S{s:02d}'] for s in train_combination)
            test_sum = sum(subset_values[f'S{s:02d}'] for s in test_combination)

            # Add to train-test split
            
            train_test_split['train'][label] = train_sum
            train_test_split['test'][label] = test_sum
        combo  = [train_combination, train_test_split]
        splits.append(combo)

    return splits

# Build the list of candidate train/test splits from reduced_dict.
# Each entry of `splits` is a two-element list: [train_combination, {'train': {...}, 'test': {...}}].
splits = generate_train_test_splits(reduced_dict)



In [24]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


def score_splits(splits):
    scores = []
    for split in splits:
        train_values = list(split[1]['train'].values())
        test_values = list(split[1]['test'].values())
        
        train_mae = mean_absolute_error(train_values, [70] * len(train_values))
        test_mae = mean_absolute_error(test_values, [30] * len(test_values))
        train_abs_error = np.array([abs(70 - v) for v in train_values])
        test_abs_error = np.array([abs(30 - v) for v in test_values])
        cost = sum(train_abs_error**2)/len(train_abs_error) + 5*sum(test_abs_error**2)/len(test_abs_error) + train_mae**2 + test_mae**2
        
        scores.append({'split':split[0],'cost': cost,'train_mae': train_mae, 'test_mae': test_mae, 'train_highest': max(train_abs_error), 'test_highest': max(test_abs_error)})
    
    return scores


# Score the splits
split_scores = score_splits(splits)
best_split = min(split_scores, key=lambda x: x['cost'])
best_split_index = split_scores.index(best_split)
print(best_split)


{'split': (2, 4, 5, 6, 7, 9, 10), 'cost': 344.4479785969084, 'train_mae': 6.689655172413793, 'test_mae': 4.844827586206897, 'train_highest': 24, 'test_highest': 22}


In [26]:

train_split = splits[best_split_index][1]['train']
test_split = splits[best_split_index][1]['test']

print(train_split)
# Create a DataFrame from the best split
best_split_df = pd.DataFrame({
    'label': train_split.keys(),
    'train_count': train_split.values(),
    'test_count': test_split.values()
})
# Save the DataFrame to a CSV file
best_split_csv_path = '/Users/dennisbaumann/cars_paper/data/Arctic_Annotations/best_split.csv'
best_split_df.to_csv(best_split_csv_path, index=False)

print(best_split_df)

{'close box': 58, 'grab phone': 67, 'grab waffleiron': 70, 'rotate microwave': 76, 'place mixer': 71, 'place ketchup': 67, 'open microwave': 61, 'rotate waffleiron': 65, 'grab scissors': 64, 'open capsulemachine': 59, 'close ketchup': 73, 'open laptop': 59, 'grab notebook': 65, 'rotate ketchup': 65, 'close mixer': 67, 'open notebook': 60, 'place microwave': 63, 'rotate laptop': 64, 'grab microwave': 57, 'close waffleiron': 63, 'grab mixer': 71, 'place scissors': 62, 'rotate phone': 70, 'rotate espressomachine': 70, 'use ketchup': 61, 'place laptop': 59, 'read notebook': 65, 'close phone': 68, 'open box': 58, 'place box': 59, 'grab capsulemachine': 66, 'open phone': 72, 'place notebook': 66, 'grab ketchup': 71, 'grab box': 58, 'grab laptop': 56, 'rotate mixer': 62, 'lever espressomachine': 68, 'open mixer': 67, 'type phone': 46, 'rotate capsulemachine': 58, 'open waffleiron': 66, 'close capsulemachine': 59, 'close notebook': 61, 'knob espressomachine': 48, 'dial phone': 64, 'rotate box'