## Packages

In [1]:
import IPython
%load_ext autoreload

%autoreload 2

In [2]:
from addict import Dict
import albumentations as A
from albumentations.pytorch import ToTensorV2

import logging

from matplotlib import pyplot

import numpy as np

import os
import os.path as osp

import pandas as pd
from pprint import pprint
from PIL import Image

import sys
from tqdm import tqdm

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

from typing import Union, Any, List

# Logger config
logging.basicConfig(level="INFO")

In [3]:
sys.path.append("../src")

In [4]:
import config
import vocabulary as vocab

## Config

In [5]:
DATASET_DIR = "../../../dataset/robot_manipulations/"
FINAL_DATASET_DIR = "../../../dataset/robot_manipulations/"

In [6]:
os.listdir(FINAL_DATASET_DIR)

['v1', 'v2', 'dataset.csv', 'train.csv', 'test.csv']

## Pre-processing

### Merge Datasets

In [8]:
v1 = pd.read_csv(osp.join(FINAL_DATASET_DIR, "v1", "updated_train.csv"))
v2 = pd.read_csv(osp.join(FINAL_DATASET_DIR, "v2", "updated_trainV2.csv"))

v1.shape, v2.shape

((1970, 8), (3156, 8))

In [9]:
v1.head()

Unnamed: 0,sample_ID,in_state,goal_state,validator,action_description,motor_cmd,len_action_desc,len_motor_cmd
0,1005,0,9,amihretu,put the :BOTTLE to the left of :BOTTLE,:BOTTLE BLUE POSE-9 :BOTTLE RED POSE-2 :BOTTLE...,8,11
1,1011,0,9,amihretu,move the :BOTTLE left,:BOTTLE BLUE POSE-3 :BOTTLE #'*leftward-trans...,4,8
2,1012,0,9,amihretu,put the :BOTTLE to the right of :MUG,:BOTTLE BLUE POSE-7 :MUG RED POSE-3 :BOTTLE #...,8,11
3,1013,0,9,amihretu,shift the :CUP backwards,:CUP RED POSE-4 :CUP #'*backward-transformati...,4,8
4,1015,0,9,amihretu,shift the :BOTTLE forwards,:BOTTLE GREEN POSE-3 :BOTTLE #'*forward-trans...,4,8


In [10]:
v2.head()

Unnamed: 0,sample_ID,in_state,goal_state,validator,action_description,motor_cmd,len_action_desc,len_motor_cmd
0,8208,0,10,amihretu,put the :SPOON in front of :GLOVE,:SPOON RED POSE-9 :GLOVE RED POSE-3 :SPOON #'...,7,11
1,8329,0,10,amihretu,put the :BOWL behind :BOWL,:BOWL RED POSE-9 :BOWL BLUE POSE-4 :BOWL #'*b...,5,11
2,8350,0,10,amihretu,put the :FORK to the left of :MUG,:FORK BLUE POSE-8 :MUG BLUE POSE-3 :FORK #'*l...,8,11
3,8369,0,10,amihretu,put the :KNIFE to the left of :GLASSES,:KNIFE GREEN POSE-6 :GLASSES BLUE POSE-1 :KNIF...,8,11
4,8395,0,10,amihretu,put the :KNIFE to the left of :GLASSES,:KNIFE GREEN POSE-6 :GLASSES BLUE POSE-1 :KNIF...,8,11


In [11]:
n_collisions = len(set(v1.sample_ID.tolist()).intersection(set(v2.sample_ID.tolist())))
n_collisions

852

In [12]:
v1["version"] = "v1"
v2["version"] = "v2"

In [13]:
v1.head(n=2)

Unnamed: 0,sample_ID,in_state,goal_state,validator,action_description,motor_cmd,len_action_desc,len_motor_cmd,version
0,1005,0,9,amihretu,put the :BOTTLE to the left of :BOTTLE,:BOTTLE BLUE POSE-9 :BOTTLE RED POSE-2 :BOTTLE...,8,11,v1
1,1011,0,9,amihretu,move the :BOTTLE left,:BOTTLE BLUE POSE-3 :BOTTLE #'*leftward-trans...,4,8,v1


In [14]:
v2.head(n=2)

Unnamed: 0,sample_ID,in_state,goal_state,validator,action_description,motor_cmd,len_action_desc,len_motor_cmd,version
0,8208,0,10,amihretu,put the :SPOON in front of :GLOVE,:SPOON RED POSE-9 :GLOVE RED POSE-3 :SPOON #'...,7,11,v2
1,8329,0,10,amihretu,put the :BOWL behind :BOWL,:BOWL RED POSE-9 :BOWL BLUE POSE-4 :BOWL #'*b...,5,11,v2


In [15]:
df = pd.concat([v1, v2]).sample(frac=1.).reset_index()

df.sample(n=2)

Unnamed: 0,index,sample_ID,in_state,goal_state,validator,action_description,motor_cmd,len_action_desc,len_motor_cmd,version
1818,161,1355,0,9,amihretu,put the :BOTTLE to the left of :MUG,:BOTTLE BLUE POSE-6 :MUG BLUE POSE-1 :BOTTLE ...,8,11,v1
2517,1909,1834,0,10,mdanso,put the :CUP in front of :BREAKFAST-CEREAL,:CUP GREEN POSE-9 :BREAKFAST-CEREAL BLUE POSE-...,7,11,v2


In [16]:
df.shape

(5126, 10)

In [17]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
index,5126.0,1349.601053,847.745597,0.0,640.25,1281.0,1921.75,3155.0
sample_ID,5126.0,3469.328326,2680.043037,5.0,1419.25,2776.5,4676.25,10002.0
in_state,5126.0,0.003902,0.197507,0.0,0.0,0.0,0.0,10.0
goal_state,5126.0,9.420406,0.659789,0.0,9.0,9.0,10.0,10.0
len_action_desc,5126.0,6.650605,1.56094,4.0,5.0,7.0,8.0,8.0
len_motor_cmd,5126.0,10.374756,1.218803,6.0,11.0,11.0,11.0,11.0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5126 entries, 0 to 5125
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   index               5126 non-null   int64 
 1   sample_ID           5126 non-null   int64 
 2   in_state            5126 non-null   int64 
 3   goal_state          5126 non-null   int64 
 4   validator           5126 non-null   object
 5   action_description  5126 non-null   object
 6   motor_cmd           5126 non-null   object
 7   len_action_desc     5126 non-null   int64 
 8   len_motor_cmd       5126 non-null   int64 
 9   version             5126 non-null   object
dtypes: int64(6), object(4)
memory usage: 400.6+ KB


## Fixing object identifiers in instructions

### Load merged dataset

In [8]:
df = pd.read_csv(osp.join(FINAL_DATASET_DIR, "dataset.csv"))

df.shape

(5126, 10)

In [9]:
OBJECTS_FIXMAP = {obj:obj.split(":")[1].lower() for obj in vocab.OBJECTS}
OBJECTS_FIXMAP

{':POT': 'pot',
 ':BLUE-METAL-PLATE': 'blue-metal-plate',
 ':SPOON': 'spoon',
 ':SHOE': 'shoe',
 ':FORK': 'fork',
 ':BOTTLE': 'bottle',
 ':GLASSES': 'glasses',
 ':SPATULA': 'spatula',
 ':CEREAL': 'cereal',
 ':WEISSWURST': 'weisswurst',
 ':BREAKFAST-CEREAL': 'breakfast-cereal',
 ':GLOVE': 'glove',
 ':BUTTERMILK': 'buttermilk',
 ':RED-METAL-PLATE': 'red-metal-plate',
 ':KNIFE': 'knife',
 ':MONDAMIN': 'mondamin',
 ':MUG': 'mug',
 ':CAP': 'cap',
 ':BOWL': 'bowl',
 ':CUBE': 'cube',
 ':MILK': 'milk',
 ':PLATE': 'plate',
 ':CUP': 'cup',
 ':TRAY': 'tray'}

In [10]:
df["action_description"] = df.action_description.replace(OBJECTS_FIXMAP, regex=True)

In [11]:
df.head(n=2)

Unnamed: 0,index,sample_ID,in_state,goal_state,validator,action_description,motor_cmd,len_action_desc,len_motor_cmd,version
0,186,8667,0,10,amihretu,put the knife to the left of cup,:KNIFE GREEN POSE-7 :CUP BLUE POSE-3 :KNIFE #...,8,11,v2
1,3107,5906,0,10,dmusingu,put the bottle behind glove,:BOTTLE GREEN POSE-9 :GLOVE RED POSE-5 :BOTTLE...,5,11,v2


In [12]:
df.action_description.nunique()

604

In [15]:
test_samples = df.drop_duplicates(subset="action_description").sample(n=250, random_state=config.SEED).reset_index(drop=True)
test_samples.shape

(250, 10)

In [16]:
test_samples

Unnamed: 0,index,sample_ID,in_state,goal_state,validator,action_description,motor_cmd,len_action_desc,len_motor_cmd,version
0,535,2146,0,9,cmanouan,put the fork to the left of mondamin,:FORK RED POSE-13 :MONDAMIN BLUE POSE-11 :FORK...,8,11,v1
1,530,9975,0,10,amihretu,move the breakfast-cereal right,:BREAKFAST-CEREAL BLUE POSE-3 :BREAKFAST-CEREA...,5,8,v2
2,1331,3679,0,9,mdanso,move the cereal forwards,:CEREAL BLUE POSE-11 :CEREAL #'*forward-trans...,4,8,v1
3,1758,1571,0,10,mdanso,put the breakfast-cereal to the left of plate,:BREAKFAST-CEREAL GREEN POSE-7 :PLATE RED POSE...,8,11,v2
4,2661,3711,0,9,dmusingu,shift the knife forwards,:KNIFE GREEN POSE-2 :KNIFE #'*forward-transfo...,5,8,v2
...,...,...,...,...,...,...,...,...,...,...
245,1881,829,0,9,dmusingu,put the bottle behind pot,:BOTTLE RED POSE-6 :POT BLUE POSE-2 :BOTTLE #...,5,11,v1
246,544,2165,0,9,cmanouan,put the fork behind mondamin,:FORK GREEN POSE-13 :MONDAMIN BLUE POSE-11 :FO...,5,11,v1
247,76,8200,0,10,amihretu,put the milk behind fork,:MILK GREEN POSE-8 :FORK BLUE POSE-5 :MILK #'...,5,11,v2
248,1622,1335,0,8,mdanso,put the fork in front of blue-metal-plate,:FORK BLUE POSE-7 :BLUE-METAL-PLATE GREEN POSE...,7,11,v2


In [17]:
train_samples = df[~df.index.isin(test_samples.index)].reset_index(drop=True)
train_samples.shape

(4876, 10)

In [18]:
(train_samples.shape[0] + test_samples.shape[0]) == df.shape[0]

True

In [19]:
train_samples[[
    "sample_ID",
    "in_state",
    "goal_state",
    "action_description",
    "motor_cmd",
    "len_action_desc",
    "len_motor_cmd",
    "version"
]].to_csv(osp.join(FINAL_DATASET_DIR, "train.csv"), index=False)

test_samples[[
    "sample_ID",
    "in_state",
    "goal_state",
    "action_description",
    "motor_cmd",
    "len_action_desc",
    "len_motor_cmd",
    "version"
]].to_csv(osp.join(FINAL_DATASET_DIR, "test.csv"), index=False)

# Edit sets

In [21]:
train = pd.read_csv(osp.join(FINAL_DATASET_DIR, "train.csv"))

In [22]:
train.head()

Unnamed: 0,sample_ID,in_state,goal_state,action_description,motor_cmd,len_action_desc,len_motor_cmd,version
0,7294,0,10,put the fork to the right of buttermilk,:FORK GREEN POSE-9 :BUTTERMILK GREEN POSE-2 :F...,8,11,v2
1,405,0,8,move the bottle backwards,:BOTTLE RED POSE-2 :BOTTLE #'*backward-transf...,4,8,v1
2,4235,0,10,put the bottle to the left of breakfast-cereal,:BOTTLE RED POSE-7 :BREAKFAST-CEREAL BLUE POSE...,8,11,v2
3,6990,0,10,put the milk in front of bottle,:MILK BLUE POSE-8 :BOTTLE RED POSE-4 :MILK #'...,7,11,v2
4,7096,0,10,put the cup in front of glasses,:CUP GREEN POSE-6 :GLASSES RED POSE-2 :CUP #'...,7,11,v2


In [23]:
train.shape

(4876, 8)

In [57]:
TO_REMOVE = [
    121, 8536, 8732, 853, 8369, 8780, 2974, 8849, 8688, 86,
    8600
]

In [58]:
train = train[~train["sample_ID"].isin(TO_REMOVE)].reset_index(drop=True)
train.to_csv(osp.join(FINAL_DATASET_DIR, "train.csv"), index=False)

In [59]:
train.shape

(4864, 8)

In [11]:

# Define the root directory to start the search
root_directory = osp.join(FINAL_DATASET_DIR, "v2")  # Replace with the path to your directory

# Initialize lists to store folder names with incorrect file counts
folders_with_incorrect_counts = []

# Function to count files in a folder
def count_files_in_folder(folder_path):
    try:
        file_count = len([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])
        return file_count
    except FileNotFoundError:
        return 0

In [73]:
# Traverse subfolders and check file counts
for root, dirs, files in os.walk(root_directory):
    for folder_name in dirs:
        folder_path = os.path.join(root, folder_name)
        file_count = count_files_in_folder(folder_path)
        
        # Check if the file count is not 9 or 11
        if file_count != 9 and file_count != 11:
            folders_with_incorrect_counts.append(folder_name)

In [74]:
# # Print or store the folders with incorrect file counts
# for folder_name in folders_with_incorrect_counts:
#     print(f"Folder '{folder_name}")


In [76]:
len(folders_with_incorrect_counts)

276

In [81]:
folders_with_incorrect_counts = [int(f) for f in folders_with_incorrect_counts]

In [82]:
folders_with_incorrect_counts

[382,
 3012,
 8600,
 150,
 232,
 234,
 929,
 294,
 51,
 1637,
 94,
 2122,
 21,
 321,
 829,
 3471,
 3354,
 380,
 162,
 9178,
 98,
 23,
 29,
 371,
 59,
 195,
 717,
 3047,
 3736,
 253,
 155,
 314,
 2085,
 130,
 8369,
 110,
 307,
 3711,
 1697,
 105,
 317,
 3100,
 273,
 376,
 3503,
 67,
 70,
 3079,
 121,
 75,
 7714,
 357,
 8780,
 9210,
 283,
 2947,
 40,
 383,
 2928,
 4920,
 56,
 66,
 26,
 146,
 1275,
 306,
 1131,
 2747,
 86,
 129,
 3153,
 4129,
 4847,
 1562,
 72,
 27,
 1726,
 90,
 156,
 1022,
 252,
 261,
 9019,
 4097,
 9399,
 305,
 9595,
 3251,
 3957,
 790,
 249,
 325,
 2715,
 4388,
 8952,
 362,
 374,
 9084,
 45,
 1107,
 313,
 106,
 1350,
 265,
 319,
 8849,
 365,
 88,
 2058,
 6046,
 2809,
 288,
 12,
 76,
 39,
 225,
 7653,
 8688,
 34,
 74,
 1414,
 4630,
 71,
 102,
 4358,
 366,
 84,
 9431,
 1418,
 50,
 853,
 863,
 131,
 145,
 8536,
 1184,
 83,
 91,
 68,
 99,
 3325,
 3219,
 3304,
 260,
 4466,
 53,
 54,
 9,
 9563,
 3797,
 4214,
 220,
 52,
 35,
 243,
 32,
 158,
 236,
 1058,
 3881,
 96,
 378,
 5,

In [83]:
train = train[~train["sample_ID"].isin(folders_with_incorrect_counts)].reset_index(drop=True)
train.to_csv(osp.join(FINAL_DATASET_DIR, "train.csv"), index=False)

In [84]:
train.shape

(4508, 8)

In [85]:
train.head()

Unnamed: 0,sample_ID,in_state,goal_state,action_description,motor_cmd,len_action_desc,len_motor_cmd,version
0,7294,0,10,put the fork to the right of buttermilk,:FORK GREEN POSE-9 :BUTTERMILK GREEN POSE-2 :F...,8,11,v2
1,405,0,8,move the bottle backwards,:BOTTLE RED POSE-2 :BOTTLE #'*backward-transf...,4,8,v1
2,4235,0,10,put the bottle to the left of breakfast-cereal,:BOTTLE RED POSE-7 :BREAKFAST-CEREAL BLUE POSE...,8,11,v2
3,6990,0,10,put the milk in front of bottle,:MILK BLUE POSE-8 :BOTTLE RED POSE-4 :MILK #'...,7,11,v2
4,7096,0,10,put the cup in front of glasses,:CUP GREEN POSE-6 :GLASSES RED POSE-2 :CUP #'...,7,11,v2


## Test dataset

In [13]:
folders_with_incorrect_counts = [int(f) for f in folders_with_incorrect_counts]
folders_with_incorrect_counts

[]

In [15]:
test = pd.read_csv(osp.join(FINAL_DATASET_DIR, "test.csv"))
print(test.shape)
test.head()

(249, 8)


Unnamed: 0,sample_ID,in_state,goal_state,action_description,motor_cmd,len_action_desc,len_motor_cmd,version
0,2146,0,9,put the fork to the left of mondamin,:FORK RED POSE-13 :MONDAMIN BLUE POSE-11 :FORK...,8,11,v1
1,9975,0,10,move the breakfast-cereal right,:BREAKFAST-CEREAL BLUE POSE-3 :BREAKFAST-CEREA...,5,8,v2
2,3679,0,9,move the cereal forwards,:CEREAL BLUE POSE-11 :CEREAL #'*forward-trans...,4,8,v1
3,1571,0,10,put the breakfast-cereal to the left of plate,:BREAKFAST-CEREAL GREEN POSE-7 :PLATE RED POSE...,8,11,v2
4,3711,0,9,shift the knife forwards,:KNIFE GREEN POSE-2 :KNIFE #'*forward-transfo...,5,8,v2


In [9]:
TO_REMOVE = [
    8395
]

In [10]:
test = test[~test["sample_ID"].isin(TO_REMOVE)].reset_index(drop=True)
print(test.shape)


(249, 8)


In [14]:
test.to_csv(osp.join(FINAL_DATASET_DIR, "test.csv"), index=False)