# Dataset Inspection

Inspect, merge and balance datasets.

In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell runs so edits to project code take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Move the working directory one level up so that the relative "data/..."
# paths used below resolve (presumably to the repository root -- confirm).
# NOTE: this cell is not idempotent; re-running it moves up another level.
os.chdir('..')

In [None]:
from __future__ import print_function, division

from pathlib2 import Path
import subprocess

import numpy as np
import pandas as pd
import rospy

from vgn.dataset import Dataset

## Compute Statistics

In [None]:
# Directory of the dataset under inspection and the grasp table inside it.
dataset_dir = Path("data") / "datasets" / "train"
csv_path = dataset_dir.joinpath("grasps.csv")

In [None]:
# Load the grasp table and split it by the value of its "label" column
# (1 is treated as a positive grasp, 0 as a negative one).
df = pd.read_csv(csv_path)

labels = df["label"]
positives = df.loc[labels == 1]
negatives = df.loc[labels == 0]

# Report the overall size and the per-class counts.
for description, frame in [
    ("Number of samples:", df),
    ("Number of positives:", positives),
    ("Number of negatives:", negatives),
]:
    print(description, len(frame))

## Visualize Samples

Visualize random grasp samples in rviz.

In [None]:
# Register this notebook with ROS as the node "dataset_inspection".
# Presumably required before Dataset.draw can publish to rviz -- confirm.
# NOTE(review): a process can normally initialise a ROS node only once, so
# re-running this cell in the same kernel may fail -- confirm.
rospy.init_node("dataset_inspection")

In [None]:
# Open the dataset with augmentation switched off.
dataset = Dataset(dataset_dir, augment=False)

# Choose one sample index uniformly at random and draw it.
# NOTE(review): the meaning of the 0.05 argument is defined by Dataset.draw,
# which is not visible here -- confirm before changing it.
sample_count = len(dataset)
i = np.random.choice(sample_count)
dataset.draw(i, 0.05)

## Balance Dataset

Discard a random subset of negative samples to ensure the same number of positive and negative grasp samples.

In [None]:
# Reload the grasp table so balancing always starts from the file on disk,
# then split it by label (1 = positive, 0 = negative).
df = pd.read_csv(csv_path)
positives = df[df["label"] == 1]
negatives = df[df["label"] == 0]

# Number of negative rows that must go for both classes to be the same size.
surplus = len(negatives.index) - len(positives.index)

if surplus > 0:
    # Sample row labels without replacement so that exactly `surplus`
    # distinct negatives are removed.
    discarded = np.random.choice(negatives.index, surplus, replace=False)
    df = df.drop(discarded)
else:
    # Positives are at least as numerous as negatives. Asking
    # np.random.choice for a negative number of samples raises ValueError,
    # and only negatives are ever discarded here, so leave `df` untouched.
    print("No surplus negatives to discard; dataframe left unchanged.")

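*DANGER ZONE* - the next two cells overwrite `grasps.csv` on disk with the balanced dataframe and permanently delete the TSDF files that are no longer referenced by any remaining grasp.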

In [None]:
# Destructive: replaces the file at `csv_path` with the current contents of
# `df`. index=False keeps the dataframe's row index out of the file.
# `csv_path` is a notebook-global that other cells may reassign, so check that
# it still points at the dataset you intend to overwrite before running this.
df.to_csv(csv_path, index=False)

In [None]:
# Scene ids that still have at least one grasp in `df`. They are normalised
# to strings because they are compared against file stems; if pandas parsed
# the column as numbers, a raw comparison would never match and every TSDF
# would be deleted. A set also gives constant-time membership tests.
scenes = set(df["scene_id"].astype(str))
tsdfs_dir = dataset_dir / "tsdfs"

# Materialise the listing first so files are not unlinked while the directory
# is still being iterated; sorting makes the log output deterministic.
for f in sorted(tsdfs_dir.iterdir()):
    if f.suffix == ".npz" and f.stem not in scenes:
        print("Removing ", f)
        f.unlink()

## Merge Datasets

In [None]:
# Merge configuration: each dataset named in `sources` is merged into the
# dataset named `target`; all of them are directories directly under `root`.
root = Path("data").joinpath("datasets")
target = "train"
sources = ["train{}".format(shard) for shard in range(4)]

In [None]:
import shutil

target_dir = root / target
target_csv_path = target_dir / "grasps.csv"
target_tsdfs_dir = target_dir / "tsdfs"

# Start from the grasps already stored in the target dataset, or from
# nothing when the target has no grasp table yet.
if target_csv_path.exists():
    frames = [pd.read_csv(target_csv_path)]
else:
    frames = []

# Make sure the destination exists; otherwise the first moved file would be
# renamed to "tsdfs" instead of being placed inside it.
target_tsdfs_dir.mkdir(parents=True, exist_ok=True)

for source in sources:
    source_dir = root / source

    # Collect the source grasps. A dedicated name is used for the path so the
    # notebook-global `csv_path` of the dataset under inspection is not
    # silently redirected to a source dataset.
    source_csv_path = source_dir / "grasps.csv"
    frames.append(pd.read_csv(source_csv_path))

    # Move the TSDF archives without going through a shell, so unusual
    # characters in paths cannot be misinterpreted and failures raise.
    # NOTE(review): this globs "*.npz" directly in the source directory, while
    # the cleanup cell expects TSDFs under "<dataset>/tsdfs" -- confirm which
    # layout the source datasets actually use.
    for tsdf_path in sorted(source_dir.glob("*.npz")):
        shutil.move(str(tsdf_path), str(target_tsdfs_dir / tsdf_path.name))

# Concatenate once at the end instead of re-copying the growing frame on
# every iteration.
target_df = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Destructive: replaces the target dataset's grasps.csv with the merged
# table. index=False keeps the dataframe's row index out of the file.
target_df.to_csv(target_csv_path, index=False)