# Create smaller dataset
This file returns a subset of the original Eigen dataset, where the following conditions are met:
- The sampled images follow the same distribution over the 28 scenes as the dataset constructed by Eigen et al.
- To increase the variance, the left and right images of a stereo pair are never both selected.
- Even distribution of images taken by the left and right camera.

In [1]:
import random
import itertools

# Seed the random generator so that the sampled subset is the same on every run
random.seed(10)

# Read the training split: every element of `lines` is one entry of the txt file,
# including its trailing newline
with open('splits/eigen_zhou/train_files.txt') as split_file:
    lines = list(split_file)

#### Obtain the different scenes in the Eigen training split.

In [2]:
# Characters 11-36 of every entry hold the scene (drive) name, e.g.
# '2011_09_26_drive_0022_sync'. dict.fromkeys keeps only the first occurrence
# of each name and preserves the order in which the scenes appear in the split.
scenes = list(dict.fromkeys(entry[11:37] for entry in lines))

"The scenes in the eigen training split:", scenes

('The scenes in the eigen training split:',
 ['2011_09_26_drive_0022_sync',
  '2011_09_29_drive_0026_sync',
  '2011_09_26_drive_0087_sync',
  '2011_09_30_drive_0028_sync',
  '2011_10_03_drive_0034_sync',
  '2011_10_03_drive_0042_sync',
  '2011_09_26_drive_0061_sync',
  '2011_09_26_drive_0091_sync',
  '2011_09_30_drive_0033_sync',
  '2011_09_29_drive_0004_sync',
  '2011_09_26_drive_0051_sync',
  '2011_09_26_drive_0014_sync',
  '2011_09_26_drive_0032_sync',
  '2011_09_26_drive_0028_sync',
  '2011_09_26_drive_0039_sync',
  '2011_09_26_drive_0018_sync',
  '2011_09_26_drive_0104_sync',
  '2011_09_26_drive_0070_sync',
  '2011_09_30_drive_0034_sync',
  '2011_09_26_drive_0001_sync',
  '2011_09_30_drive_0020_sync',
  '2011_09_26_drive_0005_sync',
  '2011_09_26_drive_0095_sync',
  '2011_09_26_drive_0015_sync',
  '2011_09_26_drive_0035_sync',
  '2011_09_26_drive_0113_sync',
  '2011_09_26_drive_0019_sync',
  '2011_09_26_drive_0011_sync',
  '2011_09_28_drive_0001_sync',
  '2011_09_26_drive_0079_syn

#### Number of images from the left and right cameras. The two counts are approximately the same.

In [3]:
# Tally how many entries come from the left ('l') and the right ('r') camera.
# The camera side is the character directly before the trailing newline.
side_tally = {'l': 0, 'r': 0}
for entry in lines:
    side = entry[-2:-1]
    if side in side_tally:
        side_tally[side] += 1
    else:
        # Entry does not end in a recognised camera side
        print("ERROR")
l, r = side_tally['l'], side_tally['r']

f"left: {l}, right: {r}"

'left: 19956, right: 19854'

#### Store the images and number of images per scene

In [4]:
# Store images per scene.
#   scenes_images: scene name -> list of split entries (file order preserved)
#   scenes_count:  scene name -> number of entries of that scene in the split
scenes_images = dict()
for line in lines:
    # setdefault creates the list the first time a scene is seen, so no
    # exception handling is needed; the previous bare `except:` would also
    # have hidden any unrelated error raised inside the loop.
    scenes_images.setdefault(line[11:37], []).append(line)
scenes_count = {scene: len(images) for scene, images in scenes_images.items()}

"The number of images used per scene. Some scenes have much more recordings than others:", scenes_count

('The number of images used per scene. Some scenes have much more recordings than others:',
 {'2011_09_26_drive_0022_sync': 1427,
  '2011_09_29_drive_0026_sync': 48,
  '2011_09_26_drive_0087_sync': 1296,
  '2011_09_30_drive_0028_sync': 9287,
  '2011_10_03_drive_0034_sync': 8413,
  '2011_10_03_drive_0042_sync': 1994,
  '2011_09_26_drive_0061_sync': 1268,
  '2011_09_26_drive_0091_sync': 594,
  '2011_09_30_drive_0033_sync': 2873,
  '2011_09_29_drive_0004_sync': 570,
  '2011_09_26_drive_0051_sync': 547,
  '2011_09_26_drive_0014_sync': 549,
  '2011_09_26_drive_0032_sync': 700,
  '2011_09_26_drive_0028_sync': 783,
  '2011_09_26_drive_0039_sync': 701,
  '2011_09_26_drive_0018_sync': 198,
  '2011_09_26_drive_0104_sync': 565,
  '2011_09_26_drive_0070_sync': 757,
  '2011_09_30_drive_0034_sync': 2009,
  '2011_09_26_drive_0001_sync': 189,
  '2011_09_30_drive_0020_sync': 1984,
  '2011_09_26_drive_0005_sync': 263,
  '2011_09_26_drive_0095_sync': 476,
  '2011_09_26_drive_0015_sync': 535,
  '2011_09_2

In [5]:
# Remove left or right image if both exist for a certain timestamp.
#
# The filtered lists are built as NEW objects instead of editing the lists in
# place, for two reasons:
#   * removing items from a list while iterating over it makes the iterator
#     skip elements, so some stereo pairs could survive;
#   * dict.copy() is shallow, so editing the "copied" lists also altered the
#     lists stored in scenes_images.
# Set lookups also replace the repeated linear `in` / `remove` scans.
#
#   scenes_images_s: scene name -> entries with at most one image per stereo pair
#   scenes_count_s:  scene name -> number of entries left after the filtering
scenes_images_s = dict()
scenes_count_s = dict()
for scene_name, images in scenes_images.items():
    # Compare entries without their line ending so a final line that lacks a
    # trailing newline is still matched with its partner.
    present = {im.rstrip('\n') for im in images}
    dropped = set()
    for im in images:
        key = im.rstrip('\n')
        if key in dropped:
            continue
        # The last character is the camera side; the stereo partner is the
        # same entry with the other side.
        partner = key[:-1] + ('l' if key[-1:] == 'r' else 'r')
        if partner in present and partner not in dropped:
            # Both views exist: discard one of the two at random
            dropped.add(random.choice([key, partner]))
    scenes_images_s[scene_name] = [
        im for im in images if im.rstrip('\n') not in dropped
    ]
    scenes_count_s[scene_name] = len(scenes_images_s[scene_name])


In [6]:
# Select 1/4 of all the images in every scene.
# The quota per scene is derived from the ORIGINAL per-scene count
# (scenes_count), so the scene distribution of the full split is kept, while
# the images themselves are drawn from the stereo-filtered lists.
total_selected = 0
new_scenes_images, new_scenes_count = dict(), dict()
for scene in scenes:
    candidates = scenes_images_s[scene]
    # Sampling without replacement cannot return more items than are available
    select_no = min(round(scenes_count[scene] / 4), len(candidates))
    # random.sample draws WITHOUT replacement. random.choices (used before)
    # draws with replacement and could put the same image into the reduced
    # split several times.
    new_scenes_images[scene] = random.sample(candidates, k=select_no)
    new_scenes_count[scene] = select_no
    total_selected += select_no

f"Total images in new training data set: {total_selected}", \
"Number of training images per scene in new dataset", \
new_scenes_count

('Total images in new training data set: 9950',
 'Number of training images per scene in new dataset',
 {'2011_09_26_drive_0022_sync': 357,
  '2011_09_29_drive_0026_sync': 12,
  '2011_09_26_drive_0087_sync': 324,
  '2011_09_30_drive_0028_sync': 2322,
  '2011_10_03_drive_0034_sync': 2103,
  '2011_10_03_drive_0042_sync': 498,
  '2011_09_26_drive_0061_sync': 317,
  '2011_09_26_drive_0091_sync': 148,
  '2011_09_30_drive_0033_sync': 718,
  '2011_09_29_drive_0004_sync': 142,
  '2011_09_26_drive_0051_sync': 137,
  '2011_09_26_drive_0014_sync': 137,
  '2011_09_26_drive_0032_sync': 175,
  '2011_09_26_drive_0028_sync': 196,
  '2011_09_26_drive_0039_sync': 175,
  '2011_09_26_drive_0018_sync': 50,
  '2011_09_26_drive_0104_sync': 141,
  '2011_09_26_drive_0070_sync': 189,
  '2011_09_30_drive_0034_sync': 502,
  '2011_09_26_drive_0001_sync': 47,
  '2011_09_30_drive_0020_sync': 496,
  '2011_09_26_drive_0005_sync': 66,
  '2011_09_26_drive_0095_sync': 119,
  '2011_09_26_drive_0015_sync': 134,
  '2011_09_

#### Create a new txt file with the shuffled list

In [7]:
# Merge the per-scene selections into a single list and shuffle it, so the
# entries are no longer grouped by scene
new_training_lines = list(itertools.chain(*new_scenes_images.values()))
random.shuffle(new_training_lines)

# Every entry still carries its own line ending, so it is written unchanged
with open('splits/eigen_reduced/training_files.txt', 'w') as out_file:
    for entry in new_training_lines:
        out_file.write(entry)

## Do the same for the validation set

In [8]:
# Import the txt file with the image names in the validation set
with open('splits/eigen_zhou/val_files.txt') as f:
    lines = f.readlines()

# Store images per scene. Characters 11-36 of every entry hold the scene name.
#   scenes_images: scene name -> list of split entries (file order preserved)
#   scenes_count:  scene name -> number of entries of that scene in the split
# setdefault replaces the former try / bare `except:` construct, which would
# have hidden any unrelated error raised inside the loop.
scenes_images = dict()
for line in lines:
    scenes_images.setdefault(line[11:37], []).append(line)
scenes_count = {scene: len(images) for scene, images in scenes_images.items()}

# Obtain the scenes, in the order in which they first appear in the split
scenes = list(scenes_images)

# Remove left or right image if both exist for a certain timestamp.
# New lists are built instead of editing in place: removing items from a list
# while iterating over it skips elements, and dict.copy() is shallow, so the
# previous approach also modified the lists stored in scenes_images.
scenes_images_s = dict()
scenes_count_s = dict()
for scene_name, images in scenes_images.items():
    # Compare entries without their line ending so a final line that lacks a
    # trailing newline is still matched with its partner.
    present = {im.rstrip('\n') for im in images}
    dropped = set()
    for im in images:
        key = im.rstrip('\n')
        if key in dropped:
            continue
        # The last character is the camera side; the stereo partner is the
        # same entry with the other side.
        partner = key[:-1] + ('l' if key[-1:] == 'r' else 'r')
        if partner in present and partner not in dropped:
            # Both views exist: discard one of the two at random
            dropped.add(random.choice([key, partner]))
    scenes_images_s[scene_name] = [
        im for im in images if im.rstrip('\n') not in dropped
    ]
    scenes_count_s[scene_name] = len(scenes_images_s[scene_name])

# Select 1/4 of all the images in every scene. The quota is based on the
# original per-scene count so the scene distribution of the full split is kept.
new_scenes_images, new_scenes_count = dict(), dict()
for scene in scenes:
    candidates = scenes_images_s[scene]
    # Sampling without replacement cannot return more items than are available
    select_no = min(round(scenes_count[scene] / 4), len(candidates))
    # random.sample draws WITHOUT replacement; random.choices (used before)
    # could put the same image into the reduced split several times.
    new_scenes_images[scene] = random.sample(candidates, k=select_no)
    new_scenes_count[scene] = select_no

# Merge the per-scene selections and shuffle them so they are not grouped by scene
new_validation_lines = list(itertools.chain.from_iterable(new_scenes_images.values()))
random.shuffle(new_validation_lines)

# Every entry still carries its own line ending, so it is written unchanged
with open('splits/eigen_reduced/val_files.txt', 'w') as f:
    for line in new_validation_lines:
        f.write(line)