# Load, Explore, and Clean Data

Prior to exploring this notebook, please download the image data and drivable area labeled data in the website: https://bdd-data.berkeley.edu/. Documentation about the data is located here: https://doc.bdd100k.com/

In [7]:
# Required: Import packages
import json
import pandas as pd
from PIL import Image
import numpy as np


In [4]:
# REQUIRED: Load labels json file containing image object labels

with open('../Final_Project/bdd100k_drivable_labels_trainval/labels/bdd100k_labels_images_train.json') as json_file:
    data = json.load(json_file)


In [8]:
# Show the structure of the data file 

data[:1]

[{'name': '0000f77c-6257be58.jpg',
  'attributes': {'weather': 'clear',
   'scene': 'city street',
   'timeofday': 'daytime'},
  'timestamp': 10000,
  'labels': [{'category': 'traffic light',
    'attributes': {'occluded': False,
     'truncated': False,
     'trafficLightColor': 'green'},
    'manualShape': True,
    'manualAttributes': True,
    'box2d': {'x1': 1125.902264,
     'y1': 133.184488,
     'x2': 1156.978645,
     'y2': 210.875445},
    'id': 0},
   {'category': 'traffic light',
    'attributes': {'occluded': False,
     'truncated': False,
     'trafficLightColor': 'green'},
    'manualShape': True,
    'manualAttributes': True,
    'box2d': {'x1': 1156.978645,
     'y1': 136.637417,
     'x2': 1191.50796,
     'y2': 210.875443},
    'id': 1},
   {'category': 'traffic sign',
    'attributes': {'occluded': False,
     'truncated': False,
     'trafficLightColor': 'none'},
    'manualShape': True,
    'manualAttributes': True,
    'box2d': {'x1': 1101.731743,
     'y1': 211

In [9]:
# REQUIRED: Normalize semi-structured JSON data into a pd dataframe
data_normalized = pd.json_normalize(data)
type(data_normalized)

pandas.core.frame.DataFrame

In [10]:
# Optional - Testing only -- check number of images 
data_normalized.shape

(69863, 6)

In [11]:
# Optional - Testing only -- check format and contents of df
data_normalized.head()

Unnamed: 0,name,timestamp,labels,attributes.weather,attributes.scene,attributes.timeofday
0,0000f77c-6257be58.jpg,10000,"[{'category': 'traffic light', 'attributes': {...",clear,city street,daytime
1,0000f77c-62c2a288.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,highway,dawn/dusk
2,0000f77c-cb820c98.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,residential,dawn/dusk
3,0001542f-5ce3cf52.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,city street,night
4,0001542f-7c670be8.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,night


In [12]:
# Optional - Testing only -- check the df values corresponding to a specific image file name
data_normalized[data_normalized['name'] == '0a0a0b1a-7c39d841.jpg']


Unnamed: 0,name,timestamp,labels,attributes.weather,attributes.scene,attributes.timeofday
3915,0a0a0b1a-7c39d841.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime


In [13]:
# Optional - Testing only -- check what format the "labels" contains
type(data_normalized[data_normalized['name'] == '0a0a0b1a-7c39d841.jpg'].labels)
data_normalized.iloc[3915].labels

[{'category': 'car',
  'attributes': {'occluded': True,
   'truncated': False,
   'trafficLightColor': 'none'},
  'manualShape': True,
  'manualAttributes': True,
  'box2d': {'x1': 555.647397,
   'y1': 304.228432,
   'x2': 574.015906,
   'y2': 316.474104},
  'id': 109344},
 {'category': 'car',
  'attributes': {'occluded': True,
   'truncated': False,
   'trafficLightColor': 'none'},
  'manualShape': True,
  'manualAttributes': True,
  'box2d': {'x1': 554.116689,
   'y1': 318.004813,
   'x2': 567.89307,
   'y2': 328.719775},
  'id': 109345},
 {'category': 'car',
  'attributes': {'occluded': True,
   'truncated': False,
   'trafficLightColor': 'none'},
  'manualShape': True,
  'manualAttributes': True,
  'box2d': {'x1': 523.502507,
   'y1': 311.881977,
   'x2': 544.932435,
   'y2': 327.189068},
  'id': 109346},
 {'category': 'car',
  'attributes': {'occluded': True,
   'truncated': False,
   'trafficLightColor': 'none'},
  'manualShape': True,
  'manualAttributes': True,
  'box2d': {'x1'

In [14]:
# Optional - Check size of test and validation data for conditions clear, highway, and daytime.
data_shape = data_normalized[(data_normalized['attributes.weather'] == "clear") & 
               (data_normalized['attributes.scene'] == "highway") & 
               (data_normalized['attributes.timeofday'] == "daytime")].shape

In [15]:
# Optional - Print out number of images in train and test datasets

print("We get a total of %d images taken during clear weather, highway, and daytime." %data_shape[0])

We get a total of 3575 images taken during clear weather, highway, and daytime.


# Filter Data for Sunny, Highway, and Daytime Conditions

In [16]:
# REQUIRED: Function used to clean data, extract labels, and extract data for sunny, 
# highway, and daytime conditions

def clean_data(data_normalized):
    # Initialize lists of object types. Each list will contain 69863 values, each of which corresponds to an image specified as a row in data_cleaned 
    num_objects = []
    num_road = []
    num_sidewalk = []
    num_building = []
    num_wall = []
    num_fence = []
    num_pole = []
    num_traffic_light = []
    num_traffic_sign = []
    num_vegetation = []
    num_terrain = []
    num_sky = []
    num_person = []
    num_rider = []
    num_car = []
    num_truck = []
    num_bus = []
    num_train = []
    num_motorcycle = []
    num_bicycle = []
    num_lanes = []
    num_drivable_area = []

    # Iterate through each image in data_normalized, sum up the number of objects corresponding to each object type, and append to
    # the corresponding object list
    for index, row in data_normalized.iterrows():
        num_objects.append(len(row['labels']))
        num_road.append(sum(x['category'] == 'road' for x in row['labels']))
        num_sidewalk.append(sum(x['category'] == 'sidewalk' for x in row['labels']))
        num_building.append(sum(x['category'] == 'building' for x in row['labels']))
        num_wall.append(sum(x['category'] == 'wall' for x in row['labels']))
        num_fence.append(sum(x['category'] == 'fence' for x in row['labels']))
        num_pole.append(sum(x['category'] == 'pole' for x in row['labels']))
        num_traffic_light.append(sum(x['category'] == 'traffic light' for x in row['labels']))
        num_traffic_sign.append(sum(x['category'] == 'traffic sign' for x in row['labels']))
        num_vegetation.append(sum(x['category'] == 'vegetation' for x in row['labels']))
        num_terrain.append(sum(x['category'] == 'terrain' for x in row['labels']))
        num_sky.append(sum(x['category'] == 'sky' for x in row['labels']))
        num_person.append(sum(x['category'] == 'person' for x in row['labels']))
        num_rider.append(sum(x['category'] == 'rider' for x in row['labels']))
        num_car.append(sum(x['category'] == 'car' for x in row['labels']))
        num_truck.append(sum(x['category'] == 'truck' for x in row['labels']))
        num_bus.append(sum(x['category'] == 'bus' for x in row['labels']))
        num_train.append(sum(x['category'] == 'train' for x in row['labels']))
        num_motorcycle.append(sum(x['category'] == 'motorcycle' for x in row['labels']))
        num_bicycle.append(sum(x['category'] == 'bicycle' for x in row['labels']))
        num_lanes.append(sum(x['category'] == 'lane' for x in row['labels']))
        num_drivable_area.append(sum(x['category'] == 'drivable area' for x in row['labels']))

    data_normalized['num_objects'] = num_objects
    data_normalized['num_sidewalk'] = num_sidewalk
    data_normalized['num_building'] = num_building
    data_normalized['num_wall'] = num_wall
    data_normalized['num_fence'] = num_fence
    data_normalized['num_pole'] = num_pole
    data_normalized['num_traffic_light'] = num_traffic_light
    data_normalized['num_traffic_sign'] = num_traffic_sign
    data_normalized['num_vegetation'] = num_vegetation
    data_normalized['num_terrain'] = num_terrain
    data_normalized['num_sky'] = num_sky
    data_normalized['num_person'] = num_person
    data_normalized['num_rider'] = num_rider
    data_normalized['num_car'] = num_car
    data_normalized['num_truck'] = num_truck
    data_normalized['num_bus'] = num_bus
    data_normalized['num_train'] = num_train
    data_normalized['num_motorcycle'] = num_motorcycle
    data_normalized['num_bicycle'] = num_bicycle
    data_normalized['num_lanes'] = num_lanes
    data_normalized['num_drivable_area'] = num_drivable_area

    # Filter only for images where weather == "clear", scene == "highway", and timeofday == "daytime". 
    
    data_cleaned = data_normalized[(data_normalized['attributes.weather'] == "clear") & 
                   (data_normalized['attributes.scene'] == "highway") & 
                   (data_normalized['attributes.timeofday'] == "daytime")]
    return data_cleaned

In [17]:
# Clean Training & Validation Data
data_cleaned = clean_data(data_normalized)
data_cleaned

Unnamed: 0,name,timestamp,labels,attributes.weather,attributes.scene,attributes.timeofday,num_objects,num_sidewalk,num_building,num_wall,...,num_person,num_rider,num_car,num_truck,num_bus,num_train,num_motorcycle,num_bicycle,num_lanes,num_drivable_area
10,00067cfb-caba8a02.jpg,10000,"[{'category': 'traffic light', 'attributes': {...",clear,highway,daytime,29,0,0,0,...,0,0,3,0,1,0,0,0,11,3
62,002d290d-89f4e5c0.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime,23,0,0,0,...,0,0,6,0,0,1,0,0,10,2
88,003baca5-aab2e274.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,highway,daytime,6,0,0,0,...,0,0,2,0,0,0,0,0,1,1
89,003baca5-ad660439.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime,15,0,0,0,...,0,0,10,1,0,0,0,0,2,1
92,003e23ee-07d32feb.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,highway,daytime,28,0,0,0,...,0,0,6,1,1,0,0,0,8,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69675,b18cb922-e3af77af.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,highway,daytime,18,0,0,0,...,0,0,5,0,0,0,0,0,9,2
69805,fde2db45-f6e2fbd1.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime,11,0,0,0,...,0,0,4,0,0,0,0,0,6,1
69809,fde816b0-1b0f1a85.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,highway,daytime,29,0,0,0,...,0,0,8,0,0,0,0,0,9,2
69812,fdebe7ab-8409a734.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime,40,0,0,0,...,0,0,12,0,0,0,0,0,8,2


In [19]:
# Split training dataset to training and validation

training_data_split = data_cleaned.sample(frac = 0.9)
val_data_split = data_cleaned.drop(training_data_split.index)
val_data_split

Unnamed: 0,name,timestamp,labels,attributes.weather,attributes.scene,attributes.timeofday,num_objects,num_sidewalk,num_building,num_wall,...,num_person,num_rider,num_car,num_truck,num_bus,num_train,num_motorcycle,num_bicycle,num_lanes,num_drivable_area
10,00067cfb-caba8a02.jpg,10000,"[{'category': 'traffic light', 'attributes': {...",clear,highway,daytime,29,0,0,0,...,0,0,3,0,1,0,0,0,11,3
62,002d290d-89f4e5c0.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime,23,0,0,0,...,0,0,6,0,0,1,0,0,10,2
105,004855fc-ff3946ad.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,highway,daytime,18,0,0,0,...,3,0,8,0,0,0,0,0,4,2
159,006fdb67-f4820206.jpg,10000,"[{'category': 'bus', 'attributes': {'occluded'...",clear,highway,daytime,40,0,0,0,...,5,0,9,0,1,0,0,0,6,2
203,0081e27b-17bf4a9e.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime,31,0,0,0,...,0,0,19,1,0,0,0,0,6,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68985,afe7c43b-998729a8.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime,45,0,0,0,...,0,0,20,0,1,0,0,0,12,4
69080,b01e39e0-d115b4da.jpg,10000,"[{'category': 'traffic light', 'attributes': {...",clear,highway,daytime,45,0,0,0,...,0,0,8,1,0,0,0,0,16,2
69224,b0729f3b-63288f86.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,highway,daytime,34,0,0,0,...,0,0,15,0,0,0,0,0,11,3
69330,b0b1349e-28a20d14.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime,29,0,0,0,...,0,0,9,0,0,0,0,0,12,3


In [20]:
training_data_split

Unnamed: 0,name,timestamp,labels,attributes.weather,attributes.scene,attributes.timeofday,num_objects,num_sidewalk,num_building,num_wall,...,num_person,num_rider,num_car,num_truck,num_bus,num_train,num_motorcycle,num_bicycle,num_lanes,num_drivable_area
30133,4ceece28-50d76200.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime,23,0,0,0,...,1,0,7,1,0,0,0,0,6,2
58067,94d4c571-a8d68872.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime,38,0,0,0,...,0,0,17,1,0,0,0,0,10,4
68454,aea0ec7b-8cf142f1.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,highway,daytime,19,0,0,0,...,0,0,4,0,0,0,0,0,8,3
30175,4d0e6a24-22fd4c94.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime,35,0,0,0,...,0,0,18,1,0,0,0,0,6,3
35417,59ec3a2d-b6cb8c71.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,highway,daytime,20,0,0,0,...,5,0,4,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27809,47072934-a510d82e.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,highway,daytime,30,0,0,0,...,0,0,10,0,2,0,0,0,10,4
61874,9e9bbeed-1eaefaa6.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,highway,daytime,30,0,0,0,...,0,0,12,2,0,0,0,0,12,3
41197,6956a22e-258b6915.jpg,10000,"[{'category': 'traffic sign', 'attributes': {'...",clear,highway,daytime,38,0,0,0,...,0,0,14,0,0,0,0,0,11,2
45372,73fd9d6f-ba1ccb6d.jpg,10000,"[{'category': 'car', 'attributes': {'occluded'...",clear,highway,daytime,29,0,0,0,...,3,0,17,0,0,0,0,0,4,1


In [None]:
# REQUIRED - ONE TIME ONLY - ONLY PERFORM THIS STEP ONCE
# Move training and validation files to a separate folder. 

import os
import shutil

source = '../Final_Project/bdd100k_images_100k/images/100k/train/'
destination = '../Final_Project/bdd100k_images_100k/images/100k/train_clean/'

for f in training_data_split['name']:
    origin = source + f
    dest = destination + f
    shutil.move(origin, dest)


In [12]:
# REQUIRED - ONE TIME ONLY - ONLY PERFORM THIS STEP ONCE
# Move cleaned validation dataset to new folder

source = '../Final_Project/bdd100k_images_100k/images/100k/train/'
destination = '../Final_Project/bdd100k_images_100k/images/100k/val_clean/'

for f in val_data_split['name']:
    origin = source + f
    dest = destination + f
    shutil.move(origin, dest)

In [26]:
# REQUIRED - ONE TIME ONLY - ONLY PERFORM THIS STEP ONCE
# Move masks training files to a separate folder. 

source = '../Final_Project/bdd100k_sem_masks_labels_release/labels/drivable/masks/train/'
destination = '../Final_Project/bdd100k_sem_masks_labels_release/labels/drivable/masks/train_clean/'

for f in training_data_split['name']:
    origin = source + f[:-3] + 'png'
    dest = destination + f[:-3] + 'png'
    shutil.move(origin, dest)

In [29]:
# REQUIRED - ONE TIME ONLY - ONLY PERFORM THIS STEP ONCE
# Move masks validation files to a separate folder. 

source = '../Final_Project/bdd100k_sem_masks_labels_release/labels/drivable/masks/train/'
destination = '../Final_Project/bdd100k_sem_masks_labels_release/labels/drivable/masks/val_clean/'

for f in val_data_split['name']:
    origin = source + f[:-3] + 'png'
    dest = destination + f[:-3] + 'png'
    shutil.move(origin, dest)