In [272]:
# AAI-540-Group4 Project
## EDA, DataProcessing, Feature Engineering Experiments
### Geoffrey Fadera

In [1]:
import numpy as np
import pandas as pd
from IPython.display import clear_output
import time
import json

## (1) load relevant json files

In [2]:
# load list of annotated (unique) bounding boxes in the entire dataset
with open('./caltech_bboxes_20200316.json', 'r') as file:  # Replace with your file name if different
    bbox_meta = json.load(file)

# load list suggested datasplit
with open('./CaltechCameraTrapsSplits_v0.json', 'r') as file:  # Replace with your file name if different
    data_split_meta = json.load(file)

In [3]:
bboxes_images_df = pd.DataFrame(bbox_meta['images'])
bboxes_annotations_df = pd.DataFrame(bbox_meta['annotations'])
bboxes_categories_df = pd.DataFrame(bbox_meta['categories'])


In [5]:
print("From BBOX Annotations File:")
print(f"\tnum unique images: {len(bboxes_images_df)}")
print(f"\tnum unique bbox annotations: {len(bboxes_annotations_df)}")
print(f"\tnum unique categories: {len(bboxes_categories_df)}")


From BBOX Annotations File:
	num unique images: 63025
	num unique bbox annotations: 65112
	num unique categories: 22


## (2) Create Feature Groups that can be derived from these two json files


In [19]:
bboxes_images_df.head()

Unnamed: 0,file_name,seq_id,height,id,frame_num,width,date_captured,rights_holder,seq_num_frames,location
0,5998cfa4-23d2-11e8-a6a3-ec086b02610b.jpg,6f084ccc-5567-11e8-bc84-dca9047ef277,1494,5998cfa4-23d2-11e8-a6a3-ec086b02610b,1,2048,2011-05-13 23:43:18,Justin Brown,3,33
1,588a679f-23d2-11e8-a6a3-ec086b02610b.jpg,6f12067d-5567-11e8-b3c0-dca9047ef277,1494,588a679f-23d2-11e8-a6a3-ec086b02610b,2,2048,2012-03-17 03:48:44,Justin Brown,3,115
2,593d68d7-23d2-11e8-a6a3-ec086b02610b.jpg,6f0f6778-5567-11e8-b5d2-dca9047ef277,1494,593d68d7-23d2-11e8-a6a3-ec086b02610b,3,2048,2011-06-28 15:29:42,Justin Brown,3,90
3,59fae563-23d2-11e8-a6a3-ec086b02610b.jpg,6f181999-5567-11e8-a472-dca9047ef277,1494,59fae563-23d2-11e8-a6a3-ec086b02610b,3,2048,2012-01-05 07:41:39,Justin Brown,3,46
4,595f7b9c-23d2-11e8-a6a3-ec086b02610b.jpg,701cb328-5567-11e8-8d4a-dca9047ef277,1494,595f7b9c-23d2-11e8-a6a3-ec086b02610b,2,2048,2014-09-28 01:00:53,Justin Brown,3,67


In [20]:
display(len(np.unique(bboxes_images_df['file_name'])))
display(len(np.unique(bboxes_images_df['id'])))

63025

63025

In [21]:
bboxes_annotations_df.head(3)

Unnamed: 0,image_id,id,bbox,category_id
0,5998cfa4-23d2-11e8-a6a3-ec086b02610b,2a545520-cbf1-11e8-819c-970a9450cdbc,"[499.2, 711.68, 353.28000000000003, 199.679999...",1
1,588a679f-23d2-11e8-a6a3-ec086b02610b,2a54562e-cbf1-11e8-819c-970a9450cdbc,"[985.6, 783.36, 368.64, 230.39999999999998]",1
2,593d68d7-23d2-11e8-a6a3-ec086b02610b,2a54599e-cbf1-11e8-819c-970a9450cdbc,"[1779.2, 407.04, 61.440000000000055, 84.480000...",5


In [22]:
display(len(np.unique(bboxes_annotations_df['image_id'])))
display(len(np.unique(bboxes_annotations_df['id'])))

61945

65112

In [7]:
# 2.1 key (image_id) => LOCATION from bbox_meta['images']
image_id_to_location_df = bboxes_images_df[['id', 'location']].copy()
image_id_to_location_df.set_index('id', inplace = True)
image_id_to_location_df.head()


Unnamed: 0_level_0,location
id,Unnamed: 1_level_1
5998cfa4-23d2-11e8-a6a3-ec086b02610b,33
588a679f-23d2-11e8-a6a3-ec086b02610b,115
593d68d7-23d2-11e8-a6a3-ec086b02610b,90
59fae563-23d2-11e8-a6a3-ec086b02610b,46
595f7b9c-23d2-11e8-a6a3-ec086b02610b,67


In [8]:
image_id_to_location_df.loc['59fae563-23d2-11e8-a6a3-ec086b02610b']

location    46
Name: 59fae563-23d2-11e8-a6a3-ec086b02610b, dtype: object

In [10]:
# 2.2 key (location) => train or val from the suggested datasplit json file
train_locations = data_split_meta['splits']['train']
val_locations = data_split_meta['splits']['val']

loc_to_splitType_df = pd.DataFrame({
    'location': np.unique(train_locations + val_locations),
    
})
loc_to_splitType_df['splitType'] = loc_to_splitType_df['location'].apply(lambda x: 'train' if x in train_locations else 'val')   
loc_to_splitType_df.set_index('location', inplace = True)
loc_to_splitType_df.head()


Unnamed: 0_level_0,splitType
location,Unnamed: 1_level_1
0,val
1,val
10,train
100,val
101,val


In [11]:
# 2.3 use bbox annotation json as base and add location and train/val split type for each annotation entry
new_dataset_df = bboxes_annotations_df.copy()
new_dataset_df['location'] = new_dataset_df['image_id'].map(image_id_to_location_df['location'])
new_dataset_df['splitType'] = new_dataset_df['location'].map(loc_to_splitType_df['splitType'])
new_dataset_df.head()

Unnamed: 0,image_id,id,bbox,category_id,location,splitType
0,5998cfa4-23d2-11e8-a6a3-ec086b02610b,2a545520-cbf1-11e8-819c-970a9450cdbc,"[499.2, 711.68, 353.28000000000003, 199.679999...",1,33,train
1,588a679f-23d2-11e8-a6a3-ec086b02610b,2a54562e-cbf1-11e8-819c-970a9450cdbc,"[985.6, 783.36, 368.64, 230.39999999999998]",1,115,train
2,593d68d7-23d2-11e8-a6a3-ec086b02610b,2a54599e-cbf1-11e8-819c-970a9450cdbc,"[1779.2, 407.04, 61.440000000000055, 84.480000...",5,90,train
3,59fae563-23d2-11e8-a6a3-ec086b02610b,2a545a84-cbf1-11e8-819c-970a9450cdbc,"[1456.0, 480.0, 156.79999999999995, 137.600000...",10,46,val
4,595f7b9c-23d2-11e8-a6a3-ec086b02610b,2a545dfe-cbf1-11e8-819c-970a9450cdbc,"[249.35160827635713, 798.4633789062436, 291.60...",99,67,val


In [12]:
# 2.4 category id to category name
cat_id_to_name_df = bboxes_categories_df[['id', 'name']].copy().sort_values(by='id')
cat_id_to_name_df.set_index('id', inplace = True)

cat_id_to_name_df

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
1,opossum
3,raccoon
5,squirrel
6,bobcat
7,skunk
8,dog
9,coyote
10,rabbit
11,bird
14,lizard


In [None]:
np.unique(bboxes_categories_df['name'])

In [37]:
# 3. show summary distribution per category 
new_ds_summary = []
new_ds_summary = pd.DataFrame({
    'category_id': cat_id_to_name_df.index.tolist(),
    })
new_ds_summary['category_label'] = new_ds_summary['category_id'].map(cat_id_to_name_df['name'])

new_ds_summary['num_train'] = new_ds_summary['category_id'].apply( 
    lambda x: len(new_dataset_df[(new_dataset_df['category_id'] == x) & (new_dataset_df['splitType'] == 'train')]))

new_ds_summary['num_val'] = new_ds_summary['category_id'].apply( 
    lambda x: len(new_dataset_df[(new_dataset_df['category_id'] == x) & (new_dataset_df['splitType'] == 'val')]))

new_ds_summary['total'] = new_ds_summary['num_train'] + new_ds_summary['num_val']



new_ds_summary

Unnamed: 0,category_id,category_label,num_train,num_val,total
0,1,opossum,7162,4972,12134
1,3,raccoon,3076,4832,7908
2,5,squirrel,1852,796,2648
3,6,bobcat,2204,2844,5048
4,7,skunk,729,469,1198
5,8,dog,1850,1010,2860
6,9,coyote,4212,2293,6505
7,10,rabbit,5272,753,6025
8,11,bird,2735,1291,4026
9,14,lizard,54,123,177


In [17]:
new_dataset_df.head()
display(len(np.unique(new_dataset_df['image_id'])))
display(len(np.unique(new_dataset_df['id'])))

61945

65112

In [23]:
bboxes_annotations_df.head(3)

Unnamed: 0,image_id,id,bbox,category_id
0,5998cfa4-23d2-11e8-a6a3-ec086b02610b,2a545520-cbf1-11e8-819c-970a9450cdbc,"[499.2, 711.68, 353.28000000000003, 199.679999...",1
1,588a679f-23d2-11e8-a6a3-ec086b02610b,2a54562e-cbf1-11e8-819c-970a9450cdbc,"[985.6, 783.36, 368.64, 230.39999999999998]",1
2,593d68d7-23d2-11e8-a6a3-ec086b02610b,2a54599e-cbf1-11e8-819c-970a9450cdbc,"[1779.2, 407.04, 61.440000000000055, 84.480000...",5


In [24]:
bboxes_annotations_df['image_id'] .value_counts().head(10)

598acda7-23d2-11e8-a6a3-ec086b02610b    9
59f79880-23d2-11e8-a6a3-ec086b02610b    9
5a1e530f-23d2-11e8-a6a3-ec086b02610b    9
598f7588-23d2-11e8-a6a3-ec086b02610b    9
59e10475-23d2-11e8-a6a3-ec086b02610b    9
59b7a98d-23d2-11e8-a6a3-ec086b02610b    9
59849691-23d2-11e8-a6a3-ec086b02610b    9
5992911e-23d2-11e8-a6a3-ec086b02610b    9
599a596e-23d2-11e8-a6a3-ec086b02610b    9
59a30c5b-23d2-11e8-a6a3-ec086b02610b    9
Name: image_id, dtype: int64

In [26]:
bboxes_annotations_df[bboxes_annotations_df['image_id']=='598acda7-23d2-11e8-a6a3-ec086b02610b']

Unnamed: 0,image_id,id,bbox,category_id
52748,598acda7-23d2-11e8-a6a3-ec086b02610b,2c2f87d4-cbf1-11e8-819c-970a9450cdbc,"[1482.24, 1139.2, 47.36, 35.84]",11
52749,598acda7-23d2-11e8-a6a3-ec086b02610b,2c2f88a6-cbf1-11e8-819c-970a9450cdbc,"[1459.2, 1160.96, 53.76, 49.92000000000016]",11
52750,598acda7-23d2-11e8-a6a3-ec086b02610b,2c2f898c-cbf1-11e8-819c-970a9450cdbc,"[1781.76, 1111.04, 92.16, 140.80000000000115]",11
52751,598acda7-23d2-11e8-a6a3-ec086b02610b,2c2f8a5e-cbf1-11e8-819c-970a9450cdbc,"[1587.2, 1208.32, 64.0, 46.079999999999835]",11
52752,598acda7-23d2-11e8-a6a3-ec086b02610b,2c2f8b26-cbf1-11e8-819c-970a9450cdbc,"[1487.36, 1104.6399999999999, 53.7600000000001...",11
52753,598acda7-23d2-11e8-a6a3-ec086b02610b,2c2f8bf8-cbf1-11e8-819c-970a9450cdbc,"[1303.04, 1108.48, 53.76, 53.760000000000005]",11
52754,598acda7-23d2-11e8-a6a3-ec086b02610b,2c2f8cc0-cbf1-11e8-819c-970a9450cdbc,"[980.48, 1128.96, 58.880000000000166, 53.76000...",11
52755,598acda7-23d2-11e8-a6a3-ec086b02610b,2c2f8d88-cbf1-11e8-819c-970a9450cdbc,"[1220.1654464285714, 1339.9595926339287, 48.49...",11
52756,598acda7-23d2-11e8-a6a3-ec086b02610b,2c2f8e5a-cbf1-11e8-819c-970a9450cdbc,"[1189.468125, 1324.896556919643, 47.3204910714...",11


In [28]:
np.unique(bboxes_categories_df['name'])

array(['badger', 'bat', 'bird', 'bobcat', 'car', 'cat', 'cow', 'coyote',
       'deer', 'dog', 'empty', 'fox', 'insect', 'lizard', 'mountain_lion',
       'opossum', 'pig', 'rabbit', 'raccoon', 'rodent', 'skunk',
       'squirrel'], dtype=object)

In [29]:
new_dataset_df[new_dataset_df['image_id'] == '5998cfa4-23d2-11e8-a6a3-ec086b02610b']

Unnamed: 0,image_id,id,bbox,category_id,location,splitType
0,5998cfa4-23d2-11e8-a6a3-ec086b02610b,2a545520-cbf1-11e8-819c-970a9450cdbc,"[499.2, 711.68, 353.28000000000003, 199.679999...",1,33,train


In [32]:
bboxes_images_df.head(2)

Unnamed: 0,file_name,seq_id,height,id,frame_num,width,date_captured,rights_holder,seq_num_frames,location
0,5998cfa4-23d2-11e8-a6a3-ec086b02610b.jpg,6f084ccc-5567-11e8-bc84-dca9047ef277,1494,5998cfa4-23d2-11e8-a6a3-ec086b02610b,1,2048,2011-05-13 23:43:18,Justin Brown,3,33
1,588a679f-23d2-11e8-a6a3-ec086b02610b.jpg,6f12067d-5567-11e8-b3c0-dca9047ef277,1494,588a679f-23d2-11e8-a6a3-ec086b02610b,2,2048,2012-03-17 03:48:44,Justin Brown,3,115


In [33]:
# load list of annotated (unique) bounding boxes in the entire dataset
with open('./cct_labels.json', 'r') as file:  # Replace with your file name if different
    tyler_json = json.load(file)

tyler_json_df = pd.DataFrame(tyler_json).T
tyler_json_df.head(3)

Unnamed: 0,label,category_id,bbox
5998cfa4-23d2-11e8-a6a3-ec086b02610b_0.jpg,opossum,1,"[499.2, 711.68, 353.28000000000003, 199.679999..."
588a679f-23d2-11e8-a6a3-ec086b02610b_0.jpg,opossum,1,"[985.6, 783.36, 368.64, 230.39999999999998]"
593d68d7-23d2-11e8-a6a3-ec086b02610b_0.jpg,squirrel,5,"[1779.2, 407.04, 61.440000000000055, 84.480000..."


In [34]:
# 3. show summary distribution per category 
tyler_ds_summary = []
tyler_ds_summary = pd.DataFrame({
    'category_id': cat_id_to_name_df.index.tolist(),
    })
tyler_ds_summary['category_label'] = tyler_ds_summary['category_id'].map(cat_id_to_name_df['name'])

tyler_ds_summary['total'] = tyler_ds_summary['category_id'].apply( 
    lambda x: len(tyler_json_df[(tyler_json_df['category_id'] == x)]))

tyler_ds_summary    

Unnamed: 0,category_id,category_label,total
0,1,opossum,12134
1,3,raccoon,7908
2,5,squirrel,2648
3,6,bobcat,5048
4,7,skunk,1198
5,8,dog,2860
6,9,coyote,6505
7,10,rabbit,6025
8,11,bird,4026
9,14,lizard,177


In [40]:
len(set(bboxes_images_df['id']))

63025

In [41]:
len(set(bboxes_annotations_df['image_id']))

61945

In [42]:
# Get unique values from each column as sets
image_ids_set1 = set(bboxes_images_df['id']) # from bboxes_images_df
image_ids_set2 = set(bboxes_annotations_df['image_id'])

In [43]:
# Find items not common in both columns (symmetric difference)
not_common = list(image_ids_set1.symmetric_difference(image_ids_set2))

print(not_common)  # Output: [1, 2, 5, 6]

['59dc25fc-23d2-11e8-a6a3-ec086b02610b', '5967370c-23d2-11e8-a6a3-ec086b02610b', '59bac8fe-23d2-11e8-a6a3-ec086b02610b', '597b48c4-23d2-11e8-a6a3-ec086b02610b', '5a27d7d1-23d2-11e8-a6a3-ec086b02610b', '59c99d6e-23d2-11e8-a6a3-ec086b02610b', '58d47f83-23d2-11e8-a6a3-ec086b02610b', '598f78d4-23d2-11e8-a6a3-ec086b02610b', '59e5e0e8-23d2-11e8-a6a3-ec086b02610b', '59cb38a2-23d2-11e8-a6a3-ec086b02610b', '593100d9-23d2-11e8-a6a3-ec086b02610b', '5a13098e-23d2-11e8-a6a3-ec086b02610b', '5883e83f-23d2-11e8-a6a3-ec086b02610b', '598c5d1e-23d2-11e8-a6a3-ec086b02610b', '591b23e3-23d2-11e8-a6a3-ec086b02610b', '5a1e5139-23d2-11e8-a6a3-ec086b02610b', '5a24a88b-23d2-11e8-a6a3-ec086b02610b', '5a24a994-23d2-11e8-a6a3-ec086b02610b', '59973e69-23d2-11e8-a6a3-ec086b02610b', '58adc3d1-23d2-11e8-a6a3-ec086b02610b', '59484577-23d2-11e8-a6a3-ec086b02610b', '59b615e9-23d2-11e8-a6a3-ec086b02610b', '58af7850-23d2-11e8-a6a3-ec086b02610b', '5983083e-23d2-11e8-a6a3-ec086b02610b', '5a0968e2-23d2-11e8-a6a3-ec086b02610b',

In [46]:
len(not_common)

1080

In [49]:
not_common[:5]  # Display first 10 items not common in both columns

['59dc25fc-23d2-11e8-a6a3-ec086b02610b',
 '5967370c-23d2-11e8-a6a3-ec086b02610b',
 '59bac8fe-23d2-11e8-a6a3-ec086b02610b',
 '597b48c4-23d2-11e8-a6a3-ec086b02610b',
 '5a27d7d1-23d2-11e8-a6a3-ec086b02610b']

In [44]:
bboxes_images_df[bboxes_images_df['id']==not_common[0]]

Unnamed: 0,file_name,seq_id,height,id,frame_num,width,date_captured,rights_holder,seq_num_frames,location
47472,59dc25fc-23d2-11e8-a6a3-ec086b02610b.jpg,6f02f46e-5567-11e8-8929-dca9047ef277,1494,59dc25fc-23d2-11e8-a6a3-ec086b02610b,2,2048,2012-03-02 19:18:27,Justin Brown,3,38


In [50]:
bboxes_images_df.head()

Unnamed: 0,file_name,seq_id,height,id,frame_num,width,date_captured,rights_holder,seq_num_frames,location
0,5998cfa4-23d2-11e8-a6a3-ec086b02610b.jpg,6f084ccc-5567-11e8-bc84-dca9047ef277,1494,5998cfa4-23d2-11e8-a6a3-ec086b02610b,1,2048,2011-05-13 23:43:18,Justin Brown,3,33
1,588a679f-23d2-11e8-a6a3-ec086b02610b.jpg,6f12067d-5567-11e8-b3c0-dca9047ef277,1494,588a679f-23d2-11e8-a6a3-ec086b02610b,2,2048,2012-03-17 03:48:44,Justin Brown,3,115
2,593d68d7-23d2-11e8-a6a3-ec086b02610b.jpg,6f0f6778-5567-11e8-b5d2-dca9047ef277,1494,593d68d7-23d2-11e8-a6a3-ec086b02610b,3,2048,2011-06-28 15:29:42,Justin Brown,3,90
3,59fae563-23d2-11e8-a6a3-ec086b02610b.jpg,6f181999-5567-11e8-a472-dca9047ef277,1494,59fae563-23d2-11e8-a6a3-ec086b02610b,3,2048,2012-01-05 07:41:39,Justin Brown,3,46
4,595f7b9c-23d2-11e8-a6a3-ec086b02610b.jpg,701cb328-5567-11e8-8d4a-dca9047ef277,1494,595f7b9c-23d2-11e8-a6a3-ec086b02610b,2,2048,2014-09-28 01:00:53,Justin Brown,3,67


In [54]:
bboxes_images_df.head()

Unnamed: 0,file_name,seq_id,height,id,frame_num,width,date_captured,rights_holder,seq_num_frames,location
0,5998cfa4-23d2-11e8-a6a3-ec086b02610b.jpg,6f084ccc-5567-11e8-bc84-dca9047ef277,1494,5998cfa4-23d2-11e8-a6a3-ec086b02610b,1,2048,2011-05-13 23:43:18,Justin Brown,3,33
1,588a679f-23d2-11e8-a6a3-ec086b02610b.jpg,6f12067d-5567-11e8-b3c0-dca9047ef277,1494,588a679f-23d2-11e8-a6a3-ec086b02610b,2,2048,2012-03-17 03:48:44,Justin Brown,3,115
2,593d68d7-23d2-11e8-a6a3-ec086b02610b.jpg,6f0f6778-5567-11e8-b5d2-dca9047ef277,1494,593d68d7-23d2-11e8-a6a3-ec086b02610b,3,2048,2011-06-28 15:29:42,Justin Brown,3,90
3,59fae563-23d2-11e8-a6a3-ec086b02610b.jpg,6f181999-5567-11e8-a472-dca9047ef277,1494,59fae563-23d2-11e8-a6a3-ec086b02610b,3,2048,2012-01-05 07:41:39,Justin Brown,3,46
4,595f7b9c-23d2-11e8-a6a3-ec086b02610b.jpg,701cb328-5567-11e8-8d4a-dca9047ef277,1494,595f7b9c-23d2-11e8-a6a3-ec086b02610b,2,2048,2014-09-28 01:00:53,Justin Brown,3,67


In [60]:
unbounded_images_df = bboxes_images_df[bboxes_images_df['id'].isin(not_common)]
unbounded_images_df.head()

Unnamed: 0,file_name,seq_id,height,id,frame_num,width,date_captured,rights_holder,seq_num_frames,location
9,59bac8fe-23d2-11e8-a6a3-ec086b02610b.jpg,701361b0-5567-11e8-959e-dca9047ef277,1494,59bac8fe-23d2-11e8-a6a3-ec086b02610b,1,2048,2013-02-09 10:47:54,Justin Brown,3,61
170,58c4d8d9-23d2-11e8-a6a3-ec086b02610b.jpg,6f16484a-5567-11e8-bdf4-dca9047ef277,1494,58c4d8d9-23d2-11e8-a6a3-ec086b02610b,3,2048,2012-04-23 13:23:14,Justin Brown,3,125
249,58d7a2b0-23d2-11e8-a6a3-ec086b02610b.jpg,6f13e53a-5567-11e8-9674-dca9047ef277,1494,58d7a2b0-23d2-11e8-a6a3-ec086b02610b,1,2048,2011-08-13 06:13:42,Justin Brown,3,88
253,589cb238-23d2-11e8-a6a3-ec086b02610b.jpg,6f14932e-5567-11e8-8d6e-dca9047ef277,1494,589cb238-23d2-11e8-a6a3-ec086b02610b,3,2048,2012-04-08 17:53:10,Justin Brown,3,88
264,59f128b5-23d2-11e8-a6a3-ec086b02610b.jpg,6efeea54-5567-11e8-b765-dca9047ef277,1494,59f128b5-23d2-11e8-a6a3-ec086b02610b,2,2048,2011-06-29 07:40:11,Justin Brown,3,38


In [57]:
len(unbounded_images_df)

1080

In [63]:
unbounded_images_df.head()

Unnamed: 0,file_name,seq_id,height,id,frame_num,width,date_captured,rights_holder,seq_num_frames,location
9,59bac8fe-23d2-11e8-a6a3-ec086b02610b.jpg,701361b0-5567-11e8-959e-dca9047ef277,1494,59bac8fe-23d2-11e8-a6a3-ec086b02610b,1,2048,2013-02-09 10:47:54,Justin Brown,3,61
170,58c4d8d9-23d2-11e8-a6a3-ec086b02610b.jpg,6f16484a-5567-11e8-bdf4-dca9047ef277,1494,58c4d8d9-23d2-11e8-a6a3-ec086b02610b,3,2048,2012-04-23 13:23:14,Justin Brown,3,125
249,58d7a2b0-23d2-11e8-a6a3-ec086b02610b.jpg,6f13e53a-5567-11e8-9674-dca9047ef277,1494,58d7a2b0-23d2-11e8-a6a3-ec086b02610b,1,2048,2011-08-13 06:13:42,Justin Brown,3,88
253,589cb238-23d2-11e8-a6a3-ec086b02610b.jpg,6f14932e-5567-11e8-8d6e-dca9047ef277,1494,589cb238-23d2-11e8-a6a3-ec086b02610b,3,2048,2012-04-08 17:53:10,Justin Brown,3,88
264,59f128b5-23d2-11e8-a6a3-ec086b02610b.jpg,6efeea54-5567-11e8-b765-dca9047ef277,1494,59f128b5-23d2-11e8-a6a3-ec086b02610b,2,2048,2011-06-29 07:40:11,Justin Brown,3,38


In [68]:
unbounded_images_df['location'].to_list().map(loc_to_splitType_df['splitType'])

AttributeError: 'list' object has no attribute 'map'

In [58]:
unbounded_images_df['location'].map(loc_to_splitType_df['splitType'])

9        train
170        val
249      train
253      train
264      train
         ...  
62428      val
62435    train
62506    train
62589    train
62808      val
Name: location, Length: 1080, dtype: object

In [53]:
[bboxes_images_df[bboxes_images_df['id']== image_id] for image_id in not_common][]

AttributeError: 'list' object has no attribute 'head'

In [72]:
x = '59bac8fe-23d2-11e8-a6a3-ec086b02610b'
unbounded_images_df[unbounded_images_df['id'] == x]['location'].values[0]

'61'

In [71]:

empty_bboxes_df['image_id'].apply(lambda x: unbounded_images_df[unbounded_images_df['id'] == x]['location'].values[0])

1080

In [75]:
# create a new df for 1080 missing images
# 2.3 use bbox annotation json as base and add location and train/val split type for each annotation entry
empty_bboxes_df = []
empty_bboxes_df = pd.DataFrame({
    'image_id': not_common,    
})
empty_bboxes_df['id'] = empty_bboxes_df['image_id']  # use image_id as id
empty_bboxes_df['location'] = empty_bboxes_df['image_id'].map(image_id_to_location_df['location'])
empty_bboxes_df['location_fromJSON'] = empty_bboxes_df['image_id'].apply(lambda x: unbounded_images_df[unbounded_images_df['id'] == x]['location'].values[0])

empty_bboxes_df['splitType'] = empty_bboxes_df['location'].map(loc_to_splitType_df['splitType'])


empty_bboxes_df.head()

Unnamed: 0,image_id,id,location,location_fromJSON,splitType
0,59dc25fc-23d2-11e8-a6a3-ec086b02610b,59dc25fc-23d2-11e8-a6a3-ec086b02610b,38,38,train
1,5967370c-23d2-11e8-a6a3-ec086b02610b,5967370c-23d2-11e8-a6a3-ec086b02610b,46,46,val
2,59bac8fe-23d2-11e8-a6a3-ec086b02610b,59bac8fe-23d2-11e8-a6a3-ec086b02610b,61,61,train
3,597b48c4-23d2-11e8-a6a3-ec086b02610b,597b48c4-23d2-11e8-a6a3-ec086b02610b,46,46,val
4,5a27d7d1-23d2-11e8-a6a3-ec086b02610b,5a27d7d1-23d2-11e8-a6a3-ec086b02610b,46,46,val


In [77]:
bboxes_annotations_df[bboxes_annotations_df['image_id'] == '59dc25fc-23d2-11e8-a6a3-ec086b02610b']

Unnamed: 0,image_id,id,bbox,category_id
