# Bees dataset

* Images from Tanya Latty
* Labels from Jakub Nabaglo


In [1]:
import json
import pandas as pd
import numpy as np

## Exploring the raw labels

In [2]:
# Read the raw label file. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it open until garbage collection.
with open('labels.json', 'r') as label_file:
    label_data = json.load(label_file)
# Display the free-text note stored alongside the labels.
label_data['comment']

'Origin of coordinates is in the bottom left. Parts of bee that are not visible are marked with null.'

In [3]:
# Show the top-level sections present in the label file.
label_data.keys()

dict_keys(['labelled', 'comment', 'unlabelled'])

In [4]:
# Report how many examples fall in each of the two sections of the label file.
num_labelled = len(label_data['labelled'])
num_unlabelled = len(label_data['unlabelled'])
print('%d labelled, %d unlabelled examples' % (num_labelled, num_unlabelled))

640 labelled, 0 unlabelled examples


In [5]:
# Keep only the labelled examples; the rest of the notebook works on this list.
label_list = label_data['labelled']
# Show which fields are stored for the first labelled example.
label_list[0].keys()

dict_keys(['head', 'left antenna', 'difficult', 'size', 'right wing', 'path', 'thorax', 'right antenna', 'abdomen', 'left wing'])

## Convert to pandas dataframe

In [6]:
# Label fields that hold a two-element value, and the suffixes of the two
# columns each field is expanded into.
tuples = ['size', 'head', 'thorax', 'abdomen', 'left antenna', 'right antenna', 'left wing', 'right wing']
coords = ['x', 'y']
# Flatten to 'size_x', 'size_y', 'head_x', ... keeping field-major order.
col_pairs = ['%s_%s' % (field, axis) for field in tuples for axis in coords]
# Full column layout of the table: metadata columns first, then coordinates.
col_names = ['path', 'species', 'is_difficult'] + col_pairs
num_labels = len(label_list)


In [7]:
# Build the labels table from one record (dict) per labelled example and
# construct the DataFrame once at the end.
#
# This replaces cell-by-cell chained assignment through `.ix`, which has been
# removed from pandas and never reliably wrote through to the parent frame.
# It also lets pandas infer real dtypes (bool / numeric) rather than leaving
# every column as `object`.
records = []
for lab in label_list:
    path = lab['path']
    record = {
        'path': path,
        # The species name is the first component of the image path.
        'species': path.split('/')[0],
        # NOTE(review): the flag is True when the record has NO 'difficult'
        # key; the value stored under that key is never read. Behaviour is
        # unchanged from the original -- confirm it matches how the labels
        # file encodes difficulty.
        'is_difficult': 'difficult' not in lab,
    }
    for key in tuples:
        point = lab[key]
        # The label file marks parts that are not visible with null (None);
        # store those as NaN in both coordinate columns.
        if point is not None:
            record['%s_x' % key] = point[0]
            record['%s_y' % key] = point[1]
        else:
            record['%s_x' % key] = np.nan
            record['%s_y' % key] = np.nan
    records.append(record)

# `columns=col_names` fixes the column order (and keeps the columns when
# label_list is empty); rows get a default 0..n-1 index as before.
labels = pd.DataFrame(records, columns=col_names)
labels.head(10)

Unnamed: 0,path,species,is_difficult,size_x,size_y,head_x,head_y,thorax_x,thorax_y,abdomen_x,abdomen_y,left antenna_x,left antenna_y,right antenna_x,right antenna_y,left wing_x,left wing_y,right wing_x,right wing_y
0,Afranthidium repetitum/Addison Rd Haphazard Ta...,Afranthidium repetitum,False,427,285,233.7278,134.3201,206.9032,146.0612,183.6006,167.7252,221.2229,96.08085,281.4559,118.2541,132.8173,134.8652,,
1,Afranthidium repetitum/Balgowlah box 2 cell b2...,Afranthidium repetitum,False,427,285,243.3961,142.5,223.7689,142.4641,187.1618,133.4845,266.3332,161.5821,270.9878,141.0083,,,162.1912,145.9534
2,Afranthidium repetitum/Balgowlah box 2 cell b2...,Afranthidium repetitum,False,427,285,241.7249,98.80314,211.7374,127.0117,170.3352,124.1034,251.3965,131.6664,236.0943,127.0117,187.276,173.6169,148.8899,102.4785
3,Afranthidium repetitum/Balgowlah box 2 cell c2...,Afranthidium repetitum,False,427,285,234.6417,148.3167,220.7953,157.6227,177.2845,148.3526,258.855,166.0539,266.4148,148.8977,,,159.4004,161.1121
4,Afranthidium repetitum/Balgowlah box 2 cell c2...,Afranthidium repetitum,False,427,285,249.9113,73.61387,226.0669,88.08048,174.0857,88.952,294.9138,98.51263,283.0292,50.27536,173.2109,140.6068,159.8704,59.25171
5,Afranthidium repetitum/Balgowlah box 2 cell c2...,Afranthidium repetitum,False,427,285,246.1315,150.0924,219.7802,153.6568,166.3073,139.7353,262.4554,124.6126,243.3341,117.6339,,,144.4572,141.9125
6,Afranthidium repetitum/Balgowlah box 2 cell c2...,Afranthidium repetitum,True,427,285,183.284,139.4448,201.6055,134.6106,221.3078,123.3069,174.3403,129.1595,,,,,242.6095,140.3522
7,Afranthidium repetitum/Balgowlah Haphazard Ali...,Afranthidium repetitum,False,427,285,184.7365,137.0424,199.9245,144.9644,221.4841,150.7452,,,174.8103,147.0012,237.3673,162.0129,223.7722,167.2094
8,Afranthidium repetitum/Balgowlah Haphazard Jam...,Afranthidium repetitum,False,427,285,174.6634,151.2903,207.3439,149.6191,254.7814,129.8417,156.1199,140.3489,,,294.2969,163.2501,292.6257,180.4096
9,Afranthidium repetitum/Balgowlah Haphazard Jam...,Afranthidium repetitum,False,427,285,198.4034,173.4276,223.6318,173.4276,262.4913,153.6535,189.6784,177.8635,168.6313,159.8717,287.7915,204.7991,261.7666,210.6517


In [8]:
# Save the flattened label table to labels.csv, without writing the row index.
labels.to_csv('labels.csv', index=False)

## Statistics about labels

In [9]:
# Per species, count the non-missing entries in every column; body parts that
# were stored as NaN are not counted.
labels.groupby('species').count()

Unnamed: 0_level_0,path,is_difficult,size_x,size_y,head_x,head_y,thorax_x,thorax_y,abdomen_x,abdomen_y,left antenna_x,left antenna_y,right antenna_x,right antenna_y,left wing_x,left wing_y,right wing_x,right wing_y
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Afranthidium repetitum,123,123,123,123,122,122,121,121,119,119,106,106,99,99,98,98,101,101
Amegilla bombiformis,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1
Amegilla sp,109,109,109,109,100,100,109,109,108,108,96,96,100,100,96,96,94,94
Amphylaeus obscuriceps,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Exoneura sp,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
Homalictus sp,47,47,47,47,47,47,47,47,47,47,43,43,43,43,40,40,32,32
Homalictus urbanus,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Homalictus urbanus male (possibly),6,6,6,6,6,6,6,6,6,6,6,6,6,6,4,4,4,4
Hylaeus alcyoneus,6,6,6,6,6,6,6,6,6,6,6,6,5,5,3,3,5,5
Hylaeus euxanthus,2,2,2,2,2,2,1,1,1,1,2,2,2,2,2,2,1,1


In [10]:
# Per species: number of examples flagged difficult ('sum') and total number
# of examples ('count').
# The flag is cast to int first so the sum is always an integer count; summing
# an object-dtype boolean column returns the raw bool (e.g. `False`) for a
# species with a single example.
difficulty = labels[['species', 'is_difficult']].astype({'is_difficult': int})
difficulty.groupby('species').agg(['sum', 'count'])

Unnamed: 0_level_0,is_difficult,is_difficult
Unnamed: 0_level_1,sum,count
species,Unnamed: 1_level_2,Unnamed: 2_level_2
Afranthidium repetitum,9,123
Amegilla bombiformis,0,2
Amegilla sp,6,109
Amphylaeus obscuriceps,False,1
Exoneura sp,0,3
Homalictus sp,7,47
Homalictus urbanus,0,2
Homalictus urbanus male (possibly),0,6
Hylaeus alcyoneus,0,6
Hylaeus euxanthus,0,2


In [14]:
# Size thresholds above which an image is treated as "big": both must be
# exceeded.
BIG_SIZE_X = 500
BIG_SIZE_Y = 300

# Boolean masks over the rows of `labels`.
big_images = (labels['size_x'] > BIG_SIZE_X) & (labels['size_y'] > BIG_SIZE_Y)
# Cast to bool before negating: `~` on an object-dtype column would apply
# integer bitwise-not to each Python bool instead of a logical not.
is_easy = ~labels['is_difficult'].astype(bool)

print('Number of big images = %d' % big_images.sum())
print('Number of difficult images = %d' % labels['is_difficult'].sum())

# Mean of every size/coordinate column over the images that are neither
# difficult nor big. `.loc[mask, columns]` replaces the removed `.ix` indexer,
# and the float cast keeps the mean numeric even if the columns are stored as
# object dtype (NaN entries are skipped by `mean`).
labels.loc[is_easy & ~big_images, col_pairs].astype(float).mean()

Number of big images = 59
Number of difficult images = 49


size_x             424.375231
size_y             287.624769
head_x             212.682894
head_y             139.747535
thorax_x           213.727282
thorax_y           148.559299
abdomen_x          214.748171
abdomen_y          144.925642
left antenna_x     219.868296
left antenna_y     141.240267
right antenna_x    210.696577
right antenna_y    142.469177
left wing_x        217.669965
left wing_y        151.245448
right wing_x       207.864318
right wing_y       154.238102
dtype: float64