# Import Dependency

In [2]:
from collections import OrderedDict
from sklearn.ensemble import RandomForestClassifier

# Prepare Data and Label

In [3]:
attribute_file = "/home/b1deng/228/attributes.txt"

In [4]:
# calculate num of possible values each attribute has 
type2num = {}
with open(attribute_file, 'r', encoding='utf-8') as f:
    num = 0
    for line in f:
        attr_type, _ = line.rstrip().split()[1].split("::")
        type2num[attr_type] = type2num.get(attr_type, 0) + 1

In [5]:
type2num

{'has_bill_shape': 9,
 'has_wing_color': 15,
 'has_upperparts_color': 15,
 'has_underparts_color': 15,
 'has_breast_pattern': 4,
 'has_back_color': 15,
 'has_tail_shape': 6,
 'has_upper_tail_color': 15,
 'has_head_pattern': 11,
 'has_breast_color': 15,
 'has_throat_color': 15,
 'has_eye_color': 14,
 'has_bill_length': 3,
 'has_forehead_color': 15,
 'has_under_tail_color': 15,
 'has_nape_color': 15,
 'has_belly_color': 15,
 'has_wing_shape': 5,
 'has_size': 5,
 'has_shape': 14,
 'has_back_pattern': 4,
 'has_tail_pattern': 4,
 'has_belly_pattern': 4,
 'has_primary_color': 15,
 'has_leg_color': 15,
 'has_bill_color': 15,
 'has_crown_color': 15,
 'has_wing_pattern': 4}

In [6]:
# only keep attributes related to color
filtered_attr = dict((k, type2num[k]) for k in type2num.keys() if 'color' in k)

In [7]:
filtered_attr

{'has_wing_color': 15,
 'has_upperparts_color': 15,
 'has_underparts_color': 15,
 'has_back_color': 15,
 'has_upper_tail_color': 15,
 'has_breast_color': 15,
 'has_throat_color': 15,
 'has_eye_color': 14,
 'has_forehead_color': 15,
 'has_under_tail_color': 15,
 'has_nape_color': 15,
 'has_belly_color': 15,
 'has_primary_color': 15,
 'has_leg_color': 15,
 'has_bill_color': 15,
 'has_crown_color': 15}

In [8]:
# color attribute and their id range
type2ids = OrderedDict()
with open(attribute_file, 'r', encoding='utf-8') as f:
    prev_type = ""
    for line in f:
        attr_id, attr = line.rstrip().split()
        attr_id = int(attr_id)
        attr_type, _ = attr.split("::")
        if 'color' in attr_type:
            ids = type2ids.get(attr_type, [1000, -1]) # list with placeholder
            if ids[0] > attr_id:
                ids[0] = attr_id
            if ids[1] < attr_id:
                ids[1] = attr_id
            type2ids[attr_type] = ids

In [9]:
type2ids = list(type2ids.items())

In [10]:
type2ids

[('has_wing_color', [10, 24]),
 ('has_upperparts_color', [25, 39]),
 ('has_underparts_color', [40, 54]),
 ('has_back_color', [59, 73]),
 ('has_upper_tail_color', [80, 94]),
 ('has_breast_color', [106, 120]),
 ('has_throat_color', [121, 135]),
 ('has_eye_color', [136, 149]),
 ('has_forehead_color', [153, 167]),
 ('has_under_tail_color', [168, 182]),
 ('has_nape_color', [183, 197]),
 ('has_belly_color', [198, 212]),
 ('has_primary_color', [249, 263]),
 ('has_leg_color', [264, 278]),
 ('has_bill_color', [279, 293]),
 ('has_crown_color', [294, 308])]

In [11]:
image_attr_file = "/home/b1deng/228/CUB_200_2011/attributes/image_attribute_labels.txt"

In [12]:
# construct mapping from image id and their features
img_id2attrs = {}
with open(image_attr_file, 'r', encoding='utf-8') as f:
    for line in f:
        img_id, attr_id, is_present, _ = map(int, line.rstrip().split(' ')[0: 4])
        feature_list = img_id2attrs.get(img_id, [0] * 16)        # 0 means not present, otherwise 1~15 according to class
        for idx in range(len(type2ids)):
            attr_range = type2ids[idx][1]
            if attr_id >= attr_range[0] and attr_id <= attr_range[1] and is_present == 1:
                feature_list[idx] = attr_id - attr_range[0] + 1
        img_id2attrs[img_id] = feature_list

In [13]:
# mapping from image id to train/test split
train_test_split = '/home/b1deng/228/CUB_200_2011/train_test_split.txt'
img_id2train = {}
with open(train_test_split, 'r', encoding='utf-8') as f:
    for line in f:
        img_id, is_train = map(int, line.rstrip().split(' '))
        img_id2train[img_id] = is_train

In [14]:
# mapping from image_id to image class
img_id2class = {}
class_file = '/home/b1deng/228/CUB_200_2011/image_class_labels.txt'
with open(class_file, 'r', encoding='utf-8') as f:
    for line in f:
        img_id, class_id = map(int, line.rstrip().split(' '))
        img_id2class[img_id] = class_id

In [15]:
train_x = []
train_y = []
test_x = []
test_y = []
for img_id, is_train in img_id2train.items():
    if is_train:
        train_x.append(img_id2attrs[img_id])
        train_y.append(img_id2class[img_id])
    else:
        test_x.append(img_id2attrs[img_id])
        test_y.append(img_id2class[img_id])

# Train and Test

In [16]:
classifier = RandomForestClassifier()
classifier.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
classifier.score(test_x, test_y)

0.24577148774594407