In [8]:
import collections
import imageio
import random
import os
import struct

import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

Define a simple generator that reads `(image, tag)` samples from a data file.

In [2]:
def iterate(path):
    """Lazily yield ``(image, tag)`` pairs stored back to back in a binary file.

    Each record starts with a little-endian header made of one 4-byte unsigned
    integer (ignored here) followed by three 2-byte unsigned integers: the tag,
    the width and the height. The header is followed by ``width * height``
    bytes of pixel data.

    Args:
        path: Location of the file to read; it is opened in binary mode.

    Yields:
        A tuple ``(image, tag)`` where ``image`` is a ``numpy.uint8`` array of
        shape ``(height, width)`` and ``tag`` is the integer read from the header.
    """
    header = struct.Struct('<I3H')

    with open(path, 'rb') as stream:
        # iter(callable, sentinel) stops as soon as read() returns b'' (end of file).
        for raw_header in iter(lambda: stream.read(header.size), b''):
            _, tag, width, height = header.unpack(raw_header)
            pixels = np.frombuffer(stream.read(width * height), dtype=np.uint8)

            yield pixels.reshape(height, width), tag

In [30]:
# Input files, kept in a fixed order: the file named "test" first, then the
# three files named "train" parts.
test_file = '../data/1.0test-gb1.gnt'
train_files = [
    '../data/1.0train-gb1-part1.gnt',
    '../data/1.0train-gb1-part2.gnt',
    '../data/1.0train-gb1-part3.gnt',
]
names = [test_file] + train_files

In [31]:
# Collect every distinct tag found across all input files; the images are
# decoded by the iterator but discarded here.
tags = set()

for name in names:
    tags.update(tag for _, tag in iterate(name))

# Report how many distinct tags were seen.
print(len(tags))

In [32]:
# Keep only the 1000 largest tag values (or all of them if there are fewer),
# stored as a set for fast membership checks.
tags = set(sorted(tags)[-1000:])

In [37]:
# Gather every sample whose tag is in the retained tag set.
samples = [
    (image, tag)
    for name in names
    for image, tag in iterate(name)
    if tag in tags
]

# Build the (N, 2) object array explicitly: column 0 holds the image array,
# column 1 holds the tag. Passing the nested list straight to np.array() relies
# on implicit ragged-sequence handling, which is deprecated and raises
# ValueError on recent NumPy releases.
data = np.empty((len(samples), 2), dtype=object)
for row, (image, tag) in enumerate(samples):
    data[row, 0] = image
    data[row, 1] = tag

In [38]:
# Shuffle the rows in place with NumPy rather than random.shuffle().
# random.shuffle() swaps elements with `x[i], x[j] = x[j], x[i]`; on a 2-D
# NumPy array both operands are row *views*, so the swap copies one row over
# the other and leaves duplicated / lost samples. NumPy's shuffle permutes
# along the first axis correctly.
# A fixed seed makes the train/test split reproducible across runs.
shuffle_rng = np.random.RandomState(0)
shuffle_rng.shuffle(data)
print(data.shape)

(416234, 2)


In [47]:
# The first 80% of the rows become the training split, the remainder the test split.
train_size = int(0.8 * len(data))
train, test = data[:train_size], data[train_size:]

In [85]:
# Display the shape of the training split as a sanity check.
train.shape

(332987, 2)

In [76]:
# Write the training split to disk as `part_n` consecutive slices,
# named train-1.npy ... train-<part_n>.npy.
part_n = 4
size = len(train)
# part_n + 1 cut points; each pair of neighbouring cut points delimits one slice.
bounds = [int(size * k / part_n) for k in range(part_n + 1)]
for part, (begin, end) in enumerate(zip(bounds, bounds[1:])):
    np.save('train-%i.npy' % (part + 1), train[begin:end])

In [77]:
# Save only the image column of the test split; the tags go to a separate CSV.
np.save('test.npy', test[:, 0])

with open('labels.csv', 'w') as out:
    # Header row followed by one "<1-based id>,<tag>" row per test sample.
    rows = ['Id,Category']
    for n, tag in enumerate(test[:, 1], 1):
        rows.append('{n},{tag}'.format(n=n, tag=tag))
    print('\n'.join(rows), file=out)