In [43]:
import keras
import pandas as pd
import numpy as np
from PIL import Image
from os import listdir
import tensorflow as tf
from os.path import join, abspath, dirname
from functional import seq
from scipy.misc import imread
from collections import defaultdict

try:
    from .settings import data_dir
except Exception as e:
    data_dir = "/home/jessica/Kaggle/HumanProteinAtlasImageClassification/kaggle_comp/data"


In [44]:
train_csv = f"{data_dir}/train.csv"
train_imgs = f"{data_dir}/train"
test_imgs = f"{data_dir}/test"
sample_submission = f"{data_dir}/sample_submission.csv"

In [45]:
def load_images(basepath, img_ids):
    images_matrix = np.array([load_image(basepath, img) for img in img_ids])
    return images_matrix

In [46]:
def load_image(basepath, image_id):
    """ Reads an image data by file id, and builds a matrix for this image 
        - 4 channels for each filter colour
        - 512 * 512 for each image dimension
    :param basepath : path to image folder
    :param image_id : image name, minus the filter and png part
    :returns: image matrix """
    # empty numpy array of 4 (colour channel) by 512*512 (image width * height)
    image = np.zeros(shape=(4,512,512))
    image[0,:,:] = imread(basepath + image_id + "_green" + ".png")
    image[1,:,:] = imread(basepath + image_id + "_red" + ".png")
    image[2,:,:] = imread(basepath + image_id + "_blue" + ".png")
    image[3,:,:] = imread(basepath + image_id + "_yellow" + ".png")
    return image

In [47]:
def load_train_csv(gold_path):
    """ Loads train.csv in a dict of filename to list of gold classes labels dictionary
    :param gold_path: path to train.csv 
    :type gold_path: str
    :returns: loaded gold data
    :rtype: dict of str:list of str """
    data = pd.read_csv(open(gold_path, "r"))
    data['Target'] = data['Target'].str.split(' ')  # making Target a list of labels
    return data


In [48]:
load_train_csv(f"{data_dir}/train.csv")

Unnamed: 0,Id,Target
0,00070df0-bbc3-11e8-b2bc-ac1f6b6435d0,"[16, 0]"
1,000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0,"[7, 1, 2, 0]"
2,000a9596-bbc4-11e8-b2bc-ac1f6b6435d0,[5]
3,000c99ba-bba4-11e8-b2b9-ac1f6b6435d0,[1]
4,001838f8-bbca-11e8-b2bc-ac1f6b6435d0,[18]
5,001bcdd2-bbb2-11e8-b2ba-ac1f6b6435d0,[0]
6,0020af02-bbba-11e8-b2ba-ac1f6b6435d0,"[25, 2]"
7,002679c2-bbb6-11e8-b2ba-ac1f6b6435d0,[0]
8,00285ce4-bba0-11e8-b2b9-ac1f6b6435d0,"[2, 0]"
9,002daad6-bbc9-11e8-b2bc-ac1f6b6435d0,[7]


In [49]:
def load_sample_submission(sample_sub):
    data = pd.read_csv(open(sample_sub, "r"))
    return data

In [60]:
def load_training_dataset(training_csv, training_images_folder, batch_size):
    all_matrices = np.array([])
    img_to_label_df = load_train_csv(training_csv)
    ids = img_to_label_df.Id
    batches = [ids[x:x+batch_size] for x in range(0, ids.size, batch_size)]
    for batch in batches:
        b = load_images(training_images_folder, batch)
        all_matrices = np.concatenate(all_matrices, b)
    return all_matrices
    

In [59]:
load_training_dataset(train_csv, train_imgs, 10000)

[0       00070df0-bbc3-11e8-b2bc-ac1f6b6435d0
1       000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0
2       000a9596-bbc4-11e8-b2bc-ac1f6b6435d0
3       000c99ba-bba4-11e8-b2b9-ac1f6b6435d0
4       001838f8-bbca-11e8-b2bc-ac1f6b6435d0
5       001bcdd2-bbb2-11e8-b2ba-ac1f6b6435d0
6       0020af02-bbba-11e8-b2ba-ac1f6b6435d0
7       002679c2-bbb6-11e8-b2ba-ac1f6b6435d0
8       00285ce4-bba0-11e8-b2b9-ac1f6b6435d0
9       002daad6-bbc9-11e8-b2bc-ac1f6b6435d0
10      002ff91e-bbb8-11e8-b2ba-ac1f6b6435d0
11      00301238-bbb2-11e8-b2ba-ac1f6b6435d0
12      0032a07e-bba9-11e8-b2ba-ac1f6b6435d0
13      00344514-bbc2-11e8-b2bb-ac1f6b6435d0
14      00357b1e-bba9-11e8-b2ba-ac1f6b6435d0
15      00383b44-bbbb-11e8-b2ba-ac1f6b6435d0
16      0038d6a6-bb9a-11e8-b2b9-ac1f6b6435d0
17      003957a8-bbb7-11e8-b2ba-ac1f6b6435d0
18      003feb6e-bbca-11e8-b2bc-ac1f6b6435d0
19      0042017c-bba4-11e8-b2b9-ac1f6b6435d0
20      00456fd2-bb9b-11e8-b2b9-ac1f6b6435d0
21      0047c984-bba6-11e8-b2ba-ac1f6b6435d0
22      0

### For prediction

Use red/yellow/blue filters as features, and green is the one to predict on