In [1]:
## notebook setup
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [15]:
import shutil
import glob
import numpy as np
from fastai import *
from fastai.vision import *
from challenge_setup import *

In [19]:
#import sys
#sys.path.append('../tests/')
#from challenge_setup import *

## Prepare Learner

In [16]:
src = (ImageFileList.from_folder(path)            
       .label_from_csv('train.csv', sep=' ', folder='train_combined', suffix='.png')  
       .random_split_by_pct(0.2))
tfms = get_transforms(flip_vert=True, max_lighting=0.1, max_zoom=1.05, max_warp=0.)
bs = 64

In [17]:
# start with smaller size
data = (src.datasets()
        .transform(tfms, size=128)
        .databunch(bs=bs).normalize(imagenet_stats))

In [18]:
# use pretrained model
arch = models.resnet34
f1_score = partial(fbeta, thresh=0.2, beta=1.)
learn = create_cnn(data, arch, metrics=f1_score)

## Predictions part

In [22]:
# add test databunch
learn.data = (src.add_test_folder('test_combined')
        .datasets(ImageMultiDataset)
        .transform(tfms, size=256)
        .databunch().normalize(imagenet_stats))



## Get ids of test images

In [23]:
def get_image_names(data):
    return [fn.stem for fn in data.test_dl.x]
fnames = get_image_names(learn.data); fnames[:2]

['cdda98c0-bad6-11e8-b2b9-ac1f6b6435d0',
 '0305dfb6-bad0-11e8-b2b8-ac1f6b6435d0']

## Get predicted labels for each image

In [24]:
def get_tags(predictions, data):
    return [" ".join(data.test_ds.ds.classes[i] for i,el in enumerate(predictions[j] > 0.2) if el==1)
            for j in range(len(predictions))]

## Run predictions

In [25]:
preds, _ = learn.get_preds(DatasetType.Test)



In [27]:
tags = get_tags(preds, learn.data); 

['1 18 0 2 7 21 6 25 11 16 12 3 14 13 4 20 5 17 22 19 8 9 26 24 27 15']

In [31]:
print(f"We have {len(tags)} predictions")
print(f"Example {tags[:1]}")
print(f"Example {tags[:2]}")

We have 11702 predictions
Example ['1 18 0 2 7 21 6 25 11 16 12 3 14 13 4 20 5 17 22 19 8 9 26 24 27 15']
Example ['1 18 0 2 7 21 6 25 11 16 12 3 14 13 4 20 5 17 22 19 8 9 26 24 27 15', '1 18 2 23 21 6 25 11 16 12 3 14 13 4 20 5 17 22 19 8 9 10 26 24 27 15']


## Use tta

In [35]:
#preds_t,y_t = learn.TTA(is_test=True)
preds_t,y_t = learn.TTA(scale=1.0, ds_type=DatasetType.Test, with_loss=False)

Total time: 04:15



In [41]:
print(f"{type(preds_t)}")
print(f"{preds_t}")
preds_backup = preds_t

<class 'numpy.ndarray'>
[[0.789035 0.663046 0.742068 0.579836 ... 0.785043 0.567557 0.802602 0.716225]
 [0.517464 0.654974 0.48183  0.422788 ... 0.528396 0.390112 0.528229 0.526367]
 [0.351265 0.19469  0.189377 0.186476 ... 0.145741 0.169631 0.189239 0.250755]
 [0.690684 0.591899 0.557788 0.74184  ... 0.714469 0.690951 0.801054 0.623152]
 ...
 [0.569611 0.607786 0.428354 0.522511 ... 0.625799 0.445595 0.621549 0.529495]
 [0.612756 0.567041 0.517705 0.623163 ... 0.60671  0.696876 0.448253 0.497769]
 [0.555561 0.373711 0.292564 0.303093 ... 0.295174 0.64424  0.263279 0.487083]
 [0.611835 0.629169 0.590494 0.557223 ... 0.517316 0.614062 0.534392 0.513647]]


In [42]:
def sigmoid_np(x):
    return 1.0/(1.0 + np.exp(-x))

In [46]:
print(preds_t.shape)
preds_t = np.stack(preds_t, axis=-1)
print(preds_t.shape)
print(preds_t[:1])
preds_t = sigmoid_np(preds_t)
print(preds_t.shape)
print(preds_t[:1])
pred_t = preds_t.max(axis=-1) #max works better for F1 macro score
print(pred_t[:1])

(11702, 28)
(28, 11702)
[[0.660481 0.659091 0.659972 0.658133 ... 0.660438 0.657989 0.660626 0.659687]]
(28, 11702)
[[0.659368 0.659056 0.659254 0.658841 ... 0.659359 0.658809 0.659401 0.65919 ]]
[0.659718]


In [47]:
th = np.array([0.565,0.39,0.55,0.345,0.33,0.39,0.33,0.45,0.38,0.39,
               0.34,0.42,0.31,0.38,0.49,0.50,0.38,0.43,0.46,0.40,
               0.39,0.505,0.37,0.47,0.41,0.545,0.32,0.1])

In [52]:
pred_list = []
for line in pred_t:
    s = ' '.join(list([str(i) for i in np.nonzero(line>th)[0]]))
    pred_list.append(s)
print(pred_list)
print(len(pred_list))

['0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27', '0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27'

In [55]:
sample_df = pd.read_csv(sample)
sample_list = list(sample_df.Id)
pred_dic = dict((key, value) for (key, value) 
#             in zip(learn.data.test_ds.fnames,pred_list))
             in zip(fnames,pred_list))


## Each image in test dataset now has a set of labels

In [59]:
pred_dic['cdda98c0-bad6-11e8-b2b9-ac1f6b6435d0']

'0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27'

In [65]:
fnames['00008af0-bad0-11e8-b2b8-ac1f6b6435d0']

TypeError: list indices must be integers or slices, not str

In [63]:
sample_list[:2]

['00008af0-bad0-11e8-b2b8-ac1f6b6435d0',
 '0000a892-bacf-11e8-b2b8-ac1f6b6435d0']

In [69]:
learn.data.test_dl.x

array([PosixPath('/home/fabsta/projects/datascience/competitions/kaggle_human_protein_segmentation/input/test_combined/cdda98c0-bad6-11e8-b2b9-ac1f6b6435d0.png'),
       PosixPath('/home/fabsta/projects/datascience/competitions/kaggle_human_protein_segmentation/input/test_combined/0305dfb6-bad0-11e8-b2b8-ac1f6b6435d0.png'),
       PosixPath('/home/fabsta/projects/datascience/competitions/kaggle_human_protein_segmentation/input/test_combined/04eef62c-bad6-11e8-b2b9-ac1f6b6435d0.png'),
       PosixPath('/home/fabsta/projects/datascience/competitions/kaggle_human_protein_segmentation/input/test_combined/34203822-bad9-11e8-b2b9-ac1f6b6435d0.png'),
       ...,
       PosixPath('/home/fabsta/projects/datascience/competitions/kaggle_human_protein_segmentation/input/test_combined/1b3dd548-bad5-11e8-b2b8-ac1f6b6435d0.png'),
       PosixPath('/home/fabsta/projects/datascience/competitions/kaggle_human_protein_segmentation/input/test_combined/9a5c8f9a-bac6-11e8-b2b7-ac1f6b6435d0.png'),
       Pos

In [60]:
pred_list_cor = [pred_dic[id] for id in sample_list]
#df = pd.DataFrame({'Id':sample_list,'Predicted':pred_list_cor})
#df.to_csv(fname, header=True, index=False)

KeyError: '00008af0-bad0-11e8-b2b8-ac1f6b6435d0'