# Load modules

In [1]:
%cd ~/Workspace/kaggle-2019Q3-cellular/

/home/chengjiun/Workspace/kaggle-2019Q3-cellular


In [2]:
import os
import numpy as np
import pandas as pd

from fastai.vision import *
BASE_DIR = '../DATA/kaggle-2019Q3-cellular/'


def open_rcic_image(fn):
    """Load the six per-channel PNGs of one sample into a single fastai `Image`.

    Args:
        fn: path prefix of the sample; the files read are ``fn + '1.png'``
            through ``fn + '6.png'``.

    Returns:
        A fastai `Image` wrapping the float32 tensor produced by `pil2tensor`
        from the six stacked grayscale planes, divided by 255.

    Raises:
        OSError: if any of the six files cannot be decoded by OpenCV
            (`cv2.imread` returned ``None``). Previously this case only printed
            a message and then crashed inside `cv2.cvtColor` with an opaque
            OpenCV error.
    """
    images = []
    for i in range(6):
        file_name = fn+str(i+1)+'.png'
        im = cv2.imread(file_name)
        if im is None:
            # Fail loudly with the offending path instead of passing None on.
            raise OSError(f'file reading failed {file_name}')

        # imread returns a 3-plane array by default; collapse it to one plane.
        # NOTE(review): imread yields BGR order while the flag says RGB; this is
        # harmless if the PNGs are grayscale (all planes equal) - confirm.
        im = cv2.cvtColor(im, cv2.COLOR_RGB2GRAY)
        images.append(im)
    # Stack the six planes along the last axis -> (H, W, 6).
    image = np.dstack(images)
    return Image(pil2tensor(image, np.float32).div_(255))
  
class MultiChannelImageList(ImageList):
    "`ImageList` whose items are opened with `open_rcic_image` (six PNGs per item)."
    def open(self, fn):
        # `fn` is treated as a path prefix; `open_rcic_image` appends '1.png'..'6.png'.
        return open_rcic_image(fn)
    
def image2np(image:Tensor)->np.ndarray:
    "Turn a channel-first torch `image` into a channel-last numpy array for plotting."
    arr = image.cpu().permute(1,2,0).numpy()
    n_channels = arr.shape[2]
    # One channel: drop the trailing axis so the result is a plain 2-D array.
    if n_channels == 1:
        return arr[...,0]
    # More than three channels: keep only the first three planes.
    if n_channels > 3:
        return arr[...,:3]
    # Two or three channels are returned unchanged.
    return arr

vision.image.image2np = image2np

## efficientnet

In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
%matplotlib inline  
from sklearn.model_selection import StratifiedKFold
from joblib import load, dump
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
import cv2
from fastai import *
from fastai.vision import *
from fastai.callbacks import *
from torchvision import models as md
from torch import nn
from torch.nn import functional as F
import scipy as sp
import re
import math
import collections
from functools import partial
from torch.utils import model_zoo
from sklearn import metrics
from collections import Counter
import json

In [4]:
from efficientnet_pytorch import EfficientNet
from efficientnet_pytorch.utils import Conv2dStaticSamePadding
model = EfficientNet.from_pretrained('efficientnet-b6', num_classes=1108) 

def get_model_params(model_name, override_params):
    """Get the block args and global params for a given model.

    Args:
        model_name: model identifier; only names starting with 'efficientnet'
            are supported.
        override_params: optional mapping of global-param fields to replace
            (applied with ``_replace``); falsy values leave the params untouched.

    Returns:
        ``(blocks_args, global_params)`` as produced by `efficientnet(...)`.

    Raises:
        NotImplementedError: if `model_name` does not start with 'efficientnet'.
        ValueError: if `override_params` has fields not present in the global params.
    """
    # These helpers are not imported anywhere in the visible cells (only
    # `EfficientNet` and `Conv2dStaticSamePadding` are), so import them here
    # to avoid a NameError when this function is called.
    from efficientnet_pytorch.utils import efficientnet_params, efficientnet

    if model_name.startswith('efficientnet'):
        w, d, s, p = efficientnet_params(model_name)
        # note: all models have drop connect rate = 0.2
        blocks_args, global_params = efficientnet(
            width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s)
    else:
        raise NotImplementedError('model name is not pre-defined: %s' % model_name)
    if override_params:
        # ValueError will be raised here if override_params has fields not included in global_params.
        global_params = global_params._replace(**override_params)
    return blocks_args, global_params


# Download location of the checkpoint for each EfficientNet variant;
# looked up by model name in `load_pretrained_weights`.
url_map = {
    'efficientnet-b0': 'http://storage.googleapis.com/public-models/efficientnet-b0-08094119.pth',
    'efficientnet-b1': 'http://storage.googleapis.com/public-models/efficientnet-b1-dbc7070a.pth',
    'efficientnet-b2': 'http://storage.googleapis.com/public-models/efficientnet-b2-27687264.pth',
    'efficientnet-b3': 'http://storage.googleapis.com/public-models/efficientnet-b3-c8376fa2.pth',
    'efficientnet-b4': 'http://storage.googleapis.com/public-models/efficientnet-b4-e116e8b3.pth',
    'efficientnet-b5': 'http://storage.googleapis.com/public-models/efficientnet-b5-586e6cc6.pth',
    'efficientnet-b6': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b6-c76e70fd.pth',
    'efficientnet-b7': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b7-dcc49843.pth',
}

def load_pretrained_weights(model, model_name, load_fc=True):
    """Fetch the checkpoint registered for `model_name` in `url_map` and load it into `model`.

    With `load_fc=True` the full state dict is loaded strictly. Otherwise the
    '_fc.weight' / '_fc.bias' entries are dropped first, the rest is loaded
    non-strictly, and an assertion checks that exactly those two keys are
    reported missing.
    """
    weights = model_zoo.load_url(url_map[model_name])
    if not load_fc:
        # Discard the classifier head so a model with a different head can load.
        for head_key in ('_fc.weight', '_fc.bias'):
            weights.pop(head_key)
        outcome = model.load_state_dict(weights, strict=False)
        assert str(outcome.missing_keys) == str(['_fc.weight', '_fc.bias']), 'issue loading pretrained weights'
    else:
        model.load_state_dict(weights)
    print('Loaded pretrained weights for {}'.format(model_name))
            

# Set seed for all random number generators
def seed_everything(seed=1358):
    """Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device).

    Also stores the seed in the PYTHONHASHSEED environment variable and turns
    on cuDNN deterministic mode.

    Args:
        seed: integer seed applied to every generator.
    """
    # Local import: no explicit `import random` is visible in the notebook
    # cells, so do not rely on it leaking in through a star import.
    import random

    random.seed(seed)
    # Note: this only affects processes started afterwards; the hash seed of
    # the running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (silently ignored
    # when CUDA is unavailable).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    
    


seed_everything()

Loaded pretrained weights for efficientnet-b6


## utils

In [5]:
#lazy calculation of stats
# Builds the (mean, std) pair later passed to `.normalize(stats)`:
# one value per image channel, six channels in total.
pixel_stats = pd.read_csv(f'{BASE_DIR}/pixel_stats.csv')
# Every 6th row starting at offset k is taken as channel k+1.
# NOTE(review): this assumes the CSV rows cycle through channels 1..6 in a
# fixed order - confirm against the file.
channel1_mean = pixel_stats.iloc[::6,:]['mean'].mean()
channel2_mean = pixel_stats.iloc[1::6,:]['mean'].mean()
channel3_mean = pixel_stats.iloc[2::6,:]['mean'].mean()
channel4_mean = pixel_stats.iloc[3::6,:]['mean'].mean()
channel5_mean = pixel_stats.iloc[4::6,:]['mean'].mean()
channel6_mean = pixel_stats.iloc[5::6,:]['mean'].mean()


# The channel "std" is the average of the per-row 'std' column, which is an
# approximation rather than the standard deviation over all pixels.
channel1_std = pixel_stats.iloc[::6,:]['std'].mean()
channel2_std = pixel_stats.iloc[1::6,:]['std'].mean()
channel3_std = pixel_stats.iloc[2::6,:]['std'].mean()
channel4_std = pixel_stats.iloc[3::6,:]['std'].mean()
channel5_std = pixel_stats.iloc[4::6,:]['std'].mean()
channel6_std = pixel_stats.iloc[5::6,:]['std'].mean()
# Divided by 255 to match the scaling applied in `open_rcic_image`
# (presumably pixel_stats is expressed in 0-255 units - TODO confirm).
stats = (torch.Tensor([channel1_mean,channel2_mean,channel3_mean,channel4_mean,channel5_mean,channel6_mean])/255,torch.Tensor([channel1_std,channel2_std,channel3_std,channel4_std,channel5_std,channel6_std])/255)
print(stats)

(tensor([0.0229, 0.0611, 0.0396, 0.0391, 0.0219, 0.0356]), tensor([0.0271, 0.0492, 0.0219, 0.0292, 0.0183, 0.0193]))


## Loading and formatting data

Here I will load the csv into the DataFrame, and create a column in the DataFrame with the path to the corresponding image (`generate_df`)

In [6]:
train_df = pd.read_csv(f'{BASE_DIR}/train.csv')
train_df['exp'] = train_df['experiment'].apply(lambda x: x.split('-')[0])
train_df.head(10)

Unnamed: 0,id_code,experiment,plate,well,sirna,exp
0,HEPG2-01_1_B03,HEPG2-01,1,B03,513,HEPG2
1,HEPG2-01_1_B04,HEPG2-01,1,B04,840,HEPG2
2,HEPG2-01_1_B05,HEPG2-01,1,B05,1020,HEPG2
3,HEPG2-01_1_B06,HEPG2-01,1,B06,254,HEPG2
4,HEPG2-01_1_B07,HEPG2-01,1,B07,144,HEPG2
5,HEPG2-01_1_B08,HEPG2-01,1,B08,503,HEPG2
6,HEPG2-01_1_B09,HEPG2-01,1,B09,188,HEPG2
7,HEPG2-01_1_B10,HEPG2-01,1,B10,700,HEPG2
8,HEPG2-01_1_B11,HEPG2-01,1,B11,1100,HEPG2
9,HEPG2-01_1_B12,HEPG2-01,1,B12,611,HEPG2


In [7]:
from sklearn.model_selection import train_test_split


exp_list = ['HEPG2', 'HUVEC', 'RPE', 'U2OS']
def generate_df(train_df,sample_num=[1,2],exp=None, test_size=None, random_state=42):
    """Build the (path, sirna, is_valid) frame consumed by `MultiChannelImageList.from_df`.

    Args:
        train_df: frame with at least 'experiment', 'plate' and 'well' columns
            ('exp' is also required when `exp` is given). A missing 'sirna'
            column is filled with NaN. The input frame is NOT modified.
        sample_num: site numbers to emit; one block of rows is produced per site.
        exp: if given, keep only rows whose 'exp' column equals this value.
        test_size: if given, passed to `train_test_split`; the held-out rows get
            ``is_valid=True``.
        random_state: seed forwarded to `train_test_split`.

    Returns:
        DataFrame with columns ['path', 'sirna', 'is_valid'] where path is
        '<experiment>/Plate<plate>/<well>_s<site>_w'. Site blocks appear in
        reverse order of `sample_num` (the last site first), each keeping the
        original row index. An empty `sample_num` yields an empty DataFrame.
    """
    # Work on a private copy: the original added 'is_valid' to the caller's frame.
    df = train_df.copy()
    df['is_valid'] = False
    if exp is not None:
        df = df[df['exp'] == exp]
        print(f'extract experiment: {exp}, number of train data {len(df)}')
    if test_size is not None:
        sub_train_df, sub_test_df = train_test_split(df, test_size=test_size, random_state=random_state)
        # Copy before flagging, so we never assign into a slice of `df`
        # (this was the source of the SettingWithCopyWarning).
        sub_test_df = sub_test_df.copy()
        sub_test_df['is_valid'] = True
        df = pd.concat([sub_train_df, sub_test_df], axis=0, sort=False)

    # '<experiment>/Plate<plate>/<well>' is shared by all sites of a well.
    well_prefix = df['experiment'].str.cat(
        df['plate'].astype(str).str.cat(df['well'], sep='/'), sep='/Plate')

    site_frames = []
    for s in sample_num:
        site_df = (df.assign(path=well_prefix + '_s' + str(s) + '_w')
                     .reindex(columns=['path','sirna', 'is_valid']))
        site_frames.append(site_df)

    if not site_frames:
        return pd.DataFrame()
    # The original prepended each new site block, so later sites come first.
    return pd.concat(site_frames[::-1], axis=0, sort=False)

proc_train_all_df = generate_df(train_df, sample_num=[1,2], exp=None, test_size=0.1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [8]:
proc_train_all_df.head(10)

Unnamed: 0,path,sirna,is_valid
24735,HUVEC-16/Plate2/H14_s2_w,694,False
33693,U2OS-01/Plate2/M13_s2_w,128,False
20160,HUVEC-12/Plate1/M12_s2_w,272,False
24447,HUVEC-16/Plate1/G22_s2_w,450,False
8517,HUVEC-01/Plate3/L20_s2_w,255,False
5686,HEPG2-06/Plate1/I17_s2_w,459,False
35838,U2OS-03/Plate2/I21_s2_w,639,False
24541,HUVEC-16/Plate1/L18_s2_w,162,False
23079,HUVEC-14/Plate4/H02_s2_w,334,False
5186,HEPG2-05/Plate3/L11_s2_w,48,False


In [9]:
il = MultiChannelImageList.from_df(df=proc_train_all_df,path='../DATA/kaggle-2019Q3-cellular/train/')

## Creating and Training a Model

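I will use a pretrained CNN. The stem convolution of each architecture has to be adjusted to take in 6 channels instead of the usual 3; ResNet and EfficientNet variants are defined below, and the training cells in this notebook use EfficientNet-B6: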

In [10]:
"""Inspired by https://github.com/wdhorton/protein-atlas-fastai/blob/master/resnet.py"""

import torchvision
# Lookup from the `depth` argument of `resnet_multichannel` to the
# torchvision constructor that builds the corresponding backbone.
RESNET_MODELS = {
    34: torchvision.models.resnet34,
    50: torchvision.models.resnet50,
    101: torchvision.models.resnet101,
    152: torchvision.models.resnet152,
    '101x': torchvision.models.resnext101_32x8d,
}


def resnet_multichannel(depth=50,pretrained=True,num_classes=1108,num_channels=6):
    """Build a torchvision ResNet whose first convolution accepts `num_channels` inputs.

    The backbone is picked from `RESNET_MODELS` by `depth`. Its `conv1` is
    replaced by a 7x7, stride-2, 64-filter convolution without bias whose
    weights are the original 3-channel kernels followed by zeros for the
    extra channels. `num_classes` is accepted but not used here.
    """
    net = RESNET_MODELS[depth](pretrained=pretrained)
    rgb_kernels = net.conv1.weight
    # Zero kernels for the channels beyond the original three.
    extra_kernels = torch.zeros(64,num_channels-3,7,7)
    stem = nn.Conv2d(num_channels, 64, kernel_size=7, stride=2, padding=3,
                     bias=False)
    stem.weight = nn.Parameter(torch.cat((rgb_kernels, extra_kernels), dim=1))
    net.conv1 = stem
    return net

def _efficientnet_multichannel(name, num_classes, num_channels):
    """Shared builder: pretrained EfficientNet `name` with a `num_channels`-input stem.

    The stem width is read from the pretrained stem weights instead of being
    hard-coded per variant, so any EfficientNet name works. The new stem's
    weights are the pretrained 3-channel kernels followed by zeros for the
    extra channels.
    """
    model = EfficientNet.from_pretrained(name, num_classes)
    # NOTE(review): the recorded cell output prints "Loaded pretrained weights"
    # twice per model, which suggests `from_pretrained` already loads them and
    # this second load is redundant - kept to preserve behaviour.
    load_pretrained_weights(model,model_name=name, load_fc=False)
    w = model._conv_stem.weight
    stem_width = w.shape[0]
    # `image_size` is read from the module-level variable at call time.
    # NOTE(review): the replacement stem passes padding=3 and leaves `bias` at
    # the Conv2d default; confirm this is intended. Changing it now would alter
    # the layout of checkpoints already trained with this stem.
    model._conv_stem = Conv2dStaticSamePadding(num_channels, stem_width, kernel_size=3,
                                               image_size=image_size, stride=2, padding=3)
    extra = w.new_zeros(stem_width, num_channels-3, 3, 3)
    model._conv_stem.weight = nn.Parameter(torch.cat((w, extra), dim=1))
    return model

def efficientnetb4_multichannel(name='efficientnet-b4', pretrained=True, num_classes=1108, num_channels=6):
    """EfficientNet (default B4) with a `num_channels`-input stem.

    `pretrained` is accepted for signature compatibility but is not consulted:
    pretrained weights are always loaded.
    """
    return _efficientnet_multichannel(name, num_classes, num_channels)

def efficientnetb6_multichannel(name='efficientnet-b6', pretrained=True, num_classes=1108, num_channels=6):
    """EfficientNet (default B6) with a `num_channels`-input stem.

    `pretrained` is accepted for signature compatibility but is not consulted:
    pretrained weights are always loaded.
    """
    return _efficientnet_multichannel(name, num_classes, num_channels)

In [11]:
image_size=512
def resnet34(pretrained,num_channels=6):
    "Multi-channel ResNet-34 (previously built depth=50 by mistake)."
    return resnet_multichannel(depth=34,pretrained=pretrained,num_channels=num_channels)

def resnet50(pretrained,num_channels=6):
    "Multi-channel ResNet-50: forwards to `resnet_multichannel` with depth=50."
    return resnet_multichannel(depth=50,pretrained=pretrained,num_channels=num_channels)

def resnext101(pretrained,num_channels=6):
    "Multi-channel ResNeXt-101 32x8d: forwards to `resnet_multichannel` with depth='101x'."
    return resnet_multichannel(depth='101x',pretrained=pretrained,num_channels=num_channels)

effnetb4 = efficientnetb4_multichannel('efficientnet-b4', pretrained=True,num_channels=6)

effnetb6 = efficientnetb6_multichannel('efficientnet-b6', pretrained=True,num_channels=6)

# Returns the pair (m[0][6], m[1]) for a model `m`.
# NOTE(review): presumably layer-group split points for a fastai body/head model - confirm against the caller.
def _resnet_split(m): return (m[0][6],m[1])

Loaded pretrained weights for efficientnet-b4
Loaded pretrained weights for efficientnet-b4
Loaded pretrained weights for efficientnet-b6
Loaded pretrained weights for efficientnet-b6


In [12]:
"""From https://www.kaggle.com/leighplt/densenet121-pytorch"""
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k.

    Args:
        output: score tensor of shape (batch, classes).
        target: tensor of shape (batch,) with the true class indices.
        topk: iterable of k values to evaluate.

    Returns:
        A float tensor with one percentage (0-100) per entry of `topk`.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # Indices of the maxk highest scores per row, transposed to (maxk, batch).
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # `correct` can be non-contiguous (it derives from a transposed
            # tensor), in which case `.view(-1)` raises for k > 1; `reshape`
            # handles both layouts.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size).item())
        return torch.Tensor(np.array(res))

Let's create our Learner:

In [13]:
from optim.ranger import Ranger
optar = partial(Ranger)

In [14]:
%ls effb6-gce/fastaimodels/

bestmodel_0.pth  bestmodel_4.pth             effnetb6-512-stage2-best.pth
bestmodel_1.pth  bestmodel.pth               effnetb6-512-stage2.pth
bestmodel_2.pth  effnetb6-512-s1-stage3.pth  effnetb6-s2-stage3.pth
bestmodel_3.pth  effnetb6-512-stage-1.pth    tmp.pth


In [15]:
# 1080Ti 9G: img size=384, efficientnet-b4, bs=16
# 4Ti
# Stage-2 fine-tuning: one run per cell line, each starting from a checkpoint
# and saving to 'effnetb6-<exp>-512-stage2'.
exp_list = ['HEPG2', 'HUVEC', 'RPE', 'U2OS']
image_size=512
tfms = get_transforms(do_flip=True, flip_vert=True,
                      max_lighting=0.2, p_lighting=0.5, 
                      max_warp=0.05, max_rotate=20., max_zoom=1.1)
# Per experiment: [number of epochs, learning rate, checkpoint to resume from].
# An empty checkpoint name means "start from the shared 'effnetb6-512-stage2' model".
params = {'HEPG2': [10, 1e-5, 'effnetb6-HEPG2-512-stage2'], 
          'HUVEC': [5, 1e-6, ''], 
          'RPE': [10, 1e-5, ''],
          'U2OS': [10, 1e-5, '']}

for exp in exp_list:

    proc_train_exp_df = generate_df(train_df, sample_num=[1,2], exp=exp, test_size=0.1)
    data = (MultiChannelImageList.from_df(df=proc_train_exp_df,path='../DATA/kaggle-2019Q3-cellular/train/')
            .split_from_df(col=2)
            .label_from_df()
            .transform(tfms,size=image_size)
            .databunch(bs=8,num_workers=8)
            .normalize(stats)
           )
    print(params[exp])
    learn = Learner(data, 
                    effnetb6, opt_func=optar,
                    metrics = [accuracy], 
                    model_dir='./fastaimodels').to_fp16()
    # learn.model = torch.nn.DataParallel(learn.model)
    # Compare by value: `is ''` tests object identity, which is
    # implementation-dependent for strings and a SyntaxWarning on Python 3.8+.
    if params[exp][2] == '':
        # Load from the shared run directory, then switch to the per-experiment one.
        learn.path = Path('./effb6-gce')
        learn.load('effnetb6-512-stage2')
        learn.path = Path('./effb6-4exps-gce')
    else:
        learn.path = Path('./effb6-4exps-gce')
        learn.load(params[exp][2])
    
    learn.unfreeze()
    learn.fit_one_cycle(params[exp][0],params[exp][1], 
                        callbacks=[EarlyStoppingCallback(learn, monitor='accuracy', min_delta=0.5, patience=2)])
    
    learn.save(f'effnetb6-{exp}-512-stage2')
    learn.export()

extract experiment: HEPG2, number of train data 7750
[10, 1e-05, 'effnetb6-HEPG2-512-stage2']


epoch,train_loss,valid_loss,accuracy,time
0,2.665823,1.679833,61.612904,26:32
1,2.75105,1.677793,61.612904,26:11
2,2.623259,1.6638,61.935482,25:33
3,2.62878,1.626541,62.387096,27:00
4,2.505732,1.637583,62.451614,27:01
5,2.646666,1.618964,62.258064,27:00
6,2.609795,1.590024,63.935482,27:00
7,2.573082,1.587401,63.741936,26:58
8,2.511311,1.588107,63.483871,27:00


Epoch 9: early stopping
extract experiment: HUVEC, number of train data 17688
[5, 1e-06, '']


epoch,train_loss,valid_loss,accuracy,time
0,1.599543,1.598156,82.560768,1:00:37
1,1.510332,1.726038,84.143585,1:00:18
2,1.725623,1.912736,84.171852,59:16
3,1.568355,1.297918,85.245903,58:48
4,1.500873,1.622182,84.256645,59:31


extract experiment: RPE, number of train data 7753
[10, 1e-05, '']


epoch,train_loss,valid_loss,accuracy,time
0,3.063864,2.027673,54.252579,26:31
1,2.706188,1.80523,59.020618,26:11
2,2.546066,1.64173,63.079895,26:09
3,2.481042,1.597129,64.304123,27:08
4,2.448711,1.488937,65.786079,27:07
5,2.448539,1.506261,66.043816,27:07
6,2.137379,1.492765,66.237114,27:09
7,2.095664,1.489275,66.301544,27:08
8,2.342709,1.463512,67.139175,27:08
9,2.09023,1.4458,67.396904,27:08


extract experiment: U2OS, number of train data 3324
[10, 1e-05, '']


epoch,train_loss,valid_loss,accuracy,time
0,5.155396,4.315378,18.768768,11:02
1,4.92471,4.016593,21.021021,11:05
2,4.471451,3.695094,24.624624,11:38
3,4.347263,3.507907,26.876877,11:38
4,3.959831,3.389163,28.828829,11:38
5,4.095483,3.319865,29.879879,11:38
6,4.024506,3.284612,30.33033,11:38
7,3.908053,3.19913,30.63063,11:38
8,3.994433,3.251812,30.780781,11:39
9,3.980525,3.218936,30.930931,11:39


In [20]:
# Stage-3 fine-tuning: same procedure as the stage-2 loop, with new settings
# and results saved to 'effnetb6-<exp>-512-stage3'.
# Per experiment: [number of epochs, learning rate, checkpoint to resume from].
params = {'HEPG2': [5, 5e-6, 'effnetb6-HEPG2-512-stage2'], 
          'HUVEC': [0, 1e-6, 'effnetb6-HUVEC-512-stage2'], 
          'RPE': [5, 5e-6, 'effnetb6-RPE-512-stage2'],
          'U2OS': [10, 1e-5, 'effnetb6-U2OS-512-stage3']}
exp_list2 = ['U2OS', 'HEPG2', 'RPE', 'HUVEC']
for exp in exp_list2:

    proc_train_exp_df = generate_df(train_df, sample_num=[1,2], exp=exp, test_size=0.1)
    data = (MultiChannelImageList.from_df(df=proc_train_exp_df,path='../DATA/kaggle-2019Q3-cellular/train/')
            .split_from_df(col=2)
            .label_from_df()
            .transform(tfms,size=image_size)
            .databunch(bs=8,num_workers=8)
            .normalize(stats)
           )
    print(params[exp])
    learn = Learner(data, 
                    effnetb6, opt_func=optar,
                    metrics = [accuracy], 
                    model_dir='./fastaimodels').to_fp16()
    # learn.model = torch.nn.DataParallel(learn.model)
    # Compare by value: `is ''` tests object identity, which is
    # implementation-dependent for strings and a SyntaxWarning on Python 3.8+.
    if params[exp][2] == '':
        # Load from the shared run directory, then switch to the per-experiment one.
        learn.path = Path('./effb6-gce')
        learn.load('effnetb6-512-stage2')
        learn.path = Path('./effb6-4exps-gce')
    else:
        learn.path = Path('./effb6-4exps-gce')
        learn.load(params[exp][2])
    
    learn.unfreeze()
    learn.fit_one_cycle(params[exp][0],params[exp][1], 
                        callbacks=[EarlyStoppingCallback(learn, monitor='accuracy', min_delta=0.5, patience=2)])
    
    learn.save(f'effnetb6-{exp}-512-stage3')
    learn.export()

extract experiment: U2OS, number of train data 3324


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


[10, 1e-05, 'effnetb6-U2OS-512-stage3']


epoch,train_loss,valid_loss,accuracy,time
0,3.710686,3.094473,33.633633,11:37
1,3.64501,3.176811,32.882881,11:37
2,3.626143,2.984352,34.684685,11:37
3,3.55015,3.022666,34.984985,11:37
4,3.533374,2.903181,35.885887,11:38
5,3.627841,2.976133,36.036037,11:38
6,3.305894,3.070938,35.285286,11:38


Epoch 7: early stopping
extract experiment: HEPG2, number of train data 7750
[5, 5e-06, 'effnetb6-HEPG2-512-stage2']


epoch,train_loss,valid_loss,accuracy,time
0,2.529162,1.587186,62.838711,26:47
1,2.554509,1.589318,63.419353,26:38
2,2.303339,1.574421,64.19355,25:45
3,2.410763,1.587344,63.419353,27:07
4,2.522799,1.578426,63.483871,27:07


extract experiment: RPE, number of train data 7753
[5, 5e-06, 'effnetb6-RPE-512-stage2']


epoch,train_loss,valid_loss,accuracy,time
0,2.285493,1.472392,66.752579,26:21
1,2.130054,1.483425,67.203606,26:01


KeyboardInterrupt: 

## Inference and Submission Generation

In [80]:
def generate_submission(preds_score, name='submission-effB4-384-singlesite.csv'):
    """Write a submission CSV from per-class scores and return the frame.

    Args:
        preds_score: 2-D scores, one row per line of sample_submission.csv, as a
            numpy array or a torch tensor; the arg-max over the last axis
            becomes the predicted `sirna`.
        name: output CSV path.

    Returns:
        The sample-submission frame with its 'sirna' column replaced.

    Raises:
        ValueError: if the number of score rows differs from the number of
            rows in sample_submission.csv.
    """
    submission_df = pd.read_csv(f'{BASE_DIR}/sample_submission.csv')
    if isinstance(preds_score, np.ndarray):
        result = preds_score.argmax(axis=-1)
    else:
        # Move to host memory first so CUDA tensors are accepted as well.
        result = preds_score.argmax(dim=-1).cpu().numpy()
    if len(result) != len(submission_df):
        raise ValueError(
            f'got {len(result)} predictions for {len(submission_df)} submission rows')
    # Item assignment: attribute assignment would silently create a plain
    # attribute if the column were absent.
    submission_df['sirna'] = result.astype(int)
    submission_df.to_csv(name,index=False)
    print(f'write to submission_df {name}')
    return submission_df
test_df = pd.read_csv(f'{BASE_DIR}/test.csv')
test_df['exp'] = test_df['id_code'].apply(lambda x: x.split('-')[0])

###  all data

In [16]:
proc_train_all_df = generate_df(train_df, sample_num=[1,2], exp=None, test_size=0.1)

image_size=512
tfms = get_transforms(do_flip=True, flip_vert=True,
                      max_lighting=0.2, p_lighting=0.5, 
                      max_warp=0.05, max_rotate=20., max_zoom=1.1)

data = (MultiChannelImageList.from_df(df=proc_train_all_df,path='../DATA/kaggle-2019Q3-cellular/train/')
        .split_from_df(col=2)
        .label_from_df()
        .transform(tfms,size=image_size)
        .databunch(bs=96,num_workers=8)
        .normalize(stats)
       )


In [17]:
preds_mean_df = test_df.copy(deep=True)

In [21]:
# Take an explicit copy so the assignment below targets an independent frame
# rather than a slice of `preds_mean_df` (avoids SettingWithCopyWarning).
a_df = preds_mean_df.loc[preds_mean_df['exp'] == exp_list[0]].copy()
a_df['sirna'] = 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(4429, 3)

In [30]:
exp = exp_list[0]
pth_name = f'effnetb6-{exp}-512-stage2'
print(f'loading {pth_name}')
learn = Learner(data, 
                effnetb6, opt_func=optar,
                metrics = [accuracy], 
                model_dir='./fastaimodels').to_fp16()

learn.path = Path('./effb6-4exps-gce')
learn.load(pth_name)
print('model loaded', flush=True)
proc_test_df = generate_df(test_df.copy(), sample_num=[1])
data_test = MultiChannelImageList.from_df(df=proc_test_df[proc_test_df['path'].str.startswith(exp)],
                                          path=f'{BASE_DIR}/test/')
learn.data.add_test(data_test)
print(f'{pth_name}   site 1 ')
preds, _ = learn.get_preds(DatasetType.Test)

In [52]:

preds_mean_all = np.concatenate([preds_mean_all, preds_mean])
print(len(preds_mean_all), len(preds_mean))
preds_mean_all = np.concatenate([preds_mean_all, preds_mean])
print(len(preds_mean_all), len(preds_mean))

4429 4429
8858 4429


In [42]:
(proc_test_df['path'].apply(lambda x: x.split('/')[0]) != test_df['experiment']).sum()

0

In [60]:

import gc
# Per-experiment inference: for every cell line, load that cell line's stage-2
# checkpoint, predict site 1 and site 2 of its test images, average the two
# score matrices and append the result to `preds_mean_all`.
learn = Learner(data, 
                effnetb6, opt_func=optar,
                metrics = [accuracy], 
                model_dir='./fastaimodels').to_fp16()

learn.path = Path('./effb6-4exps-gce')
# Accumulator with zero rows and one column per class (1108).
preds_mean_all = np.ndarray((0,1108))
# Row labels / paths of the test items in the order they were predicted.
index_list = []
index_path_list = []
for i, exp in enumerate(exp_list):
    pth_name = f'effnetb6-{exp}-512-stage2'
    print(f'loading {pth_name}')

    learn.load(pth_name)
    print('model loaded', flush=True)
    # Site-1 paths for the whole test set; `idx` selects the rows whose path
    # starts with this experiment's name.
    proc_test_df = generate_df(test_df.copy(), sample_num=[1])
    idx = proc_test_df[proc_test_df['path'].str.startswith(exp)].index.tolist()
    data_test = MultiChannelImageList.from_df(df=proc_test_df.loc[idx],
                                              path=f'{BASE_DIR}/test/')
    learn.data.add_test(data_test)
    print(f'{i} {pth_name}   site 1 ')
    preds, _ = learn.get_preds(DatasetType.Test)
    
    # Same rows, site-2 images.
    proc_test_df = generate_df(test_df.copy(), sample_num=[2])
    data_test = MultiChannelImageList.from_df(df=proc_test_df.loc[idx],
                                              path=f'{BASE_DIR}/test/')
    learn.data.add_test(data_test)
    print(f'{i} {pth_name}   site 2 ')
    preds2, _ = learn.get_preds(DatasetType.Test)
    # Average the two sites' scores.
    preds_mean = (preds.numpy() + preds2.numpy())/2.
    # NOTE(review): rows are appended in `exp_list` order; downstream code that
    # pairs them with test.csv rows by position presumably relies on test.csv
    # being grouped in that same order - confirm.
    preds_mean_all = np.concatenate([preds_mean_all, preds_mean])
    index_list = index_list + idx
    index_path_list = index_path_list + proc_test_df.loc[idx, 'path'].tolist()
    print(preds_mean_all.shape, len(index_list), preds_mean.shape)
    

(19897, 1108) 19897


In [46]:
## local CV: 0.46
sub_2site_df = generate_submission(preds_mean, name='submission-effB4-384-doublesite.csv')

sub_2site_df.head(10)

Unnamed: 0,id_code,sirna
0,HEPG2-08_1_B03,468
1,HEPG2-08_1_B04,7
2,HEPG2-08_1_B05,1091
3,HEPG2-08_1_B06,908
4,HEPG2-08_1_B07,135
5,HEPG2-08_1_B08,981
6,HEPG2-08_1_B09,118
7,HEPG2-08_1_B10,638
8,HEPG2-08_1_B11,807
9,HEPG2-08_1_B12,778


In [36]:
## local CV: 0.59
sub_2site_df = generate_submission(preds_mean, name='submission-effB6-512-doublesite.csv')
sub_2site_df.head(10)


write to submission_df submission-effB6-512-doublesite.csv


Unnamed: 0,id_code,sirna
0,HEPG2-08_1_B03,381
1,HEPG2-08_1_B04,637
2,HEPG2-08_1_B05,506
3,HEPG2-08_1_B06,706
4,HEPG2-08_1_B07,43
5,HEPG2-08_1_B08,37
6,HEPG2-08_1_B09,430
7,HEPG2-08_1_B10,638
8,HEPG2-08_1_B11,154
9,HEPG2-08_1_B12,1064


In [None]:
submission_df = pd.read_csv(f'{BASE_DIR}/sample_submission.csv')
print((preds_mean_sum['id_code'] != submission_df['id_code']).sum())
print((preds_mean_sum['id_code'] != preds_mean_df['id_code']).sum())


In [77]:
res = preds_mean_all.argmax(axis=-1)
type(preds_mean_all) is np.ndarray

True

In [81]:
## local CV: 0.59
sub_4exp2site_df = generate_submission(preds_mean_all, name='submission-effB6-512-4exps-doublesite.csv')
sub_4exp2site_df.head(10)


write to submission_df submission-effB6-512-4exps-doublesite.csv


Unnamed: 0,id_code,sirna
0,HEPG2-08_1_B03,409
1,HEPG2-08_1_B04,68
2,HEPG2-08_1_B05,506
3,HEPG2-08_1_B06,706
4,HEPG2-08_1_B07,445
5,HEPG2-08_1_B08,37
6,HEPG2-08_1_B09,430
7,HEPG2-08_1_B10,638
8,HEPG2-08_1_B11,154
9,HEPG2-08_1_B12,1064


## self-boost

In [142]:
# sub_df = pd.read_csv('submission-431.csv')
sub_df = pd.read_csv('submission-effB6-512-doublesite.boosted431.csv')
sub_df.head()
sub_ori_df = sub_df.copy(deep=True)

In [143]:
# plate_groups[s, 0:3]: the three distinct plate numbers on which sirna `s`
# occurs in train_df (in value_counts order).
# plate_groups[s, 3]: 10 minus their sum, i.e. the value that makes the row sum
# to 10 (presumably the fourth plate when plates are numbered 1-4 - confirm).
plate_groups = np.zeros((1108,4), int)
for sirna in range(1108):
    grp = train_df.loc[train_df.sirna==sirna,:].plate.value_counts().index.values
    assert len(grp) == 3
    plate_groups[sirna,0:3] = grp
    plate_groups[sirna,3] = 10 - grp.sum()
    


all_test_exp = test_df.experiment.unique()
# group_plate_probs[e, j]: fraction of test rows of experiment e whose current
# predicted sirna (from sub_df) has plate_groups[sirna, j] equal to the row's plate.
group_plate_probs = np.zeros((len(all_test_exp),4))
for idx in range(len(all_test_exp)):
    # Rows of sub_df are selected with a mask built from test_df, so both
    # frames must share the same index.
    preds = sub_df.loc[test_df.experiment == all_test_exp[idx],'sirna'].values
    # One-hot encoding of the current predictions.
    pp_mult = np.zeros((len(preds),1108))
    pp_mult[range(len(preds)),preds] = 1
    
    sub_test = test_df.loc[test_df.experiment == all_test_exp[idx],:]
    assert len(pp_mult) == len(sub_test)
    
    for j in range(4):
        # mask[r, s] is True where column j of plate_groups for sirna s matches row r's plate.
        mask = np.repeat(plate_groups[np.newaxis, :, j], len(pp_mult), axis=0) == \
               np.repeat(sub_test.plate.values[:, np.newaxis], 1108, axis=1)
        
        group_plate_probs[idx,j] = np.array(pp_mult)[mask].sum()/len(pp_mult)

# For each test experiment, the plate_groups column that agrees most often with the predictions.
exp_to_group = group_plate_probs.argmax(1)
print(exp_to_group)


def select_plate_group(pp_mult, idx, 
                       test_df=test_df, all_test_exp=all_test_exp, plate_groups=plate_groups,
                       exp_to_group=exp_to_group):
    """Zero the scores of every sirna that is not assigned to the row's plate.

    Args:
        pp_mult: (rows, n_sirna) score matrix for the test rows of experiment
            ``all_test_exp[idx]``; modified in place.
        idx: position of the experiment in `all_test_exp`.
        test_df: test metadata with 'experiment' and 'plate' columns.
        all_test_exp: array of test experiment names.
        plate_groups: (n_sirna, groups) table of plate numbers per sirna.
        exp_to_group: for each experiment, the `plate_groups` column to use.

    Returns:
        The same `pp_mult` array, with entries set to 0 wherever the selected
        `plate_groups` column for that sirna differs from the row's plate.
    """
    sub_test = test_df.loc[test_df.experiment == all_test_exp[idx],:]
    assert len(pp_mult) == len(sub_test)
    # Broadcasting (1, n_sirna) against (rows, 1) gives the (rows, n_sirna)
    # mask directly, without a hard-coded 1108 or repeated arrays.
    sirna_plate = plate_groups[:, exp_to_group[idx]][np.newaxis, :]
    row_plate = sub_test.plate.values[:, np.newaxis]
    mask = sirna_plate != row_plate
    pp_mult[mask] = 0
    return pp_mult

[3 1 0 0 0 0 2 2 3 0 0 3 1 0 0 0 2 3]


In [144]:
(sub_df['sirna'] != sub_ori_df['sirna']).sum()

0

In [145]:
def apply_boost(sub_df, preds_mean, 
                all_test_exp=all_test_exp, test_df=test_df, 
                plate_groups=plate_groups, exp_to_group=exp_to_group):
    """Re-pick each test row's sirna after masking scores with `select_plate_group`.

    Args:
        sub_df: submission frame with a 'sirna' column, row-aligned with `test_df`.
        preds_mean: (rows, n_sirna) scores as a numpy array or torch tensor,
            row-aligned with `test_df`.
        all_test_exp, test_df, plate_groups, exp_to_group: forwarded to
            `select_plate_group`.

    Returns:
        A new frame (copy of `sub_df`) whose 'sirna' column holds the arg-max of
        the masked scores. The input frame is left untouched, so results of
        successive calls no longer alias one another.
    """
    if not isinstance(preds_mean, np.ndarray):
        preds_mean = preds_mean.cpu().numpy()

    boosted_df = sub_df.copy()
    for idx in range(len(all_test_exp)):
        #print('Experiment', idx)
        indices = (test_df.experiment == all_test_exp[idx])

        # Index numpy with a plain boolean array, and copy the slice because
        # select_plate_group zeroes entries in place.
        preds_slice = preds_mean[indices.values,:].copy()

        preds_slice = select_plate_group(preds_slice, idx, 
                                         test_df=test_df, all_test_exp=all_test_exp, plate_groups=plate_groups,
                                         exp_to_group=exp_to_group)
        boosted_df.loc[indices,'sirna'] = preds_slice.argmax(1)

    return boosted_df

In [54]:
sub_df = apply_boost(sub_df, preds_mean)
sub_df.to_csv('submission-effB6-512-doublesite.boosted431.csv',index=False)
sub_df.head()

In [135]:
sub431_df = apply_boost(sub_df, preds_mean_all)
sub431_df.to_csv('submission-effB6-512-4exp-doublesite.boosted431.csv',index=False)
print((sub_ori_df['sirna'] != sub431_df['sirna']).mean())
sub431_df.head()

0.4306679398904357


Unnamed: 0,id_code,sirna
0,HEPG2-08_1_B03,141
1,HEPG2-08_1_B04,68
2,HEPG2-08_1_B05,836
3,HEPG2-08_1_B06,609
4,HEPG2-08_1_B07,1055


0.4306679398904357

In [146]:
sub527_df = apply_boost(sub_df, preds_mean_all)
sub527_df.to_csv('submission-effB6-512-4exp-doublesite.boosted527.csv',index=False)
# Fraction of rows changed relative to the loaded submission. Compare the frame
# produced in this cell (the original compared `sub431_df` by mistake).
print((sub_ori_df['sirna'] != sub527_df['sirna']).mean())
sub527_df.head()

0.24722319947730814


Unnamed: 0,id_code,sirna
0,HEPG2-08_1_B03,141
1,HEPG2-08_1_B04,68
2,HEPG2-08_1_B05,836
3,HEPG2-08_1_B06,609
4,HEPG2-08_1_B07,1055


In [141]:
print((sub_ori_df['sirna'] != sub527_df['sirna']).mean())

0.22445594813288436


In [90]:
type(preds_mean_all) is not np.ndarray

False