In [1]:
import copy
import itertools
import time
from pathlib import Path
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import torchvision
from torchvision import datasets, models, transforms

from multiprocessing import cpu_count
from multiprocessing.dummy import Pool
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer

from PIL import Image

# Plotting setup: import matplotlib/pyplot, push the figure defaults below into
# rcParams, then call seaborn's set().
import matplotlib as mpl
from matplotlib import pyplot as plt

mpl_params = dict([
    ('figure.figsize', (10, 5)),
    ('figure.dpi', 300),
])
mpl.rcParams.update(mpl_params)

import seaborn as sns
sns.set()

In [2]:
# Dataset locations and metadata tables.
# These assignments were commented out, but later cells read TRAIN_DIR,
# TEST_DIR, train_df and test_df, so a fresh "Restart & Run All" failed with
# NameError. Adjust DATA_DIR if the competition data lives elsewhere.
DATA_DIR = Path('../input/human-protein-atlas-image-classification/')
TRAIN_DIR = DATA_DIR / 'train'
TEST_DIR = DATA_DIR / 'test'

train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'sample_submission.csv')

In [3]:
# Directory that receives the compressed .npz archives written by
# ProteinDataset; created (with parents) if missing, left alone if present.
TEST_PROCESSED = Path('test_processed')
os.makedirs(TEST_PROCESSED, exist_ok=True)

In [4]:
# Use the first CUDA device when CUDA is available, otherwise the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [5]:
# Class names in class-index order: position i holds the name of class i.
LABEL_NAMES = [
    'Nucleoplasm',
    'Nuclear membrane',
    'Nucleoli',
    'Nucleoli fibrillar center',
    'Nuclear speckles',
    'Nuclear bodies',
    'Endoplasmic reticulum',
    'Golgi apparatus',
    'Peroxisomes',
    'Endosomes',
    'Lysosomes',
    'Intermediate filaments',
    'Actin filaments',
    'Focal adhesion sites',
    'Microtubules',
    'Microtubule ends',
    'Cytokinetic bridge',
    'Mitotic spindle',
    'Microtubule organizing center',
    'Centrosome',
    'Lipid droplets',
    'Plasma membrane',
    'Cell junctions',
    'Mitochondria',
    'Aggresome',
    'Cytosol',
    'Cytoplasmic bodies',
    'Rods & rings',
]

# Integer class index -> class name, for all 28 classes.
LABELS = dict(enumerate(LABEL_NAMES))

In [6]:
class ProteinDataset(Dataset):
    """Dataset that merges each sample's four channel PNGs into one RGB array.

    For row ``key`` of ``df`` the files ``{Id}_{color}.png`` (red, green, blue,
    yellow) are read from ``images_dir``. The yellow channel is folded at half
    weight into red and green, and blue is halved, giving an ``(H, W, 3)``
    array that is optionally passed through ``transform``.

    In its current form the class works as a preprocessing pass: every
    ``__getitem__`` call appends the processed image to ``self.stack`` and
    returns ``None``. When the stack holds exactly half as many items as
    ``df`` has rows, it is written to a compressed ``.npz`` under
    ``TEST_PROCESSED`` and cleared.

    Args:
        df: DataFrame with an ``Id`` column and, optionally, a ``Target``
            column of space-separated integer class ids.
        images_dir: Directory (path-like supporting ``/``) holding the PNGs.
        transform: Optional callable applied to the stacked RGB array.
    """

    def __init__(self, df, images_dir, transform=None):
        self.df = df.copy()
        self._dir = images_dir
        self.transform = transform
        self.p = Pool(1)
        # `classes` must be passed by keyword: current scikit-learn rejects it
        # as a positional argument. The class list is fixed, so fit once here;
        # this also makes `inverse_transform` usable on datasets without targets.
        self.mlb = MultiLabelBinarizer(classes=list(range(len(LABELS))))
        self.mlb.fit([[]])
        # Simple profiling counters (seconds spent in each stage).
        self.count = 0
        self.total_load = 0
        self.total_stack = 0
        self.total_transform = 0
        self.colors = ['red', 'green', 'blue', 'yellow']
        self.cache_size = len(self.df)
        self.latest = 0
        self.stack = []
        # Running index used to name the archives written from __getitem__.
        self.save = iter(range(32))

        # Optional prefetch cache: index -> async result exposing .get().
        # Nothing is prefetched at construction time.
        self.cache = {}

    def __len__(self):
        """Return the number of rows in the dataframe."""
        return len(self.df)

    def mp_load(self, path):
        """Open ``path`` with PIL and return its pixels as a uint8 array."""
        pil_im = Image.open(path)
        return np.array(pil_im, np.uint8)

    def __getitem__(self, key):
        """Process sample ``key``, append it to ``self.stack`` and return None.

        Raises whatever ``df.iloc[key]`` raises for an out-of-range ``key``.
        """
        self.count += 1
        row = self.df.iloc[key]
        id_ = row.Id

        image_paths = [self._dir / f'{id_}_{c}.png' for c in self.colors]
        t1 = time.time()
        if key in self.cache:
            r, g, b, yellow = self.cache.pop(key).get()
        else:
            r, g, b, yellow = self.p.map(self.mp_load, image_paths)
        self.total_load += time.time() - t1

        t1 = time.time()
        # Halve each term before adding so the uint8 sums cannot overflow.
        rgb = np.stack([
            r // 2 + yellow // 2,
            g // 2 + yellow // 2,
            b // 2
        ], axis=2)
        self.total_stack += time.time() - t1

        # Multi-hot target vector; kept for the intended
        # (np.array(X), target, f'{id_}.png') return value, currently unused.
        target = []
        if 'Target' in self.df:
            label_ids = [int(tok) for tok in row.Target.split(' ')]
            target = self.mlb.transform([label_ids]).squeeze()

        # Use the transform given to this instance, not a notebook global.
        if self.transform is not None:
            t1 = time.time()
            X = self.transform(rgb)
            self.total_transform += time.time() - t1
        else:
            X = rgb

        self.stack.append(np.array(X))

        # Flush once exactly half of the dataset has accumulated
        # (integer form of len(stack) == len(df) / 2).
        if 2 * len(self.stack) == len(self.df):
            np.savez_compressed(TEST_PROCESSED / f'{next(self.save)}-processed.npz', *self.stack)
            self.stack = []

        return None

In [7]:
# Per-image preprocessing pipeline handed to the datasets.
transform = transforms.Compose(transforms=[
    # Convert the incoming array to a PIL image so it can be resized.
    transforms.ToPILImage(),
    # (299, 299) InceptionV3 input
    transforms.Resize(size=(299, 299)),
    # To Tensor dtype and convert [0, 255] uint8 to [0, 1] float
    transforms.ToTensor(),
    # Standard image normalization
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

In [9]:
%%time
train_ds = ProteinDataset(
    df=train_df,
    images_dir=TRAIN_DIR,
    transform=transform
)

CPU times: user 2.16 ms, sys: 1.4 ms, total: 3.56 ms
Wall time: 2.38 ms


In [10]:
# Loader over train_ds: one sample per batch, fixed order, loading in the main process.
train_dl = DataLoader(dataset=train_ds, batch_size=1, shuffle=False, num_workers=0)

In [9]:
# Walk test_ds once purely for its side effects (each access processes and
# stacks one image); print the item count and elapsed seconds every 512 items.
count = 0
t1 = time.time()
for count, _ in enumerate(test_ds, start=1):
    if count % 512 == 0:
        print(count)
        print(time.time() - t1)

512
10.592767000198364
1024
21.134756803512573
1536
31.707923889160156
2048
42.11814570426941
2560
52.578545808792114
3072
62.97861194610596
3584
73.34701895713806
4096
83.82030177116394
4608
94.37246584892273
5120
104.96391201019287
5632
115.53500986099243
6144
295.10383892059326
6656
306.67052388191223
7168
317.6403658390045
7680
328.4414279460907
8192
340.0331027507782
8704
350.800724029541
9216
361.57620787620544
9728
372.99183082580566
10240
384.1622977256775
10752
395.59630393981934
11264
406.91136479377747


In [13]:
%%time
# Timing experiment: write the first 1024 entries of train_ds.stack, as
# positional arrays, to a single compressed .npz archive.
# NOTE(review): `PROCESSED` is not assigned in any cell visible in this
# notebook (only TEST_PROCESSED is) — confirm the intended output directory.
np.savez_compressed(PROCESSED / 'proc.npz', *train_ds.stack[:1024])

CPU times: user 31.4 s, sys: 235 ms, total: 31.7 s
Wall time: 31.8 s


In [10]:
# Sanity check: number of rows in the training dataframe.
len(train_df)

31072

In [11]:
# Sanity check: number of processed images currently held in train_ds.stack.
# NOTE(review): ProteinDataset clears the stack whenever it reaches half the
# dataframe length, so the recorded output may come from an earlier version
# of the class — confirm.
len(train_ds.stack)

31072

In [None]:
# Write every stacked item to its own '<position>.pth' file in the working directory.
for index, item in enumerate(train_ds.stack):
    torch.save(item, f'{index}.pth')

In [None]:
# '{total_load:.4f}, {total_stack:.4f}, {total_transform:.4f}'.format(**train_ds.__dict__)

In [None]:
# InceptionV3 with pretrained weights; input re-scaling inside the model is disabled.
model = models.inception_v3(pretrained=True, transform_input=False)

In [None]:
class FocalLoss(nn.Module):
    """Focal-style binary cross-entropy evaluated on raw logits.

    Each element's cross-entropy term is multiplied by
    ``exp(gamma * logsigmoid(-input * (2 * target - 1)))``. The weighted terms
    are summed along dim 1 and those sums are averaged.

    Args:
        gamma: Exponent of the modulating factor (default 2).
    """

    def __init__(self, gamma=2):
        super().__init__()
        self.gamma = gamma

    def forward(self, input, target):
        """Return the scalar loss for ``input`` logits and same-shaped ``target``.

        Raises:
            ValueError: If ``input`` and ``target`` differ in size.
        """
        if target.size() != input.size():
            raise ValueError("Target size ({}) must be the same as input size ({})"
                             .format(target.size(), input.size()))

        # Shift by max(-input, 0) before exponentiating (log-sum-exp form).
        max_val = torch.clamp(-input, min=0)
        log_term = torch.log(torch.exp(-max_val) + torch.exp(-input - max_val))
        bce = input - input * target + max_val + log_term

        # Map targets from {0, 1} to {-1, +1} for the modulating factor.
        signed_target = target * 2.0 - 1.0
        modulator = torch.exp(F.logsigmoid(-input * signed_target) * self.gamma)

        return (modulator * bce).sum(dim=1).mean()

In [None]:
# List every parameter of the model together with its requires_grad flag.
for param_name, parameter in model.named_parameters():
    print(param_name, parameter.requires_grad)

In [None]:
# Leave only parameters whose names start with Mixed_6 / Mixed_7 trainable;
# every other existing parameter is frozen.
for name, param in model.named_parameters():
    param.requires_grad = name.startswith(('Mixed_7', 'Mixed_6'))

# Swap the classifier for a new linear layer with one output per label.
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=len(LABELS))
model.to(device)

# Alternatives tried earlier: FocalLoss(), nn.MultiLabelMarginLoss()
criterion = nn.BCEWithLogitsLoss()

# Optimise only the parameters that still require gradients.
optimizer = optim.Adam(
    [p for p in model.parameters() if p.requires_grad],
    lr=0.001,
)

# exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

In [None]:
# Training loop: one pass over train_dl per epoch, logging the batch loss and
# elapsed epoch time whenever the running sample count is a multiple of 128.
# NOTE(review): N_EPOCHS is not assigned in any cell visible in this notebook;
# define it in a configuration cell before running.
for epoch in range(N_EPOCHS):
    ts_start = time.time()
    print(f'Epoch {epoch+1}/{N_EPOCHS}')
    model.train()
    
    running_loss, correct, count = 0.0, 0, 0
    for X, y in train_dl:
        # Count the samples actually in this batch. The previous
        # `count += BATCH_SIZE` relied on a global that is not defined in the
        # visible cells and that disagrees with train_dl's batch_size=1.
        count += X.shape[0]
        X, y = X.to(device, dtype=torch.float), y.to(device, dtype=torch.float)
        
        # zero the parameter gradients
        optimizer.zero_grad()
        
        with torch.set_grad_enabled(True):
            # In this loop the model output unpacks into two values; only the
            # first is used for the loss.
            y_, _ = model(X)
            loss = criterion(y_, y)

        loss.backward()
        optimizer.step()
        
        if count % 128 == 0:
            print(f'    batch loss: {loss.item():0.3f}')
            print(f'    epoch time: {time.time() - ts_start:0.3f}')
        
#         y_label_ = (sigmoid(y_) > .5).float()
#         correct += (y_label_ == y).sum().item()
#         running_loss += loss.item() * X.shape[0]
    
#     print(f"  Train Loss: {running_loss / len(train_dl.dataset)}")
#     print(f"  Train Acc:  {correct / len(train_dl.dataset)}")

#     # Eval
#     model.eval()  # IMPORTANT
    
#     running_loss, correct = 0.0, 0.0
#     with torch.no_grad():  # IMPORTANT
#         for X, y in val_dl:
#             X, y = X.to(device), y.to(device)
                    
#             y_ = model(X)
        
#             y_label_ = (y_ > .5).float()
#             correct += (y_label_ == y).sum().item()
            
#             loss = criterion(y_, y.squeeze())
#             running_loss += loss.item() * X.shape[0]
    
#     print(f"  Valid Loss: {running_loss / len(val_dl.dataset)}")
#     print(f"  Valid Acc:  {correct / len(val_dl.dataset)}")
#     print()

In [None]:
import sys

# Names that IPython puts in the namespace itself (plus this list); they are
# left out of the report below.
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# (name, sys.getsizeof(value)) for every remaining public global that is not
# an imported module name, largest first.
sorted(
    [
        (name, sys.getsizeof(globals().get(name)))
        for name in dir()
        if not (name.startswith('_') or name in sys.modules or name in ipython_vars)
    ],
    key=lambda pair: pair[1],
    reverse=True,
)

In [8]:
# Dataset over the test dataframe and image directory, with the shared transform.
test_ds = ProteinDataset(test_df, TEST_DIR, transform=transform)

# Loader over test_ds in fixed order, loading in the main process.
test_dl = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
# Switch that gates building the submission dataframe and writing its CSV.
SUBMISSION_RUN = True

In [None]:
# Element-wise sigmoid module, used to turn the model's logits into probabilities.
sigmoid = torch.nn.Sigmoid()

In [None]:
# Inference: run the model in eval mode over test_dl with gradients disabled,
# collecting one row of sigmoid probabilities per sample in `ys`. The elapsed
# time in seconds is printed after every batch.
model.eval()

ys = []
t1 = time.time()
with torch.no_grad():
    for X, _ in test_dl:
        X = X.to(device, dtype=torch.float)

        # logits -> probabilities -> NumPy array on the CPU
        y_ = sigmoid(model(X)).cpu().numpy()

        ys.extend(y_)
        print(time.time() - t1)

In [None]:
# Stack the per-sample probability rows along a new leading axis.
ys_stack = np.stack(ys, axis=0)

In [None]:
# Build the submission: for every row of ys_stack, keep the class indices
# whose probability exceeds 0.4, falling back to the single most probable
# class when none does, and join them with spaces.
#
# Labels are read straight from the column indices (the classes are the
# integers 0..len(LABELS)-1). The previous version went through
# train_ds.mlb.inverse_transform, which needs train_ds to exist and its
# binarizer to have been fitted by an earlier pass over targets.
if SUBMISSION_RUN:
    submission = test_df.copy()
    Predicted = []
    for probs in ys_stack:
        label_ids = np.flatnonzero(probs > .4)
        if label_ids.size == 0:
            label_ids = [np.argmax(probs)]
        Predicted.append(' '.join(str(label) for label in label_ids))

    submission['Predicted'] = Predicted

In [None]:
# Preview the first rows of the submission (only exists when SUBMISSION_RUN was true).
submission.head()

In [None]:
# Write the submission table to CSV without the index column.
if SUBMISSION_RUN:
    submission.to_csv(path_or_buf='protein_classification.csv', index=False)

In [None]:
# Persist the model's state dict to 'model.pth'.
torch.save(obj=model.state_dict(), f='model.pth')

In [None]:
# Persist the stacked test probabilities to 'ys_stack.npy'.
np.save(file='ys_stack.npy', arr=ys_stack)