In [1]:
#export
from collections import OrderedDict
import os
import re
from pdb import set_trace
from multiprocessing import cpu_count
from pprint import pprint as pp
import warnings
warnings.filterwarnings('ignore')

from imageio import imread
import numpy as np
import pandas as pd
import PIL.Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision
import torchvision.transforms as T

from catalyst.contrib.schedulers import OneCycleLR
from catalyst.data.dataset import ListDataset
from catalyst.dl.callbacks import AccuracyCallback, AUCCallback, F1ScoreCallback
from catalyst.dl.runner import SupervisedRunner
import pretrainedmodels
from jupytools import auto_set_trace

W0829 23:01:10.188656 140535679878976 compression.py:14] lz4 not available, disabling compression. To install lz4, run `pip install lz4`.


In [2]:
#export
# Single seed shared by every stochastic step of the notebook.
seed = 1
# Rebind set_trace (first imported from pdb) to whatever jupytools selects.
set_trace = auto_set_trace()
# Expose only the first GPU; this has to be in place before CUDA is initialised.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Actually apply the seed: without this, weight initialisation and batch
# shuffling differ on every kernel restart even though `seed` is defined.
torch.manual_seed(seed)
np.random.seed(seed)

Version of set_trace(): ipdb


In [3]:
#export
def list_files(folder):
    """Return the paths of all entries directly inside `folder`, sorted by name.

    `folder` may start with `~`, which is expanded to the user's home
    directory. The entries are sorted because `os.listdir` returns them in
    arbitrary order, which would make any seeded split of the result differ
    between machines or runs.

    Raises whatever `os.listdir` raises (e.g. FileNotFoundError) when the
    folder cannot be listed.
    """
    dirname = os.path.expanduser(folder)
    return [os.path.join(dirname, name) for name in sorted(os.listdir(dirname))]

In [4]:
#export
def extract_labels(files):
    """Parse the integer label encoded at the end of each file name.

    A name is expected to end with `_<digits>.png`; the digits are the label.
    Only the base name of each path is inspected.

    Args:
        files: iterable of file paths.

    Returns:
        List of integer labels, in the same order as `files`.

    Raises:
        ValueError: if a file name does not follow the expected pattern.
    """
    # Raw string: '\d' inside a regular string literal is an invalid escape.
    regex = re.compile(r'.*_(\d+)\.png$')
    labels = []
    for fn in files:
        found = regex.match(os.path.basename(fn))
        if found is None:
            raise ValueError(
                'cannot extract a label from file name: {!r}'.format(fn))
        labels.append(int(found.group(1)))
    return labels

In [5]:
#export
# Labels parsed from every training file name, duplicates included.
train_label_ids = extract_labels(list_files('~/data/protein/tmp/train'))
# Distinct labels; their count is used as the size of the classifier output.
# NOTE(review): downstream one-hot encoding indexes by the raw label, which
# presumably requires labels to be 0..num_classes-1 — confirm.
uniq_labels = np.unique(train_label_ids)
num_classes = len(uniq_labels)

In [6]:
#export
from typing import List, Dict, Callable, Any

class PathsDataset(ListDataset):
    """
    Dataset that derives features and targets from samples filesystem paths.

    Every path in `filenames` becomes one record of the form
    ``{'features': <path>, 'targets': get_label_fn(<path>)}``; the list of
    records is then handed to `ListDataset` together with `open_fn`.

    Args:
        filenames: paths of the sample files.
        open_fn: callable forwarded to `ListDataset`; presumably invoked on a
            record when the sample is fetched — confirm against catalyst.
        get_label_fn: callable mapping a path to its target value. It is
            evaluated eagerly for every path at construction time.
        **list_dataset_params: extra keyword arguments forwarded unchanged to
            `ListDataset.__init__` (e.g. `dict_transform`).
    """
    def __init__(
        self,
        filenames: List[str],
        open_fn: Callable,
        get_label_fn: Callable,
        **list_dataset_params
    ):
        list_data = [
            {'features': filename,
             'targets': get_label_fn(filename)}
            for filename in filenames]

        super().__init__(
            list_data=list_data,
            open_fn=open_fn,
            **list_dataset_params
        )
        
class RegexLabelExtractor:
    """Callable that reads an integer label out of a file name.

    Args:
        regex: pattern matched against the base name of the file; its first
            capture group must contain the digits of the label.
    """
    def __init__(self, regex):
        self.regex = re.compile(regex)

    def __call__(self, filename):
        """Return the label of `filename` as an int.

        Raises:
            ValueError: if the base name does not match the pattern.
        """
        found = self.regex.match(os.path.basename(filename))
        if found is None:
            # Without this check the failure is an opaque AttributeError on None.
            raise ValueError(
                'file name {!r} does not match pattern {!r}'.format(
                    filename, self.regex.pattern))
        return int(found.group(1))
    
class ImageTransformer:
    """Applies `image_tr` to the 'features' entry of a sample dict.

    The incoming dict is left untouched; a shallow copy carrying the
    transformed 'features' value is returned.
    """

    def __init__(self, image_tr: Callable):
        self.image_tr = image_tr

    def __call__(self, dict_):
        sample = dict_.copy()
        sample.update(features=self.image_tr(sample['features']))
        return sample
    
class OneHotTransformer:
    """Adds a one-hot encoding of the 'targets' entry to a sample dict.

    Args:
        num_classes: length of the one-hot vector.
    """
    def __init__(self, num_classes):
        self.num_classes = num_classes

    def __call__(self, dict_):
        """Return a copy of `dict_` with an extra 'targets_one_hot' entry.

        The entry is a float32 vector of length `num_classes` holding 1 at
        index `dict_['targets']` and 0 elsewhere. The input dict is not
        modified, matching the other transforms in this file.

        Raises:
            IndexError: if the target is outside ``[0, num_classes)``.
        """
        y = dict_['targets']
        # Reject negative labels explicitly: numpy would silently wrap them
        # around and mark the wrong class.
        if not 0 <= y < self.num_classes:
            raise IndexError(
                'target {!r} is outside [0, {})'.format(y, self.num_classes))
        onehot = np.zeros(self.num_classes, dtype=np.float32)
        onehot[y] = 1
        dict_ = dict_.copy()
        dict_['targets_one_hot'] = onehot
        return dict_
    
class TransformerList:
    """Chains several sample transforms into a single callable.

    Args:
        transforms: iterable of callables; each receives the output of the
            previous one.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, dict_):
        # Feed the sample through every transform in order; with no
        # transforms the input is returned as is.
        result = dict_
        for transform in self.transforms:
            result = transform(result)
        return result
        
def open_image(dict_):
    """Replace the path stored under 'features' with the loaded PIL image.

    Args:
        dict_: sample dict whose 'features' entry is an image file path.

    Returns:
        A shallow copy of `dict_` in which 'features' is an in-memory
        `PIL.Image.Image`; the input dict is not modified.
    """
    dict_ = dict_.copy()
    # Image.open is lazy and keeps the file open. copy() forces the pixel data
    # to be read, and the with-block then closes the file deterministically
    # instead of leaving that to garbage collection.
    with PIL.Image.open(dict_['features']) as image_file:
        dict_['features'] = image_file.copy()
    return dict_

# Backbone name and the preprocessing settings published for its ImageNet weights.
model_name = 'resnet50'
params = pretrainedmodels.pretrained_settings[model_name]['imagenet']
pp(params)

# Image preprocessing: fixed-size resize, tensor conversion, then
# normalisation with the mean/std taken from the pretrained settings.
image_pipeline = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(params['mean'], params['std'])
])

# Full per-sample transform: preprocess the image, then add the one-hot target.
transformers = TransformerList([
    ImageTransformer(image_pipeline),
    OneHotTransformer(num_classes),
])

# Sorted so that the seeded split below selects the same files on every run;
# directory listing order is otherwise arbitrary.
labelled_files = sorted(list_files('~/data/protein/tmp/train'))

from sklearn.model_selection import train_test_split
# Hold out 10% of the labelled files for validation.
trn_files, tst_files = train_test_split(labelled_files, test_size=0.1, random_state=seed)
# Raw string: '\d' inside a regular string literal is an invalid escape.
regex_label = RegexLabelExtractor(r'.*_(\d+)\.png$')

# Training and validation datasets differ only in the list of files.
shared_ds_kwargs = dict(
    open_fn=open_image,
    get_label_fn=regex_label,
    dict_transform=transformers,
)
trn_ds = PathsDataset(filenames=trn_files, **shared_ds_kwargs)
val_ds = PathsDataset(filenames=tst_files, **shared_ds_kwargs)

batch_size = 800
# Ordered mapping of loader name -> loader; only the training data is shuffled.
loaders = OrderedDict([
    ('train', DataLoader(trn_ds, shuffle=True, batch_size=batch_size)),
    ('valid', DataLoader(val_ds, shuffle=False, batch_size=batch_size)),
])

{'input_range': [0, 1],
 'input_size': [3, 224, 224],
 'input_space': 'RGB',
 'mean': [0.485, 0.456, 0.406],
 'num_classes': 1000,
 'std': [0.229, 0.224, 0.225],
 'url': 'https://download.pytorch.org/models/resnet50-19c8e357.pth'}


In [7]:
#export
def get_model(model_name, num_classes, pretrained='imagenet'):
    """Build a `pretrainedmodels` network with a fresh classification head.

    Args:
        model_name: name of a model constructor exposed by `pretrainedmodels`.
        num_classes: number of outputs of the replacement `last_linear` layer.
        pretrained: forwarded to the constructor as its `pretrained` argument.

    Returns:
        The model with `last_linear` replaced by a new `nn.Linear` that keeps
        the original input width and has `num_classes` outputs.
    """
    constructor = vars(pretrainedmodels)[model_name]
    # Always built with the 1000-class head; the head is swapped right after.
    net = constructor(num_classes=1000, pretrained=pretrained)
    net.last_linear = nn.Linear(net.last_linear.in_features, num_classes)
    return net

## Train

In [None]:
#export
epochs = 20

resnet = get_model(model_name, num_classes)
# Freeze the whole network first ...
for param in resnet.parameters():
    param.requires_grad = False

# ... then unfreeze the new classification head and the last residual stage.
# The head is unfrozen through .parameters() so that its bias is trained as
# well; enabling only `.weight` would leave the bias at its random init.
for module in (resnet.last_linear, resnet.layer4):
    for param in module.parameters():
        param.requires_grad = True

loss_fn = nn.CrossEntropyLoss()
# Hand the optimizer only the parameters that are actually being trained.
trainable_params = [p for p in resnet.parameters() if p.requires_grad]
opt = torch.optim.AdamW(trainable_params, lr=0.01, weight_decay=0.01)
logdir = '/tmp/protein/logs/'
runner = SupervisedRunner()
# The schedule length is expressed in batches: epochs * batches per epoch.
sched = OneCycleLR(opt,
                   num_steps=epochs * len(loaders['train']),
                   warmup_fraction=0.2,
                   lr_range=(0.1, 0.01, 0.001))

# Metrics computed during training. AUC and F1 read the one-hot targets
# stored under "targets_one_hot"; accuracy uses the callback's default key.
metric_callbacks = [
    AccuracyCallback(num_classes=num_classes),
    AUCCallback(num_classes=num_classes, input_key="targets_one_hot"),
    F1ScoreCallback(input_key="targets_one_hot", activation="Softmax"),
]

runner.train(
    model=resnet,
    criterion=loss_fn,
    optimizer=opt,
    scheduler=sched,
    loaders=loaders,
    logdir=logdir,
    num_epochs=epochs,
    callbacks=metric_callbacks,
    verbose=True
)

In [None]:
#export
print('Saving the trained model')
basedir = os.path.expanduser('~/data/protein/tmp/models')
# exist_ok: re-running the cell must not fail because the folder already exists.
os.makedirs(basedir, exist_ok=True)
# NOTE(review): this pickles the whole module object, which ties the file to
# the current class definitions; saving resnet.state_dict() would be more
# portable. Left unchanged to keep the output format.
torch.save(resnet, os.path.join(basedir, 'resnet50_simple.pth'))

## Test

In [8]:
# Every weight is overwritten by the checkpoint below, so skip loading (and
# possibly downloading) the ImageNet weights.
resnet = get_model(model_name, num_classes, pretrained=None).cuda(0)
# Load onto the CPU first so restoring does not depend on the device the
# checkpoint was saved from; load_state_dict copies into the GPU parameters.
restored = torch.load('/tmp/protein/logs/checkpoints/best.pth', map_location='cpu')
resnet.load_state_dict(restored['model_state_dict'])

<All keys matched successfully>

In [10]:
from tqdm import tqdm_notebook as tqdm

# Inference mode for layers that behave differently during training.
resnet.eval()

# Sorted so that predictions line up with a stable file order.
filenames = sorted(list_files('~/data/protein/tmp/test'))

tst_ds = PathsDataset(
    filenames=filenames,
    open_fn=open_image,
    get_label_fn=regex_label,
    dict_transform=transformers
)

# No shuffling, so batch order follows `filenames`.
tst_dl = DataLoader(tst_ds, batch_size=512, num_workers=cpu_count())
preds = []
# no_grad disables gradient tracking for the whole loop, replacing the manual
# requires_grad=False flip on every parameter.
with torch.no_grad():
    for batch in tqdm(tst_dl):
        out = resnet(batch['features'].cuda(0))
        # Per-class probabilities for every sample in the batch.
        y = out.softmax(dim=1)
        preds.extend(y.tolist())

HBox(children=(IntProgress(value=0, max=87), HTML(value='')))

In [31]:
# site1 = torch.tensor(preds[::2])
# site2 = torch.tensor(preds[1::2])
# avg_pred = ((site1 + site2)/2).argmax(dim=1).tolist()

In [32]:
# submission = []
# for filename, pred in list(zip(filenames, avg_pred)):
#     basename, _ = os.path.splitext(os.path.basename(filename))
#     sirna = int(basename.split('_')[-1])
#     if sirna != 0: continue
#     submission.append(pred)

In [38]:
def _site_predictions(paths, predictions, offset):
    """Collect the predictions of every second file, starting at `offset`.

    Files whose name does not end in `_0` (before the extension) are skipped.
    """
    selected = []
    for path, pred in list(zip(paths, predictions))[offset::2]:
        stem, _ = os.path.splitext(os.path.basename(path))
        sirna = int(stem.split('_')[-1])
        if sirna != 0:
            continue
        selected.append(pred)
    return selected


# Files at positions 0, 2, 4, ... of the sorted list (presumably the first
# imaging site of each sample — confirm against the file naming scheme).
site1 = _site_predictions(filenames, preds, 0)

# Files at positions 1, 3, 5, ... (presumably the second site).
site2 = _site_predictions(filenames, preds, 1)

# The two lists are averaged element-wise later, so they must pair up.
assert len(site1) == len(site2), 'site1 and site2 predictions do not pair up'

In [43]:
# Average the class probabilities of the two sites, then take the most
# probable class for each sample.
t1, t2 = torch.tensor(site1), torch.tensor(site2)
site_mean = (t1 + t2) / 2
avg_pred = site_mean.argmax(dim=1)
print(avg_pred.shape)

torch.Size([19897])


In [44]:
# Resolve the data folder relative to the home directory, like the other
# paths in this notebook, instead of hard-coding one user's absolute path.
sample = pd.read_csv(os.path.expanduser('~/data/protein/sample_submission.csv'))
# Overwrite the placeholder column with the predicted classes.
sample['sirna'] = avg_pred.tolist()
sample.to_csv('submit.csv', index=False)
from IPython.display import FileLink
FileLink('submit.csv')

In [24]:
# # odd
# site1 = []
# for filename, pred in list(zip(filenames, preds))[::2]:
#     basename, _ = os.path.splitext(os.path.basename(filename))
#     sirna = int(basename.split('_')[-1])
#     if sirna != 0: 
#         continue
#     site1.append(pred)
    
# # even
# site2 = []
# for filename, pred in list(zip(filenames, preds))[1::2]:
#     basename, _ = os.path.splitext(os.path.basename(filename))
#     sirna = int(basename.split('_')[-1])
#     if sirna != 0: 
#         continue
#     site2.append(pred)

In [30]:
# sample = pd.read_csv('/home/ck/data/protein/sample_submission.csv')
# sample['sirna'] = site1
# sample.to_csv('submit1.csv', index=False)
# sample['sirna'] = site2
# sample.to_csv('submit2.csv', index=False)

In [31]:
# from IPython.display import FileLink

In [33]:
# display(FileLink('submit1.csv'))
# display(FileLink('submit2.csv'))

In [None]:
# https://github.com/catalyst-team/catalyst/blob/master/examples/notebooks/notebook-example.ipynb
# https://github.com/catalyst-team/catalyst/blob/master/examples/notebooks/classification-tutorial.ipynb