In [77]:
import torch
import torchvision.models as models
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import DataParallel
from torch.utils.data.sampler import RandomSampler, SequentialSampler
import torch.backends.cudnn as cudnn
import numpy as np

In [78]:
import cv2
import pandas as pd
from copy import deepcopy
from tqdm import tqdm
from importlib import import_module

import os
# Short aliases for the os.path helpers (join / exists).
opj = os.path.join
ope = os.path.exists

In [79]:
from bestfitting.protein_clean.src.net import _init_paths
from bestfitting.protein_clean.src.net import densenet
from bestfitting.protein_clean.src.config.config import *
from bestfitting.protein_clean.src.dataset import protein_dataset
from bestfitting.protein_clean.src import train_net_base
from bestfitting.protein_clean.src import train_cls_net

### Set parameters and configurations

In [80]:
# --- Model / architecture -------------------------------------------------
module = 'densenet'
model_name = 'class_densenet121_large_dropout'
out_dir = 'external_crop1024_focal_slov_hardlog_clean'
in_channels = 4
num_classes = 28

# --- Optimisation ---------------------------------------------------------
# `scheduler` / `loss` start out as plain names; the *_name aliases keep the
# string around for code that needs it.
scheduler = scheduler_name = 'Adam45'
loss = loss_name = 'FocalSymmetricLovaszHardLogLoss'
epochs = 45
train_batch_size = 36
test_batch_size = 12
clipnorm = True

# --- Image geometry / augmentation ---------------------------------------
img_size = 1536
crop_size = 1024
aug_version = 2
predict_aug = ','.join([
    'default', 'flipud', 'fliplr', 'transpose',
    'flipud_lr', 'flipud_transpose', 'fliplr_transpose', 'flipud_lr_transpose',
])

# --- Cross-validation folds ----------------------------------------------
folds_num = 5
fold = 0
start_fold = 0
end_fold = 2

# --- Hardware -------------------------------------------------------------
# gpus = '0, 1, 2, 3'
gpus = '0'

# --- What to run ----------------------------------------------------------
train_model = 0
predict_val = predict_test = 1
is_predict_val = is_predict_test = True
predict_epoch = None
predict_epochs = [predict_epoch]

# --- Seeds ----------------------------------------------------------------
seeds = '0,1,2,3'   # comma-separated string
seed = 100

# --- Output handling ------------------------------------------------------
save_probs = True
overwrite = True

# --- Data selection -------------------------------------------------------
use_external = True   # use external data
clean = True          # no leak

In [81]:
# Pick the name of the train/valid split family from the data-selection flags.
# `clean` only matters when external data is used.
if not use_external:
    split_alias = 'random'
elif clean:  # no leak in the data
    split_alias = 'random_ext_noleak_clean'
else:
    split_alias = 'random_ext'

In [82]:
# Directory arguments handed to the trainer: the split folder lives under
# DATA_DIR, every output folder lives under RESULT_DIR.
split_folder = "%s_folds%d" % (split_alias, folds_num)
dir_args = {"split_dir": opj(DATA_DIR, "split", split_folder)}
dir_args.update({
    key: opj(RESULT_DIR, leaf)
    for key, leaf in [
        ("log_dir", "logs"),
        ("subm_dir", "submissions"),
        ("model_dir", "models"),
        ("image_check_dir", "image_check"),
    ]
})
print(dir_args)

{'split_dir': '/home/ubuntu/HPA/hpa_interp/bestfitting/protein_clean/data/split/random_ext_noleak_clean_folds5', 'log_dir': '/home/ubuntu/HPA/hpa_interp/bestfitting/protein_clean/result/logs', 'subm_dir': '/home/ubuntu/HPA/hpa_interp/bestfitting/protein_clean/result/submissions', 'model_dir': '/home/ubuntu/HPA/hpa_interp/bestfitting/protein_clean/result/models', 'image_check_dir': '/home/ubuntu/HPA/hpa_interp/bestfitting/protein_clean/result/image_check'}


In [83]:
# Data files: the run identifier plus the CSV split files to read.
# The run identifier is prefixed with `out_dir` when one is configured.
model_prefix = model_name if out_dir is None else out_dir + '_' + model_name

is_debug = False
if is_debug:  # if true we use small dataset for debugging
    split_files = ("../train_160.csv", "../valid_160.csv", "../test_160.csv")
else:
    split_files = (
        "random_train_cv{}.csv".format(fold),
        "random_valid_cv{}.csv".format(fold),
        "../test_11702.csv",
    )

data_infos = {
    "model_level_name": "%s_i%d_aug%d_%dfolds/fold%d" % (
        model_prefix, img_size, aug_version, folds_num, fold),
}
(data_infos["train_split_file"],
 data_infos["valid_split_file"],
 data_infos["test_split_file"]) = split_files
print(data_infos)

{'model_level_name': 'external_crop1024_focal_slov_hardlog_clean_class_densenet121_large_dropout_i1536_aug2_5folds/fold0', 'train_split_file': 'random_train_cv0.csv', 'valid_split_file': 'random_valid_cv0.csv', 'test_split_file': '../test_11702.csv'}


In [84]:
# Arguments for trainer.set_datasets().
# This cell is configured for *validation* prediction: `result_type` is 'val'
# and "test_split_file" therefore points at the validation split on purpose.
# To predict on the real test set, use data_infos["test_split_file"] together
# with result_type 'test'.
data_args = dict(
    train_split_file=data_infos["train_split_file"],
    valid_split_file=data_infos["valid_split_file"],
    test_split_file=data_infos["valid_split_file"],
    model_level_name=data_infos["model_level_name"],
    result_type="val",
    predict_aug=predict_aug,
)
print(data_args)

{'train_split_file': 'random_train_cv0.csv', 'valid_split_file': 'random_valid_cv0.csv', 'test_split_file': 'random_valid_cv0.csv', 'model_level_name': 'external_crop1024_focal_slov_hardlog_clean_class_densenet121_large_dropout_i1536_aug2_5folds/fold0', 'result_type': 'val', 'predict_aug': 'default,flipud,fliplr,transpose,flipud_lr,flipud_transpose,fliplr_transpose,flipud_lr_transpose'}


### Load model and predict labels

In [85]:
# Instantiate the Protein trainer/predictor with the configuration above.
protein_kwargs = dict(
    train_batch_size=train_batch_size,
    test_batch_size=test_batch_size,
    seed=seed,
    img_size=img_size,
    in_channels=in_channels,
    save_probs=save_probs,
    aug_version=aug_version,
    num_classes=num_classes,
    crop_size=crop_size,
    use_external=use_external,
    clipnorm=clipnorm,
)
trainer = train_cls_net.Protein(dir_args, **protein_kwargs)

In [86]:
# Let the trainer configure the GPU(s) named in `gpus`; the returned value is
# passed on to set_data_parallel() below.
n_gpu = trainer.setgpu(gpus)

# Import the architecture module `net.<module>` by name.
# NOTE(review): this relies on `net` being importable as a top-level package,
# presumably arranged by the `_init_paths` import — confirm.
model = import_module("net.%s" % module)

# Build the network named `model_name`.
# NOTE: this rebinds `scheduler` and `loss` from the config strings to the
# objects returned by get_model(); the strings remain available as
# `scheduler_name` / `loss_name`.
model_bundle = model.get_model(
    model_name,
    num_classes,
    loss_name,
    scheduler_name=scheduler_name,
    in_channels=in_channels,
)
net, scheduler, loss = model_bundle

# GPU
net = trainer.set_data_parallel(net, n_gpu=n_gpu)

using gpu 0
in_channels 4


In [89]:
# Hand the split files / run identifier in `data_args` to the trainer.
trainer.set_datasets(data_args)

# Load the checkpoint selected by `predict_epoch` into `net`
# (the path that was actually used is printed right below).
trainer.load_model(net=net, epoch=predict_epoch)

# print model file
print(trainer.get_model_file())

# use GPU
# NOTE(review): `net` already went through set_data_parallel() when it was
# built; confirm that calling it a second time does not wrap the model twice.
net = trainer.set_data_parallel(net, n_gpu=n_gpu)

# Parse the comma-separated seed string into a list of ints.
# Only parse while `seeds` is still a string so that re-running this cell is
# harmless (a second `.split` on the already-parsed list would raise
# AttributeError). `None` is left untouched.
if isinstance(seeds, str):
    seeds = [int(i) for i in seeds.split(',')]

/home/ubuntu/HPA/hpa_interp/bestfitting/protein_clean/result/models/external_crop1024_focal_slov_hardlog_clean_class_densenet121_large_dropout_i1536_aug2_5folds/fold0/final.pth


In [90]:
# for seed in seeds:
#     trainer.seed = seed
#     if is_predict_val:
#         data_args['result_type'] = 'val'
#         data_args['test_split_file'] = data_infos["valid_split_file"]
#         trainer.set_datasets(data_args)
        
#         # run prediction code
#         trainer.do_submission(net, overwrite)

#     if is_predict_test:
#         data_args['result_type'] = 'test'
#         data_args['test_split_file'] = data_infos["test_split_file"]
#         trainer.set_datasets(data_args)
        
#         # run prediction code
#         trainer.do_submission(net, overwrite)    

### `do_submission`, step by step (manual version of the commented-out loop above)

In [91]:
# Split the comma-separated augmentation list and pick one entry to work with.
augments = predict_aug.split(',')
selected_aug_index = 2  # position within `predict_aug`
augment_name = augments[selected_aug_index]
print(augment_name)

fliplr


In [92]:
# Work out where this run's outputs go:
#   <subm_dir>/<epoch_name>/<augment>_seed<seed>/
epoch_name = 'epoch_final'

# Rebuild the "<augment>_seed<seed>" folder name from the bare augment name
# instead of appending in place: a plain `+=` would add another "_seed..."
# suffix every time this cell is re-run and silently change the output folder.
# NOTE(review): uses the notebook-level `seed`; the dataset below is built
# with `trainer.seed` — confirm the two are kept in sync.
augment_base = augment_name.rsplit('_seed', 1)[0]
augment_name = '%s_seed%d' % (augment_base, seed)
sub_dir = opj(trainer.subm_dir, epoch_name, augment_name)

# Validation results computed with external data get an "_external" tag in
# their file names; everything else uses the plain names.
if trainer.use_external and trainer.result_type == 'val':
    external_tag = '_external'
else:
    external_tag = ''
trainer.result_csv_file = opj(sub_dir, 'results_%s%s.csv.gz' % (trainer.result_type, external_tag))
trainer.result_prob_fname = opj(sub_dir, 'prob_%s%s.npy' % (trainer.result_type, external_tag))
trainer.extract_feat_fname = opj(sub_dir, 'extract_feats_%s%s.npz' % (trainer.result_type, external_tag))
os.makedirs(sub_dir, exist_ok=True)

print(trainer.result_csv_file)
print(trainer.result_prob_fname)
print(trainer.extract_feat_fname)

/home/ubuntu/HPA/hpa_interp/bestfitting/protein_clean/result/submissions/external_crop1024_focal_slov_hardlog_clean_class_densenet121_large_dropout_i1536_aug2_5folds/fold0/epoch_final/fliplr_seed100/results_val_external.csv.gz
/home/ubuntu/HPA/hpa_interp/bestfitting/protein_clean/result/submissions/external_crop1024_focal_slov_hardlog_clean_class_densenet121_large_dropout_i1536_aug2_5folds/fold0/epoch_final/fliplr_seed100/prob_val_external.npy
/home/ubuntu/HPA/hpa_interp/bestfitting/protein_clean/result/submissions/external_crop1024_focal_slov_hardlog_clean_class_densenet121_large_dropout_i1536_aug2_5folds/fold0/epoch_final/fliplr_seed100/extract_feats_val_external.npz


In [93]:
# Dataset over the split file the trainer was pointed at (`test_split_file`).
# Labels are requested, so every batch is (images, labels, indices).
# Random cropping is enabled for every seed except 0.
# NOTE(review): `transform=None` is passed although the results are stored
# under the augmentation folder chosen earlier — confirm the augmentation is
# applied somewhere else, or that skipping it here is intended.
dataset_kwargs = dict(
    img_size=trainer.img_size,
    is_trainset=trainer.result_type != 'test',
    return_label=True,
    seed=trainer.seed,
    in_channels=trainer.in_channels,
    transform=None,
    crop_size=trainer.crop_size,
    random_crop=trainer.seed != 0,
)
test_dataset = protein_dataset.ProteinDataset(trainer.test_split_file, **dataset_kwargs)

# Sequential, complete pass over the dataset so that row i of the predictions
# lines up with test_dataset.img_ids[i].
loader_kwargs = dict(
    sampler=SequentialSampler(test_dataset),
    batch_size=trainer.test_batch_size,
    drop_last=False,
    num_workers=trainer.num_workers,
    pin_memory=True,
)
test_loader = protein_dataset.DataLoader(test_dataset, **loader_kwargs)

/home/ubuntu/HPA/hpa_interp/bestfitting/protein_clean/data/train/images_1536
/home/ubuntu/HPA/hpa_interp/bestfitting/protein_clean/data/train/external_v18_1536


In [94]:
# Run the network over `test_loader`, save the per-class probabilities and the
# result CSV, and (for validation runs) report the macro F1 score.
if trainer.gpu_flag:
    net.cuda()
# net eval mode
net.eval()

n = 0  # number of samples seen
img_ids = np.array(test_dataset.img_ids)  # get img_ids from dataset
prob_chunks = []  # one flat array of probabilities per batch
num_batches = int(np.ceil(test_dataset.num / trainer.test_batch_size))

# Inference only: disable autograd for the whole loop.
# `Variable(..., volatile=True)` is a no-op on PyTorch >= 0.4 (it only emits a
# warning), so the forward pass used to keep the full autograd graph alive,
# which costs a lot of GPU memory at this image size.
with torch.no_grad():
    # The dataset was built with return_label=True, so each batch carries
    # labels and indices as well; only the images are needed here.
    # (`batch_idx` instead of `iter`, which shadowed the builtin.)
    for batch_idx, (images, labels, indices) in tqdm(enumerate(test_loader, 0),
                                                     total=num_batches):
        n += len(images)
        if trainer.gpu_flag:
            images = images.cuda()

        outputs = net(images)
        # Some models return (logits, extra...); the logits come first.
        logits = outputs[0] if isinstance(outputs, (list, tuple)) else outputs

        probs = trainer.logits_to_probs(logits.data)
        prob_chunks.append(probs.cpu().numpy().reshape(-1))  # collect all probs

# all_probs is an array of n-by-num_classes. Cast to float64 so the saved
# array has the same dtype as before, when the values went through Python floats.
flat_probs = np.concatenate(prob_chunks) if prob_chunks else np.empty(0)
all_probs = flat_probs.astype(np.float64).reshape(-1, trainer.num_classes)
if trainer.save_probs:
    print(all_probs.shape)
    np.save(trainer.result_prob_fname, all_probs)

# prob_to_result located in net/loss_funcs/kaggle_metric.py; output pd.dataframe of img_ids and pred_list
df = prob_to_result(all_probs, img_ids)
df.to_csv(trainer.result_csv_file, index=False, compression='gzip')

# if result_type is 'val', then compute f1_score and save a second copy of the
# results whose file name carries the score
if trainer.result_type == 'val':
    truth = pd.read_csv(trainer.valid_split_file)
    score = get_probs_f1_score(df, all_probs, truth, th=0.5)
    print('macro f1 score:%.5f' % score)
    if trainer.use_external:
        sub_name = 'results_%s_%.5f_external.csv.gz' % (trainer.result_type, score)
    else:
        sub_name = 'results_%s_%.5f.csv.gz' % (trainer.result_type, score)
    df.to_csv(opj(sub_dir, sub_name), index=False, compression='gzip')



  0%|          | 0/1666 [00:00<?, ?it/s][A[A

RuntimeError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 11.17 GiB total capacity; 10.49 GiB already allocated; 2.44 MiB free; 43.87 MiB cached)