In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import os
from typing import Dict
from imet.utils import binarize_prediction, seed_everything
import warnings
from sklearn.metrics import fbeta_score
from sklearn.exceptions import UndefinedMetricWarning
from torch.autograd import Variable
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.utils.data as utils


seed_everything()

In [2]:
models = ['densenet201', 'densenet121', 'resnet50', 'resnet101', 'se_resnet50', 'resnet34', 'nasnetamobile']

In [94]:
from torch.utils.data import Dataset


class TestDataset(Dataset):
    def __init__(self, data: np.array, ids: np.array):
        super().__init__()
        self._data = data
        self._ids = ids

        
    def __len__(self):
        return self._ids.shape[0]

    
    def __getitem__(self, idx: int):
        return torch.tensor(self._data[idx]), self._ids[idx]

In [95]:
def get_preds_fold(models, test=True, fold=0):
    filename = 'test' if test else 'val'
    preds = np.array([pd.read_hdf(f'zoo/model_{model}_fold_{fold}/{filename}.h5').values for model in models])
    return preds


def get_ids():
    return pd.read_hdf(f'zoo/model_resnet50_fold_0/test.h5').index.values


def get_preds(models, test=True):
    preds_by_fold = []
    for fold in range(5):
        preds = get_preds_fold(models, test=test, fold=fold)
        preds_by_fold.append(preds)
    return np.array(preds_by_fold)


def ids_to_labels(attribute_ids):
    labels = np.zeros(1103)
    indexes = list(map(int, attribute_ids.split()))
    labels[indexes] = 1
    return labels


def validation(model: nn.Module, criterion, valid_loader, use_cuda) -> Dict[str, float]:
    model.eval()
    all_losses, all_predictions, all_targets = [], [], []
    with torch.no_grad():
        for inputs, targets in valid_loader:
            all_targets.append(targets.numpy().copy())
            if use_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            all_losses.append(loss.item())
            predictions = torch.sigmoid(outputs)
            all_predictions.append(predictions.cpu().numpy())
    all_predictions = np.concatenate(all_predictions)
    all_targets = np.concatenate(all_targets)

    def get_score(y_pred):
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=UndefinedMetricWarning)
            return fbeta_score(
                all_targets, y_pred, beta=2, average='samples')

    metrics = {}
    argsorted = all_predictions.argsort(axis=1)
    for threshold in [0.05, 0.10, 0.15]:
        metrics[f'valid_f2_th_{threshold:.2f}'] = get_score(
            binarize_prediction(all_predictions, threshold, argsorted))
    metrics['valid_loss'] = np.mean(all_losses)
    print(' | '.join(f'{k} {v:.3f}' for k, v in sorted(
        metrics.items(), key=lambda kv: -kv[1])))

    return metrics


def get_train_loader(models, test=True, fold=0):
    preds = get_preds_fold(models, test, fold)
    preds = preds.reshape((7443, 7, 1103, 1))
    X = np.swapaxes(np.swapaxes(preds, 1, 3), 2, 3)
    ids = get_ids()
    print(ids.shape)
    print(X.shape)
    train_dataset = TestDataset(X, ids)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1000)
    return train_loader

In [55]:
preds_fold_0 = get_preds_fold(models, test=False)

In [62]:
pd.read_hdf(f'zoo/model_resnet50_fold_0/test.h5').index.values

(7443,)

In [5]:
preds_fold_0 = np.swapaxes(preds_fold_0, 0, 1)

In [6]:
preds_fold_0.shape

(21816, 7, 1103)

In [7]:
folds = pd.read_csv('folds.csv')

In [8]:
preds_y_fold_0 = folds[folds['fold'] == 0]['attribute_ids'].values

In [9]:
preds_y_fold_0 = np.array(list(map(ids_to_labels, preds_y_fold_0)))

In [10]:
def corr(df1, df2):
    n = len(df1)
    v1, v2 = df1.values, df2.values
    sums = np.multiply.outer(v2.sum(0), v1.sum(0))
    stds = np.multiply.outer(v2.std(0), v1.std(0))
    return pd.DataFrame((v2.T.dot(v1) - sums / n) / stds / n,
                        df2.columns, df1.columns).abs().sum().sum()

In [11]:
def mean_df(predictions):
    concat = pd.concat((predictions))
    return concat.groupby(concat.index).mean()

In [12]:
class EnsembleCNN(torch.nn.Module):
    
    
    def __init__(self):
        super(EnsembleCNN, self).__init__()
        
        self.conv1 = nn.Sequential(nn.Conv2d(1, 8, kernel_size=(3, 1)), nn.Dropout2d())
        self.conv2 = nn.Sequential(nn.Conv2d(8, 16, kernel_size=(3, 1)), nn.Dropout2d())                
        self.conv3 = nn.Sequential(nn.Conv2d(16, 32, kernel_size=(3, 1)), nn.Dropout2d())

        self.fc1 = nn.Sequential(nn.Linear(32 * 1 * 1103, 1024), nn.Dropout2d())
        self.fc2 = nn.Linear(1024, 1103)
        
        
    def forward(self, x):

        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
         
        x = x.view(-1, 32 * 1 * 1103)
        
        x = F.relu(self.fc1(x))
        
        x = self.fc2(x)
        return x

In [13]:
preds_fold_0 = preds_fold_0.reshape((21816, 7, 1103, 1))

In [14]:
X = np.swapaxes(np.swapaxes(preds_fold_0, 1, 3), 2, 3)
y = preds_y_fold_0

In [36]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = EnsembleCNN().cuda()
    model.train()
    criterion = nn.BCEWithLogitsLoss()
    lr = 3e-4
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    train_dataset = utils.TensorDataset(torch.tensor(X_train.astype(np.float32)), torch.tensor(y_train.astype(np.float32))) # create your datset
    valid_dataset = utils.TensorDataset(torch.tensor(X_test.astype(np.float32)), torch.tensor(y_test.astype(np.float32))) # create your datset
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8)
    best_valid_loss = 10000
    zloy = 0
    for epoch in range(10):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            # get the inputs
            inputs, labels = data
            inputs, labels = inputs.cuda(), labels.cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()

        print('train_loss %d' % running_loss)
        running_loss = 0.0
        valid_metrics = validation(model, criterion, valid_loader, True)
        if epoch == 3:
            print(f'epoch = {epoch} lr = {lr}')
            lr = 6e-5
            optimizer = torch.optim.Adam(model.parameters(), lr = lr)
        if epoch == 6:
            print(f'epoch = {epoch} lr = {lr}')
            lr = 1e-6
            optimizer = torch.optim.SGD(model.parameters(), momentum=0.9, lr=lr)
    print('Finished Training')

train_loss 43
valid_f2_th_0.05 0.195 | valid_f2_th_0.10 0.195 | valid_f2_th_0.15 0.195 | valid_f2_th_0.20 0.170 | valid_loss 0.037
train_loss 24
valid_f2_th_0.10 0.571 | valid_f2_th_0.15 0.564 | valid_f2_th_0.20 0.553 | valid_f2_th_0.05 0.541 | valid_loss 0.009
train_loss 17
valid_f2_th_0.10 0.588 | valid_f2_th_0.15 0.584 | valid_f2_th_0.20 0.571 | valid_f2_th_0.05 0.563 | valid_loss 0.008
train_loss 15
valid_f2_th_0.15 0.593 | valid_f2_th_0.10 0.592 | valid_f2_th_0.20 0.586 | valid_f2_th_0.05 0.564 | valid_loss 0.008
epoch = 3 lr = 0.0003
train_loss 13
valid_f2_th_0.15 0.602 | valid_f2_th_0.10 0.602 | valid_f2_th_0.20 0.591 | valid_f2_th_0.05 0.584 | valid_loss 0.008
train_loss 13
valid_f2_th_0.15 0.603 | valid_f2_th_0.10 0.602 | valid_f2_th_0.20 0.593 | valid_f2_th_0.05 0.582 | valid_loss 0.008
train_loss 12
valid_f2_th_0.10 0.602 | valid_f2_th_0.15 0.602 | valid_f2_th_0.20 0.593 | valid_f2_th_0.05 0.582 | valid_loss 0.008
epoch = 6 lr = 6e-05
train_loss 12
valid_f2_th_0.10 0.602 | v

KeyboardInterrupt: 

In [37]:
X_train = X
y_train = y

In [116]:
model = EnsembleCNN().cuda()
model.train()
criterion = nn.BCEWithLogitsLoss()
lr = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
train_dataset = utils.TensorDataset(torch.tensor(X_train.astype(np.float32)), torch.tensor(y_train.astype(np.float32))) # create your datset
valid_dataset = utils.TensorDataset(torch.tensor(X_test.astype(np.float32)), torch.tensor(y_test.astype(np.float32))) # create your datset
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8)
best_valid_loss = 10000
zloy = 0
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs
        inputs, labels = data
        inputs, labels = inputs.cuda(), labels.cuda()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()

    print('train_loss %d' % running_loss)
    running_loss = 0.0
#     valid_metrics = validation(model, criterion, valid_loader, True)
    if epoch == 4:
        print(f'epoch = {epoch} lr = {lr}')
        lr = 2e-5
        optimizer = torch.optim.Adam(model.parameters(), lr = lr)
    if epoch == 6:
        print(f'epoch = {epoch} lr = {lr}')
        lr = 4e-6
        optimizer = torch.optim.SGD(model.parameters(), momentum=0.9, lr=lr)
print('Finished Training')

train_loss 61
train_loss 43
train_loss 39
train_loss 34
train_loss 30
epoch = 4 lr = 0.0001
train_loss 29
train_loss 29
epoch = 6 lr = 2e-05
train_loss 28
train_loss 28
train_loss 28
Finished Training


In [46]:
model2 = EnsembleCNN().cuda()
model2.train()
criterion = nn.BCEWithLogitsLoss()
lr = 3e-4
optimizer = torch.optim.Adam(model2.parameters(), lr=lr)
train_dataset = utils.TensorDataset(torch.tensor(X_train.astype(np.float32)), torch.tensor(y_train.astype(np.float32))) # create your datset
valid_dataset = utils.TensorDataset(torch.tensor(X_test.astype(np.float32)), torch.tensor(y_test.astype(np.float32))) # create your datset
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8)
best_valid_loss = 10000
zloy = 0
for epoch in range(10):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs
        inputs, labels = data
        inputs, labels = inputs.cuda(), labels.cuda()

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model2(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()

    print('train_loss %d' % running_loss)
    running_loss = 0.0
#     valid_metrics = validation(model, criterion, valid_loader, True)
    if epoch == 6:
        print(f'epoch = {epoch} lr = {lr}')
        lr = 6e-5
        optimizer = torch.optim.Adam(model2.parameters(), lr = lr)
    if epoch == 8:
        print(f'epoch = {epoch} lr = {lr}')
        lr = 1e-6
        optimizer = torch.optim.SGD(model2.parameters(), momentum=0.9, lr=lr)
print('Finished Training')

train_loss 41
train_loss 27
train_loss 25
train_loss 23
train_loss 22
train_loss 22
train_loss 21
epoch = 6 lr = 0.0003
train_loss 21
train_loss 20
epoch = 8 lr = 6e-05
train_loss 20
Finished Training


In [96]:
test_loader = get_train_loader(models)

(7443,)
(7443, 1, 7, 1103)


In [117]:
import tqdm

model.eval()
all_outputs, all_ids = [], []
with torch.no_grad():
    for inputs, ids in tqdm.tqdm(test_loader, desc='Predict'):
        inputs = inputs.cuda()
        outputs = model(inputs)
        outputs = torch.sigmoid(outputs)
        all_outputs.append(outputs.data.cpu().numpy())
        all_ids.extend(ids)
df = pd.DataFrame(
    data=np.concatenate(all_outputs),
    index=all_ids,
    columns=map(str, range(1103)))
df.to_hdf('pred2.h5', 'prob', index_label='id')
print(f'Saved predictions to pred2.h5')



Predict:   0%|          | 0/8 [00:00<?, ?it/s][A[A

Predict:  25%|██▌       | 2/8 [00:00<00:00, 15.47it/s][A[A

Predict:  50%|█████     | 4/8 [00:00<00:00, 15.95it/s][A[A

Predict:  75%|███████▌  | 6/8 [00:00<00:00, 16.29it/s][A[A

Predict: 100%|██████████| 8/8 [00:00<00:00, 17.95it/s][A[A

Saved predictions to pred2.h5


In [111]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1093,1094,1095,1096,1097,1098,1099,1100,1101,1102
10023b2cc4ed5f68,4.1e-05,0.004746,0.000632,0.0002,0.000183,0.000494,0.000402,0.000133,0.000493,0.001267,...,0.045992,0.000237,0.000895,0.002102,0.007947,0.023899,0.016448,0.001016,0.001723,0.002723
100fbe75ed8fd887,0.000256,0.002444,0.000298,6.2e-05,2.7e-05,6.3e-05,4.5e-05,4.6e-05,0.00013,0.000321,...,0.032825,0.000102,0.008818,0.007217,0.013293,0.015014,0.063053,0.00075,0.003658,0.001463
101b627524a04f19,0.000204,0.001851,0.000627,0.000149,0.000528,0.00021,0.000537,7.9e-05,0.001756,0.000135,...,0.011145,0.000246,0.001725,0.026945,0.005531,0.013237,0.060282,0.000582,0.006636,0.00094
10234480c41284c6,0.001292,0.008975,0.002044,0.000695,0.00213,0.001384,0.003359,0.000595,0.005635,0.000744,...,0.036896,0.00105,0.004956,0.053629,0.023539,0.059371,0.12138,0.004508,0.035552,0.010871
1023b0e2636dcea8,0.001379,0.001385,0.000636,0.000461,0.004306,0.002927,0.001647,0.000563,0.004016,0.000283,...,0.048271,0.000425,0.016674,0.023854,0.018106,0.020378,0.125742,0.00078,0.002628,0.004019


In [114]:
kek = pd.read_csv('submission_nn_1.csv')

In [115]:
kek.head()

Unnamed: 0,id,attribute_ids
0,10023b2cc4ed5f68,13 79 147 322 725 776 813 830 1046 1092
1,100fbe75ed8fd887,121 147 188 189 335 541 542 597 813 1092
2,101b627524a04f19,147 161 188 335 612 671 813 864 1059 1092
3,10234480c41284c6,147 189 194 671 776 780 830 1046 1059 1062
4,1023b0e2636dcea8,51 156 369 737 738 813 1019 1046 1059 1092


In [7]:
df = pd.read_csv('data/train.csv')

In [8]:
df.head()

Unnamed: 0,id,attribute_ids
0,1000483014d91860,147 616 813
1,1000fe2e667721fe,51 616 734 813
2,1001614cb89646ee,776
3,10041eb49b297c08,51 671 698 813 1092
4,100501c227f8beea,13 404 492 903 1093


In [10]:
df['count'] = df.apply (lambda row: len(row['attribute_ids'].split()), axis=1)

In [22]:
np.sum(df['count'] == 7)

920

In [23]:
df.drop(['attribute_ids'], axis=0)

KeyError: "['attribute_ids'] not found in axis"