In [1]:
import os
import random
import datetime

import matplotlib.cm
import numpy as np

In [2]:
import pandas as pd
import scipy
import skimage.exposure
import sklearn
import sklearn.metrics
import sklearn.metrics
import sklearn.preprocessing
import skorch.helper
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils
import torchvision
import tqdm
from matplotlib import pyplot as plt
import tqdm



In [3]:
# Seed the Python, NumPy and PyTorch global RNGs with the same fixed value so
# that the stochastic steps further down (e.g. the random flips declared in
# train_transforms) can be repeated from a fresh kernel.
random.seed(2020)
np.random.seed(2020)
torch.manual_seed(2020)

<torch._C.Generator at 0x7f1593aa0ed0>

In [4]:
# Labelled training split (.npz archive with 'x' images and 'y' labels).
TRAIN_DATA_URL = "https://storage.googleapis.com/fchouteau-isae-deep-learning/hackathon-2020/eurosat_train.npz"

# Labelled validation split (same 'x' / 'y' layout).
VALID_DATA_URL = "https://storage.googleapis.com/fchouteau-isae-deep-learning/hackathon-2020/eurosat_valid.npz"

# Unlabelled pool that gets pseudo-labelled below (only 'x' is read from it).
UNSUP_DATA_URL = "https://storage.googleapis.com/fchouteau-isae-deep-learning/hackathon-2020/eurosat_unsup.npz"

# NOTE(review): not referenced by any cell visible in this notebook.
KAGGL_DATA_URL = "https://storage.googleapis.com/fchouteau-isae-deep-learning/hackathon-2020/test.npz"

# Class names; an integer label is used as an index into this list
# (see plot_imgs: CLASSES[y[k]]).
CLASSES = [
    'AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway', 'Industrial', 'Pasture', 'PermanentCrop', 'Residential',
    'River', 'SeaLake'
]

# Helper used to open the URLs above like local files.
# NOTE(review): with destpath=None NumPy is documented to download into a
# temporary directory -- confirm this is the intended caching behaviour.
DATASOURCE = np.DataSource(None)

In [5]:
# Per-class minimum mean prediction score required to accept a pseudo-label.
# The list is indexed by the predicted class id (see pseudo_label:
# thresholds_pred[top1]); one value per entry of CLASSES.
# NOTE(review): how these values were derived is not shown in this notebook.
THRESHOLDS = [0.87, 0.71, 0.61, 0.68, 0.86, 0.74, 0.67, 0.89, 0.63, 0.81]

In [6]:
def plot_imgs(x, y=None, grid_size=4, title="samples"):
    """
    Plot a grid_size x grid_size grid of images drawn at random from x.

    Args:
        x: indexable collection of image arrays (anything supporting len()
            and integer indexing).
        y: optional integer labels aligned with x; each label indexes CLASSES
            to build the tile title. When None, tiles are titled "unlabelled".
        grid_size: number of rows and columns of the grid.
        title: figure-level title.

    Images are sampled with replacement, so a picture may appear twice.
    """
    n_tiles = grid_size**2
    # squeeze=False keeps `ax` a 2-D array even for grid_size == 1.
    fig, ax = plt.subplots(grid_size, grid_size, figsize=(20, 20), squeeze=False)
    fig.tight_layout()
    # Draw exactly as many indices as there are tiles (was hard-coded to 16).
    idxs = np.random.randint(len(x), size=n_tiles)

    for i, k in enumerate(idxs):
        img = x[k]
        lbl = CLASSES[y[k]] if y is not None else "unlabelled"
        # float32 images are stretched to [0, 1] before display.
        if img.dtype == np.float32:
            img = skimage.exposure.rescale_intensity(img, out_range=(0., 1.))
        # gamma < 1 brightens the image for display.
        img = skimage.exposure.adjust_gamma(img, gamma=0.7)
        # Fill the grid column by column, using grid_size rather than a fixed 4.
        tile = ax[i % grid_size][i // grid_size]
        tile.imshow(img)
        tile.set_title(lbl)
        tile.axis('off')
    fig.suptitle(title, fontsize=14)
    plt.show()

In [7]:
# Open each remote .npz archive through DATASOURCE and hand the resulting
# binary file object to np.load. Each name is rebound twice: first to the
# file handle, then to the loaded archive.
train_dataset = DATASOURCE.open(TRAIN_DATA_URL, "rb")
train_dataset = np.load(train_dataset)

valid_dataset = DATASOURCE.open(VALID_DATA_URL, "rb")
valid_dataset = np.load(valid_dataset)

unsup_dataset = DATASOURCE.open(UNSUP_DATA_URL, "rb")
unsup_dataset = np.load(unsup_dataset)

# 'x' holds the images and 'y' the integer labels of the labelled splits.
x_train, y_train = train_dataset['x'], train_dataset['y']
x_valid, y_valid = valid_dataset['x'], valid_dataset['y']

# The unlabelled pool only provides images.
x_unsup = unsup_dataset['x']

# Sanity check of the array shapes.
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_unsup.shape)

(500, 64, 64, 3) (500,)
(1500, 64, 64, 3) (1500,)
(5000, 64, 64, 3)


In [8]:
# Augmenting pipeline: array -> PIL image -> random vertical and horizontal
# flips (each applied with probability 0.5) -> tensor.
# It is also reused for test-time augmentation (see unsup_ds_tta).
train_transforms = torchvision.transforms.Compose([
    torchvision.transforms.ToPILImage(),
    torchvision.transforms.RandomVerticalFlip(p=0.5),
    torchvision.transforms.RandomHorizontalFlip(p=0.5),
    # Colour jitter is currently disabled:
    #     torchvision.transforms.ColorJitter(
    #         brightness=0.25,
    #         contrast=0.25,
    #         saturation=0.25,
    #         hue=0.1,
    #     ),
    torchvision.transforms.ToTensor(),
])

# Deterministic pipeline: conversion to tensor only, no augmentation.
valid_transforms = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
])

In [9]:
class ArrayDataset(torch.utils.data.Dataset):
    """Map-style dataset backed by in-memory arrays.

    Args:
        array_x: array of samples; its first axis enumerates the samples.
        array_y: array of targets aligned with array_x, or None when the
            data is unlabelled.
        transform: optional callable applied to each sample. When omitted,
            the raw sample is converted with torch.tensor.
    """

    def __init__(self, array_x, array_y, transform=None):
        self.array_x = array_x
        self.array_y = array_y
        self.transform = transform

    def __len__(self):
        # Number of samples = size of the first axis.
        return self.array_x.shape[0]

    def __getitem__(self, idx):
        """Return (sample, target); target is None for unlabelled data."""
        sample = self.array_x[idx]
        sample = torch.tensor(sample) if self.transform is None else self.transform(sample)

        target = None if self.array_y is None else torch.tensor(self.array_y[idx])
        return sample, target

In [10]:
# Labelled splits: augmentation for training, plain tensor conversion for validation.
train_ds = ArrayDataset(x_train, y_train, transform=train_transforms)
valid_ds = ArrayDataset(x_valid, y_valid, transform=valid_transforms)

# Two views over the same unlabelled images: a deterministic one and one that
# applies the random training flips (used for test-time augmentation).
unsup_ds = ArrayDataset(x_unsup, None, transform=valid_transforms)
unsup_ds_tta = ArrayDataset(x_unsup, None, transform=train_transforms)

In [11]:
class PretrainedModel(nn.Module):
    """ResNet-18 whose final fully connected layer is replaced by a custom head.

    The head is Linear -> ReLU -> Dropout(0.4) -> Linear -> Softmax, so
    forward() yields values normalised over dim 1 (one score per output
    feature), not raw logits.

    Args:
        output_features: size of the last Linear layer, i.e. number of outputs.
    """

    def __init__(self, output_features):
        super().__init__()
        # Backbone requested with pretrained weights.
        backbone = torchvision.models.resnet18(pretrained=True)

        # New head sized from the input width of the layer it replaces.
        head = nn.Sequential(
            nn.Linear(backbone.fc.in_features, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, output_features),
            nn.Softmax(dim=1),
        )
        backbone.fc = head

        self.model = backbone

    def forward(self, x):
        """Run x through the backbone and the replacement head."""
        out = self.model(x)
        return out


# Load the previously trained network from disk. The result is called directly
# as a module further down, so the file is expected to hold a whole pickled
# model (presumably a PretrainedModel -- the class must be defined beforehand).
# NOTE(review): torch.load unpickles arbitrary objects; only load trusted files.
#
# - map_location="cpu": pseudo_label feeds CPU tensors and converts outputs
#   with .numpy(), so the weights must live on the CPU as well.
# - .eval(): this notebook only runs inference, so dropout and batch-norm
#   layers are switched to evaluation mode. Chaining the call also avoids
#   printing the module repr as the cell output.
model = torch.load("./model_20200326_1431.pt", map_location="cpu").eval()

In [15]:
def pseudo_label(idx, thresholds_pred, threshold_std=0.1, n_tta=3):
    """
    Propose a pseudo-label for one unlabelled sample using test-time augmentation.

    The sample is predicted once without augmentation and n_tta times through
    the randomly flipped view (unsup_ds_tta). The per-class mean and standard
    deviation over those views are computed, and the class with the highest
    mean is accepted only if it is both confident and stable.

    Args:
        idx: index of the sample in unsup_ds / unsup_ds_tta.
        thresholds_pred: per-class minimum mean score, indexed by class id.
        threshold_std: exclusive upper bound on the standard deviation of the
            top class score across views.
        n_tta: number of augmented views added to the un-augmented one.

    Returns:
        The predicted class index (a NumPy integer) when both criteria are
        met, otherwise None.

    Relies on the module-level `model`, `unsup_ds` and `unsup_ds_tta`.
    NOTE(review): assumes `model` is already in eval mode -- confirm at load time.
    """
    # Batch TTA: the plain view first, then n_tta randomly augmented views.
    views = [unsup_ds[idx][0]]
    views.extend(unsup_ds_tta[idx][0] for _ in range(n_tta))
    x_tta = torch.stack(views)

    # Predictions: inference only, so no autograd graph is needed.
    with torch.no_grad():
        y_tta = model(x_tta).numpy()

    # Means & std across the views (axis 0 indexes the views).
    y_tta_mean = np.mean(y_tta, axis=0)
    y_tta_std = np.std(y_tta, axis=0)

    # Class with the highest mean score (last element of the ascending argsort).
    top1 = np.argsort(y_tta_mean)[-1]

    is_confident = y_tta_mean[top1] >= thresholds_pred[top1]
    is_stable = y_tta_std[top1] < threshold_std
    if is_confident and is_stable:
        return top1
    return None

In [33]:
# Try to pseudo-label every unlabelled image; keep an (index, class) pair for
# each sample accepted by pseudo_label (None means the sample was rejected).
labels = []
for idx in tqdm.trange(len(x_unsup)):
    y_pseudo = pseudo_label(idx, THRESHOLDS)
    if y_pseudo is not None:
        labels.append((idx, y_pseudo))

100%|██████████| 5000/5000 [02:46<00:00, 29.98it/s]


In [35]:
# Split the (index, class) pairs into two parallel integer arrays.
# The explicit integer dtype keeps `indexes` valid for array indexing even when
# no sample was accepted (np.asarray([]) would otherwise default to float64).
indexes = np.asarray([label[0] for label in labels], dtype=np.int64)
y = np.asarray([label[1] for label in labels], dtype=np.int64)

In [36]:
# Distinct pseudo-label classes and how many samples were assigned to each.
np.unique(y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([258, 115, 127, 122, 237, 229,  78, 266,  51, 373]))

In [42]:
# Images of the unlabelled pool that received a pseudo-label, with their labels.
x_unsup_labellized_1 = x_unsup[indexes]
y_unsup_labellized_1 = y

In [43]:
# Re-open and re-load the labelled training archive, rebinding x_train / y_train.
# NOTE(review): this repeats the load done at the top of the notebook; it
# presumably restores arrays altered by cells no longer present -- confirm
# whether it is still needed.
train_dataset = DATASOURCE.open(TRAIN_DATA_URL, "rb")
train_dataset = np.load(train_dataset)

x_train, y_train = train_dataset['x'], train_dataset['y']

In [44]:
# Extended training set: original labelled samples followed by the
# pseudo-labelled ones (concatenated along the first axis).
x_train_unsup_1 = np.concatenate([x_train, x_unsup_labellized_1])
y_train_unsup_1 = np.concatenate([y_train, y_unsup_labellized_1])
# Display both shapes to check that images and labels stay aligned.
x_train_unsup_1.shape, y_train_unsup_1.shape

((2356, 64, 64, 3), (2356,))

In [45]:
# Persist the extended training set as a compressed .npz archive using the
# same 'x' / 'y' keys as the source datasets.
with open('./eurosat_pseudolabellized_1.npz', 'wb') as f:
    np.savez_compressed(f, x=x_train_unsup_1, y=y_train_unsup_1)