In [1]:
import os, glob, pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import seaborn_image as isns
from PIL import Image

## Feature extraction
from keras_preprocessing.image import ImageDataGenerator
import tensorflow as tf
from tensorflow.keras.applications.resnet_v2 import ResNet50V2
from keras.layers import Dense, Flatten, Input, Activation
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import mixed_precision
import IPython



The first step is to generate embeddings for all images, including the unlabelled ones.

In [42]:
# Building Training and Validation Dataframes
labels = ['buildings', 'forest', 'glacier', 'mountain', 'sea', 'street']

def dataframer(base_dir, include_unlabelled=False,
               unlabelled_dir="../data/seg_pred", class_names=tuple(labels)):
    """Index the image files found under ``base_dir`` into a DataFrame.

    Parameters
    ----------
    base_dir : str
        Directory holding one sub-directory per class. Sub-directories whose
        name is not in ``class_names`` are ignored, as are plain files.
    include_unlabelled : bool, default False
        When True, every entry of ``unlabelled_dir`` is appended with the
        placeholder label ``'-1'``.
    unlabelled_dir : str, default "../data/seg_pred"
        Directory that holds the unlabelled images.
    class_names : sequence of str
        Accepted class names. The default is a snapshot of ``labels`` taken
        when the function is defined, so rebinding ``labels`` afterwards does
        not change which folders are picked up.

    Returns
    -------
    pandas.DataFrame
        Columns ``Path`` (file path) and ``Label`` (class name or ``'-1'``).
        Rows follow ``os.listdir`` order: labelled images first, then the
        unlabelled ones.
    """
    path = []
    label = []
    for img_class in os.listdir(base_dir):
        label_path = os.path.join(base_dir, img_class)
        # Ignore unknown classes and stray files sitting next to the class folders.
        if img_class not in class_names or not os.path.isdir(label_path):
            continue
        for img in os.listdir(label_path):
            path.append(os.path.join(label_path, img))
            label.append(img_class)

    if include_unlabelled:
        print("Loading unlabelled data...")
        unlabelled_paths = [
            os.path.join(unlabelled_dir, tag) for tag in os.listdir(unlabelled_dir)
        ]
        # Show a single sample path as a sanity check of the location.
        if unlabelled_paths:
            print(f"Example: {unlabelled_paths[0]}")

        path.extend(unlabelled_paths)
        label.extend(['-1'] * len(unlabelled_paths))

    return pd.DataFrame({'Path': path, 'Label': label})

# Root folders of the labelled training and validation images.
train_base_dir, validation_base_dir = "../data/seg_train", "../data/seg_test"

# Only the training frame is extended with the unlabelled images.
train_img_data = dataframer(base_dir=train_base_dir, include_unlabelled=True)
validation_img_data = dataframer(base_dir=validation_base_dir)

# Image size shared by `preprocessing` and both `flow_from_dataframe` calls.
target_size = 150, 150


def preprocessing(arr):
    """Turn one RGB image into a greyscale image that still has 3 channels.

    The image is converted to greyscale, resized to the module-level
    ``target_size`` and the grey channel is then repeated three times.
    ``ImageDataGenerator`` documents that a ``preprocessing_function`` must
    return an image of the same shape as its input, so the channel count is
    restored instead of returning a single-channel tensor.

    Parameters
    ----------
    arr : array-like
        One RGB image (last dimension of size 3).

    Returns
    -------
    tf.Tensor
        Greyscale image of spatial size ``target_size`` with 3 identical channels.
    """
    arr = tf.image.rgb_to_grayscale(arr)
    arr = tf.image.resize(arr, target_size)
    # Repeat the grey channel so the output keeps the 3-channel input shape.
    arr = tf.image.grayscale_to_rgb(arr)
    return arr

# Image generators. No random augmentation is configured: both generators
# only apply `preprocessing` and rescale pixel values by 1/255.
# NOTE(review): Keras documents `resnet_v2.preprocess_input` (scaling to
# [-1, 1]) as the input preprocessing for ResNet50V2; here pixels are rescaled
# to [0, 1] instead -- confirm this is intended.
train_datagen = ImageDataGenerator(
    rescale = 1/255,
    preprocessing_function=preprocessing
)

# Validation images go through the same preprocessing as the training images,
# so both sets of embeddings are computed from the same kind of input.
validation_datagen = ImageDataGenerator(
    rescale = 1/255,
    preprocessing_function=preprocessing
)

# Batched iterators over both dataframes.
batch_size = 360

# Arguments common to both iterators. Shuffling is off in both -- presumably
# so the predicted feature rows stay aligned with the dataframe rows.
_shared_flow_args = dict(
    x_col="Path",
    y_col="Label",
    target_size=target_size,
    batch_size=batch_size,
    class_mode="categorical",
    shuffle=False,
)

train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_img_data,
    subset="training",
    seed=10,
    **_shared_flow_args,
)

validation_generator = validation_datagen.flow_from_dataframe(
    dataframe=validation_img_data,
    **_shared_flow_args,
)

# Feature extractor: ResNet50V2 without its classification head, using the
# 'avg' pooling option on the output.
resnet = ResNet50V2(pooling='avg', include_top=False)

# Keep every backbone weight frozen.
resnet.trainable = False

# Compile with the Adam optimizer, categorical cross-entropy loss and an
# accuracy metric.
resnet.compile(
    loss="categorical_crossentropy",
    optimizer='adam',
    metrics=["accuracy"],
)

Loading unlabelled data...
Example: ../data/seg_pred\10004.jpg
Found 21335 validated image filenames belonging to 7 classes.
Found 3000 validated image filenames belonging to 6 classes.


In [43]:
from sklearn import preprocessing

# NOTE(review): the import above rebinds `preprocessing` (the image
# preprocessing function defined in an earlier cell) to the scikit-learn
# module, and the dict built below rebinds `labels` (the class-name list
# defined in an earlier cell). Re-run that earlier cell before reusing either
# name with its original meaning.

# Encode the string labels as integers. The encoder is fitted on the training
# labels, which also contain the unlabelled marker '-1'.
le = preprocessing.LabelEncoder()
train_labels = le.fit_transform(train_img_data["Label"])
validation_labels = le.transform(validation_img_data["Label"])

# All classes known to the encoder, ordered by encoded index. Taken from the
# encoder itself rather than from a hard-coded index list, so it stays correct
# if the set of classes changes. Copied so that later edits to `cats` cannot
# alter the encoder's state.
cats = le.classes_.copy()

# Make sure the output directory exists before writing the label bundle.
os.makedirs("../models", exist_ok=True)
with open("../models/semisup_labels.pickle", "wb") as handle:
    labels = {"train": train_labels, "validation": validation_labels, "categorical": cats}
    pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [44]:
cats

array(['-1', 'buildings', 'forest', 'glacier', 'mountain', 'sea',
       'street'], dtype=object)

In [45]:
def extract_features(model, data_gen):
    """Return whatever ``model.predict`` produces for ``data_gen``.

    Parameters
    ----------
    model : object exposing a ``predict`` method
    data_gen : the input handed to ``model.predict`` unchanged
    """
    return model.predict(data_gen)


# Run the backbone over both generators, training set first.
train_feats, validation_feats = [
    extract_features(resnet, gen)
    for gen in (train_generator, validation_generator)
]



In [46]:
# Persist both embedding arrays, training set first.
for feats, out_path in (
    (train_feats, '../models/semisup_embedding_train.pickle'),
    (validation_feats, '../models/semisup_embedding_validation.pickle'),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(feats, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [47]:
def _load_pickle(file_path):
    """Return the object stored in the pickle file at ``file_path``."""
    # pickle can execute arbitrary code on load: only read files you trust.
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


# Reload the embeddings written by the previous cell.
train_feats = _load_pickle('../models/semisup_embedding_train.pickle')
validation_feats = _load_pickle('../models/semisup_embedding_validation.pickle')

The unlabelled images must be given the label value -1, as required by `scikit-learn`'s semi-supervised estimators.

At this point, all the embeddings are stored (they only need to be loaded with the cell above), but we still need to enforce the above convention on the label values.

In [48]:
train_feats.shape

(21335, 2048)

In [49]:
train_labels.shape

(21335,)

In [50]:
validation_feats.shape

(3000, 2048)

In [51]:
validation_labels.shape

(3000,)

Use only the same models that were used for the ensemble:

1. k-Nearest Neighbours
2. Logistic Regression
3. Huber Loss Classifier (SGD + Modified Huber Loss)

Apply probability calibration, as recommended in the scikit-learn documentation:
- [Semi-supervised learning](https://scikit-learn.org/stable/modules/semi_supervised.html#id4)
- [Probability calibration](https://scikit-learn.org/stable/modules/calibration.html#calibration)
- [Probability calibration example](https://scikit-learn.org/stable/auto_examples/calibration/plot_calibration_multiclass.html#sphx-glr-auto-examples-calibration-plot-calibration-multiclass-py)