Notebook equivalent of `fit_ensemble.py`
---

* This program trains the ensemble of CNN models reported in https://link.springer.com/article/10.1007/s42113-020-00073-z
    * It trains a model/ensemble on a 180-image training set taken from a 360-image dataset
    * Makes predictions on
        1. the 90 images validation set (part of the same 360 images set)
        2. the 90 images test set (part of the same 360 images set)
        3. the 120 images set (a different set)
* Requires
    * File `mds_360.txt` with labels (*missing*)
    * Directory `360 Rocks/` with `*.jpg` images (*missing*)
    * File `mds_120.txt` with labels (*missing*)
    * Directory `120 Rocks/` with `*.jpg` images (*missing*)
* Available
    * Directory `120 Rock Images/` with 120 `*.png` images
    * Directory `Similarity Judgements Data/` with similarity labels for the "120 Rocks" set as individual textfiles for each of the 85 participants: `rocks_similarity_120_*.txt`
    * Directory `Categorization Data/` with category labels (1 = Igneous, 2 = Metamorphic, 4 = Mixed) for the "120 Rocks" set as individual textfiles for each of the 85 participants: `rocks_similarity_120_*_*.txt`
    * File `MDS/mds_120_supplemental_dims.txt`
* What to do?
    - **a)** Ask for missing files?  
    - **b)** Rewrite script to process what is available?  
        * using the `*.png` images in the `120 Rock Images/` directory
        * and using `mds_120_supplemental_dims.txt`  
    
    
#### **Update from 2022/05/31: The missing data can be found in this paper: https://link.springer.com/article/10.3758/s13428-017-0884-8**  
* https://osf.io/w64fv/

In [1]:
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.applications import resnet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras import backend as K

import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Edge length in pixels; images are resized to nPixels x nPixels when loaded.
nPixels = 224

Original:

In [2]:
# nTest = 90

Replacement:

In [3]:
# Number of images held out for the test split; the same number is held out
# again from the remaining data for the validation split.
nTest = 30

## Categories

Original:

In [4]:
# categories = [i for i in range(30) for j in range(12)] # creates 360 list items like so: [0, 0, 0, 0, ... 29, 29, 29, 29]

Replacement:

In [5]:
# 30 category ids, each repeated 4 times in a row -> 120 labels:
# [0, 0, 0, 0, ..., 29, 29, 29, 29]
categories = [category for category in range(30) for _ in range(4)]

## Functions

Original:

In [6]:
#def load_images(directory, nPixels, preprocesser):
#    """
#    Creates array-like data from a directory with image files for usage with Keras.
#    """
#    
#    X = []
#    for subdir, dirs, files in os.walk(directory):
#        for file in files:
#            if file.endswith(".jpg"):
#                img = load_img(os.path.join(subdir, file), target_size=(nPixels, nPixels))
#                x = img_to_array(img)
#                X.append(x)
#    X = np.stack(X)
#    X = preprocesser(X)
#    return X

Replacement:

In [7]:
def load_images(directory, nPixels, preprocesser):
    """
    Creates array-like data from a directory with image files for usage with Keras.

    The directory tree is searched recursively for ``.png`` files. Both the
    sub-directories and the file names are visited in sorted order, so the
    order of the images in the result is reproducible and does not depend on
    the file system.

    NOTE(review): callers pair the result row-by-row with labels loaded from a
    separate file; this assumes the label rows follow the same (sorted file
    name) order -- confirm against the label file.

    Parameters
    ----------
    directory : str
        Root directory to search for ``.png`` files.
    nPixels : int
        Every image is resized to ``(nPixels, nPixels)`` when it is loaded.
    preprocesser : callable
        Called once with all images stacked along a new first axis; its
        return value is returned unchanged.

    Returns
    -------
    The value returned by ``preprocesser`` for the stacked image array.

    Raises
    ------
    ValueError
        If no ``.png`` file is found under ``directory``.
    """

    X = []
    for subdir, dirs, files in os.walk(directory):
        # os.walk makes no promise about the order of its entries. Sorting
        # `dirs` in place fixes the order in which sub-directories are
        # visited; sorting `files` fixes the order inside each directory.
        dirs.sort()
        for file in sorted(files):
            if file.endswith(".png"):
                img = load_img(os.path.join(subdir, file), target_size=(nPixels, nPixels))
                x = img_to_array(img)
                X.append(x)
    if not X:
        # np.stack would also raise ValueError here, but with a message that
        # does not mention the directory or the expected file type.
        raise ValueError("No .png images found under {!r}".format(directory))
    X = np.stack(X)
    X = preprocesser(X)
    return X

## Prepare 360 Rocks data

Original:

In [8]:
## load image files
#X = load_images("360 Rocks", nPixels, lambda x: resnet50.preprocess_input(np.expand_dims(x, axis=0)).squeeze())
#
## load labels
#mds_360 = np.loadtxt("mds_360.txt") # missing
#
## split data: train vs test
#(X_train_, X_test, 
# Y_train_, Y_test, 
# categories_train_, categories_test) = train_test_split(X, 
#                                                        mds_360, 
#                                                        categories,
#                                                        test_size=nTest,
#                                                        stratify=categories, 
#                                                        random_state=0)
#
## split train set again: train vs validate
#(X_train, X_validate, 
# Y_train, Y_validate) = train_test_split(X_train_, 
#                                         Y_train_, 
#                                         test_size=nTest,
#                                         stratify=categories_train_, 
#                                         random_state=0)

Replacement is the available 120 Rocks images set in png format:

In [9]:
def _resnet_preprocess(images):
    """Apply the ResNet50 input preprocessing to a stacked image array."""
    batched = np.expand_dims(images, axis=0)
    return resnet50.preprocess_input(batched).squeeze()


# Images and their labels (the first line of the label file is skipped).
X = load_images("120 Rock Images", nPixels, _resnet_preprocess)
Y = np.loadtxt("MDS/mds_120_supplemental_dims.txt", skiprows=1)

# First split: hold out `nTest` images as the test set, stratified by category.
X_train_, X_test, Y_train_, Y_test, categories_train_, categories_test = train_test_split(
    X,
    Y,
    categories,
    test_size=nTest,
    stratify=categories,
    random_state=0,
)

# Second split: hold out another `nTest` images from the remainder as the
# validation set, again stratified by category.
X_train, X_validate, Y_train, Y_validate = train_test_split(
    X_train_,
    Y_train_,
    test_size=nTest,
    stratify=categories_train_,
    random_state=0,
)

## Prepare 120 Rocks data

No train/validation/test split is made for this set; it is used later only for testing.

Original:

In [10]:
## load image files
#X_120 = load_images("120 Rocks", nPixels, lambda x: resnet50.preprocess_input(np.expand_dims(x, axis=0)).squeeze())
#
## load labels
#Y_120 = np.loadtxt("mds_120.txt") # missing

No replacement; this step is simply left out.

## Hyperparameters

In [11]:
# Data augmentation: the normalisation/whitening options are switched off,
# the geometric transforms (rotation, shifts, shear, zoom, flips) are on.
_augmentation_options = {
    "featurewise_center": False,
    "samplewise_center": False,
    "featurewise_std_normalization": False,
    "samplewise_std_normalization": False,
    "zca_whitening": False,
    "rotation_range": 20,
    "width_shift_range": 0.2,
    "height_shift_range": 0.2,
    "shear_range": 0.2,
    "zoom_range": 0.2,
    "channel_shift_range": 0.0,
    "fill_mode": "nearest",
    "cval": 0.0,
    "horizontal_flip": True,
    "vertical_flip": True,
}
datagen = ImageDataGenerator(**_augmentation_options)

# Training schedule
nEpochs = 10      # epochs per training stage
nEnsemble = 2     # number of models in the ensemble

# Regression head on top of the pooled backbone output
nLayers = 2       # number of hidden Dense blocks
nDense = 256      # units per hidden Dense layer
dropout = 0.5     # dropout rate

# Learning rate for the initial training stage, stored as log10(lr)
loglr = -2.2200654426745987
lr = 10 ** loglr

Original:

In [12]:
# nDim = 8

Replacement:

In [13]:
nDim = 13 # we have 5 extra dimensions in our file mds_120_supplemental_dims.txt

Original:

In [14]:
# batch_size = 90

Replacement:

In [15]:
# Number of images per batch drawn from the augmenting generator during training.
batch_size = 30

## Train models

In [None]:
# Batch size for the fine-tuning stage. It lives in its own variable so that
# `batch_size` (used for the initial training stage) is not overwritten for
# the following ensemble members.
fine_tune_batch_size = 30  # reduce the batch size so that the gradients of all layers can fit in memory

for e in range(nEnsemble):
    # Build model: ResNet50 backbone without its top layers, average-pooled,
    # followed by a small fully connected regression head with nDim outputs.
    arch = resnet50.ResNet50(include_top=False, pooling='avg')
    for layer in arch.layers:
        layer.trainable = False  # stage 1 trains the new head only

    x = arch.output
    x = Dropout(dropout)(x)
    for lyr in range(nLayers):
        x = Dense(nDense, activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(dropout)(x)
    x = Dense(nDim)(x)

    model = Model(inputs=arch.input, outputs=x)

    # Stage 1: initial training with the backbone frozen
    model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=lr))

    # Only the best model seen so far is kept on disk.
    checkpoint1 = ModelCheckpoint('intermediate_model.hdf5', save_best_only=True)

    # steps_per_epoch must be a whole number of batches; round up so that a
    # final, smaller batch is still part of the epoch.
    steps1 = int(np.ceil(len(X_train) / batch_size))

    hist1 = model.fit(datagen.flow(X_train, Y_train, batch_size),
                      steps_per_epoch=steps1,
                      epochs=nEpochs,
                      validation_data=(X_validate, Y_validate),
                      callbacks=[checkpoint1],
                      verbose=False)

    # Stage 2: fine tuning, starting from the best stage-1 checkpoint
    model = load_model("intermediate_model.hdf5")

    for layer in model.layers:
        layer.trainable = True  # unfreeze everything, including the backbone

    # Re-compile so that the changed `trainable` flags take effect.
    model.compile(optimizer=SGD(learning_rate=0.0001, momentum=0.9), loss='mean_squared_error')

    checkpoint2 = ModelCheckpoint('ensemble_{}.hdf5'.format(e), save_best_only=True)

    steps2 = int(np.ceil(len(X_train) / fine_tune_batch_size))

    hist2 = model.fit(datagen.flow(X_train, Y_train, fine_tune_batch_size),
                      steps_per_epoch=steps2,
                      epochs=nEpochs,
                      validation_data=(X_validate, Y_validate),
                      callbacks=[checkpoint2],
                      verbose=False)

    K.clear_session()  # Clear tensorflow session to prevent memory issues

## Get predictions for validation and test sets

Original:

In [18]:
#validate_pred = np.zeros((nEnsemble, nTest, nDim))
#test_pred = np.zeros((nEnsemble, nTest, nDim))
#rocks_120_pred = np.zeros((nEnsemble, 120, nDim))
#
#for e in range(nEnsemble):
#    model = load_model("ensemble_{}.hdf5".format(e))
#    validate_pred[e,:] = model.predict(X_validate)
#    test_pred[e,:] = model.predict(X_test)
#    rocks_120_pred[e,:] = model.predict(X_120)
#    
#    K.clear_session()
#
#validate_prediction = np.mean(validate_pred, 0)
#test_prediction = np.mean(test_pred, 0)
#rocks_120_prediction = np.mean(rocks_120_pred, 0)

Replacement:

In [19]:
# One slice per ensemble member: (member, image, output dimension).
_pred_shape = (nEnsemble, nTest, nDim)
validate_pred = np.zeros(_pred_shape)
test_pred = np.zeros(_pred_shape)

for e in range(nEnsemble):
    # Load the fine-tuned model of this ensemble member and store its outputs.
    model = load_model("ensemble_{}.hdf5".format(e))
    validate_pred[e] = model.predict(X_validate)
    test_pred[e] = model.predict(X_test)

    K.clear_session()

# The ensemble prediction is the mean over the members.
validate_prediction = validate_pred.mean(axis=0)
test_prediction = test_pred.mean(axis=0)



## Get MSE

Original:

In [None]:
#print(mean_squared_error(Y_validate, validate_prediction))
#print(mean_squared_error(Y_test, test_prediction))
#print(mean_squared_error(Y_120, rocks_120_prediction))

Replacement:

In [20]:
# Mean squared error of the ensemble-averaged predictions against the labels:
# first line = validation split, second line = test split.
print(mean_squared_error(Y_validate, validate_prediction))
print(mean_squared_error(Y_test, test_prediction))

5.0210778242600975
4.748088967288659


## Get R²

Original:

In [None]:
#print(r2_score(Y_validate, validate_prediction))
#print(r2_score(Y_test, test_prediction))
#print(r2_score(Y_120, rocks_120_prediction))

Replacement:

In [21]:
# R² of the ensemble-averaged predictions against the labels:
# first line = validation split, second line = test split.
print(r2_score(Y_validate, validate_prediction))
print(r2_score(Y_test, test_prediction))

-1.2933656778110103
-0.7927527839019183
