In [1]:
import ast
import datetime
import os
import random as python_random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.under_sampling import RandomUnderSampler
from keras import Input
from keras import backend as K
from keras import optimizers
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.layers import (Conv2D, Dense, Dropout, Flatten,
                          MaxPooling2D)
from keras.models import Sequential
from keras_preprocessing.image import ImageDataGenerator
from sklearn.metrics import (accuracy_score, classification_report, f1_score,
                             multilabel_confusion_matrix)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from talos.utils import early_stopper
from talos.utils.best_model import activate_model
from tensorflow.keras.utils import plot_model

import talos



# Seed NumPy, Python's `random` module and TensorFlow with the same fixed
# value so that repeated runs start from the same random state.
np.random.seed(8)
python_random.seed(8)
tf.random.set_seed(8)

# Turn off eager execution through the TF1 compatibility API.
tf.compat.v1.disable_eager_execution() # May help speed because we are using generators with Talos

# Report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


## Some helper functions

In [None]:
def order_label_pivot(ds):
    """Count rows per (order, label-set) combination and append a "Total" row.

    Args:
        ds: DataFrame with at least the columns ``labels`` and ``order``.
            It is left untouched.

    Returns:
        DataFrame indexed by ``order`` with one column per distinct
        ``str(labels)`` value. Each cell holds the number of rows with that
        combination (0 where it does not occur); the last row, labelled
        ``"Total"``, holds the per-column sums.
    """
    # Stringify into a local Series instead of assigning back to `ds.labels`,
    # so the caller's label column (e.g. parsed lists) keeps its content.
    label_keys = ds['labels'].astype(str)
    # crosstab counts rows directly; no separate value column is required.
    ds_balance = pd.crosstab(ds['order'], label_keys)
    ds_balance.loc["Total"] = ds_balance.sum()
    return ds_balance

In [None]:
def plot_history(history, run):
    """Plot the training curves of one run, save them as a PDF and show them.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value sequences (the object returned by ``model.fit``).
        run: identifier used as the prefix of the output file name.

    The figure is written to ``plots/<run>_<YYYYmmdd-HHMMSS>.pdf``; the
    ``plots`` directory is created when it does not exist.
    """
    plot_path = Path("plots/" + str(run) + "_" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + ".pdf")
    # savefig does not create missing directories.
    plot_path.parent.mkdir(parents=True, exist_ok=True)

    # (history key, legend label). The loss is whatever the model was
    # compiled with, so it is labelled generically as "loss".
    curves = [
        ('loss', 'Train loss'),
        ('val_loss', 'Val loss'),
        ('f1score', 'Train f1'),
        ('val_f1score', 'Val f1'),
        ('acc', 'Train Acc'),
        ('val_acc', 'Val Acc'),
    ]

    fig, ax = plt.subplots()
    for key, label in curves:
        # This runs after training has finished; a metric that was not
        # recorded must not abort the run, so it is simply left out.
        if key in history.history:
            ax.plot(history.history[key], label=label)
    ax.set_title('Loss, F1 and Accuracy for CNN')
    ax.set_ylabel('Value')
    ax.set_xlabel('No. epoch')
    ax.legend(loc="upper left")
    fig.savefig(plot_path, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

In [None]:
def f1_metric(y_true, y_pred):
    """F1 score computed with Keras backend ops.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values.

    Returns:
        Tensor holding ``2 * precision * recall / (precision + recall)``,
        with ``K.epsilon()`` added to every denominator to avoid division
        by zero.
    """
    def _count_positives(tensor):
        # Clip to [0, 1], round to the nearest integer, then sum.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    eps = K.epsilon()
    hits = _count_positives(y_true * y_pred)
    actual = _count_positives(y_true)
    predicted = _count_positives(y_pred)

    precision = hits / (predicted + eps)
    recall = hits / (actual + eps)
    return 2*(precision*recall)/(precision+recall+eps)

In [None]:
def make_generators(train, val, test, ds_dir, hp):
    """Create the train, validation and test image generators.

    Args:
        train, val, test: DataFrames with a ``filename`` and a ``labels``
            column.
        ds_dir: directory the file names are resolved against.
        hp: dict providing ``rotation``, ``input_shape``, ``batch_size``
            and ``color_mode``.

    Returns:
        ``(train_gen, val_gen, test_gen)``. All three rescale pixel values
        by 1/255. Only the training generator applies the optional random
        quarter-turn rotation, and only the test generator is created with
        ``shuffle=False``.
    """
    def orthogonal_rot(image):
        # Rotate by k quarter turns, k drawn at random from {-1, 0, 1}.
        return np.rot90(image, np.random.choice([-1, 0, 1]))

    augmenting = ImageDataGenerator(
        rescale=1./255.,
        preprocessing_function=orthogonal_rot if hp['rotation'] else None,
        width_shift_range=None,   # hp['width_shift'],
        height_shift_range=None,  # hp['height_shift']
    )
    plain = ImageDataGenerator(rescale=1./255.)

    # Arguments shared by every flow_from_dataframe call below.
    flow_kwargs = dict(
        directory=ds_dir,
        x_col='filename',
        y_col='labels',
        class_mode='categorical',
        target_size=hp['input_shape'][:2],
        batch_size=hp['batch_size'],
        color_mode=hp['color_mode'],
    )

    train_gen = augmenting.flow_from_dataframe(train, **flow_kwargs)
    val_gen = plain.flow_from_dataframe(val, **flow_kwargs)
    test_gen = plain.flow_from_dataframe(test, shuffle=False, **flow_kwargs)

    return train_gen, val_gen, test_gen

## Model and dataset creation functions

In [None]:
def gen_dataset(DF_PATH, hp, undersample):
    """Load the label CSV and split it into train, validation and test frames.

    Args:
        DF_PATH: path of a CSV with the columns ``filename``, ``labels``
            and ``order``.
        hp: dict providing ``samples``, the number of rows to draw.
        undersample: when true, balance the rows with ``RandomUnderSampler``
            (keyed on the stringified labels) before drawing the sample.

    Returns:
        ``(TRAIN, VAL, TEST)`` DataFrames holding 80 % / 10 % / 10 % of the
        drawn rows. Sampling and both splits use the fixed seed 8.
    """
    ds = pd.read_csv(DF_PATH, usecols=['filename', 'labels', 'order'])
    ds = ds.dropna(subset=['labels'])
    # Labels are stored as Python literals in the CSV; parse them back.
    ds.labels = ds.labels.apply(ast.literal_eval)
    print(f"Original: {ds.shape}")

    if undersample:
        # Seeded like the sampling and the splits below, so an under-sampled
        # run is repeatable too.
        rus = RandomUnderSampler(sampling_strategy='all', random_state=8)
        ds, _ = rus.fit_resample(ds, ds.labels.astype(str))

    # DataFrame.sample raises when asked for more rows than are available
    # (rows may have been dropped or under-sampled above), so cap the request.
    n_samples = min(hp["samples"], len(ds))
    if n_samples < hp["samples"]:
        print(f"Requested {hp['samples']} samples but only {n_samples} are available")
    ds = ds.sample(n_samples, random_state=8)
    # ds_balance = order_label_pivot(ds)
    # print(ds_balance)
    TRAIN, val_test = train_test_split(ds, test_size=0.2, random_state=8)
    VAL, TEST = train_test_split(val_test, test_size=0.5, random_state=8)
    print(f"Train: {TRAIN.shape}\nVal: {VAL.shape}\nTest: {TEST.shape}")
    return TRAIN, VAL, TEST

In [None]:
def input_model(hp, output_shape=2):
    """Build and compile the CNN described by the hyper-parameter dict.

    Args:
        hp: dict providing ``input_shape``, ``hidden_layers``,
            ``conv_layers``, ``first_layer``, ``kernel_initializer``,
            ``dropout``, ``dense_layers``, ``dense`` and ``lr``.
        output_shape: number of sigmoid output units.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy and Talos F1 metrics).
    """
    initializer = hp['kernel_initializer']

    model = Sequential()
    model.add(Input(shape=(hp['input_shape'])))

    # Each hidden block: `conv_layers` convolutions, then pooling and dropout.
    for _block in range(hp['hidden_layers']):
        # NOTE(review): the filter count scales with the convolution's
        # position inside the block (1x, 2x, ... of `first_layer`), not with
        # the block depth, so every block has the same filter sizes -
        # confirm this is intended.
        for conv_idx in range(1, hp['conv_layers']+1):
            model.add(Conv2D(
                conv_idx*hp['first_layer'],
                (3, 3),
                activation='relu',
                kernel_initializer=initializer,
                padding='same',
            ))
        model.add(MaxPooling2D((2, 2)))
        model.add(Dropout(hp['dropout']))

    model.add(Flatten())
    for _dense in range(hp['dense_layers']):
        model.add(Dense(hp['dense'], activation='relu', kernel_initializer=initializer))
    model.add(Dense(output_shape, activation='sigmoid'))

    model.compile(
        optimizer=optimizers.Adam(learning_rate=hp['lr']),
        loss='binary_crossentropy',
        metrics=["acc", talos.utils.metrics.f1score],
    )
    return model

In [None]:
def wrapper(x, y, x_val, y_val, hp):
    """Model function passed to ``talos.Scan``: build, train and plot one run.

    Args:
        x, y, x_val, y_val: not used here; the training data is read from
            ``DF_PATH`` / ``DS_DIR`` through generators instead.
        hp: hyper-parameter dict for this run (see ``input_model``,
            ``gen_dataset`` and ``make_generators`` for the keys read), plus
            ``epochs`` and ``experiment``.

    Returns:
        ``(out, model)``: the history object returned by ``model.fit`` and
        the trained model.
    """
    # NOTE(review): machine-specific absolute paths; they cannot be passed in
    # because the signature is fixed by the Talos call.
    DS_DIR = Path('c:/Users/flori/download/subset')
    DF_PATH = Path('C:/Users/flori/OneDrive/Documents/Uni/8_Master_thesis/code/subset_logs/20210518-001138.csv')

    model = input_model(hp)
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only adds a stray "None" line.
    model.summary()
    # Make sure the output directory for the architecture plot exists.
    Path('plots').mkdir(parents=True, exist_ok=True)
    plot_model(model, to_file= 'plots/model_plot_{}.pdf'.format(hp['experiment']), show_shapes=True, show_layer_names=False)
    train_df, val_df, test_df = gen_dataset(DF_PATH, hp, undersample=False)
    train_gen, val_gen, test_gen = make_generators(train_df, val_df, test_df, DS_DIR, hp)
    es = tf.keras.callbacks.EarlyStopping(monitor='val_acc', patience=2, verbose=1, mode='max')
    # save_best_only=True makes `monitor`/`mode` effective: without it the
    # file is overwritten every epoch and ends up holding the last epoch
    # (which early stopping places after the best one), not the best.
    mc = ModelCheckpoint('model_tuned_2_224902.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

    out = model.fit(
        train_gen,
        validation_data=val_gen,
        epochs=hp['epochs'],
        verbose=1,
        callbacks=[es, mc],
        workers=8,
        max_queue_size=16,
    )

    plot_history(out, hp['experiment'])
    return out, model

## Run the model

In [None]:
# Number of rows drawn from the label CSV and the epoch budget per run.
samples = 281_128
epochs = 10

# Talos parameter space for the tuned ("final") run. Every value is a
# one-element list; the trailing comments record the values that were
# explored during tuning.
params = {
    "experiment": ["final"],
    "samples": [samples],
    "epochs": [epochs],
    "batch_size": [32],                         # [16, 32, 64]
    "rotation": [True],                         # [True, False]
    "width_shift": [0],                         # [0, 0.1, 0.2],
    "height_shift": [0],                        # [0, 0.1, 0.2],
    "input_shape": [(128, 128, 1)],             # [(64,64,1), (128,128,1)]
    "color_mode": ["grayscale"],                # ["rgb", "grayscale"]

    "lr": [0.001],                              # [0.0001, 0.001, 0.01]
    "hidden_layers": [3],                       # [2, 3, 4]
    "kernel_initializer": ["glorot_uniform"],   # [he_uniform, glorot_uniform, he_normal]
    "conv_layers": [2],                         # [1,2]
    "dropout": [0.3],                           # [.2, .3, .4]
    "dense_layers": [1],                        # [1,2]
    "dense": [1024],                            # [256, 512, 1024, 2048]
    "first_layer": [32],                        # [16, 32, 48, 64]
}

# Parameter space of the baseline configuration (same keys as `params`).
params_baseline = {
    "experiment": ["baseline"],
    "samples": [samples],
    "epochs": [epochs],
    "batch_size": [32],
    "rotation": [True],
    "width_shift": [0.2],
    "height_shift": [0.2],
    "input_shape": [(64, 64, 1)],
    "color_mode": ["grayscale"],

    "lr": [0.001],
    "hidden_layers": [3],
    "kernel_initializer": ["he_uniform"],
    "conv_layers": [1],
    "dropout": [0.3],
    "dense_layers": [3],
    "dense": [1024],
    "first_layer": [64],
}

# Placeholder arrays for the `x`/`y` arguments of talos.Scan; `wrapper`
# ignores them and reads its data through generators instead.
dummy_x, dummy_y = np.empty(1), np.empty(1)
# https://autonomio.github.io/talos/#/README?id=quick-start
# Run the Talos scan over `params`, calling `wrapper` for each run.
scan = talos.Scan(
    x=dummy_x,
    y=dummy_y,
    model=wrapper,
    params=params,
    experiment_name='talos/final',
    print_params=True,
    save_weights=True,
    clear_session=True
    # ,fraction_limit=.5
    )


In [None]:
# Retrieve the scan's best model ranked by validation F1
# (asc=False: presumably "higher is better" - confirm against the Talos docs).
model = scan.best_model(metric='val_f1score', asc=False)
# Saving to an HDF5 file does not create missing parent directories.
Path('models').mkdir(parents=True, exist_ok=True)
model.save('models/model_tuned.h5')

## Predict on the test set

In [7]:
# Restore the checkpoint written during training. The Talos F1 metric is a
# custom object, so it has to be supplied for deserialisation.
model_predict = tf.keras.models.load_model(
    'model_tuned_2_224902.h5',
    custom_objects={'f1score': talos.utils.metrics.f1score},
)

# NOTE(review): machine-specific absolute paths.
DS_DIR = Path('c:/Users/flori/download/subset')
DF_PATH = Path('C:/Users/flori/OneDrive/Documents/Uni/8_Master_thesis/code/subset_logs/20210518-001138.csv')
samples = 281128

# The keys that gen_dataset / make_generators read; plain values here rather
# than the one-element lists used in the Talos parameter space.
hp = dict(
    samples=samples,
    batch_size=32,               # [16, 32, 64]
    rotation=True,               # [True, False]
    width_shift=0,               # [0, 0.1, 0.2],
    height_shift=0,              # [0, 0.1, 0.2],
    input_shape=(128, 128, 1),   # [(64,64,1), (128,128,1)]
    color_mode="grayscale",      # ["rgb", "grayscale"]
)

In [None]:
# Recreate the data split and generators; gen_dataset uses fixed seeds, so
# for the same CSV and `samples` value the split is the same as in training.
train_df, val_df, test_df = gen_dataset(DF_PATH, hp, undersample=False)
train_gen, val_gen, test_gen = make_generators(train_df, val_df, test_df, DS_DIR, hp)
# NOTE(review): this rebinds `model_predict`, discarding any model loaded
# from disk under the same name, and it needs `scan` from the training cell
# to be alive in the kernel - confirm which model is meant to be evaluated.
model_predict = activate_model(scan, 0)

In [None]:
# Predict on the test generator.
# NOTE(review): steps=20 stops after 20 batches, so only part of the test set
# is scored; code that compares against the ground truth has to truncate it
# to len(y_pred). Confirm this limit is intentional.
preds = model_predict.predict(
    test_gen,
    steps=20,
    verbose=1,
    workers=8,
    max_queue_size=8)
# Round every output to the nearest integer and store it as int.
y_pred = np.rint(preds).astype(int)

In [None]:
y_pred = np.rint(preds).astype(int)

# test_gen.classes is expected to hold one list of class indices per sample
# (multi-label) - TODO confirm. The lists may differ in length, so they are
# handed to the binarizer as-is: wrapping them in np.array() fails for ragged
# input on recent NumPy versions.
# Fixing `classes` to all indices known to the generator keeps the indicator
# matrix at one column per class, in index order, even when a class does not
# occur in the test split.
n_classes = len(test_gen.class_indices)
y_true = MultiLabelBinarizer(classes=list(range(n_classes))).fit_transform(test_gen.classes)

# Prediction may cover only the first part of the generator; compare like
# with like.
y_true_eval = y_true[:len(y_pred)]
print(classification_report(y_true_eval, y_pred))
print(multilabel_confusion_matrix(y_true_eval, y_pred))
print(accuracy_score(y_true_eval, y_pred))
print(f1_score(y_true_eval, y_pred, average='samples'))

In [None]:
# Table of the predicted files with their predicted and true label vectors.
n_predicted = len(y_pred)
filenames = test_gen.filenames[:n_predicted]
pred_df = pd.DataFrame({'filenames': filenames}).assign(
    # Series are aligned on the row index, so surplus y_true rows are dropped.
    y_pred=pd.Series(list(y_pred)),
    y_true=pd.Series(list(y_true)),
)
# Compare the string form of the two vectors row by row.
pred_df['correct'] = pred_df['y_pred'].astype(str) == pred_df['y_true'].astype(str)
wrong_preds = pred_df[~pred_df['correct']]
# wrong_preds.to_csv('models/wrong_preds_test_25000.csv')
wrong_preds

## Predict on the rest of the original dataset

In [2]:
# Restore the checkpoint written during training. The Talos F1 metric is a
# custom object, so it has to be supplied for deserialisation.
model_predict = tf.keras.models.load_model(
    'model_tuned_2_224902.h5',
    custom_objects={'f1score': talos.utils.metrics.f1score},
)

# NOTE(review): machine-specific absolute paths.
DS_DIR = Path('c:/Users/flori/download/original')
DF_PATH = Path('C:/Users/flori/OneDrive/Documents/Uni/8_Master_thesis/code/subset_logs/20210518-001138.csv')
samples = 281128

# Generator settings; plain values rather than one-element lists.
hp = dict(
    samples=samples,
    batch_size=32,
    rotation=True,
    width_shift=0,
    height_shift=0,
    input_shape=(128, 128, 1),
    color_mode="grayscale",
)

In [3]:
DS_DIR = Path('c:/Users/flori/download/original')
# Every entry directly inside DS_DIR.
orignal_df = pd.DataFrame(os.listdir(DS_DIR), columns=['filename'])
# Only the first os.walk item (the top-level directory) is used; taking it
# with next() avoids traversing the whole tree just to discard the rest.
sorted_dir = pd.Series(next(os.walk(DS_DIR))[2], name='filename')
print(sorted_dir.shape)
# Inner join on the file name: keeps the entries that are top-level files.
ds_sorted = pd.merge(sorted_dir, orignal_df, left_on='filename', right_on='filename')
ds_sorted.shape

(213128,)


(213128, 1)

In [5]:
DS_DIR = Path('c:/Users/flori/download/original')
# `import PIL` alone does not load the Image submodule; import it explicitly
# so `PIL.Image` does not depend on another library having imported it first.
import PIL.Image


def check_image(path):
    """Return True if Pillow can open and verify the file at ``path``.

    Any exception raised while opening or verifying is printed and reported
    as ``False``; nothing is raised to the caller.
    """
    try:
        # The context manager closes the file even when verify() raises.
        with PIL.Image.open(path) as img:
            img.verify()
        return True
    except Exception as e:
        # Best-effort scan over a whole directory: report the problem and
        # mark the file as unusable instead of stopping.
        print(e)
        return False


orignal_df['check_image'] = orignal_df.filename.apply(lambda x: check_image(DS_DIR / x))
print(orignal_df.check_image.value_counts())
# Keep only the entries that passed the check, without the helper column.
orignal_df_checked = orignal_df[orignal_df.check_image == True].drop('check_image', axis=1)

cannot identify image file 'C:\\Users\\flori\\download\\original\\desktop.ini'
True     213128
False         1
Name: check_image, dtype: int64


In [23]:
# Generator over the files listed in `ds_sorted`, rescaling pixels by 1/255
# with no augmentation. shuffle=False keeps the batches in DataFrame order,
# so predictions can be matched back to file names by position.
# NOTE(review): class_mode='input' is normally meant for autoencoder-style
# targets; presumably used here only because no label column exists - confirm.
generator = ImageDataGenerator(rescale=1./255.) 
original_gen = generator.flow_from_dataframe(
    ds_sorted,
    directory=DS_DIR,
    class_mode='input',
    target_size=hp['input_shape'][:2],
    batch_size=hp['batch_size'],
    color_mode=hp['color_mode'],
    shuffle=False
)

Found 213128 validated image filenames.


In [29]:
from PIL import ImageFile
# Let Pillow load image files whose data is cut short instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# No `steps` limit: predict over the whole generator so there is exactly one
# prediction row per listed file. A fixed steps=20 would return only 20
# batches, which neither matches the recorded (213128, 2) result nor allows
# the predictions to be attached to the file table row by row.
preds = model_predict.predict(
    original_gen,
    verbose=1,
    workers=8,
    max_queue_size=8)
# Round every output to the nearest integer and store it as int.
y_pred = np.rint(preds).astype(int)
print(y_pred.shape)

(213128, 2)


In [31]:
# Write the rounded predictions to a comma-separated text file, one row per
# prediction, so they can be reloaded without re-running the model.
np.savetxt('predictions_original.csv', y_pred, delimiter=',')

In [2]:

DS_DIR = Path('c:/Users/flori/download/original')
# Every entry directly inside DS_DIR.
orignal_df = pd.DataFrame(os.listdir(DS_DIR), columns=['filename'])
# Only the first os.walk item (the top-level directory) is used; taking it
# with next() avoids traversing the whole tree just to discard the rest.
sorted_dir = pd.Series(next(os.walk(DS_DIR))[2], name='filename')
print(sorted_dir.shape)
# Inner join on the file name: keeps the entries that are top-level files.
ds_sorted = pd.merge(sorted_dir, orignal_df, left_on='filename', right_on='filename')
ds_sorted.shape
# Reload the predictions saved earlier (read back as floats).
# NOTE(review): row i is assumed to belong to row i of `ds_sorted`, which
# only holds if the directory listing is unchanged since prediction time.
y_pred = np.genfromtxt('predictions_original.csv', delimiter=',')

(213128,)


In [4]:
# Predictions are matched to files purely by position, so the two must have
# the same number of rows.
if len(y_pred) != len(ds_sorted):
    raise ValueError(
        f"{len(y_pred)} predictions for {len(ds_sorted)} files; "
        "re-run the prediction over the complete file list"
    )
ds_sorted['pred'] = list(y_pred)
# One row per file: the file name next to its prediction vector. (Passing the
# raw prediction matrix with these two column names would label the first
# class column as 'filename'.)
pred_df = ds_sorted[['filename']].assign(prediction=list(y_pred))
print(ds_sorted.pred.astype(str).value_counts())
# NOTE(review): the original filter compared the column with an empty list,
# which pandas rejects because the lengths differ. Assumed intent: files for
# which no label was predicted (all outputs rounded to 0) - confirm.
labels = ds_sorted[np.asarray(y_pred).sum(axis=1) == 0]
