## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, load_model
from tensorflow_addons.optimizers import AdamW, Lookahead

## Helper Functions

In [2]:
def build_decoder(with_labels=True, img_size=64, ext='jpg'):
    """Create an image-decoding function suitable for ``tf.data.Dataset.map``.

    Args:
        with_labels: If truthy, return a two-argument function that maps a
            (source path, target path) pair to a (noisy image, clean image)
            tuple. Otherwise return the single-image ``decode`` function.
        img_size: Side length, in pixels, of the square resized image.
        ext: Image file extension selecting the decoder: 'png', 'jpg' or
            'jpeg'.

    Returns:
        ``decode_with_labels`` if ``with_labels`` is truthy, else ``decode``.

    Raises:
        ValueError: If ``ext`` is not a supported extension. This is raised
            when the decoder is built rather than on its first use, so a bad
            configuration fails before any dataset is constructed.
    """
    # Resolve the byte decoder once, up front; this also validates `ext`.
    if ext == 'png':
        decode_bytes = tf.image.decode_png
    elif ext in ('jpg', 'jpeg'):
        decode_bytes = tf.image.decode_jpeg
    else:
        raise ValueError("Image extension not supported")

    def decode(path, add_noise=True):
        """Load the image at ``path`` as a resized 3-channel float32 tensor.

        Pixel values are scaled by 1/255 before resizing. When ``add_noise``
        is true, Gaussian noise (mean 0.1, stddev 0.2) is added and the
        result is clipped to [0, 1].
        """
        file_bytes = tf.io.read_file(path)
        img = decode_bytes(file_bytes, channels=3)

        img = tf.cast(img, tf.float32) / 255.0
        img = tf.image.resize(img, (img_size, img_size), method="bicubic")

        if add_noise:
            # Sample with a TensorFlow op, not NumPy: Dataset.map traces this
            # function once, so a NumPy draw would be baked into the graph as
            # a constant and every image would receive the same noise.
            noise = tf.random.normal(
                tf.shape(img), mean=0.1, stddev=0.2, dtype=img.dtype
            )
            img = tf.clip_by_value(tf.add(img, noise), 0, 1)

        return img

    def decode_with_labels(src_img_path, tgt_img_path):
        """Return (noisy source image, clean target image) for a path pair."""
        return decode(src_img_path), decode(tgt_img_path, add_noise=False)

    return decode_with_labels if with_labels else decode

In [3]:
def build_dataset(paths, labels=None, 
                  bsize=32, decode_fn=None, 
                  repeat=True, shuffle=256):
    """Build a batched, prefetched ``tf.data.Dataset`` from paths.

    Args:
        paths: Sliceable collection of inputs (image paths in this notebook).
        labels: Optional collection sliced in parallel with ``paths``; when
            given, each element is a ``(path, label)`` pair.
        bsize: Batch size.
        decode_fn: Callable applied to every element via ``Dataset.map``.
            ``None`` (the default) skips the mapping step, so the dataset
            yields the raw slices instead of failing inside ``map``.
        repeat: If truthy, repeat the dataset indefinitely.
        shuffle: Shuffle buffer size; any falsy value disables shuffling.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    AUTO = tf.data.experimental.AUTOTUNE
    slices = paths if labels is None else (paths, labels)

    dset = tf.data.Dataset.from_tensor_slices(slices)
    if decode_fn is not None:
        # Decode elements in parallel; the level of parallelism is autotuned.
        dset = dset.map(decode_fn, num_parallel_calls=AUTO)
    if repeat:
        dset = dset.repeat()
    if shuffle:
        dset = dset.shuffle(shuffle)
    return dset.batch(bsize).prefetch(AUTO)

## Load autoencoder model

In [4]:
# Restore the saved autoencoder from the Kaggle input directory.
autoencoder = load_model('../input/he-ffi-dae-model-v2-2/DAE_model.h5')

# Re-wire the network so it stops at the layer named 'Embedding':
# predicting with feature_model returns that layer's output.
feature_model = Model(
    inputs=autoencoder.input,
    outputs=autoencoder.get_layer(name='Embedding').output,
)
feature_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 384, 384, 3)]     0         
_________________________________________________________________
conv2d (Conv2D)              (None, 192, 192, 32)      896       
_________________________________________________________________
batch_normalization (BatchNo (None, 192, 192, 32)      128       
_________________________________________________________________
activation (Activation)      (None, 192, 192, 32)      0         
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 96, 96, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 48, 48, 64)        18496     
_________________________________________________________________
batch_normalization_1 (Batch (None, 48, 48, 64)        256   

## Prepare data

In [5]:
# NOTE(security): unpickling can execute arbitrary code embedded in the file;
# only load this dataset from a trusted source.
with open("../input/he-ffi-preprocess-data/HE_FFI_Dataset.txt", 'rb') as handle:
    # Deserialize straight from the handle so no module-level copy of the raw
    # bytes stays alive after this cell (which would defeat the gc below).
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# Drop the container dict; the two frames remain referenced by name.
del processed_data
gc.collect()

train_df.shape, test_df.shape

((1382, 157), (600, 156))

In [6]:
# Index both frames by image path under the name "id". The 'Image_path'
# column itself is kept, because only the temporary 'id' copy is moved
# into the index.
train_df = train_df.assign(id=train_df['Image_path']).set_index("id")
test_df = test_df.assign(id=test_df['Image_path']).set_index("id")
train_df.head()

Unnamed: 0_level_0,Insurance_company,Min_coverage,Max_coverage,Condition,Parent_company,Child_company,expiry_dt_year,expiry_dt_quarter,expiry_dt_month,expiry_dt_week,...,clusters__6,clusters__7,clusters__8,clusters__9,clusters__10,clusters__11,clusters__12,clusters__13,clusters__14,Image_path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
../input/he-fast-furious-insured/dataset/trainImages/img_4513976.jpg,6.0,0.411534,1.895378,0.0,1.0,4.0,2026.0,4.0,12.0,1.318946,...,0,0,0,0,0,0,0,1,0,../input/he-fast-furious-insured/dataset/train...
../input/he-fast-furious-insured/dataset/trainImages/img_7764995.jpg,6.0,1.464186,1.104747,1.0,1.0,4.0,2025.0,3.0,7.0,0.040157,...,0,0,0,0,0,0,0,0,0,../input/he-fast-furious-insured/dataset/train...
../input/he-fast-furious-insured/dataset/trainImages/img_451308.jpg,0.0,1.212751,2.225731,0.0,0.0,5.0,2022.0,3.0,8.0,0.216204,...,0,0,0,0,0,0,0,1,0,../input/he-fast-furious-insured/dataset/train...
../input/he-fast-furious-insured/dataset/trainImages/img_7768372.jpg,0.0,-0.369558,-0.425228,1.0,0.0,5.0,2022.0,3.0,8.0,0.177827,...,0,0,0,0,0,0,0,0,0,../input/he-fast-furious-insured/dataset/train...
../input/he-fast-furious-insured/dataset/trainImages/img_7765274.jpg,2.0,-0.989697,-1.029375,1.0,0.0,2.0,2026.0,2.0,5.0,-0.461215,...,0,1,0,0,0,0,0,0,0,../input/he-fast-furious-insured/dataset/train...


In [7]:
# Pull the image-path column out of each frame; these Series feed the
# tf.data pipelines built below.
train_paths, test_paths = (
    frame['Image_path'] for frame in (train_df, test_df)
)
len(train_paths), len(test_paths)

(1382, 600)

In [8]:
# 384 matches the (None, 384, 384, 3) input shown in the model summary above.
img_size = 384
mini_batch_size = 128

# These datasets are only consumed by feature_model.predict, which uses the
# model input and ignores any target. Building (input, target) pairs would
# decode and resize every image a second time just to discard it, so use the
# single-image decoder. It still applies its default input noise, exactly as
# the source branch of the paired decoder does.
decoder = build_decoder(with_labels=False, img_size=img_size)

# No repeat and no shuffle: predictions must come back once per row and in
# the same order as the DataFrame so they can be re-attached by index.
dtrain = build_dataset(
    train_paths,
    bsize=mini_batch_size, repeat=False,
    shuffle=False, decode_fn=decoder
)

dtest = build_dataset(
    test_paths,
    bsize=mini_batch_size, repeat=False,
    shuffle=False, decode_fn=decoder
)

In [9]:
# Run both splits through the truncated model to obtain one embedding row
# per image.
Xtrain_embed = feature_model.predict(dtrain, verbose=1)
Xtest_embed = feature_model.predict(dtest, verbose=1)

# One 'dae_<i>' column per embedding dimension.
col_list = ['dae_' + suffix for suffix in map(str, range(Xtrain_embed.shape[1]))]

# Wrap the arrays in frames that share the 'id' index of the tabular data.
Xtrain_embed_df = pd.DataFrame(
    data=Xtrain_embed, index=train_df.index, columns=col_list
)
Xtest_embed_df = pd.DataFrame(
    data=Xtest_embed, index=test_df.index, columns=col_list
)

# Attach the embedding columns to the existing features, matching on 'id'.
train_df = train_df.merge(Xtrain_embed_df, on='id', sort=False)
test_df = test_df.merge(Xtest_embed_df, on='id', sort=False)

print("\n\ntrain_df: {} \ntest_df: {}".format(train_df.shape, test_df.shape))

# The intermediates are no longer needed once merged; release them.
del Xtrain_embed, Xtest_embed, Xtrain_embed_df, Xtest_embed_df
gc.collect()



train_df: (1382, 1693) 
test_df: (600, 1692)


104

In [10]:
# Preview the first rows of the training frame after the merge.
train_df.head(n=5)

Unnamed: 0_level_0,Insurance_company,Min_coverage,Max_coverage,Condition,Parent_company,Child_company,expiry_dt_year,expiry_dt_quarter,expiry_dt_month,expiry_dt_week,...,dae_1526,dae_1527,dae_1528,dae_1529,dae_1530,dae_1531,dae_1532,dae_1533,dae_1534,dae_1535
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
../input/he-fast-furious-insured/dataset/trainImages/img_4513976.jpg,6.0,0.411534,1.895378,0.0,1.0,4.0,2026.0,4.0,12.0,1.318946,...,-0.212962,-0.155224,-0.165064,0.157809,-0.274866,-0.235475,0.291924,-0.273442,-0.119392,-0.273063
../input/he-fast-furious-insured/dataset/trainImages/img_7764995.jpg,6.0,1.464186,1.104747,1.0,1.0,4.0,2025.0,3.0,7.0,0.040157,...,0.303757,0.286932,-0.229121,-0.011811,-0.261355,0.13601,-0.040179,-0.145919,0.218217,-0.028747
../input/he-fast-furious-insured/dataset/trainImages/img_451308.jpg,0.0,1.212751,2.225731,0.0,0.0,5.0,2022.0,3.0,8.0,0.216204,...,-0.164636,-0.165039,-0.199889,-0.036386,-0.090873,-0.256736,0.228164,-0.021015,0.04524,-0.082554
../input/he-fast-furious-insured/dataset/trainImages/img_7768372.jpg,0.0,-0.369558,-0.425228,1.0,0.0,5.0,2022.0,3.0,8.0,0.177827,...,-0.254031,-0.189123,-0.099886,-0.277091,-0.268922,-0.06136,0.166065,1.531418,-0.277391,-0.191362
../input/he-fast-furious-insured/dataset/trainImages/img_7765274.jpg,2.0,-0.989697,-1.029375,1.0,0.0,2.0,2026.0,2.0,5.0,-0.461215,...,-0.221035,0.31783,-0.096522,0.113684,0.270596,-0.035356,0.183275,0.355658,0.044402,0.101078


In [11]:
# Preview the first rows of the test frame after the merge.
test_df.head(n=5)

Unnamed: 0_level_0,Insurance_company,Min_coverage,Max_coverage,Condition,Parent_company,Child_company,expiry_dt_year,expiry_dt_quarter,expiry_dt_month,expiry_dt_week,...,dae_1526,dae_1527,dae_1528,dae_1529,dae_1530,dae_1531,dae_1532,dae_1533,dae_1534,dae_1535
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
../input/he-fast-furious-insured/dataset/testImages/img_4538519.jpg,3.0,-1.873053,-1.895259,1.0,1.0,5.0,2025.0,2.0,4.0,-0.615025,...,-0.221904,-0.170043,-0.033982,0.816785,-0.005888,-0.18808,0.006128,-0.117917,-0.174629,-0.103205
../input/he-fast-furious-insured/dataset/testImages/img_7766002.jpg,7.0,-0.89578,-0.93391,1.0,2.0,5.0,2028.0,3.0,8.0,0.307111,...,-0.126698,0.037176,0.076595,-0.007312,-0.264694,-0.018877,2.475356,0.132535,-0.178443,-0.25015
../input/he-fast-furious-insured/dataset/testImages/img_4637390.jpg,2.0,0.605955,0.468203,1.0,0.0,2.0,2023.0,4.0,11.0,1.194628,...,-0.137336,0.16925,1.129882,0.52912,-0.161149,0.153779,0.559947,0.196964,0.025427,0.283643
../input/he-fast-furious-insured/dataset/testImages/img_4516108.jpg,4.0,0.848075,0.667419,1.0,1.0,1.0,2028.0,1.0,2.0,-1.3402,...,-0.252469,1.362725,-0.032899,-0.15739,0.089461,-0.205311,-0.263384,0.050781,-0.143166,0.068949
../input/he-fast-furious-insured/dataset/testImages/img_4517008.jpg,4.0,0.334851,0.230353,1.0,1.0,1.0,2022.0,1.0,1.0,-5.199338,...,-0.14382,0.184091,0.02199,0.215983,0.335759,-0.158006,0.609146,0.041074,0.348379,-0.183588


## Save the processed datasets

In [12]:
# Bundle both frames under the same keys the loading cell reads back.
data_dict = {
    'train_df': train_df,
    'test_df': test_df,
}

# The context manager closes the handle even if pickling raises, so a failed
# dump cannot leave the file open.
with open("./HE_FFI_Dataset.txt", 'wb') as file:
    pickle.dump(data_dict, file)