## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, load_model
from tensorflow_addons.optimizers import AdamW, Lookahead

## Helper functions

In [2]:
def build_decoder(with_labels=True, img_size=64, ext='jpg'):
    """Build an image-decoding function for use with ``tf.data.Dataset.map``.

    Args:
        with_labels: When True, return a two-argument function that decodes a
            (source path, target path) pair into (noisy source image, clean
            target image). When False, return the single-path decoder.
        img_size: Side length, in pixels, of the square image produced by the
            bicubic resize.
        ext: Image file format; one of 'png', 'jpg' or 'jpeg'.

    Returns:
        ``decode_with_labels`` if ``with_labels`` is truthy, otherwise ``decode``.

    Raises:
        ValueError: If ``ext`` is not a supported extension. Raised when the
            decoder is built, so a bad extension fails before any dataset is
            traced.
    """
    if ext not in ('png', 'jpg', 'jpeg'):
        raise ValueError("Image extension not supported")

    def decode(path, add_noise=True):
        """Read one image file and return it as a float32 tensor scaled by 1/255."""
        file_bytes = tf.io.read_file(path)

        if ext == 'png':
            img = tf.image.decode_png(file_bytes, channels=3)
        else:
            img = tf.image.decode_jpeg(file_bytes, channels=3)

        img = tf.cast(img, tf.float32) / 255.0
        img = tf.image.resize(img, (img_size, img_size), method="bicubic")

        if add_noise:
            # Sample the noise with a TensorFlow op so a fresh tensor is drawn
            # for every image. A NumPy call here would execute only once, when
            # tf.data traces this function, and the same constant noise would
            # then be added to every element of the dataset.
            noise = tf.random.normal(tf.shape(img), mean=0.1, stddev=0.2,
                                     dtype=img.dtype)
            img = tf.clip_by_value(img + noise, 0.0, 1.0)

        return img

    def decode_with_labels(src_img_path, tgt_img_path):
        """Decode a (noisy input, clean target) image pair."""
        return decode(src_img_path), decode(tgt_img_path, add_noise=False)

    return decode_with_labels if with_labels else decode

In [3]:
def build_dataset(paths, labels=None,
                  bsize=32, decode_fn=None,
                  repeat=True, shuffle=256):
    """Assemble a batched, prefetched ``tf.data.Dataset`` from image paths.

    Args:
        paths: Sequence of image file paths.
        labels: Optional second sequence sliced in parallel with ``paths``.
            When given, each element passed to ``decode_fn`` is a
            ``(path, label)`` pair.
        bsize: Batch size.
        decode_fn: Function mapped over every element. When None, a decoder
            is built with ``build_decoder`` (default image size and
            extension), in paired mode if ``labels`` is provided. Note that
            the paired decoder reads both elements as image files.
        repeat: When truthy, repeat the dataset indefinitely.
        shuffle: Shuffle buffer size; a falsy value disables shuffling.

    Returns:
        The mapped, optionally repeated/shuffled, batched and prefetched
        dataset.
    """
    # Previously a missing decode_fn was handed straight to Dataset.map and
    # failed with an opaque error; fall back to the notebook's own decoder.
    if decode_fn is None:
        decode_fn = build_decoder(with_labels=labels is not None)

    AUTO = tf.data.experimental.AUTOTUNE
    slices = paths if labels is None else (paths, labels)

    dset = tf.data.Dataset.from_tensor_slices(slices)
    dset = dset.map(decode_fn, num_parallel_calls=AUTO)
    if repeat:
        dset = dset.repeat()
    if shuffle:
        dset = dset.shuffle(shuffle)
    return dset.batch(bsize).prefetch(AUTO)

## Load autoencoder model

In [4]:
# Load the trained autoencoder and expose the output of its 'Embedding'
# layer as a stand-alone feature extractor.
autoencoder = load_model('../input/he-ffi-dae-model-v2-3-tpu/DAE_model.h5')
embedding_output = autoencoder.get_layer('Embedding').output
feature_model = Model(inputs=autoencoder.input, outputs=embedding_output)
feature_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 512, 512, 3)]     0         
_________________________________________________________________
conv2d (Conv2D)              (None, 256, 256, 32)      896       
_________________________________________________________________
batch_normalization (BatchNo (None, 256, 256, 32)      128       
_________________________________________________________________
activation (Activation)      (None, 256, 256, 32)      0         
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 128, 128, 32)      0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 64, 64, 64)        18496     
_________________________________________________________________
batch_normalization_1 (Batch (None, 64, 64, 64)        256   

## Prepare data

In [5]:
# NOTE(security): unpickling can execute arbitrary code; only load this file
# from a trusted source.
# Stream directly from the file handle instead of reading the whole file into
# a separate bytes buffer that would stay alive for the rest of the session.
with open("../input/he-ffi-preprocess-data/HE_FFI_Dataset.txt", 'rb') as handle:
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# The container dict is no longer needed once both frames are bound.
del processed_data
gc.collect()

train_df.shape, test_df.shape

((1382, 157), (600, 156))

In [6]:
# Index both frames by image path (under the name 'id') while keeping the
# original 'Image_path' column available.
for frame in (train_df, test_df):
    frame['id'] = frame['Image_path']
    frame.set_index("id", inplace=True)

train_df.head()

Unnamed: 0_level_0,Insurance_company,Min_coverage,Max_coverage,Condition,Parent_company,Child_company,expiry_dt_year,expiry_dt_quarter,expiry_dt_month,expiry_dt_week,...,clusters__6,clusters__7,clusters__8,clusters__9,clusters__10,clusters__11,clusters__12,clusters__13,clusters__14,Image_path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
../input/he-fast-furious-insured/dataset/trainImages/img_4513976.jpg,6.0,0.411534,1.895378,0.0,1.0,4.0,2026.0,4.0,12.0,1.318946,...,0,0,0,0,0,0,0,1,0,../input/he-fast-furious-insured/dataset/train...
../input/he-fast-furious-insured/dataset/trainImages/img_7764995.jpg,6.0,1.464186,1.104747,1.0,1.0,4.0,2025.0,3.0,7.0,0.040157,...,0,0,0,0,0,0,0,0,0,../input/he-fast-furious-insured/dataset/train...
../input/he-fast-furious-insured/dataset/trainImages/img_451308.jpg,0.0,1.212751,2.225731,0.0,0.0,5.0,2022.0,3.0,8.0,0.216204,...,0,0,0,0,0,0,0,1,0,../input/he-fast-furious-insured/dataset/train...
../input/he-fast-furious-insured/dataset/trainImages/img_7768372.jpg,0.0,-0.369558,-0.425228,1.0,0.0,5.0,2022.0,3.0,8.0,0.177827,...,0,0,0,0,0,0,0,0,0,../input/he-fast-furious-insured/dataset/train...
../input/he-fast-furious-insured/dataset/trainImages/img_7765274.jpg,2.0,-0.989697,-1.029375,1.0,0.0,2.0,2026.0,2.0,5.0,-0.461215,...,0,1,0,0,0,0,0,0,0,../input/he-fast-furious-insured/dataset/train...


In [7]:
# Image file paths for each split, in dataframe row order.
train_paths, test_paths = train_df['Image_path'], test_df['Image_path']
len(train_paths), len(test_paths)

(1382, 600)

In [8]:
# Resize to 512x512, matching the input shape shown in the model summary.
img_size = 512
mini_batch_size = 16

decoder = build_decoder(with_labels=True, img_size=img_size)

# Both splits use the same settings. Shuffling and repeating are disabled so
# the prediction order matches the dataframe row order.
dataset_kwargs = dict(
    bsize=mini_batch_size,
    repeat=False,
    shuffle=False,
    decode_fn=decoder,
)

# The paths are passed as both source and target, as the paired decoder expects.
dtrain = build_dataset(train_paths, train_paths, **dataset_kwargs)
dtest = build_dataset(test_paths, test_paths, **dataset_kwargs)

In [9]:
# Run every image through the feature extractor; rows come back in dataset
# order, which is the dataframe row order (no shuffling was requested).
Xtrain_embed = feature_model.predict(dtrain, verbose=1)
Xtest_embed = feature_model.predict(dtest, verbose=1)

col_list = [f'dae_{i}' for i in range(Xtrain_embed.shape[1])]
Xtrain_embed_df = pd.DataFrame(Xtrain_embed, columns=col_list, index=train_df.index)
Xtest_embed_df = pd.DataFrame(Xtest_embed, columns=col_list, index=test_df.index)

# The embedding frames share the exact index of their source frames, so attach
# them column-wise. A key-based merge on 'id' would multiply rows if an image
# path ever appeared twice. Dropping any existing dae_* columns first keeps
# the cell safe to re-run.
train_df = pd.concat(
    [train_df.drop(columns=col_list, errors='ignore'), Xtrain_embed_df], axis=1
)
test_df = pd.concat(
    [test_df.drop(columns=col_list, errors='ignore'), Xtest_embed_df], axis=1
)

print("\n\ntrain_df: {} \ntest_df: {}".format(train_df.shape, test_df.shape))

# Release the large intermediates; the data now lives in train_df / test_df.
del Xtrain_embed
del Xtest_embed
del Xtrain_embed_df
del Xtest_embed_df
gc.collect()



train_df: (1382, 2205) 
test_df: (600, 2204)


4

In [10]:
# Preview the first rows of the training frame, now including the dae_* columns.
train_df.head()

Unnamed: 0_level_0,Insurance_company,Min_coverage,Max_coverage,Condition,Parent_company,Child_company,expiry_dt_year,expiry_dt_quarter,expiry_dt_month,expiry_dt_week,...,dae_2038,dae_2039,dae_2040,dae_2041,dae_2042,dae_2043,dae_2044,dae_2045,dae_2046,dae_2047
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
../input/he-fast-furious-insured/dataset/trainImages/img_4513976.jpg,6.0,0.411534,1.895378,0.0,1.0,4.0,2026.0,4.0,12.0,1.318946,...,-0.26365,-0.176882,-0.217598,-0.175477,0.033186,-0.269549,1.801367,2.530462,-0.237596,0.171023
../input/he-fast-furious-insured/dataset/trainImages/img_7764995.jpg,6.0,1.464186,1.104747,1.0,1.0,4.0,2025.0,3.0,7.0,0.040157,...,0.058405,1.275331,-0.158268,0.697313,0.360745,2.191928,0.163027,0.144619,-0.139716,0.323664
../input/he-fast-furious-insured/dataset/trainImages/img_451308.jpg,0.0,1.212751,2.225731,0.0,0.0,5.0,2022.0,3.0,8.0,0.216204,...,-0.278216,0.057924,0.733783,-0.238819,1.388735,-0.260438,-0.109063,0.204682,-0.26786,0.116413
../input/he-fast-furious-insured/dataset/trainImages/img_7768372.jpg,0.0,-0.369558,-0.425228,1.0,0.0,5.0,2022.0,3.0,8.0,0.177827,...,0.956077,-0.162246,-0.248758,-0.273115,-0.238521,0.193087,-0.061484,-0.24359,-0.217328,-0.073152
../input/he-fast-furious-insured/dataset/trainImages/img_7765274.jpg,2.0,-0.989697,-1.029375,1.0,0.0,2.0,2026.0,2.0,5.0,-0.461215,...,0.763553,1.018331,0.662348,0.184428,0.320125,0.438596,1.181544,0.435776,1.132121,-0.213422


In [11]:
# Preview the first rows of the test frame, now including the dae_* columns.
test_df.head()

Unnamed: 0_level_0,Insurance_company,Min_coverage,Max_coverage,Condition,Parent_company,Child_company,expiry_dt_year,expiry_dt_quarter,expiry_dt_month,expiry_dt_week,...,dae_2038,dae_2039,dae_2040,dae_2041,dae_2042,dae_2043,dae_2044,dae_2045,dae_2046,dae_2047
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
../input/he-fast-furious-insured/dataset/testImages/img_4538519.jpg,3.0,-1.873053,-1.895259,1.0,1.0,5.0,2025.0,2.0,4.0,-0.615025,...,-0.132521,-0.208475,-0.107623,0.578275,1.528087,-0.259153,0.627411,0.082578,-0.053445,0.220757
../input/he-fast-furious-insured/dataset/testImages/img_7766002.jpg,7.0,-0.89578,-0.93391,1.0,2.0,5.0,2028.0,3.0,8.0,0.307111,...,0.281249,-0.070836,-0.236844,-0.045239,-0.276376,0.574146,0.317211,-0.2358,0.153874,-0.236977
../input/he-fast-furious-insured/dataset/testImages/img_4637390.jpg,2.0,0.605955,0.468203,1.0,0.0,2.0,2023.0,4.0,11.0,1.194628,...,0.189035,-0.121746,0.174906,0.256447,0.31076,0.395528,0.139229,0.570269,0.330026,-0.0221
../input/he-fast-furious-insured/dataset/testImages/img_4516108.jpg,4.0,0.848075,0.667419,1.0,1.0,1.0,2028.0,1.0,2.0,-1.3402,...,-0.249979,0.051298,-0.278355,0.425848,-0.219857,0.032326,-0.129058,-0.024918,-0.15421,0.048197
../input/he-fast-furious-insured/dataset/testImages/img_4517008.jpg,4.0,0.334851,0.230353,1.0,1.0,1.0,2022.0,1.0,1.0,-5.199338,...,0.006022,0.007696,-0.218007,0.212004,0.072686,0.085695,0.373596,0.151468,-0.129987,0.666618


## Save the processed datasets

In [12]:
# Bundle both frames under the same keys the loading cell reads.
data_dict = {'train_df': train_df, 'test_df': test_df}

# The context manager closes the file even if pickling fails part-way.
with open("./HE_FFI_Dataset.txt", 'wb') as handle:
    pickle.dump(data_dict, handle)