## Import libraries

In [1]:
import gc
import pickle
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, load_model
from tensorflow_addons.optimizers import AdamW, Lookahead

## Helper Functions

In [2]:
def build_decoder(with_labels=True, img_size=64, ext='jpg'):
    """Build an image-decoding function for use with ``tf.data.Dataset.map``.

    Args:
        with_labels: If true, return a two-argument decoder that takes a
            (source path, target path) pair; otherwise return the
            single-path decoder.
        img_size: Side length, in pixels, of the square image produced by
            the resize step.
        ext: Image file extension. 'png', 'jpg' and 'jpeg' are supported.

    Returns:
        ``decode_with_labels`` when ``with_labels`` is true, else ``decode``.
    """
    def decode(path, add_noise=True):
        """Read one image file, scale it by 1/255 and resize it.

        Args:
            path: Path of the image file to read.
            add_noise: If true, add Gaussian noise (mean 0.1, stddev 0.2)
                and clip the result to [0, 1].

        Returns:
            A float32 image tensor of shape (img_size, img_size, 3).

        Raises:
            ValueError: If ``ext`` is not one of the supported extensions.
        """
        file_bytes = tf.io.read_file(path)

        if ext == 'png':
            img = tf.image.decode_png(file_bytes, channels=3)
        elif ext in ['jpg', 'jpeg']:
            img = tf.image.decode_jpeg(file_bytes, channels=3)
        else:
            raise ValueError("Image extension not supported")

        img = tf.cast(img, tf.float32) / 255.0
        img = tf.image.resize(img, (img_size, img_size), method="bicubic")

        if add_noise:
            # The noise must come from a TensorFlow op. tf.data traces this
            # function once into a graph, so a NumPy sample drawn here would
            # be frozen into a constant and the very same noise pattern
            # would be added to every image.
            noise = tf.random.normal(
                tf.shape(img), mean=0.1, stddev=0.2, dtype=img.dtype
            )
            img = tf.clip_by_value(img + noise, 0.0, 1.0)

        return img

    def decode_with_labels(src_img_path, tgt_img_path):
        """Decode a noised source image and a noise-free target image."""
        return decode(src_img_path), decode(tgt_img_path, add_noise=False)

    return decode_with_labels if with_labels else decode

In [3]:
def build_dataset(paths, labels=None, 
                  bsize=32, decode_fn=None, 
                  repeat=True, shuffle=256):
    """Build a batched, prefetched ``tf.data.Dataset`` from file paths.

    Args:
        paths: Sequence of paths; each one is passed to ``decode_fn``.
        labels: Optional sequence sliced alongside ``paths``. When given,
            ``decode_fn`` is called with ``(path, label)``.
        bsize: Batch size.
        decode_fn: Function mapped over every element. When ``None``, a
            decoder from ``build_decoder`` (default settings) is used, with
            or without labels to match ``labels``.
        repeat: If true, repeat the dataset indefinitely.
        shuffle: Shuffle buffer size; a falsy value disables shuffling.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    # The declared default of None cannot be handed to Dataset.map, so
    # substitute a decoder whose arity matches the slices built below.
    if decode_fn is None:
        decode_fn = build_decoder(labels is not None)

    AUTO = tf.data.experimental.AUTOTUNE
    slices = paths if labels is None else (paths, labels)

    dset = tf.data.Dataset.from_tensor_slices(slices)
    dset = dset.map(decode_fn, num_parallel_calls=AUTO)
    # NOTE(review): repeating before shuffling lets the shuffle buffer mix
    # elements across epoch boundaries; the order is kept as originally
    # written. Confirm whether this is intended.
    if repeat:
        dset = dset.repeat()
    if shuffle:
        dset = dset.shuffle(shuffle)
    return dset.batch(bsize).prefetch(AUTO)

## Load autoencoder model

In [4]:
# Load the saved autoencoder and expose the output of its 'Embedding' layer
# as a standalone model used for feature extraction.
autoencoder = load_model('../input/he-ffi-dae-model-v2/DAE_model.h5')
feature_model = Model(
    inputs=autoencoder.input,
    outputs=autoencoder.get_layer(name='Embedding').output,
)
feature_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 192, 192, 3)]     0         
_________________________________________________________________
conv2d (Conv2D)              (None, 96, 96, 32)        896       
_________________________________________________________________
batch_normalization (BatchNo (None, 96, 96, 32)        128       
_________________________________________________________________
activation (Activation)      (None, 96, 96, 32)        0         
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 48, 48, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 24, 24, 64)        18496     
_________________________________________________________________
batch_normalization_1 (Batch (None, 24, 24, 64)        256   

## Prepare data

In [5]:
# Load the dictionary of pre-processed frames straight from the file handle,
# so no module-level copy of the raw bytes stays alive after unpickling.
# NOTE: unpickling can execute arbitrary code; only load files from a
# trusted source.
with open("../input/he-ffi-preprocess-data/HE_FFI_Dataset.txt", 'rb') as handle:
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# Drop the container dict; the two frames stay referenced by name.
del processed_data
gc.collect()

train_df.shape, test_df.shape

((1382, 157), (600, 156))

In [6]:
# Index both frames by image path (index named 'id') while keeping the
# 'Image_path' column itself in place.
train_df = train_df.set_index(train_df['Image_path'].rename('id'))
test_df = test_df.set_index(test_df['Image_path'].rename('id'))
train_df.head()

Unnamed: 0_level_0,Insurance_company,Min_coverage,Max_coverage,Condition,Parent_company,Child_company,expiry_dt_year,expiry_dt_quarter,expiry_dt_month,expiry_dt_week,...,clusters__6,clusters__7,clusters__8,clusters__9,clusters__10,clusters__11,clusters__12,clusters__13,clusters__14,Image_path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
../input/he-fast-furious-insured/dataset/trainImages/img_4513976.jpg,6.0,0.411534,1.895378,0.0,1.0,4.0,2026.0,4.0,12.0,1.318946,...,0,0,0,0,0,0,0,1,0,../input/he-fast-furious-insured/dataset/train...
../input/he-fast-furious-insured/dataset/trainImages/img_7764995.jpg,6.0,1.464186,1.104747,1.0,1.0,4.0,2025.0,3.0,7.0,0.040157,...,0,0,0,0,0,0,0,0,0,../input/he-fast-furious-insured/dataset/train...
../input/he-fast-furious-insured/dataset/trainImages/img_451308.jpg,0.0,1.212751,2.225731,0.0,0.0,5.0,2022.0,3.0,8.0,0.216204,...,0,0,0,0,0,0,0,1,0,../input/he-fast-furious-insured/dataset/train...
../input/he-fast-furious-insured/dataset/trainImages/img_7768372.jpg,0.0,-0.369558,-0.425228,1.0,0.0,5.0,2022.0,3.0,8.0,0.177827,...,0,0,0,0,0,0,0,0,0,../input/he-fast-furious-insured/dataset/train...
../input/he-fast-furious-insured/dataset/trainImages/img_7765274.jpg,2.0,-0.989697,-1.029375,1.0,0.0,2.0,2026.0,2.0,5.0,-0.461215,...,0,1,0,0,0,0,0,0,0,../input/he-fast-furious-insured/dataset/train...


In [7]:
# Image path series for each split, still indexed by 'id'.
train_paths, test_paths = (df['Image_path'] for df in (train_df, test_df))
len(train_paths), len(test_paths)

(1382, 600)

In [8]:
# 192 matches the (None, 192, 192, 3) input of the loaded autoencoder.
img_size = 192
mini_batch_size = 128

# `predict` only consumes model inputs, so build input-only datasets. Passing
# the paths a second time as "labels" would read and decode every image twice
# just to throw the second copy away.
# NOTE(review): `decode` adds noise by default, so the embeddings are computed
# from noised inputs, as before. Confirm this is intended for feature
# extraction.
decoder = build_decoder(with_labels=False, img_size=img_size)

# No repeat and no shuffle: the prediction rows must stay in frame order so
# they can be aligned with the frame index afterwards.
dtrain = build_dataset(
    train_paths,
    bsize=mini_batch_size, repeat=False,
    shuffle=False, decode_fn=decoder
)

dtest = build_dataset(
    test_paths,
    bsize=mini_batch_size, repeat=False,
    shuffle=False, decode_fn=decoder
)

In [9]:
# Run the encoder over both splits; the datasets are unshuffled, so the
# prediction rows line up with the frame order.
Xtrain_embed = feature_model.predict(dtrain, verbose=1)
Xtest_embed = feature_model.predict(dtest, verbose=1)

# One 'dae_<k>' column per embedding dimension, indexed like the source frames.
col_list = ['dae_' + str(dim) for dim in range(Xtrain_embed.shape[1])]
Xtrain_embed_df = pd.DataFrame(
    Xtrain_embed, index=train_df.index, columns=col_list
)
Xtest_embed_df = pd.DataFrame(
    Xtest_embed, index=test_df.index, columns=col_list
)

# Attach the embedding columns to the tabular features via the 'id' index.
train_df = train_df.merge(Xtrain_embed_df, on='id', sort=False)
test_df = test_df.merge(Xtest_embed_df, on='id', sort=False)

print("\n\ntrain_df: {} \ntest_df: {}".format(train_df.shape, test_df.shape))

# Release the intermediate arrays and frames.
del Xtrain_embed, Xtest_embed
del Xtrain_embed_df, Xtest_embed_df
gc.collect()



train_df: (1382, 1693) 
test_df: (600, 1692)


104

In [10]:
# Preview the first rows of the training frame with the embedding columns.
train_df.head(n=5)

Unnamed: 0_level_0,Insurance_company,Min_coverage,Max_coverage,Condition,Parent_company,Child_company,expiry_dt_year,expiry_dt_quarter,expiry_dt_month,expiry_dt_week,...,dae_1526,dae_1527,dae_1528,dae_1529,dae_1530,dae_1531,dae_1532,dae_1533,dae_1534,dae_1535
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
../input/he-fast-furious-insured/dataset/trainImages/img_4513976.jpg,6.0,0.411534,1.895378,0.0,1.0,4.0,2026.0,4.0,12.0,1.318946,...,0.57073,-0.195283,-0.235323,-0.25736,-0.130368,-0.206337,0.267727,1.835724,0.177982,-0.183419
../input/he-fast-furious-insured/dataset/trainImages/img_7764995.jpg,6.0,1.464186,1.104747,1.0,1.0,4.0,2025.0,3.0,7.0,0.040157,...,-0.272337,-0.200668,-0.094072,-0.003438,0.469453,0.462481,0.410512,1.625738,-0.273421,-0.055653
../input/he-fast-furious-insured/dataset/trainImages/img_451308.jpg,0.0,1.212751,2.225731,0.0,0.0,5.0,2022.0,3.0,8.0,0.216204,...,-0.220627,-0.111004,-0.243602,-0.234025,-0.264965,-0.20011,-0.041671,-0.246042,0.097152,-0.108427
../input/he-fast-furious-insured/dataset/trainImages/img_7768372.jpg,0.0,-0.369558,-0.425228,1.0,0.0,5.0,2022.0,3.0,8.0,0.177827,...,0.866317,0.072017,2.382662,0.254028,-0.23963,-0.09877,-0.152048,-0.151377,-0.246364,4.6343
../input/he-fast-furious-insured/dataset/trainImages/img_7765274.jpg,2.0,-0.989697,-1.029375,1.0,0.0,2.0,2026.0,2.0,5.0,-0.461215,...,0.795036,0.658731,-0.042184,0.1451,-0.127701,0.464389,1.263631,0.584261,-0.194823,0.220396


In [11]:
# Preview the first rows of the test frame with the embedding columns.
test_df.head(n=5)

Unnamed: 0_level_0,Insurance_company,Min_coverage,Max_coverage,Condition,Parent_company,Child_company,expiry_dt_year,expiry_dt_quarter,expiry_dt_month,expiry_dt_week,...,dae_1526,dae_1527,dae_1528,dae_1529,dae_1530,dae_1531,dae_1532,dae_1533,dae_1534,dae_1535
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
../input/he-fast-furious-insured/dataset/testImages/img_4538519.jpg,3.0,-1.873053,-1.895259,1.0,1.0,5.0,2025.0,2.0,4.0,-0.615025,...,0.630905,0.380645,0.058258,0.423731,-0.181567,0.551998,1.217214,0.06939,0.164178,0.533777
../input/he-fast-furious-insured/dataset/testImages/img_7766002.jpg,7.0,-0.89578,-0.93391,1.0,2.0,5.0,2028.0,3.0,8.0,0.307111,...,-0.083731,-0.042264,0.051222,-0.03725,-0.154595,-0.271914,0.25934,0.214751,-0.057445,-0.278002
../input/he-fast-furious-insured/dataset/testImages/img_4637390.jpg,2.0,0.605955,0.468203,1.0,0.0,2.0,2023.0,4.0,11.0,1.194628,...,0.370341,0.344271,0.143334,-0.271525,-0.00996,0.384522,0.139219,0.286501,-0.011328,0.09533
../input/he-fast-furious-insured/dataset/testImages/img_4516108.jpg,4.0,0.848075,0.667419,1.0,1.0,1.0,2028.0,1.0,2.0,-1.3402,...,-0.230202,-0.274917,-0.167255,-0.260308,-0.113609,-0.098236,-0.235533,0.004004,-0.210981,-0.212444
../input/he-fast-furious-insured/dataset/testImages/img_4517008.jpg,4.0,0.334851,0.230353,1.0,1.0,1.0,2022.0,1.0,1.0,-5.199338,...,0.245114,0.047936,-0.265861,0.043784,0.04678,0.123816,1.676999,0.702943,-0.194432,-0.108078


## Save the processed datasets

In [12]:
# Bundle both frames under the same keys the loading cell reads back.
data_dict = {'train_df': train_df, 'test_df': test_df}

# The context manager closes the file even if pickling raises part-way.
with open("./HE_FFI_Dataset.txt", 'wb') as file:
    pickle.dump(data_dict, file)