# **LIBRARY**
* numpy :  1.18.5
* pandas :  1.1.4
* scikit-learn :  0.23.2
* tensorflow :  2.3.1
* nltk :  3.2.4
* matplotlib :  3.2.1
* PIL :  3.2.4

In [None]:
import sys
import numpy as np
import pandas as pd
import datetime

import sklearn
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow.keras.layers as layers
from tensorflow import keras

import nltk
import matplotlib as mpl
import matplotlib.pyplot as plt

from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [None]:
# Print interpreter and library versions so a run can be reproduced.
import PIL  # only PIL submodules are imported above; the package itself carries __version__

print("python : ", sys.version)
print("numpy : ", np.__version__)
print("pandas : ", pd.__version__)
print("scikit-learn : ", sklearn.__version__)
print("tensorflow : ", tf.__version__)
# Pillow's own version belongs under this label, not nltk's.
print("PIL : ", PIL.__version__)
print("nltk : ", nltk.__version__)
print("matplotlib : ", mpl.__version__)

In [None]:
# Target size every image is resized to before being fed to the network.
image_height, image_width = 80, 80


def get_image(dirpath, filepath):
    """Load an image from disk, resize it and scale its pixels to [0, 1].

    Parameters:
        dirpath: directory prefix. It is concatenated directly with
            ``filepath``, so it must already end with a path separator.
        filepath: image file name, ex: image.jpg (any number of dots is fine).

    Returns:
        float32 numpy array of the image resized to
        (image_height, image_width), divided by 255. No mode conversion is
        done, so the channel layout is whatever PIL decodes for the file.
    """
    filedir = dirpath + filepath
    # The context manager releases the file handle once the pixels are copied.
    with Image.open(filedir) as img:
        # LANCZOS is the filter that ANTIALIAS used to alias; ANTIALIAS itself
        # no longer exists in recent Pillow releases.
        resized = img.resize((image_width, image_height), Image.LANCZOS)
        pixels = np.asarray(resized).astype('float32')
    # Assumes 8-bit channels (max value 255) so the result lies in [0, 1].
    return pixels / 255.0

In [None]:
# Locations of the competition inputs. The image directories end with "/"
# because file names are appended to them by plain string concatenation.
training_data_url_csv = r"../input/idndsc2020-shopee-advanced/new_training_set.csv"
training_image_dir = r"../input/idndsc2020-shopee-advanced/training_img/training_img/"
test_data_url_csv = r"../input/idndsc2020-shopee-advanced/new_test_sample.csv"
test_image_dir = r"../input/idndsc2020-shopee-advanced/sample_img/sample_img/"

In [None]:
# Load the training pair list and preview its first rows.
df_train_raw = pd.read_csv(training_data_url_csv)
df_train_raw.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
df_train_raw.info()

In [None]:
# Load the test pair list; the unnamed leading CSV column becomes "pair_index".
df_test_raw = pd.read_csv(test_data_url_csv).rename(
    columns={"Unnamed: 0": "pair_index"}
)
df_test_raw.head()

In [None]:
# Tally the file extensions of both image columns on a scratch copy,
# leaving df_train_raw itself untouched.
df_copy = df_train_raw.copy()
# Split on the LAST dot so names containing extra dots still yield the
# real extension instead of a middle fragment.
df_copy["filetype_1"] = df_copy["image_1"].map(lambda x: x.rsplit('.', 1)[-1])
df_copy["filetype_2"] = df_copy["image_2"].map(lambda x: x.rsplit('.', 1)[-1])

print("File type image - 1 :")
print(df_copy["filetype_1"].value_counts())
print()
print("File type image - 2 :")
print(df_copy["filetype_2"].value_counts())


In [None]:
# Gather every distinct file name referenced by either image column.
filename_image1_unique = df_train_raw["image_1"].values
filename_image2_unique = df_train_raw["image_2"].values
filename_unique = list(
    set(np.concatenate((filename_image1_unique, filename_image2_unique)))
)

# Per-column and combined distinct counts.
unique_shape_1 = df_train_raw["image_1"].unique().shape
unique_shape_2 = df_train_raw["image_2"].unique().shape
print("Total file unique image - 1 : ", unique_shape_1)
print("Total file unique image - 2 : ", unique_shape_2)
print("Total file unique image 1 & 2 : ", len(filename_unique))

In [None]:
# In-memory image caches keyed by file name (training and test/sample sets).
images_train_unique = {}
images_test_unique = {}
print("start @@ ", datetime.datetime.now())

# Ten evenly spaced boundaries split filename_unique into nine slices.
# This cell loads slices 1 and 2; boundary index 0 is only a lower bound,
# so the loop starts at 1.
arr_space = np.linspace(0, len(filename_unique), 10)
for i in range(1, 3):
    start = int(arr_space[i - 1])
    end = int(arr_space[i])
    print("start : ", start, " @@ end : ", end)
    # Load the whole slice first so a failure leaves the cache unchanged.
    temp_dict = {name: get_image(training_image_dir, name) for name in filename_unique[start:end]}
    # In-place merge: no full copy of the cache per slice.
    images_train_unique.update(temp_dict)

print("Count : ", len(images_train_unique))
print("finish @@ ", datetime.datetime.now())

In [None]:
print("start @@ ", datetime.datetime.now())

# Load slices 3-5 of filename_unique (slice i spans arr_space[i-1]..arr_space[i]).
for i in range(3, 6):
    start = int(arr_space[i - 1])
    end = int(arr_space[i])
    print("start : ", start, " @@ end : ", end)
    # Load the whole slice first so a failure leaves the cache unchanged.
    temp_dict = {name: get_image(training_image_dir, name) for name in filename_unique[start:end]}
    # update() tolerates keys that are already cached, so re-running this
    # cell refreshes entries instead of raising on duplicates.
    images_train_unique.update(temp_dict)

print("Count : ", len(images_train_unique))
print("finish @@ ", datetime.datetime.now())

In [None]:
print("start @@ ", datetime.datetime.now())

# Load slices 6-7 of filename_unique (slice i spans arr_space[i-1]..arr_space[i]).
for i in range(6, 8):
    start = int(arr_space[i - 1])
    end = int(arr_space[i])
    print("start : ", start, " @@ end : ", end)
    # Load the whole slice first so a failure leaves the cache unchanged.
    temp_dict = {name: get_image(training_image_dir, name) for name in filename_unique[start:end]}
    # update() tolerates keys that are already cached, so re-running this
    # cell refreshes entries instead of raising on duplicates.
    images_train_unique.update(temp_dict)

print("Count : ", len(images_train_unique))
print("finish @@ ", datetime.datetime.now())

In [None]:
print("start @@ ", datetime.datetime.now())

# Load the last two slices, 8-9, of filename_unique
# (slice i spans arr_space[i-1]..arr_space[i]).
for i in range(8, 10):
    start = int(arr_space[i - 1])
    end = int(arr_space[i])
    print("start : ", start, " @@ end : ", end)
    # Load the whole slice first so a failure leaves the cache unchanged.
    temp_dict = {name: get_image(training_image_dir, name) for name in filename_unique[start:end]}
    # update() tolerates keys that are already cached, so re-running this
    # cell refreshes entries instead of raising on duplicates.
    images_train_unique.update(temp_dict)

print("Count : ", len(images_train_unique))
print("finish @@ ", datetime.datetime.now())

In [None]:
def get_image_from_memory(dirpath, key):
    """Return the first three channels of an image, using the in-memory caches.

    The cache is chosen from the second-to-last '/'-separated component of
    ``dirpath``: if it contains 'sample' the test cache is used, otherwise
    the training cache. On a miss the image is loaded with ``get_image`` and
    stored (with all of its channels) before being returned.

    Parameters:
        dirpath: image directory, '/'-separated.
        key: image file name; also the cache key.

    Returns:
        The cached array sliced to ``[..., :3]``.
    """
    parent_dir = dirpath.split('/')[-2]
    if 'sample' in parent_dir:
        cache = images_test_unique
    else:
        cache = images_train_unique

    if key not in cache:
        cache[key] = get_image(dirpath, key)
    return cache[key][..., :3]

In [None]:
# Features are the two image file names of each pair; the target is "Label".
pair_columns = ["image_1", "image_2"]
X_raw = df_train_raw[pair_columns].values
y = df_train_raw["Label"].values

# Hold out a quarter of the pairs, with a fixed seed for a repeatable split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Column 0 / column 1 of the training split hold the first / second file name.
train_names_1 = X_train_raw.T[0].tolist()
train_names_2 = X_train_raw.T[1].tolist()

# Replace each file name by its pixel array, read through the image cache.
X_train_image_1 = np.array(
    [get_image_from_memory(training_image_dir, name) for name in train_names_1]
)
X_train_image_2 = np.array(
    [get_image_from_memory(training_image_dir, name) for name in train_names_2]
)

In [None]:
# Column 0 / column 1 of the validation split hold the first / second file name.
test_names_1 = X_test_raw.T[0].tolist()
test_names_2 = X_test_raw.T[1].tolist()

# The validation split comes from the training set, so it uses the training directory.
X_test_image_1 = np.array(
    [get_image_from_memory(training_image_dir, name) for name in test_names_1]
)
X_test_image_2 = np.array(
    [get_image_from_memory(training_image_dir, name) for name in test_names_2]
)

In [None]:
# File-name pairs of the unlabeled sample set.
X_sample_raw = df_test_raw[["image_1", "image_2"]].values
sample_names_1 = X_sample_raw.T[0].tolist()
sample_names_2 = X_sample_raw.T[1].tolist()

# Sample images are read from the test image directory.
X_sample_image_1 = np.array(
    [get_image_from_memory(test_image_dir, name) for name in sample_names_1]
)
X_sample_image_2 = np.array(
    [get_image_from_memory(test_image_dir, name) for name in sample_names_2]
)

In [None]:
# input_shape = (image_height,image_width,3)
# def create_image_layer(input):
#     layer = layers.Conv2D(64, kernel_size=5, padding='same', input_shape=input_shape)(input)
#     layer = layers.MaxPool2D()(layer)
    
#     layer = layers.Conv2D(64, kernel_size=5, padding='same')(layer)
#     layer = layers.MaxPool2D()(layer)
#     layer = layers.Conv2D(64, kernel_size=5, padding='same')(layer)
#     layer = layers.MaxPool2D()(layer)
#     return layer

In [None]:
# image1_input = keras.Input(shape=input_shape)
# image1_layer = create_image_layer(image1_input)

# image2_input = keras.Input(shape=input_shape)
# image2_layer = create_image_layer(image2_input)

# image_concate = layers.Concatenate()([image1_layer, image2_layer])
# image_concate = layers.Flatten()(image_concate)

# relu_1 = layers.Dense(32, activation='relu')(image_concate)

# output = layers.Dense(1, activation='sigmoid')(relu_1)

In [None]:
# model = keras.Model(inputs=[image1_input, image2_input], outputs=output)
# model.compile(
#     optimizer="adam",
#     loss="binary_crossentropy",
#     metrics=["binary_accuracy"]
# )
# model.summary()
# keras.utils.plot_model(model, "test_model.png", show_shapes=True)

In [None]:
# model.fit(
#     [X_train_image_1, X_train_image_2], 
#     y_train,
#     batch_size=32,
#     epochs=5
# )

In [None]:
# Pair the two image arrays along a new axis 1, giving one (first, second)
# entry per sample; done in a single vectorized call instead of a Python loop.
X_train_c = np.stack((X_train_image_1, X_train_image_2), axis=1)

In [None]:
# Pair the two validation image arrays along a new axis 1, giving one
# (first, second) entry per sample; a single vectorized call, no Python loop.
X_test_c = np.stack((X_test_image_1, X_test_image_2), axis=1)

In [None]:
# Sanity check of the paired array's dimensions; axis 1 indexes the pair member.
X_train_c.shape

In [None]:
# Pair the two sample image arrays along a new axis 1, giving one
# (first, second) entry per sample; a single vectorized call, no Python loop.
X_sample_c = np.stack((X_sample_image_1, X_sample_image_2), axis=1)

In [None]:
from keras import backend as K
from keras.layers import Activation
from keras.layers import Input, Lambda, Dense, Dropout, Convolution2D, MaxPooling2D, Flatten
from keras.models import Sequential, Model
from keras.optimizers import RMSprop,Adam
from keras import optimizers
from keras import callbacks
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau, TensorBoard
from keras.models import Model,load_model
from keras.models import model_from_json, load_model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization

In [None]:
# Siamese network: both images of a pair go through the same encoder and the
# element-wise absolute difference of the encodings feeds a sigmoid unit.
# X_train_c.shape[2:] drops the sample and pair axes, leaving the per-image
# shape -- presumably (height, width, channels); confirm against X_train_c.
left_input = Input(X_train_c.shape[2:])
right_input = Input(X_train_c.shape[2:])

# A single encoder instance is defined once and applied to both inputs below.
convnet = Sequential([
    Conv2D(8,(3,3), input_shape=X_train_c.shape[2:]),
    Activation('relu'),
    MaxPooling2D(),
    Conv2D(8,(3,3)),
    Activation('relu'),
    MaxPooling2D(),
    Conv2D(16,(3,3)),
    Activation('relu'),
    MaxPooling2D(),
    Conv2D(16,(3,3)),
    Activation('relu'),
    Flatten(),
    Dense(32),
    Activation('sigmoid')
])
# Connect each 'leg' of the network to each input.
# Both legs call the same `convnet` object, so they share its weights.
encoded_l = convnet(left_input)
encoded_r = convnet(right_input)

# Element-wise absolute difference (L1 distance components) of the 2 encodings
L1_layer = Lambda(lambda tensor:K.abs(tensor[0] - tensor[1]))

# Add the distance function to the network
L1_distance = L1_layer([encoded_l, encoded_r])

# One sigmoid unit on top of the distance vector produces the pair score.
prediction = Dense(1,activation='sigmoid')(L1_distance)
siamese_net = Model(inputs=[left_input,right_input],outputs=prediction)

# Adam with learning rate 0.0001 and the given decay setting.
optimizer = Adam(0.0001, decay=2.5e-4)
# TODO: get layerwise learning rates and momentum annealing scheme described in the paper
siamese_net.compile(loss="binary_crossentropy",optimizer=optimizer,metrics=['accuracy'])

In [None]:
# Print the layer-by-layer description of the compiled model.
siamese_net.summary()

In [None]:
# Separate each paired array back into its first-image and second-image halves.
img_1, img_2 = X_train_c[:, 0], X_train_c[:, 1]
imgt_1, imgt_2 = X_test_c[:, 0], X_test_c[:, 1]


In [None]:

# Train on the training pairs; the held-out pairs are passed as validation data.
siamese_net.fit(
    [img_1, img_2],
    y_train,
    validation_data=([imgt_1, imgt_2], y_test),
    epochs=4,
    batch_size=16,
    verbose=1,
)

In [None]:
# img_1 = tf.cast(img_1, dtype='float64')
# img_2 = tf.cast(img_2, dtype='float64')
# y_train = tf.cast(y_train, dtype='float64')

In [None]:
# Score the sample pairs with the trained network and show the raw outputs.
imgc_1, imgc_2 = X_sample_c[:, 0], X_sample_c[:, 1]
y_prob = siamese_net.predict([imgc_1, imgc_2])
y_prob

In [None]:
# Lift the row display limit so every prediction is shown in the output.
# NOTE(review): the option key is capitalised; the canonical pandas name is
# 'display.max_rows' -- confirm the installed pandas resolves this spelling.
pd.set_option('Display.max_rows',None)
pd.DataFrame(y_prob)

In [None]:
# Binarise the predictions in one step: strictly above 0.5 -> 1, everything
# else (including exactly 0.5) -> 0. The dtype is kept so later code sees
# the same kind of array as before.
y_prob = (y_prob > 0.5).astype(y_prob.dtype)

# Written with pandas' default index and header.
pd.DataFrame(y_prob).astype(int).to_csv('abc.csv')

In [None]:
# Display both images of sample pair n, each in its own figure.
n = 35
for pair_member in (imgc_1[n], imgc_2[n]):
    fig, axs = plt.subplots()
    plt.imshow(pair_member)