In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sdfghjkl/dataset/val.csv
/kaggle/input/sdfghjkl/dataset/dataset_disclaimer.txt
/kaggle/input/sdfghjkl/dataset/val/5cfd280e.jpg
/kaggle/input/sdfghjkl/dataset/val/2352494d.jpg
/kaggle/input/sdfghjkl/dataset/val/d86dc029.jpg
/kaggle/input/sdfghjkl/dataset/val/4b5572d4.jpg
/kaggle/input/sdfghjkl/dataset/val/e9b285bb.jpg
/kaggle/input/sdfghjkl/dataset/val/e15b612e.jpg
/kaggle/input/sdfghjkl/dataset/val/f37b0f48.jpg
/kaggle/input/sdfghjkl/dataset/val/b5088bdf.jpg
/kaggle/input/sdfghjkl/dataset/val/0c43bf45.jpg
/kaggle/input/sdfghjkl/dataset/val/924a9818.jpg
/kaggle/input/sdfghjkl/dataset/val/d61d3506.jpg
/kaggle/input/sdfghjkl/dataset/val/2e5ef377.jpg
/kaggle/input/sdfghjkl/dataset/val/2749b3f0.jpg
/kaggle/input/sdfghjkl/dataset/val/e0a59289.jpg
/kaggle/input/sdfghjkl/dataset/val/d6c7f49c.jpg
/kaggle/input/sdfghjkl/dataset/val/01e0cc3c.jpg
/kaggle/input/sdfghjkl/dataset/val/b70cf794.jpg
/kaggle/input/sdfghjkl/dataset/val/b74681a6.jpg
/kaggle/input/sdfghjkl/dataset/val/6c95143a

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint
import os
import random
import numpy as np

# Set the random seed for reproducibility
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Define the paths to your train dataset
train_dir = "/kaggle/input/sdfghjkl/dataset/train"

# Define the batch size and image dimensions
batch_size = 32
image_height = 64
image_width = 64

# Load and preprocess the dataset
def load_dataset():
    # Get the subdirectories representing different writers
    writer_dirs = [os.path.join(train_dir, writer) for writer in os.listdir(train_dir)]
    
    pairs = []
    labels = []
    
    for writer_dir in writer_dirs:
        # Get the list of image files for the current writer
        images = [os.path.join(writer_dir, img) for img in os.listdir(writer_dir)]
        
        # Create pairs of images and their labels
        for i in range(len(images)):
            for j in range(i+1, len(images)):
                pairs.append((images[i], images[j]))
                labels.append(1)  # Same writer label
            
                # Randomly select an image from a different writer
                diff_writer = random.choice(writer_dirs)
                while diff_writer == writer_dir:
                    diff_writer = random.choice(writer_dirs)
                
                # Randomly select an image from a different writer
                diff_image = random.choice(os.listdir(diff_writer))
                pairs.append((images[i], os.path.join(diff_writer, diff_image)))
                labels.append(0)  # Different writer label

    return pairs, labels

def preprocess_image(image_path):
    # Load the image and resize
    img = tf.keras.preprocessing.image.load_img(image_path, target_size=(image_height, image_width))
    
    # Convert the image to numpy array and normalize
    img_array = tf.keras.preprocessing.image.img_to_array(img) / 255.0
    
    return img_array

def generate_data(pairs, labels, augment_data=True):
    while True:
        # Shuffle the data
        combined = list(zip(pairs, labels))
        random.shuffle(combined)
        pairs, labels = zip(*combined)
        
        for i in range(0, len(pairs), batch_size):
            batch_pairs = pairs[i:i+batch_size]
            batch_labels = labels[i:i+batch_size]
            
            # Load and preprocess the batch of image pairs
            batch_images_1 = np.array([preprocess_image(pair[0]) for pair in batch_pairs])
            batch_images_2 = np.array([preprocess_image(pair[1]) for pair in batch_pairs])

            # Apply data augmentation if enabled
            if augment_data:
                datagen = ImageDataGenerator(
                    rotation_range=10,
                    width_shift_range=0.1,
                    height_shift_range=0.1,
                    zoom_range=0.1,
                    horizontal_flip=True,
                    vertical_flip=False,
                    fill_mode='nearest'
                )
                batch_images_1 = datagen.flow(batch_images_1, batch_size=batch_size, shuffle=False).next()
                batch_images_2 = datagen.flow(batch_images_2, batch_size=batch_size, shuffle=False).next()
            
            yield [batch_images_1, batch_images_2], np.array(batch_labels)

# Load the dataset
pairs, labels = load_dataset()

# Split the dataset into train and validation sets
train_size = int(0.8 * len(pairs))
train_pairs = pairs[:train_size]
train_labels = labels[:train_size]
val_pairs = pairs[train_size:]
val_labels = labels[train_size:]

# Create the data generators
train_generator = generate_data(train_pairs, train_labels, augment_data=True)
val_generator = generate_data(val_pairs, val_labels, augment_data=False)

def create_model():
    # Input tensors for two images
    input_1 = tf.keras.Input(shape=(image_height, image_width, 3))
    input_2 = tf.keras.Input(shape=(image_height, image_width, 3))
    
    # Base convolutional network for each image
    base_model = tf.keras.applications.ResNet50(
        include_top=False,
        weights='imagenet',
        input_shape=(image_height, image_width, 3),
        pooling='avg'
    )
    
    # Encoded features for each image
    encoded_1 = base_model(input_1)
    encoded_2 = base_model(input_2)
    
    # Concatenate the encoded features
    concatenated = Concatenate()([encoded_1, encoded_2])
    
    # Dense layers for classification
    x = Dense(256, activation='relu')(concatenated)
    x = Dropout(0.5)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(1, activation='sigmoid')(x)
    
    # Create the model
    model = Model(inputs=[input_1, input_2], outputs=output)
    
    # Compile the model
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

# Create the model
model = create_model()

# Define the checkpoint callback
checkpoint_path = 'model_checkpoint.h5'
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Train the model with checkpoints
model.fit(
    train_generator,
    epochs=10,
    steps_per_epoch=len(train_pairs)//batch_size,
    validation_data=val_generator,
    validation_steps=len(val_pairs)//batch_size,
    callbacks=[checkpoint]
)

# Save the final trained model
model.save('trained_model.h5')


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/10
Epoch 1: val_loss improved from inf to 0.57622, saving model to model_checkpoint.h5
Epoch 2/10
Epoch 2: val_loss did not improve from 0.57622
Epoch 3/10
Epoch 3: val_loss improved from 0.57622 to 0.57399, saving model to model_checkpoint.h5
Epoch 4/10
Epoch 4: val_loss improved from 0.57399 to 0.56881, saving model to model_checkpoint.h5
Epoch 5/10
Epoch 5: val_loss improved from 0.56881 to 0.55961, saving model to model_checkpoint.h5
Epoch 6/10
Epoch 6: val_loss improved from 0.55961 to 0.54997, saving model to model_checkpoint.h5
Epoch 7/10
Epoch 7: val_loss improved from 0.54997 to 0.54412, saving model to model_checkpoint.h5
Epoch 8/10
Epoch 8: val_loss did not improve from 0.54412
Epoch 9/10
Epoch 9: val_loss improved from 0.54412 to 0.54072, saving model to model_checkpoint.h5
Epoch 10/10
Epoch 10: val_loss improved from 0.54072