#Get Data

In [None]:
# Install Kaggle Library
!pip install kaggle

# Before next step, user needs to download the free API KEY from Kaggle settings
# Upload the kaggle.json file to Google Colab Files

# Make directory for Kaggle & Refer to API KEY
! mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

# Download Dataset
! kaggle datasets download shreelakshmigp/cedardataset

In [None]:
# ! mkdir sfddata
! unzip cedardataset.zip -d sfddata

#Combine Data

In [65]:
import os
import shutil
import numpy as np

# Paths to directories
real_sig_dir = '/content/sfddata/signatures/full_org'
fake_sig_dir = '/content/sfddata/signatures/full_forg'

# Define the destination folder for the combined dataset
destination_folder = 'signatures_combined'
label_location = "/content/signatures_combined/og_labels.npy"

# delete old stuff in dest folder if applicable
try:
    shutil.rmtree(destination_folder)
except OSError as e:
    print(f"Error creating directory '{destination_folder}': {e}")

# Ensure the destination folder exists
os.makedirs(destination_folder, exist_ok=True)


# List the files in the source subfolders
files1 = os.listdir(real_sig_dir)
files2 = os.listdir(fake_sig_dir)

# The list of results
labels = []

# Copy files from the first subfolder to the destination
for file in files1:
    source_file = os.path.join(real_sig_dir, file)
    destination_file = os.path.join(destination_folder, file)
    shutil.copy(source_file, destination_file)
    labels.append(1)

# Copy files from the second subfolder to the destination
for file in files2:
    source_file = os.path.join(fake_sig_dir, file)
    destination_file = os.path.join(destination_folder, file)
    shutil.copy(source_file, destination_file)
    labels.append(0)

sorted_labels = np.array(labels)
# save images somewhere
np.save(label_location, sorted_labels, allow_pickle=False)

Error creating directory 'signatures_combined': [Errno 2] No such file or directory: 'signatures_combined'


#Convert Image to Grayscale


In [7]:
"""Code is used for processing images"""

from PIL import Image, ImageOps
from tqdm import tqdm
import shutil
import os

THRESHOLD = 128

def image_to_grayscale(image_dir: str) -> Image:
    """Used for testing purposes to convert one image to grayscale"""
    image = Image.open(image_dir)
    gray_image = ImageOps.grayscale(image)
    return gray_image

def convert_grayscale(directory: str) -> list:
    """Converts all images in the given directory into gray scale"""
    converted_images = []
    for filename in os.listdir(directory):
        if filename.endswith(".png"):
            image = Image.open(directory + "/" + filename)
            gray_image = ImageOps.grayscale(image)
            converted_images.append(gray_image)
    return converted_images

def resize(images: list, dimensions: tuple) -> None:
    """Resizes all the given images in a list"""
    for i in range(0, len(images)):
        image = images[i]
        images[i] = image.resize(dimensions)

# actual code to run
dimensions = (250, 250) # dimensions for the images, can be changed
directory = 'signatures_combined'  # where the image files are located
image_path = 'converted_images'  # where the  images will be stored
gray_images = convert_grayscale(directory)
# bitmap_images = convert_bitmap(gray_images)
resize(gray_images, dimensions)
save_images = True

if not os.path.exists(image_path):
    # If the image path directory does not exist, create it.
    !mkdir converted_images
else:
  shutil.rmtree(image_path)
  !mkdir converted_images

if save_images:
    for i in tqdm(range(0, len(gray_images))):
        # Saves the bitmap images
        image_filename = os.path.join(image_path, f"image{i}.png")
        gray_images[i].save(image_filename)


100%|██████████| 2640/2640 [00:36<00:00, 72.31it/s]


#Reduce Noise In Images & Save Data

In [66]:
import cv2
from tqdm import tqdm
import numpy as np
from numpy import asarray
from matplotlib import pyplot as plt
from concurrent.futures import ThreadPoolExecutor
import pathlib
plt.style.use('seaborn')

# Get directory to access bitmap images
dir = pathlib.Path('/content/converted_images')

# Extract bitmap images and store in list
pictures = list(dir.glob('*.png'))

# Convert filenames to str and store in list
images = []
for pic in pictures:
  images.append(str(pic))

# where to store the images w reduced noise
reduced_dir = "/content/reduced"
!mkdir reduced

compare_images = []
less_noise_pics = []

# Reduce noise in each bitmap image and store in list
for i in tqdm(range(len(images))):
  image = images[i]
  noise_pic = cv2.imread(image)
  image_again = asarray(noise_pic)
  less_noise_pic = cv2.fastNlMeansDenoising(image_again, None, 15, 7, 21)

  # images stored in tuple form => (original image, noise reduced image)
  compare_images.append((noise_pic, less_noise_pic))
  less_noise_pics.append(less_noise_pic)
  curr_path = os.path.join(reduced_dir, f"image{i}.png")
  cv2.imwrite(curr_path, less_noise_pic)

  plt.style.use('seaborn')


mkdir: cannot create directory ‘reduced’: File exists


100%|██████████| 2640/2640 [10:49<00:00,  4.06it/s]


#Split Data into Train & Test Directories

In [73]:
import random
import numpy

data_path_train = "/content/reduced"
! mkdir split
data_path_test = "/content/split"

# path to destination folders
train_folder = os.path.join(data_path_test, 'training')
test_folder = os.path.join(data_path_test, 'testing')

# Define a list of image extensions
image_extensions = '.png'

# Create a list of image filenames in 'data_path'
imgs_list = [filename for filename in os.listdir(data_path_train) if os.path.splitext(filename)[-1] in image_extensions]

# Sets the random seed
random.seed(1107)

# Shuffle the list of image filenames
indices = [x for x in range(0, len(imgs_list))]
random.shuffle(indices)

# determine the number of images for each set
train_size = int(len(imgs_list) * 0.85)
test_size = int(len(imgs_list) * 0.15)

# Create destination folders if they don't exist
if not os.path.exists(train_folder):
    os.makedirs(train_folder)
if not os.path.exists(test_folder):
    os.makedirs(test_folder)

# Rearrange the image files and labels
new_imgs_list = [x for x in range(0, len(imgs_list))]
new_labels = [x for x in range(0, len(imgs_list))]
labels = numpy.load("/content/signatures_combined/og_labels.npy")

for i, j in enumerate(indices):
  new_imgs_list[i] = imgs_list[j]
  new_labels[i] = labels[j]

# Store test imgs
test_imgs = []

# Copy image files to destination folders
for i, f in enumerate(imgs_list):
    if i < train_size:
        dest_folder = train_folder
        shutil.copy(os.path.join(data_path_train, f), os.path.join(dest_folder, f))
    else:
        dest_folder = test_folder
        # add to test array
        pic = cv2.imread(os.path.join(data_path_train, f))
        test_imgs.append(asarray(pic))


# Save labels
train_labels = np.array(new_labels[:train_size])
test_labels = np.array(new_labels[:train_size])
# save images somewhere
np.save("/content/og_train_labels.npy", train_labels, allow_pickle=False)
np.save("/content/test_labels.npy", train_labels, allow_pickle=False)

# Save test imgs
test_imgs = np.array(test_imgs)
np.save("/content/test_imgs.npy", test_imgs)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  [255 255 255]
  [255 255 255]]]
[[[237 237 237]
  [237 237 237]
  [237 237 237]
  ...
  [238 238 238]
  [238 238 238]
  [238 238 238]]

 [[237 237 237]
  [237 237 237]
  [237 237 237]
  ...
  [238 238 238]
  [238 238 238]
  [238 238 238]]

 [[237 237 237]
  [237 237 237]
  [237 237 237]
  ...
  [238 238 238]
  [238 238 238]
  [238 238 238]]

 ...

 [[238 238 238]
  [238 238 238]
  [238 238 238]
  ...
  [237 237 237]
  [237 237 237]
  [237 237 237]]

 [[238 238 238]
  [238 238 238]
  [238 238 238]
  ...
  [237 237 237]
  [237 237 237]
  [237 237 237]]

 [[238 238 238]
  [238 238 238]
  [238 238 238]
  ...
  [237 237 237]
  [237 237 237]
  [237 237 237]]]
[[[252 252 252]
  [252 252 252]
  [252 252 252]
  ...
  [252 252 252]
  [252 252 252]
  [252 252 252]]

 [[252 252 252]
  [252 252 252]
  [252 252 252]
  ...
  [252 252 252]
  [252 252 252]
  [252 252 252]]

 [[252 252 252]
  [252 252 252]
  [252 252 252]
  ...
  [252 25

#Split Train Images into Tensorflow Datasets

In [74]:
import tensorflow as tf
from keras.utils import image_dataset_from_directory
import tensorflow_datasets as tfds
import pathlib

BATCH_SIZE = 2232
IMG_HEIGHT = 180
IMG_WIDTH = 180

# Set directory to pull images from
DATA_DIR = pathlib.Path('/content/split/training')

paths = len(list(DATA_DIR.glob('*.png')))
print(paths)

# get labels
my_labels = numpy.load("/content/og_train_labels.npy")
print(my_labels)

# Make training & validation tensorflow datasets stored in list
train_ds = tf.keras.utils.image_dataset_from_directory(
    DATA_DIR,
    labels=my_labels.tolist(),
    label_mode='binary',
    validation_split=0.18,
    subset="both",
    shuffle = True,
    seed=1107,
    image_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
)

dataset_array = tfds.as_numpy(train_ds)

train_dataset = dataset_array[0]
val_dataset = dataset_array[1]

for images, labels in dataset_array[0]:
  np.save("/content/train_labels.npy", labels)
  np.save("/content/train_imgs.npy", images)

for images, labels in dataset_array[1]:
  np.save("/content/val_labels.npy", labels)
  np.save("/content/val_imgs.npy", images)
# Testing folders has 15% of data but does not go through splitting
# using tensorflow

2244
[1 0 0 ... 0 0 1]
Found 2244 files belonging to 2 classes.
Using 1841 files for training.
Using 403 files for validation.


# Example to Load Data

In [75]:
import numpy

train_labels = numpy.load("/content/train_labels.npy")
train_imgs = numpy.load("/content/train_imgs.npy")

val_labels = numpy.load("/content/val_labels.npy")
val_imgs  = numpy.load("/content/val_imgs.npy")

test_labels = numpy.load("/content/test_labels.npy")
test_imgs = numpy.load("/content/test_imgs.npy")


#Export Data

In [77]:
from google.colab import files
# files.download("/content/test_imgs.npy")
files.download("/content/train_imgs.npy")
files.download("/content/val_imgs.npy")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>