In [22]:
import tensorflow as tf
from glob import glob
from struct import unpack
import os
from tqdm.autonotebook import tqdm
import imghdr
import cv2
from PIL import Image

In [17]:
dataset1 = glob(r'D:\DL-CV-ML Projects\AIART\data\archive\*\*.png')
dataset2 = glob(r'D:\DL-CV-ML Projects\AIART\data\delaunay\DELAUNAY\*\*.jpg')

In [10]:
marker_mapping = {
    0xffd8: "Start of Image",
    0xffe0: "Application Default Header",
    0xffdb: "Quantization Table",
    0xffc0: "Start of Frame",
    0xffc4: "Define Huffman Table",
    0xffda: "Start of Scan",
    0xffd9: "End of Image"
}


class JPEG:
    def __init__(self, image_file):
        with open(image_file, 'rb') as f:
            self.img_data = f.read()
    
    def decode(self):
        data = self.img_data
        while(True):
            marker, = unpack(">H", data[0:2])

            if marker == 0xffd8:
                data = data[2:]
            elif marker == 0xffd9:
                return
            elif marker == 0xffda:
                data = data[-2:]
            else:
                lenchunk, = unpack(">H", data[2:4])
                data = data[2+lenchunk:]            
            if len(data)==0:
                break        


bads = []

for img in tqdm(dataset1):
  # image = os.path.join(root_img, img)
  image = JPEG(img) 
  try:
    image.decode()   
  except:
    bads.append(img)
    print('Encountered a bad img')

  2%|▏         | 693/28820 [00:10<12:10, 38.48it/s] 

Encountered a bad img


 20%|█▉        | 5642/28820 [01:40<03:10, 121.90it/s]

Encountered a bad img


100%|██████████| 28820/28820 [08:42<00:00, 55.17it/s] 


In [12]:
for name in bads:
  os.remove(name)

In [24]:
def int64_feature(value):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def int64_list_feature(value):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


def bytes_feature(value):
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def bytes_list_feature(value):
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))


def float_feature(value):
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def float_list_feature(value):
  return tf.train.Feature(float_list=tf.train.FloatList(value=value))

def create_tf_records(image_dir, output_path):

    if not os.path.isfile(output_path):
      
      print(f'Saving output to: {output_path}')

      images = sorted(image_dir, key=os.path.getsize)
      expected_size = sum(list(map(lambda x: os.path.getsize(x), images))) / (1e+9)

      print(f'Writing {len(images)} to TFRecord file, expected size is {expected_size} GB ...')

      writer = tf.io.TFRecordWriter(output_path)

      for image_path in tqdm(images):

          with tf.io.gfile.GFile(image_path, 'rb') as f:
            try:
                encoded_image_data = f.read()
            except:
                print(f'{image_path} gives Error')
                break

          # Get image width and height
          try:
              image = Image.open(image_path)
              width, height = image.size
          except:
              print(f'{image_path} gives Error')
              break

          # Create tf.train.Example
          tf_example = tf.train.Example(features=tf.train.Features(feature={
              'image/height': int64_feature(height),
              'image/width': int64_feature(width),
              'image/encoded': bytes_feature(encoded_image_data),
          }))
          writer.write(tf_example.SerializeToString())

      writer.close()

      print("TFRecord files created successfully!")
    else:
      print(f'Skipping, {output_path} aleady exists')

In [26]:
output = os.path.join('D:\DL-CV-ML Projects\AIART\data', 'dataset1')
create_tf_records(dataset1, output)

Saving output to: D:\DL-CV-ML Projects\AIART\data\dataset1
Writing 28818 to TFRecord file, expected size is 22.687575678 GB ...


100%|██████████| 28818/28818 [04:27<00:00, 107.64it/s]


TFRecord files created successfully!


In [25]:
output = os.path.join('D:\DL-CV-ML Projects\AIART\data', 'dataset2')
create_tf_records(dataset2, output)

Saving output to: D:\DL-CV-ML Projects\AIART\data\dataset2
Writing 11493 to TFRecord file, expected size is 3.400833821 GB ...


100%|██████████| 11493/11493 [00:23<00:00, 486.44it/s]


TFRecord files created successfully!
