# Data preprocessing: Trash Detection (glass, metal, plastic, paper)

Download dataset

In [2]:
from IPython.display import clear_output
import os, glob


# Downloading teh dataset
!gdown --id '1DJcNLTpUBc8NRgH-ZDIZ_LWSUTNzJToE'
!unzip trash_data.zip; rm trash_data.zip;
clear_output()

# Removing unnecessary demo data folder from workspace.
!rm -r sample_data

# Renaming raw data folder to remove space. Trust me, it makes life a lot easier :D 
%mv 'train' train_data_raw


rm: cannot remove 'sample_data': No such file or directory


Functions for converting XML annotations to YOLO TXT labels


In [3]:
from xml.dom import minidom

# Lookup table mapping each class name to the integer class index that is
# written to the label files. The position in the tuple is the class index.
lut = {
    class_name: class_index
    for class_index, class_name in enumerate(("glass", "metal", "plastic", "paper"))
}

# Running tally of objects per class name; convert_xml2yolo updates it.
label_count = dict()

# .keys() yields the class names in insertion order.
print(f'Object Names: {list(lut.keys())}')

def convert_coordinates(size, box):
    """
    Convert a pixel bounding box into centre/size values relative to the image.

    size: (width, height) of the image in pixels
    box: (xmin, xmax, ymin, ymax) of the bounding box in pixels
         (note the order: both x values first, then both y values)

    returns a tuple (x, y, w, h): the box centre and the box width/height,
    with x and w scaled by 1/width and y and h scaled by 1/height
    """
    # Reciprocal scale factors; multiplying by them normalises pixel values.
    scale_x = 1.0 / size[0]
    scale_y = 1.0 / size[1]

    # Centre of the box along each axis, in pixels.
    center_x = (box[0] + box[1]) / 2.0
    center_y = (box[2] + box[3]) / 2.0

    # Extent of the box along each axis, in pixels.
    span_x = box[1] - box[0]
    span_y = box[3] - box[2]

    return (
        center_x * scale_x,
        center_y * scale_y,
        span_x * scale_x,
        span_y * scale_y,
    )


def convert_xml2yolo(filelist, lut):
    """
    Convert XML annotation files into YOLO-style .txt label files.

    For every path in filelist, a file with the same name but a .txt extension
    is written next to it. Each <object> whose <name> is a key of lut becomes
    one line of the form

        <class_index> <x> <y> <w> <h>

    where the four numbers come from convert_coordinates (normalised by the
    image <size>) and are formatted with six decimals. Objects whose name is
    not in lut are skipped; a file with no known objects yields an empty .txt.

    Side effect: the module-level label_count dict is incremented once for
    every object that is written.

    filelist: list of .xml file paths to convert to .txt file
    lut: a dictionary containing class_name to class_index mapping
    """
    for fname in filelist:
        xmldoc = minidom.parse(fname)
        # Swap the 4-character '.xml' suffix for '.txt'.
        fname_out = fname[:-4] + '.txt'

        # Everything is read from the XML before the output file is opened, so
        # a malformed annotation does not leave a truncated .txt behind.
        size = xmldoc.getElementsByTagName('size')[0]
        width = int(size.getElementsByTagName('width')[0].firstChild.data)
        height = int(size.getElementsByTagName('height')[0].firstChild.data)

        lines = []
        for item in xmldoc.getElementsByTagName('object'):
            # get class label
            classid = item.getElementsByTagName('name')[0].firstChild.data
            if classid not in lut:
                # Unknown classes are not written, so their box is not parsed either.
                continue

            # get bbox coordinates (the <bndbox> element is looked up once)
            bndbox = item.getElementsByTagName('bndbox')[0]
            xmin, ymin, xmax, ymax = (
                float(bndbox.getElementsByTagName(tag)[0].firstChild.data)
                for tag in ('xmin', 'ymin', 'xmax', 'ymax')
            )
            # convert_coordinates expects both x values first, then both y values.
            bb = convert_coordinates((width, height), (xmin, xmax, ymin, ymax))

            label_count[classid] = label_count.get(classid, 0) + 1
            lines.append(str(lut[classid]) + " " + " ".join("%.6f" % a for a in bb) + '\n')

        with open(fname_out, "w") as f:
            f.writelines(lines)


Object Names: ['glass', 'metal', 'plastic', 'paper']


In [4]:
# Reading image file paths
formats = ['jpg', 'jpeg', 'JPG', 'png']
image_file_list = []
# The loop variable is deliberately not called `format`: at notebook scope that
# would replace the builtin format() for every later cell.
for extension in formats:
    # glob.glob searches for file paths matching a pattern and returns them as a list
    image_file_list.extend(glob.glob(f'/content/train_data_raw/*.{extension}'))

# Reading XML label file paths
label_file_list_xml = glob.glob('/content/train_data_raw/*.xml')

print(f'Image files found: {len(image_file_list)} \nLabel files found: { len(label_file_list_xml)}')

Image files found: 369 
Label files found: 369


In [7]:
# Converting .xml file to .txt file
# convert_xml2yolo writes one .txt next to every .xml and also updates label_count.
convert_xml2yolo(label_file_list_xml, lut)
# Globbing after the conversion so the count reflects the .txt files present on disk.
label_file_list_txt = glob.glob('/content/train_data_raw/*.txt')
print(f'XML --> TXT files: {len(label_file_list_txt)}')

XML --> TXT files: 369


In [8]:
# Number of labelled objects per class, as tallied by convert_xml2yolo
label_count

{'glass': 41, 'metal': 127, 'paper': 127, 'plastic': 226}

In [9]:
from PIL import Image
# Histogram of image dimensions: img.size -> number of images with that size
img_sizes = {}

for fname in image_file_list:
    # The context manager closes the underlying file once the size is read,
    # instead of leaving one open handle per image.
    with Image.open(fname) as img:
        img_sizes[img.size] = img_sizes.get(img.size, 0) + 1
img_sizes

{(416, 416): 179,
 (1241, 751): 1,
 (1537, 2049): 33,
 (1925, 1408): 1,
 (2049, 1537): 12,
 (2214, 3264): 1,
 (2442, 2597): 1,
 (2448, 3264): 109,
 (3163, 2448): 1,
 (3264, 2448): 31}

Function for resizing image


In [10]:
def resize_images(file_list, width = 800, height = 800, overwrite = True, save_dir = ''):
    """
    Resize every image in file_list to exactly width x height pixels.

    The target size is applied as-is, so the aspect ratio of an image changes
    whenever it differs from width:height.

    file_list: list of image file paths
    width, height: target size in pixels
    overwrite: if True, each resized image replaces the file it was read from;
               otherwise it is saved under the same file name inside save_dir
    save_dir: destination directory, used only when overwrite is False

    One progress line is printed per image and the cell output is cleared once
    all images have been processed.
    """
    total_files = len(file_list)
    for idx, path in enumerate(file_list, start=1):
        filename = os.path.basename(path)

        # Read and resize inside the context manager so the source file handle
        # is closed before anything is written (possibly to the same path).
        with Image.open(path) as img:
            original_size = img.size
            img_resized = img.resize((width, height))

        if overwrite:
            img_resized.save(path)
            print(f"{idx}/{total_files}: {filename} {original_size}--> ({width}x{height})")
        else:
            # os.path.join works whether or not save_dir ends with a separator.
            img_resized.save(os.path.join(save_dir, filename))
            print(f'{filename} saved to {save_dir}')
    clear_output()

In [11]:
# Resizing every image in place to the default 800x800. The label files store
# coordinates scaled by image width/height, so the resize does not affect them.
resize_images(image_file_list , overwrite= True)

Split into train and validation set

In [12]:
import random
random.seed(689)

# Randomly selecting 69 distinct indexes; the files at these indexes form the validation set
valid_set_index = random.sample(range(len(image_file_list)), 69)

image_file_list = sorted(image_file_list)
label_file_list_txt = sorted(label_file_list_txt)

# Images and labels are paired by position below, so both lists must line up
# one-to-one. Sorting alone does not guarantee that, because the differing
# extensions take part in the comparison, so the pairing is verified on the
# paths with their extensions stripped.
image_stems = [os.path.splitext(path)[0] for path in image_file_list]
label_stems = [os.path.splitext(path)[0] for path in label_file_list_txt]
assert image_stems == label_stems, 'image files and label files do not pair up one-to-one'

# visual check of the image files and labels being in the same order
print('Checking files concurrency')
print(image_file_list[:5])
print(label_file_list_txt[:5])

# code to separate train and validation set
valid_selected_images = []
valid_selected_labels = []

for index in valid_set_index:
    valid_selected_images.append(image_file_list[index])
    valid_selected_labels.append(label_file_list_txt[index])


print('\n\nChecking files concurrency in validation set')
print(valid_selected_images[:5])
print(valid_selected_labels[:5])

Checking files concurrency
['/content/train_data_raw/000000_JPG.rf.d3371cb3d63a59c5ba6730368b7905af.jpg', '/content/train_data_raw/000000_jpg.rf.beffaf3b548106ccf1da5dc629bc9504.jpg', '/content/train_data_raw/000000_jpg.rf.e662cb85f63817325956fea222d0990f.jpg', '/content/train_data_raw/000000_jpg.rf.ee75fdf06813399a8376c6ff7056423a.jpg', '/content/train_data_raw/000001_JPG.rf.ccfdd243a78a6cfa119be0349d18c0ed.jpg']
['/content/train_data_raw/000000_JPG.rf.d3371cb3d63a59c5ba6730368b7905af.txt', '/content/train_data_raw/000000_jpg.rf.beffaf3b548106ccf1da5dc629bc9504.txt', '/content/train_data_raw/000000_jpg.rf.e662cb85f63817325956fea222d0990f.txt', '/content/train_data_raw/000000_jpg.rf.ee75fdf06813399a8376c6ff7056423a.txt', '/content/train_data_raw/000001_JPG.rf.ccfdd243a78a6cfa119be0349d18c0ed.txt']


Checking files concurrency in validation set
['/content/train_data_raw/000086_JPG.rf.a9c84c634a161dec85f5ca62fff0d0b7.jpg', '/content/train_data_raw/000022_JPG.rf.2142a3416365cf34993f4abb1d

In [13]:
import shutil

# Creating validation directory
valid_dir = '/content/valid/'

if not os.path.exists(valid_dir):
    os.makedirs(valid_dir)
    print(f"Directory {valid_dir} is created successfully!")
else:
    print(f'Directory {valid_dir} already exists !')


def _move_to_valid_dir(source_path):
    """Move one file into valid_dir under its own file name, or report it as missing."""
    if not os.path.exists(source_path):
        print(f'{source_path} not found')
        return
    shutil.move(source_path, valid_dir + source_path.split('/')[-1])


for position, image_path in enumerate(valid_selected_images):
    # moving the image file first, then the label file at the same position
    _move_to_valid_dir(image_path)
    _move_to_valid_dir(valid_selected_labels[position])

Directory /content/valid/ is created successfully!


In [14]:
# The validation files were moved out above, so what remains in train_data_raw is the training split.
!mv train_data_raw train 

Directories fixup


In [15]:
def lookup_image_file_paths(formats, dir):
    """
    Collect the paths of all files under a directory prefix that have one of
    the given extensions.

    formats: iterable of file extensions without the leading dot (e.g. 'jpg')
    dir: directory prefix; it is concatenated directly with '*.<extension>',
         so it needs to end with a path separator
    returns: list of matching paths, grouped by extension in the order the
             extensions appear in formats
    """
    return [
        match
        for extension in formats
        for match in glob.glob(f'{dir}*.{extension}')
    ]

def make_txt_file(formats, dir):
    """
    Write the image list file for one data split.

    The split name is the last folder of dir (dir must end with '/'). Every
    image found in dir with one of the given extensions is listed, one per
    line, as 'data/<split>/<file name>' in '/content/metadata/<split>.txt'.
    A one-line summary with the number of images is printed.

    formats: file extensions to look for, without the leading dot
    dir: directory of the split, ending with '/'
    """
    image_paths = lookup_image_file_paths(formats, dir)
    # With a trailing '/', the last piece of the split is empty and the one
    # before it is the folder name.
    split_name = dir.split('/')[-2]

    print(f'{split_name} : {len(image_paths)} images')

    entries = [
        f'data/{split_name}/' + image_path.split('/')[-1] + '\n'
        for image_path in image_paths
    ]
    # The with-statement closes the file on exit.
    with open(f'/content/metadata/{split_name}.txt', 'w') as outfile:
        outfile.writelines(entries)


In [16]:
train_dir = '/content/train/'
valid_dir = '/content/valid/'
!mkdir metadata

# Making the .txt file containing list of images. 
make_txt_file(formats, dir = train_dir)
make_txt_file(formats, dir = valid_dir)

# Writing the file traffic.names
object_labels = list(lut.keys())
with open('/content/metadata/trash.names', 'w') as outfile:
    for label in object_labels:
        outfile.write(label + '\n')
    outfile.close()

# Writing the file traffic.data
data_config = f'classes=4\ntrain=train.txt\nvalid=valid.txt\nnames=traffic.names'
with open('/content/metadata/trash.data', 'w') as outfile:
    outfile.write(data_config + '\n')
    outfile.close()

train : 300 images
valid : 69 images


In [17]:
# Bundling the train and valid folders together with the metadata folder into one archive;
# the zip log is cleared from the cell output afterwards.
!zip -r trash-processed.zip train valid metadata
clear_output()

Uploading processed zip to drive

In [18]:

# Mounting Google Drive at /content/drive so the archive can be copied into it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
# NOTE(review): assumes the 'Plastic Detection' folder already exists in Drive; if it does not,
# cp would create a file with that name instead of copying into a folder — confirm.
!cp trash-processed.zip '/content/drive/MyDrive/Plastic Detection'