In [None]:
from IPython.display import clear_output
import os, glob


# Download the dataset archive with gdown (addressed by its file id), extract
# it, and delete the zip once it has been unpacked.
!gdown --id 1GIiqqmqEPSiGBb1MU1kIZG4q7BOIzqik
!unzip traffic-dataset.zip; rm traffic-dataset.zip;
# Clear the extraction log from the cell output.
clear_output()

# The zip contains .rar archives, so a second extraction step is needed.
!unrar x train.rar
!unrar x test1.rar
clear_output()

# Remove the .rar files that are no longer needed.
!rm train.rar
!rm test1.rar

# Remove the unnecessary demo data folder from the workspace.
!rm -r sample_data

# Rename the raw data folder so that its name contains no spaces; the cells
# below refer to it as train_data_raw.
%mv 'Final Train Dataset' train_data_raw

In [None]:
corrupt_files = ['231.jpg', '231.xml', 'Pias (359).PNG','Pias (359).xml', 'Pias (360).PNG', 'Pias (360).xml']

%cd /content/train_data_raw/

for file in corrupt_files:
    file_path = os.path.join('/content/train_data_raw/', file)  
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f'{file} is removed successfully')
    else:
        print(f'{file} is already deleted')

%cd /content/

In [None]:
"""
Thanks to @bjornstenger for his excellent code for converting annotations from XML format to YOLO .txt format.
The code in this cell is adapted from the original, linked below.
Original Link: https://github.com/bjornstenger/xml2yolo/blob/master/convert.py
"""

from xml.dom import minidom

# Class names in label-index order: the position of a name in this tuple is
# the index written to the label files and used in the training process.
_class_names = (
    "ambulance",
    "auto rickshaw",
    "bicycle",
    "bus",
    "car",
    "garbagevan",
    "human hauler",
    "minibus",
    "minivan",
    "motorbike",
    "pickup",
    "army vehicle",
    "policecar",
    "rickshaw",
    "scooter",
    "suv",
    "taxi",
    "three wheelers (CNG)",
    "truck",
    "van",
    "wheelbarrow",
)

# Look-up table: class name -> class index.
lut = {class_name: class_index for class_index, class_name in enumerate(_class_names)}

# Number of objects seen per class name; filled in by convert_xml2yolo.
label_count = {}

print(f'Object Names: {list(lut.keys())}' )

def convert_coordinates(size, box):
    """
    Turn a pixel-space bounding box into a normalised centre/size box.

    size: (width, height) of the image
    box: (xmin, xmax, ymin, ymax) -- note the order: both x bounds come first,
         then both y bounds

    returns a tuple (x_center, y_center, width, height) in which the x values
    are scaled by 1/size[0] and the y values by 1/size[1]
    """
    # Reciprocals of the image dimensions, used as scale factors.
    scale_x = 1.0/size[0]
    scale_y = 1.0/size[1]

    x_low, x_high = box[0], box[1]
    y_low, y_high = box[2], box[3]

    x_center = ((x_low + x_high)/2.0)*scale_x
    y_center = ((y_low + y_high)/2.0)*scale_y
    box_width = (x_high - x_low)*scale_x
    box_height = (y_high - y_low)*scale_y
    return (x_center, y_center, box_width, box_height)


def _first_child_text(node, tag):
    """Return the text held by the first `tag` element found under `node`."""
    return node.getElementsByTagName(tag)[0].firstChild.data


def convert_xml2yolo(filelist, lut ):
    """
    Convert XML annotation files into .txt label files.

    For every path in `filelist`, a file with the same name but a .txt
    extension is written next to it. It holds one line per <object> element:

        <class_index> <x> <y> <w> <h>

    where the four box values come from convert_coordinates and are written
    with six decimals.

    filelist: list of .xml file paths to convert to .txt file
    lut: a dictionary containing class_name to class_index mapping

    Objects whose name is missing from `lut` are written with class index -1
    and a warning is printed. Every object is also tallied by name in the
    module-level `label_count` dictionary.
    """
    for fname in filelist:
        xmldoc = minidom.parse(fname)
        # Swap the 4-character '.xml' suffix for '.txt'.
        fname_out = (fname[:-4]+'.txt')

        # Read the image size before creating the output file, so that a
        # malformed XML does not leave an empty .txt file behind.
        size = xmldoc.getElementsByTagName('size')[0]
        width = int(_first_child_text(size, 'width'))
        height = int(_first_child_text(size, 'height'))

        with open(fname_out, "w") as f:
            for item in xmldoc.getElementsByTagName('object'):
                # get class label
                classid = _first_child_text(item, 'name')
                if classid in lut:
                    label_str = str(lut[classid])
                else:
                    label_str = "-1"
                    # Both values must be passed as one tuple: writing
                    # `fmt % classid, fname` formats with classid alone and
                    # raises "not enough arguments for format string".
                    print("warning: label '%s' not in look-up table for file '%s'" % (classid, fname))

                # get bbox coordinates, in the (xmin, xmax, ymin, ymax) order
                # that convert_coordinates indexes its `box` argument by
                bndbox = item.getElementsByTagName('bndbox')[0]
                b = tuple(
                    float(_first_child_text(bndbox, tag))
                    for tag in ('xmin', 'xmax', 'ymin', 'ymax')
                )
                bb = convert_coordinates((width,height), b)

                label_count[classid] = label_count.get(classid, 0) + 1

                f.write(label_str + " " + " ".join([("%.6f" % a) for a in bb]) + '\n')


In [None]:
# Reading Image file paths
# glob patterns are matched case-sensitively on Linux, so every spelling of an
# extension has to be listed. 'PNG' and 'JPEG' are included because the raw
# data contains upper-case '.PNG' files (see the corrupt-file list above);
# without them those images are skipped while their XML labels are still
# converted, which leaves the image and label lists out of step.
formats = ['jpg', 'jpeg', 'JPG', 'png', 'JPEG', 'PNG']
image_file_list = []
for ext in formats:
    image_file_list.extend(glob.glob(f'/content/train_data_raw/*.{ext}'))
# Keep each path once (order preserved) in case two patterns match one file.
image_file_list = list(dict.fromkeys(image_file_list))

# Reading XML label file paths
label_file_list_xml = glob.glob('/content/train_data_raw/*.xml')

print(f'Image files found: {len(image_file_list)} \nLabel files found: { len(label_file_list_xml)}')

In [None]:
# Write a .txt label file next to every XML annotation, then collect the
# generated files to report how many there are.
convert_xml2yolo(filelist=label_file_list_xml, lut=lut)
label_file_list_txt = glob.glob('/content/train_data_raw/*.txt')
txt_total = len(label_file_list_txt)
print(f'XML --> TXT files: {txt_total}')

In [None]:
# Display how many objects of each class name were counted by convert_xml2yolo.
label_count

In [None]:
from PIL import Image
# Tally how many images there are of each size (the value of `img.size`).
img_sizes = {}

for fname in image_file_list:
    # The context manager closes the image file as soon as its size has been
    # read, instead of leaving one open handle per image to the garbage
    # collector.
    with Image.open(fname) as img:
        img_sizes[img.size] = img_sizes.get(img.size, 0) + 1
img_sizes

In [None]:
def resize_images(file_list, width = 1024, height = 1024, overwrite = True, save_dir = ''):
    """
    Resize every image in `file_list` to exactly `width` x `height`.

    file_list: list of image file paths
    width, height: target size passed to Image.resize (the aspect ratio of
                   the source image is not taken into account)
    overwrite: when True each image is replaced in place; otherwise the
               resized copy is saved into `save_dir` under its original name
    save_dir: output directory used when `overwrite` is False; it is created
              if it does not exist yet, and may be given with or without a
              trailing slash

    The per-file progress lines are cleared from the cell output at the end.
    """
    total_files = len(file_list)
    if not overwrite and save_dir:
        os.makedirs(save_dir, exist_ok=True)

    for idx, path in enumerate(file_list, start=1):
        filename = os.path.basename(path)
        # Close the source file before saving: in overwrite mode the result
        # is written back to the very same path.
        with Image.open(path) as img:
            original_size = img.size
            img_resized = img.resize((width, height))
        if overwrite:
            img_resized.save(path)
            print(f"{idx}/{total_files}: {filename} {original_size}--> ({width}x{height})")
        else:
            img_resized.save(os.path.join(save_dir, filename))
            print(f'{filename} saved to {save_dir}')
    clear_output()

In [None]:
import random
random.seed(1455)

# Number of images held out for the validation set.
valid_set_size = 600

# Randomly selecting the indexes of the files that go to the validation set
valid_set_index = random.sample(range(len(image_file_list)), valid_set_size)

image_file_list = sorted(image_file_list)
label_file_list_txt = sorted(label_file_list_txt)

# sanity check of the image files and labels being in the same order
print('Checking files concurrency')
print(image_file_list[:5])
print(label_file_list_txt[:5])

# code to separate train and validation set
valid_selected_images = [image_file_list[index] for index in valid_set_index]

# Each label path is derived from its own image (same name, .txt extension)
# rather than read from the same position in the sorted label list: as soon
# as the two lists differ in length, positional pairing silently matches
# images with the labels of other images.
# Assumes every annotation shares its image's file name -- TODO confirm.
valid_selected_labels = [
    os.path.splitext(image_path)[0] + '.txt' for image_path in valid_selected_images
]

# Report selected images that have no label file instead of mispairing them.
known_labels = set(label_file_list_txt)
missing_labels = [path for path in valid_selected_labels if path not in known_labels]
if missing_labels:
    print(f'warning: {len(missing_labels)} selected images have no label file, e.g. {missing_labels[:5]}')

print('\n\nChecking files concurrency in validation set')
print(valid_selected_images[:5])
print(valid_selected_labels[:5])

In [None]:
# Rename the `test` folder to `final`: the cells below use /content/test/ as
# the validation directory and /content/final/ as the test directory.
!mv test final

In [None]:
import shutil

# Creating validation directory
# valid_dir = '/content/valid/'
valid_dir = '/content/test/'

if not os.path.exists(valid_dir):
    os.makedirs(valid_dir)
    print(f"Directory {valid_dir} is created successfully!") 
else:
    print(f'Directory {valid_dir} already exists !')


# Move every selected image and then its label file into the validation
# directory; a path that no longer exists is reported and skipped.
for image_path, label_path in zip(valid_selected_images, valid_selected_labels):
    for source_path in (image_path, label_path):
        if not os.path.exists(source_path):
            print(f'{source_path} not found')
            continue
        target_name = source_path.split('/')[-1]
        shutil.move(source_path , valid_dir + target_name)

In [None]:
# !mv train_data_raw train
# Rename the training folder to `obj`, the name used below for train_dir and
# for the obj.zip archive.
!mv train_data_raw obj 

In [None]:
def lookup_image_file_paths(formats, dir):
    """
    Collect the paths of all files in a directory that carry one of the given
    extensions.

    formats: iterable of file extensions without the leading dot
    dir: directory prefix; it is placed directly in front of the '*.<ext>'
         pattern, so it has to end with a path separator

    returns a list of matching paths, grouped in the order of `formats`
    """
    return [
        matched_path
        for extension in formats
        for matched_path in glob.glob(f'{dir}*.{extension}')
    ]

def make_txt_file(formats, dir, prefix = '/mydrive/final/', out_dir = '/content/metadata/'):
    """
    Write a text file that lists the images of one directory, one per line.

    formats: file extensions (without the dot) of the images to list
    dir: directory holding the images, with or without a trailing slash
    prefix: text written in front of every file name; the default keeps the
            previously hard-coded '/mydrive/final/' location, so pass another
            value when listing a different folder
    out_dir: directory the list file is written into (created if missing)

    The output file is named after the last component of `dir`, e.g.
    '/content/final/' produces '<out_dir>/final.txt'.
    """
    # Guarantee exactly one trailing separator: lookup_image_file_paths glues
    # the directory and the glob pattern together without adding one.
    dir = os.path.join(dir, '')
    filepaths = lookup_image_file_paths(formats, dir)

    txt_file_name = os.path.basename(os.path.normpath(dir))

    print(f'{txt_file_name} : {len(filepaths)} images')
    os.makedirs(out_dir, exist_ok=True)
    # The `with` block closes the file; no explicit close() is needed.
    with open(os.path.join(out_dir, f'{txt_file_name}.txt'), 'w') as outfile:
        for filepath in filepaths:
            outfile.write(prefix + os.path.basename(filepath) + '\n')

In [None]:
# train_dir = '/content/train/'
# valid_dir = '/content/valid/'
# test_dir =  '/content/test/'
train_dir = '/content/obj/'
valid_dir = '/content/test/'
test_dir =  '/content/final/'

# Create the output folder by absolute path: this does not depend on the
# current working directory and does not fail when the cell is run again.
os.makedirs('/content/metadata', exist_ok=True)

# Making the .txt file containing list of images.
# make_txt_file(formats, dir = train_dir)
make_txt_file(formats, dir = test_dir)
# make_txt_file(formats, dir = valid_dir)

# Writing the file traffic.names: one class name per line, in index order.
object_labels = list(lut.keys())
with open('/content/metadata/traffic.names', 'w') as outfile:
    outfile.writelines(label + '\n' for label in object_labels)

# Writing the file traffic.data
# The class count is taken from the look-up table so the two cannot drift apart.
# NOTE(review): this references train.txt and valid.txt, while the only active
# make_txt_file call above produces final.txt -- confirm where the train/valid
# lists come from.
data_config = f'classes={len(lut)}\ntrain=train.txt\nvalid=valid.txt\nnames=traffic.names'
with open('/content/metadata/traffic.data', 'w') as outfile:
    outfile.write(data_config + '\n')

In [None]:
# NOTE(review): always fails -- presumably a deliberate stop so that "Run All"
# halts here, before the Google Drive cells below; confirm.
assert False

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the cells below copy files onto it.
drive.mount('/content/gdrive')

In [None]:
# Make /mydrive a symlink to the "My Drive" folder of the mounted Drive, so
# that later paths do not need an escaped space.
!ln -s /content/gdrive/My\ Drive/ /mydrive
# !ls /mydrive

In [None]:
# Copy the `final` folder to Google Drive through the /mydrive symlink.
!cp -r final '/mydrive'

In [None]:
# Copy the image list written by make_txt_file for the `final` folder to
# Google Drive.
!cp '/content/metadata/final.txt' '/mydrive'

In [None]:
# NOTE(review): always fails -- presumably a deliberate stop so that "Run All"
# halts here, before the archives below are created and copied; confirm.
assert False

In [None]:
# List the yolov4 folder on Drive, the destination of the archives copied below.
!ls /mydrive/yolov4

In [None]:
# Archive the `obj` (train_dir) and `test` (valid_dir) folders, then copy both
# zip files into the yolov4 folder on Drive.
!zip -r obj.zip obj
!zip -r test.zip test
!cp obj.zip '/mydrive/yolov4'
!cp test.zip '/mydrive/yolov4'