### CSV DATA PREPARATION

In [1]:
import pandas as pd
import xml.etree.ElementTree as xet
from glob import glob

import numpy as np
import os

In [2]:
path = glob('./images/*.xml')
path

['./images\\11.xml',
 './images\\12.xml',
 './images\\13.xml',
 './images\\14.xml',
 './images\\15.xml',
 './images\\16.xml',
 './images\\17.xml',
 './images\\19.xml',
 './images\\2.xml',
 './images\\20.xml',
 './images\\21.xml',
 './images\\22.xml',
 './images\\23.xml',
 './images\\24.xml',
 './images\\25.xml',
 './images\\26.xml',
 './images\\28.xml',
 './images\\29.xml',
 './images\\30.xml',
 './images\\31.xml',
 './images\\32.xml',
 './images\\34.xml',
 './images\\35.xml',
 './images\\36.xml',
 './images\\37.xml',
 './images\\38.xml',
 './images\\39.xml',
 './images\\40.xml',
 './images\\41.xml',
 './images\\5.xml',
 './images\\6.xml',
 './images\\7.xml',
 './images\\9.xml',
 './images\\D1529ES.xml',
 './images\\D1553ST.xml',
 './images\\D1569AKI.xml',
 './images\\D1594AJW.xml',
 './images\\D1630ADB.xml',
 './images\\D1688ERM.xml',
 './images\\D1705AGC.xml',
 './images\\D1728UI.xml',
 './images\\D1750AHW.xml',
 './images\\D1770AER-1.xml',
 './images\\D1770AER.xml',
 './images\\D177

In [3]:
labels_dict = dict(filepath=[],xmin=[],xmax=[],ymin=[],ymax=[])
for filename in path:

    #filename = path[0]
    info = xet.parse(filename)
    root = info.getroot()
    member_object = root.find('object')
    labels_info = member_object.find('bndbox')
    xmin = int(labels_info.find('xmin').text)
    xmax = int(labels_info.find('xmax').text)
    ymin = int(labels_info.find('ymin').text)
    ymax = int(labels_info.find('ymax').text)
    #print(xmin,xmax,ymin,ymax)
    labels_dict['filepath'].append(filename)
    labels_dict['xmin'].append(xmin)
    labels_dict['xmax'].append(xmax)
    labels_dict['ymin'].append(ymin)
    labels_dict['ymax'].append(ymax)

In [4]:
df = pd.DataFrame(labels_dict)
df

Unnamed: 0,filepath,xmin,xmax,ymin,ymax
0,./images\11.xml,622,1294,688,919
1,./images\12.xml,891,1694,241,544
2,./images\13.xml,259,791,692,870
3,./images\14.xml,466,906,543,695
4,./images\15.xml,147,1028,201,649
...,...,...,...,...,...
208,./images\plat-nomor-putih3405.xml,441,584,280,340
209,./images\Shutterstock_2207597621-1-1.xml,168,817,156,376
210,./images\Z1401WZ.xml,1022,1231,503,575
211,./images\Z1820KB.xml,644,2022,685,1082


In [5]:
df.to_csv('labels.csv',index=False)

### YOLO DATA PREPARATION

In [6]:
df = pd.read_csv('labels.csv')
df.head()

Unnamed: 0,filepath,xmin,xmax,ymin,ymax
0,./images\11.xml,622,1294,688,919
1,./images\12.xml,891,1694,241,544
2,./images\13.xml,259,791,692,870
3,./images\14.xml,466,906,543,695
4,./images\15.xml,147,1028,201,649


### Labelling YOLO Format
    center_x, center_y, width , height

In [7]:
# parsing
def parsing(path):
    parser = xet.parse(path).getroot()
    name = parser.find('filename').text
    filename = f'./images/{name}'

    # width and height
    parser_size = parser.find('size')
    width = int(parser_size.find('width').text)
    height = int(parser_size.find('height').text)
    
    return filename, width, height
df[['filename','width','height']] = df['filepath'].apply(parsing).apply(pd.Series)

In [8]:
df.head()

Unnamed: 0,filepath,xmin,xmax,ymin,ymax,filename,width,height
0,./images\11.xml,622,1294,688,919,./images/11.jpg,2048,1536
1,./images\12.xml,891,1694,241,544,./images/12.jpg,2048,1536
2,./images\13.xml,259,791,692,870,./images/13.jpg,960,1280
3,./images\14.xml,466,906,543,695,./images/14.jpg,1280,968
4,./images\15.xml,147,1028,201,649,./images/15.jpg,1280,674


In [9]:
# center_x, center_y, width , height
df['center_x'] = (df['xmax'] + df['xmin'])/(2*df['width'])
df['center_y'] = (df['ymax'] + df['ymin'])/(2*df['height'])

df['bb_width'] = (df['xmax'] - df['xmin'])/df['width']
df['bb_height'] = (df['ymax'] - df['ymin'])/df['height']

In [10]:
df.head()

Unnamed: 0,filepath,xmin,xmax,ymin,ymax,filename,width,height,center_x,center_y,bb_width,bb_height
0,./images\11.xml,622,1294,688,919,./images/11.jpg,2048,1536,0.467773,0.523112,0.328125,0.150391
1,./images\12.xml,891,1694,241,544,./images/12.jpg,2048,1536,0.631104,0.255534,0.39209,0.197266
2,./images\13.xml,259,791,692,870,./images/13.jpg,960,1280,0.546875,0.610156,0.554167,0.139063
3,./images\14.xml,466,906,543,695,./images/14.jpg,1280,968,0.535937,0.639463,0.34375,0.157025
4,./images\15.xml,147,1028,201,649,./images/15.jpg,1280,674,0.458984,0.630564,0.688281,0.664688


In [11]:
### split the data into train and test
df_train = df.iloc[:200]
df_test = df.iloc[200:]

In [12]:
from shutil import copy

In [13]:
train_folder = './data_images/train'

values = df_train[['filename','center_x','center_y','bb_width','bb_height']].values
for fname, x, y, w, h in values:
    image_name = os.path.split(fname)[-1]
    txt_name = os.path.splitext(image_name)[0]
    
    dst_image_path = os.path.join(train_folder, image_name)
    dst_label_file = os.path.join(train_folder, txt_name + '.txt')
    
    # copy each image into the folder
    copy(fname, dst_image_path)

    # generate .txt which has label info
    label_txt = f'0 {x} {y} {w} {h}'
    with open(dst_label_file,mode = 'w') as f:
        f.write(label_txt)
        
        f.close()

In [14]:
test_folder = './data_images/test'

values = df_test[['filename','center_x','center_y','bb_width','bb_height']].values
for fname, x, y, w, h in values:
    image_name = os.path.split(fname)[-1]
    txt_name = os.path.splitext(image_name)[0]
    
    dst_image_path = os.path.join(test_folder, image_name)
    dst_label_file = os.path.join(test_folder, txt_name + '.txt')
    
    # copy each image into the folder
    copy(fname, dst_image_path)

    # generate .txt which has label info
    label_txt = f'0 {x} {y} {w} {h}'
    with open(dst_label_file,mode = 'w') as f:
        f.write(label_txt)
        
        f.close()