In [2]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [38]:
# we are loading all xml files and storing them in a list
xml_list = glob('./1_datapreparation/data_images/*.xml')
# data cleaning as in removing \\ and replace with /
xml_list = list(map(lambda x: x.replace('\\', '/'), xml_list))
xml_list


['./1_datapreparation/data_images/01.xml.xml',
 './1_datapreparation/data_images/02.xml.xml',
 './1_datapreparation/data_images/03.xml.xml',
 './1_datapreparation/data_images/04.xml.xml',
 './1_datapreparation/data_images/05.xml.xml',
 './1_datapreparation/data_images/06.xml.xml',
 './1_datapreparation/data_images/07.xml.xml',
 './1_datapreparation/data_images/08.xml.xml',
 './1_datapreparation/data_images/09.xml.xml',
 './1_datapreparation/data_images/10.xml.xml',
 './1_datapreparation/data_images/11.xml.xml',
 './1_datapreparation/data_images/12.xml.xml']

In [39]:
# read XML file and from each extract 
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)

def extract_text(filename):    
    tree = et.parse(filename)
    root = tree.getroot()
    
    # extract file name
    image_name = root.find('filename').text
    
    # extract width and height
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    
    # extracting objects/bounding boxes of objects
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        # bounding box dimensions
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])

    return parser


In [42]:
parser_all = list(map(extract_text, xml_list))
data = reduce(lambda x, y: x+y, parser_all)
print(data)

[['01.jpg', '2047', '1365', 'car', '202', '1021', '741', '1164'], ['01.jpg', '2047', '1365', 'car', '1118', '1858', '718', '1095'], ['02.jpg', '2048', '1365', 'car', '497', '2035', '466', '1249'], ['03.jpg', '1600', '1200', 'car', '37', '1547', '523', '1035'], ['03.jpg', '1600', '1200', 'car', '29', '918', '509', '852'], ['03.jpg', '1600', '1200', 'car', '1292', '1600', '503', '917'], ['04.jpg', '2048', '1536', 'dog', '531', '1564', '353', '1439'], ['05.jpg', '2047', '1382', 'bicycle', '677', '2000', '339', '1183'], ['06.jpg', '950', '618', 'boat', '1', '141', '178', '253'], ['06.jpg', '950', '618', 'boat', '64', '405', '208', '369'], ['06.jpg', '950', '618', 'boat', '65', '236', '247', '309'], ['06.jpg', '950', '618', 'boat', '177', '638', '225', '461'], ['07.jpg', '2000', '1500', 'cow', '143', '358', '262', '389'], ['07.jpg', '2000', '1500', 'cow', '977', '1141', '235', '354'], ['07.jpg', '2000', '1500', 'cow', '1155', '1464', '544', '888'], ['07.jpg', '2000', '1500', 'cow', '1270', 

In [46]:
df = pd.DataFrame(data, columns = ['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,01.jpg,2047,1365,car,202,1021,741,1164
1,01.jpg,2047,1365,car,1118,1858,718,1095
2,02.jpg,2048,1365,car,497,2035,466,1249
3,03.jpg,1600,1200,car,37,1547,523,1035
4,03.jpg,1600,1200,car,29,918,509,852


In [50]:
df.shape

(28, 8)

In [48]:
#Counts how much of each object has a bounding box
df['name'].value_counts()

name
person     11
car         6
cow         5
boat        4
dog         1
bicycle     1
Name: count, dtype: int64

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  28 non-null     object
 1   width     28 non-null     object
 2   height    28 non-null     object
 3   name      28 non-null     object
 4   xmin      28 non-null     object
 5   xmax      28 non-null     object
 6   ymin      28 non-null     object
 7   ymax      28 non-null     object
dtypes: object(8)
memory usage: 1.9+ KB


In [51]:
# type conversion values to ints
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  28 non-null     object
 1   width     28 non-null     int32 
 2   height    28 non-null     int32 
 3   name      28 non-null     object
 4   xmin      28 non-null     int32 
 5   xmax      28 non-null     int32 
 6   ymin      28 non-null     int32 
 7   ymax      28 non-null     int32 
dtypes: int32(6), object(2)
memory usage: 1.2+ KB


In [52]:
# center x, center y
df['center_x'] =((df['xmax'] + df['xmin'])/2) / df['width']
df['center_y'] =((df['ymax'] + df['ymin'])/2) / df['height']

#width
df['w'] = (df['xmax'] - df['xmin']) / df['width']
#height
df['h'] = (df['ymax'] - df['ymin']) / df['height']

In [53]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,01.jpg,2047,1365,car,202,1021,741,1164,0.29873,0.697802,0.400098,0.30989
1,01.jpg,2047,1365,car,1118,1858,718,1095,0.726917,0.664103,0.361505,0.27619
2,02.jpg,2048,1365,car,497,2035,466,1249,0.618164,0.628205,0.750977,0.573626
3,03.jpg,1600,1200,car,37,1547,523,1035,0.495,0.649167,0.94375,0.426667
4,03.jpg,1600,1200,car,29,918,509,852,0.295938,0.567083,0.555625,0.285833


In [56]:
images = df['filename'].unique()
len(images)

12

In [70]:
# 80% train and 20% test
img_df = pd.DataFrame(images, columns=['filename'])
img_df.head()
img_train = tuple(img_df.sample(frac=0.8)['filename']) #shuffle then randomly picks 80% of images

In [71]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [72]:
img_test

('02.jpg', '11.jpeg')

In [73]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [74]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,01.jpg,2047,1365,car,202,1021,741,1164,0.29873,0.697802,0.400098,0.30989
1,01.jpg,2047,1365,car,1118,1858,718,1095,0.726917,0.664103,0.361505,0.27619
3,03.jpg,1600,1200,car,37,1547,523,1035,0.495,0.649167,0.94375,0.426667
4,03.jpg,1600,1200,car,29,918,509,852,0.295938,0.567083,0.555625,0.285833
5,03.jpg,1600,1200,car,1292,1600,503,917,0.90375,0.591667,0.1925,0.345


In [75]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
2,02.jpg,2048,1365,car,497,2035,466,1249,0.618164,0.628205,0.750977,0.573626
25,11.jpeg,1200,800,person,124,569,53,715,0.28875,0.48,0.370833,0.8275


In [76]:
# label encoding
def label_encoding(x):
    labels = {'person':0, 'car':1, 'chair':2, 'bottle':3, 'pottedplant':4, 'bird':5, 'dog':6, 'sofa':7, 'bicycle':8, 'horse':9, 'boat':10, 'motorbike':11, 
              'cat':12, 'tvmonitor':13, 'cow':14, 'sheep':15, 'aeroplane':16, 'train':17, 'diningtable':18, 'bus':19}
    return labels[x]
              

In [79]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)
train_df.head(7)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = test_df['name'].apply(label_encoding)


Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,01.jpg,2047,1365,car,202,1021,741,1164,0.29873,0.697802,0.400098,0.30989,1
1,01.jpg,2047,1365,car,1118,1858,718,1095,0.726917,0.664103,0.361505,0.27619,1
3,03.jpg,1600,1200,car,37,1547,523,1035,0.495,0.649167,0.94375,0.426667,1
4,03.jpg,1600,1200,car,29,918,509,852,0.295938,0.567083,0.555625,0.285833,1
5,03.jpg,1600,1200,car,1292,1600,503,917,0.90375,0.591667,0.1925,0.345,1
6,04.jpg,2048,1536,dog,531,1564,353,1439,0.511475,0.583333,0.504395,0.707031,6
7,05.jpg,2047,1382,bicycle,677,2000,339,1183,0.653884,0.550651,0.646312,0.610709,8


In [80]:
import os
from shutil import move

In [82]:
train_folder = './1_datapreparation/data_images/train'
test_folder = './1_datapreparation/data_images/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

In [123]:
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')


In [124]:
# our job is to use the data and put it in text files to be read
#groupby_obj_train.get_group('01.jpg').set_index('filename').to_csv('sample.txt', index=False, header=False) 
# save each image in train/test and its respective labels in .txt

def save_data(filename, folder_path, group_obj):
    #move image
    src = os.path.join('./1_datapreparation/data_images', filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst) #moving image to destination folder

    #save labels
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)

In [125]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [126]:
filename_series.apply(save_data, args=(train_folder, groupby_obj_train))

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
9    None
dtype: object

In [127]:
filename_series = pd.Series(groupby_obj_test.groups.keys())
filename_series.apply(save_data, args=(test_folder, groupby_obj_test))

0    None
1    None
dtype: object