Dataset: https://www.kaggle.com/datasets/perfect9015/pillsdetectiondataset

In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [17]:
# Load all XML files and store in a list
xml_list = glob("./pills image/*.xml")
print(xml_list[0])

# Convert the "\" in the directory to "/"
xml_list = list(map(lambda x: x.replace('\\','/'),xml_list))
print(xml_list[0])

./pills image\360_F_408781728_pROdGsAa6GSKe7iw1ah0CkgeZ1HU829o.xml
./pills image/360_F_408781728_pROdGsAa6GSKe7iw1ah0CkgeZ1HU829o.xml


In [19]:
# From each XML file, extract filename, size(width,height), and object(name, xmin, xmax, ymin, ymax)

def extract_text(filename):
    
    # Parse the XML file
    tree = et.parse(filename)

    # Enter the root element to access the other attributes such as filename, size, etc.
    root = tree.getroot()

    # Extract the filename
    image_name = root.find('filename').text

    # Extract the width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text

    # getting the object(name, xmin, xmax, ymin, ymax)
    # some XML files have multiple objects, aka multiple pills. Therefore, require for loop to retrieve all object locations.
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])
    
    return parser

In [20]:
parser_all = list(map(extract_text, xml_list))

In [21]:
data = reduce(lambda x,y: x+y, parser_all)

In [27]:
df = pd.DataFrame(data, columns=['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])
print(df.shape)
df.head()

(686, 8)


Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,360_F_408781728_pROdGsAa6GSKe7iw1ah0CkgeZ1HU82...,540,360,tablets,233,273,218,255
1,360_F_408781728_pROdGsAa6GSKe7iw1ah0CkgeZ1HU82...,540,360,tablets,282,299,223,240
2,360_F_408781728_pROdGsAa6GSKe7iw1ah0CkgeZ1HU82...,540,360,tablets,287,302,240,253
3,360_F_408781728_pROdGsAa6GSKe7iw1ah0CkgeZ1HU82...,540,360,tablets,315,341,224,250
4,5b9bd9351f00002c002100a3.jpeg,720,480,tablets,347,390,267,312


In [33]:
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 686 entries, 0 to 685
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  686 non-null    object
 1   width     686 non-null    int32 
 2   height    686 non-null    int32 
 3   name      686 non-null    object
 4   xmin      686 non-null    int32 
 5   xmax      686 non-null    int32 
 6   ymin      686 non-null    int32 
 7   ymax      686 non-null    int32 
dtypes: int32(6), object(2)
memory usage: 26.9+ KB


In [34]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']

# w
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h 
df['h'] = (df['ymax']-df['ymin'])/df['height']
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,360_F_408781728_pROdGsAa6GSKe7iw1ah0CkgeZ1HU82...,540,360,tablets,233,273,218,255,0.468519,0.656944,0.074074,0.102778
1,360_F_408781728_pROdGsAa6GSKe7iw1ah0CkgeZ1HU82...,540,360,tablets,282,299,223,240,0.537963,0.643056,0.031481,0.047222
2,360_F_408781728_pROdGsAa6GSKe7iw1ah0CkgeZ1HU82...,540,360,tablets,287,302,240,253,0.54537,0.684722,0.027778,0.036111
3,360_F_408781728_pROdGsAa6GSKe7iw1ah0CkgeZ1HU82...,540,360,tablets,315,341,224,250,0.607407,0.658333,0.048148,0.072222
4,5b9bd9351f00002c002100a3.jpeg,720,480,tablets,347,390,267,312,0.511806,0.603125,0.059722,0.09375


# Split data into train and test set

In [35]:
images = df['filename'].unique()

In [40]:
# 80% train and 20% test
img_df = pd.DataFrame(images, columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [42]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [45]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

# Assign ID Number to Names (Encoding)

In [50]:
train_df['id'] = 0
test_df['id'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = 0


In [51]:
import os
from shutil import move

In [57]:
train_folder = 'pills image/train'
test_folder = 'pills image/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

In [54]:
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [58]:
# groupby_obj_train.get_group('5b9bd9351f00002c002100a3.jpeg').set_index('filename').to_csv('sample.txt', index=False, header=False)
# save each image in train/test folder and respective labels in .txt

def save_data(filename, folder_path, group_obj):
    src = os.path.join('pills image', filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst) # move images to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=" ", index=False, header=False)

In [66]:
# move the train to train folder
filename_series = pd.Series(groupby_obj_train.groups.keys())
filename_series.apply(save_data, args=(train_folder, groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
117    None
118    None
119    None
120    None
121    None
Length: 122, dtype: object

In [67]:
# move the train to train folder
filename_series = pd.Series(groupby_obj_test.groups.keys())
filename_series.apply(save_data, args=(test_folder, groupby_obj_test))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
dtype: object