In [1]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et
from shutil import move

In [2]:
# Load all XML annotation files from the specified directory
# This directory contains images and corresponding annotation files (.xml)
xml_files = glob('./datasets/images/*.xml')

In [3]:
# Data cleaning: replace backslashes (\\) with forward slashes (/)
# Useful when working with file paths on different operating systems
replace_text = lambda x: x.replace('\\', '/')
xml_files = list(map(replace_text, xml_files))

In [4]:
# Print the list of XML files loaded (for debugging)
xml_files

['./datasets/images/thyrocare_0_122.xml',
 './datasets/images/thyrocare_0_1374.xml',
 './datasets/images/thyrocare_0_1433.xml',
 './datasets/images/thyrocare_0_1861.xml',
 './datasets/images/thyrocare_0_2177.xml',
 './datasets/images/thyrocare_0_2300.xml',
 './datasets/images/thyrocare_0_2505.xml',
 './datasets/images/thyrocare_0_2649.xml',
 './datasets/images/thyrocare_0_2737.xml',
 './datasets/images/thyrocare_0_2841.xml',
 './datasets/images/thyrocare_0_2866.xml',
 './datasets/images/thyrocare_0_2885.xml',
 './datasets/images/thyrocare_0_3439.xml',
 './datasets/images/thyrocare_0_3558.xml',
 './datasets/images/thyrocare_0_3804.xml',
 './datasets/images/thyrocare_0_3813.xml',
 './datasets/images/thyrocare_0_421.xml',
 './datasets/images/thyrocare_0_4468.xml',
 './datasets/images/thyrocare_0_447.xml',
 './datasets/images/thyrocare_0_4618.xml',
 './datasets/images/thyrocare_0_4670.xml',
 './datasets/images/thyrocare_0_4683.xml',
 './datasets/images/thyrocare_0_4790.xml',
 './datasets/i

In [5]:
# Function to extract relevant information from each XML annotation file
def extract_text(filename):
    tree = et.parse(filename)  # Parse the XML file
    root = tree.getroot()       # Get the root of the XML tree

    # Extract the filename of the image
    image_name = root.find('filename').text

    # Extract the size (width and height) of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text

    # Extract all object annotations (bounding boxes)
    objects = root.findall('object')
    parser = []

    # Iterate through each object (annotation) and extract relevant details
    for object in objects:
        name = object.find('name').text                # Object class (e.g., Test Name, Value)
        bndbox = object.find('bndbox')                 # Bounding box coordinates
        xmin = bndbox.find('xmin').text                # Left boundary of the object
        xmax = bndbox.find('xmax').text                # Right boundary of the object
        ymin = bndbox.find('ymin').text                # Top boundary of the object
        ymax = bndbox.find('ymax').text                # Bottom boundary of the object
        
        # Append extracted data as a list for this object
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])

    return parser  # Return the list of extracted objects from this file

In [6]:
# Apply extract_text to all XML files and store results in parser_all
parser_all = list(map(extract_text, xml_files))

In [7]:
parser_all

[[['thyrocare_0_122.jpg', '939', '631', 'Test Name', '1', '329', '399', '463'],
  ['thyrocare_0_122.jpg', '939', '631', 'Value', '503', '554', '394', '453'],
  ['thyrocare_0_122.jpg', '939', '631', 'Units', '607', '671', '392', '450'],
  ['thyrocare_0_122.jpg',
   '939',
   '631',
   'Reference Range',
   '672',
   '736',
   '392',
   '450']],
 [['thyrocare_0_1374.jpg',
   '1068',
   '635',
   'Test Name',
   '51',
   '420',
   '346',
   '595'],
  ['thyrocare_0_1374.jpg', '1068', '635', 'Value', '682', '744', '347', '590'],
  ['thyrocare_0_1374.jpg', '1068', '635', 'Units', '805', '873', '348', '589'],
  ['thyrocare_0_1374.jpg',
   '1068',
   '635',
   'Reference Range',
   '908',
   '991',
   '348',
   '588']],
 [['thyrocare_0_1433.jpg',
   '939',
   '631',
   'Test Name',
   '82',
   '355',
   '407',
   '480'],
  ['thyrocare_0_1433.jpg', '939', '631', 'Value', '495', '542', '400', '469'],
  ['thyrocare_0_1433.jpg', '939', '631', 'Units', '583', '634', '398', '466'],
  ['thyrocare_0_1

In [8]:
# Flatten the list of lists into a single list
data = reduce(lambda x, y : x + y, parser_all)

In [9]:
data

[['thyrocare_0_122.jpg', '939', '631', 'Test Name', '1', '329', '399', '463'],
 ['thyrocare_0_122.jpg', '939', '631', 'Value', '503', '554', '394', '453'],
 ['thyrocare_0_122.jpg', '939', '631', 'Units', '607', '671', '392', '450'],
 ['thyrocare_0_122.jpg',
  '939',
  '631',
  'Reference Range',
  '672',
  '736',
  '392',
  '450'],
 ['thyrocare_0_1374.jpg',
  '1068',
  '635',
  'Test Name',
  '51',
  '420',
  '346',
  '595'],
 ['thyrocare_0_1374.jpg', '1068', '635', 'Value', '682', '744', '347', '590'],
 ['thyrocare_0_1374.jpg', '1068', '635', 'Units', '805', '873', '348', '589'],
 ['thyrocare_0_1374.jpg',
  '1068',
  '635',
  'Reference Range',
  '908',
  '991',
  '348',
  '588'],
 ['thyrocare_0_1433.jpg',
  '939',
  '631',
  'Test Name',
  '82',
  '355',
  '407',
  '480'],
 ['thyrocare_0_1433.jpg', '939', '631', 'Value', '495', '542', '400', '469'],
 ['thyrocare_0_1433.jpg', '939', '631', 'Units', '583', '634', '398', '466'],
 ['thyrocare_0_1433.jpg',
  '939',
  '631',
  'Reference R

In [10]:
# Convert the list to a DataFrame for easier manipulation and analysis
df = pd.DataFrame(data, columns = ['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])

In [11]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,thyrocare_0_122.jpg,939,631,Test Name,1,329,399,463
1,thyrocare_0_122.jpg,939,631,Value,503,554,394,453
2,thyrocare_0_122.jpg,939,631,Units,607,671,392,450
3,thyrocare_0_122.jpg,939,631,Reference Range,672,736,392,450
4,thyrocare_0_1374.jpg,1068,635,Test Name,51,420,346,595


In [12]:
# Print the shape of the DataFrame (number of rows and columns)
df.shape

(236, 8)

In [13]:
# Count the occurrences of each object class (name) in the dataset
df['name'].value_counts()

name
Test Name          59
Value              59
Units              59
Reference Range    59
Name: count, dtype: int64

In [14]:
# Check the structure of the DataFrame and data types of columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236 entries, 0 to 235
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  236 non-null    object
 1   width     236 non-null    object
 2   height    236 non-null    object
 3   name      236 non-null    object
 4   xmin      236 non-null    object
 5   xmax      236 non-null    object
 6   ymin      236 non-null    object
 7   ymax      236 non-null    object
dtypes: object(8)
memory usage: 14.9+ KB


In [15]:
# Convert specific columns to integers for easier computation later
columns = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[columns] = df[columns].astype(int)

In [16]:
# Re-check the DataFrame structure after type conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236 entries, 0 to 235
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  236 non-null    object
 1   width     236 non-null    int64 
 2   height    236 non-null    int64 
 3   name      236 non-null    object
 4   xmin      236 non-null    int64 
 5   xmax      236 non-null    int64 
 6   ymin      236 non-null    int64 
 7   ymax      236 non-null    int64 
dtypes: int64(6), object(2)
memory usage: 14.9+ KB


In [17]:
# Calculate the center x and center y coordinates of each object (YOLO format)
df['center_x'] = ((df['xmax'] + df['xmin']) / 2) / df['width']
df['center_y'] = ((df['ymax'] + df['ymin']) / 2) / df['height']

# Calculate the normalized width and height of the bounding box (YOLO format)
df['w'] = (df['xmax'] - df['xmin']) / df['width']
df['h'] = (df['ymax'] - df['ymin']) / df['height']

In [18]:
# Display the first few rows of the updated DataFrame with new columns
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,thyrocare_0_122.jpg,939,631,Test Name,1,329,399,463,0.175719,0.683043,0.349308,0.101426
1,thyrocare_0_122.jpg,939,631,Value,503,554,394,453,0.562833,0.671157,0.054313,0.093502
2,thyrocare_0_122.jpg,939,631,Units,607,671,392,450,0.680511,0.667195,0.068158,0.091918
3,thyrocare_0_122.jpg,939,631,Reference Range,672,736,392,450,0.749734,0.667195,0.068158,0.091918
4,thyrocare_0_1374.jpg,1068,635,Test Name,51,420,346,595,0.220506,0.740945,0.345506,0.392126


### Split data into train and test

In [19]:
# Get the list of unique image filenames in the dataset
images = df['filename'].unique()

In [20]:
# Check the number of unique images
len(images)

59

In [21]:
# Split the images into 80% for training and 20% for testing
images_df = pd.DataFrame(images, columns=['filename'])

# Randomly sample 80% of images for training
images_train = tuple(images_df.sample(frac=0.8)['filename'])

In [22]:
# Use the remaining 20% for testing
images_test = tuple(images_df.query(f'filename not in {images_train}')['filename'])

In [23]:
# Check the number of images in the training and testing sets
len(images_train), len(images_test)

(47, 12)

In [24]:
# Create separate DataFrames for training and testing data
train_df = df.query(f'filename in {images_train}')
test_df = df.query(f'filename in {images_test}')

In [25]:
# Display the first few rows of the training and testing DataFrames
train_df.head()
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
20,thyrocare_0_2300.jpg,1068,635,Test Name,93,431,348,606,0.245318,0.751181,0.316479,0.406299
21,thyrocare_0_2300.jpg,1068,635,Value,672,727,347,599,0.654963,0.744882,0.051498,0.39685
22,thyrocare_0_2300.jpg,1068,635,Units,779,850,348,598,0.76264,0.744882,0.066479,0.393701
23,thyrocare_0_2300.jpg,1068,635,Reference Range,877,955,349,596,0.857678,0.744094,0.073034,0.388976
48,thyrocare_0_3439.jpg,939,631,Test Name,68,351,399,464,0.22311,0.683835,0.301384,0.103011


### Assign ID number to object names

In [26]:
# Function to encode object class names into numeric IDs (required for training)
def label_encoding(x):
    labels = {'Test Name': 0, 'Value': 1, 'Units': 2, 'Reference Range': 3}
    return labels[x]

In [27]:
# Apply label encoding to the training and testing sets
train_df['ID'] = train_df['name'].apply(label_encoding)
test_df['ID'] = test_df['name'].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['ID'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['ID'] = test_df['name'].apply(label_encoding)


In [28]:
# Check the updated training DataFrame with label IDs
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,ID
0,thyrocare_0_122.jpg,939,631,Test Name,1,329,399,463,0.175719,0.683043,0.349308,0.101426,0
1,thyrocare_0_122.jpg,939,631,Value,503,554,394,453,0.562833,0.671157,0.054313,0.093502,1
2,thyrocare_0_122.jpg,939,631,Units,607,671,392,450,0.680511,0.667195,0.068158,0.091918,2
3,thyrocare_0_122.jpg,939,631,Reference Range,672,736,392,450,0.749734,0.667195,0.068158,0.091918,3
4,thyrocare_0_1374.jpg,1068,635,Test Name,51,420,346,595,0.220506,0.740945,0.345506,0.392126,0


### Save images and label in text

In [29]:
# Create directories for storing training and testing images and labels
train_folder = 'datasets/images/train'
test_folder = 'datasets/images/test'

os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

In [30]:
# Specify columns to group by for creating individual label files for each image
columns = ['filename', 'ID', 'center_x', 'center_y', 'w', 'h']

# Group data by filename (for both training and testing datasets)
groupby_object_train = train_df[columns].groupby('filename')
groupby_object_test = test_df[columns].groupby('filename')

In [31]:
# Function to move images and save their corresponding labels in YOLO format
def save_data(filename, folder_path, group_object):
    # Move the image file to the destination folder
    source = os.path.join('datasets/images', filename)
    destination = os.path.join(folder_path, filename)
    move(source, destination)  # Move image to the train/test folder

    # Save the label data in a text file with the same name as the image
    text_filename = os.path.join(folder_path, os.path.splitext(filename)[0] + '.txt')
    
    # Save the grouped data in YOLO format (no header, space-separated)
    group_object.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)

In [32]:
# Apply the save_data function to each training image
filename_series = pd.Series(groupby_object_train.groups.keys())
filename_series.apply(save_data, args=(train_folder, groupby_object_train))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
40    None
41    None
42    None
43    None
44    None
45    None
46    None
dtype: object

In [33]:
# Apply the save_data function to each testing image
filename_series = pd.Series(groupby_object_test.groups.keys())
filename_series.apply(save_data, args=(test_folder, groupby_object_test))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
dtype: object