Copy the required images from the parent folder that contains all images to another folder, keeping the same subfolder names

In [1]:
import os
import shutil
import pandas as pd

# use rl_env environment to run this script

# The following code copies the images from the original_folder folder to a new folder under the ID of the image
# This code helps only take the files that are needed for the defect dataset

# ============================================================================================================
# IMPORTANT: Do not run this code more than once as it will overwrite the images in the new folder
# ============================================================================================================

# Root folder whose sub-folders may each contain an 'original_folder' directory.
src_path = r"folder_containing_images"
# Root folder that receives one sub-folder per source sub-folder that had files to copy.
dst_path = r"destination_folder"

# Only files whose names end with one of these suffixes are copied.
wanted_suffixes = ('Normals X.png', 'Normals Y.png', 'Normals Z.png', 'Curvature.png', '.json')

# list the entries of src_path and count them
subfolders = os.listdir(src_path)
subfolders_count = len(subfolders)

# entries of src_path that have no 'original_folder' directory inside them
subfolders_without_original_folder = []

for subfolder in subfolders:
    original_dir = os.path.join(src_path, subfolder, 'original_folder')

    # isdir (rather than exists) so that a stray file named 'original_folder'
    # is reported as missing instead of crashing os.listdir
    if not os.path.isdir(original_dir):
        subfolders_without_original_folder.append(subfolder)
        continue

    wanted_files = [name for name in os.listdir(original_dir) if name.endswith(wanted_suffixes)]
    if not wanted_files:
        # nothing to copy: do not create an empty destination sub-folder
        continue

    # create the destination sub-folder once per source sub-folder
    target_dir = os.path.join(dst_path, subfolder)
    os.makedirs(target_dir, exist_ok=True)
    for name in wanted_files:
        shutil.copy(os.path.join(original_dir, name), target_dir)

# save the list of entries without an 'original_folder' directory to a text file
with open('subfolders_missing_original_folder.txt', 'w') as f:
    for item in subfolders_without_original_folder:
        f.write("%s\n" % item)

# display completed message
print('Completed')

Completed


Find the files whose names match between the original images and the cropped images, and store the file paths of each matched pair in a csv file

In [14]:
# write a code to count matching file names in two directories

import os
import shutil
import glob
import pandas as pd

# directory 1: contains the class folders with images - change the path as needed
path1 = r"holdout_test"

# directory 2: contains the blade folders with the defect images
path2 = "dataset"

# sub-directories of each root
subdir1 = os.listdir(path1)
subdir2 = os.listdir(path2)

# Index path2 once: file name -> every path under path2 that carries that name.
# The original re-listed every path2 sub-directory for each file in path1.
path2_paths_by_name = {}
for blade_dir in subdir2:
    blade_path = os.path.join(path2, blade_dir)
    for name in os.listdir(blade_path):
        path2_paths_by_name.setdefault(name, []).append(os.path.join(blade_path, name))

# Collect one record per (path1 file, path2 file) pair that shares a file name.
# Row order matches the original nested loops: path1 folder, path1 file, then path2 order.
records = []
for class_dir in subdir1:
    class_path = os.path.join(path1, class_dir)
    for name in os.listdir(class_path):
        for matched_path in path2_paths_by_name.get(name, []):
            records.append({
                'path1': os.path.join(class_path, name),
                'path2': matched_path,
                'class': class_dir,
            })

# build the dataframe once instead of concatenating row by row
df = pd.DataFrame(records, columns=['path1', 'path2', 'class'])

# save the dataframe as a csv file (written once, after all folders are processed)
os.makedirs("csv_data", exist_ok=True)
df.to_csv("csv_data/testMatchedFiles.csv", index=False)


Get the number of matched files between two folders

In [5]:

# load the csv file
import pandas as pd
import cv2
import matplotlib.pyplot as plt

# matched-files table, kept in `df`
df = pd.read_csv('csv_data/trainMatchedFiles.csv')
# df = pd.read_csv('matched_filepaths.csv')
# number of rows in the table
print('Number of files: ', df.shape[0])

# second matched-files table, kept in `df2`
df2 = pd.read_csv('csv_data/matched_filepaths.csv')

# number of rows in the second table
print('Number of files: ', df2.shape[0])


Number of files:  2063
Number of files:  2063


Display a sample of the matched files

In [None]:

# Row of the matched-files table (`df`, loaded in an earlier cell) to display.
index = 500

# Load the two images of the matched pair: img1 from the 'path1' column, img2 from 'path2'.
path_1 = df['path1'][index]
path_2 = df['path2'][index]
img1 = cv2.imread(path_1)
img2 = cv2.imread(path_2)

# cv2.imread returns None instead of raising when a file cannot be read
if img1 is None:
    raise FileNotFoundError(f"Could not read image: {path_1}")
if img2 is None:
    raise FileNotFoundError(f"Could not read image: {path_2}")

# shrink both images to 30% of their original size
img1 = cv2.resize(img1, (0, 0), None, .3, .3)
img2 = cv2.resize(img2, (0, 0), None, .3, .3)

# OpenCV stores channels as BGR while matplotlib expects RGB
img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)

# display the images side by side as subplots
fig = plt.figure(figsize=(10, 5))
ax1 = fig.add_subplot(121)
ax1.imshow(img1)
ax2 = fig.add_subplot(122)
ax2.imshow(img2)
plt.show()


Get path of image files in a csv file

In [42]:
import os
import pandas as pd

def extract_paths(parent_folder_path, output_csv='csv_data/original_filepaths.csv'):
    """Write a CSV listing the selected files found in each ``DefectsData`` folder.

    Every entry of ``parent_folder_path`` that contains a ``DefectsData``
    directory is scanned. Files whose names end with ``A.png``, ``B.png``,
    ``C.png``, ``D.png`` or ``.json`` are recorded.

    Parameters
    ----------
    parent_folder_path : str
        Folder whose sub-folders are scanned.
    output_csv : str, optional
        Destination CSV path. The default is the location the notebook has
        always written to. The parent directory is created when missing and an
        existing file is overwritten.

    Returns
    -------
    None
        The result is written to ``output_csv`` with the columns
        ``Subfolder``, ``filename`` and ``filepath``.
    """
    wanted_suffixes = ("A.png", "B.png", "C.png", "D.png", ".json")
    columns = ['Subfolder', 'filename', 'filepath']

    records = []
    for subfolder in os.listdir(parent_folder_path):
        defect_folder_path = os.path.join(parent_folder_path, subfolder, "DefectsData")

        # skips plain files in the parent folder and sub-folders without DefectsData
        if not os.path.isdir(defect_folder_path):
            continue

        for file in os.listdir(defect_folder_path):
            if file.endswith(wanted_suffixes):
                records.append({
                    'Subfolder': subfolder,
                    'filename': file,
                    'filepath': os.path.join(defect_folder_path, file),
                })

    # build the frame once; `columns` keeps the header even when nothing matched
    df = pd.DataFrame(records, columns=columns)

    # to_csv overwrites an existing file, so no explicit delete is needed. The
    # previous delete step tested for 'filepaths.csv' but removed
    # 'original_filepaths.csv', which raised FileNotFoundError.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_csv, index=False)

In [43]:
parent_folder_path = r"original_folder" # this is the path to the folder containing the images

# use the function extract_paths to collect the paths of the selected files into a csv file
# the csv is written to csv_data/original_filepaths.csv, relative to the current working directory
extract_paths(parent_folder_path)


[]

In [2]:
folder_path = r"training"
# create a function that collects the file names
# the files live in the subfolders of folder_path
# each row of the resulting dataframe holds the subfolder name (used as the label), the file name and the file path
import os
import pandas as pd

def get_filenames(folder_path, output_csv='csv_data/label_filepaths.csv'):
    """Write a CSV with one row per file found in the sub-folders of ``folder_path``.

    The sub-folder name is stored in the ``label`` column. Files named
    ``Thumbs.db`` are skipped, as are entries of ``folder_path`` that are not
    directories.

    Parameters
    ----------
    folder_path : str
        Folder whose sub-folders are scanned.
    output_csv : str, optional
        Destination CSV path. The default is the location the notebook has
        always written to. The parent directory is created when missing and an
        existing file is overwritten.

    Returns
    -------
    None
        The result is written to ``output_csv`` with the columns ``label``,
        ``filename`` and ``filepath``.
    """
    # The frame used to be declared with a 'subfolder' column while rows were
    # added under 'label', which left an always-empty 'subfolder' column in the CSV.
    columns = ['label', 'filename', 'filepath']

    records = []
    for subfolder in os.listdir(folder_path):
        subfolder_path = os.path.join(folder_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue
        for file in os.listdir(subfolder_path):
            if file == 'Thumbs.db':
                continue
            records.append({
                'label': subfolder,
                'filename': file,
                'filepath': os.path.join(subfolder_path, file),
            })

    # build the frame once instead of concatenating row by row
    df = pd.DataFrame(records, columns=columns)

    # to_csv overwrites an existing file; the previous delete step targeted
    # 'label_filenames.csv', which is not the file written here.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_csv, index=False)


# call the function
get_filenames(folder_path)

Look if file names match between the original files and the labeled files that are in two different csv files

In [4]:
# load csv file

import pandas as pd

# load the two csv files
df_original = pd.read_csv('csv_data/original_filepaths.csv')
df_labeled = pd.read_csv('csv_data/label_filepaths.csv')

# For every labeled file whose file name also appears in df_original, keep the
# original file path, the labeled file path and the label.

# file name -> first file path with that name in df_original
# (the same "first match wins" rule as the previous row-by-row lookup)
first_original_path = {}
for name, path in zip(df_original['filename'], df_original['filepath']):
    first_original_path.setdefault(name, path)

# one record per labeled row that has a match, in df_labeled order
matched_records = [
    {
        'original_filepath': first_original_path[name],
        'label_filepath': label_path,
        'label': label,
    }
    for name, label_path, label in zip(
        df_labeled['filename'], df_labeled['filepath'], df_labeled['label']
    )
    if name in first_original_path
]

# build the dataframe once instead of concatenating inside the loop
matched_df = pd.DataFrame(matched_records, columns=['original_filepath', 'label_filepath', 'label'])

# save the new dataframe to a csv file
matched_df.to_csv('csv_data/matched_filepaths.csv', index=False)


Display matched original file with its cropped image

In [None]:
import cv2
import pandas as pd
import matplotlib.pyplot as plt
import os
# Show one original image next to its matched labeled image.

# load the matched dataframe
matched_df = pd.read_csv('csv_data/matched_filepaths.csv')

# row of the table to display
sample_index = 150
orig_file_path = matched_df['original_filepath'][sample_index]
label_file_path = matched_df['label_filepath'][sample_index]

# Check if the file paths are valid
if not os.path.isfile(orig_file_path):
    print(f"Original file path does not exist: {orig_file_path}")
if not os.path.isfile(label_file_path):
    print(f"Label file path does not exist: {label_file_path}")

# Load the images (cv2.imread returns None when a file cannot be read)
img_orig = cv2.imread(orig_file_path)
img_crop = cv2.imread(label_file_path)

# Check if the images are loaded correctly
if img_orig is None:
    print(f"Failed to load original image from: {orig_file_path}")
if img_crop is None:
    print(f"Failed to load cropped image from: {label_file_path}")

# If both images are loaded correctly, display them
if img_orig is not None and img_crop is not None:
    # OpenCV stores channels as BGR while matplotlib expects RGB
    plt.figure(figsize=(10,10))
    plt.subplot(1,2,1)
    plt.imshow(cv2.cvtColor(img_orig, cv2.COLOR_BGR2RGB))
    plt.subplot(1,2,2)
    plt.imshow(cv2.cvtColor(img_crop, cv2.COLOR_BGR2RGB))
else:
    print("Cannot display images due to loading errors.")

Get length of data and unique values

In [None]:
import pandas as pd
# read matched_filepaths.csv into a dataframe
df = pd.read_csv('csv_data/matched_filepaths.csv')

# split the rows by their label
grouped = df.groupby('label')

# names of the label groups
labels = grouped.groups.keys()

# pick the label at position 2 of the group keys
# (raises IndexError when there are fewer than three labels)
label = [*labels][2]

# rows that carry that label
group = grouped.get_group(label)

# NOTE(review): this is the row count of the whole table, not of `group` -
# confirm which of the two was intended
length = df.shape[0]

print(length)

# non-null count of every column
count = df.count()

# distinct values of the original_filepath column
unique = df['original_filepath'].unique()
print(unique)


In [17]:
# command to clear the Jupyter workspace (removes all user-defined names from the kernel)
%reset -f

Get the x,y, width and height of the bounding box of the object as int values and save them as csv file

In [6]:
# parse the template-matching results into integer bounding-box columns (x, y, w, h)
import os
import random
import csv
import pandas as pd

# read the template-matching results table
df = pd.read_csv('TemplateMatchingResultsTable.csv')

# MatchLoc holds "x,y" and MatchSize holds "w,h": split each into two columns
match_location = df['MatchLoc'].str.split(',', expand=True)
match_size = df['MatchSize'].str.split(',', expand=True)
df[['x', 'y']] = match_location
df[['w', 'h']] = match_size

# the combined columns are no longer needed
df = df.drop(columns=['MatchLoc', 'MatchSize'])

# keep only these columns, with original_filepath first, and
# convert the four bounding-box columns from text to integers
df = df[['original_filepath', 'label', 'x', 'y', 'w', 'h']].astype(
    {'x': int, 'y': int, 'w': int, 'h': int}
)

# NOTE(review): later cells read 'csv_data/bboxvaluesInt.csv' as well as
# 'bboxvaluesInt.csv' - confirm which location is the intended one
df.to_csv('bboxvaluesInt.csv', index=False)

Display sample of images of the original image and its cropped image with the bounding box

In [5]:
# display image with bounding box using PIL
import os
from PIL import Image, ImageDraw
import numpy as np
import random
import csv
import pandas as pd

# load the integer bounding-box table
df = pd.read_csv('csv_data/bboxvaluesInt.csv')

# row of the table to preview
index = 100
# path to the image and its label
path = df['original_filepath'][index]
label = df['label'][index]
file_name = os.path.basename(path)

# read the image with PIL; the context manager closes the file handle afterwards
with Image.open(path) as image_org:
    # get the image size
    width_org, height_org = image_org.size

    # Resize to exactly 640x640 (this does not keep the aspect ratio).
    # Passing the target size directly avoids int(width * (640 / width)),
    # which can truncate to 639 through floating-point rounding.
    image = image_org.resize((640, 640))

# scale factors that map original pixel coordinates onto the 640x640 image
w_ratio = 640 / width_org
h_ratio = 640 / height_org

# convert the image to RGB so a coloured outline can be drawn
image = image.convert('RGB')

draw = ImageDraw.Draw(image)

x = int(df['x'][index])
y = int(df['y'][index])
w = int(df['w'][index])
h = int(df['h'][index])

# multiply the bounding box coordinates by the ratio
x = int(x * w_ratio)
y = int(y * h_ratio)
w = int(w * w_ratio)
h = int(h * h_ratio)

# draw the box (colour channels are 8-bit, so the maximum value is 255)
draw.rectangle([x, y, x+w, y+h], outline=(0,255,126), width=2)

# show the image
image.show()

Resize the image and its bbox to 640x640 and select one class for saving data for obj detection using YOLO v8

In [3]:
#  resize the images
import os
from PIL import Image, ImageDraw
import numpy as np
import random
import csv
import pandas as pd

# Resize the original images of labels A, B and C to 640x640 and save them
# under save_path in a single 'SecondaryGrain' folder.
save_path = 'data/train/'

# load csv file
df_org = pd.read_csv('csv_data/bboxvaluesInt.csv')

# categorize df by labels
df_org = df_org.groupby('label')

# get the group names
group_names = df_org.groups.keys()

# create a new data frame containing the A,B,C labels - only apply this for HAB, MAB and LAB - comment otherwise
df = pd.concat([df_org.get_group('A'), df_org.get_group('B'), df_org.get_group('C')], ignore_index=True)

# all three labels are stored under one folder name
label = 'SecondaryGrain' # Only apply this for MAB, HAB and LAB - comment otherwise

# create the output folder once, including missing parent folders
output_dir = os.path.join(save_path, label)
os.makedirs(output_dir, exist_ok=True)

# A path that appears in several rows produces the same output file, so each
# distinct path is processed once.
for path in df['original_filepath'].drop_duplicates():
    file_name = os.path.basename(path)

    # the context manager closes the source file handle after the resize
    with Image.open(path) as image_org:
        # Resize to exactly 640x640 (this does not keep the aspect ratio).
        # Passing the size directly avoids int(width * (640 / width)), which
        # can truncate to 639 through floating-point rounding.
        image = image_org.resize((640, 640))

    # save the resized image to the label folder
    image.save(os.path.join(output_dir, file_name))

The code below is the same as the previous with the difference that it also saves the images and the csv file with the bbox values

In [None]:
# resize the images and their bbox coordinates
import os
from PIL import Image, ImageDraw
import numpy as np
import random
import csv

# this cell's import list omits pandas, so it only ran when an earlier cell had imported it
import pandas as pd

# load csv file
df_org = pd.read_csv('bboxvaluesInt.csv')

# categorize df by labels
df_org = df_org.groupby('label')

# get the group names
group_names = df_org.groups.keys()

# rows of the 'Scale' label; copied because the rows are modified below
df = df_org.get_group('Scale').copy()

# iterate through the rows of the DataFrame
for index, row in df.iterrows():
    # path to the image and its label
    path = row['original_filepath']
    label = row['label']
    file_name = os.path.basename(path)

    # read the image with PIL; the context manager closes the file handle
    with Image.open(path) as image_org:
        width_org, height_org = image_org.size
        # Resize to exactly 640x640 (this does not keep the aspect ratio).
        # Passing the size directly avoids int(width * (640 / width)), which
        # can truncate to 639 through floating-point rounding.
        image = image_org.resize((640, 640))

    # scale factors that map original pixel coordinates onto the 640x640 image
    w_ratio = 640 / width_org
    h_ratio = 640 / height_org

    # create data/<label> when missing, including the parent folder
    output_dir = os.path.join('data', label)
    os.makedirs(output_dir, exist_ok=True)

    # save the resized image to the data folder under the label folder
    image.save(os.path.join(output_dir, file_name))

    # The rectangle that used to be drawn here was never saved or shown, so it
    # has been dropped; only the scaled coordinates are kept.
    x = int(int(row['x']) * w_ratio)
    y = int(int(row['y']) * h_ratio)
    w = int(int(row['w']) * w_ratio)
    h = int(int(row['h']) * h_ratio)

    # update the DataFrame with the bare file name and the scaled coordinates
    df.loc[index,'original_filepath'] = file_name
    df.loc[index,'x'] = x
    df.loc[index,'y'] = y
    df.loc[index,'w'] = w
    df.loc[index,'h'] = h

# save the updated DataFrame once, after all rows are processed
# (to_csv overwrites an existing file)
df.to_csv('resizedData.csv', index=False)


normalize the x, y and w, h values to be between 0 and 1 - to prepare data for Yolo v8

In [21]:
import pandas as pd
import numpy as np
import json
import PIL
from PIL import Image

# read the resized bounding-box table
df = pd.read_csv('resizedData.csv')

# centre point of each box, in pixels
df['x_center'] = df['x'] + (df['w'] / 2)
df['y_center'] = df['y'] + (df['h'] / 2)

# Columns for the normalized width and height. They start as float copies of
# w and h because fractions are written into them below; writing floats into
# an integer column triggers a pandas dtype warning.
df['norm_w'] = df['w'].astype(float)
df['norm_h'] = df['h'].astype(float)

for i in range(len(df)):
    # Read the image size with PIL; the context manager closes the file handle.
    # NOTE(review): the folder is fixed to 'data/A/' although the resize cell
    # saves under data/<label> - confirm this matches the label being processed
    with Image.open('data/A/'+df['original_filepath'][i]) as img:
        width, height = img.size

    # normalize the center and width and height by the image size
    df.loc[i,'x_center'] = df.loc[i,'x_center'] / width
    df.loc[i,'y_center'] = df.loc[i,'y_center'] / height
    df.loc[i,'norm_w'] = df.loc[i,'norm_w'] / width
    df.loc[i,'norm_h'] = df.loc[i,'norm_h'] / height

# save the new csv file
df.to_csv('anotationdata.csv', index=False)

In [None]:
# command to clear the Jupyter workspace (removes all user-defined names from the kernel)
%reset -f

Save annotation data to txt files of each image - to train with Yolo

In [30]:
# load a csv file

import pandas as pd

# the cell above clears the namespace and this cell's import list only has pandas
import os

# read the csv file
df = pd.read_csv('anotationdata.csv')

# the txt files are saved inside data/anotations; create the folder when missing
annotation_dir = 'data/anotations'
os.makedirs(annotation_dir, exist_ok=True)

# iterate through the rows of the csv file
for index, row in df.iterrows():

    # get the file name, x_center, y_center, norm_width, norm_height
    file_name = row['original_filepath']
    x_center = row['x_center']
    y_center = row['y_center']
    norm_width = row['norm_w']
    norm_height = row['norm_h']

    # strip the extension with splitext so that extensions which are not
    # exactly three characters long (e.g. '.jpeg') are handled as well
    txt_name = os.path.splitext(file_name)[0] + '.txt'

    # one line per file: class id 0 followed by the four normalized values
    with open(os.path.join(annotation_dir, txt_name), 'w') as f:
        f.write(str(0) + ' ' + str(x_center) + ' ' + str(y_center) + ' ' + str(norm_width) + ' ' + str(norm_height))


Scale the bounding box values to the new image size - 640x640

In [51]:
import pandas as pd
import os

# Read the CSV file. The path is relative to the working directory, like the
# other cells that load this table, instead of an absolute path on one machine.
df = pd.read_csv(os.path.join('csv_data', 'TemplateMatchingResultsTable.csv'))

# MatchLoc holds "x,y" and MatchSize holds "w,h": split each into two columns
df[['x', 'y']] = df['MatchLoc'].str.split(',', expand=True)
df[['w', 'h']] = df['MatchSize'].str.split(',', expand=True)

# scale the bounding box coordinates in relation to the image resize from 2752x2200 to 640x640
# (horizontal values use 640/2752, vertical values use 640/2200; results are truncated to int)
df['x'] = (df['x'].astype(int) * (640 / 2752)).astype(int)
df['y'] = (df['y'].astype(int) * (640 / 2200)).astype(int)

df['w'] = (df['w'].astype(int) * (640 / 2752)).astype(int)
df['h'] = (df['h'].astype(int) * (640 / 2200)).astype(int)

# drop the following columns: label_filepath, MatchScore, MatchLoc, MatchSize
df = df.drop(columns=['label_filepath', 'MatchScore', 'MatchLoc', 'MatchSize'])

# keep only the last part of the path for each value in the original_filepath column
df['original_filepath'] = df['original_filepath'].apply(os.path.basename)

In [None]:
df.head()

In [None]:
import cv2
import pandas as pd
import os
import matplotlib.pyplot as plt

# Draw the bounding box of one image from folder_path. Uses `df`, the scaled
# bounding-box table built in an earlier cell.
folder_path = 'data/A_equ'
file_list = os.listdir(folder_path)

# label whose bounding box is drawn
label = 'A'

# Get the index of the image you want to display
index = 9 # index 50 is wrong

listed_name = file_list[index]

# Rows for this file AND this label. regex=False makes the file name match
# literally, so '.', '(' and similar characters are not read as a pattern.
matches = df.loc[
    df['original_filepath'].str.contains(listed_name, regex=False) & (df['label'] == label)
]
print(matches['original_filepath'])
if matches.empty:
    raise LookupError(f"No row for file {listed_name!r} with label {label!r}")

# Take the file name and the box from the same row. Looking the box up by file
# name alone could return the box of another label on the same image.
box = matches.iloc[0]
file_name = box['original_filepath']
file_path = os.path.join(folder_path, file_name)
x, y, w, h = (int(box[key]) for key in ('x', 'y', 'w', 'h'))

# Load the image (cv2.imread returns None when the file cannot be read)
image = cv2.imread(file_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {file_path}")

# Draw the bounding box on the image
cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

# Convert the image from BGR to RGB
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Display the image with the bounding box
plt.imshow(image_rgb)
plt.axis('off')
plt.show()


1 Convert the x & y values of the bbox to center points and normalize them including the width and height of the bbox to be between 0 and 1 
2 Save the data to a txt file for each bbox of each image

In [42]:
import os
import pandas as pd

# read the CSV file (os.path.join keeps the path valid on every platform)
df = pd.read_csv(os.path.join('csv_data', 'TemplateMatchingResultsTable.csv'))

# MatchLoc holds "x,y" and MatchSize holds "w,h": split each into two columns
df[['x', 'y']] = df['MatchLoc'].str.split(',', expand=True)
df[['w', 'h']] = df['MatchSize'].str.split(',', expand=True)

# scale the bounding box coordinates in relation to the image resize from 2752x2200 to 640x640
df['x'] = (df['x'].astype(int) * (640 / 2752)).astype(int)
df['y'] = (df['y'].astype(int) * (640 / 2200)).astype(int)

df['w'] = (df['w'].astype(int) * (640 / 2752)).astype(int)
df['h'] = (df['h'].astype(int) * (640 / 2200)).astype(int)

# drop the following columns: label_filepath, MatchScore, MatchLoc, MatchSize
df = df.drop(columns=['label_filepath', 'MatchScore', 'MatchLoc', 'MatchSize'])

# keep only the last part of the path for each value in the original_filepath column
df['original_filepath'] = df['original_filepath'].apply(os.path.basename)

# size of the resized images
width = 640
height = 640

# normalized centre point, width and height of each bounding box
df['x_center_norm'] = (df['x'] + (df['w'] / 2)) / width
df['y_center_norm'] = (df['y'] + (df['h'] / 2)) / height
df['norm_w'] = df['w'] / width
df['norm_h'] = df['h'] / height

# rename the column original_filepath to file_id
df = df.rename(columns={'original_filepath': 'file_id'})

# add a column to represent the labels as int values starting from zero
df['label'] = df['label'].astype('category')
df['label_id'] = df['label'].cat.codes

# Write one txt file per image, named after file_id, with one line per bounding
# box: label_id x_center_norm y_center_norm norm_w norm_h.
# Each file is written in 'w' mode with all of its boxes at once. Append mode
# duplicated every line whenever this cell was run again.
annotation_dir = os.path.join('csv_data', 'all_data_anotations')
os.makedirs(annotation_dir, exist_ok=True)

for file_id, boxes in df.groupby('file_id', sort=False):
    lines = [
        str(label_id) + ' ' + str(x_center_norm) + ' ' + str(y_center_norm) + ' ' + str(norm_w) + ' ' + str(norm_h) + '\n'
        for label_id, x_center_norm, y_center_norm, norm_w, norm_h in zip(
            boxes['label_id'], boxes['x_center_norm'], boxes['y_center_norm'], boxes['norm_w'], boxes['norm_h']
        )
    ]
    # splitext handles extensions that are not exactly three characters long
    txt_name = os.path.splitext(file_id)[0] + '.txt'
    with open(os.path.join(annotation_dir, txt_name), 'w') as f:
        f.writelines(lines)

     

Check for txt files with more than one bbox information

In [43]:
# search for a txt file in csv_data/all_data_anotations folder and open it
import os

# build the annotation file path: swap the last four characters of the image
# name for '.txt' and look inside csv_data/all_data_anotations
image_name = "test.png"
path = os.path.join('csv_data/all_data_anotations', image_name[:-4] + '.txt')

# read every line of the annotation file
with open(path) as f:
    lines = f.readlines()

print(lines)

['0 0.46796875 0.4703125 0.1390625 0.61875\n', '8 0.546875 0.3859375 0.034375 0.103125\n']


Script to annotate A, B, C and D with their corresponding bbox values and save them to a txt file

In [2]:
import os
import pandas as pd

# Read the CSV file. The previous path started with a backslash, which points
# at the root of the current drive, not at the csv_data folder next to the notebook.
df = pd.read_csv(os.path.join('csv_data', 'TemplateMatchingResultsTable.csv'))

# Remove the rows that have a label D, E, A, B or C. The result is copied
# because columns are added to it below.
# NOTE(review): the heading of this cell talks about annotating A, B, C and D,
# while the code removes those labels - confirm which is intended
excluded_labels = ['D', 'E', 'A', 'B', 'C']
df = df[~df['label'].isin(excluded_labels)].copy()

# MatchLoc holds "x,y" and MatchSize holds "w,h": split each into two columns
df[['x', 'y']] = df['MatchLoc'].str.split(',', expand=True)
df[['w', 'h']] = df['MatchSize'].str.split(',', expand=True)

# scale the bounding box coordinates in relation to the image resize from 2752x2200 to 640x640
df['x'] = (df['x'].astype(int) * (640 / 2752)).astype(int)
df['y'] = (df['y'].astype(int) * (640 / 2200)).astype(int)

df['w'] = (df['w'].astype(int) * (640 / 2752)).astype(int)
df['h'] = (df['h'].astype(int) * (640 / 2200)).astype(int)

# drop the following columns: label_filepath, MatchScore, MatchLoc, MatchSize
df = df.drop(columns=['label_filepath', 'MatchScore', 'MatchLoc', 'MatchSize'])

# keep only the last part of the path for each value in the original_filepath column
df['original_filepath'] = df['original_filepath'].apply(os.path.basename)

# size of the resized images
width = 640
height = 640

# normalized centre point, width and height of each bounding box
df['x_center_norm'] = (df['x'] + (df['w'] / 2)) / width
df['y_center_norm'] = (df['y'] + (df['h'] / 2)) / height
df['norm_w'] = df['w'] / width
df['norm_h'] = df['h'] / height

# rename the column original_filepath to file_id
df = df.rename(columns={'original_filepath': 'file_id'})

# add a column to represent the remaining labels as int values starting from zero
df['label'] = df['label'].astype('category')
df['label_id'] = df['label'].cat.codes

# Write one txt file per image, named after file_id, with one line per bounding
# box: label_id x_center_norm y_center_norm norm_w norm_h.
# Each file is written in 'w' mode with all of its boxes at once. Append mode
# duplicated every line whenever this cell was run again.
annotation_dir = os.path.join('csv_data', 'FourLabelsAnotations')
os.makedirs(annotation_dir, exist_ok=True)

for file_id, boxes in df.groupby('file_id', sort=False):
    lines = [
        str(label_id) + ' ' + str(x_center_norm) + ' ' + str(y_center_norm) + ' ' + str(norm_w) + ' ' + str(norm_h) + '\n'
        for label_id, x_center_norm, y_center_norm, norm_w, norm_h in zip(
            boxes['label_id'], boxes['x_center_norm'], boxes['y_center_norm'], boxes['norm_w'], boxes['norm_h']
        )
    ]
    # splitext handles extensions that are not exactly three characters long
    txt_name = os.path.splitext(file_id)[0] + '.txt'
    with open(os.path.join(annotation_dir, txt_name), 'w') as f:
        f.writelines(lines)

Get the original test dataset as full images

In [37]:
#  resize the images
import os
from PIL import Image, ImageDraw
import numpy as np
import random
import csv
import pandas as pd

# load csv file
df_org = pd.read_csv('csv_data/testMatchedFiles.csv')

# rename the columns: path1 -> defect_filepath, path2 -> original_filepath, class -> label
df_org = df_org.rename(columns={
    'path1': 'defect_filepath',
    'path2': 'original_filepath',
    'class': 'label',
})

# categorize df by labels
df_org = df_org.groupby('label')

# get the group names
group_names = df_org.groups.keys()

# rows of the 'A' label
df = df_org.get_group('A')

# the images are stored under this folder name, not under their own label
label = 'SecondaryGrain' # Only apply this for MAB, HAB and LAB - comment otherwise

# create the output folder once, including missing parent folders
output_dir = os.path.join('data', 'test', label)
os.makedirs(output_dir, exist_ok=True)

# A path that appears in several rows produces the same output file, so each
# distinct path is processed once.
for path in df['original_filepath'].drop_duplicates():
    file_name = os.path.basename(path)

    # the context manager closes the source file handle after the resize
    with Image.open(path) as image_org:
        # Resize to exactly 640x640 (this does not keep the aspect ratio).
        # Passing the size directly avoids int(width * (640 / width)), which
        # can truncate to 639 through floating-point rounding.
        image = image_org.resize((640, 640))

    # save the resized image to the label folder
    image.save(os.path.join(output_dir, file_name))


In [14]:
# rename the Machine_scar class to MachineScar in the matched-files table
import os
from PIL import Image, ImageDraw
import numpy as np
import random
import csv
import pandas as pd

matched_csv = 'csv_data/testMatchedFiles.csv'

# read the matched-files table and replace every cell that is exactly
# 'Machine_scar' with 'MachineScar'
df = pd.read_csv(matched_csv).replace({'Machine_scar': 'MachineScar'})

# write the table back to the same csv file
df.to_csv(matched_csv, index=False)



Move txt label files from the training dataset to the val dataset according to the val image files

In [None]:
import os
import pandas as pd

# image folders (os.path.join keeps the paths valid on every platform)
img_val_folder = os.path.join("dataset", "images", "val")
img_train_folder = os.path.join("dataset", "images", "train")

# get the list of files in the val folder
img_val_files = os.listdir(img_val_folder)

# get the list of files in the train folder
img_train_files = os.listdir(img_train_folder)

# val images that are also present in the train folder
# (a set makes each membership test constant time)
train_names = set(img_train_files)
files_in_both = [file for file in img_val_files if file in train_names]
counter = len(files_in_both)

for file in files_in_both:
    print(f"File {file} is in both folders")
print(f"Number of files in both folders: {counter}")

# remove the files that are in both folders from the img_train_folder
for file in files_in_both:
    os.remove(os.path.join(img_train_folder, file))


# label folders
label_train_folder = os.path.join("dataset", "labels", "train")
label_val_folder = os.path.join("dataset", "labels", "val")

# move the txt file of every val image from label_train_folder to label_val_folder
for file in img_val_files:
    # splitext handles extensions that are not exactly three characters long
    label_name = os.path.splitext(file)[0] + '.txt'
    source_path = os.path.join(label_train_folder, label_name)
    destination_path = os.path.join(label_val_folder, label_name)

    if os.path.exists(source_path):
        # os.replace moves the file and overwrites an existing destination in one step
        os.replace(source_path, destination_path)
    else:
        print(f"File {file} does not exist in the label_train_folder")
        

        


Apply image equalization to the images

In [59]:
import matplotlib.pyplot as plt
from skimage import data, exposure, img_as_float, io, color, img_as_ubyte
import os

# folder whose images are equalized in place
path = 'data/test/ABC'

files = os.listdir(path)

# NOTE(review): each image is overwritten with its equalized version, so
# running this cell again equalizes the already-equalized files - confirm
# whether the results should go to a separate folder.
for file in files:
    file_path = os.path.join(path, file)
    img = io.imread(file_path)

    # contrast-limited adaptive histogram equalization on the float image
    img_eq = exposure.equalize_adapthist(img_as_float(img))

    # gray2rgb appends a channel axis, so it is applied only to 2-D (grayscale)
    # results. An image that already has a channel axis, e.g. one this cell
    # saved earlier, is kept as it is.
    if img_eq.ndim == 2:
        img_rgb = color.gray2rgb(img_eq)
    else:
        img_rgb = img_eq

    # convert to uint8
    img_rgb = img_as_ubyte(img_rgb)

    # save the image to the same folder, replacing the input file
    io.imsave(file_path, img_rgb)
    

In [50]:
# clear the Jupyter workspace (removes all user-defined names from the kernel)
%reset -f
