In [1]:
import os
import shutil
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
image_folder = 'images'

# One [filename, label] row per PNG in the folder; the label is the part of
# the filename that precedes the first '-'.
data = [
    [filename, filename.split('-')[0]]
    for filename in os.listdir(image_folder)
    if filename.endswith('.png')
]

dataset = pd.DataFrame(data, columns=['filename', 'label'])
dataset

Unnamed: 0,filename,label
0,smp-SM58.png,smp
1,smp-SM239.png,smp
2,dv1-14.png,dv1
3,base6-107.png,base6
4,ex11-46.png,ex11
...,...,...
17184,sm3-123.png,sm3
17185,sv6-78.png,sv6
17186,dp6-96.png,dp6
17187,smp-SM220.png,smp


In [3]:
# Number of images per label, then the labels that have at least 150 images.
label_counts = dataset['label'].value_counts()
has_enough_images = (label_counts >= 150).to_numpy()
labels_to_keep = label_counts.index[has_enough_images]

# Restrict the dataset to rows whose label survived the threshold.
dataset_tiny = dataset.loc[dataset['label'].isin(labels_to_keep)]
dataset_tiny['label'].nunique()

36

In [5]:
# Per-label image counts for the filtered subset (value_counts sorts descending).
dataset_tiny['label'].value_counts()

label
swshp        290
swsh8        284
sv2          279
sm12         272
sv4          266
sm11         261
sv1          258
smp          251
sv4pt5       245
sm8          240
sm10         238
swsh10       216
swsh1        216
xyp          213
swsh2        209
swsh4        203
swsh3        201
sm9          198
sm7          187
swsh9        186
ecard2       182
ecard3       182
sm2          180
sm5          178
sm3          177
sv7          175
sm1          173
swsh5        166
xy8          165
ecard1       165
xy5          164
swsh12pt5    160
sv6          155
pl3          153
bw7          153
sm6          150
Name: count, dtype: int64

In [14]:
# Flat folder of PNG files to organize, and the root folder under which one
# subfolder per class is created.
source_folder = 'images'
destination_folder = 'images_dataset'

In [15]:
os.makedirs(destination_folder, exist_ok=True)

# Relocate every PNG into <destination_folder>/<class>/, where the class is
# the part of the filename before the first '-'.
for filename in os.listdir(source_folder):
    if not filename.endswith('.png'):
        continue  # anything that is not a .png is left in place

    class_name = filename.partition('-')[0]
    class_folder = os.path.join(destination_folder, class_name)
    os.makedirs(class_folder, exist_ok=True)

    # Files are moved, not copied, so source_folder is emptied of PNGs.
    shutil.move(
        os.path.join(source_folder, filename),
        os.path.join(class_folder, filename),
    )

print("Files have been organized by class.")

Files have been organized by class.


In [4]:
source_folder = 'images_dataset'
train_folder = 'images_dataset_train'
test_folder = 'images_dataset_test'

os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

test_size = 0.2  # 20% of the images will go to the test set

# Loop through each class folder. Sorted so classes are processed in a stable order.
for class_name in sorted(os.listdir(source_folder)):
    class_folder = os.path.join(source_folder, class_name)

    if not os.path.isdir(class_folder):  # Only process directories (classes)
        continue

    # os.listdir returns entries in arbitrary order, and the seeded split
    # depends on input order. Sorting makes random_state=42 yield the same
    # train/test assignment on every run and every machine.
    images = sorted(f for f in os.listdir(class_folder) if f.endswith('.png'))

    # Split this class's images into train and test sets
    train_images, test_images = train_test_split(images, test_size=test_size, random_state=42)

    # Create class subfolders in train and test directories
    train_class_folder = os.path.join(train_folder, class_name)
    test_class_folder = os.path.join(test_folder, class_name)
    os.makedirs(train_class_folder, exist_ok=True)
    os.makedirs(test_class_folder, exist_ok=True)

    # Copy (not move) images into the respective train/test class folders;
    # the originals stay in source_folder.
    for image in train_images:
        shutil.copy(os.path.join(class_folder, image), os.path.join(train_class_folder, image))

    for image in test_images:
        shutil.copy(os.path.join(class_folder, image), os.path.join(test_class_folder, image))

print("Train and test sets have been created.")


Train and test sets have been created.


In [6]:
# Hard-coded copy of the 36 labels shown in the value_counts output earlier in
# the notebook (each has at least 150 images).
# NOTE(review): this literal can drift from the labels computed from the data;
# consider deriving it from the computed counts instead.
labels_to_keep = [
    "swshp", "swsh8", "sv2", "sm12", "sv4", "sm11", "sv1", "smp", "sv4pt5", 
    "sm8", "sm10", "swsh10", "swsh1", "xyp", "swsh2", "swsh4", "swsh3", 
    "sm9", "sm7", "swsh9", "ecard2", "ecard3", "sm2", "sm5", "sm3", "sv7", 
    "sm1", "swsh5", "xy8", "ecard1", "xy5", "swsh12pt5", "sv6", "pl3", "bw7", "sm6"
]

In [7]:
def filter_folders(root_folder, labels_to_keep):
    """Delete every subdirectory of ``root_folder`` whose name is not a kept label.

    Plain files directly inside ``root_folder`` are never touched. Each removed
    directory is reported on stdout.

    Args:
        root_folder: Path of the directory whose immediate subdirectories are
            filtered.
        labels_to_keep: Folder names to preserve. May be any iterable of
            strings (list, set, pandas Index or Series, generator); a bare
            string is treated as a single label.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``shutil.rmtree`` (for
            example when ``root_folder`` does not exist).
    """
    # Normalise once to a set of label values. Testing membership on the raw
    # argument is wrong for a pandas Series (``in`` checks its index, not its
    # values) and for one-shot iterables, and is O(n) per lookup for lists.
    if isinstance(labels_to_keep, str):
        keep = {labels_to_keep}
    else:
        keep = set(labels_to_keep)

    for folder_name in os.listdir(root_folder):
        folder_path = os.path.join(root_folder, folder_name)

        if os.path.isdir(folder_path) and folder_name not in keep:
            shutil.rmtree(folder_path)
            print(f"Removed folder: {folder_path}")

train_folder = 'images_dataset_train'
test_folder = 'images_dataset_test'

# Apply the same label filter to the train directory first, then the test directory.
for split_folder in (train_folder, test_folder):
    filter_folders(split_folder, labels_to_keep)

print("Filtered train and test folders to keep only specified labels.")

Removed folder: images_dataset_train/bwp
Removed folder: images_dataset_train/mcd11
Removed folder: images_dataset_train/ex12
Removed folder: images_dataset_train/ex15
Removed folder: images_dataset_train/mcd16
Removed folder: images_dataset_train/sv6pt5
Removed folder: images_dataset_train/bw2
Removed folder: images_dataset_train/bw5
Removed folder: images_dataset_train/swsh35
Removed folder: images_dataset_train/pl2
Removed folder: images_dataset_train/col1
Removed folder: images_dataset_train/bw4
Removed folder: images_dataset_train/mcd19
Removed folder: images_dataset_train/bw3
Removed folder: images_dataset_train/mcd21
Removed folder: images_dataset_train/ex14
Removed folder: images_dataset_train/ex13
Removed folder: images_dataset_train/pl4
Removed folder: images_dataset_train/neo2
Removed folder: images_dataset_train/sv3pt5
Removed folder: images_dataset_train/pop9
Removed folder: images_dataset_train/bp
Removed folder: images_dataset_train/pop7
Removed folder: images_dataset_tr