# Import all relevant libraries


In [None]:
import numpy as np
import os
from sklearn.model_selection import train_test_split
import shutil

from google.colab import drive

# Attach Google Drive to the Colab runtime under /content/drive.
# force_remount=True requests a fresh mount even if one is already present.
drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


# Create the folder layout for labelled and unlabelled data

In [None]:
# Root folder holding the images to sort. The commented-out line is an
# alternative dataset location; swap the two assignments to switch.
#folder = '/content/drive/MyDrive/Sodankyla_database-20211027T162911Z-002/Sodankyla_database/DP_good_days'
folder = '/content/drive/MyDrive/Sodankyla_database-20211027T162911Z-002/Sodankyla_database/Testing_Auto-Cropping_and_Cutouts/Auto-cropped_Data/optimal_crops'
labelled_images   = os.path.join(folder, "labelled")
unlabelled_images = os.path.join(folder, "unlabelled")

# Create both sub-folders. Attempting the mkdir and tolerating "already exists"
# avoids listing the whole image folder twice just to look for two names.
# A missing `folder` still raises FileNotFoundError, as before.
for sub_folder in (labelled_images, unlabelled_images):
    try:
        os.mkdir(sub_folder)
    except FileExistsError:
        pass  # left over from an earlier run

# Go through the images and sort them into the labelled and unlabelled folders. Labelled images are also copied into the folder of their class.


In [None]:
#################################################
#  Sorts the files found directly inside `folder`:
#    * a file whose name carries a class tag (the second "_"-separated token
#      of the part just before the extension) is copied to
#      labelled/classes/<class>/
#    * every other file is copied to unlabelled/
#  Files already present at their destination are counted but not re-copied.
#################################################


log = False  # set to True for per-image debug output
nlabelled_imgs   = 0
nunlabelled_imgs = 0

classes = ['Arcs',
           'Breakup',
           'Colored',
           'Discrete',
           'Edge',
           'Faint',
           'Patchy']

# One sub-folder per class under labelled/classes; exist_ok makes re-runs safe.
class_dir = os.path.join(labelled_images, 'classes')
for spec_class in classes:
    os.makedirs(os.path.join(class_dir, spec_class), exist_ok=True)


def _copy_once(source, destination, label):
    """Copy `source` to `destination` unless `destination` already exists.

    `label` is "Labelled" or "Unlabelled" and is only used in the messages.
    Returns True when the file is present at `destination` afterwards (newly
    copied or already there) and False when the copy failed.
    """
    # Check the real destination path instead of listing a directory per image.
    if os.path.exists(destination):
        if log:print(label + " image already in folder")
        return True
    try:
        shutil.copy(source, destination)
    except OSError as err:
        # Always report a failed copy; otherwise the totals silently come up short.
        print("Something went wrong with that " + label[0] + " image:      " + source + " (" + str(err) + ")")
        return False
    return True


# Only regular files are candidates: the 'labelled' / 'unlabelled' sub-folders
# live in the same directory and must not be counted or copied. desktop.ini is
# skipped as well, since it is not an image.
with os.scandir(folder) as entries:
    imgs = sorted(entry.name for entry in entries
                  if entry.is_file() and entry.name != 'desktop.ini')

# iterating over each image and copying it into its target folder
current_image_no = 0
total_image_no   = len(imgs)
for img in imgs:
    current_image_no += 1

    # Dot-separated component just before the extension (the whole name when
    # there is no dot), then its "_"-separated tokens.
    name_parts = img.split('.')
    img_dir_comps = name_parts[-2] if len(name_parts) > 1 else name_parts[0]
    name_tokens = img_dir_comps.split('_')
    source = os.path.join(folder, img)

    if len(name_tokens) > 1:
        # labelled image: the second token names the class
        spec_class = name_tokens[1]
        if log:print(img)
        if log:print(spec_class)

        if spec_class in classes:
            destination = os.path.join(class_dir, spec_class, img)
            if log:print(destination + "\n")
            if _copy_once(source, destination, "Labelled"):
                nlabelled_imgs += 1
        else:
            # No folder exists for this token, so the copy could never succeed.
            print("Unknown class '" + spec_class + "' in file name, skipped:      " + source)
    else:
        # unlabelled image: goes straight into the 'unlabelled' folder
        destination = os.path.join(unlabelled_images, img)
        if _copy_once(source, destination, "Unlabelled"):
            nunlabelled_imgs += 1

    print(str(round(100*(current_image_no / total_image_no),3)) + "%")
    print(str(current_image_no) + "/" + str(total_image_no) + "\n")

print("Total labelled images:    " + str( nlabelled_imgs ) )
print("Total unlabelled images:  " + str( nunlabelled_imgs ) )
print("Total images:             " + str( nlabelled_imgs + nunlabelled_imgs ) )

1.351%
1/74

2.703%
2/74

4.054%
3/74

5.405%
4/74

6.757%
5/74

8.108%
6/74

9.459%
7/74

10.811%
8/74

12.162%
9/74

13.514%
10/74

14.865%
11/74

16.216%
12/74

17.568%
13/74

18.919%
14/74

20.27%
15/74

21.622%
16/74

22.973%
17/74

24.324%
18/74

25.676%
19/74

27.027%
20/74

28.378%
21/74

29.73%
22/74

31.081%
23/74

32.432%
24/74

33.784%
25/74

35.135%
26/74

36.486%
27/74

37.838%
28/74

39.189%
29/74

40.541%
30/74

41.892%
31/74

43.243%
32/74

44.595%
33/74

45.946%
34/74

47.297%
35/74

48.649%
36/74

50.0%
37/74

51.351%
38/74

52.703%
39/74

54.054%
40/74

55.405%
41/74

56.757%
42/74

58.108%
43/74

59.459%
44/74

60.811%
45/74

62.162%
46/74

63.514%
47/74

64.865%
48/74

66.216%
49/74

67.568%
50/74

68.919%
51/74

70.27%
52/74

71.622%
53/74

72.973%
54/74

74.324%
55/74

75.676%
56/74

77.027%
57/74

78.378%
58/74

79.73%
59/74

81.081%
60/74

82.432%
61/74

83.784%
62/74

85.135%
63/74

86.486%
64/74

87.838%
65/74

89.189%
66/74

90.541%
67/74

91.892%
68/74

93

# Create a variable for the labelled folder, then split each class into training and testing sets

In [None]:
# Path of the "labelled" sub-folder inside the image folder.
folder_labelled = f"{folder}/labelled"

In [None]:
#############################################
# Iterates over each class folder, splits the images in the folder into train
# and test sets 80:20 and copies them into the train and test set folders
#############################################

train_set_dir = os.path.join(folder_labelled, 'train_set')
test_set_dir  = os.path.join(folder_labelled, 'test_set')
os.makedirs(train_set_dir, exist_ok=True)
os.makedirs(test_set_dir, exist_ok=True)

# iterating over each class folder
for clas in classes:
    clas_dir = os.path.join(folder_labelled, 'classes', clas)

    # Drop desktop.ini if present, and sort the names: os.listdir returns them
    # in arbitrary order, so without sorting the same random_state could still
    # yield a different split from one run to the next.
    img_files = sorted(name for name in os.listdir(clas_dir) if name != 'desktop.ini')

    if len(img_files) < 2:
        # train_test_split raises ValueError when the train set would come out
        # empty, which is the case for 0 or 1 images at test_size=0.2.
        # Keep what little there is for training.
        print("Class '" + clas + "' has " + str(len(img_files)) + " image(s); too few to split, using all for training")
        train, test = img_files, []
    else:
        train, test = train_test_split(img_files, test_size = 0.2, random_state = 42) # splits the set

    # copying training images, then test images
    for subset_dir, subset in ((train_set_dir, train), (test_set_dir, test)):
        for img_file in subset:
            source = os.path.join(clas_dir, img_file)
            destination = os.path.join(subset_dir, img_file)
            shutil.copy(source, destination)

ValueError: ignored