# IMAGE ANALYSIS
### In this script, after dividing the images into folders, we inspected the data to gather some useful information.


In [None]:
import os

# Shared accumulators, filled in by the following cells:
#   all_sizes       -> the size of every image (one entry per image)
#   different_sizes -> every distinct image size, listed once
#   cardinality     -> how many images there are for each distinct size
all_sizes, different_sizes, cardinality = [], [], []

# Dataset folder, resolved against the current working directory
dataset_dir = os.path.join(os.getcwd(), "MaskDatasetSizes")

# Names of the entries found directly inside the dataset folder
# (expected to be its subdirectories)
dirs = os.listdir(dataset_dir)

# File, opened in append mode, where the paths of the vertical images are saved
f = open("vertical.txt", mode="a")

### Here we looped over the images to get their sizes, find the vertical ones, and build a second dataset in which they are rotated without cropping

In [None]:
import imagesize
from os.path import isfile, join
from PIL import Image

# Walk the two directory levels of the dataset, record the size of every image
# and rotate the vertical ones in place so that the whole dataset is horizontal.
for dir_name in dirs:
    sub = join(dataset_dir, dir_name)

    # Only `isfile` and `join` are imported by name, so listdir is reached
    # through the `os` module
    subdirs = os.listdir(sub)

    for subdir in subdirs:
        complete_subdir = join(sub, subdir)

        # List of every regular file in this subdirectory
        onlyfiles = [
            name for name in os.listdir(complete_subdir)
            if isfile(join(complete_subdir, name))
        ]

        # Uncomment to print the number of files in each subdirectory
        #print(join(dir_name, subdir) + "  : " + str(len(onlyfiles)) + " images")

        for file in onlyfiles:
            image_complete_path = join(complete_subdir, file)

            # Get image size
            width, height = imagesize.get(image_complete_path)

            # Vertical images are the ones that are taller than wide
            if width < height:

                # Record the size as if the image were horizontal
                width, height = height, width

                # ROTATION: expand=True enlarges the canvas, so nothing is cropped.
                # The source file is closed before the same path is overwritten.
                with Image.open(image_complete_path) as img:
                    rotated = img.rotate(90, expand=True)
                rotated.save(image_complete_path)

                # Log the relative path, to keep track of which images were vertical
                f.write(join(dir_name, subdir, file) + "\n")

            cur_size = str(width) + "_" + str(height)
            all_sizes.append(cur_size)

            # Keep each distinct size only once, in order of first appearance
            if cur_size not in different_sizes:
                different_sizes.append(cur_size)

### Here we worked out how many different image sizes were present, and created a dictionary entry for each of them

In [None]:
# Get the cardinality of each size, to understand which shape is the most common.
# Counter tallies every size in a single pass over all_sizes, instead of
# rescanning the whole list once per distinct size.
from collections import Counter

size_counts = Counter(all_sizes)

# Replace the contents in place (rather than appending) so that re-running this
# cell keeps cardinality[i] aligned with different_sizes[i].
cardinality[:] = [size_counts[size] for size in different_sizes]

# Build a dictionary to visualize the gathered data: one record per distinct size,
# keyed by the position of that size in different_sizes
Total = dict()
for i, (size_label, count) in enumerate(zip(different_sizes, cardinality)):
    # Sizes are stored as "<width>_<height>" strings
    width, height = size_label.split("_")
    Total[i] = {'Size': size_label, 'Width': int(width), 'Height': int(height), 'Cardinality': int(count)}

# Sort the records by descending cardinality
outputlist = sorted(Total.values(), key=lambda x: x['Cardinality'], reverse=True)

### Here we first computed the sums needed for the average, and then printed our dictionary to see the most relevant entries (the most relevant of which are reported below)

In [None]:
# Print every size record, in the order stored in outputlist
for el in outputlist:
    print(el)

# Totals weighted by cardinality: each size counts once per image that has it.
# `card` ends up being the overall number of images.
sum_width = sum(rec['Width'] * rec['Cardinality'] for rec in outputlist)
sum_height = sum(rec['Height'] * rec['Cardinality'] for rec in outputlist)
card = sum(rec['Cardinality'] for rec in outputlist)

##### {'Size': '612_408', 'Width': 612, 'Height': 408, 'Cardinality': 4325} 
##### {'Size': '612_407', 'Width': 612, 'Height': 407, 'Cardinality': 149}
##### {'Size': '612_459', 'Width': 612, 'Height': 459, 'Cardinality': 133}
##### {'Size': '612_409', 'Width': 612, 'Height': 409, 'Cardinality': 105}
##### {'Size': '612_612', 'Width': 612, 'Height': 612, 'Cardinality': 39}
##### {'Size': '612_344', 'Width': 612, 'Height': 344, 'Cardinality': 31}

### So we discovered that the dominant format is ['Width': 612, 'Height': 408], and there are 4325 images like that.


### Here we computed the weighted average of sizes

In [None]:
# Nothing else is written to the file of vertical images from here on: close it
# first, so that it is released even if the computation below fails.
f.close()

if card:
    # Weighted average: every size contributes proportionally to its number of images
    med_w = sum_width / card
    med_h = sum_height / card
    print("\nMedium weighted Width: " + str(med_w))
    print("Medium weighted Height: " + str(med_h) + "\n")
else:
    # No image was counted, so dividing by card would raise ZeroDivisionError
    print("\nNo images found: cannot compute the weighted average size\n")

##### Mean weighted size of images (width=612 fixed, height=413.2 variable in range [256,612])

### When we ran this script for the first time and printed every image size, we noticed that some images were vertical. With the parameter target_size (X,Y), those images would have been badly resized, so we decided to create another dataset with only horizontal images, rotating the vertical ones with the parameter expand=True in order to prevent cropping. As we can see from this example:
* this first image is the original vertical one;
* the second one is what the "flow_from_directory" parameter target_size would create from our image;
* the third one is our rotation  

<img src="https://i.ibb.co/RCB00L4/example.png"  align="center" alt="example" border="0">

#### Finally, we uploaded our second dataset, containing only horizontal images, which we will use to see whether the classification results change