# Detection and Classification of People With and Without Masks in Images
## Exploratory Data Analysis

In [None]:
# Import Packages
import os
import re
import tensorflow as tf
import pandas as pd

from bs4 import BeautifulSoup as soup
from PIL import Image

import matplotlib.pyplot as plot
import matplotlib.patches as patches



In [None]:
# Sample Image
# Preview one raw picture from the dataset inline in the notebook
sampleImagePath = "./Data/images/maksssksksss0.png"
with Image.open(sampleImagePath) as rawImage:
    display(rawImage)


In [None]:
# Sample Annotation
# Dump the raw XML of one annotation file to see which fields it carries
with open("./Data/annotations/maksssksksss0.xml") as annotation:
    rawXML = annotation.read()
print(rawXML)


In [None]:
# Data Extraction Function
def _readInt(node, tag):
    """Return the text of the first `tag` element under `node` as an int."""
    return int(node.find(tag).get_text())


def Extract(path):
    """Read one XML annotation file and return its metadata and face labels.

    Parameters
    ----------
    path : str
        Path to the annotation file. The first run of digits in the file
        name (not in its parent directories) is used as the sample index.

    Returns
    -------
    tuple
        ``(meta, faces)``. ``meta`` is a dict with the keys ``index``,
        ``width``, ``height`` and ``depth``. ``faces`` is a list holding one
        dict per ``<object>`` element, with the keys ``index``, ``label``,
        ``pose``, ``truncated``, ``occluded``, ``difficult``, ``xmin``,
        ``xmax``, ``ymin`` and ``ymax``.

    Raises
    ------
    ValueError
        If the file name contains no digits to use as an index.
    """
    # Look for the index in the file name only: searching the whole path
    # would pick up digits from a directory name (e.g. "./Data2/...") and
    # give every file in that directory the same, wrong index.
    filename = os.path.basename(path)
    match = re.search(r'\d+', filename)
    if match is None:
        raise ValueError("No numeric index found in file name: " + repr(filename))
    index = int(match.group())

    # Safely open annotation; the file is closed again before parsing starts
    with open(path) as annotation:
        rawXML = annotation.read()

    # Read annotation with Beautiful Soup
    bsXML = soup(rawXML, "xml")

    # Stores Metadata in a Dictionary
    size = bsXML.find('size')
    meta = {'index': index}
    for tag in ('width', 'height', 'depth'):
        meta[tag] = _readInt(size, tag)

    # Stores Labels in a List of Dictionaries
    faces = []
    for face in bsXML.find_all('object'):
        labels = {
            'index': index,
            'label': face.find('name').get_text(),
            'pose': face.find('pose').get_text(),
        }
        for tag in ('truncated', 'occluded', 'difficult',
                    'xmin', 'xmax', 'ymin', 'ymax'):
            labels[tag] = _readInt(face, tag)
        faces.append(labels)
    return (meta, faces)

In [None]:
# Extract data from sample
# Run the extractor on the sample annotation and show both returned pieces,
# each under its own heading line
meta, faces = Extract("./Data/annotations/maksssksksss0.xml")
print("Metadata: ", meta, sep="\n")
print("Face Objects: ", faces, sep="\n")

In [None]:
# Show labels on Image
# Draw every annotated bounding box on top of the sample picture
with Image.open("./Data/images/maksssksksss0.png") as rawImage:
    figure, axis = plot.subplots()
    axis.imshow(rawImage)
    meta, objects = Extract("./Data/annotations/maksssksksss0.xml")
    for face in objects:
        # Green marks the 'with_mask' label; every other label is drawn in red
        if face['label'] == 'with_mask':
            color = 'green'
        else:
            color = 'red'
        # Rectangle takes its anchor corner plus a width and a height,
        # so the box extents are derived from the min/max coordinates
        rect = patches.Rectangle(
            (face['xmin'], face['ymin']),
            face['xmax'] - face['xmin'],
            face['ymax'] - face['ymin'],
            linewidth=2,
            edgecolor=color,
            facecolor='none',
        )
        axis.add_patch(rect)
    plot.show()

In [None]:
# Collect Samples
# Build a table mapping each image's numeric index to its file path.
imagePath = "./Data/images/"
imagesList = []
# Sorted so the row order is reproducible (os.listdir order is arbitrary)
for f in sorted(os.listdir(imagePath)):
    fullPath = os.path.join(imagePath, f)
    # Take the index from the file name itself, never from the directory part
    number = re.search(r'\d+', f)
    # Skip sub-directories and files without a numeric index (hidden or
    # system files) instead of crashing on them
    if number is None or not os.path.isfile(fullPath):
        continue
    imagesList.append([int(number.group()), fullPath])
images = pd.DataFrame(imagesList,columns=['index','path']).set_index('index')

# Collect Labels
# Parse every annotation file into one metadata row per image and one label
# row per annotated face.
annotationpath = "./Data/annotations/"
metaList=[]
facesList=[]
for f in sorted(os.listdir(annotationpath)):
    path = os.path.join(annotationpath,f)
    # Only regular .xml files are annotations; anything else in the folder
    # (checkpoint directories, hidden files) cannot be parsed by Extract
    if not (os.path.isfile(path) and f.lower().endswith('.xml')):
        continue
    meta, faces = Extract(path)
    metaList.append(meta)
    # extend() appends in place; rebuilding the list with + on every
    # iteration copies all earlier labels again each time
    facesList.extend(faces)
meta = pd.DataFrame(metaList).set_index('index')
labels = pd.DataFrame(facesList).set_index('index')

In [None]:
# Metadata Summary Statistics
# Display pandas' default descriptive statistics for the per-image metadata
# table (the width, height and depth values read from the annotations).
meta.describe()

In [None]:
# Explore Image Sizes
# Count how many images share each (height, width) combination
groupedMeta = meta.groupby(['height', 'width']).size().reset_index(name='count')

# Report the size (or sizes, on a tie) that occurs most often
print("Most images of the same size:")
isMostCommon = groupedMeta['count'] == groupedMeta['count'].max()
print(groupedMeta[isMostCommon])

# One dot per distinct size; the marker size scales with the image count
plot.scatter(
    groupedMeta['width'],
    groupedMeta['height'],
    c='darkblue',
    s=3 * groupedMeta['count'],
)
plot.title('Sizes of Images in Dataset')
plot.xlabel('width (pixels)')
plot.ylabel('height (pixels)')
plot.show()

In [None]:
# Labels Summary Statistics
# Count how many face labels fall into each of the three named mask classes
nlabels = len(labels)
labelColumn = labels['label']
nWithMask = int((labelColumn == 'with_mask').sum())
nWithoutMask = int((labelColumn == 'without_mask').sum())
nIncorrectMask = int((labelColumn == 'mask_weared_incorrect').sum())
# Whatever is left over carries a label outside those three classes
nOther = nlabels - (nWithMask + nWithoutMask + nIncorrectMask)

# One row per class, showing its share of all labels with one decimal place
shareColumn = f"Label Classifications ({nlabels} Total Labels)"
labelTable = pd.DataFrame(
    {
        "Mask Label": ["With", "Without", "Incorrect"],
        shareColumn: [
            f"{100 * count / nlabels:.1f}%"
            for count in (nWithMask, nWithoutMask, nIncorrectMask)
        ],
    }
).set_index("Mask Label")
labelTable

In [None]:
# Explore Number of Labels per Image
# The labels table has one row per face, indexed by image, so the size of
# each index group is the number of labelled faces in that image
groupedLabels = labels.groupby('index').size()

plot.hist(groupedLabels, bins=25, color='teal')
plot.title('Distribution of Labels in Images')
plot.xlabel('Labels in Image')
plot.ylabel('Number of Images')
plot.show()

In [None]:
# Look into other label variables
# Show how often each pose value occurs, then display summary statistics
# for the remaining label columns
poseCounts = labels.groupby(by='pose').size()
print(poseCounts)
labels.describe()