# The objective of this notebook is to:
* Perform preprocessing operations on the dataset
* Perform tasks that are only needed once

In [1]:
import codecs
import os
import pandas as pd
import xml.etree.ElementTree as ET #https://docs.python.org/3/library/xml.etree.elementtree.html

In [2]:
# Colab-only setup: mount Google Drive into the runtime's filesystem.
from google.colab import drive
drive.mount('/content/drive')
# Make the Pascal dataset folder the working directory so that the relative
# paths used later in the notebook (e.g. "Annotations/") resolve inside it.
os.chdir('/content/drive/MyDrive/AI_Projects/Segmentor/datasets/pascal')

Mounted at /content/drive


In [3]:
# Sanity check: display the current working directory (expected to be the
# dataset folder selected with os.chdir in the previous cell).
os.getcwd()

'/content/drive/MyDrive/AI_Projects/Segmentor/datasets/pascal'

In [4]:
# Lookup table from class name to integer index.
# The index of a class is simply its position in the ordered tuple below.
dict_class_index = {
    class_name: class_index
    for class_index, class_name in enumerate((
        "aeroplane",
        "bicycle",
        "bird",
        "boat",
        "bottle",
        "bus",
        "car",
        "cat",
        "chair",
        "cow",
        "diningtable",
        "dog",
        "horse",
        "motorbike",
        "person",
        "pottedplant",
        "sheep",
        "sofa",
        "train",
        "tvmonitor",
    ))
}

In [6]:
# Build a "class present" table for every segmented image and save it as CSV.
# Column "image_filename" holds the <filename> of the annotation; columns
# "0".."19" hold 1 when at least one <object> of that class index (as defined
# by dict_class_index) appears in the annotation, and 0 otherwise.
column_names = ["image_filename","0","1","2","3","4","5","6","7","8","9","10",
                "11","12","13","14","15","16","17","18","19"]

# os.listdir returns entries in arbitrary order; sort them so that the row
# order of the resulting CSV is the same on every run.
files = sorted(os.listdir("Annotations/"))
print(f"There are {len(files)} files")

# Rows are collected in a plain list and converted to a DataFrame once, after
# the loop. Growing the frame with DataFrame.append on every iteration copied
# the whole frame each time, and DataFrame.append was removed in pandas 2.0.
rows = []

for i, xml_file_name in enumerate(files):
    tree = ET.parse(os.path.join("Annotations", xml_file_name))
    root = tree.getroot()

    if i % 100 == 0:
        print(f"Analyzing file {i+1} / {len(files)}")

    # Skip every annotation that is not flagged as segmented
    is_segmented = int(root.find("segmented").text)
    if is_segmented != 1:
        continue

    # Initialize the row: slot 0 is the filename, slots 1.. are the class flags
    row = [0] * (1 + len(dict_class_index))
    row[0] = root.find("filename").text

    # Flag each class that has at least one annotated object in the image.
    # An unknown class name raises KeyError, exactly as before.
    for o in root.iter("object"):
        class_name = o.find("name").text.strip().lower()
        class_index = dict_class_index[class_name]
        row[1 + class_index] = 1

    rows.append(row)

# Single construction of the table (also yields an empty, correctly-labelled
# frame when no annotation is segmented)
df = pd.DataFrame(rows, columns=column_names)

# Now save the df
df.to_csv("pascal_segmented_classes_per_image.csv")
print("Saved the file pascal_segmented_classes_per_image.csv")

There are 17125 files
Analyzing file 1 / 17125
Analyzing file 101 / 17125
Analyzing file 201 / 17125
Analyzing file 301 / 17125
Analyzing file 401 / 17125
Analyzing file 501 / 17125
Analyzing file 601 / 17125
Analyzing file 701 / 17125
Analyzing file 801 / 17125
Analyzing file 901 / 17125
Analyzing file 1001 / 17125
Analyzing file 1101 / 17125
Analyzing file 1201 / 17125
Analyzing file 1301 / 17125
Analyzing file 1401 / 17125
Analyzing file 1501 / 17125
Analyzing file 1601 / 17125
Analyzing file 1701 / 17125
Analyzing file 1801 / 17125
Analyzing file 1901 / 17125
Analyzing file 2001 / 17125
Analyzing file 2101 / 17125
Analyzing file 2201 / 17125
Analyzing file 2301 / 17125
Analyzing file 2401 / 17125
Analyzing file 2501 / 17125
Analyzing file 2601 / 17125
Analyzing file 2701 / 17125
Analyzing file 2801 / 17125
Analyzing file 2901 / 17125
Analyzing file 3001 / 17125
Analyzing file 3101 / 17125
Analyzing file 3201 / 17125
Analyzing file 3301 / 17125
Analyzing file 3401 / 17125
Analyzing 