# Pre-processing Tool

The pre-processing tool can be used to pre-process an image set which is not yet organized as a corpus.

An image corpus itself can consist of several folders. Each folder represents a class the respective classifier will be able to recognize. Each class folder contains all images that will be used to train and test the classifier on this class. If only one class per classifier is given, a folder called negative_examples is needed as well.

To get a better understanding of the layout, take a look at this sample folder hierarchy (also contained in this project):

```
 ./corpus
     /audi
         /athree
             3_1.jpg
             ...
         /afour
             a4_1.jpg
             ...
     /mercedes
         /sclass
             sclass_1.jpg
             ...
         /negative_examples
             negative_sclass_1.jpg
             ...
```

# Initialization

In [None]:
# import basic libraries
import os
import sys
import shutil

import numpy as np
import pandas as pd

import vrtool

# Config

In [None]:
# Excel mapping config
MAPPING_EXCEL = './config/sample-image-mapping.xlsx' # The path to the Excel mapping file
IMG_COL_NAME = 'IMG' # The default Excel column name containing the image paths
IMG_CLASS_COL_NAME = 'IMG_CLASS' # The default Excel column name containing the image class of each image

# Corpus creation settings
IS_MULTI_CLASS = True # If set to False, every image class would be used as a separate classifier; currently limited to True
CLASSIFIER_NAME = 's_class' # Name of the classifier folder created below CORPUS_ROOT; only needs to be provided if IS_MULTI_CLASS is True

# Default folder structures
CORPUS_ROOT = './corpus' # Root folder under which the corpus is created
IMG_DUMP_ROOT = './image_dump' # Folder containing the unsorted source images

# Corpus Creation from Image Dump and Mapping File 

In [None]:
# Load the Excel sheet that maps each image to its class
df = pd.read_excel(MAPPING_EXCEL)
# Preview the first five rows to sanity-check the mapping
df.head(5)

In [None]:
# Build the corpus layout: one subfolder per distinct image class, all of
# them located below the classifier folder inside the corpus root
class_names = df[IMG_CLASS_COL_NAME].unique()
if IS_MULTI_CLASS:
    for class_name in class_names:
        folder = os.path.join(CORPUS_ROOT, CLASSIFIER_NAME, class_name)
        # Only the folder creation itself can hit an already existing folder
        try:
            os.makedirs(folder)
        except FileExistsError:
            print("ERROR: Folder {} exist, please delete them before recreating.".format(folder))

In [None]:
# If True the images are copied into the corpus and stay in the image dump,
# if False they are moved out of the image dump
KEEP_OLD_IMAGES = True

def move_img(src_image_name, class_name, keep_old_img=True):
    """Copy or move one image from the image dump into its class folder.

    Args:
        src_image_name: Path of the image relative to ``IMG_DUMP_ROOT``.
            The same relative path is used below the class folder.
        class_name: Name of the class folder below
            ``CORPUS_ROOT/CLASSIFIER_NAME`` that receives the image.
        keep_old_img: If True the image is copied and the original stays
            in the image dump, otherwise the image is moved.
    """
    src_img_path = os.path.join(IMG_DUMP_ROOT, src_image_name)
    target_img_path = os.path.join(CORPUS_ROOT, CLASSIFIER_NAME, class_name, src_image_name)

    # The mapping may reference images inside subfolders of the dump; make
    # sure the matching target folder exists so the copy/move cannot fail
    # on a missing directory.
    target_dir = os.path.dirname(target_img_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    if keep_old_img:
        shutil.copyfile(src_img_path, target_img_path)
    else:
        shutil.move(src_img_path, target_img_path)
    

# Transfer every image listed in the mapping into the folder of its class
if IS_MULTI_CLASS:
    for _row_index, mapping in df.iterrows():
        move_img(mapping[IMG_COL_NAME], mapping[IMG_CLASS_COL_NAME], KEEP_OLD_IMAGES)

In [None]:
def remove_empty_folders(corpus_path):
    """Delete every empty folder below (and including) ``corpus_path``.

    A folder counts as empty when it contains neither files nor
    subfolders. Folders that only contained empty subfolders are removed
    as well, because the tree is processed from the leaves upwards.

    Args:
        corpus_path: Root folder of the tree to clean up.
    """
    # Walk bottom-up so that children are handled before their parent.
    for dirpath, _dirnames, _files in os.walk(corpus_path, topdown=False):
        # Re-list the folder instead of trusting the walk result: the
        # subfolder names were collected before the children were removed.
        if not os.listdir(dirpath):
            os.rmdir(dirpath)
            
# Clean up the corpus folder of the configured classifier
target_corpus = os.path.join(CORPUS_ROOT, CLASSIFIER_NAME)
remove_empty_folders(target_corpus)

# Corpora Overview

In [None]:
# Folder holding the corpora (currently relative to the notebook location)
# and the name of the config file handed to the runner
corpora_folder_name = '../corpus'
config_name = 'config.ini'

runner = vrtool.Runner(config_name, corpora_folder_name)
corpora = runner.get_available_corpora()

# List every corpus found in the corpora directory together with its index
print('\nAvailable image corpora:')
print('\n'.join('{}: {}'.format(idx, entry) for idx, entry in enumerate(corpora)))
print()

# Print the number of images per class for every corpus, skipping the
# corpora that hold negative examples
for corpus_name, corpus_dir in corpora:
    if "negative" in corpus_name:
        continue

    imgs = corpus_dir.get_all_class_images()
    image_info = pd.DataFrame(imgs)

    print('---------------------------------')
    print("Corpus:", corpus_name)
    # Without a class_name column (e.g. no images at all) there is nothing
    # to count; report that instead of silently swallowing an error.
    if 'class_name' in image_info.columns:
        print(image_info.class_name.value_counts())
    else:
        print("No class information available for this corpus.")