# Introduction

This notebook preprocesses the dataset linked below for later use in deep learning algorithms.
It rescales all images to the same size, ensures RGB mode with 3 channels, and saves the processed images as JPG files in category folders with translated (English) names.
It also generates a dataframe with metadata of the cleaned dataset and saves it as a CSV file, so that the dataset can be loaded easily when preparing it for deep learning.

Dataset: <https://www.kaggle.com/datasets/alessiocorrado99/animals10/data>



# Preamble

## Downloading the data

In [2]:
# uncomment only if redownload is desired

# ! curl -L -o data/data.zip https://www.kaggle.com/api/v1/datasets/download/alessiocorrado99/animals10
# ! unzip data/data.zip -d data/

## Libraries

In [None]:
import os
from PIL import Image
import pandas as pd


from sklearn.preprocessing import LabelEncoder

## Variables

In [4]:
# Variables

# image processing
RAW_PATH = './data/raw-img'       # raw download, one (Italian-named) folder per category
CLEAN_PATH = './data/processed'   # target folder for the preprocessed images

# Target edge length in pixels. The source images are not square;
# preprocess_img resizes them to (IMG_SIZE, IMG_SIZE) without preserving the aspect ratio.
IMG_SIZE = 224

# translation of category names (there's a missing value in the included translation file,
# so simply supplying my own version here)
_IT_TO_EN = {"cane": "dog",
             "cavallo": "horse",
             "elefante": "elephant",
             "farfalla": "butterfly",
             "gallina": "chicken",
             "gatto": "cat",
             "mucca": "cow",
             "pecora": "sheep",
             "ragno": "spider",
             "scoiattolo": "squirrel"}

# Bidirectional lookup: Italian -> English plus the derived English -> Italian entries.
# Deriving the reverse direction keeps both halves in sync (the hand-written version
# repeated "cavallo" and lacked the "horse" and "sheep" reverse entries).
translate = {**_IT_TO_EN, **{en: it for it, en in _IT_TO_EN.items()}}

## Auxiliary functions

In [None]:
# adapted from readTrafficSigns.py by Institut für Neuroinformatik
def mk_img_csv(rootpath,csvfile='data/overview.csv',raw=False):
    """Collect per-image metadata for every category folder below `rootpath`.

    Each non-hidden sub-folder of `rootpath` is treated as one category and each
    non-hidden file inside it as one image. The resulting table is written to
    `csvfile` and returned.

    Parameters
    ----------
    rootpath : str
        Folder that contains one sub-folder per category.
    csvfile : str
        Path of the csv file to write; its folder is created if it is missing.
    raw : bool
        If True, folder names are mapped through the module-level `translate`
        dict (names without an entry are kept unchanged and reported) and no
        `label` column is added. If False, folder names are used as they are
        and an integer `label` column is added with `LabelEncoder`.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns relpath, name, format, category,
        width, height, mode, channels (plus label when `raw` is False).
        Files that cannot be opened as images keep their row, with
        width/height/mode/channels set to None.
    """
    columns = ['relpath', 'name', 'format', 'category',
               'width', 'height', 'mode', 'channels']

    # Hidden entries (e.g. ".DS_Store" on macOS) and stray files are not categories.
    cat_folders = [cat for cat in os.listdir(rootpath)
                   if not cat.startswith(".")
                   and os.path.isdir(os.path.join(rootpath, cat))]

    print('Recognised following categories:', cat_folders)

    lst = []
    for c in cat_folders:
        fullfolder = os.path.join(rootpath, c) + '/'
        print('Reading filenames in folder', fullfolder)

        if raw:
            # translate category name from Italian; an unknown folder keeps its
            # own name instead of aborting the whole scan with a KeyError
            category = translate.get(c, c)
            if c not in translate:
                print(f"No translation for category '{c}', keeping the folder name")
        else:
            category = c

        for pic in os.listdir(fullfolder):
            if pic.startswith("."):  # Skip hidden files
                continue

            filepath = os.path.join(fullfolder, pic)
            # extension without the dot; empty for files that have no extension
            fileformat = os.path.splitext(pic)[1].lstrip('.')

            try:
                with Image.open(filepath) as img:
                    width, height = img.size
                    mode = img.mode  # 'RGB', 'RGBA', 'L', etc.
                    channels = len(img.getbands())  # e.g., 3 for RGB, 4 for RGBA
            except Exception as e:
                # best effort: keep the file in the overview, but without image metadata
                print(f"Error reading {filepath}: {e}")
                width, height, mode, channels = None, None, None, None

            lst.append({
                'relpath': filepath,
                'name': pic,
                'format': fileformat,
                'category': category,
                'width': width,
                'height': height,
                'mode': mode,
                'channels': channels
            })

    print('Writing csv file')
    # explicit columns so that an empty scan still yields a well-formed table
    overview = pd.DataFrame(lst, columns=columns)

    if not raw:
        if overview.empty:
            overview['label'] = pd.Series(dtype='int64')
        else:
            encoder = LabelEncoder()
            overview['label'] = encoder.fit_transform(overview['category'])

    csv_folder = os.path.dirname(csvfile)
    if csv_folder:
        os.makedirs(csv_folder, exist_ok=True)
    overview.to_csv(csvfile,header=True)

    return overview

# function to resize images and ensure RGB
def preprocess_img(df, size=IMG_SIZE,target_root = "data/preprocessed"):
    """Convert every image listed in `df` to a square RGB JPEG.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns `relpath` (path of the source image) and `category`
        (name of the sub-folder the result is stored in).
    size : int
        Edge length in pixels. Images are resized to (size, size), so the
        aspect ratio of non-square sources is not preserved.
    target_root : str
        Folder that receives one sub-folder per category.

    Each result is saved as `<target_root>/<category>/<source stem>.jpg`. If two
    different sources of one category share a stem (e.g. "x.png" and "x.jpeg"),
    the later one is saved as `<stem>_<source extension>.jpg` instead of
    overwriting the earlier one. Images that cannot be processed are reported
    on stdout and skipped.
    """

    os.makedirs(target_root, exist_ok=True)

    made_folders = set()   # category folders already created during this call
    claimed = {}           # target path -> source path it was assigned to

    for _, row in df.iterrows():
        src_path = row['relpath']
        category = row['category']
        target_folder = os.path.join(target_root, category)
        if target_folder not in made_folders:
            os.makedirs(target_folder, exist_ok=True)
            made_folders.add(target_folder)

        stem, ext = os.path.splitext(os.path.basename(src_path))
        target_path = os.path.join(target_folder, stem + '.jpg')

        # Pick another name while the target is already taken by a different
        # source; a source that is listed twice simply maps to the same target again.
        tag = ext.lstrip('.') or 'dup'
        attempt = 0
        while claimed.get(target_path, src_path) != src_path:
            attempt += 1
            suffix = tag if attempt == 1 else f"{tag}{attempt}"
            target_path = os.path.join(target_folder, f"{stem}_{suffix}.jpg")
        if attempt:
            print(f"Name clash: {src_path} is saved as {target_path}")
        claimed[target_path] = src_path

        try:
            with Image.open(src_path) as img:
                # NOTE: convert('RGB') drops an alpha channel, it is not
                # composited onto a background colour
                rgb = img.convert('RGB')
                rgb = rgb.resize((size, size), Image.BILINEAR)
                rgb.save(target_path, format='JPEG', quality=90)  # quality can be adjusted
        except Exception as e:
            print(f"Error processing {src_path}: {e}")


# Checking the raw data

In [6]:
# scan the raw download; category folders are translated from Italian
raw_df = mk_img_csv(rootpath=RAW_PATH, raw=True)

Recognised following categories: ['cane', 'cavallo', 'elefante', 'farfalla', 'gallina', 'gatto', 'mucca', 'pecora', 'ragno', 'scoiattolo']
Reading filenames in folder ./data/raw-img/cane/
Reading filenames in folder ./data/raw-img/cavallo/
Reading filenames in folder ./data/raw-img/elefante/
Reading filenames in folder ./data/raw-img/farfalla/
Reading filenames in folder ./data/raw-img/gallina/
Reading filenames in folder ./data/raw-img/gatto/
Reading filenames in folder ./data/raw-img/mucca/
Reading filenames in folder ./data/raw-img/pecora/
Reading filenames in folder ./data/raw-img/ragno/
Reading filenames in folder ./data/raw-img/scoiattolo/
Writing csv file


In [7]:
# number of images per (file extension, channel count) combination
(
    raw_df
    .groupby(['format', 'channels'])['name']
    .count()
)

format  channels
jpeg    3           24209
jpg     1               1
        3            1917
        4               1
png     3               2
        4              49
Name: name, dtype: int64

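All detected files are indeed image formats, good. The imbalance in file types should not matter. (However, note the tip about possible PNGs with a transparency layer: the counts above show 49 PNGs with 4 channels, plus one 4-channel JPG and one single-channel JPG, so not every file is 3-channel RGB yet.)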

In [8]:
# number of images per (translated) category
(
    raw_df
    .groupby(['category'])['name']
    .count()
)

category
butterfly    2112
cat          1668
chicken      3098
cow          1866
dog          4863
elephant     1446
horse        2623
sheep        1820
spider       4821
squirrel     1862
Name: name, dtype: int64

In [14]:
# image widths, most frequent first
(
    raw_df
    .groupby(['width'])['name']
    .count()
    .reset_index()
    .sort_values('name', ascending=False)
)

Unnamed: 0,width,name
167,300,19942
253,640,1765
92,225,496
67,200,383
68,201,133
...,...,...
254,2482,1
255,2848,1
170,369,1
171,382,1


In [16]:
# image heights, most frequent first
(
    raw_df
    .groupby(['height'])['name']
    .count()
    .reset_index()
    .sort_values('name', ascending=False)
)

Unnamed: 0,height,name
209,300,4870
134,225,4401
109,200,3295
78,169,1049
303,426,694
...,...,...
448,4272,1
449,4480,1
450,4608,1
451,4884,1


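From the category counts above we can see that the dataset is unbalanced (from 1446 elephant images up to 4863 dog images). We will need to up- or downsample to create equally sized categories. If possible, upsampling would be nicer to avoid wasting data. The width and height tables also show that the images come in many different sizes. For now, we will just preprocess the images to ensure they are the same size and have 3 channels each.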

# Preprocess images (normalise size, ensure RGB)

In [None]:
# scaling and normalising pictures
# uncomment if data preprocessing should be repeated
# preprocess_img(raw_df,target_root=CLEAN_PATH)

In [14]:
# build the metadata table for the preprocessed images
# and store it as csv for later notebooks
clean_df = mk_img_csv(rootpath=CLEAN_PATH, csvfile='data/processed-overview.csv')

Recognised following categories: ['dog', 'horse', 'elephant', 'butterfly', 'chicken', 'cat', 'cow', 'sheep', 'spider', 'squirrel']
Reading filenames in folder ./data/processed/dog/
Reading filenames in folder ./data/processed/horse/
Reading filenames in folder ./data/processed/elephant/
Reading filenames in folder ./data/processed/butterfly/
Reading filenames in folder ./data/processed/chicken/
Reading filenames in folder ./data/processed/cat/
Reading filenames in folder ./data/processed/cow/
Reading filenames in folder ./data/processed/sheep/
Reading filenames in folder ./data/processed/spider/
Reading filenames in folder ./data/processed/squirrel/
Writing csv file


In [15]:
# first rows of the cleaned overview
clean_df.head(n=5)

Unnamed: 0,relpath,name,format,category,width,height,mode,channels,label
0,./data/processed/dog/OIF-e2bexWrojgtQnAPPcUfOW...,OIF-e2bexWrojgtQnAPPcUfOWQ.jpg,jpg,dog,224,224,RGB,3,4
1,./data/processed/dog/OIP---A27bIBcUgX1qkbpZOPs...,OIP---A27bIBcUgX1qkbpZOPswHaFS.jpg,jpg,dog,224,224,RGB,3,4
2,./data/processed/dog/OIP---ZIdwfUcJeVxnh47zppc...,OIP---ZIdwfUcJeVxnh47zppcQHaFj.jpg,jpg,dog,224,224,RGB,3,4
3,./data/processed/dog/OIP---ZRsOF7zsMqhW30WeF8-...,OIP---ZRsOF7zsMqhW30WeF8-AHaFj.jpg,jpg,dog,224,224,RGB,3,4
4,./data/processed/dog/OIP---_cJbI6Ei26w5bW1urHe...,OIP---_cJbI6Ei26w5bW1urHewHaCf.jpg,jpg,dog,224,224,RGB,3,4


In [16]:
# all images are jpgs with 3 channels and a size of 224x224 (IMG_SIZE) now
clean_df.groupby(['format','channels','width','height'])['name'].count()

format  channels  width  height
jpg     3         224    224       26179
Name: name, dtype: int64

The dataset can now be used.