# Clasificación de imágenes con modelos de *ensemble*
Este notebook muestra el proceso de análisis, preprocesamiento y entrenamiento de modelos para predecir la clase de imágenes de distintos escenarios.

## 1. Setup

Librerías importadas en el proyecto.

In [3]:
import os
import random

import numpy as np
import pandas as pd

from IPython.display import IFrame

%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

Variables globales declaradas en el proyecto.

In [9]:
# Seed handed to the random number generator so sampled images are repeatable.
RANDOM_SEED = 1337

# Dataset splits; each name is a sub-folder of DATA_PATH.
DATASETS = ["train", "test"]

# Project folders, all siblings of the directory the notebook runs from:
#   DATA_PATH    -> one sub-folder per split, each holding per-class image folders
#   INFO_PATH    -> problem statement and related documents
#   RESULTS_PATH -> results files
DATA_PATH, INFO_PATH, RESULTS_PATH = (
    os.path.join(os.pardir, folder) for folder in ("data", "info", "results")
)

Enunciado del problema.

In [7]:
# Embed the problem statement PDF inline; as the last expression of the cell
# the IFrame object is rendered as the cell output.
statement_pdf = os.path.join(INFO_PATH, "lab2.pdf")
IFrame(src=statement_pdf, width=1080, height=920)

## 2. Análisis exploratorio de los datos

En primer lugar, vamos a cargar los datos en memoria.

In [13]:
# Build one DataFrame per dataset split. Each row holds the decoded image
# ("img", as returned by plt.imread) and the name of the folder it was read
# from ("label").
dataframes = {}

for dataset in DATASETS:
    dataset_path = os.path.join(DATA_PATH, dataset)
    data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order stable across machines so seeded positional sampling is repeatable.
    for category in sorted(os.listdir(dataset_path)):
        category_path = os.path.join(dataset_path, category)
        # Ignore stray files next to the class folders (e.g. OS metadata),
        # which would otherwise make the inner os.listdir call fail.
        if not os.path.isdir(category_path):
            continue
        size = 0
        for file in sorted(f for f in os.listdir(category_path) if f.endswith(".jpg")):
            data.append({
                "img": plt.imread(os.path.join(category_path, file)),
                "label": category
            })
            size += 1
        print(f"{size} images read from {category} in {dataset} set")
    dataframes[dataset] = pd.DataFrame(data)

2191 images read from edificios in train set
2404 images read from glaciares in train set
2512 images read from montanas in train set
2274 images read from mares in train set
2271 images read from bosques in train set
2382 images read from calles in train set
437 images read from edificios in test set
553 images read from glaciares in test set
525 images read from montanas in test set
510 images read from mares in test set
474 images read from bosques in test set
501 images read from calles in test set


In [None]:
def plot_img_from_df(dict_df, df, images):
    """Show the selected images of one dataset in a grid, captioned by class.

    Args:
        dict_df: Mapping from dataset name to a DataFrame with an "img" column
            (image arrays accepted by ``imshow``) and a "label" column.
        df: Key of the dataset inside ``dict_df`` to draw from, e.g. "train".
        images: Sequence of positional row indices (resolved with ``.iloc``)
            of the images to display.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    frame = dict_df[df]
    n_images = len(images)

    # Smallest near-square grid that fits every image. Rounding the square
    # root (instead of taking its ceiling) yields too few cells for counts
    # such as 2 or 5. max(1, ...) keeps the division below safe when empty.
    n_cols = max(1, int(np.ceil(np.sqrt(n_images))))
    n_rows = int(np.ceil(n_images / n_cols))

    fig = plt.figure(figsize=(22, 15))
    fig.suptitle(f"{n_images} images from {df} df\n", fontsize=20)

    has_label = "label" in frame.columns

    for slot, position in enumerate(images, start=1):
        if has_label:
            caption = frame["label"].iloc[position]
        else:
            # NOTE(review): frames without a "label" column are presumably
            # label-encoded elsewhere in the notebook through a global
            # `classes` mapping (name -> code) — confirm against later cells.
            caption = list(classes.keys())[frame["classification"].iloc[position]]

        ax = fig.add_subplot(n_rows, n_cols, slot)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.imshow(frame["img"].iloc[position])
        ax.set_xlabel(caption)

    fig.tight_layout()
    plt.show()

In [None]:
# Preview a random sample of each split as an image grid.
N_PREVIEW_IMAGES = 36  # perfect square, so the preview is a full 6x6 grid

# Fixed seed so the same images are selected on every run.
random.seed(RANDOM_SEED)

images_train = []
images_test = []

for i in range(N_PREVIEW_IMAGES):
    # randrange excludes its upper bound, so every value is a valid .iloc
    # position; randint(0, len(df)) could return len(df) and raise IndexError.
    images_train.append(random.randrange(len(dataframes["train"])))
    images_test.append(random.randrange(len(dataframes["test"])))

plot_img_from_df(dataframes, "train", images_train)
print()
plot_img_from_df(dataframes, "test", images_test)