# Construyendo un modelo de Deep Learning que diferencie perros de gatos desde cero

Pasos:
1. Descargamos nuestro conjunto de datos de: https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/data y lo descomprimimos dentro del directorio `notebook/data/`
2. Exploración de datos
3. Preparación de datos
4. Entrenando modelo inicial usando una CNN + Datos Aumentados
5. Verificación manual del modelo

## 1. Descargamos y disponemos el conjunto de datos
Fuente: https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/data

#### 1.1 Descargamos y descomprimimos

Al descargar y descomprimir debe verse así:
```
notebook/data/
│
├── dogs-vs-cats-redux-kernels-edition.zip
└── dogs-vs-cats-redux-kernels-edition/
    ├── sample_submission.csv
    ├── test.zip
    └── train.zip
```

#### 1.2 Descomprimimos test.zip y train.zip en `notebook/data/`

Al realizar esto, el directorio queda como se muestra a continuación (nota cómo eliminamos tanto el directorio como el zip `dogs-vs-cats-redux-kernels-edition`):
```
notebook/data/
│
├── test/
└── train/
```

Renombramos el directorio `test/` a `unlabeled_test_data/`, ya que más adelante crearemos nuestro propio `test/` con imágenes etiquetadas:
```
notebook/data/
│
├── unlabeled_test_data/
└── train/
```

## 2. Exploración de datos

In [None]:
import os
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Root folder (relative to the working directory) under which every dataset
# directory used by this notebook is created and read.
DATA_PATH = 'data/'

Separamos las imágenes de gatos y perros en directorios diferentes (`data/cat/` y `data/dog/`)

In [None]:
import os
from tqdm import tqdm

# Folder holding the raw training images. The move below expects file names
# of the form "<label>.<id>.jpg" (e.g. "cat.0.jpg"), where the label prefix is
# the only class information available.
TRAIN_DATA_PATH = 'data/train/'

# makedirs also creates DATA_PATH itself when needed; exist_ok keeps the cell
# re-runnable without a separate existence check.
for label in ('dog', 'cat'):
    os.makedirs(os.path.join(DATA_PATH, label), exist_ok=True)

# Move "data/train/<label>.<id>.jpg" to "data/<label>/<id>.jpg".
skipped = []
for file in tqdm(os.listdir(TRAIN_DATA_PATH)):
    label, _, image_name = file.partition('.')
    if label not in ('dog', 'cat') or not image_name.endswith('.jpg'):
        # Hidden files (".DS_Store") or sub-folders do not follow the naming
        # scheme; renaming them would send them outside the class folders
        # (".DS_Store" would even map to the absolute path "/DS_Store.jpg").
        skipped.append(file)
        continue
    os.rename(
        os.path.join(TRAIN_DATA_PATH, file),
        os.path.join(DATA_PATH, label, image_name),
    )

if skipped:
    # os.rmdir only works on an empty directory, so keep the folder and say why.
    print('Unexpected entries left in', TRAIN_DATA_PATH, ':', skipped)
else:
    os.rmdir(TRAIN_DATA_PATH)

Entendiendo la distribución de mis datos

In [None]:
# Show the entries that now exist directly under the data folder.
os.listdir(DATA_PATH)

In [None]:
# Count the entries found in each class folder and report both totals.
cat_dir = os.path.join(DATA_PATH, 'cat')
dog_dir = os.path.join(DATA_PATH, 'dog')

total_cats = len(os.listdir(cat_dir))
total_dogs = len(os.listdir(dog_dir))

print("Numero de imagenes de gatos: ", total_cats)
print("Numero de imagenes de perros: ", total_dogs)

In [None]:
# Bar chart comparing how many images each class has.
class_names = ('Dog', 'Cat')
class_counts = [total_dogs, total_cats]
bar_positions = np.arange(len(class_names))

plt.bar(bar_positions, class_counts, align='center', alpha=0.5)
plt.xticks(bar_positions, class_names)  # label each bar with its class
plt.ylabel('Number')
plt.title('Number of pets. Bar chart')

plt.show()

## 3. Preparación de datos

Creamos los directorios `train`, `test` y `validation` para entrenamiento

In [None]:
# Build the data/<split>/<label> tree: one folder per class inside each of
# the train, test and validation splits.
# os.makedirs creates the intermediate <split> folder as well, and
# exist_ok=True makes the cell safe to re-run without explicit checks.
for split in ('train', 'test', 'validation'):
    for label in ('dog', 'cat'):
        os.makedirs(os.path.join(DATA_PATH, split, label), exist_ok=True)

print('Folders created...')

Dividimos los datos de entrenamiento de la siguiente forma:
- Train - 80%
- Test - 10%
- Validation - 10%

In [None]:
# Names of the entries currently stored in data/dog and data/cat.
list_of_dogs, list_of_cats = (
    os.listdir(os.path.join(DATA_PATH, label)) for label in ('dog', 'cat')
)

In [None]:
import random


def _move_share(label, split, fraction):
    """Move a random share of data/<label> into data/<split>/<label>.

    Args:
        label: Class folder name ('dog' or 'cat').
        split: Destination split folder ('train', 'test' or 'validation').
        fraction: Share of the files *currently* in data/<label> to move;
            the count is truncated with int(), 1.0 moves everything.

    Returns:
        The number of files moved.
    """
    source_dir = os.path.join(DATA_PATH, label)
    target_dir = os.path.join(DATA_PATH, split, label)
    # List the folder at call time so a re-run never works from a stale listing.
    remaining = os.listdir(source_dir)
    chosen = random.sample(remaining, k=int(len(remaining) * fraction))
    for name in tqdm(chosen):
        os.rename(os.path.join(source_dir, name), os.path.join(target_dir, name))
    return len(chosen)


# Train: 80% of every class.
for label in ('dog', 'cat'):
    _move_share(label, 'train', 0.8)

print('Train data created...')

# Test: half of the 20% that is left, i.e. about 10% of the original data.
for label in ('dog', 'cat'):
    _move_share(label, 'test', 0.5)

# Validation: everything still left in the class folders (the other ~10%).
for label in ('dog', 'cat'):
    _move_share(label, 'validation', 1.0)

# The class folders are empty now; os.rmdir raises if anything was left behind.
os.rmdir(os.path.join(DATA_PATH, 'cat'))
os.rmdir(os.path.join(DATA_PATH, 'dog'))

## 4. Entrenando modelo inicial usando una CNN + Datos Aumentados

Ver notebook `notebook/model-training.ipynb`

## 5. Verificación manual del modelo

In [None]:
def display_image(image_path):
    """Read the image file at ``image_path`` and show it with matplotlib."""
    pixels = plt.imread(image_path)
    plt.imshow(pixels)
    plt.show()

def translate_pred(prediction: np.ndarray) -> "tuple[str, float]":
    """Turn a raw model output into a label and a confidence percentage.

    Only ``prediction[0][0]`` is read. A value above 0.5 is reported as
    "Dog" with that value as the confidence; anything else (including
    exactly 0.5) is reported as "Cat" with ``1 - value`` as the confidence.
    NOTE(review): presumably this is the single sigmoid output of the model
    for a one-image batch - confirm against the training notebook.

    Args:
        prediction: Nested/2-D array-like whose ``[0][0]`` entry is the score.

    Returns:
        ``(label, confidence)`` where label is "Dog" or "Cat" and confidence
        is expressed on a 0-100 scale.
    """
    dog_score = prediction[0][0]
    if dog_score > 0.5:
        return "Dog", dog_score * 100
    return "Cat", (1 - dog_score) * 100

In [None]:
from tensorflow import keras

# Load the saved model stored at <DATA_PATH>/model/cnn_model.
model_location = os.path.join(DATA_PATH, 'model', 'cnn_model')
cnn_model = keras.models.load_model(model_location)

# Last expression of the cell: shows the names of the model's metrics.
cnn_model.metrics_names

In [None]:
def model_predict(image_uri: str, target_size=(150, 150)) -> None:
    """Display an image and print the cat/dog prediction of ``cnn_model``.

    Relies on the notebook-level ``cnn_model``, ``display_image`` and
    ``translate_pred``.

    Args:
        image_uri: Path of the image file to classify.
        target_size: ``(height, width)`` the image is resized to before it is
            fed to the model. Defaults to the 150x150 used so far.
            NOTE(review): presumably this must match the input size the model
            was trained with - confirm against the training notebook.
    """
    import numpy as np
    # Use tensorflow.keras, the same package the model is loaded with, rather
    # than the standalone `keras` package, so both come from one installation.
    from tensorflow.keras.preprocessing.image import img_to_array, load_img

    display_image(image_uri)

    # Load the image already resized to the requested size.
    my_image = load_img(image_uri, target_size=target_size)

    # Add the batch axis and scale pixel values by 1/255. This is the only
    # transformation ImageDataGenerator(rescale=1. / 255) applied, so the
    # deprecated generator is not needed for a single image.
    img_batch = np.expand_dims(img_to_array(my_image), axis=0) * (1. / 255)

    prediction = cnn_model.predict(img_batch)
    animal_kind, confidence = translate_pred(prediction)
    print(f"It's a {animal_kind} ({confidence:.2f}%)")

In [None]:
# Try the model on one image from the unlabeled test data folder.
model_predict('data/unlabeled_test_data/1.jpg')