### Librerías y Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import io

from google.colab.patches import cv2_imshow
import urllib.request
import time

from google.colab import files
from scipy import stats
import tensorflow as tf

Importación de los datos, extraídos de eBird: https://ebird.org/explore

In [None]:
# Open the Colab upload dialog; the result is indexed by file name when the CSV is read
uploaded = files.upload()

In [None]:
# Parse the uploaded 'raw_data.csv' content into a DataFrame
# NOTE(review): the file name is hard-coded; presumably an upload under a different name fails here - confirm
df2 = pd.read_csv(io.BytesIO(uploaded['raw_data.csv']))

### Análisis de la Data

In [None]:
# Show the columns with general information such as each column's data type
df2.info()

In [None]:
# Count the non-null cells of every column
df2.count()

In [None]:
# Total number of distinct species in the data (a missing name counts as one value)
df2['Scientific Name'].nunique(dropna=False)

In [None]:
# Number of photographs per species: non-null count of every column, grouped by scientific name
df2.groupby(['Scientific Name']).count()

Conclusión del análisis:
 * Existen 10000 registros
 * Existen 1483 especies registradas

### Distribución de Registro de Especies

In [None]:
# Minimum number of records to study (cumulative threshold used by get_index_drop)
min_registros = 500

In [None]:
# Records per species (non-null 'ML Catalog Number' values), most-recorded species first
registros_por_especie = df2.groupby('Scientific Name')['ML Catalog Number'].count()
species_sorted_desc = registros_por_especie.sort_values(ascending=False)
species_sorted_desc

In [None]:
# How many species share each record count
species_sorted_desc.value_counts()

In [None]:
# Bar chart: number of species (y) that have a given number of records (x)
especies_por_conteo = species_sorted_desc.value_counts()
x = especies_por_conteo.index.values
y = especies_por_conteo.values
plt.xlabel('Cantidad de registros')
plt.ylabel('Cantidad de especies')
plt.bar(x, y, width=1, color='maroon')

In [None]:
# Find the cut-off species: walk the species from most to least recorded,
# accumulating their record counts until the running total reaches the
# requested minimum (min_registros). Species with fewer records than the
# cut-off are the ones removed afterwards.
def get_index_drop(counts=None, minimum=None):
  """Return the (species, record_count) entry at which the running total reaches `minimum`.

  Args:
    counts: Series of record counts indexed by species, walked in its
      existing order. Defaults to the notebook-level `species_sorted_desc`.
    minimum: cumulative number of records to reach. Defaults to the
      notebook-level `min_registros`.

  Returns:
    Tuple (index_label, value) of the first entry whose cumulative sum is
    >= minimum. If the total never reaches `minimum`, the last entry is
    returned, so no entry falls below the cut-off. None when `counts` is empty.
  """
  if counts is None:
    counts = species_sorted_desc
  if minimum is None:
    minimum = min_registros

  acumulado = 0  # named so it does not shadow the built-in sum()
  ultimo = None
  # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
  for index, value in counts.items():
    acumulado += value
    ultimo = (index, value)
    if acumulado >= minimum:
      break
  return ultimo

get_index_drop()
# Shows the species at which the running total reaches the threshold, with its record count.

In [None]:
# Species whose record count is strictly below the cut-off count
corte = get_index_drop()[1]
species_to_drop = species_sorted_desc[species_sorted_desc < corte].index

In [None]:
# Remove from the DataFrame every row whose species is in the drop list
es_descartada = df2['Scientific Name'].isin(species_to_drop)
df_final = df2[~es_descartada]
df_final

In [None]:
# Number of classes: distinct species remaining in df_final
num_clases = df_final['Scientific Name'].nunique(dropna=False)
num_clases

### Carga de Imágenes

In [None]:
# Utility functions: build asset URLs and download them
def get_url_image(id):
  """Return the asset download URL for the given catalog id."""
  asset_url = f'https://cdn.download.ams.birds.cornell.edu/api/v1/asset/{id}/1200'
  return asset_url

def get_list_of_url_by_id(df,title):
  """Return one asset URL per id found in `df[title]`, in the same order."""
  return [get_url_image(asset_id) for asset_id in df[title]]

def save_image(link, new_name):
  """Download `link` and store it locally under the path `new_name`."""
  urllib.request.urlretrieve(url=link, filename=new_name)

In [None]:
# Add a column holding the download link of each image
url = get_list_of_url_by_id(df_final, 'ML Catalog Number')
df_final = df_final.assign(ImageURL=url)

In [None]:
# Show the first rows, including the newly added column
df_final.head()

In [None]:
# Download every image into the working directory and build a DataFrame that
# maps each local image file name to its species.
registros_imagenes = {'Image': [], 'Scientific Name': []}
total_imagenes = len(df_final)
# Iterate the three columns in parallel instead of building a row Series with
# .iloc for every single field access.
columnas = zip(df_final['ML Catalog Number'],
               df_final['ImageURL'],
               df_final['Scientific Name'])
for i, (catalog_number, image_url, scientific_name) in enumerate(columnas):
  # Report progress every 100 images rather than printing one line per image
  if i % 100 == 0:
    print(f'{i}/{total_imagenes}')
  image_name_aux = f'{catalog_number}.jpg'
  save_image(image_url, image_name_aux)
  registros_imagenes['Image'].append(image_name_aux)
  registros_imagenes['Scientific Name'].append(scientific_name)
# df_images is only ever a DataFrame (the accumulator dict has its own name)
df_images = pd.DataFrame(data=registros_imagenes)

In [None]:
# Preview the image-file / species table
df_images.head()


### Ordenamiento de carpetas
Orden de carpetas para la mejor distribución entre entrenamiento, validación y test.

In [None]:
import os
import shutil

In [None]:
# Identify the classes (distinct species names) to evaluate
clases = df_images['Scientific Name'].unique()
clases

In [None]:
# Create the train, test and val folders, each with one sub-folder per bird class
for clase in clases:
  for split in ("train", "test", "val"):
    # exist_ok=True makes the cell safe to re-run without a separate existence check
    os.makedirs(os.path.join(".", split, clase), exist_ok=True)
    


In [None]:
# Distribute the images of every class over the train / val / test folders.
# n_train and n_val are 60% and 25% of the class size (rounded half up);
# whatever remains goes to test.
for clase in clases:
  df_clase = df_images[df_images['Scientific Name'] == clase]
  n = len(df_clase)
  n_train = int((n * 0.6) + 0.5)
  n_val = int((n * 0.25) + 0.5)

  for (i, im) in enumerate(df_clase['Image']):
    if i < n_train:
      split = "train"
    elif i < n_train + n_val:
      split = "val"
    else:
      split = "test"

    carpeta = os.path.join(".", split, clase)
    destino = os.path.join(carpeta, im)
    # The previous guard tested whether the class FOLDER was missing; since the
    # folders are created beforehand it was never true and no image was moved.
    # Test the destination FILE instead, and skip images already moved so the
    # cell can be re-run safely.
    if os.path.exists(im) and not os.path.exists(destino):
      os.makedirs(carpeta, exist_ok=True)
      shutil.move(im, destino)

In [None]:
# Convert the class names from object type to plain strings
clase2 = [str(nombre) for nombre in clases]
clases = clase2

In [None]:
def show(batch, pred_labels=None):
    """Plot up to four images of a batch in a 2x2 grid, labelled with class names.

    Args:
        batch: pair indexed as batch[0] (images) and batch[1] (labels); each
            label is converted with int() and used as an index into `clases`.
        pred_labels: optional predicted class indices; when given, the
            predicted class name is appended to each image label.
    """
    images, true_labels = batch[0], batch[1]
    # A short batch may hold fewer than four images
    n_show = min(4, len(images))
    plt.figure(figsize=(10,10))
    for i in range(n_show):
        plt.subplot(2,2,i+1)
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(images[i], cmap=plt.cm.binary)
        # Labels are numeric class ids; cast to int to index the class names
        lbl = clases[int(true_labels[i])]
        if pred_labels is not None:
            lbl += "/ Pred:" + clases[int(pred_labels[i])]
        plt.xlabel(lbl)
    plt.show()



In [None]:
#Importando libreria keras
from tensorflow.keras import layers
import tensorflow.keras as keras

### Realizando Red convolucional

In [None]:
# Baseline CNN: two Conv2D + MaxPool stages followed by a small dense head.
model = keras.models.Sequential()
model.add(layers.Conv2D(
    32,
    (3,3),
    strides=(1,1),
    padding="valid",
    activation='relu',
    input_shape = (256,256,3)
    ))
model.add(layers.MaxPool2D(2,2))
model.add(layers.Conv2D(
    64,
    (3,3),
    activation='relu',
    ))
model.add(layers.MaxPool2D(2,2))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
# No activation on the output layer: it emits one raw score (logit) per class
model.add(layers.Dense(num_clases))
# summary() prints the table itself; wrapping it in print() only added "None"
model.summary()

In [None]:
# Build the batch generators for train, validation and test
from tensorflow import keras

# Only the training generator applies augmentation (rotation, flip, zoom);
# all three multiply pixel values by 1/255.
train_gen = keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    rotation_range=10,
    horizontal_flip=True,
    zoom_range=0.2)
val_gen = keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
test_gen = keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

# Options shared by the three directory iterators
flow_options = dict(
    target_size=(256,256),
    class_mode='sparse',
    batch_size=32,
    color_mode='rgb',
    classes=clases,
)

train_batches = train_gen.flow_from_directory('./train', **flow_options)
val_batches = val_gen.flow_from_directory('./val', **flow_options)
test_batches = test_gen.flow_from_directory('./test', **flow_options)

In [None]:
# Preview images from the first training batch
show(train_batches[0])

In [None]:
# Training configuration for the CNN.
# The output Dense layer has no activation, so the model emits logits and the
# loss is created with from_logits=True; labels are integer class ids
# (class_mode='sparse'), hence the "sparse" loss and metric.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
optim = keras.optimizers.Adam(learning_rate=0.001)
metrics = ["sparse_categorical_accuracy"]
model.compile(optimizer=optim, loss=loss, metrics=metrics)

In [None]:

# Train for at most `epochs` epochs, stopping early once val_loss has gone
# 5 epochs without improving.
epochs = 10
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, verbose=2)

history = model.fit(
    train_batches,
    validation_data=val_batches,
    epochs=epochs,
    callbacks=[early_stopping],
)

In [None]:
# Plot loss and categorical accuracy per epoch, train vs. validation
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(16, 6))

ax_loss.plot(history.history['loss'], label='train loss')
ax_loss.plot(history.history['val_loss'], label='valid loss')
ax_loss.set(title='CNN - loss', xlabel='epoch', ylabel='loss')
ax_loss.grid()
ax_loss.legend(fontsize=15)

ax_acc.plot(history.history['sparse_categorical_accuracy'], label='train acc')
ax_acc.plot(history.history['val_sparse_categorical_accuracy'], label='valid acc')
ax_acc.set(title='CNN - accuracy', xlabel='epoch', ylabel='sparse categorical accuracy')
ax_acc.grid()
ax_acc.legend(fontsize=15);

In [None]:
# Evaluate the model on the test data
model.evaluate(test_batches, verbose=2)

In [None]:
# Predictions on the whole test set (optional)
predictions = model.predict(test_batches)
predictions = tf.nn.softmax(predictions)  # logits -> class probabilities
labels = np.argmax(predictions, axis=1)

# Compare true and predicted labels on one and the same batch.
# The test generator is created without shuffle=False, so by Keras' default it
# reshuffles between passes: test_batches[0] is not guaranteed to hold the
# images behind the first rows of `predictions`. Predicting directly on the
# fetched batch keeps both printed arrays aligned and of equal length.
batch_images, batch_true = test_batches[0]
batch_pred = np.argmax(model.predict(batch_images), axis=1)

print(batch_true)
print(batch_pred)

# TRANSFER LEARNING

#### VGG16

In [None]:
# Load the VGG16 model from Keras (default constructor arguments)
vgg_model = tf.keras.applications.vgg16.VGG16()
print(type(vgg_model))
vgg_model.summary()

In [None]:
# Create a Sequential model and add to it every VGG16 layer except the last one.
model = keras.models.Sequential()
for layer in vgg_model.layers[0:-1]:
    model.add(layer)

# Mark every layer added so far as non-trainable
for layer in model.layers:
    layer.trainable = False
    
# General information about the model
model.summary()

In [None]:
# Append a Dense output layer with num_clases units to the model.
# NOTE(review): re-running this cell appends one more Dense layer each time.
model.add(layers.Dense(num_clases))

In [None]:
# Configure the transfer-learning model.
# A fresh Adam instance is created instead of reusing `optim`: that optimizer
# object was already used to train the previous CNN, and recent Keras versions
# reject an optimizer whose state was built for another model's variables.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=loss,
    metrics=metrics)

In [None]:
# Use the pretrained network's own preprocessing function and its target_size

preprocess_input = tf.keras.applications.vgg16.preprocess_input

train_gen = keras.preprocessing.image.ImageDataGenerator(preprocessing_function=preprocess_input)
val_gen = keras.preprocessing.image.ImageDataGenerator(preprocessing_function=preprocess_input)
test_gen = keras.preprocessing.image.ImageDataGenerator(preprocessing_function=preprocess_input)

# Options shared by the three directory iterators
vgg_flow_options = dict(
    target_size=(224,224),
    class_mode='sparse',
    batch_size=32,
    color_mode='rgb',
    classes=clases,
)

train_batches = train_gen.flow_from_directory('./train', **vgg_flow_options)
val_batches = val_gen.flow_from_directory('./val', **vgg_flow_options)
test_batches = test_gen.flow_from_directory('./test', **vgg_flow_options)

In [None]:

# Train for at most `epochs` epochs, stopping early once val_loss has gone
# 5 epochs without improving.
epochs = 10
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, verbose=2)

history = model.fit(
    train_batches,
    validation_data=val_batches,
    epochs=epochs,
    callbacks=[early_stopping],
)

In [None]:
# Plot loss and categorical accuracy per epoch, train vs. validation
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(16, 6))

ax_loss.plot(history.history['loss'], label='train loss')
ax_loss.plot(history.history['val_loss'], label='valid loss')
ax_loss.set(title='VGG16 transfer learning - loss', xlabel='epoch', ylabel='loss')
ax_loss.grid()
ax_loss.legend(fontsize=15)

ax_acc.plot(history.history['sparse_categorical_accuracy'], label='train acc')
ax_acc.plot(history.history['val_sparse_categorical_accuracy'], label='valid acc')
ax_acc.set(title='VGG16 transfer learning - accuracy', xlabel='epoch', ylabel='sparse categorical accuracy')
ax_acc.grid()
ax_acc.legend(fontsize=15);

In [None]:
# Evaluate on the test data (optional)
model.evaluate(test_batches, verbose=2)