In [29]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import requests
import os
from sklearn.model_selection import train_test_split
from io import BytesIO
from PIL import Image
from imblearn.over_sampling import SMOTE
import math

IMG_SIZE = (224, 224)
BATCH_SIZE = 16
DATASET_PATH = "../DataSets/cats_vs_dogs/"

def cargar_imagen(row):
    """Carga una imagen desde un archivo local o una URL."""
    print(row['filename'], row['urlAbsoluta'], row['urlAbsoluta'].astype(str).strip()) 
    if len(row['urlAbsoluta'].astype(str).strip()) > 0 and not math.isnan(row['urlAbsoluta']):
        try:
            response = requests.get(row['urlAbsoluta'], stream=True, timeout=10)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content)).convert('RGB')
        except requests.exceptions.RequestException as e:
            print(f"Error al descargar la imagen {row['urlAbsoluta']}: {e}")
            return None
        except Exception as e:
            print(f"Error al abrir la imagen descargada {row['urlAbsoluta']}: {e}")
            return None
    elif row['filename']:
        prefix = ''
        if "nodogs" in row['filename']:
            prefix = 'no_dogs'
        else:
            prefix = 'dogs'

        directory = os.path.join(DATASET_PATH, prefix)
        print(directory, '-->', row['filename']) 
        filepath = os.path.join(directory, row['filename'])
        try:
            img = Image.open(filepath).convert('RGB')
        except FileNotFoundError:
            print(f"Archivo no encontrado: {filepath}")
            return None
        except Exception as e:
            print(f"Error al abrir la imagen {filepath}: {e}")
            return None
    else:
        print("Ni urlAbsoluta ni filename proporcionados.")
        return None
    return img.resize(IMG_SIZE)

def preparar_datos(csv_path):
    """Prepara los datos para el entrenamiento."""
    chunksize = 1000  # Leer 1000 filas a la vez
    chunks = []
    #df = pd.read_csv(csv_path)
    for chunk in pd.read_csv(csv_path, chunksize=chunksize):
        chunks.append(chunk)
    df = pd.concat(chunks)
    print(f"Longitud del DataFrame: {len(df)}")
    
    # Usar las columnas 'perro' y 'gato' como etiquetas
    etiquetas_unicas = ['perro', 'gato']  # Definir las etiquetas directamente
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    print('train_df=',len(train_df), ' val_df=',len(val_df))

    datagen = ImageDataGenerator(
        preprocessing_function=tf.keras.applications.mobilenet_v3.preprocess_input,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True
    )

    def balanced_batch_generator_smote(df, datagen, batch_size, etiquetas_unicas):
        """Genera batches balanceados con SMOTE para clasificación multi-label."""
        smote = SMOTE(random_state=42)
        while True:
            image_batch = []
            label_batch = []
            indices = np.random.choice(df.index, size=batch_size, replace=False)
            print('len(indices)', len(indices))
            for index in indices:
                if index >= len(df):  # Verificar si el índice está fuera de los límites
                    print(f"Error: Índice {index} fuera de los límites. Longitud del DataFrame: {len(df)}")
                    continue  # Saltar a la siguiente iteración
                row = df.iloc[index]
                img = cargar_imagen(row)
                if img is not None:
                    img_array = tf.keras.preprocessing.image.img_to_array(img)
                    image_batch.append(img_array)
                    labels = row[etiquetas_unicas].values.astype(np.float32)
                    label_batch.append(labels)
            image_batch = np.array(image_batch)
            label_batch = np.array(label_batch)

            # Aplicar SMOTE a cada etiqueta por separado
            original_shape = image_batch.shape
            if image_batch.shape[0] > 0: #Verificar que el batch tenga datos antes de aplicar smote
                for i in range(label_batch.shape[1]):
                    X_resampled, y_resampled = smote.fit_resample(image_batch.reshape(image_batch.shape[0], -1), label_batch[:, i])
                    if i == 0:
                        X_resampled_all = X_resampled
                        y_resampled_all = y_resampled.reshape(-1,1)
                    else:
                        y_resampled_all = np.concatenate((y_resampled_all, y_resampled.reshape(-1,1)), axis = 1)
                
                image_batch_resampled = X_resampled_all.reshape(-1, IMG_SIZE[0], IMG_SIZE[1], 3)
                label_batch_resampled = y_resampled_all
                image_batch = np.concatenate((image_batch, image_batch_resampled))
                label_batch = np.concatenate((label_batch, label_batch_resampled))
            
            image_batch = datagen.flow(image_batch, batch_size=batch_size*2 if original_shape[0] > 0 else batch_size, shuffle=True).next()
            yield image_batch, label_batch

    train_generator = balanced_batch_generator_smote(train_df, datagen, BATCH_SIZE, etiquetas_unicas)
    validation_generator = balanced_batch_generator_smote(val_df, datagen, BATCH_SIZE, etiquetas_unicas) #No aplicar smote en validacion

    return train_generator, validation_generator, etiquetas_unicas, len(train_df), len(val_df)

# Ejemplo de uso:
csv_file = './dogs-no-dogs.csv'
train_generator, validation_generator, etiquetas_unicas, train_len, val_len = preparar_datos(csv_file)

from tensorflow.keras.applications import MobileNetV3Large
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model

base_model = MobileNetV3Large(weights='imagenet', include_top=False, input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3))
base_model.trainable = False

x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(128, activation='relu')(x)
predictions = Dense(len(etiquetas_unicas), activation='sigmoid')(x)

model = Model(inputs=base_model.input, outputs=predictions)
model.compile(optimizer='adam', loss='BinaryCrossentropy', metrics=['binary_accuracy'])

model.fit(
    train_generator,
    steps_per_epoch=train_len*2 // BATCH_SIZE, #Se multiplica por dos el total de datos porque se duplican en cada batch
    epochs=10,
    validation_data=validation_generator,
    validation_steps=val_len // BATCH_SIZE
)

Longitud del DataFrame: 25000
train_df= 20000  val_df= 5000
len(indices) 16
10773.jpg nan nan
../DataSets/cats_vs_dogs/dogs --> 10773.jpg
Error: Índice 20417 fuera de los límites. Longitud del DataFrame: 20000
4246.jpg nan nan
../DataSets/cats_vs_dogs/dogs --> 4246.jpg
661_nodogs.jpg nan nan
../DataSets/cats_vs_dogs/no_dogs --> 661_nodogs.jpg
10144.jpg nan nan
../DataSets/cats_vs_dogs/dogs --> 10144.jpg
1080.jpg nan nan
../DataSets/cats_vs_dogs/dogs --> 1080.jpg
Error: Índice 23770 fuera de los límites. Longitud del DataFrame: 20000
Error: Índice 20334 fuera de los límites. Longitud del DataFrame: 20000
4739_nodogs.jpg nan nan
../DataSets/cats_vs_dogs/no_dogs --> 4739_nodogs.jpg
11558.jpg nan nan
../DataSets/cats_vs_dogs/dogs --> 11558.jpg
6135_nodogs.jpg nan nan
../DataSets/cats_vs_dogs/no_dogs --> 6135_nodogs.jpg
6140_nodogs.jpg nan nan
../DataSets/cats_vs_dogs/no_dogs --> 6140_nodogs.jpg
6549_nodogs.jpg nan nan
../DataSets/cats_vs_dogs/no_dogs --> 6549_nodogs.jpg
6083.jpg nan nan
..

AttributeError: 'NumpyArrayIterator' object has no attribute 'next'