## Importamos as bibliotecas

In [1]:
import numpy as np
import cv2
import os
from PIL import Image
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.vgg16 import preprocess_input
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.applications import VGG16
from tensorflow.keras.callbacks import LearningRateScheduler, ModelCheckpoint

## Baixamos o dataset

In [2]:

# Configuração para evitar logs desnecessários
tf.get_logger().setLevel('ERROR')

# Defina o caminho da pasta principal onde estão os dados
pasta = "/kaggle/input/the-algae-cell-images/algae_data_11"

# Verificar se a pasta foi extraída corretamente
if not os.path.exists(pasta):
    print("Erro: O dataset não foi extraído corretamente.")
else:
    print("Dataset extraído com sucesso!")

# Inicialize os arrays para armazenar os dados e os rótulos
dataset = []
label = []
SIZE = 224

# Obtenha a lista de categorias (subpastas)
categorias = [d for d in os.listdir(pasta) if os.path.isdir(os.path.join(pasta, d))]

# Iterar sobre cada categoria
for label_idx, categoria in enumerate(categorias):
    subpasta = os.path.join(pasta, categoria)
    imagens = [img for img in os.listdir(subpasta) if img.endswith('.jpg')]

    for image_name in imagens:
        img_path = os.path.join(subpasta, image_name)
        img = cv2.imread(img_path)

        if img is not None:
            img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            img = img.resize((SIZE, SIZE))
            dataset.append(np.array(img))
            label.append(label_idx)

print(f"Dataset criado com {len(dataset)} imagens e {len(label)} rótulos.")


Dataset extraído com sucesso!
Dataset criado com 6300 imagens e 6300 rótulos.


## Processamos os dados

In [3]:


# Pré-processamento dos dados
def preprocess_data(X, Y):
    X = X.astype('float32')
    X_p = preprocess_input(X)
    Y_p = to_categorical(Y, len(categorias))
    return X_p, Y_p

# Conversão para numpy arrays
dataset = np.array(dataset)
label = np.array(label)

# Divisão dos dados
X_train, X_temp, Y_train, Y_temp = train_test_split(dataset, label, 
                                                    test_size=0.4, 
                                                    random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, 
                                                Y_temp, 
                                                test_size=0.5, 
                                                random_state=42)

X_train_p, Y_train_p = preprocess_data(X_train, Y_train)
X_val_p, Y_val_p = preprocess_data(X_val, Y_val)
X_test_p, Y_test_p = preprocess_data(X_test, Y_test)


## Definimos o modelo

In [4]:

# Construção do modelo
base_model = VGG16(include_top=False, 
                   weights='imagenet', 
                   pooling='avg', 
                   input_shape=(224, 224, 3))

# Dicionário para armazenar os históricos de treinamento
history_dict = {}

model = Sequential([
    base_model,
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(len(categorias), activation='softmax')
])

# Callbacks
def decay(epoch):
    """Função para ajustar a taxa de aprendizado."""
    return 0.001 / (1 + 1 * 20)

# Lista de callbacks
callbacks = [
    LearningRateScheduler(decay, verbose=1),
    ModelCheckpoint('/kaggle/working/phytoplankton.keras', 
                    save_best_only=True, 
                    monitor='val_loss', 
                    mode='min')
]

# Compilação do modelo
model.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


## Treinamos o modelo

In [5]:

# Treinamento
for batch_size in [16, 32, 64]:
    history = model.fit(X_train_p, 
                        Y_train_p, 
                        batch_size=batch_size, 
                        validation_data=(X_val_p, Y_val_p), 
                        epochs=100, 
                        shuffle=True, 
                        callbacks=callbacks,
                        verbose=1)

# Armazena o histórico de treinamento no dicionário
history_dict[batch_size] = history


Epoch 1: LearningRateScheduler setting learning rate to 4.761904761904762e-05.
Epoch 1/100
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 402ms/step - accuracy: 0.6195 - loss: 1.6039 - val_accuracy: 0.7437 - val_loss: 0.8118 - learning_rate: 4.7619e-05

Epoch 2: LearningRateScheduler setting learning rate to 4.761904761904762e-05.
Epoch 2/100
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 302ms/step - accuracy: 0.7771 - loss: 0.7161 - val_accuracy: 0.7905 - val_loss: 0.7353 - learning_rate: 4.7619e-05

Epoch 3: LearningRateScheduler setting learning rate to 4.761904761904762e-05.
Epoch 3/100
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 310ms/step - accuracy: 0.8437 - loss: 0.4830 - val_accuracy: 0.8325 - val_loss: 0.6747 - learning_rate: 4.7619e-05

Epoch 4: LearningRateScheduler setting learning rate to 4.761904761904762e-05.
Epoch 4/100
[1m237/237[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 324ms/step - accur

## Visualizamos o comportamento do modelo

In [1]:
import matplotlib.pyplot as plt

# Encontra o batch_size com a melhor acurácia de validação
best_batch_size = max(history_dict, key=lambda k: max(history_dict[k].history['val_accuracy']))

# Plota os gráficos apenas para o melhor batch_size
best_history = history_dict[best_batch_size]

plt.figure(figsize=(14, 5))

# Gráfico de acurácia
plt.subplot(1, 2, 1)
plt.plot(best_history.history['accuracy'], label='Train Accuracy')
plt.plot(best_history.history['val_accuracy'], label='Validation Accuracy')
plt.title(f'Acurácia (batch_size={best_batch_size})')
plt.xlabel('Época')
plt.ylabel('Acurácia')
plt.legend()

# Gráfico de perda
plt.subplot(1, 2, 2)
plt.plot(best_history.history['loss'], label='Train Loss')
plt.plot(best_history.history['val_loss'], label='Validation Loss')
plt.title(f'Perda (batch_size={best_batch_size})')
plt.xlabel('Época')
plt.ylabel('Perda')
plt.legend()

plt.tight_layout()
plt.show()

NameError: name 'history_dict' is not defined

In [10]:
# Salva o modelo manualmente (opcional)
model.save('/kaggle/working/phytoplankton_bkp.keras')

## Avaliamos o melhor modelo com os dados de teste

In [6]:
from tensorflow.keras.models import load_model

# Carregue o modelo salvo
# model = load_model('phytoplankton.keras')

# Avalia o modelo no conjunto de teste
score = model.evaluate(X_test_p, Y_test_p)
print(f"Resultados para batch_size = {batch_size}:")
print(f"Test loss: {score[0]}")
print(f"Test accuracy: {score[1]}")



[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 148ms/step - accuracy: 0.9331 - loss: 1.1825
Resultados para batch_size = 64:
Test loss: 1.1463239192962646
Test accuracy: 0.9309523701667786


In [8]:
model.summary()

In [9]:
# Gere as previsões do modelo
y_pred = model.predict(X_test_p)

# Converta as previsões de probabilidades para classes
y_pred_classes = np.argmax(y_pred, axis=1)

# Converta os rótulos verdadeiros de one-hot encoding para classes
y_true = np.argmax(Y_test_p, axis=1)

# Gere o relatório de classificação
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred_classes, target_names=categorias))

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 132ms/step
               precision    recall  f1-score   support

     nontoxic       0.95      0.99      0.97       932
    Noctiluca       0.94      1.00      0.97        34
     Anabaena       0.93      0.68      0.78        40
 Prorocentrum       0.90      0.76      0.83        25
      Karenia       0.88      0.96      0.92        24
       Nostoc       0.75      0.62      0.68        29
    Nodularia       0.76      0.70      0.73        27
 Oscillatoria       0.85      0.71      0.77        24
Aphanizomenon       0.77      0.79      0.78        29
  Microcystis       0.94      0.83      0.88        35
  Gymnodinium       0.90      0.53      0.67        34
  Skeletonema       0.92      0.85      0.88        27

     accuracy                           0.93      1260
    macro avg       0.87      0.78      0.82      1260
 weighted avg       0.93      0.93      0.93      1260

