In [1]:
# 필요한 모듈 임포트
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator  # 수정 부분
from tensorflow.keras import layers, models
from tensorflow.keras.applications import EfficientNetV2L
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Training callbacks, passed to model.fit().
# Stop after 5 epochs without val_loss improvement and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Multiply the learning rate by 0.2 after 3 stagnant epochs, never going below 1e-6.
lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6)
# Write only the best model seen so far; the file uses the .keras extension.
best_checkpoint = ModelCheckpoint("best_model.keras", save_best_only=True)

callbacks = [early_stopping, lr_on_plateau, best_checkpoint]


import os

# GPU configuration: request on-demand memory growth on every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # Looping over an empty list is a no-op, so no explicit "any GPUs?" guard is needed.
    for device in gpus:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as err:
    # Report the failure and carry on rather than aborting the notebook.
    print(err)

# Dataset locations
train_dir = "/home/ec2-user/SageMaker/Original_data_for_model/train"
val_dir = "/home/ec2-user/SageMaker/Original_data_for_model/val"

# Loader settings shared by the training and validation generators
IMAGE_SIZE = (224, 224)  # must match the input_shape the model is built with
BATCH_SIZE = 32

# Augmentation / preprocessing.
# NOTE: pixel values are deliberately NOT rescaled to [0, 1] here. The Keras
# EfficientNetV2 application models include their own input preprocessing by
# default (include_preprocessing=True) and expect raw pixel values in [0, 255].
# Applying rescale=1/255 in the generator as well would scale the inputs twice
# and feed the pretrained backbone almost-constant images.
train_datagen = ImageDataGenerator(
    rotation_range=20,       # random rotation
    width_shift_range=0.2,   # random horizontal shift
    height_shift_range=0.2,  # random vertical shift
    shear_range=0.2,         # shear transform
    zoom_range=0.2,          # zoom in/out
    horizontal_flip=True     # random horizontal flip
)

# Validation images are passed through unmodified (no augmentation, no rescaling).
val_datagen = ImageDataGenerator()

# Load the data: one sub-directory per class, one-hot encoded labels
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical'  # multi-class classification
)

val_generator = val_datagen.flow_from_directory(
    val_dir,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical'
)


2024-11-28 10:23:56.023392: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-28 10:23:56.047172: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732789436.081894   28412 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732789436.092725   28412 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-28 10:23:56.129388: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Found 612 images belonging to 2 classes.
Found 165 images belonging to 2 classes.


In [2]:
# Build the EfficientNetV2-L transfer-learning classifier.
# NOTE(review): with its default settings the Keras EfficientNetV2 application
# rescales inputs inside the model and expects raw pixels in [0, 255] - confirm
# the input generators are not rescaling the images as well.
base_model = EfficientNetV2L(
    include_top=False,          # leave out the ImageNet classification head
    weights='imagenet',         # start from ImageNet-pretrained weights
    input_shape=(224, 224, 3),  # input size
)
base_model.trainable = False    # keep the pretrained weights frozen

# One output unit per class discovered by the training generator.
num_classes = len(train_generator.class_indices)

classifier_head = [
    layers.GlobalAveragePooling2D(),
    layers.Dropout(0.5),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_classes, activation='softmax'),
]
model = models.Sequential([base_model, *classifier_head])

# Compile the model
adam = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])


I0000 00:00:1732789443.025379   28412 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8608 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:1e.0, compute capability: 7.5


In [3]:
# Train the model
EPOCHS = 20  # change to the desired number of epochs
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=EPOCHS,
    callbacks=callbacks,
)

# Evaluate on the validation set once training has finished
val_loss, val_acc = model.evaluate(val_generator)
print(f"Validation Loss: {val_loss}", f"Validation Accuracy: {val_acc}", sep="\n")


  self._warn_if_super_not_called()


Epoch 1/20


I0000 00:00:1732789545.212220   28832 service.cc:148] XLA service 0x7f5844014c10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732789545.212457   28832 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2024-11-28 10:25:48.664516: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
E0000 00:00:1732789560.837252   28832 cuda_dnn.cc:522] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9.3.0.  CuDNN library needs to have matching major version and equal or higher minor version. If using a binary install, upgrade your CuDNN library.  If building from sources, make sure the library loaded at runtime is compatible with the version specified during compile configuration.
E0000 00:00:1732789575.185986   28832 cuda_dnn.cc:522] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9

FailedPreconditionError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/runpy.py", line 86, in _run_code

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_28412/2987484970.py", line 2, in <module>

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 368, in fit

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 216, in function

  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 129, in multi_step_on_iterator

DNN library initialization failed. Look at the errors above for more details.
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_multi_step_on_iterator_70183]