## 하이퍼파라미터 튜닝

라이브러리

In [None]:
# 데이터 처리 및 분석
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', None)

# 머신러닝
from sklearn.model_selection import train_test_split
from tensorflow.keras.datasets import cifar10
import numpy as np
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report

# AWS 관련
import sagemaker
from sagemaker.utils import name_from_base
from sagemaker.tensorflow import TensorFlow
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter, CategoricalParameter
import boto3
import awswrangler as wr

# 기타 유틸리티
import os
from dotenv import load_dotenv
load_dotenv()

True

SageMaker 세션 및 역할 설정

In [4]:
# Build the boto3 and SageMaker sessions from the named local AWS profile.
boto3_session = boto3.Session(profile_name='awstutor')
sagemaker_session = sagemaker.Session(boto_session=boto3_session)

# The execution role ARN is read from the environment (load_dotenv() ran in the imports cell).
# Fail fast here: a missing value would otherwise be passed along as None and only surface
# later, when the estimator/tuner tries to start a job.
role = os.environ.get('SAGEMAKER_EXECUTION_ROLE_ARN')
if not role:
    raise EnvironmentError(
        'SAGEMAKER_EXECUTION_ROLE_ARN is not set; define it in the environment or in the .env file'
    )

S3 데이터 저장 위치 설정

In [5]:
# Bucket and project prefix shared by every S3 location below
bucket_name = 'dante-sagemaker'
project_name = 'cifar10'

# Input prefixes, one per dataset split
s3_training_file_location = f's3://{bucket_name}/{project_name}/input/training/'
s3_validation_file_location = f's3://{bucket_name}/{project_name}/input/validation/'

# Prefixes for job output and checkpoints
s3_output_location = f's3://{bucket_name}/{project_name}/output/'
s3_checkpoint_location = f's3://{bucket_name}/{project_name}/checkpoint/'

# Echo the resolved locations so they can be checked at a glance
print('s3_training_file_location : ', s3_training_file_location)
print('s3_validation_file_location : ', s3_validation_file_location)
print('s3_output_location : ', s3_output_location)
print('s3_checkpoint_location : ', s3_checkpoint_location)

s3_training_file_location :  s3://dante-sagemaker/cifar10/input/training/
s3_validation_file_location :  s3://dante-sagemaker/cifar10/input/validation/
s3_output_location :  s3://dante-sagemaker/cifar10/output/
s3_checkpoint_location :  s3://dante-sagemaker/cifar10/checkpoint/


데이터 다운로드 및 S3 업로드

In [9]:
# Load the CIFAR-10 train/test splits
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Preprocess: convert images to float32 and divide by 255; one-hot encode the labels (10 classes)
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

y_train = tf.keras.utils.to_categorical(y_train, 10)
y_test = tf.keras.utils.to_categorical(y_test, 10)

# Local staging directory for the .npy files
local_dataset_dir = f'dataset/{project_name}'
os.makedirs(local_dataset_dir, exist_ok=True)

# S3 URIs always use '/' as separator, so they are built by plain concatenation instead of
# os.path.join (whose separator is platform-dependent). Both prefixes already end with '/'.
s3_train_features = f'{s3_training_file_location}x_train.npy'
s3_train_labels = f'{s3_training_file_location}y_train.npy'
s3_validation_features = f'{s3_validation_file_location}x_test.npy'
s3_validation_labels = f'{s3_validation_file_location}y_test.npy'

# (file name, array, destination URI) for every artifact; each is saved locally, then uploaded
dataset_files = [
    ('x_train.npy', x_train, s3_train_features),
    ('y_train.npy', y_train, s3_train_labels),
    ('x_test.npy', x_test, s3_validation_features),
    ('y_test.npy', y_test, s3_validation_labels),
]
for file_name, array, s3_uri in dataset_files:
    local_path = f'{local_dataset_dir}/{file_name}'
    np.save(local_path, array)
    wr.s3.upload(local_path, s3_uri, boto3_session=boto3_session)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
170498071/170498071 ━━━━━━━━━━━━━━━━━━━━ 49s 0us/step


훈련 스크립트 작성

In [11]:
# Make sure the folder that will receive the training script exists (no error if it already does)
os.makedirs('scripts/{}'.format(project_name), exist_ok=True)

In [34]:
%%writefile scripts/cifar10/train.py

import tensorflow as tf
import argparse
import os
import numpy as np

def load_data():
    """Load the train/test NumPy arrays from the job's input channel directories.

    The channel directories are taken from the SM_CHANNEL_TRAIN and
    SM_CHANNEL_TEST environment variables when they are set, and fall back to
    the fixed /opt/ml/input/data/train and /opt/ml/input/data/test paths
    otherwise. Using the variables also lets the script run outside the
    training container by pointing them at local folders.

    Returns:
        A pair of pairs ``((x_train, y_train), (x_test, y_test))`` holding the
        arrays read from x_train.npy, y_train.npy, x_test.npy and y_test.npy.
    """
    train_dir = os.environ.get('SM_CHANNEL_TRAIN', '/opt/ml/input/data/train')
    test_dir = os.environ.get('SM_CHANNEL_TEST', '/opt/ml/input/data/test')

    x_train = np.load(os.path.join(train_dir, 'x_train.npy'))
    y_train = np.load(os.path.join(train_dir, 'y_train.npy'))
    x_test = np.load(os.path.join(test_dir, 'x_test.npy'))
    y_test = np.load(os.path.join(test_dir, 'y_test.npy'))
    return (x_train, y_train), (x_test, y_test)

def create_model():
    """Build the (uncompiled) Keras CNN used for training.

    The network has two convolutional stages (32 filters, then 64); each stage
    is two 3x3 ReLU convolutions followed by 2x2 max-pooling and 25% dropout.
    The classifier head flattens the features and applies a 512-unit ReLU dense
    layer, 50% dropout and a 10-unit softmax output. The first layer declares
    an input shape of (32, 32, 3).

    Returns:
        A ``tf.keras.models.Sequential`` instance.
    """
    layers = tf.keras.layers
    network = tf.keras.models.Sequential()

    # Stage 1: 32 filters
    network.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))
    network.add(layers.Conv2D(32, (3, 3), activation='relu'))
    network.add(layers.MaxPooling2D(pool_size=(2, 2)))
    network.add(layers.Dropout(0.25))

    # Stage 2: 64 filters
    network.add(layers.Conv2D(64, (3, 3), activation='relu'))
    network.add(layers.Conv2D(64, (3, 3), activation='relu'))
    network.add(layers.MaxPooling2D(pool_size=(2, 2)))
    network.add(layers.Dropout(0.25))

    # Classifier head
    network.add(layers.Flatten())
    network.add(layers.Dense(512, activation='relu'))
    network.add(layers.Dropout(0.5))
    network.add(layers.Dense(10, activation='softmax'))

    return network

def model(x_train, y_train, x_test, y_test, batch_size, epochs, learning_rate, model_dir):
    """Compile, train and export the CNN.

    Args:
        x_train, y_train: training features and labels passed to ``fit``.
        x_test, y_test: data passed to ``fit`` as ``validation_data``; this is
            what makes Keras report ``val_accuracy`` in the training log.
        batch_size: mini-batch size.
        epochs: number of training epochs.
        learning_rate: learning rate for the Adam optimizer.
        model_dir: directory under which the trained model is exported.

    The model is saved to ``<model_dir>/1``.
    NOTE(review): the numeric sub-directory is there because the SageMaker
    TensorFlow Serving container is documented to look for
    ``<version>/saved_model.pb`` inside the model artifact - confirm against
    the container version in use.
    """
    # Local name differs from the function name to avoid shadowing it.
    net = create_model()
    net.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                loss='categorical_crossentropy',
                metrics=['accuracy'])

    net.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))

    # Export the trained model under a numeric version directory
    net.save(os.path.join(model_dir, '1'))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--learning-rate', type=float, default=0.001)
    # NOTE(review): the SageMaker TensorFlow estimator is documented to pass --model_dir
    # (typically an S3 URI). It is still accepted so parsing succeeds, but the export goes
    # to the local directory below so the model ends up in the job's model artifact.
    parser.add_argument('--model_dir', type=str, default='/opt/ml/model')
    parser.add_argument('--sm-model-dir', type=str,
                        default=os.environ.get('SM_MODEL_DIR', '/opt/ml/model'))
    # parse_known_args: tolerate extra arguments injected by the platform instead of aborting
    args, _unknown_args = parser.parse_known_args()

    (x_train, y_train), (x_test, y_test) = load_data()

    model(x_train, y_train, x_test, y_test, args.batch_size, args.epochs, args.learning_rate, args.sm_model_dir)

Overwriting scripts/cifar10/train.py


TensorFlow Estimator 생성

In [28]:
# Estimator describing how a single training job runs the script written above
tf_estimator = TensorFlow(
    # What to run
    entry_point=f'scripts/{project_name}/train.py',
    framework_version='2.4.1',
    py_version='py37',
    script_mode=True,
    hyperparameters={'epochs': 20},  # passed to train.py as --epochs
    # Where to run it
    instance_type='ml.m5.xlarge',
    instance_count=1,
    # Credentials / session
    role=role,
    sagemaker_session=sagemaker_session,
)

하이퍼파라미터 튜닝 설정

In [29]:
# Objective: maximise the validation accuracy
objective_metric_name = 'val_accuracy'
objective_type = 'Maximize'

# Regex used to pull the validation accuracy out of the training log
metric_definitions = [
    {
        'Name': 'val_accuracy',
        'Regex': 'val_accuracy: ([0-9\\.]+)',
    }
]

# Search space: keys match the command-line flags declared in train.py
hyperparameter_ranges = {
    # batch size: integer between 32 and 128
    'batch-size': IntegerParameter(32, 128),
    # learning rate: continuous value between 0.0001 and 0.01
    'learning-rate': ContinuousParameter(0.0001, 0.01),
}

# Tuner: up to 20 training jobs in total, at most 3 running at once
tuner = HyperparameterTuner(
    estimator=tf_estimator,
    objective_metric_name=objective_metric_name,
    objective_type=objective_type,
    metric_definitions=metric_definitions,
    hyperparameter_ranges=hyperparameter_ranges,
    strategy='Bayesian',  # Bayesian optimisation (alternative: Random)
    max_jobs=20,
    max_parallel_jobs=3,
    base_tuning_job_name=f'{project_name}-tuner',
)

In [30]:
# Start the tuning job. The channel names 'train' and 'test' correspond to the
# /opt/ml/input/data/train and /opt/ml/input/data/test directories that train.py reads.
tuner.fit(
    inputs={
        'train': s3_training_file_location,
        'test': s3_validation_file_location,
    }
)
# Block until the tuning job has finished
tuner.wait()

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

최적 하이퍼파라미터로 모델 엔드포인트 생성

In [None]:
# Attach to the training job that achieved the best objective value in the tuning run
best_estimator = tuner.best_estimator()

# Deploy that job's trained model directly. Calling fit() again here would launch another
# full training job with the same hyperparameters before anything could be deployed.
predictor = best_estimator.deploy(initial_instance_count=1, instance_type='ml.m5.large')

모델 평가

In [None]:
# y_test was one-hot encoded during preprocessing; the metrics below need integer class ids
y_true = np.argmax(y_test, axis=1)

# Query the endpoint in small batches instead of sending the whole test set in one request.
# The images are sent with their existing (N, 32, 32, 3) shape, which is what the model expects.
EVAL_BATCH_SIZE = 50
batch_outputs = []
for start in range(0, len(x_test), EVAL_BATCH_SIZE):
    response = predictor.predict(x_test[start:start + EVAL_BATCH_SIZE])
    # NOTE(review): assumes the TensorFlow Serving response layout {'predictions': [[...], ...]}
    batch_outputs.append(np.asarray(response['predictions']))

# Predicted class = index of the highest softmax probability
y_pred = np.argmax(np.concatenate(batch_outputs), axis=1)

# Accuracy
accuracy = np.mean(y_pred == y_true)
print(f'테스트 세트 정확도: {accuracy:.4f}')

# Confusion matrix
confusion_mtx = confusion_matrix(y_true, y_pred)
print('혼동 행렬:')
print(confusion_mtx)

# Classification report
print('\n분류 보고서:')
print(classification_report(y_true, y_pred))

엔드포인트 삭제

In [None]:
# Delete the endpoint created by deploy() now that evaluation is finished
predictor.delete_endpoint()