## 데이터 전처리 및 저장

In [2]:
!pip3 install tensorflow



In [3]:
import tensorflow as tf
import numpy as np
import pathlib

2024-04-06 09:50:51.116217: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### 데이터 다운로드

In [4]:
# Download and unpack the flower photo archive, then keep the resulting
# location as a pathlib.Path for the dataset loaders below.
dataset_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
data_dir = pathlib.Path(
    tf.keras.utils.get_file('flower_photos', origin=dataset_url, untar=True)
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz
[1m228813984/228813984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


### 데이터 전처리

In [5]:
batch_size = 32
img_height = 180
img_width = 180

# Keyword arguments common to both splits. The same seed and validation_split
# are passed to both calls so the two subsets come from one 80/20 partition.
split_options = dict(
    validation_split=0.2,
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

# Training data
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, subset="training", **split_options
)

# Validation data
val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, subset="validation", **split_options
)


Found 3670 files belonging to 5 classes.
Using 2936 files for training.


2024-04-06 09:51:25.774535: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Found 3670 files belonging to 5 classes.
Using 734 files for validation.


In [6]:
# Capture the label names from the freshly loaded dataset, before train_ds is
# rebound to the cache/shuffle/prefetch pipeline in the following cell. The
# list is used later to turn a predicted class index into a flower name.
class_names = train_ds.class_names
print(class_names)

['daisy', 'dandelion', 'roses', 'sunflowers', 'tulips']


In [7]:
# Let tf.data choose the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE

# Training pipeline: cache, shuffle with a 1000-element buffer, then prefetch.
train_ds = (
    train_ds
    .cache()
    .shuffle(1000)
    .prefetch(buffer_size=AUTOTUNE)
)

# Validation pipeline: cache and prefetch only (no shuffling).
val_ds = (
    val_ds
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

### 전처리한 데이터 저장 및 S3에 업로드

In [8]:
!pip3 install -U sagemaker



In [9]:
import os
import boto3
import sagemaker

# Session object used further down to upload the prepared data to S3.
sagemaker_session = sagemaker.Session()
# bucket_name = '[name of the bucket you created earlier]'
# NOTE(review): the bucket name is hardcoded; change it to your own bucket before running.
bucket_name = 'kmu-sagemaker-m2023807'
print(bucket_name)  

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
kmu-sagemaker-m2023807


In [10]:
# Save the datasets locally first; a later cell uploads these directories to S3.
dest = 'flower_model_train_data'
train_path = os.path.join(dest, 'train')
test_path = os.path.join(dest, 'val')  # holds the validation split

print(train_path)

# Create the output directory from Python instead of `!mkdir {dest}`:
# exist_ok=True keeps the cell re-runnable (mkdir reports an error when the
# directory already exists) and removes the dependency on a shell command.
os.makedirs(dest, exist_ok=True)

train_ds.save(train_path)
val_ds.save(test_path)

flower_model_train_data/train


2024-04-06 09:53:32.060473: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 12441600 exceeds 10% of free system memory.
2024-04-06 09:53:32.133228: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 12441600 exceeds 10% of free system memory.
2024-04-06 09:53:32.194217: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 12441600 exceeds 10% of free system memory.
2024-04-06 09:53:32.250631: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 12441600 exceeds 10% of free system memory.
2024-04-06 09:53:32.306757: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 12441600 exceeds 10% of free system memory.


In [11]:
# Push the locally saved datasets to the S3 bucket named above.
# The training split goes under data/train.
sagemaker_session.upload_data(path=train_path, bucket=bucket_name, key_prefix='data/train')
# The validation split (local 'val' directory) goes under data/test;
# the value of this last call is what the cell displays.
sagemaker_session.upload_data(path=test_path, bucket=bucket_name, key_prefix='data/test')

's3://kmu-sagemaker-m2023807/data/test'

In [12]:
# List the local output directory to confirm both saved splits are present.
!ls {dest}

train  val


### 모델학습

In [13]:
from sagemaker import get_execution_role
from sagemaker.tensorflow import TensorFlow

In [14]:
# IAM role that the notebook is running under.
role = get_execution_role()
print("execution role ARN:", role)

# S3 URI of the uploaded data. Built with an explicit '/' rather than
# os.path.join, which uses the OS path separator and would produce a
# backslash (an invalid S3 URI) on Windows.
training_data_uri = f's3://{bucket_name}/data'
print(training_data_uri)

execution role ARN: arn:aws:iam::381492271890:role/LabRole
s3://kmu-sagemaker-m2023807/data


In [15]:
# Training infrastructure
num_of_instance = 1               # number of instances used for training
instance_type = 'ml.m5.xlarge'    # instance type used for training (xlarge)

# Entry-point script handed to the estimator
training_script = 'flower_model_train.py'

# Hyperparameters passed to the training script; the image size reuses the
# values chosen during preprocessing.
training_parameters = dict(
    epochs=15,
    img_height=img_height,
    img_width=img_width,
)

# Training framework specs
tensorflow_version = '2.14'
python_version = 'py310'
# NOTE(review): parameter-server distribution is enabled while a single
# instance is requested above — confirm this is intended.
distributed_training_spec = {'parameter_server': {'enabled': True}}

In [16]:
# Collect the estimator settings defined in the configuration cell, then
# build the TensorFlow estimator that will run the training job.
estimator_options = dict(
    entry_point=training_script,
    role=role,
    instance_count=num_of_instance,
    instance_type=instance_type,
    framework_version=tensorflow_version,
    py_version=python_version,
    distribution=distributed_training_spec,
    hyperparameters=training_parameters,
)
flower_estimator = TensorFlow(**estimator_options)

In [17]:
# Launch the training job on the uploaded S3 data (takes about 15 minutes).
flower_estimator.fit(inputs=training_data_uri)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: tensorflow-training-2024-04-06-10-08-52-552


2024-04-06 10:08:54 Starting - Starting the training job...
2024-04-06 10:09:08 Starting - Preparing the instances for training...
2024-04-06 10:09:41 Downloading - Downloading input data......
2024-04-06 10:10:46 Training - Training image download completed. Training in progress..[34m2024-04-06 10:10:54.990562: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.[0m
[34mTo enable the following instructions: AVX512F, in other operations, rebuild TensorFlow with the appropriate compiler flags.[0m
[34m2024-04-06 10:10:57,242 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2024-04-06 10:10:57,243 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-04-06 10:10:57,244 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-04-06 10:10:5

### 모델 배포

In [18]:
# Names given to the SageMaker endpoint and model created by this deployment.
endpoint_name = 'ep-flower-classification'
model_name = 'flower-classification-model'

deploy_options = dict(
    initial_instance_count=1,
    instance_type="ml.m5.large",  # the serving instance type is large
    endpoint_name=endpoint_name,
    model_name=model_name,
)
predictor = flower_estimator.deploy(**deploy_options)

INFO:sagemaker.tensorflow.model:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating model with name: flower-classification-model
INFO:sagemaker:Creating endpoint-config with name ep-flower-classification
INFO:sagemaker:Creating endpoint with name ep-flower-classification


----!

In [19]:
# Print the endpoint and bucket names as quoted assignments so they can be
# copied into other code.
print(f'endpoint_name= "{endpoint_name}"')
print(f'bucket_name = "{bucket_name}"')

endpoint_name= "ep-flower-classification"
bucket_name = "kmu-sagemaker-m2023807"


### 추론 테스트

In [20]:
from sagemaker.tensorflow import TensorFlowPredictor

In [21]:
# Client for the deployed endpoint, looked up by its name.
flower_model = TensorFlowPredictor(endpoint_name=endpoint_name)

In [22]:
!wget https://storage.googleapis.com/download.tensorflow.org/example_images/592px-Red_sunflower.jpg
!ls

--2024-04-06 10:24:59--  https://storage.googleapis.com/download.tensorflow.org/example_images/592px-Red_sunflower.jpg
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.69.219, 142.251.33.123, 142.251.211.251, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.69.219|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 117948 (115K) [image/jpeg]
Saving to: ‘592px-Red_sunflower.jpg’


2024-04-06 10:24:59 (4.32 MB/s) - ‘592px-Red_sunflower.jpg’ saved [117948/117948]

592px-Red_sunflower.jpg  flower_model_train_deploy.ipynb  lost+found
flower_model_train_data  flower_model_train.py


In [23]:
from tensorflow.keras.preprocessing.image import img_to_array

sunflower_path = "592px-Red_sunflower.jpg"

# Load the sample image resized to the dimensions used during preprocessing.
img = tf.keras.utils.load_img(
    sunflower_path, target_size=(img_height, img_width)
)

img_array = img_to_array(img)
img_array = tf.expand_dims(img_array, 0)  # add a leading batch axis

# Convert the tensor to plain nested Python lists before sending the request.
img_array_numpy = img_array.numpy().tolist()

predictions = flower_model.predict(img_array_numpy)
deserialized_predictions = predictions['predictions']
# Softmax over the first (and only) row of the response.
# NOTE(review): presumably the endpoint returns raw logits — confirm against
# the training script.
score = tf.nn.softmax(deserialized_predictions[0])

# `end` is an argument of print(), not of str.format(); it was previously
# passed to format(), where it was silently ignored and no blank line was
# emitted after the result.
print("추론 결과: {}".format(score), end='\n\n')

print("-----------------", end='\n\n')

print("이 꽃의 이름은 {:.2f}% 확률로 {}일 것입니다.".format(100 * np.max(score), class_names[np.argmax(score)]))

추론 결과: [7.8482651e-07 1.5321527e-04 7.0023246e-04 9.9541950e-01 3.7262430e-03]
-----------------

이 꽃의 이름은 99.54% 확률로 sunflowers일 것입니다.


### 배포한 엔드포인트 삭제

In [24]:
# Delete the endpoint at the end of the demo; delete_endpoint_config=True
# removes the associated endpoint configuration as well.
# NOTE(review): the model created during deployment is not deleted here —
# confirm whether it should be removed too.
predictor.delete_endpoint(delete_endpoint_config=True)

INFO:sagemaker:Deleting endpoint configuration with name: ep-flower-classification
INFO:sagemaker:Deleting endpoint with name: ep-flower-classification
