## 배치 트랜스포머 - 분류 모델 훈련

라이브러리 준비

In [1]:
!pip install ucimlrepo



In [121]:
import warnings
warnings.filterwarnings('ignore')

In [122]:
# Plotting stack: plotly (express, graph objects, subplots), chart_studio and cufflinks.
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import chart_studio.plotly as py
import cufflinks as cf
# NOTE(review): presumably switches cufflinks/plotly to offline (in-notebook)
# rendering — confirm the meaning of `connected=True` against the cufflinks docs.
cf.go_offline(connected=True)

In [123]:
import numpy as np
import pandas as pd
# Display settings: no limit on columns, column width or total width, and up to
# 500 rows. Each option is set exactly once.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
import awswrangler as wr

In [124]:
import os
import sagemaker
import boto3
from dotenv import load_dotenv
# Load environment variables through python-dotenv (presumably from a local
# .env file — confirm its location). SAGEMAKER_EXECUTION_ROLE_ARN is read from
# the environment further down in this notebook.
load_dotenv()

True

SageMaker 세션 및 역할 설정

In [125]:
# boto3 session bound to the named AWS profile; the SageMaker session is built
# on top of it so both share the same credentials.
boto3_session = boto3.Session(profile_name='awstutor')
sagemaker_session = sagemaker.Session(boto_session=boto3_session)

# Execution role ARN taken from the environment (None when the variable is unset).
role = os.getenv('SAGEMAKER_EXECUTION_ROLE_ARN')

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


데이터셋 다운로드

In [126]:
from ucimlrepo import fetch_ucirepo

# UCI ML Repository id of the Mushroom dataset.
UCI_MUSHROOM_ID = 73

# Fetch the dataset; features and targets are exposed as pandas DataFrames.
mushroom = fetch_ucirepo(id=UCI_MUSHROOM_ID)
X, y = mushroom.data.features, mushroom.data.targets

데이터 확인

In [127]:
# Display the dataset-level metadata (name, task, instance/feature counts, missing-value info).
mushroom.metadata

{'uci_id': 73,
 'name': 'Mushroom',
 'repository_url': 'https://archive.ics.uci.edu/dataset/73/mushroom',
 'data_url': 'https://archive.ics.uci.edu/static/public/73/data.csv',
 'abstract': 'From Audobon Society Field Guide; mushrooms described in terms of physical characteristics; classification: poisonous or edible',
 'area': 'Biology',
 'tasks': ['Classification'],
 'characteristics': ['Multivariate'],
 'num_instances': 8124,
 'num_features': 22,
 'feature_types': ['Categorical'],
 'demographics': [],
 'target_col': ['poisonous'],
 'index_col': None,
 'has_missing_values': 'yes',
 'missing_values_symbol': 'NaN',
 'year_of_dataset_creation': 1981,
 'last_updated': 'Thu Aug 10 2023',
 'dataset_doi': '10.24432/C5959T',
 'creators': [],
 'intro_paper': None,
 'additional_info': {'summary': "This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family (pp. 500-525).  Each species is identified as definitely 

In [128]:
# Display the per-variable table: name, role, type, description and missing-value flag.
mushroom.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,poisonous,Target,Categorical,,,,no
1,cap-shape,Feature,Categorical,,"bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s",,no
2,cap-surface,Feature,Categorical,,"fibrous=f,grooves=g,scaly=y,smooth=s",,no
3,cap-color,Feature,Binary,,"brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y",,no
4,bruises,Feature,Categorical,,"bruises=t,no=f",,no
5,odor,Feature,Categorical,,"almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s",,no
6,gill-attachment,Feature,Categorical,,"attached=a,descending=d,free=f,notched=n",,no
7,gill-spacing,Feature,Categorical,,"close=c,crowded=w,distant=d",,no
8,gill-size,Feature,Categorical,,"broad=b,narrow=n",,no
9,gill-color,Feature,Categorical,,"black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y",,no


In [129]:
# Summary of the target column (count, number of classes, most frequent class).
y.describe()

Unnamed: 0,poisonous
count,8124
unique,2
top,e
freq,4208


In [130]:
# Summary of every feature column (count / unique / top / freq).
X.describe()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,5644,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,6,4,10,2,9,2,2,2,12,2,4,4,4,9,9,1,4,3,5,9,6,7
top,x,y,n,f,n,f,c,b,b,t,b,s,s,w,w,p,w,o,p,w,v,d
freq,3656,3244,2284,4748,3528,7914,6812,5612,1728,4608,3776,5176,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


데이터 전처리

In [131]:
# Number of missing values per feature column.
X.isna().sum()

cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  2480
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
dtype: int64

In [132]:
# Replace the missing 'stalk-root' values with an explicit 'Unknown' category
# instead of dropping the affected rows.
# NOTE(review): this assigns into the frame obtained from mushroom.data.features in place.
X['stalk-root'] = X['stalk-root'].fillna('Unknown')

In [133]:
# One frame with the target ('poisonous') first, followed by the feature columns.
df = pd.concat((y, X), axis='columns')

In [134]:
# Split the data into train, validation and test sets.
from sklearn.model_selection import train_test_split

# Seed shared by both splits so the partition is reproducible.
SPLIT_SEED = 2024

# Step 1: hold out 20% of all rows as the test set, stratified on the label.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=df['poisonous'],
)

# Step 2: take 25% of the remainder as the validation set, again stratified
# on the label.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.25,
    random_state=SPLIT_SEED,
    stratify=train_val_df['poisonous'],
)

print(f"훈련 데이터 크기: {len(train_df)}")
print(f"검증 데이터 크기: {len(val_df)}")
print(f"테스트 데이터 크기: {len(test_df)}")

훈련 데이터 크기: 4874
검증 데이터 크기: 1625
테스트 데이터 크기: 1625


In [135]:
# Create one LabelEncoder per categorical feature, fitted on the training split.
from sklearn.preprocessing import LabelEncoder

# Every column except the target is a feature.
features = [col for col in train_df.columns.tolist() if col != 'poisonous']

feature_encoders = {col: LabelEncoder().fit(train_df[col]) for col in features}

# The label column gets its own encoder as well.
label_encoder = LabelEncoder().fit(train_df['poisonous'])

In [136]:
def _encode_feature_column(column):
    """Encode one feature column with the encoder registered under its column name."""
    return feature_encoders[column.name].transform(column)


# Feature encoding for the train and validation splits.
# The actual test data is deliberately left raw: it has to be encoded in the
# batch transform prediction stage.
train_df[features] = train_df[features].apply(_encode_feature_column)
val_df[features] = val_df[features].apply(_encode_feature_column)

# Label encoding for the train and validation splits (test labels stay raw too).
train_df['poisonous'] = label_encoder.transform(train_df['poisonous'])
val_df['poisonous'] = label_encoder.transform(val_df['poisonous'])

In [137]:
# Print the classes learned by each feature encoder, then those of the label encoder.
for feature_name in feature_encoders:
    print(feature_name, feature_encoders[feature_name].classes_)
print("Label Encoder classes:", label_encoder.classes_)

cap-shape ['b' 'c' 'f' 'k' 's' 'x']
cap-surface ['f' 'g' 's' 'y']
cap-color ['b' 'c' 'e' 'g' 'n' 'p' 'r' 'u' 'w' 'y']
bruises ['f' 't']
odor ['a' 'c' 'f' 'l' 'm' 'n' 'p' 's' 'y']
gill-attachment ['a' 'f']
gill-spacing ['c' 'w']
gill-size ['b' 'n']
gill-color ['b' 'e' 'g' 'h' 'k' 'n' 'o' 'p' 'r' 'u' 'w' 'y']
stalk-shape ['e' 't']
stalk-root ['Unknown' 'b' 'c' 'e' 'r']
stalk-surface-above-ring ['f' 'k' 's' 'y']
stalk-surface-below-ring ['f' 'k' 's' 'y']
stalk-color-above-ring ['b' 'c' 'e' 'g' 'n' 'o' 'p' 'w' 'y']
stalk-color-below-ring ['b' 'c' 'e' 'g' 'n' 'o' 'p' 'w' 'y']
veil-type ['p']
veil-color ['n' 'o' 'w' 'y']
ring-number ['n' 'o' 't']
ring-type ['e' 'f' 'l' 'n' 'p']
spore-print-color ['b' 'h' 'k' 'n' 'o' 'r' 'u' 'w' 'y']
population ['a' 'c' 'n' 's' 'v' 'y']
habitat ['d' 'g' 'l' 'm' 'p' 'u' 'w']
Label Encoder classes: ['e' 'p']


S3에 데이터 업로드

In [138]:
# S3 bucket and project prefix under which every artefact of this notebook is stored.
bucket_name, project_name = 'dante-sagemaker', 'mushroom-classification'

In [139]:
# Top-level S3 locations of the project.
input_path = f's3://{bucket_name}/{project_name}/input'
output_path = f's3://{bucket_name}/{project_name}/output'
model_path = f's3://{bucket_name}/{project_name}/model'
asset_path = f's3://{bucket_name}/{project_name}/asset'
checkpoint_path = f's3://{bucket_name}/{project_name}/checkpoints'

# Dataset objects below the input prefix.
train_path = f'{input_path}/train/train.recordio'
val_path = f'{input_path}/val/val.recordio'
test_path = f'{input_path}/test/test.csv'

# Echo the resolved locations for a quick visual check.
for label, s3_uri in (
    ('train_path:', train_path),
    ('val_path:', val_path),
    ('test_path:', test_path),
    ('model_path:', model_path),
    ('asset_path:', asset_path),
    ('checkpoint_path:', checkpoint_path),
):
    print(label, s3_uri)

train_path: s3://dante-sagemaker/mushroom-classification/input/train/train.recordio
val_path: s3://dante-sagemaker/mushroom-classification/input/val/val.recordio
test_path: s3://dante-sagemaker/mushroom-classification/input/test/test.csv
model_path: s3://dante-sagemaker/mushroom-classification/model
asset_path: s3://dante-sagemaker/mushroom-classification/asset
checkpoint_path: s3://dante-sagemaker/mushroom-classification/checkpoints


In [140]:
import io
from sagemaker.amazon.common import write_numpy_to_dense_tensor

# Helper that converts a DataFrame to RecordIO-Protobuf and stores it on S3.
def convert_to_recordio_protobuf(df, label_column, s3_path):
    """Serialize ``df`` as RecordIO-Protobuf and upload the result to ``s3_path``.

    The upload uses the module-level ``boto3_session``.

    Args:
        df: DataFrame holding the label column together with the feature columns.
            NOTE(review): values are presumably expected to be numeric already
            (the notebook label-encodes before calling) — confirm.
        label_column: Name of the column in ``df`` that holds the labels.
        s3_path: Destination S3 URI of the uploaded object.

    Returns:
        None. A confirmation message is printed after the upload.
    """
    # Everything except the label column is treated as a feature.
    feature_matrix = df.drop(columns=[label_column]).values
    label_vector = df[label_column].values

    # Serialize into an in-memory buffer, then rewind it so the upload reads
    # from the first byte.
    payload = io.BytesIO()
    write_numpy_to_dense_tensor(payload, feature_matrix, label_vector)
    payload.seek(0)

    # Upload to S3.
    wr.s3.upload(local_file=payload, path=s3_path, boto3_session=boto3_session)
    print(f"데이터가 {s3_path}에 RecordIO-Protobuf 형식으로 저장되었습니다.")

In [141]:
# The encoders must be saved so they can be reused in the prediction stage.
import pickle

os.makedirs('assets', exist_ok=True)
feature_encoders_filepath = 'assets/feature_encoders.pkl'
label_encoder_filepath = 'assets/label_encoder.pkl'

# Pickle each encoder object to its local file.
for pickle_path, encoder_obj in (
    (feature_encoders_filepath, feature_encoders),
    (label_encoder_filepath, label_encoder),
):
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(encoder_obj, pickle_file)

# The feature list is stored as well, so the prediction stage can encode the
# same columns.
features_df = pd.DataFrame({'features': features})

In [142]:
# Convert train_df to RecordIO-Protobuf and store it on S3.
convert_to_recordio_protobuf(train_df, 'poisonous', train_path)
# Convert val_df to RecordIO-Protobuf and store it on S3.
convert_to_recordio_protobuf(val_df, 'poisonous', val_path)
# The test data is stored without conversion (raw categorical values, no header
# row); it still contains the 'poisonous' label column.
wr.s3.to_csv(df=test_df, path=test_path, index=False, header=False, boto3_session=boto3_session)

# S3 URIs always use '/', so they are built with f-strings rather than
# os.path.join, which would insert the OS path separator ('\\' on Windows).
# Feature column list.
wr.s3.to_csv(df=features_df, path=f'{asset_path}/features.csv', index=False, header=False, boto3_session=boto3_session)
# Feature encoders.
wr.s3.upload(feature_encoders_filepath, f'{asset_path}/feature_encoders.pkl', boto3_session=boto3_session)
# Label encoder.
wr.s3.upload(label_encoder_filepath, f'{asset_path}/label_encoder.pkl', boto3_session=boto3_session)

데이터가 s3://dante-sagemaker/mushroom-classification/input/train/train.recordio에 RecordIO-Protobuf 형식으로 저장되었습니다.
데이터가 s3://dante-sagemaker/mushroom-classification/input/val/val.recordio에 RecordIO-Protobuf 형식으로 저장되었습니다.


빌트인 모델 및 하이퍼파라미터 설정

In [24]:
# Built-in XGBoost container image for the region of the SageMaker session.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sagemaker_session.boto_region_name,
    version="1.7-1",
)

# Spot-training settings and the training instance type.
use_spot_instances = True
max_run = 60 * 60  # 1 hour
max_wait = 60 * 60 * 2  # 2 hours
instance_type = 'ml.m5.xlarge'

In [25]:
# Generic estimator around the built-in XGBoost image, trained on spot capacity.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,  # URI of the Docker image to use
    role=role,  # IAM role ARN
    sagemaker_session=sagemaker_session,  # SageMaker session object
    instance_count=1,  # number of training instances
    instance_type=instance_type,  # training instance type
    max_run=max_run,  # maximum run time (seconds)
    use_spot_instances=use_spot_instances,  # whether to train on spot instances
    max_wait=max_wait,  # maximum time to wait for spot capacity (seconds)
    output_path=output_path,  # where the model artifacts are stored
    base_job_name=project_name,  # prefix of the training job name
    # The Estimator keyword for the checkpoint location is `checkpoint_s3_uri`.
    # `checkpoint_path` is not a parameter: it is swallowed by **kwargs, which
    # leaves the spot training job without any checkpoint location.
    checkpoint_s3_uri=checkpoint_path,
)

In [26]:
# Hyperparameters of the XGBoost model.
xgb_hyperparameters = {
    'max_depth': 5,                   # maximum depth of a tree
    'eta': 0.1,                       # learning rate
    'gamma': 4,                       # minimum loss reduction needed to split a leaf further
    'min_child_weight': 6,            # minimum sum of weights needed to create a child node
    'subsample': 0.8,                 # fraction of the training data sampled for each tree
    'objective': 'binary:logistic',   # logistic objective for binary classification
    'num_round': 200,                 # number of boosting rounds
    'eval_metric': 'logloss',         # log loss as the evaluation metric
}
estimator.set_hyperparameters(**xgb_hyperparameters)

데이터 경로 지정 및 모델 훈련

In [27]:
def _recordio_channel(s3_uri):
    """Describe the RecordIO-Protobuf data under ``s3_uri`` as a training input."""
    return sagemaker.session.TrainingInput(
        s3_data=s3_uri,
        content_type='application/x-recordio-protobuf',
        s3_data_type='S3Prefix',
    )


training_input_config = _recordio_channel(train_path)
validation_input_config = _recordio_channel(val_path)

# Channel name -> input configuration, as expected by fit().
data_channels = {'train': training_input_config, 'validation': validation_input_config}
estimator.fit(data_channels)

INFO:sagemaker:Creating training-job with name: mushroom-classification-2024-08-01-09-13-11-160


2024-08-01 09:13:11 Starting - Starting the training job......
2024-08-01 09:14:00 Starting - Preparing the instances for training...
2024-08-01 09:14:42 Downloading - Downloading the training image.........
2024-08-01 09:16:03 Training - Training image download completed. Training in progress..[2024-08-01 09:16:10.824 ip-10-0-185-76.ap-northeast-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2024-08-01 09:16:10.846 ip-10-0-185-76.ap-northeast-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.
[2024-08-01:09:16:11:INFO] Imported framework sagemaker_xgboost_container.training
[2024-08-01:09:16:11:INFO] Failed to parse hyperparameter eval_metric value logloss to Json.
Returning the value itself
[2024-08-01:09:16:11:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.
Returning the value itself
[2024-08-01:09:16:11:INFO] No GPUs detected (normal if no gpus installed)
[2024-08-01:09:16:11:INFO] Running 

손실 함수 시각화

In [28]:
# Name of the estimator's latest training job; the cells below use it to find
# the job's CloudWatch log streams.
training_job_name = estimator.latest_training_job.job_name
training_job_name

'mushroom-classification-2024-08-01-09-13-11-160'

In [None]:
# CloudWatch Logs client used by the following cells to read the training logs.
# The log streams themselves are collected in the next cell, so no (unpaginated,
# immediately overwritten) describe_log_streams call is made here.
log_client = boto3_session.client('logs')

In [49]:
# Collect the CloudWatch log streams of this training job.
# logStreamNamePrefix filters on the service side, so only the matching streams
# are paged through instead of every stream of every training job in the group.
# Assumes the job's stream names begin with the training job name (SageMaker's
# `<job-name>/algo-...` naming) — TODO confirm for this account.
stream_paginator = log_client.get_paginator('describe_log_streams')

log_streams = []
for page in stream_paginator.paginate(
    logGroupName="/aws/sagemaker/TrainingJobs",
    logStreamNamePrefix=training_job_name,
):
    log_streams.extend(page['logStreams'])

In [52]:
# Read every event of the job's first log stream, oldest first.
# One get_log_events call returns a single page only, so nextForwardToken is
# followed until the service hands back the token that was sent (end of stream).
if not log_streams:
    raise RuntimeError(f"No CloudWatch log stream found for training job {training_job_name!r}")

events_request = {
    'logGroupName': "/aws/sagemaker/TrainingJobs",
    'logStreamName': log_streams[0]['logStreamName'],
    'startFromHead': True,
}
all_events = []
while True:
    page = log_client.get_log_events(**events_request)
    all_events.extend(page['events'])
    forward_token = page.get('nextForwardToken')
    if not forward_token or forward_token == events_request.get('nextToken'):
        break
    events_request['nextToken'] = forward_token

# Keep a response-like dict so later cells can keep reading log_events['events'].
log_events = {'events': all_events}
log_events['events'][-5:]

[{'timestamp': 1722503775329,
  'message': '[195]#011train-logloss:0.01168#011validation-logloss:0.01484',
  'ingestionTime': 1722503781483},
 {'timestamp': 1722503775329,
  'message': '[196]#011train-logloss:0.01168#011validation-logloss:0.01484',
  'ingestionTime': 1722503781483},
 {'timestamp': 1722503775329,
  'message': '[197]#011train-logloss:0.01168#011validation-logloss:0.01484',
  'ingestionTime': 1722503781483},
 {'timestamp': 1722503775329,
  'message': '[198]#011train-logloss:0.01168#011validation-logloss:0.01484',
  'ingestionTime': 1722503781483},
 {'timestamp': 1722503775329,
  'message': '[199]#011train-logloss:0.01168#011validation-logloss:0.01484',
  'ingestionTime': 1722503781483}]

In [53]:
from collections import defaultdict
import re

# One compiled pattern per metric. The regular expressions can be looked up in
# the SageMaker training job console.
logloss_patterns = {
    'train:logloss': re.compile(r'.*\[[0-9]+\].*#011train-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'),
    'validation:logloss': re.compile(r'.*\[[0-9]+\].*#011validation-logloss:([-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'),
}

collected = defaultdict(list)
for event in log_events['events']:
    message = event['message']
    for metric_name, pattern in logloss_patterns.items():
        found = pattern.search(message)
        if found:
            collected[metric_name].append(float(found.group(1)))

# Convert to a plain dict; a metric that never matched stays absent.
metrics = dict(collected)

In [54]:
from plotly import graph_objects as go

# Draw the train and validation log loss as one line each, indexed by the
# position of the value in its metric list.
fig = go.Figure()
for metric_key, trace_name in (
    ('train:logloss', 'train-logloss'),
    ('validation:logloss', 'validation-logloss'),
):
    series = metrics[metric_key]
    fig.add_trace(go.Scatter(x=list(range(len(series))), y=series, name=trace_name))
fig.show()

---

In [55]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

# Deploy the trained model to an endpoint; requests are serialized with
# CSVSerializer and responses parsed with CSVDeserializer.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    serializer=CSVSerializer(),
    deserializer=CSVDeserializer(),
)

INFO:sagemaker:Creating model with name: mushroom-classification-2024-08-01-09-41-17-296
INFO:sagemaker:Creating endpoint-config with name mushroom-classification-2024-08-01-09-41-17-296
INFO:sagemaker:Creating endpoint with name mushroom-classification-2024-08-01-09-41-17-296


-------!