### 라이브러리 준비

In [1]:
# 경고 무시
import warnings
warnings.filterwarnings('ignore')
# 데이터 처리 및 분석
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', None)
import numpy as np

# 머신러닝
import tensorflow as tf

# AWS 관련
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.utils import name_from_base
import boto3
import awswrangler as wr

# 시각화
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 기타 유틸리티
import os
import io
import json
import kaggle
from dotenv import load_dotenv
load_dotenv("../.env")

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/dante/Library/Application Support/sagemaker/config.yaml


True

### SageMaker 세션 및 역할 설정

In [2]:
boto3_session = boto3.Session(profile_name='awstutor')
sagemaker_session = sagemaker.Session(boto_session=boto3_session)
pipeline_session = PipelineSession(boto_session=boto3_session)
role = os.environ.get('SAGEMAKER_EXECUTION_ROLE_ARN')

In [4]:
# SageMaker IDE 인스턴스에서 진행하실때는 아래와 같이 입력하시면 됩니다.
# boto3_session = boto3.Session()
# sagemaker_session = sagemaker.Session(boto_session=boto3_session)
# pipeline_session = PipelineSession(boto_session=boto3_session)
# role = sagemaker.get_execution_role()

### 데이터 로드 및 전처리

S3 데이터 입력 경로 설정

In [3]:
bucket_name = 'dante-sagemaker'
project_name = 'mushroom-classification-api-integration'

s3_original_folder_path = r's3://{0}/{1}/input/{2}'.format(bucket_name,project_name, 'original')
s3_training_file_folder_path = r's3://{0}/{1}/input/{2}'.format(bucket_name,project_name, 'training')
s3_validation_file_folder_path = r's3://{0}/{1}/input/{2}'.format(bucket_name,project_name, 'validation')
s3_test_file_folder_path = r's3://{0}/{1}/input/{2}'.format(bucket_name,project_name, 'test')
s3_output_folder_path = r's3://{0}/{1}/output/{2}'.format(bucket_name,project_name, 'model')
s3_asset_folder_path = r's3://{0}/{1}/output/{2}'.format(bucket_name,project_name, 'asset')

In [28]:
s3_training_file_folder_path

's3://dante-sagemaker/mushroom-classification-api-integration/input/training'

데이터 다운로드

In [4]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
mushroom = fetch_ucirepo(id=73) 
  
# data (as pandas dataframes) 
X = mushroom.data.features 
y = mushroom.data.targets 

데이터 확인

In [5]:
mushroom.metadata

{'uci_id': 73,
 'name': 'Mushroom',
 'repository_url': 'https://archive.ics.uci.edu/dataset/73/mushroom',
 'data_url': 'https://archive.ics.uci.edu/static/public/73/data.csv',
 'abstract': 'From Audobon Society Field Guide; mushrooms described in terms of physical characteristics; classification: poisonous or edible',
 'area': 'Biology',
 'tasks': ['Classification'],
 'characteristics': ['Multivariate'],
 'num_instances': 8124,
 'num_features': 22,
 'feature_types': ['Categorical'],
 'demographics': [],
 'target_col': ['poisonous'],
 'index_col': None,
 'has_missing_values': 'yes',
 'missing_values_symbol': 'NaN',
 'year_of_dataset_creation': 1981,
 'last_updated': 'Thu Aug 10 2023',
 'dataset_doi': '10.24432/C5959T',
 'creators': [],
 'intro_paper': None,
 'additional_info': {'summary': "This data set includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family (pp. 500-525).  Each species is identified as definitely 

In [6]:
mushroom.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,poisonous,Target,Categorical,,,,no
1,cap-shape,Feature,Categorical,,"bell=b,conical=c,convex=x,flat=f, knobbed=k,su...",,no
2,cap-surface,Feature,Categorical,,"fibrous=f,grooves=g,scaly=y,smooth=s",,no
3,cap-color,Feature,Binary,,"brown=n,buff=b,cinnamon=c,gray=g,green=r, pink...",,no
4,bruises,Feature,Categorical,,"bruises=t,no=f",,no
5,odor,Feature,Categorical,,"almond=a,anise=l,creosote=c,fishy=y,foul=f, mu...",,no
6,gill-attachment,Feature,Categorical,,"attached=a,descending=d,free=f,notched=n",,no
7,gill-spacing,Feature,Categorical,,"close=c,crowded=w,distant=d",,no
8,gill-size,Feature,Categorical,,"broad=b,narrow=n",,no
9,gill-color,Feature,Categorical,,"black=k,brown=n,buff=b,chocolate=h,gray=g, gre...",,no


> ucimlrepo 패키지에서 오류가 나는 분들은 수업자료 데이터셋에서 직접 로드하세요

In [7]:
df = pd.read_csv('dataset/mushrooms.csv')
X = df.drop(columns=['class'])
y = df['class']

In [8]:
y.describe()

count     8124
unique       2
top          e
freq      4208
Name: class, dtype: object

In [9]:
X.describe()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,6,4,10,2,9,2,2,2,12,2,5,4,4,9,9,1,4,3,5,9,6,7
top,x,y,n,f,n,f,c,b,b,t,b,s,s,w,w,p,w,o,p,w,v,d
freq,3656,3244,2284,4748,3528,7914,6812,5612,1728,4608,3776,5176,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


| 변수명                      | 역할    | 타입         | 설명                                                                        | 단위 | 결측치 |
|----------------------------|---------|--------------|----------------------------------------------------------------------------|-------|--------|
| poisonous                  | 목표    | 범주형       |                                                                            |       | 없음   |
| cap-shape                  | 특성    | 범주형       | bell=b, conical=c, convex=x, flat=f, knobbed=k, sunken=s                    |       | 없음   |
| cap-surface                | 특성    | 범주형       | fibrous=f, grooves=g, scaly=y, smooth=s                                    |       | 없음   |
| cap-color                  | 특성    | 이진형       | brown=n, buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y |       | 없음   |
| bruises                    | 특성    | 범주형       | bruises=t, no=f                                                            |       | 없음   |
| odor                       | 특성    | 범주형       | almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s |       | 없음   |
| gill-attachment            | 특성    | 범주형       | attached=a, descending=d, free=f, notched=n                                 |       | 없음   |
| gill-spacing               | 특성    | 범주형       | close=c, crowded=w, distant=d                                              |       | 없음   |
| gill-size                  | 특성    | 범주형       | broad=b, narrow=n                                                          |       | 없음   |
| gill-color                 | 특성    | 범주형       | black=k, brown=n, buff=b, chocolate=h, gray=g, green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y |       | 없음   |
| stalk-shape                | 특성    | 범주형       | enlarging=e, tapering=t                                                    |       | 없음   |
| stalk-root                 | 특성    | 범주형       | bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=?       |       | 있음   |
| stalk-surface-above-ring   | 특성    | 범주형       | fibrous=f, scaly=y, silky=k, smooth=s                                      |       | 없음   |
| stalk-surface-below-ring   | 특성    | 범주형       | fibrous=f, scaly=y, silky=k, smooth=s                                      |       | 없음   |
| stalk-color-above-ring     | 특성    | 범주형       | brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y |       | 없음   |
| stalk-color-below-ring     | 특성    | 범주형       | brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y |       | 없음   |
| veil-type                  | 특성    | 이진형       | partial=p, universal=u                                                     |       | 없음   |
| veil-color                 | 특성    | 범주형       | brown=n, orange=o, white=w, yellow=y                                       |       | 없음   |
| ring-number                | 특성    | 범주형       | none=n, one=o, two=t                                                       |       | 없음   |
| ring-type                  | 특성    | 범주형       | cobwebby=c, evanescent=e, flaring=f, large=l, none=n, pendant=p, sheathing=s, zone=z |       | 없음   |
| spore-print-color          | 특성    | 범주형       | black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y |       | 없음   |
| population                 | 특성    | 범주형       | abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y    |       | 없음   |
| habitat                    | 특성    | 범주형       | grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d         |       | 없음   |


* 이 데이터 세트는 Agaricus와 Lepiota 가족의 23종의 아가리가 있는 버섯에 해당하는 가상의 샘플 설명을 포함합니다 (pp. 500-525).
* 각 종은 확실히 식용 가능, 확실히 독성, 또는 식용 가능성이 불확실하고 권장되지 않음으로 식별됩니다. 
* 가이드에서는 버섯의 식용 가능성을 결정하는 간단한 규칙이 없다고 명확히 명시하고 있습니다.

로컬 머신 데이터 전처리 테스트

In [10]:
X.isnull().sum()

cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [11]:
# 범주형 데이터의 경우 nan 값을 문자열로 변환
X = X.fillna('nan')

In [12]:
y.value_counts()

class
e    4208
p    3916
Name: count, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

# train, val, test 셋으로 나누기
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=2024)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=2024)

print("Training set size:", X_train.shape, y_train.shape)
print("Validation set size:", X_val.shape, y_val.shape)
print("Test set size:", X_test.shape, y_test.shape)

Training set size: (4549, 22) (4549,)
Validation set size: (1950, 22) (1950,)
Test set size: (1625, 22) (1625,)


In [14]:
from sklearn.preprocessing import LabelEncoder

In [15]:
feature_encoders = {
    name: encoder for name, encoder in zip(X.columns, [LabelEncoder().fit(X[col]) for col in X.columns])
}

X_train_encoded = X_train.copy()
X_val_encoded = X_val.copy()

for name, encoder in feature_encoders.items():
    X_train_encoded[name] = encoder.transform(X_train[name])
    X_val_encoded[name] = encoder.transform(X_val[name])
    
# y 값들을 인코딩
y_train = (y_train != 'e').astype(int)
y_val = (y_val != 'e').astype(int)

In [16]:
feature_encoders_dict = {
    col: {
        orig: encoded 
        for orig, encoded in zip(feature_encoders[col].classes_, feature_encoders[col].transform(feature_encoders[col].classes_))
    }
    for col in feature_encoders.keys()
}

In [17]:
feature_encoders_dict

{'cap-shape': {'b': 0, 'c': 1, 'f': 2, 'k': 3, 's': 4, 'x': 5},
 'cap-surface': {'f': 0, 'g': 1, 's': 2, 'y': 3},
 'cap-color': {'b': 0,
  'c': 1,
  'e': 2,
  'g': 3,
  'n': 4,
  'p': 5,
  'r': 6,
  'u': 7,
  'w': 8,
  'y': 9},
 'bruises': {'f': 0, 't': 1},
 'odor': {'a': 0,
  'c': 1,
  'f': 2,
  'l': 3,
  'm': 4,
  'n': 5,
  'p': 6,
  's': 7,
  'y': 8},
 'gill-attachment': {'a': 0, 'f': 1},
 'gill-spacing': {'c': 0, 'w': 1},
 'gill-size': {'b': 0, 'n': 1},
 'gill-color': {'b': 0,
  'e': 1,
  'g': 2,
  'h': 3,
  'k': 4,
  'n': 5,
  'o': 6,
  'p': 7,
  'r': 8,
  'u': 9,
  'w': 10,
  'y': 11},
 'stalk-shape': {'e': 0, 't': 1},
 'stalk-root': {'?': 0, 'b': 1, 'c': 2, 'e': 3, 'r': 4},
 'stalk-surface-above-ring': {'f': 0, 'k': 1, 's': 2, 'y': 3},
 'stalk-surface-below-ring': {'f': 0, 'k': 1, 's': 2, 'y': 3},
 'stalk-color-above-ring': {'b': 0,
  'c': 1,
  'e': 2,
  'g': 3,
  'n': 4,
  'o': 5,
  'p': 6,
  'w': 7,
  'y': 8},
 'stalk-color-below-ring': {'b': 0,
  'c': 1,
  'e': 2,
  'g': 3,
 

In [18]:
# 타겟 레이블을 첫번째 열로 붙임
train_data = pd.concat([y_train, X_train_encoded], axis=1)
val_data = pd.concat([y_val, X_val_encoded], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)

In [19]:
whole_data = pd.concat([y, X], axis=1)
wr.s3.to_csv(whole_data, os.path.join(s3_original_folder_path, 'original_data.csv'), boto3_session=boto3_session)

{'paths': ['s3://dante-sagemaker/mushroom-classification-api-integration/input/original/original_data.csv'],
 'partitions_values': {}}

In [20]:
os.makedirs('scripts', exist_ok=True)

전처리 스크립트 작성

In [21]:
%%writefile scripts/preprocess.py
import argparse
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def preprocess_data(input_data_path, output_train_path, output_val_path, output_test_path, asset_path, test_size):
    # 데이터 읽기
    original_data = pd.read_csv(input_data_path)

    # 특성과 타겟 분리
    X = original_data.iloc[:, 1:]
    y = original_data.iloc[:, 0]
    
    # 전처리
    X = X.fillna('nan')
    
    # 모든 열을 문자열로 변환
    X = X.astype(str)
    
    # train, val, test 셋으로 나누기
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=test_size, random_state=2024)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=2024)
    
    # 범주형 데이터 인코딩
    feature_encoders = {
        name: LabelEncoder().fit(X[name]) for name in X.columns
    }
    
    feature_encoders_dict = {
        col: {
            orig: encoded 
            for orig, encoded in zip(feature_encoders[col].classes_, feature_encoders[col].transform(feature_encoders[col].classes_))
        }
        for col in feature_encoders.keys()
    }

    X_train_encoded = X_train.copy()
    X_val_encoded = X_val.copy()

    for name, encoder in feature_encoders.items():
        X_train_encoded[name] = encoder.transform(X_train[name])
        X_val_encoded[name] = encoder.transform(X_val[name])
        
    # y 값들을 인코딩
    y_train = (y_train != 'e').astype(int)
    y_val = (y_val != 'e').astype(int)

    # 타겟 레이블을 첫번째 열로 붙임
    train_data = pd.concat([y_train, X_train_encoded], axis=1)
    val_data = pd.concat([y_val, X_val_encoded], axis=1)
    test_data = pd.concat([y_test, X_test], axis=1)

    # 전처리된 데이터 저장
    train_data.to_csv(output_train_path, index=False, header=None)
    val_data.to_csv(output_val_path, index=False, header=None)
    test_data.to_csv(output_test_path, index=False, header=None)
    
    # 인코더와 스케일러 저장
    with open(os.path.join(asset_path, 'feature_encoders_dict.pkl'), 'wb') as f:
        pickle.dump(feature_encoders_dict, f)
    
    print("전처리 완료 및 데이터 저장 완료")

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--test-size', type=float, default=0.2)
    
    args, _ = parser.parse_known_args()
    
    input_data_path = '/opt/ml/processing/input/original_data.csv'
    output_train_path = '/opt/ml/processing/output/train/train_data.csv'
    output_val_path = '/opt/ml/processing/output/validation/val_data.csv'
    output_test_path = '/opt/ml/processing/output/test/test_data.csv'
    asset_path = '/opt/ml/processing/output/asset'
    
    os.makedirs(os.path.dirname(output_train_path), exist_ok=True)
    os.makedirs(os.path.dirname(output_val_path), exist_ok=True)
    os.makedirs(os.path.dirname(output_test_path), exist_ok=True)
    os.makedirs(asset_path, exist_ok=True)
    
    preprocess_data(input_data_path, output_train_path, output_val_path, output_test_path, asset_path, args.test_size)

Overwriting scripts/preprocess.py


파이프라이닝

In [29]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# 전처리 단계 정의
job_name = name_from_base(project_name + '-preprocess')
sklearn_processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    sagemaker_session=pipeline_session,
    base_job_name=job_name,
    instance_type='ml.m5.xlarge',
    instance_count=1,
)

In [30]:
# 전처리 단계 실행을 위한 인자 설정
preprocess_args = sklearn_processor.run(
    # 전처리 스크립트 경로
    code="scripts/preprocess.py",
    # 입력 데이터 설정
    inputs=[ProcessingInput(input_name="origin", source=s3_original_folder_path, destination="/opt/ml/processing/input")],
    # 출력 데이터 설정
    outputs=[
        # 훈련 데이터 출력
        ProcessingOutput(output_name="train", source="/opt/ml/processing/output/train/", destination=s3_training_file_folder_path),
        # ProcessingOutput(output_name="train", source="/opt/ml/processing/output/train/", destination=os.path.join(s3_training_file_folder_path, 'train_data.csv')),
        # 검증 데이터 출력
        ProcessingOutput(output_name="validation", source="/opt/ml/processing/output/validation/", destination=s3_validation_file_folder_path),
        # ProcessingOutput(output_name="validation", source="/opt/ml/processing/output/validation/", destination=os.path.join(s3_validation_file_folder_path, 'val_data.csv')),
        # 테스트 데이터 출력
        ProcessingOutput(output_name="test", source="/opt/ml/processing/output/test/", destination=s3_test_file_folder_path),
        # ProcessingOutput(output_name="test", source="/opt/ml/processing/output/test/", destination=os.path.join(s3_test_file_folder_path, 'test_data.csv')),
        # 기타 자산 출력
        ProcessingOutput(output_name="asset", source="/opt/ml/processing/output/asset/", destination=s3_asset_folder_path),
    ],
    # 전처리 스크립트에 전달할 인자
    arguments=[
        "--test-size", "0.2",  # 테스트 데이터 비율 설정
    ],
)
# 훈련 단계 정의
preprocess_step = ProcessingStep(name="PreprocessingStep", step_args=preprocess_args)

In [24]:
from sagemaker.workflow.steps import TrainingStep

# XGBoost 모델 컨테이너 URI 가져오기
model_container = sagemaker.image_uris.retrieve("xgboost",sagemaker_session.boto_region_name,version="1.7-1")

# 훈련 작업 이름 생성
job_name = name_from_base(project_name + '-train')

# SageMaker Estimator 객체 생성
estimator = sagemaker.estimator.Estimator(
    model_container,  # 모델 컨테이너 URI
    role,  # IAM 역할
    input_mode='File',  # 입력 모드
    instance_count=1,  # 인스턴스 수
    instance_type='ml.m5.xlarge',  # 인스턴스 유형
    output_path=s3_output_folder_path,  # 출력 경로
    sagemaker_session=pipeline_session,  # SageMaker 세션
    max_run=60*60,  # 최대 실행 시간 (1시간)
    max_wait=60*60,  # 최대 대기 시간 (1시간)
    use_spot_instances=True,  # 스팟 인스턴스 사용
    base_job_name=job_name,  # 기본 작업 이름
)

# XGBoost 하이퍼파라미터 설정
estimator.set_hyperparameters(
    max_depth=5,  # 트리의 최대 깊이
    eta=0.1,  # 학습률
    gamma=4,  # 분할을 위한 최소 손실 감소
    min_child_weight=6,  # 자식 노드에 필요한 최소 가중치 합
    subsample=0.8,  # 훈련 인스턴스의 서브샘플링 비율
    objective='binary:logistic',  # 목적 함수 (이진 분류)
    num_round=200,  # 부스팅 라운드 수
    early_stopping_rounds=10,  # 조기 종료를 위한 라운드 수
    eval_metric='logloss'  # 평가 지표
)

In [None]:
  train_channel = sagemaker.inputs.TrainingInput(
      s3_data=preprocess_step.properties.ProcessingOutputConfig
                              .Outputs["train"].S3Output.S3Uri,
      content_type="text/csv"
  )
  val_channel = sagemaker.inputs.TrainingInput(
      s3_data=preprocess_step.properties.ProcessingOutputConfig
                              .Outputs["validation"].S3Output.S3Uri,
      content_type="text/csv"
  )
  estimator.fit({"train": train_channel, "validation": val_channel})

In [31]:
# 훈련 데이터 입력 구성
training_input_config = sagemaker.session.TrainingInput(
    s3_data=s3_training_file_folder_path,  # S3에 저장된 훈련 데이터 경로
    content_type='text/csv',  # 데이터 형식 (CSV)
    s3_data_type='S3Prefix'  # S3 데이터 유형
)

# 검증 데이터 입력 구성
validation_input_config = sagemaker.session.TrainingInput(
    s3_data=s3_validation_file_folder_path,  # S3에 저장된 검증 데이터 경로
    content_type='text/csv',  # 데이터 형식 (CSV)
    s3_data_type='S3Prefix'  # S3 데이터 유형
)

# 데이터 채널 설정
data_channels = {
    'train': training_input_config,  # 훈련 데이터 채널
    'validation': validation_input_config  # 검증 데이터 채널
}

# 모델 훈련 실행
train_args = estimator.fit(data_channels)  # 설정된 데이터 채널로 모델 훈련 시작

# 훈련 단계 정의
train_step = TrainingStep(
    name='TrainingStep',  # 훈련 단계의 이름
    step_args=train_args,  # 훈련 인자
    cache_config=False  # 캐시 설정 비활성화
)

In [32]:
# SageMaker 워크플로우 파이프라인을 위한 라이브러리 임포트
from sagemaker.workflow.pipeline import Pipeline

# 파이프라인 이름 설정
pipeline_name = "Mushroom-Classification-Pipeline"

# 처리 및 훈련에 사용할 인스턴스 유형 및 개수 설정
processing_instance_type = 'ml.m5.xlarge'
processing_instance_count = 1
training_instance_type = 'ml.m5.xlarge'

# 모델 승인 상태 설정
model_approval_status = 'PendingManualApproval'

# 입력 데이터 및 배치 데이터 경로 설정
input_data = s3_original_folder_path

# 파이프라인 객체 생성
pipeline = Pipeline(
    name=pipeline_name,  # 파이프라인 이름
    parameters=[  # 파이프라인 파라미터 설정
        processing_instance_count,
        training_instance_type,
        model_approval_status,
        input_data,
    ],
    sagemaker_session=pipeline_session,  # 파이프라인 세션
    steps=[preprocess_step, train_step],  # 파이프라인 단계 설정
)

파이프라인 실행

In [33]:
# 파이프라인 업데이트
pipeline.upsert(role_arn=role)
# 파이프라인 실행 시작
execution = pipeline.start()
execution.wait()

엔드포인트 생성

In [33]:
model_artifacts = wr.s3.list_objects(s3_output_folder_path, 'model.tar.gz', boto3_session=boto3_session)
model_artifacts

['s3://dante-sagemaker/mushroom-classification-api-integration/output/model/pipelines-7x22cw8dhhwo-TrainingStep-ZXmKCidshK/output/model.tar.gz',
 's3://dante-sagemaker/mushroom-classification-api-integration/output/model/pipelines-cshf63get7ue-TrainingStep-Ohcr0TNMLY/output/model.tar.gz',
 's3://dante-sagemaker/mushroom-classification-api-integration/output/model/pipelines-fnqnznrkl02x-TrainingStep-pIqCYn0aHQ/output/model.tar.gz',
 's3://dante-sagemaker/mushroom-classification-api-integration/output/model/pipelines-mlnsl4pf1plu-TrainingStep-Ons02dTzZZ/output/model.tar.gz',
 's3://dante-sagemaker/mushroom-classification-api-integration/output/model/pipelines-vay2jgpkz82x-TrainingStep-DOMfJl1oMi/output/model.tar.gz']

In [34]:
model_artifact = model_artifacts[-1]

In [35]:
# 엔드포인트 생성
from sagemaker.model import Model
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

image_uri = sagemaker.image_uris.retrieve("xgboost", sagemaker_session.boto_region_name, version="1.7-1")

# 모델 객체 생성
model = Model(
    image_uri=image_uri,
    model_data=model_artifact,
    role=role,
    sagemaker_session=sagemaker_session
)

# 엔드포인트 생성
endpoint_name = name_from_base(project_name + '-endpoint')
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type='ml.m5.xlarge',  # 인스턴스 유형 지정
    initial_instance_count=1,      # 초기 인스턴스 수 지정
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer()
)

print(f"엔드포인트가 생성되었습니다: {endpoint_name}")


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-11-08-06-30-20-196
INFO:sagemaker:Creating endpoint-config with name mushroom-classification-api-integration-2024-11-08-06-30-20-195
INFO:sagemaker:Creating endpoint with name mushroom-classification-api-integration-2024-11-08-06-30-20-195


-----!엔드포인트가 생성되었습니다: mushroom-classification-api-integration-2024-11-08-06-30-20-195


실시간 엔드포인트 추론 테스트

In [36]:
def find_endpoint(project_name):
    # 로컬 모드 실행시
    sagemaker_client = boto3_session.client('sagemaker')
    # 람다 함수에서 실행시
    # sagemaker_client = boto3.client()
    # 엔드포인트 목록 가져오기
    endpoints = sagemaker_client.list_endpoints()
    
    # 프로젝트 이름을 포함하는 엔드포인트 찾기
    matching_endpoints = [
        endpoint for endpoint in endpoints['Endpoints'] 
        if project_name in endpoint['EndpointName']
    ]
    
    if matching_endpoints:
        # 가장 최근에 생성된 엔드포인트 반환
        return sorted(matching_endpoints, key=lambda x: x['CreationTime'], reverse=True)[0]['EndpointName']
    else:
        return None

# 사용 예시
project_name = 'mushroom-classification-api-integration'
endpoint_name = find_endpoint(project_name)
print(f"찾은 엔드포인트 이름: {endpoint_name}")


찾은 엔드포인트 이름: mushroom-classification-api-integration-2024-11-08-06-30-20-195


In [37]:
bucket_name = 'dante-sagemaker'
project_name = 'mushroom-classification-api-integration'
s3_test_file_folder_path = f's3://{bucket_name}/{project_name}/input/test'

In [38]:
test_data = wr.s3.read_csv(os.path.join(s3_test_file_folder_path, 'test_data.csv'), index_col=0, header=None, boto3_session=boto3_session)

In [39]:
test_data.head(5)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2402,e,x,f,g,t,n,f,c,b,p,t,b,s,s,g,p,p,w,o,p,k,v,d
4042,p,f,y,g,f,f,f,c,b,h,e,b,k,k,b,p,p,w,o,l,h,v,d
1761,e,x,f,w,f,n,f,w,b,k,t,e,f,f,w,w,p,w,o,e,k,s,g
1730,e,f,s,w,f,n,f,w,b,k,t,e,f,s,w,w,p,w,o,e,n,a,g
7924,p,k,s,n,f,y,f,c,n,b,t,?,s,k,w,w,p,w,o,e,w,v,l


In [40]:
y_test = test_data.iloc[:, 0]
X_test = test_data.iloc[:, 1:]

In [41]:
feature_columns = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']

In [42]:
X_test.columns = feature_columns

In [43]:
X_test = X_test.fillna('nan')

In [44]:
os.makedirs('assets', exist_ok=True)

In [45]:
wr.s3.download('s3://dante-sagemaker/mushroom-classification-api-integration/output/asset/feature_encoders_dict.pkl', 'assets/feature_encoders_dict.pkl', boto3_session=boto3_session)

In [46]:
import pickle
with open('assets/feature_encoders_dict.pkl', 'rb') as f:
    feature_encoders = pickle.load(f)

In [47]:
feature_encoders

{'class': {'e': 0, 'p': 1},
 'cap-shape': {'b': 0, 'c': 1, 'f': 2, 'k': 3, 's': 4, 'x': 5},
 'cap-surface': {'f': 0, 'g': 1, 's': 2, 'y': 3},
 'cap-color': {'b': 0,
  'c': 1,
  'e': 2,
  'g': 3,
  'n': 4,
  'p': 5,
  'r': 6,
  'u': 7,
  'w': 8,
  'y': 9},
 'bruises': {'f': 0, 't': 1},
 'odor': {'a': 0,
  'c': 1,
  'f': 2,
  'l': 3,
  'm': 4,
  'n': 5,
  'p': 6,
  's': 7,
  'y': 8},
 'gill-attachment': {'a': 0, 'f': 1},
 'gill-spacing': {'c': 0, 'w': 1},
 'gill-size': {'b': 0, 'n': 1},
 'gill-color': {'b': 0,
  'e': 1,
  'g': 2,
  'h': 3,
  'k': 4,
  'n': 5,
  'o': 6,
  'p': 7,
  'r': 8,
  'u': 9,
  'w': 10,
  'y': 11},
 'stalk-shape': {'e': 0, 't': 1},
 'stalk-root': {'?': 0, 'b': 1, 'c': 2, 'e': 3, 'r': 4},
 'stalk-surface-above-ring': {'f': 0, 'k': 1, 's': 2, 'y': 3},
 'stalk-surface-below-ring': {'f': 0, 'k': 1, 's': 2, 'y': 3},
 'stalk-color-above-ring': {'b': 0,
  'c': 1,
  'e': 2,
  'g': 3,
  'n': 4,
  'o': 5,
  'p': 6,
  'w': 7,
  'y': 8},
 'stalk-color-below-ring': {'b': 0,
  '

In [48]:
X_test_encoded = X_test.copy()
for col in feature_columns:
    X_test_encoded[col] = X_test[col].map(feature_encoders_dict[col])
X_test_encoded.head(5)

Unnamed: 0_level_0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2402,5,0,3,1,5,1,0,0,7,1,1,2,2,3,6,0,2,1,4,2,4,0
4042,2,3,3,0,2,1,0,0,3,0,1,1,1,0,6,0,2,1,2,1,4,0
1761,5,0,8,0,5,1,1,0,4,1,3,0,0,7,7,0,2,1,0,2,3,1
1730,2,2,8,0,5,1,1,0,4,1,3,0,2,7,7,0,2,1,0,3,0,1
7924,3,2,4,0,8,1,0,1,0,1,0,2,1,7,7,0,2,1,0,7,4,2


In [49]:
from sagemaker.serializers import CSVSerializer
serializer = CSVSerializer()
body = serializer.serialize(X_test_encoded.iloc[:10, :])
sm_runtime = boto3_session.client('sagemaker-runtime')
response = sm_runtime.invoke_endpoint(EndpointName=endpoint_name, ContentType='text/csv', Body=body)
predictions = response['Body'].read().decode()
predictions = list(map(lambda x: int(float(x) > 0.5), predictions.strip().split('\n')))
predictions

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

람다 함수 로직

In [50]:
import json
import boto3
import pickle
import pandas as pd
import numpy as np

def lambda_handler(event, context):
    
    # S3와 SageMaker 런타임 클라이언트 생성
    # 로컬에서 실행할때
    boto3_session = boto3.Session(profile_name='awstutor')
    s3 = boto3_session.client('s3')
    sagemaker_runtime = boto3_session.client('sagemaker-runtime')
    
    # SageMaker IDE 인스턴스에서 실행할때
    # s3 = boto3.client('s3')
    # sagemaker_runtime = boto3.client('sagemaker-runtime')
    
    # 버킷 이름과 프로젝트 이름 설정
    bucket_name = 'dante-sagemaker'
    project_name = 'mushroom-classification-api-integration'
    
    # 엔드포인트 이름 찾기
    endpoint_name = find_endpoint(project_name)

    # 인코더 로드
    encoder_key = f'{project_name}/output/asset/feature_encoders_dict.pkl'
    encoder_obj = s3.get_object(Bucket=bucket_name, Key=encoder_key)
    feature_encoders_dict = pickle.loads(encoder_obj['Body'].read())
    
    # 입력 데이터 가져오기
    input_data = event['data']
    
    # 특성 컬럼 정의
    feature_columns = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']
    
    # 데이터프레임 생성 및 전처리
    X_test = pd.DataFrame(input_data, columns=feature_columns)
    X_test = X_test.fillna('nan')
    X_test_encoded = X_test.copy()
    
    # 특성 인코딩
    for col in feature_columns:
        X_test_encoded[col] = X_test[col].map(feature_encoders_dict[col])
        
    # 페이로드 생성
    payload =  "\n".join([",".join([str(x) for x in row]) for row in X_test_encoded.values])
    
    # SageMaker 엔드포인트 호출
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='text/csv',
        Body=payload
    )
    
    # 예측 결과 처리
    predictions = response['Body'].read().decode()
    predictions = list(map(lambda x: int(float(x) > 0.5), predictions.strip().split('\n')))
    
    # 결과 반환
    return {
        'statusCode': 200,
        'body': json.dumps(predictions)
    }

In [51]:
lambda_handler(event={"data" : test_data.iloc[:5, 1:].values.tolist()}, context=None)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


{'statusCode': 200, 'body': '[1, 1, 1, 1, 1]'}

AWS 콘솔에서 람다함수 생성
- 이 단계는 AWS 콘솔에서 진행합니다.

람다함수 호출 테스트

In [52]:
lambda_function_name = 'mushroom-classification-function'

In [54]:
lamb = boto3_session.client('lambda')

# NaN 값을 문자열 'nan'으로 변환
payload_data = test_data.iloc[:5, 1:].fillna('nan').values.tolist()

response = lamb.invoke(
    FunctionName=lambda_function_name, 
    Payload=json.dumps({
        "data": payload_data
    })
)

# 응답 처리
if response['StatusCode'] == 200:
    result = json.loads(response['Payload'].read().decode())
    print("Lambda 함수 실행 결과:", result)
else:
    print("Lambda 함수 호출 실패:", response)

Lambda 함수 실행 결과: {'statusCode': 200, 'body': '[1, 1, 1, 1, 1]'}


In [55]:
preds = result['body']
preds

'[1, 1, 1, 1, 1]'

API Gateway 생성

- 이 단계는 AWS 콘솔에서 진행합니다.

API Gateway 호출 테스트

In [56]:
import requests
import pickle
import json

    
# json payload로 사용할 데이터셋을 준비 (pandas의 nan을 None으로 치환)
request_dataset = [[val if val != 'nan' else None for val in row] for row in np.array(X_test.values[:10].tolist())]
api_url = 'https://6mzmwxvnx0.execute-api.ap-northeast-2.amazonaws.com/kmu-cloud-ml-2024-api'

resp = requests.post(api_url, json={'data' : request_dataset}).json()

preds = resp['body']
preds

'[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]'

리소스 삭제

In [57]:
# Lambda 함수 삭제
lambda_client = boto3_session.client('lambda')
lambda_client.delete_function(FunctionName='mushroom-classification-function')
print("Lambda 함수가 삭제되었습니다.")

Lambda 함수가 삭제되었습니다.


In [308]:
# API Gateway 리소스 삭제
# 콘솔에서 진행하겠습니다.

In [58]:
# SageMaker 엔드포인트 삭제
sm = boto3_session.client('sagemaker')
sm.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': 'db989545-6ff7-4e1d-a9f8-a25ed3ef1e22',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'db989545-6ff7-4e1d-a9f8-a25ed3ef1e22',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Fri, 08 Nov 2024 07:10:16 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}