## 스크립트 모드

### 로컬모드 수행

라이브러리

In [2]:
# 시각화 라이브러리
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected=True)

In [3]:
# 전처리 및 기타 라이브러리
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
import awswrangler as wr
import os
import boto3
from dotenv import load_dotenv
load_dotenv('../.env')

True

In [4]:
# 머신러닝 라이브러리
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import xgboost as xgb

In [5]:
# SageMaker 라이브러리
import sagemaker
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.inputs import TrainingInput
from sagemaker.utils import name_from_base
from sagemaker.xgboost import XGBoost
from sagemaker.xgboost.model import XGBoostModel
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/dante/Library/Application Support/sagemaker/config.yaml


SageMaker 세션 및 역할 설정

In [6]:
boto3_session = boto3.Session(profile_name='awstutor')
sagemaker_session = sagemaker.Session(boto_session=boto3_session)
role = os.environ.get('SAGEMAKER_EXECUTION_ROLE_ARN')

데이터셋 다운로드

> ucimlrepo 라이브러리가 더이상 서비스하지 않는 문제가 있으니, 아래 주석 내용을 제외하고, sklearn.datasets 라이브러리를 사용하여 데이터셋을 로드합니다.
> 
> 라이브러리가 변경되어, 타겟변수가 정수형으로 변경되어 일부 스크립트가 변경되었으니 확인바랍니다.

In [7]:
from sklearn.datasets import fetch_openml
import pandas as pd

# Adult 데이터셋 가져오기
adult = fetch_openml(name='adult', version=1, as_frame=True)

# 특성과 타겟 분리
X = adult.data
y = adult.target
y.name = 'income'

# 타겟 변수를 이진 형태로 변환
y = y.map({'<=50K': 0, '>50K': 1})

print("데이터셋 로드 완료")
print("특성 데이터 형태:", X.shape)
print("타겟 데이터 형태:", y.shape)

데이터셋 로드 완료
특성 데이터 형태: (48842, 14)
타겟 데이터 형태: (48842,)


로컬 모드 테스트

In [8]:
# 테스트 데이터 분리
X_tmp, X_test, y_tmp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 결측치 처리
X_tmp = X_tmp.replace('?', np.nan)

# 범주형 변수와 수치형 변수 구분
categorical_features = X_tmp.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X_tmp.select_dtypes(include=['int64', 'float64']).columns.tolist()

# 범주형 변수에 'Unknown' 카테고리 추가 및 결측치 처리
for feature in categorical_features:
    X_tmp[feature] = X_tmp[feature].astype('category')
    X_tmp[feature] = X_tmp[feature].cat.add_categories('Unknown')
    X_tmp[feature] = X_tmp[feature].fillna('Unknown')

# 수치형 특성의 결측치는 중앙값으로 대체
for feature in numeric_features:
    X_tmp[feature] = X_tmp[feature].fillna(X_tmp[feature].median())

# 훈련 / 검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=2024)

# 범주형 컬럼 레이블 인코딩
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    X_train[feature] = le.fit_transform(X_train[feature])
    X_val[feature] = le.transform(X_val[feature])
    label_encoders[feature] = le

# 표준화
print("특성 표준화 중")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# PCA 적용
pca = PCA(n_components=0.9)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)

# PCA 결과를 DataFrame으로 변환
train_data = pd.concat([pd.Series(y_train, name='income'), pd.DataFrame(X_train_pca, columns=[f'PC{i+1}' for i in range(X_train_pca.shape[1])], index=X_train.index)], axis=1)
val_data = pd.concat([pd.Series(y_val, name='income'), pd.DataFrame(X_val_pca, columns=[f'PC{i+1}' for i in range(X_val_pca.shape[1])], index=X_val.index)], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)

print("PCA 전처리 완료")

특성 표준화 중
PCA 전처리 완료


In [9]:
train_data.head()

Unnamed: 0,income,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12
19357,0,0.847677,0.740701,0.314253,-0.963488,-0.074096,-0.341373,0.935778,0.184683,0.1082,0.357192,-0.809655,1.010002
23471,1,-0.537448,0.173578,0.179786,-1.993763,-0.092885,-0.43351,0.578782,0.163811,0.354912,0.830392,-0.461574,0.941602
20840,0,-0.543073,-0.461562,-0.637295,-0.217027,0.751043,-0.308462,0.247126,-0.514735,-0.101227,-1.44315,0.159678,-0.567373
22208,1,2.419338,0.470762,-0.8268,-0.074531,-1.565616,0.097889,2.119097,-0.697977,1.627574,0.389173,1.86468,-0.27761
9849,0,1.556354,0.864994,0.135481,-1.021607,-0.53051,0.281886,-1.656057,-0.083072,1.124826,-0.772528,0.169173,-1.080851


In [10]:
# XGBoost 모델 데이터 입력형식으로 변환
d_train =xgb.DMatrix(train_data.iloc[:, 1:], label=train_data.iloc[:, 0])
d_val = xgb.DMatrix(val_data.iloc[:, 1:], label=val_data.iloc[:, 0])

> ##### 주요 하이퍼파라미터

| 분류 | 파라미터 | 설명 |
|------|----------|------|
| 학습 관련 | max_depth | 트리의 최대 깊이 |
| | learning_rate | 학습률 (eta) |
| | n_estimators | 생성할 트리의 개수 |
| 정규화 관련 | reg_alpha | L1 정규화 파라미터 |
| | reg_lambda | L2 정규화 파라미터 |
| 샘플링 관련 | subsample | 각 트리마다 사용할 샘플의 비율 |
| | colsample_bytree | 각 트리마다 사용할 특성의 비율 |
| 조기 종료 관련 | early_stopping_rounds | 성능 개선이 없을 때 조기 종료할 라운드 수 |
| 기타 | objective | 목적 함수 ('binary:logistic' 등) |
| | eval_metric | 평가 지표 ('auc', 'error' 등) |
| | random_state | 랜덤 시드 |


In [11]:
# XGBoost 모델 파라미터 설정
params = {
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 200,
    'reg_alpha': 1,
    'reg_lambda': 1,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'objective': 'binary:logistic',
    'eval_metric': 'auc'
}

# 모델 훈련
num_round = 200
watchlist = [(d_train, '훈련'), (d_val, '검증')]
xgb_model = xgb.train(params, d_train, num_round, watchlist, early_stopping_rounds=20, verbose_eval=10)

# 모델 저장
xgb_model.save_model('xgboost_model.json')

print("모델 훈련이 완료되었습니다.")


[0]	훈련-auc:0.67304	검증-auc:0.67756


[10]	훈련-auc:0.85496	검증-auc:0.85127
[20]	훈련-auc:0.87169	검증-auc:0.86833
[30]	훈련-auc:0.87973	검증-auc:0.87590
[40]	훈련-auc:0.88553	검증-auc:0.88120
[50]	훈련-auc:0.88867	검증-auc:0.88363
[60]	훈련-auc:0.89151	검증-auc:0.88629
[70]	훈련-auc:0.89381	검증-auc:0.88821
[80]	훈련-auc:0.89527	검증-auc:0.88931
[90]	훈련-auc:0.89650	검증-auc:0.88986
[100]	훈련-auc:0.89780	검증-auc:0.89034
[110]	훈련-auc:0.89890	검증-auc:0.89117
[120]	훈련-auc:0.90008	검증-auc:0.89174
[130]	훈련-auc:0.90099	검증-auc:0.89203
[140]	훈련-auc:0.90214	검증-auc:0.89274
[150]	훈련-auc:0.90315	검증-auc:0.89317
[160]	훈련-auc:0.90420	검증-auc:0.89317
[170]	훈련-auc:0.90535	검증-auc:0.89363
[180]	훈련-auc:0.90659	검증-auc:0.89439
[190]	훈련-auc:0.90746	검증-auc:0.89454
[199]	훈련-auc:0.90806	검증-auc:0.89486
모델 훈련이 완료되었습니다.


In [12]:
# 테스트 데이터 분리
X_tmp, X_test, y_tmp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 결측치 처리
X_test = X_test.replace('?', np.nan)

# 범주형 변수와 수치형 변수 구분
categorical_features = X_test.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X_test.select_dtypes(include=['int64', 'float64']).columns.tolist()

# 범주형 변수에 'Unknown' 카테고리 추가 및 결측치 처리
for feature in categorical_features:
    X_test[feature] = X_test[feature].astype('category')
    X_test[feature] = X_test[feature].cat.add_categories('Unknown')
    X_test[feature] = X_test[feature].fillna('Unknown')

# 수치형 특성의 결측치는 중앙값으로 대체
for feature in numeric_features:
    X_test[feature] = X_test[feature].fillna(X_test[feature].median())

# 범주형 컬럼 레이블 인코딩
for feature in categorical_features:
    le = label_encoders[feature]
    # 새로운 카테고리 처리
    new_categories = set(X_test[feature]) - set(le.classes_)
    if new_categories:
        le.classes_ = np.append(le.classes_, list(new_categories))
    X_test[feature] = le.transform(X_test[feature])

# 표준화
X_test_scaled = scaler.transform(X_test)

# PCA 적용
X_test_pca = pca.transform(X_test_scaled)

# PCA 결과를 DataFrame으로 변환
test_data = pd.concat([pd.Series(y_test, name='income'), pd.DataFrame(X_test_pca, columns=[f'PC{i+1}' for i in range(X_test_pca.shape[1])], index=X_test.index)], axis=1)

# XGBoost 입력 형식으로 변환
d_test = xgb.DMatrix(test_data.iloc[:, 1:], label=test_data.iloc[:, 0])

# 예측
y_pred = xgb_model.predict(d_test)

print("테스트 데이터 전처리 완료")


테스트 데이터 전처리 완료


In [13]:
# 이진 분류를 위한 임계값 설정 (예: 0.5)
y_pred_binary = (y_pred > 0.5).astype(int)

# 실제 레이블 가져오기 (d_test에서 레이블 추출)
y_test = d_test.get_label()

# 성능 평가
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)

print("테스트 셋 성능:")
print(f"정확도: {accuracy:.4f}")
print(f"정밀도: {precision:.4f}")
print(f"재현율: {recall:.4f}")
print(f"F1 점수: {f1:.4f}")

테스트 셋 성능:
정확도: 0.8447
정밀도: 0.7450
재현율: 0.5410
F1 점수: 0.6268


### 스크립트 작성

전처리 프로세싱 스크립트 (pca_preprocessing.py)

In [14]:
os.makedirs('script', exist_ok=True)

In [15]:
%%writefile script/pca_preprocessing.py
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle
from glob import glob

# 입력 인수 파싱
parser = argparse.ArgumentParser()
parser.add_argument('--n_components', type=int, default=4)
parser.add_argument('--train-test-split-ratio', type=float, default=0.3)
args, _ = parser.parse_known_args()

# SageMaker 데이터 경로
input_data_path = '/opt/ml/processing/input'
train_data_path = '/opt/ml/processing/train'
validation_data_path = '/opt/ml/processing/validation'
test_data_path = '/opt/ml/processing/test'
asset_path = '/opt/ml/processing/asset'

# 데이터 로드
print("데이터 로드 중")

input_files = glob(os.path.join(input_data_path, '*.csv'))
df = pd.concat([pd.read_csv(file, low_memory=False) for file in input_files])

# 특성과 타겟 분리
print("특성과 타겟 준비 중")
X = df.drop('income', axis=1)
y = df['income']

# 테스트 데이터 분리
X_tmp, X_test, y_tmp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 타겟 변수가 이미 숫자형인지 확인
if y.dtype == 'object':
    # 문자열인 경우에만 매핑 적용
    y = y.map({
        '<=50K': 0,
        '<=50K.': 0,
        '>50K': 1,
        '>50K.': 1
    })
else:
    # 이미 숫자형인 경우 그대로 사용
    print("타겟 변수가 이미 숫자형입니다.")

# 결측치 처리
X_tmp = X_tmp.replace('?', np.nan)

# 범주형 변수와 수치형 변수 구분
categorical_features = X_tmp.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X_tmp.select_dtypes(include=['int64', 'float64']).columns.tolist()

# 범주형 변수에 'Unknown' 카테고리 추가 및 결측치 처리
for feature in categorical_features:
    X_tmp[feature] = X_tmp[feature].astype('category')
    X_tmp[feature] = X_tmp[feature].cat.add_categories('Unknown')
    X_tmp[feature] = X_tmp[feature].fillna('Unknown')

# 수치형 특성의 결측치는 중앙값으로 대체
for feature in numeric_features:
    X_tmp[feature] = X_tmp[feature].fillna(X_tmp[feature].median())

# 훈련 / 검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=2024)

# 범주형 컬럼 레이블 인코딩
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    X_train[feature] = le.fit_transform(X_train[feature])
    X_val[feature] = le.transform(X_val[feature])
    label_encoders[feature] = le

# 표준화
print("특성 표준화 중")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# PCA 적용
pca = PCA(n_components=args.n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)

# PCA 결과를 DataFrame으로 변환
train_data = pd.concat([pd.Series(y_train, name='income'), pd.DataFrame(X_train_pca, columns=[f'PC{i+1}' for i in range(X_train_pca.shape[1])], index=X_train.index)], axis=1)
val_data = pd.concat([pd.Series(y_val, name='income'), pd.DataFrame(X_val_pca, columns=[f'PC{i+1}' for i in range(X_val_pca.shape[1])], index=X_val.index)], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)

# 전처리 데이터 저장
print("결과 저장 중")
train_file_path = os.path.join(train_data_path, "train.csv")
train_data.to_csv(train_file_path, index=False)
val_file_path = os.path.join(validation_data_path, "validation.csv")
val_data.to_csv(val_file_path, index=False)
test_file_path = os.path.join(test_data_path, "test.csv")
test_data.to_csv(test_file_path, index=False)

# 에셋 저장
with open(f'{asset_path}/adult_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)
with open(f'{asset_path}/adult_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open(f'{asset_path}/adult_pca.pkl', 'wb') as f:
    pickle.dump(pca, f)
    
print("PCA 전처리 완료")

Overwriting script/pca_preprocessing.py


모델 훈련 스크립트 (train.py)

In [16]:
%%writefile script/train.py
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import pickle as pkl
from glob import glob

def main(args):  
    # 훈련 / 검증 데이터 로드
    train_files = glob(os.path.join(args.train, "*.csv"))
    print(f"훈련 데이터 파일 목록: {train_files}")
    train_data = pd.concat([pd.read_csv(file) for file in train_files], ignore_index=True)
    val_files = glob(os.path.join(args.validation, "*.csv"))
    print(f"검증 데이터 파일 목록: {val_files}")
    val_data = pd.concat([pd.read_csv(file) for file in val_files], ignore_index=True)

    # XGBoost 입력 데이터 형식 변환
    X_train = train_data.iloc[:, 1:]
    y_train = train_data.iloc[:, 0].astype(int)  # 타겟 변수를 정수형으로 변환
    X_val = val_data.iloc[:, 1:]
    y_val = val_data.iloc[:, 0].astype(int)  # 타겟 변수를 정수형으로 변환
    
    d_train = xgb.DMatrix(X_train, label=y_train)
    d_val = xgb.DMatrix(X_val, label=y_val)

    # XGBoost 모델 생성 및 훈련
    watchlist = [(d_train, '훈련'), (d_val, '검증')]
    params = {
        'max_depth': args.max_depth,
        'learning_rate': args.learning_rate,
        'reg_alpha': args.reg_alpha,
        'reg_lambda': args.reg_lambda,
        'subsample': args.subsample,
        'colsample_bytree': args.colsample_bytree,
        'objective': args.objective,
        'eval_metric': args.eval_metric,
    }
    xgb_model = xgb.train(params, d_train, args.num_round, watchlist, early_stopping_rounds=args.early_stopping_rounds, verbose_eval=10)
       
    # 검증 데이터로 성능 평가
    y_pred = xgb_model.predict(d_val)
    y_pred_binary = (y_pred > 0.5).astype(int)

    accuracy = accuracy_score(y_val, y_pred_binary)
    precision = precision_score(y_val, y_pred_binary)
    recall = recall_score(y_val, y_pred_binary)
    f1 = f1_score(y_val, y_pred_binary)

    print(f'검증 정확도: {accuracy:.4f}')
    print(f'검증 정밀도: {precision:.4f}')
    print(f'검증 재현율: {recall:.4f}')
    print(f'검증 F1 점수: {f1:.4f}')

    # 모델 저장
    model_path = os.path.join(args.model_dir, 'xgboost-model')
    pkl.dump(xgb_model, open(model_path, 'wb'))
    print(f'모델이 {model_path}에 저장되었습니다.')

if __name__ == '__main__':
    
    parser = argparse.ArgumentParser()
    # SageMaker 특정 인자 설정 (기본값은 환경 변수에서 가져옴)
    parser.add_argument('--output_data_dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--validation', type=str, default=os.environ.get('SM_CHANNEL_VALIDATION'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    # 하이퍼파라미터 설정
    parser.add_argument('--max-depth', type=int, default=3)
    parser.add_argument('--learning-rate', type=float, default=0.1)
    parser.add_argument('--reg-alpha', type=float, default=0)
    parser.add_argument('--reg-lambda', type=float, default=1)
    parser.add_argument('--subsample', type=float, default=1)
    parser.add_argument('--colsample-bytree', type=float, default=1)
    parser.add_argument('--num-round', type=int, default=200)
    parser.add_argument('--early-stopping-rounds', type=int, default=10)
    parser.add_argument('--objective', type=str, default='binary:logistic')
    parser.add_argument('--eval-metric', type=str, default='auc')
    args, _ = parser.parse_known_args()
    
    main(args)

Overwriting script/train.py


모델 추론 스크립트 (inference.py)

In [17]:
%%writefile script/inference.py
import os
import json
import pickle as pkl
import numpy as np
import xgboost as xgb
import pandas as pd
import io
import boto3

def model_fn(model_dir):
    """XGBoost 모델과 필요한 자산을 `model_dir`에서 로드합니다."""
    # 모델 객체 로드
    model_file = 'xgboost-model'
    xgb_model = pkl.load(open(os.path.join(model_dir, model_file), 'rb'))
    
    # S3에서 asset 파일을 로컬로 복사
    s3 = boto3.client('s3')
    bucket_name = 'dante-sagemaker' # 본인의 버킷명으로 반드시 수정하세요!
    project_name = 'adult-income-classification-v2'
    
    # 자산 파일 로드
    scaler_key = f'{project_name}/asset/adult_scaler.pkl'
    encoder_key = f'{project_name}/asset/adult_encoders.pkl'
    pca_key = f'{project_name}/asset/adult_pca.pkl'
    
    local_scaler_path = os.path.join(model_dir, 'adult_scaler.pkl')
    local_encoder_path = os.path.join(model_dir, 'adult_encoders.pkl')
    local_pca_path = os.path.join(model_dir, 'adult_pca.pkl')
    
    scaler_obj = s3.get_object(Bucket=bucket_name, Key=scaler_key)
    encoder_obj = s3.get_object(Bucket=bucket_name, Key=encoder_key)
    pca_obj = s3.get_object(Bucket=bucket_name, Key=pca_key)
    
    scaler = pkl.loads(scaler_obj['Body'].read())
    encoders = pkl.loads(encoder_obj['Body'].read())
    pca = pkl.loads(pca_obj['Body'].read())
    
    return xgb_model, (scaler, encoders, pca)

def input_fn(request_body, request_content_type):
    """입력 데이터 페이로드를 파싱합니다."""
    if request_content_type != "text/csv":
        raise ValueError(f"지원되지 않는 컨텐츠 타입입니다: {request_content_type}")
    df = pd.read_csv(io.StringIO(request_body), header=None)
    return df.values

def output_fn(prediction, accept):
    """예측 출력을 포맷팅합니다."""
    if accept != "text/csv":
        raise ValueError(f"지원되지 않는 accept 타입입니다: {accept}")
    return ','.join(map(str, prediction))

def predict_fn(input_data, model):
    """로드된 모델로 예측을 수행합니다."""
    xgb_model, (scaler, encoders, pca) = model
    prep_input_data = preprocess_input_data(input_data, (scaler, encoders, pca))
    dmatrix = xgb.DMatrix(prep_input_data)
    return xgb_model.predict(dmatrix)


def preprocess_input_data(input_data, assets):
    """입력 데이터를 전처리합니다."""
    scaler, encoders, pca = assets
    
    total_cols = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
    numeric_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week']

    
    X = pd.DataFrame(input_data, columns=total_cols)

    # 전처리
    X[X == '?'] = np.nan
    X['workclass'].fillna(X['workclass'].mode()[0], inplace=True)
    X['occupation'].fillna(X['occupation'].mode()[0], inplace=True)
    X['native-country'].fillna(X['native-country'].mode()[0], inplace=True)
    X[numeric_cols] = X[numeric_cols].astype('float64')
    
    # 범주형 컬럼 레이블 인코딩
    for feature in encoders.keys() :
        le = encoders[feature]
        X[feature] = X[feature].astype(str)
        # 인코더 업데이트
        unique_values = np.unique(X[feature])
        le.classes_ = np.unique(np.concatenate([le.classes_, unique_values]))
        # 변환 처리
        X[feature] = le.transform(X[feature])

    # 스케일링
    X_scaled = scaler.transform(X)

    # PCA 차원축소
    X_pca = pca.transform(X_scaled)
    
    return pd.DataFrame(X_pca, columns=[f'PC{i}' for i in range(1, pca.n_components_ + 1)])
   

Overwriting script/inference.py


### SageMaker 컨테이너 실행 코드

S3 경로 설정

In [18]:
bucket_name = 'dante-sagemaker' # 본인의 버킷명으로 반드시 수정하세요!
project_name = 'adult-income-classification-v2'


origin_file_path = f's3://{bucket_name}/{project_name}/origin'
input_path = f's3://{bucket_name}/{project_name}/input'
output_path = f's3://{bucket_name}/{project_name}/output'
model_path = f's3://{bucket_name}/{project_name}/model'
asset_path = f's3://{bucket_name}/{project_name}/asset'
checkpoint_path = f's3://{bucket_name}/{project_name}/checkpoints'
pca_path = f's3://{bucket_name}/{project_name}/pca'

train_path = f'{input_path}/train/'
val_path = f'{input_path}/val/'
test_path = f'{input_path}/test/'

print('train_path:', train_path)
print('val_path:', val_path)
print('test_path:', test_path)
print('model_path:', model_path)
print('asset_path:', asset_path)
print('checkpoint_path:', checkpoint_path)
print('pca_path:', pca_path)

train_path: s3://dante-sagemaker/adult-income-classification-v2/input/train/
val_path: s3://dante-sagemaker/adult-income-classification-v2/input/val/
test_path: s3://dante-sagemaker/adult-income-classification-v2/input/test/
model_path: s3://dante-sagemaker/adult-income-classification-v2/model
asset_path: s3://dante-sagemaker/adult-income-classification-v2/asset
checkpoint_path: s3://dante-sagemaker/adult-income-classification-v2/checkpoints
pca_path: s3://dante-sagemaker/adult-income-classification-v2/pca


In [19]:
wr.s3.delete_objects(input_path, boto3_session=boto3_session)
wr.s3.delete_objects(output_path, boto3_session=boto3_session)
wr.s3.delete_objects(model_path, boto3_session=boto3_session)
wr.s3.delete_objects(asset_path, boto3_session=boto3_session)
wr.s3.delete_objects(checkpoint_path, boto3_session=boto3_session)
wr.s3.delete_objects(pca_path, boto3_session=boto3_session)

원본 데이터 S3 업로드

In [20]:
wr.s3.to_csv(pd.concat([y, X], axis=1), f'{origin_file_path}/adult.csv', index=False, boto3_session=boto3_session)

{'paths': ['s3://dante-sagemaker/adult-income-classification-v2/origin/adult.csv'],
 'partitions_values': {}}

전처리 프로세싱

In [21]:
# PCA 처리 단계 정의
sklearn_processor = SKLearnProcessor(
    framework_version='0.23-1',
    role=role,
    sagemaker_session=sagemaker_session,
    instance_type='ml.m5.xlarge',
    instance_count=1,
)

In [22]:
sklearn_processor.run(
    code="script/pca_preprocessing.py",
    inputs=[ProcessingInput(input_name="origin", source=origin_file_path, destination="/opt/ml/processing/input")],
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train/", destination=train_path),
        ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation/", destination=val_path),
        ProcessingOutput(output_name="test", source="/opt/ml/processing/test/", destination=test_path),
        ProcessingOutput(output_name="asset", source="/opt/ml/processing/asset/", destination=asset_path),
    ],
    arguments=[
        "--n_components", "12", 
        "--train-test-split-ratio", "0.3",
    ],
)

INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2024-10-21-11-08-42-813


..........데이터 로드 중
특성과 타겟 준비 중
타겟 변수가 이미 숫자형입니다.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[feature] = le.fit_transform(X_train[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[feature] = le.transform(X_val[feature])
특성 표준화 중
결과 저장 중
PCA 전처리 완료



모델 훈련

In [23]:
use_spot_instances=True
max_run=60*60 # 스팟 인스턴스 최대 실행 시간
max_wait=60*60 # 스팟 인스턴스 최대 대기 시간

In [24]:
# 하이퍼 파라미터 설정
hyperparams = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "binary:logistic",
    "num_round": "200",
    "early_stopping_rounds": "10",
    "eval_metric": "logloss",
}


In [25]:
# 컨테이너 설정
xgb_estimator = XGBoost(
    role=role,
    entry_point="script/train.py",
    framework_version="1.7-1",
    output_path=output_path,
    sagemaker_session=sagemaker_session,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    base_job_name=project_name + "-train",
    max_run=max_run,
    max_wait=max_wait,
    use_spot_instances=use_spot_instances,
    hyperparameters=hyperparams,
)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.xlarge.


In [26]:
# XGBoost 모델 훈련 실행
xgb_estimator.fit(
    inputs={
        "train": TrainingInput(s3_data=train_path, content_type='text/csv'),
        "validation": TrainingInput(s3_data=val_path, content_type='text/csv'),
        "asset": TrainingInput(s3_data=asset_path, content_type='text/csv'),
    },
    wait=True,
    logs=True,
    job_name=name_from_base(project_name + "-train"),
)

print("XGBoost 모델 훈련이 완료되었습니다.")

INFO:sagemaker:Creating training-job with name: adult-income-classification-v2-train-2024-10-21-11-15-28-047


2024-10-21 11:15:29 Starting - Starting the training job...
2024-10-21 11:15:44 Starting - Preparing the instances for training...
2024-10-21 11:16:26 Downloading - Downloading the training image......
2024-10-21 11:17:32 Training - Training image download completed. Training in progress.
2024-10-21 11:17:32 Uploading - Uploading generated training model[2024-10-21 11:17:22.534 ip-10-0-112-90.ap-northeast-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2024-10-21 11:17:22.556 ip-10-0-112-90.ap-northeast-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.
[2024-10-21:11:17:22:INFO] Imported framework sagemaker_xgboost_container.training
[2024-10-21:11:17:22:INFO] No GPUs detected (normal if no gpus installed)
[2024-10-21:11:17:22:INFO] Invoking user training script.
[2024-10-21:11:17:23:INFO] Module train does not provide a setup.py. 
Generating setup.py
[2024-10-21:11:17:23:INFO] Generating setup.cfg
[2024-10-21:11:17:23:INFO

In [27]:
# 훈련된 모델 정보 출력
print(f"훈련된 모델 이름: {xgb_estimator.model_data}")
print(f"훈련 작업 이름: {xgb_estimator.latest_training_job.job_name}")

훈련된 모델 이름: s3://dante-sagemaker/adult-income-classification-v2/output/adult-income-classification-v2-train-2024-10-21-11-15-28-047/output/model.tar.gz
훈련 작업 이름: adult-income-classification-v2-train-2024-10-21-11-15-28-047


모델 추론

In [28]:
# XGBoost 모델 객체 생성
xgb_inf_estimator = XGBoostModel(
    model_data=xgb_estimator.model_data,
    role=role,
    entry_point="script/inference.py",
    framework_version="1.7-1",
    sagemaker_session=sagemaker_session,
)

In [29]:
# 역직렬화 클래스 정의
class CustomDeserializer(CSVDeserializer):
    def deserialize(self, stream, content_type):
        result = super().deserialize(stream, content_type)
        if isinstance(result, list) and len(result) == 1:
            result = result[0]
        result = [float(x) for x in result]
        result = [int(x > 0.5) for x in result]
        return result

In [30]:
# 모델 배포
predictor = xgb_inf_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    serializer=CSVSerializer(),
    deserializer=CustomDeserializer(),
)
# 추론 모델 컨텐츠 타입 설정
predictor.content_type = 'text/csv'
predictor.accept = 'text/csv'

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.xlarge.


INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-10-21-11-20-05-040
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-10-21-11-20-05-606
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-10-21-11-20-05-606


-----!

In [31]:
test_data = wr.s3.read_csv(os.path.join(test_path, 'test.csv'), boto3_session=boto3_session)
X_test, y_test = test_data.iloc[:, 1:], test_data.iloc[:, 0]

In [32]:
X_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,0,Private,423024,HS-grad,9,Never-married,Other-service,Not-in-family,White,Male,0,0,0,United-States
1,0,Private,178953,12th,8,Never-married,Sales,Own-child,White,Female,0,0,0,United-States
2,0,Local-gov,348986,HS-grad,9,Never-married,Handlers-cleaners,Other-relative,Black,Male,0,0,2,United-States
3,0,Private,218215,Some-college,10,Never-married,Sales,Own-child,White,Female,0,0,1,United-States
4,3,Private,244025,HS-grad,9,Never-married,Machine-op-inspct,Unmarried,Amer-Indian-Eskimo,Male,0,0,3,Puerto-Rico


In [33]:
y_test

0       0
1       0
2       0
3       0
4       0
       ..
9764    0
9765    0
9766    0
9767    0
9768    1
Name: income, Length: 9769, dtype: int64

In [34]:
y_preds = predictor.predict(X_test)

예측 평가

In [35]:
# 실제 테스트 데이터의 레이블 (income 열)
y_true = y_test.replace({
    '<=50K': 0,
    '<=50K.': 0,
    '>50K': 1,
    '>50K.': 1
})

# 정확도 계산
accuracy = accuracy_score(y_true, y_preds)
print(f'정확도: {accuracy:.4f}')

# 정밀도 계산
precision = precision_score(y_true, y_preds)
print(f'정밀도: {precision:.4f}')

# 재현율 계산
recall = recall_score(y_true, y_preds)
print(f'재현율: {recall:.4f}')

# F1 점수 계산
f1 = f1_score(y_true, y_preds)
print(f'F1 점수: {f1:.4f}')

# 혼동 행렬 출력
from sklearn.metrics import confusion_matrix
import plotly.graph_objects as go

cm = confusion_matrix(y_true, y_preds)

fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=['예측 0', '예측 1'],
    y=['실제 0', '실제 1'],
    hoverongaps = False,
    text=cm,
    texttemplate="%{text}",
    colorscale='Blues'
))

fig.update_layout(
    title='혼동 행렬',
    xaxis_title='예측',
    yaxis_title='실제'
)

fig.show()


정확도: 0.8434
정밀도: 0.7361
재현율: 0.5461
F1 점수: 0.6270


In [36]:
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2024-10-21-11-20-05-606
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2024-10-21-11-20-05-606
