## PCA 차원축소

라이브러리 준비

In [100]:
import warnings
warnings.filterwarnings('ignore')

In [101]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected=True)

In [102]:
import numpy as np
import pandas as pd
# Display options: show every column and the full content of each cell,
# let pandas auto-detect the terminal width, and cap printed rows at 500.
# (The duplicated 'display.max_colwidth' call was removed.)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
import awswrangler as wr

In [103]:
import os
import pickle
import sagemaker
import boto3
from dotenv import load_dotenv
load_dotenv();

In [104]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

SageMaker 세션 및 역할 설정

In [105]:
# Build the boto3 and SageMaker sessions from the local AWS profile 'awstutor'.
boto3_session = boto3.Session(profile_name='awstutor')
sagemaker_session = sagemaker.Session(boto_session=boto3_session)
# os.environ.get returns None when SAGEMAKER_EXECUTION_ROLE_ARN is not set
# (the environment is populated by load_dotenv() in the import cell).
role = os.environ.get('SAGEMAKER_EXECUTION_ROLE_ARN')

데이터셋 다운로드

In [106]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the OpenML "adult" dataset as pandas objects.
adult = fetch_openml(name='adult', version=1, as_frame=True)

# Features and target; the target series is renamed to 'income'.
X, y = adult.data, adult.target
y.name = 'income'

# Binarise the label: '<=50K' -> 0, '>50K' -> 1.
income_to_label = {'<=50K': 0, '>50K': 1}
y = y.map(income_to_label)

데이터 확인

In [107]:
y.value_counts()

income
0    37155
1    11687
Name: count, dtype: int64

In [108]:
X.describe()

Unnamed: 0,fnlwgt,education-num
count,48842.0,48842.0
mean,189664.1,10.078089
std,105604.0,2.570973
min,12285.0,1.0
25%,117550.5,9.0
50%,178144.5,10.0
75%,237642.0,12.0
max,1490400.0,16.0


In [109]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             48842 non-null  category
 1   workclass       46043 non-null  category
 2   fnlwgt          48842 non-null  int64   
 3   education       48842 non-null  category
 4   education-num   48842 non-null  int64   
 5   marital-status  48842 non-null  category
 6   occupation      46033 non-null  category
 7   relationship    48842 non-null  category
 8   race            48842 non-null  category
 9   sex             48842 non-null  category
 10  capitalgain     48842 non-null  category
 11  capitalloss     48842 non-null  category
 12  hoursperweek    48842 non-null  category
 13  native-country  47985 non-null  category
dtypes: category(12), int64(2)
memory usage: 1.3 MB


In [110]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


S3 경로 설정

In [111]:
bucket_name = 'dante-sagemaker' # be sure to change this to your own bucket name!
project_name = 'adult-income-classification'

In [112]:
# Every artifact of this project lives under a single S3 prefix.
project_root = f's3://{bucket_name}/{project_name}'

input_path = f'{project_root}/input'
output_path = f'{project_root}/output'
model_path = f'{project_root}/model'
asset_path = f'{project_root}/asset'
script_path = f'{project_root}/script'
checkpoint_path = f'{project_root}/checkpoints'
pca_path = f'{project_root}/pca'

# Dataset objects below the input prefix.
train_path = f'{input_path}/train/train.recordio'
val_path = f'{input_path}/val/val.recordio'
test_path = f'{input_path}/test/test.csv'

# Echo the resolved locations, one per line.
resolved_locations = [
    ('train_path', train_path),
    ('val_path', val_path),
    ('test_path', test_path),
    ('model_path', model_path),
    ('asset_path', asset_path),
    ('checkpoint_path', checkpoint_path),
    ('script_path', script_path),
    ('pca_path', pca_path),
]
for label, location in resolved_locations:
    print(f'{label}:', location)

train_path: s3://dante-sagemaker/adult-income-classification/input/train/train.recordio
val_path: s3://dante-sagemaker/adult-income-classification/input/val/val.recordio
test_path: s3://dante-sagemaker/adult-income-classification/input/test/test.csv
model_path: s3://dante-sagemaker/adult-income-classification/model
asset_path: s3://dante-sagemaker/adult-income-classification/asset
checkpoint_path: s3://dante-sagemaker/adult-income-classification/checkpoints
script_path: s3://dante-sagemaker/adult-income-classification/script
pca_path: s3://dante-sagemaker/adult-income-classification/pca


테스트 데이터 분리 및 S3 업로드

In [113]:
# Hold out 10% of the rows as the test set (fixed seed); the remaining rows
# (X_tmp, y_tmp) are split into train/validation in a later cell.
X_tmp, X_test, y_tmp, y_test = train_test_split(X, y, test_size = 0.1, random_state = 2024)

In [114]:
# Store the raw test split as CSV on S3: label first, then the feature columns.
test_df = pd.concat([y_test, X_test], axis=1)
wr.s3.to_csv(test_df, test_path, index=False, boto3_session=boto3_session)

{'paths': ['s3://dante-sagemaker/adult-income-classification/input/test/test.csv'],
 'partitions_values': {}}

데이터 전처리

In [115]:
# Replace '?' placeholders with NaN. mask() returns a new frame, so the
# object handed back by train_test_split is not modified in place.
X_tmp = X_tmp.mask(X_tmp == '?')

In [144]:
# 범주형 데이터 열 이름 확인
X_tmp.select_dtypes(include=['object', 'category']).columns

Index(['age', 'workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'capitalgain', 'capitalloss',
       'hoursperweek', 'native-country'],
      dtype='object')

In [117]:
# workclass, occupation and native-country contain missing values; fill each
# with its most frequent value. The filled series is assigned back to the
# column because `X_tmp[col].fillna(..., inplace=True)` is chained assignment:
# pandas deprecates it and it has no effect under Copy-on-Write.
for col in ['workclass', 'occupation', 'native-country']:
    X_tmp[col] = X_tmp[col].fillna(X_tmp[col].mode()[0])
X_tmp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43957 entries, 25825 to 7816
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             43957 non-null  category
 1   workclass       43957 non-null  category
 2   fnlwgt          43957 non-null  int64   
 3   education       43957 non-null  category
 4   education-num   43957 non-null  int64   
 5   marital-status  43957 non-null  category
 6   occupation      43957 non-null  category
 7   relationship    43957 non-null  category
 8   race            43957 non-null  category
 9   sex             43957 non-null  category
 10  capitalgain     43957 non-null  category
 11  capitalloss     43957 non-null  category
 12  hoursperweek    43957 non-null  category
 13  native-country  43957 non-null  category
dtypes: category(12), int64(2)
memory usage: 1.5 MB


In [118]:
# Split the remaining rows 70/30 into training and validation sets (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, test_size = 0.3, random_state = 2024)

In [119]:
# One fitted encoder per categorical column, kept so it can be reused later.
feature_encoders = {}

# Columns whose dtype is object or category.
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns

# Label-encode each categorical column.
for col in categorical_cols:
    # A fresh encoder for this column.
    encoder = LabelEncoder()
    
    # Fit on the training split only and replace the column with integer codes.
    X_train[col] = encoder.fit_transform(X_train[col])
    
    # Remember the fitted encoder under the column name.
    feature_encoders[col] = encoder
    
    # Apply the same mapping to the validation split (the test split is not
    # encoded in this cell). NOTE(review): LabelEncoder.transform raises
    # ValueError for a value it did not see during fit — confirm every
    # validation category also occurs in the training split.
    X_val[col] = encoder.transform(X_val[col])

In [120]:
# Standardise the numeric columns with StandardScaler.
scaler = StandardScaler()

# Select int64/float64 columns. NOTE(review): this runs after label encoding,
# and the X_train.info() output that follows lists all 14 columns as float64,
# so the encoded categorical columns appear to be scaled as well — confirm
# that this is intended.
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

# Fit the scaler on the training split and transform it.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])

# Transform the validation split with the statistics learned from training.
X_val[numeric_cols] = scaler.transform(X_val[numeric_cols])

In [121]:
# 숫자형 데이터로 모두 변환되었는지 확인
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30769 entries, 28878 to 30753
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             30769 non-null  float64
 1   workclass       30769 non-null  float64
 2   fnlwgt          30769 non-null  float64
 3   education       30769 non-null  float64
 4   education-num   30769 non-null  float64
 5   marital-status  30769 non-null  float64
 6   occupation      30769 non-null  float64
 7   relationship    30769 non-null  float64
 8   race            30769 non-null  float64
 9   sex             30769 non-null  float64
 10  capitalgain     30769 non-null  float64
 11  capitalloss     30769 non-null  float64
 12  hoursperweek    30769 non-null  float64
 13  native-country  30769 non-null  float64
dtypes: float64(14)
memory usage: 3.5 MB


In [122]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
28878,-1.36751,-0.093884,-0.150088,1.214871,-0.027163,0.912707,-0.290514,0.971772,0.391761,0.704144,-0.269024,-0.204754,-2.17658,0.260168
4832,0.947673,-0.093884,-1.514669,0.184108,-0.415719,-0.411967,-1.046313,-0.901651,0.391761,0.704144,-0.269024,-0.204754,0.054492,0.260168
31657,1.719401,2.60082,0.678506,-1.362036,-2.358501,-0.411967,1.72495,-0.901651,0.391761,0.704144,-0.269024,-0.204754,-1.061044,0.260168
27870,-1.36751,-0.093884,0.288244,1.214871,-0.027163,0.912707,-0.542447,0.971772,0.391761,0.704144,-0.269024,-0.204754,1.170027,0.260168
26820,-1.36751,-0.093884,-0.676139,1.214871,-0.027163,0.912707,1.221084,0.971772,-1.981795,-1.420164,-0.269024,-0.204754,-2.17658,0.260168


In [123]:
# Persist the fitted encoders and scaler locally so they can be reloaded
# when the test set is preprocessed.
os.makedirs('dataset', exist_ok=True)
artifacts_to_save = {
    'dataset/adult_encoders.pkl': feature_encoders,
    'dataset/adult_scaler.pkl': scaler,
}
for artifact_file, artifact in artifacts_to_save.items():
    with open(artifact_file, 'wb') as f:
        pickle.dump(artifact, f)

In [124]:
# Upload the encoder and scaler pickles to the asset prefix.
# S3 URIs always use '/', so they are built with f-strings rather than
# os.path.join, whose separator depends on the local operating system.
encoders_s3_path = f'{asset_path}/adult_encoders.pkl'
wr.s3.upload('dataset/adult_encoders.pkl', encoders_s3_path, boto3_session=boto3_session)
scaler_s3_path = f'{asset_path}/adult_scaler.pkl'
wr.s3.upload('dataset/adult_scaler.pkl', scaler_s3_path, boto3_session=boto3_session)

### 로컬 모드
---

차원 축소

In [125]:
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
28878,-1.367510,-0.093884,-0.150088,1.214871,-0.027163,0.912707,-0.290514,0.971772,0.391761,0.704144,-0.269024,-0.204754,-2.176580,0.260168
4832,0.947673,-0.093884,-1.514669,0.184108,-0.415719,-0.411967,-1.046313,-0.901651,0.391761,0.704144,-0.269024,-0.204754,0.054492,0.260168
31657,1.719401,2.600820,0.678506,-1.362036,-2.358501,-0.411967,1.724950,-0.901651,0.391761,0.704144,-0.269024,-0.204754,-1.061044,0.260168
27870,-1.367510,-0.093884,0.288244,1.214871,-0.027163,0.912707,-0.542447,0.971772,0.391761,0.704144,-0.269024,-0.204754,1.170027,0.260168
26820,-1.367510,-0.093884,-0.676139,1.214871,-0.027163,0.912707,1.221084,0.971772,-1.981795,-1.420164,-0.269024,-0.204754,-2.176580,0.260168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19310,0.175946,2.600820,-0.276585,0.441799,1.527062,-0.411967,0.717218,-0.901651,0.391761,0.704144,-0.269024,-0.204754,1.170027,0.260168
215,0.175946,-0.093884,4.227727,-0.331274,1.138506,0.912707,-0.794380,-0.277177,-1.981795,0.704144,-0.269024,-0.204754,1.170027,0.260168
8798,-0.595782,-0.093884,0.793315,1.214871,-0.027163,0.912707,0.213352,0.971772,0.391761,0.704144,-0.269024,3.405613,-2.176580,0.260168
35323,1.719401,-0.093884,-0.017680,-0.588964,0.361393,2.237381,0.717218,1.596247,0.391761,-1.420164,5.090387,-0.204754,0.054492,0.260168


In [126]:
# A float n_components asks scikit-learn's PCA to keep as many components as
# are needed to explain that fraction (here 90%) of the variance.
pca = PCA(n_components=0.9)
# Fit on the training split only, then project validation with the same components.
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

In [127]:
pca.explained_variance_ratio_

array([0.14994353, 0.10247411, 0.08202965, 0.07952118, 0.07482676,
       0.07341251, 0.07094928, 0.06749229, 0.06311276, 0.06087873,
       0.05825584, 0.04782387])

In [128]:
print(f"차원축소 : {X_train.shape} -> {X_train_pca.shape}")
print(f"차원축소 : {X_val.shape} -> {X_val_pca.shape}")

차원축소 : (30769, 14) -> (30769, 12)
차원축소 : (13188, 14) -> (13188, 12)


모델 훈련

In [129]:
# Create and train an XGBoost classifier on the PCA-projected training data.
xgb_model = XGBClassifier(random_state=2024)
xgb_model.fit(X_train_pca, y_train)

# Predict the validation split.
y_val_pred = xgb_model.predict(X_val_pca)

# Report validation accuracy.
accuracy = accuracy_score(y_val, y_val_pred)
print(f"XGBoost 모델의 정확도: {accuracy:.4f}")

XGBoost 모델의 정확도: 0.8396


시각화

In [130]:
# Confusion-matrix heatmap of the validation predictions, drawn with plotly.
import plotly.graph_objects as go
import numpy as np

# Actual labels (integer category codes) against the predicted labels.
y_val_flat = y_val.cat.codes
y_val_pred_flat = y_val_pred  # already one-dimensional
confusion_matrix = pd.crosstab(y_val_flat, y_val_pred_flat, rownames=['실제'], colnames=['예측'])

# Heatmap trace.
heatmap = go.Heatmap(
    z=confusion_matrix.values,
    x=confusion_matrix.columns,
    y=confusion_matrix.index,
    colorscale='Viridis',
)
fig = go.Figure(data=heatmap)

# Title and axis labels.
fig.update_layout(title='혼동 행렬', xaxis_title='예측 값', yaxis_title='실제 값')

# One text annotation per cell: white text on cells below half of the
# largest count, black text otherwise.
half_of_max = confusion_matrix.values.max() / 2
annotations = [
    dict(
        x=confusion_matrix.columns[j],
        y=confusion_matrix.index[i],
        text=str(value),
        showarrow=False,
        font=dict(color='white' if value < half_of_max else 'black'),
    )
    for i, row in enumerate(confusion_matrix.values)
    for j, value in enumerate(row)
]
fig.update_layout(annotations=annotations)

# Render the figure.
fig.show()

In [131]:
# Visualise the importance XGBoost assigns to each principal component.
feature_importance = xgb_model.feature_importances_
feature_names = [f'PC{i+1}' for i in range(len(feature_importance))]

# Sort by importance. np.argsort orders ascending, so the last entry is the
# most important component (the original comment said "descending").
sorted_idx = np.argsort(feature_importance)
sorted_feature_names = [feature_names[i] for i in sorted_idx]
sorted_feature_importance = feature_importance[sorted_idx]

# Horizontal bar chart: importance on x, component name on y.
fig = go.Figure(go.Bar(
    x=sorted_feature_importance,
    y=sorted_feature_names,
    orientation='h'
))

# Title, axis labels and figure size.
fig.update_layout(
    title='XGBoost 모델의 특성 중요도',
    xaxis_title='중요도',
    yaxis_title='주성분',
    height=600,
    width=800
)

# Render the figure.
fig.show()


### SageMaker 클라우드 모드
---

데이터 업로드

In [132]:
# import io
# import numpy as np
# import sagemaker.amazon.common as smac
# import boto3

# def numpy_to_recordio_protobuf(array):
#     buffer = io.BytesIO()
#     smac.write_numpy_to_dense_tensor(buffer, array)
#     return buffer.getvalue()

# # S3 클라이언트 생성
# s3_client = boto3.client('s3')

# # 버킷 이름과 파일 키(경로) 설정
# bucket_name = 'your-bucket-name'
# train_key = 'path/to/train_data.recordio'
# val_key = 'path/to/val_data.recordio'

# # 데이터를 RecordIO-protobuf 형식으로 변환
# train_data_recordio = numpy_to_recordio_protobuf(X_train.values)
# val_data_recordio = numpy_to_recordio_protobuf(X_val.values)

# # S3에 업로드
# s3_client.put_object(Body=train_data_recordio, Bucket=bucket_name, Key=train_key)
# s3_client.put_object(Body=val_data_recordio, Bucket=bucket_name, Key=val_key)

In [133]:
from sagemaker.amazon.common import RecordSerializer
# Serialize the preprocessed feature matrices (features only, no labels are
# passed) with RecordSerializer and upload them to train_path / val_path.
serializer = RecordSerializer()
wr.s3.upload(serializer.serialize(X_train.values), train_path, boto3_session=boto3_session)
wr.s3.upload(serializer.serialize(X_val.values), val_path, boto3_session=boto3_session)

PCA 빌트인 모델 및 하이퍼파라미터 설정

In [134]:
# Image URI of the built-in PCA algorithm for the session's region.
container = sagemaker.image_uris.retrieve('pca', sagemaker_session.boto_region_name)
# Training-job settings shared by the estimators defined in later cells.
use_spot_instances = True
max_run = 60 * 60 # 1 hour
max_wait = 60 * 60 # 1 hour
training_instance_type = "ml.m5.xlarge"
# NOTE(review): content_type does not appear to be referenced by any later cell.
content_type = 'text/csv'

In [135]:
from sagemaker.utils import name_from_base

# Estimator for the built-in PCA image.
# base_job_name is only a prefix: the SDK appends its own timestamp when it
# creates the training job, so wrapping it in name_from_base() produced a job
# name with two timestamps. Passing project_name directly also matches how
# xgb_estimator is configured. name_from_base stays imported because later
# cells use it for endpoint names.
pca_estimator = sagemaker.estimator.Estimator(
    container,
    role,
    sagemaker_session=sagemaker_session,
    instance_count=1,
    instance_type=training_instance_type,
    output_path=output_path,
    base_job_name=project_name,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
)

In [136]:
# https://docs.aws.amazon.com/ko_kr/sagemaker/latest/dg/PCA-reference.html
# feature_dim matches the 14 columns of X_train and num_components matches the
# 12 components kept by the local scikit-learn PCA above.
# subtract_mean is disabled; the features were already passed through
# StandardScaler in the preprocessing section.
pca_estimator.set_hyperparameters(
    feature_dim=14,
    num_components=12,
    subtract_mean=False,
    algorithm_mode='regular',
    mini_batch_size=200
)

In [137]:
pca_estimator.fit({'train': train_path})

INFO:sagemaker:Creating training-job with name: adult-income-classification-2024-10-21--2024-10-21-10-51-03-354


2024-10-21 10:51:04 Starting - Starting the training job...
2024-10-21 10:51:19 Starting - Preparing the instances for training...
2024-10-21 10:51:55 Downloading - Downloading the training image......
2024-10-21 10:53:01 Training - Training image download completed. Training in progress..Docker entrypoint called with argument(s): train
Running default environment configuration script
  if num_device is 1 and 'dist' not in kvstore:
[10/21/2024 10:53:11 INFO 139668033648448] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'algorithm_mode': 'regular', 'subtract_mean': 'true', 'extra_components': '-1', 'force_dense': 'true', 'epochs': 1, '_log_level': 'info', '_kvstore': 'dist_sync', '_num_kv_servers': 'auto', '_num_gpus': 'auto'}
[10/21/2024 10:53:11 INFO 139668033648448] Merging with provided configuration from /opt/ml/input/config/hyperparameters.json: {'algorithm_mode': 'regular', 'feature_dim': '14', 'mini_batch_size'

차원축소 엔드포인트 생성

In [138]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
import pandas as pd

class CustomJSONDeserializer(JSONDeserializer):
    """JSON deserializer that turns the PCA endpoint response into a DataFrame."""

    def deserialize(self, data, content_type):
        """Parse the response and return its projections as a DataFrame.

        The parsed JSON is expected to hold a 'projections' list whose entries
        each carry a 'projection' vector; the vectors become rows and the
        columns are named PC1..PCk after the length of the first vector.
        """
        payload = super().deserialize(data, content_type)
        rows = []
        for record in payload['projections']:
            rows.append(record['projection'])
        n_components = len(rows[0])
        column_names = [f'PC{idx}' for idx in range(1, n_components + 1)]
        return pd.DataFrame(rows, columns=column_names)


# Deploy the trained PCA model: CSV requests in, DataFrame of projections out.
pca_endpoint_name = name_from_base(project_name + '-pca')
pca_predictor = pca_estimator.deploy(
    initial_instance_count=1,
    instance_type=training_instance_type,
    endpoint_name=pca_endpoint_name,
    serializer=CSVSerializer(),
    deserializer=CustomJSONDeserializer(),
)

INFO:sagemaker:Creating model with name: adult-income-classification-2024-10-21--2024-10-21-10-53-51-445


INFO:sagemaker:Creating endpoint-config with name adult-income-classification-pca-2024-10-21-10-53-51-445
INFO:sagemaker:Creating endpoint with name adult-income-classification-pca-2024-10-21-10-53-51-445


------!

차원축소 실행

* 훈련, 검증 데이터셋

In [139]:
def project_in_batches(predictor, frame, batch_size=10000):
    """Send ``frame`` to ``predictor`` in row batches and stack the results.

    Args:
        predictor: object whose ``predict(ndarray)`` returns a DataFrame
            (here the PCA endpoint predictor).
        frame: DataFrame of preprocessed features.
        batch_size: maximum number of rows sent per request.

    Returns:
        A single DataFrame with the batch results concatenated in order and
        a fresh 0..n-1 index.
    """
    projected = [
        predictor.predict(frame.iloc[start:start + batch_size].values)
        for start in range(0, len(frame), batch_size)
    ]
    return pd.concat(projected, ignore_index=True)


# Rows per endpoint request.
batch_size = 10000

# The same helper handles both splits, replacing two copy-pasted loops that
# also reused the result names as temporary lists.
X_train_pca = project_in_batches(pca_predictor, X_train, batch_size)
X_val_pca = project_in_batches(pca_predictor, X_val, batch_size)

* 테스트 데이터셋

In [140]:
# Download the held-out test set: the first column is the label, the rest
# are the raw features.
test_df = wr.s3.read_csv(test_path, boto3_session=boto3_session)
# Copy the feature slice so the preprocessing in the following cells works on
# an independent frame instead of a slice of test_df.
X_test = test_df.iloc[:, 1:].copy()
y_test = test_df.iloc[:, 0]

In [141]:
# Fetch the fitted encoders and scaler from the asset prefix.
# S3 URIs always use '/', so they are built with f-strings rather than
# os.path.join, whose separator depends on the local operating system.
wr.s3.download(f'{asset_path}/adult_encoders.pkl', 'dataset/adult_encoders.pkl', boto3_session=boto3_session)
wr.s3.download(f'{asset_path}/adult_scaler.pkl', 'dataset/adult_scaler.pkl', boto3_session=boto3_session)
# NOTE: pickle.load executes code embedded in the file; only load artifacts
# that this project wrote to the bucket itself.
with open('dataset/adult_encoders.pkl', 'rb') as f:
    feature_encoders = pickle.load(f)
with open('dataset/adult_scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

In [142]:
# Preprocessing, mirroring the steps applied to the training data.
# Replace '?' placeholders with NaN; mask() returns a new frame instead of
# writing into a slice of test_df.
X_test = X_test.mask(X_test == '?')
# workclass, occupation and native-country contain missing values; fill each
# with its most frequent value. The result is assigned back to the column
# because `X_test[col].fillna(..., inplace=True)` is chained assignment, which
# pandas deprecates and which has no effect under Copy-on-Write.
# NOTE(review): the mode is taken from the test set itself, whereas training
# used the mode of the train/validation rows — confirm this is acceptable.
for col in ['workclass', 'occupation', 'native-country']:
    X_test[col] = X_test[col].fillna(X_test[col].mode()[0])

In [146]:
# Encoding / scaling of the test features with the artifacts fitted on training data.
X_test_encoded = X_test.copy()
# Encode every categorical column with the encoder loaded from the pickle.
for col in feature_encoders.keys():
    # Values present in the test column but unknown to the fitted encoder.
    # NOTE(review): the test frame was read back from CSV, so value types may
    # differ from those the encoder was fitted on (e.g. int vs str); in that
    # case every value would count as unseen here — verify.
    unseen_categories = set(X_test_encoded[col]) - set(feature_encoders[col].classes_)
    if unseen_categories:
        # Register unseen values by appending them to classes_. This mutates the
        # loaded encoder. NOTE(review): the appended values are not re-sorted,
        # and the codes they receive were never seen by the scaler or the
        # model — confirm this fallback is acceptable.
        feature_encoders[col].classes_ = np.append(feature_encoders[col].classes_, list(unseen_categories))
    # Replace the column with its integer codes.
    X_test_encoded[col] = feature_encoders[col].transform(X_test_encoded[col])

# Apply the fitted scaler to all columns of the encoded frame.
X_test_scaled = scaler.transform(X_test_encoded)

# Wrap the scaled array in a DataFrame with the original column names.
X_test_processed = pd.DataFrame(X_test_scaled, columns=X_test_encoded.columns)

print("X_test shape:", X_test_processed.shape)
print("y_test shape:", y_test.shape)


X_test shape: (4885, 14)
y_test shape: (4885,)


In [147]:
# Project the preprocessed test features through the PCA endpoint (single request).
X_test_pca = pca_predictor.predict(X_test_processed.values)
X_test_pca.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12
0,-1.854052,-1.611059,1.518132,8.121953,4.125352,-1.661083,-3.113461,-2.954618,0.848511,3.388551,1.463064,7.132072
1,-1.360219,-4.307003,0.849553,7.699219,4.182195,1.559687,-3.648826,-3.052536,1.038673,4.662076,-0.256844,5.532094
2,-1.163707,-2.752522,0.145521,8.630344,2.714113,-0.148636,-2.237233,-3.518494,1.175824,2.531605,3.149301,3.417424
3,-0.646073,-1.727187,1.890225,8.26057,3.107603,-0.159877,-2.474569,-3.881232,2.901913,2.375993,2.844999,2.544449
4,-0.777944,-2.774834,0.531698,8.632375,3.062,0.328825,-2.474372,-3.198385,1.341331,1.405606,1.589867,4.180598


In [136]:
pca_predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: adult-income-classification-pca-2024-08-01-03-11-46-170
INFO:sagemaker:Deleting endpoint with name: adult-income-classification-pca-2024-08-01-03-11-46-170


차원축소 데이터 S3 업로드

In [137]:
import io
import pickle
import numpy as np
from sagemaker.amazon.common import write_numpy_to_dense_tensor

def convert_to_recordio_protobuf(feature_values: np.ndarray, target_values: np.ndarray, s3_path: str):
    """Write features and labels as RecordIO-Protobuf and upload the result to S3.

    Args:
        feature_values: feature matrix handed to write_numpy_to_dense_tensor.
        target_values: label vector handed to write_numpy_to_dense_tensor.
        s3_path: destination S3 URI of the uploaded object.
    """
    # Serialize into an in-memory buffer, then rewind it so the upload reads
    # from the beginning.
    with io.BytesIO() as stream:
        write_numpy_to_dense_tensor(stream, feature_values, target_values)
        stream.seek(0)
        wr.s3.upload(local_file=stream, path=s3_path, boto3_session=boto3_session)

    print(f"데이터가 {s3_path}에 RecordIO-Protobuf 형식으로 저장되었습니다.")

In [149]:
# Upload each PCA-projected split with its labels to <pca_path>/<split>/<split>-pca.recordio.
# S3 URIs always use '/', so they are built with an f-string rather than
# os.path.join, whose separator depends on the local operating system.
pca_splits = [
    ('train', X_train_pca, y_train),
    ('val', X_val_pca, y_val),
    ('test', X_test_pca, y_test),
]
for split_name, split_features, split_labels in pca_splits:
    convert_to_recordio_protobuf(
        split_features.values,
        split_labels.values.flatten(),
        f'{pca_path}/{split_name}/{split_name}-pca.recordio',
    )

데이터가 s3://dante-sagemaker/adult-income-classification/pca/train/train-pca.recordio에 RecordIO-Protobuf 형식으로 저장되었습니다.
데이터가 s3://dante-sagemaker/adult-income-classification/pca/val/val-pca.recordio에 RecordIO-Protobuf 형식으로 저장되었습니다.
데이터가 s3://dante-sagemaker/adult-income-classification/pca/test/test-pca.recordio에 RecordIO-Protobuf 형식으로 저장되었습니다.


XGBoost 빌트인 모델 및 하이퍼파라미터 설정

In [150]:
# Image URI of the built-in XGBoost algorithm (version 1.7-1) for the session's region.
# Note: this rebinds `container`, which previously held the PCA image URI.
container = sagemaker.image_uris.retrieve("xgboost",sagemaker_session.boto_region_name,version="1.7-1")

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [151]:
# Estimator for the built-in XGBoost image, reusing the spot/runtime settings defined for PCA.
xgb_estimator = sagemaker.estimator.Estimator(
    image_uri=container,  # URI of the Docker image to run
    role=role,  # IAM role ARN
    sagemaker_session=sagemaker_session,  # SageMaker session object
    instance_count=1,  # number of training instances
    instance_type=training_instance_type,  # training instance type
    max_run=max_run,  # maximum run time (seconds)
    use_spot_instances=use_spot_instances,  # whether to use spot instances
    max_wait=max_wait,  # maximum time to wait for spot capacity (seconds)
    output_path=output_path,  # where model artifacts are stored
    base_job_name=project_name,  # prefix of the training job name
)

In [152]:
# Hyperparameters for the XGBoost training job: binary logistic objective,
# up to 200 boosting rounds, early stopping after 10 rounds, evaluated with logloss.
xgb_estimator.set_hyperparameters(
    max_depth=5,
    eta=0.1,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    objective='binary:logistic',
    num_round=200,
    early_stopping_rounds=10,
    eval_metric='logloss'
)

In [153]:
# Training and validation channels point at the RecordIO-Protobuf files
# uploaded under pca_path. S3 URIs always use '/', so they are built with
# f-strings rather than os.path.join, whose separator depends on the local
# operating system.
training_input_config = sagemaker.session.TrainingInput(
    s3_data=f'{pca_path}/train',
    content_type='application/x-recordio-protobuf',
)

validation_input_config = sagemaker.session.TrainingInput(
    s3_data=f'{pca_path}/val',
    content_type='application/x-recordio-protobuf',
)

data_channels = {'train': training_input_config, 'validation': validation_input_config}

# Launch the training job and stream its logs.
xgb_estimator.fit(inputs=data_channels, logs=True)

INFO:sagemaker:Creating training-job with name: adult-income-classification-2024-08-01-03-32-59-239


2024-08-01 03:32:59 Starting - Starting the training job...
2024-08-01 03:33:13 Starting - Preparing the instances for training...
2024-08-01 03:33:55 Downloading - Downloading the training image......
2024-08-01 03:34:56 Training - Training image download completed. Training in progress...[2024-08-01 03:35:02.923 ip-10-0-243-133.ap-northeast-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2024-08-01 03:35:02.948 ip-10-0-243-133.ap-northeast-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.
[2024-08-01:03:35:03:INFO] Imported framework sagemaker_xgboost_container.training
[2024-08-01:03:35:03:INFO] Failed to parse hyperparameter eval_metric value logloss to Json.
Returning the value itself
[2024-08-01:03:35:03:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.
Returning the value itself
[2024-08-01:03:35:03:INFO] No GPUs detected (normal if no gpus installed)
[2024-08-01:03:35:03:INFO] Running XGB

In [154]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
import numpy as np

class CustomCSVDeserializer(CSVDeserializer):
    """CSV deserializer that returns the first field of every row as a float array."""

    def deserialize(self, stream, content_type):
        """Parse the CSV response and return a 1-D NumPy array of floats.

        Only the first field of each parsed row is kept.
        """
        result = super().deserialize(stream, content_type)
        return np.array([float(item[0]) for item in result])


# Deploy the trained XGBoost model behind a real-time endpoint.
xgb_predictor = xgb_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge',
    endpoint_name=name_from_base(project_name + "-xgb"),
    # The keyword is `serializer`; the previous spelling `serialize=` is not a
    # deploy() parameter, so it did not configure the predictor's serializer.
    serializer=CSVSerializer(),
    deserializer=CustomCSVDeserializer()
)

INFO:sagemaker:Creating model with name: adult-income-classification-2024-08-01-05-04-42-748
INFO:sagemaker:Creating endpoint-config with name adult-income-classification-xgb-2024-08-01-05-04-42-748
INFO:sagemaker:Creating endpoint with name adult-income-classification-xgb-2024-08-01-05-04-42-748


-------!

In [160]:
# Send the PCA features as a NumPy array, as the other predictor.predict()
# calls in this notebook do, instead of passing the DataFrame itself.
y_probs = xgb_predictor.predict(X_test_pca.values)
# Binarise the predicted probabilities at 0.5. The probabilities keep their own
# name so re-running this cell does not threshold already-binarised labels.
y_preds = [1 if prob > 0.5 else 0 for prob in y_probs]

In [165]:
from sklearn.metrics import classification_report

# Build the per-class precision/recall/F1 report for the test predictions.
report = classification_report(y_test, y_preds)

print("분류 보고서:")
print(report)


분류 보고서:
              precision    recall  f1-score   support

           0       0.86      0.93      0.90      3699
           1       0.73      0.54      0.62      1186

    accuracy                           0.84      4885
   macro avg       0.80      0.74      0.76      4885
weighted avg       0.83      0.84      0.83      4885



In [166]:
xgb_predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: adult-income-classification-xgb-2024-08-01-05-04-42-748
INFO:sagemaker:Deleting endpoint with name: adult-income-classification-xgb-2024-08-01-05-04-42-748
