## SageMaker Autopilot

라이브러리

In [5]:
# 데이터 처리 및 분석
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', None)

# 머신러닝
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# AWS 관련
import sagemaker
from sagemaker.automl.automl import AutoML
from sagemaker.deserializers import CSVDeserializer
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
import boto3
import awswrangler as wr

# 기타 유틸리티
import os
from dotenv import load_dotenv
load_dotenv()

True

SageMaker 세션 및 역할 설정

In [3]:
# Create the AWS session from the named local credentials profile and wrap it
# in a SageMaker session so every later SDK call uses the same credentials.
boto3_session = boto3.Session(profile_name='awstutor')
sagemaker_session = sagemaker.Session(boto_session=boto3_session)

# The execution role ARN is read from the environment (populated by load_dotenv()).
# Fail here with a clear message instead of passing None on to the AutoML job,
# where the resulting error would be far from its cause.
role = os.environ.get('SAGEMAKER_EXECUTION_ROLE_ARN')
if not role:
    raise RuntimeError(
        'SAGEMAKER_EXECUTION_ROLE_ARN is not set; define it in the environment '
        'or in the .env file before running this notebook.'
    )

S3 데이터 저장 위치 설정

In [4]:
# Bucket and project prefix shared by every S3 location used in this notebook.
bucket_name = 'dante-sagemaker'
project_name = 'california-housing'

# Input prefixes for the training and validation CSV files.
s3_training_file_location = f's3://{bucket_name}/{project_name}/input/training/'
s3_validation_file_location = f's3://{bucket_name}/{project_name}/input/validation/'

# Prefixes for job output and checkpoints.
s3_output_location = f's3://{bucket_name}/{project_name}/output/'
s3_checkpoint_location = f's3://{bucket_name}/{project_name}/checkpoint/'

# Echo every resolved location so the reader can verify them at a glance.
for _label, _location in (
    ('s3_training_file_location', s3_training_file_location),
    ('s3_validation_file_location', s3_validation_file_location),
    ('s3_output_location', s3_output_location),
    ('s3_checkpoint_location', s3_checkpoint_location),
):
    print(f'{_label} : ', _location)

s3_training_file_location :  s3://dante-sagemaker/california-housing/input/training/
s3_validation_file_location :  s3://dante-sagemaker/california-housing/input/validation/
s3_output_location :  s3://dante-sagemaker/california-housing/output/
s3_checkpoint_location :  s3://dante-sagemaker/california-housing/checkpoint/


데이터 다운로드 및 S3 업로드

In [8]:
# 데이터 로드
housing = fetch_california_housing(as_frame=True)
df = housing.frame
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [23]:
# 데이터 로드
X, y = df.drop('MedHouseVal', axis=1), df['MedHouseVal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# 데이터 저장
wr.s3.to_csv(pd.concat([y_train, X_train], axis=1), os.path.join(s3_training_file_location, 'train.csv'), index=False, boto3_session=boto3_session)
wr.s3.to_csv(pd.concat([y_test, X_test], axis=1), os.path.join(s3_validation_file_location, 'test.csv'), index=False, boto3_session=boto3_session)

{'paths': ['s3://dante-sagemaker/california-housing/input/validation/test.csv'],
 'partitions_values': {}}

SageMaker Autopilot Job 설정

In [26]:
# Autopilot Job 설정
auto_ml = AutoML(
    role=role,
    sagemaker_session=sagemaker_session,
    base_job_name=project_name + '-auto-pilot',
    target_attribute_name='MedHouseVal',  # 목표 변수 (주택 가격)
    output_path=s3_output_location,
    max_candidates=10,  # 생성할 모델 수 제한
    job_objective={'MetricName': 'MSE'},  # 회귀 모델이므로 MSE(평균 제곱 오차)를 최소화
    problem_type='Regression',
)

# Autopilot Job 시작
auto_ml.fit(
    inputs=os.path.join(s3_training_file_location, 'train.csv'),
    wait=True,  # 완료될 때까지 대기
    logs=True,  # 로그 출력
)

.........................................................................................................................................................................................................................
..

최적 모델 엔드포인트 생성 및 추론

In [29]:
# 최적 모델 가져오기
best_candidate = auto_ml.describe_auto_ml_job()['BestCandidate']
best_candidate_name = best_candidate['CandidateName']

# 추론을 위한 모델 배포
predictor = auto_ml.deploy(candidate_name=best_candidate_name, initial_instance_count=1, instance_type='ml.m5.large')

# 예측 수행
sample_data = df.drop(columns=['MedHouseVal']).head(10)  # 샘플 데이터
predictions = predictor.predict(sample_data.values)

# 결과 출력
print("Predictions: ", predictions)

모델 평가

In [None]:
# 모델 평가
y_pred = predictor.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)

# 정확도 계산
accuracy = np.mean(y_pred == y_test)
print(f'테스트 세트 정확도: {accuracy:.4f}')

# 혼동 행렬 생성
confusion_mtx = confusion_matrix(y_test, y_pred)
print('혼동 행렬:')
print(confusion_mtx)

# 분류 보고서 출력
print('\n분류 보고서:')
print(classification_report(y_test, y_pred))

엔드포인트 삭제

In [None]:
# Delete the inference endpoint behind `predictor` once it is no longer needed
# (presumably to stop hosting charges — confirm against the account's billing).
predictor.delete_endpoint()