라이브러리 준비

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline(connected=True)

In [62]:
from kaggle.api.kaggle_api_extended import KaggleApi
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
import awswrangler as wr

In [63]:
import os
import sagemaker
import boto3
from dotenv import load_dotenv
load_dotenv()

True

SageMaker 세션 및 역할 설정

In [64]:
boto3_session = boto3.Session(profile_name='awstutor')
sagemaker_session = sagemaker.Session(boto_session=boto3_session)
role = os.environ.get('SAGEMAKER_EXECUTION_ROLE_ARN')

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


데이터셋 다운로드

In [65]:
# Kaggle API 인증
api = KaggleApi()
api.authenticate()

# 데이터셋 다운로드 경로 설정
download_path = 'dataset/bike-sharing'
os.makedirs(download_path, exist_ok=True)

# 데이터셋 다운로드
api.dataset_download_files('archit9406/bike-sharing', path=download_path, unzip=True)

print("'archit9406/bike-sharing' 데이터셋이 'dataset/bike-sharing' 폴더에 다운로드되었습니다.")

Dataset URL: https://www.kaggle.com/datasets/archit9406/bike-sharing
'archit9406/bike-sharing' 데이터셋이 'dataset/bike-sharing' 폴더에 다운로드되었습니다.


데이터 로드 및 확인

In [66]:
df = pd.read_csv('dataset/bike-sharing/hour.csv')
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


시계열 인덱스 설정 및 컬럼 정리

In [68]:
# dteday와 hr를 결합하여 datetime 열 생성
df['datetime'] = pd.to_datetime(df['dteday']) + pd.to_timedelta(df['hr'], unit='h')

# datetime을 인덱스로 설정
df.set_index('datetime', inplace=True)
df.yr = df.index.year

# instant와 dteday 컬럼 삭제
df = df.drop(['instant', 'dteday'], axis=1)

df.head()

Unnamed: 0_level_0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2011-01-01 00:00:00,1,2011,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2011-01-01 01:00:00,1,2011,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2011-01-01 02:00:00,1,2011,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
2011-01-01 03:00:00,1,2011,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
2011-01-01 04:00:00,1,2011,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


레이블 컬럼 첫열로 이동

In [11]:
# cnt 컬럼을 맨 처음으로 이동
columns = ['cnt'] + [col for col in df.columns if col != 'cnt']
df = df[columns]

df.head()

Unnamed: 0_level_0,cnt,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2011-01-01 00:00:00,16,1,2011,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13
2011-01-01 01:00:00,40,1,2011,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32
2011-01-01 02:00:00,32,1,2011,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27
2011-01-01 03:00:00,13,1,2011,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10
2011-01-01 04:00:00,1,1,2011,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1


데이터 분리 (훈련 / 검증 / 테스트)

In [12]:
# 데이터를 train, validation, test로 분리
train_size = int(len(df) * 0.7)
val_size = int(len(df) * 0.15)

train_df = df[:train_size]
val_df = df[train_size:train_size+val_size]
test_df = df[train_size+val_size:]

print(f"훈련 데이터 크기: {train_df.shape}")
print(f"검증 데이터 크기: {val_df.shape}")
print(f"테스트 데이터 크기: {test_df.shape}")

훈련 데이터 크기: (12165, 15)
검증 데이터 크기: (2606, 15)
테스트 데이터 크기: (2608, 15)


S3에 데이터 업로드

In [13]:
bucket_name = 'dante-sagemaker'
project_name = 'bike-sharing'

In [14]:
input_path = f's3://{bucket_name}/{project_name}/input'
output_path = f's3://{bucket_name}/{project_name}/output'
model_path = f's3://{bucket_name}/{project_name}/model'
script_path = f's3://{bucket_name}/{project_name}/script'
train_path = f'{input_path}/train/train.csv'
val_path = f'{input_path}/val/val.csv'
test_path = f'{input_path}/test/test.csv'

In [17]:
wr.s3.to_csv(train_df, train_path, index=False, header=False, boto3_session=boto3_session)
wr.s3.to_csv(val_df, val_path, index=False, header=False, boto3_session=boto3_session)
wr.s3.to_csv(test_df, test_path, index=False, header=False, boto3_session=boto3_session)

{'paths': ['s3://dante-sagemaker/bike-sharing/input/test.csv'],
 'partitions_values': {}}

빌트인 모델 및 하이퍼파라미터 설정

In [None]:
container = sagemaker.image_uris.retrieve("xgboost",sagemaker_session.boto_region_name,version="1.7-1")
use_spot_instances = True
max_wait = 3600
max_run = 3600
instance_type = 'ml.m5.xlarge'

In [71]:
container

'366743142698.dkr.ecr.ap-northeast-2.amazonaws.com/sagemaker-xgboost:1.7-1'

In [23]:
estimator = sagemaker.estimator.Estimator(
    image_uri=container,  # 사용할 Docker 이미지의 URI
    role=role,  # IAM 역할 ARN
    sagemaker_session=sagemaker_session,  # SageMaker 세션 객체
    instance_count=1,  # 사용할 인스턴스 수
    instance_type=instance_type,  # 사용할 인스턴스 유형
    max_run=max_run,  # 최대 실행 시간 (초)
    use_spot_instances=use_spot_instances,  # 스팟 인스턴스 사용 여부
    max_wait=max_wait,  # 스팟 인스턴스 대기 최대 시간 (초)
    output_path=output_path,  # 모델 아티팩트 저장 경로
    base_job_name=project_name,  # 훈련 작업 이름의 기본 접두사
)

In [24]:
estimator.set_hyperparameters(max_depth=5,
                              objective="reg:squarederror",
                              eta=0.1,
                              num_round=150)

데이터 경로 지정 및 모델 훈련

In [25]:
training_input_config = sagemaker.session.TrainingInput(
    s3_data=train_path,
    content_type='csv',
    s3_data_type='S3Prefix')

validation_input_config = sagemaker.session.TrainingInput(
    s3_data=val_path,
    content_type='csv',
    s3_data_type='S3Prefix'
)

data_channels = {'train': training_input_config, 'validation': validation_input_config}
estimator.fit(data_channels)

INFO:sagemaker:Creating training-job with name: bike-sharing-2024-07-30-02-29-36-160


2024-07-30 02:29:36 Starting - Starting the training job...
2024-07-30 02:29:52 Starting - Preparing the instances for training...
2024-07-30 02:30:28 Downloading - Downloading the training image.........
2024-07-30 02:32:00 Training - Training image download completed. Training in progress..[2024-07-30 02:32:07.500 ip-10-0-145-135.ap-northeast-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2024-07-30:02:32:07:INFO] Imported framework sagemaker_xgboost_container.training
[2024-07-30:02:32:07:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.
Returning the value itself
[2024-07-30:02:32:07:INFO] No GPUs detected (normal if no gpus installed)
[2024-07-30:02:32:07:INFO] Running XGBoost Sagemaker in algorithm mode
[2024-07-30:02:32:07:INFO] Determined delimiter of CSV input is ','
[2024-07-30:02:32:07:INFO] Determined delimiter of CSV input is ','
[2024-07-30:02:32:07:INFO] Determined delimiter of CSV input is ','
[2024-07-30:02:32:0

추론 엔드포인트 생성

In [46]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
from sagemaker.utils import name_from_base

In [27]:
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m5.xlarge',
                             endpoint_name = name_from_base(project_name),
                             serializer=CSVSerializer(),
                             deserializer=CSVDeserializer(),
                             )

INFO:sagemaker:Creating model with name: bike-sharing-2024-07-30-02-33-14-941
INFO:sagemaker:Creating endpoint-config with name bike-sharing-2024-07-30-02-33-14-941
INFO:sagemaker:Creating endpoint with name bike-sharing-2024-07-30-02-33-14-941


-------!

테스트셋에 대한 예측 요청

In [56]:
y_test = test_df['cnt'].values
y_pred = predictor.predict(CSVSerializer().serialize(test_df.iloc[:, 1:].values))
y_pred = np.array(y_pred).flatten()
y_pred = list(map(float, y_pred))

예측 성능 평가

In [57]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# MSE 계산
mse = mean_squared_error(y_test, y_pred)
print(f'평균 제곱 오차 (MSE): {mse:.4f}')

# RMSE 계산
rmse = np.sqrt(mse)
print(f'평균 제곱근 오차 (RMSE): {rmse:.4f}')

# MAE 계산
mae = mean_absolute_error(y_test, y_pred)
print(f'평균 절대 오차 (MAE): {mae:.4f}')

# R-squared 계산
r2 = r2_score(y_test, y_pred)
print(f'결정 계수 (R-squared): {r2:.4f}')

평균 제곱 오차 (MSE): 103.5098
평균 제곱근 오차 (RMSE): 10.1740
평균 절대 오차 (MAE): 3.2897
결정 계수 (R-squared): 0.9978


예측 성능 시각화

In [59]:
# 산점도 생성
scatter = go.Scatter(x=y_test, y=y_pred, mode='markers', marker=dict(opacity=0.5), name='데이터')

# 예측 일치선 생성
diagonal = go.Scatter(x=[y_test.min(), y_test.max()], y=[y_test.min(), y_test.max()], 
                      mode='lines', line=dict(color='red', dash='dash'), name='예측 일치선')

# 레이아웃 설정
layout = go.Layout(
    title='실제값 vs 예측값',
    xaxis=dict(title='실제값'),
    yaxis=dict(title='예측값'),
    width=800,
    height=600
)

# 그래프 생성 및 표시
fig = go.Figure(data=[scatter, diagonal], layout=layout)
fig.show()


In [60]:
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: bike-sharing-2024-07-30-02-33-14-941
INFO:sagemaker:Deleting endpoint with name: bike-sharing-2024-07-30-02-33-14-941
