## 마켓플레이스 알고리즘 구독

라이브러리

In [25]:
# 데이터 처리 및 분석
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', None)

# AWS 관련
import sagemaker
from sagemaker.utils import name_from_base
import boto3
import awswrangler as wr

# 시각화
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 기타 유틸리티
import os
from dotenv import load_dotenv
load_dotenv()

True

SageMaker 세션 및 역할 설정

In [2]:
# The boto3 session is built from the named local AWS profile and then handed
# to the SageMaker session so both talk to AWS through the same session.
boto3_session = boto3.Session(profile_name='awstutor')
sagemaker_session = sagemaker.Session(boto_session=boto3_session)

# Execution role ARN is read from the environment (None when the variable is unset).
role = os.getenv('SAGEMAKER_EXECUTION_ROLE_ARN')

S3 데이터 저장 위치 설정

In [1]:
# Bucket and project prefix that every S3 location below is derived from.
bucket_name = 'dante-sagemaker'
project_name = 'marketplace-lstm-ae'

# Input prefixes, one per dataset (training / validation / inference).
s3_training_file_location = f's3://{bucket_name}/{project_name}/input/training/'
s3_validation_file_location = f's3://{bucket_name}/{project_name}/input/validation/'
s3_inference_file_location = f's3://{bucket_name}/{project_name}/input/inference/'

# Output prefix.
s3_output_location = f's3://{bucket_name}/{project_name}/output/'

# Echo the resolved locations so they can be checked before anything is uploaded.
print('s3_training_file_location : ', s3_training_file_location)
print('s3_validation_file_location : ', s3_validation_file_location)
print('s3_inference_file_location : ', s3_inference_file_location)
print('s3_output_location : ', s3_output_location)

s3_training_file_location :  s3://dante-sagemaker/marketplace-lstm-ae/input/training/
s3_validation_file_location :  s3://dante-sagemaker/marketplace-lstm-ae/input/validation/
s3_inference_file_location :  s3://dante-sagemaker/marketplace-lstm-ae/input/inference/
s3_output_location :  s3://dante-sagemaker/marketplace-lstm-ae/output/


데이터 다운로드 및 s3 업로드

* 훈련 데이터셋 로드

In [4]:
# Read the training series straight from the public repository URL.
# header=None: the first line is treated as data and columns are numbered 0..N-1.
training_dataset = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/fg-research/lstm-ae-sagemaker/master/data/training/train.csv",
    header=None,
)

In [6]:
# Preview the first rows of the training dataset.
training_dataset.head()

Unnamed: 0,0,1
0,-10.302054,-9.879548
1,-9.761644,-11.455176
2,-10.185059,-12.195165
3,-12.834404,-10.253211
4,-13.341993,-13.015417


In [7]:
# (rows, columns) of the training dataset.
training_dataset.shape

(15000, 2)

In [10]:
# Plot every column of the training dataset in its own vertically stacked subplot.
n_series = training_dataset.shape[1]

# Subplot titles have to be declared when the grid is created: a grid built
# without `subplot_titles` contains no annotations, so calling
# `update_annotations` afterwards silently changes nothing and the titles
# never show up.
fig = make_subplots(
    rows=n_series,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.05,
    subplot_titles=[f"시계열 {i + 1}" for i in range(n_series)],
)

for i in range(n_series):
    fig.add_trace(
        go.Scatter(x=training_dataset.index, y=training_dataset.iloc[:, i], line=dict(color="#AFB8C1", width=1)),
        row=i + 1, col=1
    )
    fig.update_yaxes(title_text="값", row=i + 1, col=1)
    fig.update_xaxes(title_text="시간", row=i + 1, col=1)

# 300 px per subplot keeps each panel readable regardless of the column count.
fig.update_layout(
    height=300 * n_series,
    title_text="훈련 데이터셋",
    showlegend=False
)

fig.show()

* 검증 데이터셋 로드

In [11]:
# Read the validation series straight from the public repository URL.
# header=None: the first line is treated as data and columns are numbered 0..N-1.
validation_dataset = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/fg-research/lstm-ae-sagemaker/master/data/training/valid.csv",
    header=None,
)

In [12]:
# Preview the first rows of the validation dataset.
validation_dataset.head()

Unnamed: 0,0,1
0,-9.728953,-11.809606
1,-11.821012,-12.637485
2,-11.422567,-11.136945
3,-12.757596,-14.022805
4,-13.126208,-12.55809


In [13]:
# (rows, columns) of the validation dataset.
validation_dataset.shape

(5000, 2)

In [15]:
# Plot every column of the validation dataset in its own vertically stacked subplot.
n_series = validation_dataset.shape[1]

# Subplot titles have to be declared when the grid is created: a grid built
# without `subplot_titles` contains no annotations, so calling
# `update_annotations` afterwards silently changes nothing and the titles
# never show up.
fig = make_subplots(
    rows=n_series,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.05,
    subplot_titles=[f"시계열 {i + 1}" for i in range(n_series)],
)

for i in range(n_series):
    fig.add_trace(
        go.Scatter(x=validation_dataset.index, y=validation_dataset.iloc[:, i], line=dict(color="#AFB8C1", width=1)),
        row=i + 1, col=1
    )
    # Axis titles mirror the training and test dataset plots.
    fig.update_yaxes(title_text="값", row=i + 1, col=1)
    fig.update_xaxes(title_text="시간", row=i + 1, col=1)

# 300 px per subplot keeps each panel readable regardless of the column count.
fig.update_layout(
    height=300 * n_series,
    title_text="검증 데이터셋",
    showlegend=False
)

fig.show()

In [17]:
# S3 URIs always use "/" as separator, so the object name is appended with
# plain string handling instead of os.path.join, which joins with the local
# OS separator (a backslash on Windows) and would produce a malformed URI there.
# rstrip('/') makes the result independent of a trailing slash on the prefix.

# Upload the training dataset to S3 (no index column, no header row).
wr.s3.to_csv(
    df=training_dataset,
    path=f"{s3_training_file_location.rstrip('/')}/train.csv",
    index=False,
    header=False,
    boto3_session=boto3_session
)
# Upload the validation dataset to S3 (no index column, no header row).
wr.s3.to_csv(
    df=validation_dataset,
    path=f"{s3_validation_file_location.rstrip('/')}/valid.csv",
    index=False,
    header=False,
    boto3_session=boto3_session
)

### 싱글 모델 훈련

하이퍼파라미터 설정

In [20]:
# Fixed hyperparameters for the single training job (meanings follow the
# hyperparameter table in the tuning section of this notebook):
#   sequence-length : length of each sequence
#   sequence-stride : gap between consecutive sequences
#   hidden-size     : number of hidden units in the LSTM layers
#   lr              : learning rate
#   batch-size      : training batch size
#   epochs          : number of training epochs
hyperparameters = {
    "sequence-length": 100,
    "sequence-stride": 50,
    "hidden-size": 64,
    "lr": 0.001,
    "batch-size": 32,
    "epochs": 100,
}

Estimator 생성

In [23]:
from sagemaker.inputs import TrainingInput

In [24]:
# Estimator backed by the subscribed Marketplace algorithm.
# The algorithm ARN is copied from the configuration page of the subscribed algorithm.
# Optional managed spot training (disabled): use_spot_instances=True, max_run=3600, max_wait=3600
estimator = sagemaker.algorithm.AlgorithmEstimator(
    algorithm_arn='arn:aws:sagemaker:ap-northeast-2:745090734665:algorithm/lstm-ae-v1-4-ae1d05dbe655323caa35f1136622995b',
    base_job_name="lstm-ae-training",
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    input_mode="File",
    sagemaker_session=sagemaker_session,
    hyperparameters=hyperparameters,
)

# One CSV input channel per dataset prefix.
training_channels = {
    "training": TrainingInput(s3_data=s3_training_file_location, content_type='text/csv'),
    "validation": TrainingInput(s3_data=s3_validation_file_location, content_type='text/csv'),
}
estimator.fit(training_channels)

INFO:sagemaker:Creating training-job with name: lstm-ae-training-2024-09-06-18-36-31-276


2024-09-06 18:36:31 Starting - Starting the training job......
2024-09-06 18:37:31 Downloading - Downloading input data...
2024-09-06 18:37:46 Downloading - Downloading the training image.................................
2024-09-06 18:43:10 Training - Training image download completed. Training in progress.bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-09-06 18:43:23,896 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-09-06 18:43:23,897 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-09-06 18:43:23,898 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-09-06 18:43:23,909 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-09-06 18:43:23,910 sagemaker_pytorch_container.training INFO     Invoking user training script.
2024-09-06 18:43:25,272 sagemak

모델 배포 및 엔드포인트 생성

In [26]:
# Requests and responses are exchanged with the endpoint as CSV.
csv_serializer = sagemaker.serializers.CSVSerializer(content_type="text/csv")
csv_deserializer = sagemaker.deserializers.CSVDeserializer(accept="text/csv")

# Model and endpoint names are derived from a fixed base via name_from_base.
model_name = name_from_base("lstm-ae-model")
endpoint_name = name_from_base("lstm-ae-endpoint")

# Deploy the trained estimator behind a single-instance real-time endpoint.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    serializer=csv_serializer,
    deserializer=csv_deserializer,
    model_name=model_name,
    endpoint_name=endpoint_name,
)

INFO:sagemaker:Creating model package with name: lstm-ae-model-2024-09-06-18-48-00-944


.........

INFO:sagemaker:Creating model with name: lstm-ae-model-2024-09-06-18-48-00-944





INFO:sagemaker:Creating endpoint-config with name lstm-ae-endpoint-2024-09-06-18-48-00-944
INFO:sagemaker:Creating endpoint with name lstm-ae-endpoint-2024-09-06-18-48-00-944


-----------------!

테스트 데이터 페이로드 생성
- 추론 알고리즘은 시계열이 포함된 CSV 파일을 입력으로 받습니다. 
- CSV 파일의 각 열은 시계열을 나타내고 각 행은 시간 단계를 나타냅니다. 
- CSV 파일에는 인덱스 열이나 열 헤더가 포함되어서는 안 됩니다. 
- 모든 시계열의 길이는 같아야 하며 누락된 값을 포함하지 않아야 합니다.
- 이 알고리즘은 가변 길이 시퀀스를 지원하지 않으므로 입력 시계열의 길이는 시퀀스 길이의 배수여야 합니다.

In [28]:
# Read the test series straight from the public repository URL.
# header=None: the first line is treated as data and columns are numbered 0..N-1.
test_dataset = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/fg-research/lstm-ae-sagemaker/master/data/inference/input/test.csv",
    header=None,
)

In [29]:
# (rows, columns) of the test dataset.
test_dataset.shape

(5000, 2)

In [30]:
# Preview the first rows of the test dataset.
test_dataset.head()

Unnamed: 0,0,1
0,-11.013412,-11.392782
1,-12.690769,-13.362681
2,-12.638245,-10.937028
3,-14.549574,-13.608522
4,-14.831431,-14.712521


In [31]:
# Visualise the test dataset: one vertically stacked subplot per column.
n_test_series = test_dataset.shape[1]
fig = make_subplots(rows=n_test_series, cols=1, shared_xaxes=True, vertical_spacing=0.05)

for row in range(1, n_test_series + 1):
    # Subplot rows are 1-based while DataFrame columns are 0-based.
    series_trace = go.Scatter(
        x=test_dataset.index,
        y=test_dataset.iloc[:, row - 1],
        line=dict(color="#AFB8C1", width=1),
        name=f"시계열 {row}",
    )
    fig.add_trace(series_trace, row=row, col=1)
    fig.update_yaxes(title_text="값", row=row, col=1)
    fig.update_xaxes(title_text="시간", row=row, col=1)

fig.update_layout(
    height=200 * n_test_series,
    title_text="테스트 데이터셋",
    showlegend=False,
)

fig.show()

In [32]:
# Upload the test dataset to S3 (no index column, no header row).
# The object name is appended with plain string handling rather than
# os.path.join: S3 URIs always use "/", whereas os.path.join uses the local OS
# separator (a backslash on Windows). rstrip('/') makes the result independent
# of a trailing slash on the prefix.
wr.s3.to_csv(
    df=test_dataset,
    path=f"{s3_inference_file_location.rstrip('/')}/test.csv",
    index=False,
    header=False,
    boto3_session=boto3_session
)

{'paths': ['s3://dante-sagemaker/marketplace-lstm-ae/input/inference/test.csv'],
 'partitions_values': {}}

In [33]:
# Serialize the test DataFrame into the CSV request body sent to the endpoint.
# NOTE(review): assumes CSVSerializer emits one CSV line per DataFrame row with
# no index/header, as the algorithm requires — confirm for the SDK version in use.
payload = sagemaker.serializers.CSVSerializer().serialize(test_dataset)

실시간 추론 수행

In [34]:
# Invoke the deployed endpoint through the runtime client, sending the CSV payload.
response = sagemaker_session.sagemaker_runtime_client.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType="text/csv",
    Body=payload
)

# Parse the CSV response body into rows of string fields.
parsed_rows = sagemaker.deserializers.CSVDeserializer().deserialize(response["Body"], content_type="text/csv")

# Convert to numeric columns. Empty fields are mapped to NaN explicitly:
# `replace("", None)` is pandas-version dependent — older releases treat
# value=None as "no value given" and forward-fill the matches from the previous
# row instead of marking them missing, which would silently invent values.
# Column 0 is plotted below as the anomaly score and the remaining columns as
# the reconstructed series.
real_time_predictions = (
    pd.DataFrame(data=parsed_rows)
    .replace("", float("nan"))
    .astype(float)
)
real_time_predictions.head()

Unnamed: 0,0,1,2
0,0.213033,-10.082997,-10.008846
1,2.609459,-11.104548,-10.988354
2,0.374179,-12.088655,-11.942144
3,0.674115,-13.018567,-12.862983
4,0.016431,-13.878198,-13.74135


In [37]:
# Overlay actual vs. reconstructed values for every series, then add one final
# panel with the anomaly score (column 0 of the predictions).
n_series = test_dataset.shape[1]
n_rows = n_series + 1  # one extra row for the anomaly score panel

fig = make_subplots(rows=n_rows, cols=1, shared_xaxes=True, vertical_spacing=0.05)

for i in range(n_series):
    # Only the first series contributes legend entries, so "actual" and
    # "reconstructed" each appear once in the legend.
    fig.add_trace(
        go.Scatter(x=test_dataset.index, y=test_dataset.iloc[:, i],
                   mode='lines', line=dict(color="#AFB8C1", width=0.5),
                   name="실제" if i == 0 else None, showlegend=(i == 0)),
        row=i + 1, col=1
    )
    # Prediction column i + 1 holds the reconstruction of input column i.
    fig.add_trace(
        go.Scatter(x=real_time_predictions.index, y=real_time_predictions.iloc[:, i + 1],
                   mode='lines', line=dict(color="#009ad3", width=1),
                   name="재구성" if i == 0 else None, showlegend=(i == 0)),
        row=i + 1, col=1
    )
    fig.update_yaxes(title_text="값", row=i + 1, col=1)
    fig.update_xaxes(title_text="시간", row=i + 1, col=1)
    # The former update_annotations(font_size=10) call was dropped: the grid is
    # built without subplot titles, so there were no annotations to update.

fig.add_trace(
    go.Scatter(x=real_time_predictions.index, y=real_time_predictions.iloc[:, 0],
               mode='lines', line=dict(color="#57606a", width=1),
               name="이상 점수"),
    row=n_rows, col=1
)
fig.update_yaxes(title_text="값", row=n_rows, col=1)
fig.update_xaxes(title_text="시간", row=n_rows, col=1)

# Height must account for every row, including the anomaly score panel;
# sizing it for the series rows only squeezes all panels.
fig.update_layout(
    height=300 * n_rows,
    title_text="테스트 데이터셋에 대한 실시간 예측",
    showlegend=True
)

fig.show()

In [35]:
# Store the real-time predictions under the output prefix (no index column, no header row).
# The object name is appended with plain string handling rather than
# os.path.join: S3 URIs always use "/", whereas os.path.join uses the local OS
# separator (a backslash on Windows). rstrip('/') makes the result independent
# of a trailing slash on the prefix.
wr.s3.to_csv(
    df=real_time_predictions,
    path=f"{s3_output_location.rstrip('/')}/real_time_predictions.csv",
    index=False,
    header=False,
    boto3_session=boto3_session
)

{'paths': ['s3://dante-sagemaker/marketplace-lstm-ae/output/real_time_predictions.csv'],
 'partitions_values': {}}

모델 평가

```
충분한 레이블된 이상 데이터가 있다면, 실제 이상 레이블과 예측된 이상 레이블 사이의 F-베타 점수를 최대화하는 것으로 이상 점수에 대한 최적의 임계값을 추정할 수 있습니다. 
적절한 임계값이 결정되면, 모델은 이상 점수가 임계값보다 높으면 1(이상), 임계값보다 낮으면 0(정상)을 예측하는 이진 분류기로 사용될 수 있습니다. 
그 후에는 일반적인 방식으로 표준 분류 메트릭(정확도, 정밀도, 재현율 등)을 계산할 수 있습니다.
```

In [39]:
from sklearn.metrics import confusion_matrix, classification_report

# Threshold on the anomaly score (example value; it has to be tuned for the data at hand).
threshold = 0.5

# Binary predictions: 1 (anomaly) when the score in the first prediction
# column exceeds the threshold, 0 (normal) otherwise.
binary_predictions = (real_time_predictions.iloc[:, 0] > threshold).astype(int)

# Placeholder ground truth: random 0/1 labels standing in for real anomaly
# labels, which must replace them before the metrics mean anything.
# A seeded generator keeps the reported numbers identical across runs.
rng = np.random.default_rng(42)
true_labels = rng.integers(0, 2, size=len(binary_predictions))

# Confusion matrix and per-class precision / recall / F1.
conf_matrix = confusion_matrix(true_labels, binary_predictions)
class_report = classification_report(true_labels, binary_predictions)

print("혼동 행렬:")
print(conf_matrix)
print("\n분류 보고서:")
print(class_report)

혼동 행렬:
[[ 466 2057]
 [ 469 2008]]

분류 보고서:
              precision    recall  f1-score   support

           0       0.50      0.18      0.27      2523
           1       0.49      0.81      0.61      2477

    accuracy                           0.49      5000
   macro avg       0.50      0.50      0.44      5000
weighted avg       0.50      0.49      0.44      5000



In [40]:
# ROC curve and AUC of the raw anomaly score against the labels.
from sklearn.metrics import roc_curve, auc

anomaly_scores = real_time_predictions.iloc[:, 0]
fpr, tpr, _roc_thresholds = roc_curve(true_labels, anomaly_scores)
roc_auc = auc(fpr, tpr)

fig = make_subplots(rows=1, cols=1)

# Model curve, labelled with its AUC.
roc_trace = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f'ROC 곡선 (AUC = {roc_auc:.2f})',
    line=dict(color='darkorange', width=2),
)
fig.add_trace(roc_trace)

# Diagonal reference line from (0, 0) to (1, 1).
baseline_trace = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='기준선',
    line=dict(color='navy', width=2, dash='dash'),
)
fig.add_trace(baseline_trace)

# Fixed axis ranges and a legend anchored to the bottom-right corner.
roc_layout = dict(
    title='ROC 곡선',
    xaxis_title='거짓 양성 비율',
    yaxis_title='참 양성 비율',
    xaxis=dict(range=[0, 1]),
    yaxis=dict(range=[0, 1.05]),
    legend=dict(x=1, y=0, xanchor='right', yanchor='bottom'),
)
fig.update_layout(**roc_layout)

fig.show()

엔드포인트 삭제

In [41]:
# Delete the real-time endpoint together with its endpoint configuration.
# NOTE(review): the SageMaker model created by deploy() is not removed here —
# consider calling predictor.delete_model() before this line; confirm it is not
# needed elsewhere first.
predictor.delete_endpoint(delete_endpoint_config=True)

INFO:sagemaker:Deleting endpoint configuration with name: lstm-ae-endpoint-2024-09-06-18-48-00-944
INFO:sagemaker:Deleting endpoint with name: lstm-ae-endpoint-2024-09-06-18-48-00-944


### 하이퍼파라미터 튜닝

| 하이퍼파라미터 | 설명 | 타입 | 튜닝 가능 여부 |
|----------------|------|------|----------------|
| sequence-length | 시퀀스의 길이 | 정수 | 예 |
| sequence-stride | 연속된 시퀀스 간의 간격 | 정수 | 예 |
| hidden-size | LSTM 레이어의 은닉 유닛 수 | 정수 | 예 |
| lr | 학습에 사용되는 학습률 | 연속 | 예 |
| batch-size | 학습에 사용되는 배치 크기 | 정수 | 예 |
| epochs | 학습 에폭 수 | 정수 | 예 |

하이퍼파라미터 설정

In [42]:
# Search space for the tuning job. Only the hyperparameters listed here are
# varied:
#   hidden-size : integer in [32, 128]
#   lr          : continuous in [0.001, 0.01]
#   batch-size  : one of {32, 64}
#   epochs      : integer in [100, 200]
hyperparameter_ranges = {
    "hidden-size": sagemaker.parameter.IntegerParameter(32, 128),
    "lr": sagemaker.parameter.ContinuousParameter(0.001, 0.01),
    "batch-size": sagemaker.parameter.CategoricalParameter([32, 64]),
    "epochs": sagemaker.parameter.IntegerParameter(100, 200),
}

In [45]:
# Name of the objective metric the tuner optimises.
# NOTE(review): "valid_mae" is presumably the validation mean absolute error
# emitted by the algorithm — confirm against the algorithm's metric definitions.
objective_metric_name = "valid_mae"

In [46]:
# Optimisation direction for the objective metric: lower values are better.
objective_type = "Minimize"

하이퍼파라미터 튜닝 수행

In [47]:
# Tuner built on top of the existing estimator; it searches the declared
# ranges for the configured objective with 4 jobs in total, all run in parallel.
tuner = sagemaker.tuner.HyperparameterTuner(
    estimator=estimator,
    base_tuning_job_name="lstm-ae-tuning",
    objective_metric_name=objective_metric_name,
    objective_type=objective_type,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=4,
    max_parallel_jobs=4,
    random_seed=100,
)

# Same CSV channels as the single training job.
tuning_channels = {
    "training": TrainingInput(s3_data=s3_training_file_location, content_type='text/csv'),
    "validation": TrainingInput(s3_data=s3_validation_file_location, content_type='text/csv'),
}
tuner.fit(tuning_channels)

INFO:sagemaker:Creating hyperparameter tuning job with name: lstm-ae-tuning-240907-0422


........................................................................................................!


In [48]:
# Tuning results as a DataFrame, sorted ascending by final objective value so
# the best job comes first (the objective is minimised).
tuner.analytics().dataframe().sort_values(by="FinalObjectiveValue", ascending=True, ignore_index=True)

Unnamed: 0,batch-size,epochs,hidden-size,lr,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,32.0,184.0,79.0,0.002332,lstm-ae-tuning-240907-0422-001-a6205cfb,Completed,0.080649,2024-09-07 04:23:28+09:00,2024-09-07 04:31:31+09:00,483.0
1,32.0,189.0,35.0,0.007927,lstm-ae-tuning-240907-0422-002-cc19650e,Completed,0.094749,2024-09-07 04:23:37+09:00,2024-09-07 04:31:14+09:00,457.0
2,64.0,197.0,75.0,0.009906,lstm-ae-tuning-240907-0422-004-aee46ab6,Completed,0.104588,2024-09-07 04:23:42+09:00,2024-09-07 04:30:54+09:00,432.0
3,64.0,121.0,64.0,0.009069,lstm-ae-tuning-240907-0422-003-94bf5534,Completed,0.122665,2024-09-07 04:23:36+09:00,2024-09-07 04:30:18+09:00,402.0


In [49]:
# Fetch the description of the latest tuning job and display the tuned
# hyperparameters of its best training job.
tuning_job_result = sagemaker_session.sagemaker_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.name
)
print("Best hyperparameters:")
tuning_job_result["BestTrainingJob"]["TunedHyperParameters"]

Best hyperparameters:


{'batch-size': '32',
 'epochs': '184',
 'hidden-size': '79',
 'lr': '0.002332046610717271'}

In [50]:
# Final objective metric value of the best training job, read from the tuning
# job description stored in tuning_job_result.
print("Best score:")
tuning_job_result["BestTrainingJob"]["FinalHyperParameterTuningJobObjectiveMetric"]["Value"]

Best score:


0.08064883947372437