### 라이브러리 준비

In [25]:
# 경고 무시
import warnings
warnings.filterwarnings('ignore')
# 데이터 처리 및 분석
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', None)
import numpy as np

# 머신러닝
import tensorflow as tf

# AWS 관련
import sagemaker
from sagemaker.utils import name_from_base
import boto3
import awswrangler as wr

# 시각화
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 기타 유틸리티
import os
import io
import json
import kaggle
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
# NOTE(review): these lines are shell commands, not Python. Run them in a
# terminal, or prefix each with `!` if they must live in a notebook cell.
# `git rm --cached` drops the listed paths from the git index only; the files
# on disk are left in place. All paths belong to ch07_advanced_course1 --
# TODO confirm whether this cell is still needed in this notebook.
git rm --cached ch07_advanced_course1/dataset/anime/user_anime_test_predictions.csv
git rm --cached ch07_advanced_course1/dataset/anime/user_anime_train.csv
git rm --cached ch07_advanced_course1/dataset/anime/user_anime_train.svm
git rm --cached ch07_advanced_course1/fm-recommender.ipynb
git rm --cached ch07_advanced_course1/dataset/anime/rating.csv
git rm --cached ch07_advanced_course1/dataset/anime/user_anime_test.recordio

### SageMaker 세션 및 역할 설정

In [None]:
# Create the boto3 session from the named local AWS profile and reuse it for
# the SageMaker session so both talk to the same account/region.
boto3_session = boto3.Session(profile_name='awstutor')
sagemaker_session = sagemaker.Session(boto_session=boto3_session)

# The execution role ARN is read from the environment; it is None when the
# variable is not set.
role = os.getenv('SAGEMAKER_EXECUTION_ROLE_ARN')

### 데이터 로드 및 전처리

데이터 다운로드

In [4]:
# Authenticate the Kaggle client before requesting any files.
kaggle.api.authenticate()

# Download the dataset archive and unzip it into the local dataset folder.
dataset_name = 'dmi3kno/newcarsalesnorway'
kaggle.api.dataset_download_files(
    dataset_name,
    path='dataset/newcarsalesnorway',
    unzip=True,
)

Dataset URL: https://www.kaggle.com/datasets/dmi3kno/newcarsalesnorway


데이터 전처리

In [5]:
# Load the monthly per-make sales table and preview the first rows.
csv_path = 'dataset/newcarsalesnorway/norway_new_car_sales_by_make.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Year,Month,Make,Quantity,Pct
0,2007,1,Toyota,2884,22.7
1,2007,1,Volkswagen,2521,19.9
2,2007,1,Peugeot,1029,8.1
3,2007,1,Ford,870,6.9
4,2007,1,Volvo,693,5.5


In [6]:
# Combine 'Year' and 'Month' into a 'YM' timestamp pinned to the first day of
# the month, assembling the date from its components.
df['YM'] = pd.to_datetime({'year': df['Year'], 'month': df['Month'], 'day': 1})

# Drop the columns that are no longer needed ('Year', 'Month', 'Pct').
df = df.drop(columns=['Year', 'Month', 'Pct'])
df.head()

Unnamed: 0,Make,Quantity,YM
0,Toyota,2884,2007-01-01
1,Volkswagen,2521,2007-01-01
2,Peugeot,1029,2007-01-01
3,Ford,870,2007-01-01
4,Volvo,693,2007-01-01


In [7]:
# Nested mapping: month timestamp -> {make -> quantity sold that month}.
# The inner defaultdict(int) yields 0 for a make that is looked up but absent.
from collections import defaultdict

dates = defaultdict(lambda: defaultdict(int))

# Walk the three needed columns in lockstep instead of df.iterrows(): this
# avoids building a Series object per row and the unused row index, while
# storing exactly the same (YM, Make) -> Quantity entries. If a (YM, Make)
# pair appears more than once, the last occurrence wins, as before.
for ym, make, quantity in zip(df['YM'], df['Make'], df['Quantity']):
    dates[ym][make] = quantity

In [9]:
# Convert the nested dict to a DataFrame and transpose it.
# NOTE(review): judging by the preview output, rows are months and columns are
# makes after the transpose; makes missing in a month show up as NaN.
reshaped_df = pd.DataFrame(dates).T
reshaped_df.head()

Unnamed: 0,Toyota,Volkswagen,Peugeot,Ford,Volvo,Skoda,Opel,Audi,Honda,Mercedes-Benz,BMW,Citroen,Suzuki,Mazda,Saab,Renault,Mitsubishi,Nissan,Hyundai,Kia,Dodge,MINI,Subaru,Lexus,Chevrolet,Alfa Romeo,Daihatsu,Land Rover,Fiat,Chrysler,Iveco,Jaguar,Jeep,Nilsson,Think,Smart,Ssangyong,NaN,Chevrolet US,Porsche,Cadillac,Aston Martin,Morgan,Bentley,Lancia,Maserati,Isuzu,Ferrari,Koenigsegg,Lamborghini,Seat,Tesla,Tata,Dacia,Infiniti,Lotus,Secma,Martin Motors,Fisker,Westfield,Mia,McLaren,Binz,Tazzari,Polaris,DS
2007-01-01,2884.0,2521.0,1029.0,870.0,693.0,665.0,622.0,599.0,423.0,362.0,352.0,263.0,258.0,191.0,169.0,168.0,136.0,127.0,97.0,55.0,33.0,26.0,26.0,22.0,20.0,16.0,15.0,14.0,9.0,4.0,4.0,3.0,2.0,2.0,2.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2007-02-01,1885.0,1517.0,621.0,686.0,570.0,463.0,551.0,498.0,356.0,410.0,335.0,247.0,264.0,239.0,175.0,84.0,262.0,134.0,149.0,119.0,24.0,32.0,18.0,17.0,35.0,9.0,22.0,30.0,6.0,5.0,,5.0,6.0,6.0,,5.0,2.0,,3.0,2.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,
2007-03-01,1833.0,1428.0,867.0,766.0,656.0,491.0,578.0,682.0,399.0,387.0,365.0,239.0,333.0,319.0,243.0,131.0,512.0,327.0,228.0,171.0,25.0,28.0,7.0,18.0,52.0,21.0,27.0,40.0,20.0,3.0,5.0,9.0,33.0,1.0,,,,1.0,,14.0,4.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,
2007-04-01,1300.0,1257.0,547.0,451.0,587.0,438.0,534.0,556.0,351.0,387.0,360.0,179.0,347.0,249.0,191.0,92.0,104.0,328.0,142.0,92.0,25.0,31.0,77.0,18.0,79.0,20.0,17.0,42.0,7.0,11.0,5.0,5.0,10.0,,1.0,1.0,,,,7.0,2.0,,4.0,,,,,,,,,,,,,,,,,,,,,,,
2007-05-01,1866.0,1934.0,743.0,648.0,805.0,517.0,771.0,630.0,520.0,422.0,431.0,223.0,420.0,325.0,241.0,109.0,222.0,382.0,155.0,169.0,29.0,30.0,78.0,10.0,65.0,17.0,36.0,91.0,18.0,8.0,,5.0,72.0,,,2.0,3.0,,,4.0,,4.0,2.0,,,,,,,,,,,,,,,,,,,,,,,


In [10]:
# Keep six makes and cast their monthly quantities to integers.
selected_makes = ['Toyota', 'Volkswagen', 'Peugeot', 'Ford', 'Volvo', 'Skoda']
sub_df = reshaped_df.loc[:, selected_makes].astype(int)

# Tag the datetime index with month-start frequency; the predictor later
# reads ts.index.freq to compute the first forecast timestamp.
sub_df.index.freq = 'MS'
sub_df.head()

Unnamed: 0,Toyota,Volkswagen,Peugeot,Ford,Volvo,Skoda
2007-01-01,2884,2521,1029,870,693,665
2007-02-01,1885,1517,621,686,570,463
2007-03-01,1833,1428,867,766,656,491
2007-04-01,1300,1257,547,451,587,438
2007-05-01,1866,1934,743,648,805,517


데이터 시각적 확인

In [12]:
# One subplot per make on a 3x2 grid, titled with the column names.
fig = make_subplots(rows=3, cols=2, subplot_titles=sub_df.columns)

# Fill the grid row by row: position 0 -> (1, 1), 1 -> (1, 2), 2 -> (2, 1), ...
for position, column in enumerate(sub_df.columns):
    grid_row, grid_col = divmod(position, 2)
    history = go.Scatter(x=sub_df.index, y=sub_df[column], name=column)
    fig.add_trace(history, row=grid_row + 1, col=grid_col + 1)

# Figure size, title and shared axis labels.
fig.update_layout(height=900, width=1000, title_text="자동차 브랜드별 판매량 추이")
fig.update_xaxes(title_text="날짜")
fig.update_yaxes(title_text="판매량")

# Render the figure.
fig.show()


DeepAR 모델 입력 데이터 생성

In [13]:
# Replace the date index with plain positions (the series start date is passed
# to DeepAR separately), then transpose so each row holds one make's series.
# The final reset_index() exposes the make name as the 'index' column.
positional = sub_df.reset_index(drop=True)
df_input = positional.T.reset_index()
df_input.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120
0,Toyota,2884,1885,1833,1300,1866,1620,1901,1783,1303,1648,1579,1081,1758,1494,1338,1528,1259,1141,1315,1002,1250,1247,760,876,946,893,1021,993,1056,888,1353,1482,1697,1709,1450,1117,1894,1287,1474,1280,1213,1488,1694,1529,1461,1379,1511,668,1557,1270,1604,1253,1306,876,1325,1284,1313,1218,1298,932,1388,1465,1834,1364,1508,1264,1342,1547,1387,1640,1380,784,1287,1500,1301,1645,1635,1450,1627,1666,1548,1890,1454,906,1501,1352,1441,1229,1458,1306,1270,1233,1463,1451,1241,1055,1372,1436,1509,1274,1159,1409,1458,1461,1527,1443,1102,866,1106,1374,1537,1432,1687,1603,1127,1824,1823,1327,1375,1238,1526
1,Volkswagen,2521,1517,1428,1257,1934,1531,1777,1665,1373,1941,1473,1272,1224,1767,1503,1795,1402,1182,1203,1432,1229,1442,1306,1256,725,921,909,1207,1094,965,1356,1174,1358,1458,1529,1571,1524,1411,1585,1906,1591,1425,1916,1458,1575,1659,1824,1298,1665,1501,1908,1845,1822,1222,1590,1827,1674,1854,1960,2030,1528,1340,1701,2107,1712,1480,2031,1873,1445,1848,2064,1378,1680,1592,1440,2200,1351,1482,1556,1551,1512,1550,1735,1481,1360,1379,1722,1893,1716,1620,2197,2065,1707,2061,1920,2019,2057,1895,2274,2667,2076,2501,2415,2254,1768,2210,2346,1881,1743,2044,2236,3017,2222,2287,2076,2359,2084,2161,2106,2239,1688
2,Peugeot,1029,621,867,547,743,569,749,531,426,707,479,225,592,467,502,662,590,427,581,457,383,449,333,173,320,291,361,479,339,308,408,359,370,590,403,173,842,500,704,628,506,528,635,597,690,638,496,398,548,463,701,595,1018,571,635,557,729,535,622,453,493,527,484,391,655,516,512,657,490,669,539,276,639,448,390,444,491,371,361,563,402,463,436,251,661,608,490,531,474,515,432,642,568,601,506,527,373,401,525,541,490,472,477,531,439,475,400,418,437,428,405,483,404,400,314,461,354,415,427,270,571
3,Ford,870,686,766,451,648,772,1006,816,679,693,566,821,942,1016,713,854,984,898,1056,658,764,701,639,848,533,527,698,608,661,845,824,552,956,1045,1142,1413,849,602,989,567,844,1241,1013,1102,1190,887,1331,1226,840,997,1105,835,1067,1085,901,1248,1107,1147,1282,1377,828,862,1255,741,1019,1120,841,911,930,1017,741,742,679,800,666,950,1010,1084,825,897,733,864,733,756,601,524,700,803,626,787,645,604,597,691,562,681,593,666,829,634,747,670,642,866,648,706,724,737,752,754,717,715,751,854,510,625,697,534,690,727,617
4,Volvo,693,570,656,587,805,662,1064,498,662,1014,710,641,593,684,557,732,724,672,601,270,457,472,478,714,347,309,332,434,379,590,900,321,767,765,956,1217,652,842,936,868,728,882,1018,530,827,922,939,1314,656,932,1104,878,1084,853,1134,687,1178,1061,1193,1235,911,800,1127,928,1099,890,917,609,945,969,1090,1265,857,826,826,1060,938,692,901,581,1098,1147,1053,1884,654,723,915,816,859,852,930,614,901,1039,950,2072,321,438,875,729,942,1099,826,518,827,954,937,1512,643,732,687,748,619,766,635,463,763,732,754,1235,1158


In [14]:
# Encode each make name as an integer category code for the model's "cat"
# field; the codes are returned in the same row order as df_input.
make_names = df_input["index"]
ts_code = make_names.astype('category').cat.codes.to_numpy()
ts_code

array([3, 4, 1, 0, 5, 2], dtype=int8)

In [15]:
# Use the same number of steps (6) for the forecast horizon and the context.
prediction_length = 6
context_length = prediction_length

# Number of time steps: every column except the leading 'index' column.
total_length = df_input.shape[1] - 1

# Test set: the complete series (all value columns).
df_test = df_input.iloc[:, 1:]

# Training set: the same series with the last prediction_length steps held out.
train_stop = total_length - prediction_length + 1
df_train = df_input.iloc[:, 1:train_stop]

# Report the shapes of both sets.
print('df_train shape:', df_train.shape)
print('df_test shape:', df_test.shape)

df_train shape: (6, 115)
df_test shape: (6, 121)


In [16]:
# First timestamp of the history; only its string form is used as the "start"
# field of the DeepAR records.
# The former `start_train.freq = 'MS'` assignment was dropped: nothing reads
# it, and Timestamp.freq was deprecated in pandas 1.4 and removed in 2.0, so
# the assignment is at best a no-op and may fail depending on the version.
start_train = sub_df.index.min()

In [17]:
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114
0,2884,1885,1833,1300,1866,1620,1901,1783,1303,1648,1579,1081,1758,1494,1338,1528,1259,1141,1315,1002,1250,1247,760,876,946,893,1021,993,1056,888,1353,1482,1697,1709,1450,1117,1894,1287,1474,1280,1213,1488,1694,1529,1461,1379,1511,668,1557,1270,1604,1253,1306,876,1325,1284,1313,1218,1298,932,1388,1465,1834,1364,1508,1264,1342,1547,1387,1640,1380,784,1287,1500,1301,1645,1635,1450,1627,1666,1548,1890,1454,906,1501,1352,1441,1229,1458,1306,1270,1233,1463,1451,1241,1055,1372,1436,1509,1274,1159,1409,1458,1461,1527,1443,1102,866,1106,1374,1537,1432,1687,1603,1127
1,2521,1517,1428,1257,1934,1531,1777,1665,1373,1941,1473,1272,1224,1767,1503,1795,1402,1182,1203,1432,1229,1442,1306,1256,725,921,909,1207,1094,965,1356,1174,1358,1458,1529,1571,1524,1411,1585,1906,1591,1425,1916,1458,1575,1659,1824,1298,1665,1501,1908,1845,1822,1222,1590,1827,1674,1854,1960,2030,1528,1340,1701,2107,1712,1480,2031,1873,1445,1848,2064,1378,1680,1592,1440,2200,1351,1482,1556,1551,1512,1550,1735,1481,1360,1379,1722,1893,1716,1620,2197,2065,1707,2061,1920,2019,2057,1895,2274,2667,2076,2501,2415,2254,1768,2210,2346,1881,1743,2044,2236,3017,2222,2287,2076
2,1029,621,867,547,743,569,749,531,426,707,479,225,592,467,502,662,590,427,581,457,383,449,333,173,320,291,361,479,339,308,408,359,370,590,403,173,842,500,704,628,506,528,635,597,690,638,496,398,548,463,701,595,1018,571,635,557,729,535,622,453,493,527,484,391,655,516,512,657,490,669,539,276,639,448,390,444,491,371,361,563,402,463,436,251,661,608,490,531,474,515,432,642,568,601,506,527,373,401,525,541,490,472,477,531,439,475,400,418,437,428,405,483,404,400,314
3,870,686,766,451,648,772,1006,816,679,693,566,821,942,1016,713,854,984,898,1056,658,764,701,639,848,533,527,698,608,661,845,824,552,956,1045,1142,1413,849,602,989,567,844,1241,1013,1102,1190,887,1331,1226,840,997,1105,835,1067,1085,901,1248,1107,1147,1282,1377,828,862,1255,741,1019,1120,841,911,930,1017,741,742,679,800,666,950,1010,1084,825,897,733,864,733,756,601,524,700,803,626,787,645,604,597,691,562,681,593,666,829,634,747,670,642,866,648,706,724,737,752,754,717,715,751,854,510
4,693,570,656,587,805,662,1064,498,662,1014,710,641,593,684,557,732,724,672,601,270,457,472,478,714,347,309,332,434,379,590,900,321,767,765,956,1217,652,842,936,868,728,882,1018,530,827,922,939,1314,656,932,1104,878,1084,853,1134,687,1178,1061,1193,1235,911,800,1127,928,1099,890,917,609,945,969,1090,1265,857,826,826,1060,938,692,901,581,1098,1147,1053,1884,654,723,915,816,859,852,930,614,901,1039,950,2072,321,438,875,729,942,1099,826,518,827,954,937,1512,643,732,687,748,619,766,635
5,665,463,491,438,517,512,489,476,354,438,447,578,360,426,455,537,466,370,322,327,306,283,252,317,192,292,337,406,425,280,313,302,342,463,408,446,442,554,566,612,500,534,622,507,559,603,666,590,516,601,546,653,585,623,646,648,691,770,593,748,430,543,531,652,469,573,781,643,463,611,517,580,388,481,484,625,464,434,502,543,679,773,789,786,521,566,543,672,692,577,632,653,554,790,585,666,473,539,679,725,585,645,627,666,548,717,642,644,528,753,662,785,722,721,602


In [20]:
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120
0,2884,1885,1833,1300,1866,1620,1901,1783,1303,1648,1579,1081,1758,1494,1338,1528,1259,1141,1315,1002,1250,1247,760,876,946,893,1021,993,1056,888,1353,1482,1697,1709,1450,1117,1894,1287,1474,1280,1213,1488,1694,1529,1461,1379,1511,668,1557,1270,1604,1253,1306,876,1325,1284,1313,1218,1298,932,1388,1465,1834,1364,1508,1264,1342,1547,1387,1640,1380,784,1287,1500,1301,1645,1635,1450,1627,1666,1548,1890,1454,906,1501,1352,1441,1229,1458,1306,1270,1233,1463,1451,1241,1055,1372,1436,1509,1274,1159,1409,1458,1461,1527,1443,1102,866,1106,1374,1537,1432,1687,1603,1127,1824,1823,1327,1375,1238,1526
1,2521,1517,1428,1257,1934,1531,1777,1665,1373,1941,1473,1272,1224,1767,1503,1795,1402,1182,1203,1432,1229,1442,1306,1256,725,921,909,1207,1094,965,1356,1174,1358,1458,1529,1571,1524,1411,1585,1906,1591,1425,1916,1458,1575,1659,1824,1298,1665,1501,1908,1845,1822,1222,1590,1827,1674,1854,1960,2030,1528,1340,1701,2107,1712,1480,2031,1873,1445,1848,2064,1378,1680,1592,1440,2200,1351,1482,1556,1551,1512,1550,1735,1481,1360,1379,1722,1893,1716,1620,2197,2065,1707,2061,1920,2019,2057,1895,2274,2667,2076,2501,2415,2254,1768,2210,2346,1881,1743,2044,2236,3017,2222,2287,2076,2359,2084,2161,2106,2239,1688
2,1029,621,867,547,743,569,749,531,426,707,479,225,592,467,502,662,590,427,581,457,383,449,333,173,320,291,361,479,339,308,408,359,370,590,403,173,842,500,704,628,506,528,635,597,690,638,496,398,548,463,701,595,1018,571,635,557,729,535,622,453,493,527,484,391,655,516,512,657,490,669,539,276,639,448,390,444,491,371,361,563,402,463,436,251,661,608,490,531,474,515,432,642,568,601,506,527,373,401,525,541,490,472,477,531,439,475,400,418,437,428,405,483,404,400,314,461,354,415,427,270,571
3,870,686,766,451,648,772,1006,816,679,693,566,821,942,1016,713,854,984,898,1056,658,764,701,639,848,533,527,698,608,661,845,824,552,956,1045,1142,1413,849,602,989,567,844,1241,1013,1102,1190,887,1331,1226,840,997,1105,835,1067,1085,901,1248,1107,1147,1282,1377,828,862,1255,741,1019,1120,841,911,930,1017,741,742,679,800,666,950,1010,1084,825,897,733,864,733,756,601,524,700,803,626,787,645,604,597,691,562,681,593,666,829,634,747,670,642,866,648,706,724,737,752,754,717,715,751,854,510,625,697,534,690,727,617
4,693,570,656,587,805,662,1064,498,662,1014,710,641,593,684,557,732,724,672,601,270,457,472,478,714,347,309,332,434,379,590,900,321,767,765,956,1217,652,842,936,868,728,882,1018,530,827,922,939,1314,656,932,1104,878,1084,853,1134,687,1178,1061,1193,1235,911,800,1127,928,1099,890,917,609,945,969,1090,1265,857,826,826,1060,938,692,901,581,1098,1147,1053,1884,654,723,915,816,859,852,930,614,901,1039,950,2072,321,438,875,729,942,1099,826,518,827,954,937,1512,643,732,687,748,619,766,635,463,763,732,754,1235,1158
5,665,463,491,438,517,512,489,476,354,438,447,578,360,426,455,537,466,370,322,327,306,283,252,317,192,292,337,406,425,280,313,302,342,463,408,446,442,554,566,612,500,534,622,507,559,603,666,590,516,601,546,653,585,623,646,648,691,770,593,748,430,543,531,652,469,573,781,643,463,611,517,580,388,481,484,625,464,434,502,543,679,773,789,786,521,566,543,672,692,577,632,653,554,790,585,666,473,539,679,725,585,645,627,666,548,717,642,644,528,753,662,785,722,721,602,529,653,628,665,526,681


In [19]:
# Build the DeepAR training records, one dict per series.
training_data = []
for row_label, cat_code in zip(df_train.index, ts_code):
    record = {
        "start": str(start_train),                      # series start timestamp
        "target": df_train.loc[row_label, :].tolist(),  # observed values
        "cat": int(cat_code),                           # make encoded as a plain int
    }
    training_data.append(record)

# Report how many training series were produced.
print(f"생성된 훈련 데이터 시계열 수: {len(training_data)}")

생성된 훈련 데이터 시계열 수: 6


In [21]:
# Build the DeepAR test records, one dict per series, covering the full
# history (training window plus the held-out prediction window).
test_data = [
    {
        "start": str(start_train),               # series start timestamp
        "target": df_test.loc[ts, :].tolist(),   # observed values
        "cat": int(code),                        # make encoded as a plain int
    }
    # Iterate the ROW labels. Iterating the DataFrame itself yields column
    # labels, which only worked before because the integer column labels
    # 0..5 happened to coincide with the row labels.
    for ts, code in zip(df_test.index, ts_code)
]
print(len(test_data))

6


S3 데이터 경로 생성 및 데이터 업로드

In [35]:
# Bucket/project layout on S3.
bucket_name = 'dante-sagemaker'
project_name = 'newcarsalesnorway'
training_data_folder = 'inputs/train'
test_data_folder = 'inputs/test'

# Every URI hangs off the same project root.
project_root_s3_uri = "s3://" + bucket_name + "/" + project_name

# Channel folders and the JSON Lines files stored inside them.
train_folder_s3_uri = project_root_s3_uri + "/" + training_data_folder
test_folder_s3_uri = project_root_s3_uri + "/" + test_data_folder
train_data_s3_uri = train_folder_s3_uri + "/training_data.json"
test_data_s3_uri = test_folder_s3_uri + "/test_data.json"

# Locations for training checkpoints and model artifacts.
checkpoint_s3_uri = project_root_s3_uri + "/checkpoints"
s3_model_output_path = project_root_s3_uri + "/output"

In [32]:
# Serialize each record as one UTF-8 JSON document per line (JSON Lines) into
# in-memory buffers; a freshly created BytesIO is already positioned at 0.
buf_train = io.BytesIO(
    b"".join(json.dumps(record).encode("utf-8") + b"\n" for record in training_data)
)
buf_test = io.BytesIO(
    b"".join(json.dumps(record).encode("utf-8") + b"\n" for record in test_data)
)

# Upload both buffers to their S3 locations using the shared boto3 session.
wr.s3.upload(local_file=buf_train, path=train_data_s3_uri, boto3_session=boto3_session)
wr.s3.upload(local_file=buf_test, path=test_data_s3_uri, boto3_session=boto3_session)

### 모델 훈련

모델 컨테이너 준비

In [33]:
# Look up the DeepAR algorithm image URI for the session's region.
container = sagemaker.image_uris.retrieve(
    framework="forecasting-deepar",
    region=sagemaker_session.boto_region_name,
)
container

'204372634319.dkr.ecr.ap-northeast-2.amazonaws.com/forecasting-deepar:1'

In [36]:
# Un-timestamped base name for training jobs. The SDK appends its own
# timestamp to base_job_name when a job is created, so passing an
# already-timestamped name produced a truncated, double-stamped job name.
base_job_name = f"{project_name}-train"

# Timestamped name kept for any cell that still refers to `job_name`; it is
# intentionally NOT passed to the Estimator.
job_name = name_from_base(base_job_name)

# Configure the training job.
estimator = sagemaker.estimator.Estimator(
    image_uri=container,                  # algorithm container image
    role=role,                            # IAM execution role
    instance_count=1,                     # number of training instances
    instance_type='ml.c5.2xlarge',        # training instance type
    output_path=s3_model_output_path,     # where model artifacts are written
    sagemaker_session=sagemaker_session,  # session bound to the chosen profile
    base_job_name=base_job_name,          # prefix; the SDK adds the timestamp
    use_spot_instances=True,              # train on Spot capacity
    max_run=60 * 60,                      # maximum training time in seconds
    max_wait=60 * 60,                     # maximum total wait (incl. Spot) in seconds
    checkpoint_s3_uri=checkpoint_s3_uri,  # checkpoint location
)

하이퍼파라미터 설정

In [37]:
# Reference: https://docs.aws.amazon.com/sagemaker/latest/dg/deepar_hyperparameters.html
# DeepAR hyperparameters; values are passed as strings.
hyperparameters = dict(
    time_freq='M',                             # monthly series
    epochs="400",                              # number of training epochs
    early_stopping_patience="10",              # early-stopping patience
    mini_batch_size="64",                      # mini-batch size
    learning_rate="5E-4",                      # learning rate
    context_length=str(context_length),        # context window length
    prediction_length=str(prediction_length),  # forecast horizon
    cardinality="auto",                        # category cardinality setting
)

# Apply the hyperparameters to the estimator.
estimator.set_hyperparameters(**hyperparameters)

모델 훈련

In [38]:
%%time
data_channels = {"train": train_folder_s3_uri, "test": test_folder_s3_uri}
estimator.fit(inputs=data_channels, wait=True)

INFO:sagemaker:Creating training-job with name: newcarsalesnorway-train-2024-08-22-07-5-2024-08-22-07-56-07-889


2024-08-22 07:56:08 Starting - Starting the training job...
2024-08-22 07:56:25 Starting - Preparing the instances for training...
2024-08-22 07:57:03 Downloading - Downloading the training image.............................................
2024-08-22 08:04:23 Training - Training image download completed. Training in progress..Docker entrypoint called with argument(s): train
Running default environment configuration script
Running custom environment configuration script
  if num_device is 1 and 'dist' not in kvstore:
[08/22/2024 08:04:40 INFO 140411621779264] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'_kvstore': 'auto', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_tuning_objective_metric': '', 'cardinality': 'auto', 'dropout_rate': '0.10', 'early_stopping_patience': '', 'embedding_dimension': '10', 'learning_rate': '0.001', 'likelihood': 'student-t', 'mini_batch_size': '128', 'num_cells': '40', 'num_dynamic_

### 시계열 모델 예측

예측기 객체 생성

In [39]:
# IdentitySerializer는 SageMaker에서 예측 요청을 직렬화하는 데 사용됩니다.
# 이 클래스는 입력 데이터를 그대로 유지하며, 지정된 content_type으로 전송합니다.
from sagemaker.serializers import IdentitySerializer

In [40]:
# NpEncoder 클래스: NumPy 데이터 타입을 JSON으로 인코딩하기 위한 사용자 정의 인코더
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of *obj*.

        NumPy arrays become (nested) lists, NumPy integers become ``int`` and
        NumPy floats become ``float``. Anything else is delegated to the base
        encoder, which raises ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        for numpy_kind, to_builtin in ((np.integer, int), (np.floating, float)):
            if isinstance(obj, numpy_kind):
                return to_builtin(obj)
        return super().default(obj)

# DeepARPredictor 클래스: SageMaker DeepAR 모델을 위한 사용자 정의 예측기
# 참고 : https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/deepar_electricity/DeepAR-Electricity.html
class DeepARPredictor(sagemaker.predictor.Predictor):
    """Predictor for a DeepAR endpoint that takes and returns pandas objects.

    Requests are encoded to JSON bytes by this class and sent unchanged via
    ``IdentitySerializer``; responses are decoded into a ``DataFrame``.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``Predictor`` with a JSON identity serializer."""
        super().__init__(
            *args,
            serializer=IdentitySerializer(content_type="application/json"),
            **kwargs,
        )

    def predict(
        self,
        ts,
        cat=None,
        dynamic_feat=None,
        num_samples=100,
        return_samples=False,
        quantiles=("0.1", "0.5", "0.9"),
    ):
        """Request a forecast that continues the series *ts*.

        Args:
            ts: pandas.Series to forecast from; its index must carry a ``freq``.
            cat: category of the series (optional).
            dynamic_feat: dynamic features (optional); empty means "none".
            num_samples: number of samples requested from the endpoint.
            return_samples: whether to include the raw samples in the result.
            quantiles: quantiles to compute; each entry is converted with ``str``.

        Returns:
            DataFrame with one column per quantile (plus ``sample_<i>`` columns
            when *return_samples* is true), indexed by the forecast timestamps.

        Raises:
            TypeError: if ``ts.index.freq`` is None.
        """
        freq = ts.index.freq
        if freq is None:
            # Same exception type as the failing `Timestamp + None` addition
            # this guard replaces, but with an actionable message.
            raise TypeError(
                "ts.index.freq is None; set a frequency on the index "
                "(e.g. ts.index.freq = 'MS') before calling predict()"
            )
        # The forecast starts one period after the last observation.
        prediction_time = ts.index[-1] + freq
        quantiles = [str(q) for q in quantiles]
        req = self.__encode_request(ts, cat, dynamic_feat, num_samples, return_samples, quantiles)
        res = super().predict(req)
        return self.__decode_response(res, freq, prediction_time, return_samples)

    def __encode_request(self, ts, cat, dynamic_feat, num_samples, return_samples, quantiles):
        """Encode one series and the inference configuration as UTF-8 JSON bytes."""
        # An empty dynamic_feat is treated as "not provided". len() is used
        # rather than truthiness, which is ambiguous for NumPy arrays.
        if dynamic_feat is not None and len(dynamic_feat) == 0:
            dynamic_feat = None
        instance = series_to_dict(ts, cat, dynamic_feat)

        configuration = {
            "num_samples": num_samples,
            "output_types": ["quantiles", "samples"] if return_samples else ["quantiles"],
            "quantiles": quantiles,
        }

        http_request_data = {"instances": [instance], "configuration": configuration}

        # NpEncoder converts NumPy scalars/arrays (e.g. an int8 category code).
        return json.dumps(http_request_data, cls=NpEncoder).encode("utf-8")

    def __decode_response(self, response, freq, prediction_time, return_samples):
        """Decode the first prediction of the response into a DataFrame."""
        predictions = json.loads(response.decode("utf-8"))["predictions"][0]
        # The horizon is taken from the length of any returned quantile series.
        prediction_length = len(next(iter(predictions["quantiles"].values())))
        prediction_index = pd.date_range(
            start=prediction_time, freq=freq, periods=prediction_length
        )
        if return_samples:
            dict_of_samples = {f"sample_{i}": s for i, s in enumerate(predictions["samples"])}
        else:
            dict_of_samples = {}
        return pd.DataFrame(
            data={**predictions["quantiles"], **dict_of_samples}, index=prediction_index
        )

    def set_frequency(self, freq):
        """Store *freq* on the instance as ``self.freq``."""
        self.freq = freq

# 타겟 인코딩 함수: NaN 값을 문자열로 변환
def encode_target(ts):
    """Return the values of *ts* as a list, with non-finite entries as "NaN".

    Finite values are kept as they are; NaN and infinities are replaced by
    the string "NaN".
    """
    encoded = []
    for value in ts:
        encoded.append(value if np.isfinite(value) else "NaN")
    return encoded

# 시계열을 딕셔너리로 변환하는 함수
def series_to_dict(ts, cat=None, dynamic_feat=None):
    """Convert a series into a request/record dictionary.

    Args:
        ts: target series (pandas.Series); its first index entry becomes "start".
        cat: category of the series (optional).
        dynamic_feat: dynamic features (optional).

    Returns:
        Dict with "start" and "target", plus "cat" and "dynamic_feat" for
        each of those arguments that is not None.
    """
    payload = {"start": str(ts.index[0]), "target": encode_target(ts)}
    optional_fields = (("cat", cat), ("dynamic_feat", dynamic_feat))
    payload.update({key: value for key, value in optional_fields if value is not None})
    return payload

엔드포인트 생성

In [41]:
# Deploy the trained model to a single-instance endpoint and wrap it in the
# custom DeepARPredictor.
predictor = estimator.deploy(
    predictor_cls=DeepARPredictor,
    instance_type="ml.m5.large",
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: newcarsalesnorway-train-2024-08-22-07-5-2024-08-22-08-12-08-760
INFO:sagemaker:Creating endpoint-config with name newcarsalesnorway-train-2024-08-22-07-5-2024-08-22-08-12-08-760
INFO:sagemaker:Creating endpoint with name newcarsalesnorway-train-2024-08-22-07-5-2024-08-22-08-12-08-760


----------------------!

폭스바겐 예측

In [46]:
sub_df.columns[1]

'Volkswagen'

In [47]:
# Forecast the Volkswagen series (column 1 of sub_df) with its matching
# category code, requesting the 10%, 50% and 90% quantiles.
# head() shows only the first five forecast rows.
volkswagen_series = sub_df.iloc[:, 1]
volkswagen_code = ts_code[1]
predictor.predict(ts=volkswagen_series, cat=volkswagen_code, quantiles=[0.10, 0.5, 0.90]).head()

Unnamed: 0,0.1,0.5,0.9
2017-02-01,1549.881348,2037.647461,2438.171387
2017-03-01,1692.017334,2113.808594,2452.54248
2017-04-01,1962.594482,2314.830322,2705.594727
2017-05-01,1653.662231,2070.904053,2588.794922
2017-06-01,1671.287354,2190.785645,2544.798096


푸조 예측

In [48]:
sub_df.columns[2]

'Peugeot'

In [49]:
# Forecast the Peugeot series (column 2 of sub_df) with its matching category
# code; head() shows only the first five forecast rows.
peugeot_series = sub_df.iloc[:, 2]
peugeot_code = ts_code[2]
predictor.predict(ts=peugeot_series, cat=peugeot_code, quantiles=[0.10, 0.5, 0.90]).head()

Unnamed: 0,0.1,0.5,0.9
2017-02-01,321.458191,430.330811,519.674866
2017-03-01,310.174683,409.61261,487.125732
2017-04-01,369.614319,449.518127,543.758789
2017-05-01,326.409454,421.086609,536.773193
2017-06-01,321.169922,428.875122,505.603516


6개 브랜드 예측

In [52]:
# One subplot per make: observed history plus the 10/50/90% forecast quantiles.
fig = make_subplots(rows=3, cols=2, subplot_titles=sub_df.columns)

for i, code in enumerate(ts_code):
    request = sub_df.iloc[:, i]
    response = predictor.predict(ts=request, cat=code, quantiles=[0.10, 0.5, 0.90])

    # Fill the 3x2 grid row by row.
    grid_row, grid_col = divmod(i, 2)
    cell = dict(row=grid_row + 1, col=grid_col + 1)

    # Only the first subplot contributes legend entries.
    showlegend = i == 0

    # Order matters: the 0.9 trace uses fill='tonexty', which fills down to
    # the trace added just before it (0.1).
    traces = [
        go.Scatter(x=request.index, y=request.values, name="기록", line=dict(color='blue'), showlegend=showlegend),
        go.Scatter(x=response.index, y=response['0.5'], name="예측(0.5)", line=dict(color='red'), showlegend=showlegend),
        go.Scatter(x=response.index, y=response['0.1'], name="예측(0.1)", line=dict(color='rgba(255,0,0,0.2)', dash='dash'), showlegend=showlegend),
        go.Scatter(x=response.index, y=response['0.9'], name="예측(0.9)", line=dict(color='rgba(255,0,0,0.2)', dash='dash'), fill='tonexty', showlegend=showlegend),
    ]
    for trace in traces:
        fig.add_trace(trace, **cell)

    fig.update_xaxes(title_text="시간", **cell)
    fig.update_yaxes(title_text="값", **cell)

fig.update_layout(height=900, width=1000, title_text="시계열 예측")
fig.show()

엔드포인트 삭제

In [53]:
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: newcarsalesnorway-train-2024-08-22-07-5-2024-08-22-08-12-08-760
INFO:sagemaker:Deleting endpoint with name: newcarsalesnorway-train-2024-08-22-07-5-2024-08-22-08-12-08-760
