# Import Target Time Series Data

In [154]:
import boto3
from time import sleep
import subprocess
import pandas as pd
import json
import time
import pprint
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates

In [155]:
# Restore every variable previously persisted with %store.
# NOTE(review): target_time_series_filename and target_time_series_path, used further down,
# are not defined in this notebook — presumably they are restored here; verify against
# the notebook that stores them.
%store -r

In [156]:
# Dataset configuration shared by the Forecast calls below.
DATASET_FREQUENCY = "W"
# NOTE(review): the Amazon Forecast documentation spells this pattern
# "yyyy-MM-dd HH:mm:ss" — confirm that the lowercase "hh" is intended.
TIMESTAMP_FORMAT = "yyyy-MM-dd hh:mm:ss"

# Five random digits appended to resource names so repeated runs do not collide.
# Formatting an integer guarantees exactly five digit characters; slicing the
# string form of a random float could yield fewer characters or pieces of
# scientific notation such as "e-05".
suffix = "{:05d}".format(np.random.randint(0, 100000))


# Enter the project name you want to use
project = 'WalmartKaggleGonsoo'
datasetName = project + 'DS' + suffix
datasetGroupName = project + 'DSG' + suffix

In [157]:
# Load the notebook's resource metadata file, then take the region from the
# fourth colon-separated field of its ResourceArn.
with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:
    data = json.load(notebook_info)

resource_arn = data['ResourceArn']
region = resource_arn.split(':')[3]
print(region)

us-east-2


In [158]:
# One boto3 session pinned to the detected region; both Forecast clients
# (control plane and query) are created from it.
session = boto3.Session(region_name=region)
forecast = session.client('forecast')
forecast_query = session.client('forecastquery')

## Create DatasetGroup

In [159]:
# Create a dataset group in the CUSTOM domain and keep its ARN for later calls.
create_dataset_group_response = forecast.create_dataset_group(
    DatasetGroupName=datasetGroupName,
    Domain="CUSTOM",
)
datasetGroupArn = create_dataset_group_response['DatasetGroupArn']

In [160]:
# Display the description of the dataset group that was just created
# (the cell output shows its name, ARN, attached datasets and status).
forecast.describe_dataset_group(DatasetGroupArn=datasetGroupArn)

{'DatasetGroupName': 'WalmartKaggleGonsooDSG89952',
 'DatasetGroupArn': 'arn:aws:forecast:us-east-2:057716757052:dataset-group/WalmartKaggleGonsooDSG89952',
 'DatasetArns': [],
 'Domain': 'CUSTOM',
 'Status': 'ACTIVE',
 'CreationTime': datetime.datetime(2020, 3, 22, 2, 42, 36, 374000, tzinfo=tzlocal()),
 'LastModificationTime': datetime.datetime(2020, 3, 22, 2, 42, 36, 374000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': '7e55c41d-90f0-4ffe-922f-f4daf3ec8298',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Sun, 22 Mar 2020 02:42:37 GMT',
   'x-amzn-requestid': '7e55c41d-90f0-4ffe-922f-f4daf3ec8298',
   'content-length': '277',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

## Create schema

In [161]:
# Dataset schema passed to create_dataset. The attribute order must match the
# column order of the raw data files.
schema = {
    "Attributes": [
        {"AttributeName": attribute_name, "AttributeType": attribute_type}
        for attribute_name, attribute_type in (
            ("timestamp", "timestamp"),
            ("target_value", "float"),
            ("item_id", "string"),
        )
    ]
}

## Create Target Time Series Dataset

In [162]:
# Create the TARGET_TIME_SERIES dataset using the frequency and schema defined above.
response = forecast.create_dataset(
    DatasetName=datasetName,
    Domain="CUSTOM",
    DatasetType='TARGET_TIME_SERIES',
    DataFrequency=DATASET_FREQUENCY,
    Schema=schema,
)

In [163]:
# Keep the ARN of the dataset created in the previous cell; it is needed to attach
# the dataset to the group and to start the import job.
target_datasetArn = response['DatasetArn']
# Display the dataset description as the cell output.
forecast.describe_dataset(DatasetArn=target_datasetArn)

{'DatasetArn': 'arn:aws:forecast:us-east-2:057716757052:dataset/WalmartKaggleGonsooDS89952',
 'DatasetName': 'WalmartKaggleGonsooDS89952',
 'Domain': 'CUSTOM',
 'DatasetType': 'TARGET_TIME_SERIES',
 'DataFrequency': 'W',
 'Schema': {'Attributes': [{'AttributeName': 'timestamp',
    'AttributeType': 'timestamp'},
   {'AttributeName': 'target_value', 'AttributeType': 'float'},
   {'AttributeName': 'item_id', 'AttributeType': 'string'}]},
 'EncryptionConfig': {},
 'Status': 'ACTIVE',
 'CreationTime': datetime.datetime(2020, 3, 22, 2, 42, 41, 112000, tzinfo=tzlocal()),
 'LastModificationTime': datetime.datetime(2020, 3, 22, 2, 42, 41, 112000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': '8e70009c-4fbd-4bca-8fbb-ba26ecf2c208',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Sun, 22 Mar 2020 02:42:42 GMT',
   'x-amzn-requestid': '8e70009c-4fbd-4bca-8fbb-ba26ecf2c208',
   'content-length': '513',
   'connection': 'keep-alive'},
  '

## Attach the target time series dataset to the DatasetGroup

In [164]:
# Attach the target time series dataset to the dataset group.
# DatasetArns is passed as a one-element list containing only the target dataset.
forecast.update_dataset_group(DatasetGroupArn=datasetGroupArn, DatasetArns=[target_datasetArn])

{'ResponseMetadata': {'RequestId': 'c205e880-11b9-4014-9899-81bd745193db',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Sun, 22 Mar 2020 02:42:45 GMT',
   'x-amzn-requestid': 'c205e880-11b9-4014-9899-81bd745193db',
   'content-length': '2',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

## Create role
Please uncomment the role-creation cell below if you need to create a new role.

In [174]:
# Use an existing IAM role; its ARN is passed to the dataset import job as RoleArn.
# NOTE(review): the ARN hardcodes an AWS account id and role name — replace it with
# your own role, or run the role-creation cell below instead.
role_arn = "arn:aws:iam::057716757052:role/WalmartForecast"

In [165]:
# Optional: create a new IAM role for Amazon Forecast instead of reusing an existing one.
# The whole cell is commented out; uncomment it only if you need a new role.
# iam = boto3.client("iam")

# # Enter the role name you want to use
# # role_name = "WalmartForecast" + suffix
# role_name = "WalmartForecast" 
# assume_role_policy_document = {
#     "Version": "2012-10-17",
#     "Statement": [
#         {
#           "Effect": "Allow",
#           "Principal": {
#             "Service": "forecast.amazonaws.com"
#           },
#           "Action": "sts:AssumeRole"
#         }
#     ]
# }

# create_role_response = iam.create_role(
#     RoleName = role_name,
#     AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
# )

# # NOTE(review): the original note here described AmazonPersonalizeFullAccess (access to S3
# # buckets whose name includes "personalize" or "Personalize"), but the policy attached below
# # is AmazonForecastFullAccess — presumably the note was copied from a Personalize sample;
# # verify what S3 access this policy actually grants.
# # If you would like to use a bucket the policy does not cover, consider creating and attaching
# # a new policy that provides read access to your bucket, or attaching the
# # AmazonS3ReadOnlyAccess policy to the role.
# policy_arn = "arn:aws:iam::aws:policy/AmazonForecastFullAccess"
# iam.attach_role_policy(
#     RoleName = role_name,
#     PolicyArn = policy_arn
# )

# # Now add S3 support
# iam.attach_role_policy(
#     PolicyArn='arn:aws:iam::aws:policy/AmazonS3FullAccess',
#     RoleName=role_name
# )
# time.sleep(60) # wait for a minute to allow IAM role policy attachment to propagate

# role_arn = create_role_response["Role"]["Arn"]
# print(role_arn)

## Create a bucket
Uncomment the following code if needed

In [175]:
# Use an existing S3 bucket; the target time series file is uploaded to it and the
# import job reads the file from there.
# NOTE(review): the bucket name is hardcoded — change it to a bucket you own, or run
# the bucket-creation cell below instead.
bucket_name = "walmart-forecast" 

In [167]:
# Optional: create the S3 bucket instead of reusing an existing one.
# The whole cell is commented out; uncomment it only if you need a new bucket.
# print(region)
# s3 = boto3.client('s3')
# account_id = boto3.client('sts').get_caller_identity().get('Account')
# # Enter the bucket name you want to use
# # bucket_name = account_id + "forecastpoc-gsmoon"
# # bucket_name = "walmart-forecast" + suffix
# bucket_name = "walmart-forecast" 
# print(bucket_name)
# # A LocationConstraint is passed for every region except us-east-1.
# if region != "us-east-1":
#     s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': region})
# else:
#     s3.create_bucket(Bucket=bucket_name)

## Create dataset_import_job used to download dataset from S3

In [169]:
# Upload the target time series file to the bucket, using the file name as the object key.
(
    boto3.Session()
    .resource('s3')
    .Bucket(bucket_name)
    .Object(target_time_series_filename)
    .upload_file(target_time_series_path)
)

# S3 URI of the uploaded object, handed to the dataset import job below.
target_s3DataPath = "s3://{}/{}".format(bucket_name, target_time_series_filename)

In [170]:
# Start the import job that loads the uploaded S3 file into the target dataset.
# The role ARN is passed so the service can read the file from S3.
datasetImportJobName = 'DSIMPORT_JOB_TARGET_WALMART' + suffix
ds_import_job_response = forecast.create_dataset_import_job(
    DatasetImportJobName=datasetImportJobName,
    DatasetArn=target_datasetArn,
    DataSource={
        "S3Config": {
            "Path": target_s3DataPath,
            "RoleArn": role_arn,
        }
    },
    TimestampFormat=TIMESTAMP_FORMAT,
)

In [171]:
# Keep the ARN of the import job; the polling cell below uses it to check the job
# status, and it is persisted with %store at the end of the notebook.
ds_import_job_arn=ds_import_job_response['DatasetImportJobArn']
print(ds_import_job_arn)

arn:aws:forecast:us-east-2:057716757052:dataset-import-job/WalmartKaggleGonsooDS89952/DSIMPORT_JOB_TARGET_WALMART89952


In [172]:
%%time

while True:
    dataImportStatus = forecast.describe_dataset_import_job(DatasetImportJobArn=ds_import_job_arn)['Status']
    print(dataImportStatus)
    if dataImportStatus != 'ACTIVE' and dataImportStatus != 'CREATE_FAILED':
        sleep(30)
    else:
        break

CREATE_PENDING
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
ACTIVE
CPU times: user 19.7 ms, sys: 629 µs, total: 20.4 ms
Wall time: 2min


In [173]:
# Persist the names and ARNs created in this notebook with %store so they can be
# restored later with `%store -r`.
%store project
%store suffix
%store region
%store ds_import_job_arn
%store datasetGroupArn
%store target_datasetArn
%store bucket_name
%store role_arn

Stored 'project' (str)
Stored 'suffix' (str)
Stored 'region' (str)
Stored 'ds_import_job_arn' (str)
Stored 'datasetGroupArn' (str)
Stored 'target_datasetArn' (str)
Stored 'bucket_name' (str)
Stored 'role_arn' (str)
