# Import Target Dataset

In [4]:
import boto3
from time import sleep
import os
import pandas as pd
import json
import time
import pprint
import numpy as np

## Create Target Time Series File and Validation File

In [5]:
# Locations of the input CSV files.
data_dir = 'data'
train_file_name = 'train.csv'
features_file_name = 'features.csv'
store_file_name = 'stores.csv'

# Load the raw training data once and leave it untouched.
train_data = pd.read_csv(os.path.join(data_dir, train_file_name))

# Shift every date three days into the future. The stated reason is that
# weekly forecasting is based on Mondays (assumption to confirm: the raw
# dates are Fridays, so +3 days lands on a Monday).
shifted_dates = pd.to_datetime(train_data['Date']) + pd.DateOffset(days=3)

# Work on a copy that carries the shifted dates and the column names used
# by the rest of the notebook.
train_df = train_data.copy()
train_df['Date'] = shifted_dates
train_df = train_df.rename(columns={'Date': 'date', 'Store': 'item_id'})

In [6]:
# Sum Weekly_Sales over all rows that share the same item_id and date.
temp_df = train_df.groupby(['item_id', 'date'])['Weekly_Sales'].sum()

# Turn the grouped series into a flat frame, then index it by date.
stores_sales = pd.DataFrame(temp_df).reset_index().set_index('date')



In [7]:

# The dataset schema declares item_id as a string, so cast the ids.
stores_sales['item_id'] = stores_sales['item_id'].astype(str)

# Render the index as full "YYYY-MM-DD HH:MM:SS" text timestamps.
parsed_index = pd.to_datetime(stores_sales.index, format='%Y-%m-%d')
stores_sales.index = parsed_index.strftime('%Y-%m-%d %H:%M:%S')

# Column order has to match the schema: the value first, then the item id
# (the timestamp is carried by the index).
cols = ['Weekly_Sales', 'item_id']
stores_sales = stores_sales[cols]

In [8]:
## Split into target file and validation file

In [9]:
# Alternative split window, kept for reference:
# end_train_date = '2011-10-03' # Non-inclusive
# end_val_date = '2012-01-02'

# Cut-off dates used by the split in the next cell. Both are compared with
# `<` there, so each one is an exclusive upper bound.
end_train_date = '2012-07-30' # Non-inclusive
end_val_date = '2012-10-29'

In [10]:
# The index holds "YYYY-MM-DD HH:MM:SS" strings, so these comparisons are
# lexicographic; for zero-padded timestamps that equals chronological order.

# Training (target) rows: everything strictly before end_train_date.
target_stores_sales = stores_sales[stores_sales.index < end_train_date]

# Validation rows: the window [end_train_date, end_val_date).
# Both bounds are applied in a single mask. Previously the upper-bound filter
# was applied to the full frame and overwrote the lower-bound result, so the
# validation set also contained every training row.
in_validation_window = (
    (stores_sales.index >= end_train_date)
    & (stores_sales.index < end_val_date)
)
validation_stores_sales = stores_sales[in_validation_window]


In [11]:
# Persist the training split as a header-less CSV; the timestamp index is
# written as the first column.
target_time_series_filename = "target_time_series.csv"
target_time_series_path = "/".join((data_dir, target_time_series_filename))
target_stores_sales.to_csv(path_or_buf=target_time_series_path, header=False)

In [None]:
# Persist the validation split as a header-less CSV; the timestamp index is
# written as the first column.
validation_time_series_filename = "validation_time_series.csv"
validation_time_series_path = "/".join((data_dir, validation_time_series_filename))
validation_stores_sales.to_csv(path_or_buf=validation_time_series_path, header=False)

## Parameters

In [12]:
# Weekly data. TIMESTAMP_FORMAT is passed to the dataset import job and
# describes the timestamp column of the uploaded CSV.
DATASET_FREQUENCY = "W"
TIMESTAMP_FORMAT = "yyyy-MM-dd hh:mm:ss"

# Random five-digit suffix that keeps resource names unique between runs.
# Fixed-point formatting guarantees exactly five digits: str() of a very small
# float switches to scientific notation (e.g. '5.1e-05'), which could put 'e'
# or '-' into the suffix, and a short repr could yield fewer than five
# characters.
suffix = format(np.random.uniform(), '.9f')[4:9]


# Enter the project name you want to use
project = 'WalmartKaggle'
target_suffix = '_target'


target_datasetName = project + 'DS' + suffix
target_datasetGroupName = project + 'DSG' + suffix

In [13]:
# Read the notebook's resource metadata; only the parsing needs the open file.
with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:
    data = json.load(notebook_info)

# The region is the fourth colon-separated field of the resource ARN.
resource_arn = data['ResourceArn']
region = resource_arn.split(':')[3]
print(region)

us-east-2


In [14]:
# One boto3 session bound to the notebook's region, shared by both clients.
session = boto3.Session(region_name=region)

# `forecast` manages datasets and import jobs; `forecast_query` is the
# separate query client.
forecast = session.client('forecast')
forecast_query = session.client('forecastquery')

## Create schema

In [15]:
# Dataset schema. The attribute order must match the column order of the raw
# data files: timestamp, then the value, then the item id.
schema = {
    "Attributes": [
        {"AttributeName": attribute_name, "AttributeType": attribute_type}
        for attribute_name, attribute_type in (
            ("timestamp", "timestamp"),
            ("target_value", "float"),
            ("item_id", "string"),
        )
    ]
}

## Create Target Time Series Dataset

In [16]:
# Arguments for the target time series dataset, collected in one place.
create_dataset_args = {
    "Domain": "CUSTOM",
    "DatasetType": 'TARGET_TIME_SERIES',
    "DatasetName": target_datasetName,
    "DataFrequency": DATASET_FREQUENCY,
    "Schema": schema,
}

# Create the dataset; the response carries its ARN.
response = forecast.create_dataset(**create_dataset_args)

In [17]:
# Keep the ARN of the dataset that was just created; the import job below
# refers to the dataset by this ARN.
target_datasetArn = response['DatasetArn']
# Bare last expression so the notebook displays the dataset description.
forecast.describe_dataset(DatasetArn=target_datasetArn)

{'DatasetArn': 'arn:aws:forecast:us-east-2:057716757052:dataset/WalmartKaggleDS38922',
 'DatasetName': 'WalmartKaggleDS38922',
 'Domain': 'CUSTOM',
 'DatasetType': 'TARGET_TIME_SERIES',
 'DataFrequency': 'W',
 'Schema': {'Attributes': [{'AttributeName': 'timestamp',
    'AttributeType': 'timestamp'},
   {'AttributeName': 'target_value', 'AttributeType': 'float'},
   {'AttributeName': 'item_id', 'AttributeType': 'string'}]},
 'EncryptionConfig': {},
 'Status': 'ACTIVE',
 'CreationTime': datetime.datetime(2020, 3, 26, 8, 53, 51, 778000, tzinfo=tzlocal()),
 'LastModificationTime': datetime.datetime(2020, 3, 26, 8, 53, 51, 778000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'fbf6dfe9-3537-4894-92d3-2a4b7e3df4df',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Thu, 26 Mar 2020 08:53:51 GMT',
   'x-amzn-requestid': 'fbf6dfe9-3537-4894-92d3-2a4b7e3df4df',
   'content-length': '501',
   'connection': 'keep-alive'},
  'RetryAttempt

## Create role
Uncomment the role-creation cell below if you need to create a new IAM role; otherwise the existing role is used.

In [18]:
# Use existing role
# This ARN is passed as the RoleArn of the import job's S3Config.
# NOTE(review): the ARN is account-specific; replace it with a role from your
# own account before running.
role_arn = "arn:aws:iam::057716757052:role/WalmartForecast"

In [19]:
# iam = boto3.client("iam")

# # Enter the role name you want to use
# # role_name = "WalmartForecast" + suffix
# role_name = "WalmartForecast" 
# assume_role_policy_document = {
#     "Version": "2012-10-17",
#     "Statement": [
#         {
#           "Effect": "Allow",
#           "Principal": {
#             "Service": "forecast.amazonaws.com"
#           },
#           "Action": "sts:AssumeRole"
#         }
#     ]
# }

# create_role_response = iam.create_role(
#     RoleName = role_name,
#     AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
# )

# # Attach the AmazonForecastFullAccess managed policy to the role.
# # NOTE(review): S3 access is granted separately by the AmazonS3FullAccess attachment below;
# # to narrow it, consider a policy that only allows reading your bucket, or the AmazonS3ReadOnlyAccess policy.
# policy_arn = "arn:aws:iam::aws:policy/AmazonForecastFullAccess"
# iam.attach_role_policy(
#     RoleName = role_name,
#     PolicyArn = policy_arn
# )

# # Now add S3 support
# iam.attach_role_policy(
#     PolicyArn='arn:aws:iam::aws:policy/AmazonS3FullAccess',
#     RoleName=role_name
# )
# time.sleep(60) # wait for a minute to allow IAM role policy attachment to propagate

# role_arn = create_role_response["Role"]["Arn"]
# print(role_arn)

## Create a bucket
Uncomment the following code if needed

In [20]:
# Use an existing bucket
# The target CSV is uploaded to this bucket before the import job is created.
# NOTE(review): the name is environment-specific; replace it with a bucket you own.
bucket_name = "walmart-forecast" 

In [21]:
# print(region)
# s3 = boto3.client('s3')
# account_id = boto3.client('sts').get_caller_identity().get('Account')
# # Enter the bucket name you want to use
# # bucket_name = account_id + "forecastpoc-gsmoon"
# # bucket_name = "walmart-forecast" + suffix
# bucket_name = "walmart-forecast" 
# print(bucket_name)
# if region != "us-east-1":
#     s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': region})
# else:
#     s3.create_bucket(Bucket=bucket_name)

## Create a dataset import job to load the dataset from S3

In [22]:
# Upload the local target CSV to the bucket, keyed by its bare file name.
target_bucket = boto3.Session().resource('s3').Bucket(bucket_name)
target_bucket.Object(target_time_series_filename).upload_file(target_time_series_path)

# S3 URI of the uploaded object, used as the import job's data source path.
target_s3DataPath = "s3://{}/{}".format(bucket_name, target_time_series_filename)

In [23]:
# Start importing the uploaded CSV into the target dataset.
datasetImportJobName = 'DSIMPORT_JOB_TARGET_WALMART' + suffix

# Where the data lives and which role is used to read it.
target_data_source = {
    "S3Config": {
        "Path": target_s3DataPath,
        "RoleArn": role_arn,
    }
}

ds_import_job_response = forecast.create_dataset_import_job(
    DatasetImportJobName=datasetImportJobName,
    DatasetArn=target_datasetArn,
    DataSource=target_data_source,
    TimestampFormat=TIMESTAMP_FORMAT,
)

In [24]:
# Keep the import job's ARN; the status polling below looks the job up by it.
ds_target_import_job_arn=ds_import_job_response['DatasetImportJobArn']
print(ds_target_import_job_arn)

arn:aws:forecast:us-east-2:057716757052:dataset-import-job/WalmartKaggleDS38922/DSIMPORT_JOB_TARGET_WALMART38922


In [25]:
%%time

# Poll the import job every 30 seconds, printing each status, until it
# reports one of the two statuses that end the wait.
finished_statuses = ('ACTIVE', 'CREATE_FAILED')
while True:
    dataImportStatus = forecast.describe_dataset_import_job(
        DatasetImportJobArn=ds_target_import_job_arn
    )['Status']
    print(dataImportStatus)
    if dataImportStatus in finished_statuses:
        break
    sleep(30)

CREATE_PENDING
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
ACTIVE
CPU times: user 19.5 ms, sys: 778 µs, total: 20.3 ms
Wall time: 2min


In [27]:
# Persist these variables with IPython's %store magic so they can be restored
# later with `%store -r`.
%store project
%store suffix
%store target_suffix
%store region
%store ds_target_import_job_arn
%store target_datasetArn
%store bucket_name
%store role_arn

Stored 'project' (str)
Stored 'suffix' (str)
Stored 'target_suffix' (str)
Stored 'region' (str)
Stored 'ds_target_import_job_arn' (str)
Stored 'target_datasetArn' (str)
Stored 'bucket_name' (str)
Stored 'role_arn' (str)
