# Import Related Time Series Data

In [22]:
import boto3
from time import sleep
import os
import pandas as pd
import json
import time
import pprint
import numpy as np

In [23]:
data_dir = 'data'
train_file_name = 'train.csv'
features_file_name = 'features.csv'
store_file_name = 'stores.csv'
feature_data = pd.read_csv(os.path.join(data_dir,features_file_name))
feature_df = feature_data.copy()
# Shift four dats in the future because of a basis of Monday in Forecast in uni of Weekly
feature_df.Date = pd.to_datetime(feature_df['Date']) + pd.DateOffset(days=3)



In [24]:
feature_df = feature_df.drop(['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5'], axis=1)
# Drop CPI, Unemployment because of missing values
# Drop isHoliday because isHoliday exists in the built-in data of AWS Forecast
feature_df = feature_df.drop(['CPI','Unemployment','IsHoliday'], axis=1)



In [25]:
feature_df = feature_df.rename(columns={'Store':'item_id','Date':'date'})
feature_df.reset_index(inplace=True)
feature_df = feature_df.drop('index', axis=1)
feature_df = feature_df.set_index('date')
feature_df.item_id = feature_df.item_id.astype(str)
feature_df.index = pd.to_datetime(feature_df.index, format = '%Y-%m-%d')
feature_df.index = feature_df.index.strftime('%Y-%m-%d %H:%M:%S')
cols = ['Temperature','Fuel_Price','item_id']
feature_df = feature_df[cols]
feature_df.head()

Unnamed: 0,Temperature,Fuel_Price,item_id
2010-02-08 00:00:00,42.31,2.572,1
2010-02-15 00:00:00,38.51,2.548,1
2010-02-22 00:00:00,39.93,2.514,1
2010-03-01 00:00:00,46.63,2.561,1
2010-03-08 00:00:00,46.5,2.625,1


## Split into target file and validation file

In [26]:
end_val_date = '2012-10-29'

In [27]:
related_feature_df = feature_df[feature_df.index < end_val_date]
#validation_stores_sales = stores_sales[stores_sales.index >= '2011-10-01']


In [28]:
# With the data in a great state, save it off as a CSV
related_time_series_filename = "related_time_series.csv"
related_time_series_path = data_dir + "/" + related_time_series_filename
related_feature_df.to_csv(related_time_series_path, header=False)

## Parameters

In [29]:
%store -r

In [30]:
DATASET_FREQUENCY = "W" 
TIMESTAMP_FORMAT = "yyyy-MM-dd hh:mm:ss"

project = 'WalmartKaggle'
related_suffix = '_related'

related_dataset_Name= project+'DS' + related_suffix + suffix

In [31]:
session = boto3.Session(region_name=region)
forecast = session.client(service_name='forecast')
forecast_query = session.client(service_name='forecastquery')

## Create schema

In [32]:
# Specify the schema of your dataset here. Make sure the order of columns matches the raw data files.
schema ={
   "Attributes":[
      {
         "AttributeName":"timestamp",
         "AttributeType":"timestamp"
      },
      {
         "AttributeName":"Temperature",
         "AttributeType":"float"
      },
      {
         "AttributeName":"Fuel_Price",
         "AttributeType":"float"
      },
      {
         "AttributeName":"item_id",
         "AttributeType":"string"
      }       
   ]
}

## Create Related Time Sereis Dataset

In [39]:
response=forecast.create_dataset(
                    Domain="CUSTOM",
                    DatasetType='RELATED_TIME_SERIES',
                    DatasetName=related_dataset_Name,
                    DataFrequency=DATASET_FREQUENCY, 
                    Schema = schema
)

In [40]:
related_datasetArn = response['DatasetArn']
forecast.describe_dataset(DatasetArn=related_datasetArn)

{'DatasetArn': 'arn:aws:forecast:us-east-2:057716757052:dataset/WalmartKaggleDS_related38922',
 'DatasetName': 'WalmartKaggleDS_related38922',
 'Domain': 'CUSTOM',
 'DatasetType': 'RELATED_TIME_SERIES',
 'DataFrequency': 'W',
 'Schema': {'Attributes': [{'AttributeName': 'timestamp',
    'AttributeType': 'timestamp'},
   {'AttributeName': 'Temperature', 'AttributeType': 'float'},
   {'AttributeName': 'Fuel_Price', 'AttributeType': 'float'},
   {'AttributeName': 'item_id', 'AttributeType': 'string'}]},
 'EncryptionConfig': {},
 'Status': 'ACTIVE',
 'CreationTime': datetime.datetime(2020, 3, 26, 9, 49, 0, 627000, tzinfo=tzlocal()),
 'LastModificationTime': datetime.datetime(2020, 3, 26, 9, 49, 0, 627000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'a3fce8d5-a713-4a4c-80cc-d39336e5f2a6',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Thu, 26 Mar 2020 09:49:02 GMT',
   'x-amzn-requestid': 'a3fce8d5-a713-4a4c-80cc-d39336e5f2a6',

## Create dataset_import_job used to download dataset from S3

In [41]:
# Upload Target File
boto3.Session().resource('s3').Bucket(bucket_name).Object(related_time_series_filename).upload_file(related_time_series_path)
related_s3DataPath = "s3://"+bucket_name+"/"+related_time_series_filename

In [42]:
# Finally we can call import the dataset
datasetImportJobName = 'DSIMPORT_JOB_RELATED_WALMART' + related_suffix + suffix
ds_import_job_response=forecast.create_dataset_import_job(DatasetImportJobName=datasetImportJobName,
                                                          DatasetArn=related_datasetArn,
                                                          DataSource= {
                                                              "S3Config" : {
                                                                 "Path":related_s3DataPath,
                                                                 "RoleArn": role_arn
                                                              } 
                                                          },
                                                          TimestampFormat=TIMESTAMP_FORMAT
                                                         )

In [43]:
ds_related_import_job_arn=ds_import_job_response['DatasetImportJobArn']
print(ds_related_import_job_arn)

arn:aws:forecast:us-east-2:057716757052:dataset-import-job/WalmartKaggleDS_related38922/DSIMPORT_JOB_RELATED_WALMART_related38922


In [44]:
forecast.describe_dataset_import_job(DatasetImportJobArn=ds_related_import_job_arn)

{'DatasetImportJobName': 'DSIMPORT_JOB_RELATED_WALMART_related38922',
 'DatasetImportJobArn': 'arn:aws:forecast:us-east-2:057716757052:dataset-import-job/WalmartKaggleDS_related38922/DSIMPORT_JOB_RELATED_WALMART_related38922',
 'DatasetArn': 'arn:aws:forecast:us-east-2:057716757052:dataset/WalmartKaggleDS_related38922',
 'TimestampFormat': 'yyyy-MM-dd hh:mm:ss',
 'DataSource': {'S3Config': {'Path': 's3://walmart-forecast/related_time_series.csv',
   'RoleArn': 'arn:aws:iam::057716757052:role/WalmartForecast'}},
 'Status': 'CREATE_PENDING',
 'CreationTime': datetime.datetime(2020, 3, 26, 9, 49, 8, 407000, tzinfo=tzlocal()),
 'LastModificationTime': datetime.datetime(2020, 3, 26, 9, 49, 8, 407000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'a74e35ef-a840-4924-bf86-2c5bba36f77c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Thu, 26 Mar 2020 09:49:11 GMT',
   'x-amzn-requestid': 'a74e35ef-a840-4924-bf86-2c5bba36f77c',
   'c

In [45]:
%%time

while True:
    dataImportStatus = forecast.describe_dataset_import_job(DatasetImportJobArn=ds_related_import_job_arn)['Status']
    print(dataImportStatus)
    if dataImportStatus != 'ACTIVE' and dataImportStatus != 'CREATE_FAILED':
        sleep(30)
    else:
        break

CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
CREATE_IN_PROGRESS
ACTIVE
CPU times: user 18.9 ms, sys: 1.18 ms, total: 20.1 ms
Wall time: 2min


In [46]:
%store related_suffix
%store ds_related_import_job_arn
%store related_datasetArn

Stored 'related_suffix' (str)
Stored 'ds_related_import_job_arn' (str)
Stored 'related_datasetArn' (str)
