# SEC351 - re:Invent 2019 Timeseries Forecasting - Example
## _You should complete the data exploration notebook first_

Load libraries that we will need for the model training and deployment.

In [None]:
# Import important libraries
import pandas as pd
# import numpy as np
import boto3
# import io
# import re
import sagemaker as sage
# from time import gmtime, strftime
# import itertools
from math import sqrt
from matplotlib import pyplot
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from io import StringIO
# from io import BytesIO
from datetime import datetime

### Import the dataset locally and load it into the S3 bucket
_This step assumes you've completed the accompanying data exploration notebook and have created an appropriate CSV data file in the local directory._ In our case, our exploration generated "AwsSecurityAudit_Meta31_DescribeDBInstances.csv".   
We first load the data set into a Pandas DataFrame for feature selection and cleaning. We select those features that are relevant to this forecasting task by manually specifying column names, parse the time, and
- mark all NA values with 0
- drop the first 24 hours

We included optional code to save the data in a CSV file in the local notebook instance. Finally display the first 5 rows to inspect the data frame. 

In [None]:
# We are reading the csv file that we created in the prior data exploration notebook
def parse(x):
    """Convert a space-separated 'YYYY MM DD HH' string into a datetime."""
    timestamp_format = '%Y %m %d %H'
    return datetime.strptime(x, timestamp_format)

# Load the CSV produced by the data exploration notebook. Its first column
# becomes the index, which is then labelled 'date'.
UserEventFreq_df = pd.read_csv(
    'AwsSecurityAudit_Meta31_DescribeDBInstances.csv', index_col=0
).rename_axis('date')
UserEventFreq_df.head()

In [None]:
# Keep only the 'count' column. The double brackets select a one-column
# DataFrame rather than a Series.
dateFreq_df = UserEventFreq_df[['count']]
dateFreq_df.head()

In [None]:
# Print a summary of the frame (index, column dtype, non-null count).
dateFreq_df.info()

#### Store data in S3 
A more persistent data store.  Just in case we close the notebook instance or want to use this data later.  
To upload the data to S3, we define the name of the bucket and the prefix that will be used throughout the notebook. We then create an S3 resource client and upload the data to S3 directly from the data frame object in memory. 

In [None]:
# Define the S3 bucket and key prefix used for every upload in this notebook
region = boto3.Session().region_name  # region of the default boto3 session
bucket ='reinvent2019-builder-working' # <==  Change the name of this bucket
prefix = 'data'  # key prefix for data.csv, train/train.csv and validation/val.csv

In [None]:
# Write data to s3 (rather than keeping in the notebook instance)
s3 = boto3.client("s3")  # low-level client; the upload below goes through s3_resource instead
s3_resource = boto3.resource('s3')

# Serialise the frame to CSV in memory and upload it as <prefix>/data.csv.
csv_buffer = StringIO()
dateFreq_df.to_csv(csv_buffer)
s3_resource.Object(bucket, prefix+'/data.csv').put(Body=csv_buffer.getvalue())

### Visualize the Data
Plot the features in the dataset to look for repeating patterns.

In [None]:
# Draw one stacked subplot per feature; every subplot after the first shares
# the first subplot's x-axis.
features_of_interest = ['count']
pyplot.figure(figsize=(12, 3 * len(features_of_interest)))
for i, f in enumerate(features_of_interest):
    if i == 0:
        ax0 = pyplot.subplot(len(features_of_interest), 1, i + 1)
    else:
        pyplot.subplot(len(features_of_interest), 1, i + 1, sharex=ax0)
    dateFreq_df[f].plot()
    pyplot.title(f, y=0.85, loc='right')
pyplot.subplots_adjust(hspace=0.05)

### Prepare data for timeseries forecasting
We next define a function that will take timeseries data and create a data structure where input sequence `(t-n, ... t-1)` forecasts an output sequence `(t, t+1, ... t+n)`. This will provide training data for the forecasting algorithm.  

In our simple example, we're just going to use the value at time t to predict the value at time t+1. 

In [None]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of shifted columns.

    Args:
        data: Observations, one row per time step. May be a flat list, a
            list of rows, a 1-D or 2-D array, a Series or a DataFrame.
        n_in: Number of lag steps (t-n_in ... t-1) emitted as input columns.
        n_out: Number of steps (t ... t+n_out-1) emitted as forecast columns.
        dropnan: When True, drop every row that contains a NaN (shifting
            introduces NaN at the start and end of the table).

    Returns:
        A DataFrame whose columns are named 'var<j>(t-<i>)', 'var<j>(t)' and
        'var<j>(t+<i>)', ordered from the oldest lag to the furthest forecast.
    """
    df = pd.DataFrame(data)
    # Count the variables on the frame itself. Inspecting the input type
    # assumed every list was univariate and failed on 1-D arrays and Series,
    # which have no second shape dimension.
    n_vars = df.shape[1]
    cols, names = [], []

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]

    agg = pd.concat(cols, axis=1)
    agg.columns = names

    if dropnan:
        agg = agg.dropna()  # drop rows with NaN values
    return agg

In [None]:
# transform series into train and test sets for supervised learning
def prepare_data(series, n_test, n_lag, n_seq):
    """Scale a single-column series to [0, 1] and split it into train/test arrays.

    Args:
        series: Single-column DataFrame of numeric observations, one row per
            time step. It is not modified.
        n_test: Number of trailing supervised rows reserved for the test set.
        n_lag: Number of lag steps passed to series_to_supervised.
        n_seq: Number of forecast steps passed to series_to_supervised.

    Returns:
        A tuple (scaler, train, test): the fitted MinMaxScaler, and two arrays
        whose columns are [scaled value at t-1, scaled value at t].
    """
    # Work on a float copy. `series.values` can be a view of the caller's
    # frame, and the earlier in-place label encoding of column 0 overwrote the
    # caller's counts with rank codes, which also discarded their magnitude.
    raw_values = series.values.astype('float64')

    # rescale values to 0, 1
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_values = scaler.fit_transform(raw_values)
    scaled_values = scaled_values.reshape(len(scaled_values), 1)

    # transform into supervised learning problem X, y
    supervised = series_to_supervised(scaled_values, n_lag, n_seq)
    supervised = supervised[['var1(t-1)', 'var1(t)']]
    supervised_values = supervised.values

    # Split on an explicit index: slicing with [0:-n_test] would return an
    # empty training set when n_test is 0.
    split = max(len(supervised_values) - n_test, 0)
    train, test = supervised_values[:split], supervised_values[split:]
    return scaler, train, test

In [None]:
# Series to forecast: the single-column 'count' frame.
timeSeries = dateFreq_df

# configure
n_lag = 1    # number of past steps used as input
n_seq = 1    # number of future steps to predict
n_test = 30  # number of trailing rows held out as the test set

# prepare data
scaler, train, test = prepare_data(timeSeries, n_test, n_lag, n_seq)

In [None]:
# Show the (rows, columns) shape of the training array.
print(train.shape)

### Split data into training and validation sets and save in S3

In [None]:
# Wrap the training array in a DataFrame for inspection and CSV export.
train_df = pd.DataFrame(train)
train_df.head(10)

In [None]:
# Print a summary of the training frame (row count, column dtypes).
train_df.info()

In [None]:
# Training split: the first 70 rows of train_df, uploaded without the index
# column as <prefix>/train/train.csv.
csv_buffer = StringIO()
train_df.iloc[:70].to_csv(csv_buffer, index=False)
s3_resource.Object(bucket, prefix + '/train/train.csv').put(
    Body=csv_buffer.getvalue()
)

In [None]:
# Validation split: every row of train_df from position 70 onward, uploaded
# without the index column as <prefix>/validation/val.csv.
csv_buffer = StringIO()
train_df.iloc[70:].to_csv(csv_buffer, index=False)
s3_resource.Object(bucket, prefix + '/validation/val.csv').put(
    Body=csv_buffer.getvalue()
)

### Build the Docker image and push it to ECR


In [None]:
%%sh
# Set the name of our algorithm; it is used as both the local image name and
# the ECR repository name.
algorithm_name=reinvent2019-user-event-predict

# Make the train/serve scripts executable (presumably the container's entry
# points - confirm against the Dockerfile).
chmod +x train/train
chmod +x train/serve

# Get the account and region defined in the current configuration (default to us-west-2 if none defined)
account=$(aws sts get-caller-identity --query Account --output text)
region=$(aws configure get region)
region=${region:-us-west-2}
registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:latest"

# If the repository doesn't exist in ECR, create it.
if ! aws ecr describe-repositories --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --repository-name "${algorithm_name}" > /dev/null
fi

# Log docker in to ECR. `aws ecr get-login` was removed in AWS CLI v2, so pipe
# the password from `get-login-password` into `docker login` instead.
aws ecr get-login-password --region "${region}" \
    | docker login --username AWS --password-stdin "${registry}"

# Build the docker image locally with the image name and then push it to ECR
# with the full name.
sudo docker build  -t "${algorithm_name}" .
docker tag "${algorithm_name}" "${fullname}"
docker push "${fullname}"

In [None]:
# Read the training CSV back from S3 to confirm the upload.
# NOTE(review): pandas needs an S3 filesystem backend (e.g. s3fs) to open
# 's3://' paths - confirm it is installed in this kernel.
traindata_df = pd.read_csv('s3://'+bucket+'/'+prefix+'/train/train.csv')
traindata_df.head()

### Train the model and set up the endpoint for inference.

In [None]:
# Execution role and SageMaker session used for training and deployment.
role = sage.get_execution_role()
sess = sage.Session()
# S3 prefix that holds train.csv; passed to model.fit() as the training input.
source_bucket_uri = 's3://'+bucket+'/'+prefix+'/train/'

# Get account and region to build the ECR image URI
# Make sure we're using the same algorithm name as in the build-and-push cell
account = sess.boto_session.client('sts').get_caller_identity()['Account']
region = sess.boto_session.region_name
image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, 'reinvent2019-user-event-predict')

In [None]:
# Display the ECR image URI that training will use.
image

### Train Model

In [None]:
# Set up SageMaker Estimator and fit the training job
# The positional arguments after `image` are the role, the instance count (1)
# and the instance type. Model artifacts are written to the session's default
# bucket under output/.
# NOTE(review): ml.p3.8xlarge is a multi-GPU instance type - confirm the
# container needs it, since a smaller type would cost less.
# NOTE(review): only the train/ prefix is passed to fit(); the CSV uploaded to
# <prefix>/validation/ is not used by this call.
model = sage.estimator.Estimator(image,
                      role, 1, 'ml.p3.8xlarge',
                      output_path="s3://{}/output".format(sess.default_bucket()),
                      sagemaker_session=sess)
model.fit(source_bucket_uri)

### Create endpoint

In [None]:
# Create the model endpoint: one ml.t2.medium instance, with request payloads
# serialised as CSV.
predictor = model.deploy(1, 'ml.t2.medium', serializer=sage.predictor.csv_serializer)

In [None]:
# Display the predictor object returned by deploy().
predictor

### Test that the endpoint is working and make predictions

In [None]:
# Check what kind of object prepare_data returned for the test split.
type(test)

In [None]:
# Show the (rows, columns) shape of the test split.
test.shape

In [None]:
# Wrap the test array in a DataFrame so columns can be selected by position.
testData_df = pd.DataFrame(test)

In [None]:
# Preview the test frame: column 0 is the value at t-1, column 1 the value at t.
testData_df.head()

In [None]:
# Keep only the first column (the t-1 input); the second column is the value
# to be predicted.
testData_df=testData_df.iloc[:,:1]
testData_df.head()

#### Aside - Testing the endpoint outside the notebook
In the previous (and following) code, we call the model endpoint for inference from within the notebook environment.  The more common use case, however, is calling the endpoint from an application running in the same account.  For this, we'll use the `invoke_endpoint()` API call. For example, we might call it from a Lambda function.  
So, before we proceed, we just want to check that this API call works properly.

In [None]:
import io

# Our simpfile.csv is just a file with two numbers - invoke_endpoint for this
#   model likes file objects to read
# NOTE(review): read_csv treats the first line as a header by default, and
# to_csv(header=None) does not write it back, so the first line of
# simpfile.csv never reaches the payload - confirm the file has a header row.
payload = pd.read_csv('simpfile.csv')
payload_file = io.StringIO()
payload.to_csv(payload_file, header = None, index = None)

**Need to set the EndpointName**

In [None]:
import boto3
# Call the endpoint through the SageMaker runtime API, sending the CSV text
# built in payload_file as the request body.
client = boto3.client('sagemaker-runtime')
response = client.invoke_endpoint(
    EndpointName = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',  # placeholder: replace with the deployed endpoint's name
    ContentType = 'text/csv',
    Body = payload_file.getvalue())

In [None]:
import json
# Read the response body, decode it and parse it as JSON.
# NOTE(review): the predictor cells in this notebook treat the endpoint's
# output as newline-separated text - confirm the container returns something
# json.loads accepts here.
result = json.loads(response['Body'].read().decode())
print(result)

#### End of aside....

#### Testing and scoring unseen data
Let's return to testing our new time series predictor, and see how we could score new points.

In [None]:
# Re-create the SageMaker session; it is used at the end of the notebook to
# delete the endpoint.
sess = sage.Session()

In [None]:
# Inspect the array that will be sent to the endpoint.
testData_df.values

In [None]:
# Send the test inputs to the endpoint, decode the UTF-8 response and split it
# into one string per line.
predictions = predictor.predict(testData_df.values).decode('utf-8').split('\n')

In [None]:
# Display the raw prediction strings.
predictions

In [None]:
# Check the type of a single prediction entry before it is parsed to float.
type(predictions[1])

In [None]:
n_features = 1
# Everything after the first n_features columns of a test row is the target;
# here that is the single value at time t.
actual = [row[n_features:] for row in test]

In [None]:
# Parse one comma-separated prediction line per test row into a list of floats.
forecasts = [
    [float(s) for s in predictions[i].split(',')]
    for i in range(len(test))
]

#### Calculate RMSE in scaled and unscaled features space

In [None]:
# evaluate the RMSE for each forecast time step
from sklearn.metrics import mean_squared_error

def evaluate_forecasts(test, forecasts, n_lag, n_seq):
    """Print the RMSE between actual and forecast values for each forecast step.

    Args:
        test: Rows of actual values; element i of a row belongs to step i.
        forecasts: Rows of predicted values, aligned with `test`.
        n_lag: Accepted but not used by this function.
        n_seq: Number of forecast steps to score.
    """
    for step in range(n_seq):
        observed = [row[step] for row in test]
        predicted = [forecast[step] for forecast in forecasts]
        mse = mean_squared_error(observed, predicted)
        print('t+%d RMSE: %f' % ((step+1), sqrt(mse)))


# Score the forecasts in the scaled feature space.
evaluate_forecasts(actual, forecasts, n_lag, n_seq)

In [None]:
# Transform forecasts and actuals back to the scale the model was trained from.
# Reuse the scaler returned by prepare_data(): it is the exact transform that
# produced the model's inputs. Fitting a fresh scaler here on a different set
# of rows (the supervised frame drops the first observation) can pick a
# different min/max and so skew the inverse.
inv_forecast = scaler.inverse_transform(forecasts)
inv_actual = scaler.inverse_transform(actual)

# Score the forecasts again, now in the unscaled feature space.
evaluate_forecasts(inv_actual, inv_forecast, n_lag, n_seq)

In [None]:
# plot the forecasts in the context of the original dataset
def plot_forecasts(series, forecasts, n_test):
    """Plot a series and overlay each forecast as a red segment.

    Each forecast is drawn starting from the observation just before it, so
    the red segment connects the preceding actual value to the predictions.

    Args:
        series: Series of observed values to draw as the base line.
        forecasts: Sequence of forecasts, each a sequence of predicted values.
        n_test: Number of trailing observations the forecasts correspond to.
    """
    # plot the entire dataset
    pyplot.figure(figsize=(20,5))
    pyplot.plot(series.values)

    # position of the observation that precedes the first forecast
    base = len(series) - n_test - 1
    for idx, forecast in enumerate(forecasts):
        start = base + idx
        stop = start + len(forecast) + 1
        xs = list(range(start, stop))
        ys = [series.values[start]] + list(forecast)
        pyplot.plot(xs, ys, color='red')

    # show the plot
    pyplot.show()

In [None]:
# Last 32 observations of 'count': the window the forecasts are plotted against.
# NOTE(review): the name 'polution' looks like a leftover from another example;
# the data here is event counts.
polution = dateFreq_df['count'][-32:]
polution

### Graph of original series

In [None]:
# Re-draw the full series: one stacked subplot per feature, with later
# subplots sharing the first subplot's x-axis.
features_of_interest = ['count']
pyplot.figure(figsize=(12, 3 * len(features_of_interest)))
for i, f in enumerate(features_of_interest):
    if i > 0:
        pyplot.subplot(len(features_of_interest), 1, i + 1, sharex=ax0)
    else:
        ax0 = pyplot.subplot(len(features_of_interest), 1, i + 1)
    dateFreq_df[f].plot()
    pyplot.title(f, y=0.85, loc='right')
pyplot.subplots_adjust(hspace=0.05)

### Graph of forecast on last 30 days of dataset

In [None]:
# Overlay the inverse-scaled forecasts on the 32-observation window, treating
# the last 30 observations as the forecast period.
plot_forecasts(polution, inv_forecast, 30)

### Graph errors to determine if our ongoing series has an anomaly. Higher than normal error = anomaly

In [None]:
# Absolute difference between actual and forecast values, plotted per test row.
errors = abs(inv_actual - inv_forecast)
pyplot.figure(figsize=(20,5))
pyplot.plot(errors)
pyplot.title('Errors', y=0.85, loc='right')
pyplot.show()

In [None]:
import numpy as np
# Express each absolute error as a percentage of the mean absolute error, so
# 100 marks an average-sized error.
errors = abs(inv_actual - inv_forecast)
mean_error = np.mean(errors)
pct_error = errors / mean_error * 100
std_error = np.std(errors)  # spread of the absolute errors; displayed in the next cell
pyplot.figure(figsize=(20,5))
pyplot.plot(pct_error)
pyplot.title('Percent Errors', y=0.85, loc='right')
pyplot.show()

In [None]:
# Display the standard deviation of the absolute errors.
std_error

In [None]:
# Delete the endpoint so you avoid any recurring charges
sess.delete_endpoint(predictor.endpoint)