In [1]:
import json
import sagemaker
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.sklearn.model import SKLearnPredictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import NumpyDeserializer

In [2]:
# S3 prefix
s3_bucket = 's3://octank-america-sagemaker-resources'
train_prefix = '/training'

FRAMEWORK_VERSION = "0.23-1"
script_path = 'script_models/train_deploy_isolationForest_scikit.py'

In [3]:
sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()
print(role)

arn:aws:iam::153918162224:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole


In [4]:
sklearn = SKLearn(
    entry_point=script_path,
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.c5.xlarge",
    role=role,
    sagemaker_session=sagemaker_session,
    hyperparameters={'n_estimators': 100, 'contamination':0.05, 'max_features':1.0})

In [None]:
sklearn.fit({'train': s3_bucket+train_prefix+'/thermafuser_readings.csv'})

2021-03-24 04:33:21 Starting - Starting the training job...
2021-03-24 04:33:44 Starting - Launching requested ML instancesProfilerReport-1616560400: InProgress
......
2021-03-24 04:34:44 Starting - Preparing the instances for training......
2021-03-24 04:35:44 Downloading - Downloading input data...
2021-03-24 04:36:11 Training - Training image download completed. Training in progress.[34m2021-03-24 04:36:12,498 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-03-24 04:36:12,501 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-03-24 04:36:12,508 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-03-24 04:36:19,904 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-03-24 04:36:19,915 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-03-24 04:36:19,924 sagem

In [None]:
serializer = JSONSerializer()
deserializer = NumpyDeserializer()

In [None]:
predictor = sklearn.deploy(initial_instance_count=1, instance_type="ml.c5.large", serializer=serializer, 
                           deserializer=deserializer)

-------------!

In [None]:
print(predictor.endpoint_name)

sagemaker-scikit-learn-2021-03-24-04-37-33-374


## Test the endpoint

In [None]:
endpoint = predictor.endpoint_name
#endpoint = 'sagemaker-scikit-learn-2021-03-23-23-28-53-946'

predictor = SKLearnPredictor(endpoint_name=endpoint, 
                        sagemaker_session=sagemaker_session, serializer=serializer, deserializer=deserializer)

print(predictor.endpoint_name)

sagemaker-scikit-learn-2021-03-24-04-37-33-374


In [None]:
cols_names = {'_roomOccupied':'roomOccupied', '_supplyAir':'supplyAir', '_timestamp':'time', 
              '_occupiedCoolingSetpoint':'occupiedCoolingSetpoint', '_terminalLoad':'terminalLoad', 
              '_zoneTemperature':'zoneTemperature', '_airflowFeedback':'airflowFeedback', 
              '_occupiedHeatingSetpoint':'occupiedHeatingSetpoint'}

test_df = pd.read_csv('data/thermafuser_test.csv')
test_df = test_df.rename(columns=cols_names)

In [None]:
test_df

Unnamed: 0.1,Unnamed: 0,_thermafuserId,roomOccupied,supplyAir,time,occupiedCoolingSetpoint,terminalLoad,zoneTemperature,airflowFeedback,occupiedHeatingSetpoint
0,5854,1,True,75.800003,2018-08-03 16:05:00,76.0,,75.000000,205.0,70.0
1,7678,1,True,74.199997,2018-08-10 00:05:05,76.0,,74.900002,80.0,70.0
2,12876,1,True,71.900002,2018-08-28 01:30:02,76.0,,73.699997,84.0,70.0
3,3665,1,True,74.400002,2018-07-27 01:35:04,76.0,,75.699997,84.0,70.0
4,4378,1,True,74.199997,2018-07-29 13:05:03,76.0,,74.900002,129.0,70.0
...,...,...,...,...,...,...,...,...,...,...
2528,6425,1,True,75.199997,2018-08-05 15:40:00,76.0,,75.599998,80.0,70.0
2529,19877,1,True,71.699997,2018-09-21 09:05:02,76.0,,72.300003,84.0,70.0
2530,19229,1,True,71.300003,2018-09-19 03:05:05,76.0,,72.400002,82.0,70.0
2531,8681,1,True,70.400002,2018-08-13 11:40:03,76.0,,73.800003,84.0,70.0


In [None]:
day_quarters = {0: '0-5', 1: '6-11', 2: '12-17', 3: '18-23'}

# To ensure that all of the quarters are created
fake_entries = {'time': [None, None, None, None], 'airflowFeedback': [None, None, None, None],
                'occupiedCoolingSetpoint': [None, None, None, None], 'roomOccupied': [None, None, None, None],
                'roomOccupied': [None, None, None, None], 'supplyAir': [None, None, None, None],
                'terminalLoad': [None, None, None, None], 'zoneTemperature': [None, None, None, None],
                'Day quarter': [0, 1, 2, 3]
                }
quarters_df = pd.DataFrame(fake_entries)

In [None]:
res_df = test_df
res_df = res_df.reset_index()

#Create day quarters
res_df['time'] = pd.to_datetime(res_df['time'])
res_df['Day quarter'] = res_df['time'].map(lambda x: x.hour // 6)
concat_df = pd.concat([res_df, quarters_df], axis=0)
dummies = pd.get_dummies(concat_df['Day quarter'])
concat_df = pd.concat([concat_df, dummies], axis=1)
concat_df.rename(columns=day_quarters, inplace=True)

#Delete fake quarter entries
concat_df = concat_df.dropna(axis=0, subset=['time'])

#Create rolling windows
concat_df['AirflowRoll'] = concat_df['airflowFeedback'].rolling(window=12).mean()
concat_df['SupplyAirRoll'] = concat_df['supplyAir'].rolling(window=12).mean()
concat_df['ZoneTemperatureRoll'] = concat_df['zoneTemperature'].rolling(window=12).mean()

concat_df['0-5 Roll'] = concat_df['0-5'].rolling(window=12).median()
concat_df['6-11 Roll'] = concat_df['6-11'].rolling(window=12).median()
concat_df['12-17 Roll'] = concat_df['12-17'].rolling(window=12).median()
concat_df['18-23 Roll'] = concat_df['18-23'].rolling(window=21).median()

#Keep only the interesting columns

predict_df = concat_df[['AirflowRoll', 'SupplyAirRoll', 'ZoneTemperatureRoll', '0-5 Roll', '6-11 Roll', '12-17 Roll', '18-23 Roll']]
predict_df = predict_df.dropna()

In [None]:
predict_df

Unnamed: 0,AirflowRoll,SupplyAirRoll,ZoneTemperatureRoll,0-5 Roll,6-11 Roll,12-17 Roll,18-23 Roll
20,82.750000,72.841667,73.933332,0.0,0.0,0.0,0.0
21,82.083333,72.866666,73.941666,0.0,0.0,0.0,0.0
22,83.166667,72.558333,73.616667,0.0,0.0,0.0,0.0
23,83.166667,72.483333,73.683333,0.0,0.0,0.0,0.0
24,83.500000,72.691666,73.800000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
2528,77.083333,72.683333,74.175000,0.0,0.0,0.0,0.0
2529,77.083333,72.666666,74.100000,0.0,0.0,0.0,0.0
2530,76.916667,72.616666,74.008334,0.0,0.0,0.0,0.0
2531,77.583333,72.475000,73.975001,0.0,0.0,0.0,0.0


In [None]:
predict_df.describe()

Unnamed: 0,AirflowRoll,SupplyAirRoll,ZoneTemperatureRoll,0-5 Roll,6-11 Roll,12-17 Roll,18-23 Roll
count,2513.0,2513.0,2513.0,2513.0,2513.0,2513.0,2513.0
mean,80.796127,71.791825,73.942516,0.038002,0.025468,0.026064,0.008357
std,7.698922,2.354095,0.3539,0.16407,0.122783,0.125465,0.09105
min,59.25,60.35,72.740001,0.0,0.0,0.0,0.0
25%,76.0,72.233333,73.708333,0.0,0.0,0.0,0.0
50%,78.75,72.616666,73.94,0.0,0.0,0.0,0.0
75%,84.833333,72.974999,74.183333,0.0,0.0,0.0,0.0
max,107.583333,74.583334,75.174999,1.0,1.0,1.0,1.0


In [None]:
test_df

Unnamed: 0.1,Unnamed: 0,_thermafuserId,roomOccupied,supplyAir,time,occupiedCoolingSetpoint,terminalLoad,zoneTemperature,airflowFeedback,occupiedHeatingSetpoint
0,5854,1,True,75.800003,2018-08-03 16:05:00,76.0,,75.000000,205.0,70.0
1,7678,1,True,74.199997,2018-08-10 00:05:05,76.0,,74.900002,80.0,70.0
2,12876,1,True,71.900002,2018-08-28 01:30:02,76.0,,73.699997,84.0,70.0
3,3665,1,True,74.400002,2018-07-27 01:35:04,76.0,,75.699997,84.0,70.0
4,4378,1,True,74.199997,2018-07-29 13:05:03,76.0,,74.900002,129.0,70.0
...,...,...,...,...,...,...,...,...,...,...
2528,6425,1,True,75.199997,2018-08-05 15:40:00,76.0,,75.599998,80.0,70.0
2529,19877,1,True,71.699997,2018-09-21 09:05:02,76.0,,72.300003,84.0,70.0
2530,19229,1,True,71.300003,2018-09-19 03:05:05,76.0,,72.400002,82.0,70.0
2531,8681,1,True,70.400002,2018-08-13 11:40:03,76.0,,73.800003,84.0,70.0


In [27]:
json_df = test_df.to_json()
#payload = json.dumps(json_df)

anomaly = predictor.predict(json_df, initial_args={'ContentType': 'application/json', 'Accept': 'application/json'})
print(test_df.shape)

(2533, 10)


In [23]:
print(predictor.accept)
#print(payload)

('application/x-npy',)


In [24]:
#data = json.loads(payload)
#print(data)
print(type(anomaly))
print(anomaly.size)

<class 'numpy.ndarray'>
2522


In [25]:
zero_arr = anomaly + 1
print(np.count_nonzero(zero_arr))

2521


In [None]:
json_df2 = pd.read_json(data)

NameError: name 'data' is not defined

In [None]:
json_df2.head()

In [None]:
def output_fn(prediction, response_content_type):
    """Format prediction output

    The default accept/content-type between containers for serial inference is JSON.
    We also want to set the ContentType or mimetype as the same value as accept so the next
    container can read the response payload correctly.
    """
    if response_content_type == "application/json":
        print('json return')
        return("json content type")
    elif response_content_type == 'text/csv':
        print("csv return")
        return "csv content type"
    elif response_content_type == 'application/x-npy':
        print('return numpy')
        #serialized = pickle.dumps(prediction)
        json_output = prediction.tolist()

        return worker.Response(json.dumps(json_output), mimetype=response_content_type)

        #return json.dumps(serialized)
    else:
        raise Exception("{} accept type is not supported by this script.".format(response_content_type))
        
        