In [1]:
library(reticulate)
sagemaker <- import("sagemaker")

# S3 location used by this notebook: the session's default bucket plus a
# fixed key prefix.
bucket <- sagemaker$Session()$default_bucket()
prefix <- "sagemaker/demo-r-byo"

# Execution role returned by the SageMaker SDK.
role <- sagemaker$get_execution_role()

In [2]:
# Display the execution role returned by sagemaker$get_execution_role().
role

In [17]:
# SageMaker session object; used further down to upload files to S3.
session <- sagemaker$Session()

In [18]:
# Name of the algorithm used in this demo.
algorithm_name <- "rmars"

_Note: Although we could do preliminary data transformations in the notebook, we'll avoid doing so, instead choosing to do those transformations inside the container.  This is not typically the best practice for model efficiency, but provides some benefits in terms of flexibility._

In [19]:
# Python boto3 module, loaded through reticulate.
boto3_r <- import("boto3")

In [20]:
# Region of the default boto3 session and the account id reported by STS.
region <- boto3_r$Session()$region_name
account <- boto3_r$client("sts")$get_caller_identity()$Account

Now we'll create an estimator using the [SageMaker Python SDK](https://github.com/aws/sagemaker-python-sdk).  This allows us to specify:
- The training container image in ECR
- The IAM role that controls permissions for accessing the S3 data and executing SageMaker functions
- Number and type of training instances
- S3 path for model artifacts to be output to
- Any hyperparameters that we want to have the same value across all training jobs during tuning

In [21]:
library(tidyverse)

## Data

In [22]:
# Load the Airly sensor data
# (sensor ids = 7201, 7599, 7803; install ids = 41414, 41816, 42022)
# for the period 15-08-2021 to 22-11-2021 (only NO2, T and RH signals).
# Column types are guessed by readr; no explicit `col_types` is supplied.
data_file <- "data/data_airly.csv"
data_airly_all <- read_csv(data_file)
head(data_airly_all)

New names:
* `` -> ...1

[1mRows: [22m[34m342156[39m [1mColumns: [22m[34m7[39m

[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (1): status
[32mdbl[39m  (5): ...1, id, humidity, no2, temperature
[34mdttm[39m (1): date


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.



...1,id,date,status,humidity,no2,temperature
<dbl>,<dbl>,<dttm>,<chr>,<dbl>,<dbl>,<dbl>
0,42022,2021-08-15 00:00:00,RAW,71.82511,297.1198,24.94795
1,42022,2021-08-15 00:05:00,RAW,72.20479,297.1906,24.84013
2,42022,2021-08-15 00:10:00,RAW,72.43551,297.3429,24.74854
3,42022,2021-08-15 00:15:00,RAW,72.64743,297.4239,24.66797
4,42022,2021-08-15 00:20:00,RAW,72.76068,297.6219,24.64623
5,42022,2021-08-15 00:25:00,RAW,73.05018,297.7695,24.57725


In [23]:
# Load the co-located reference sensor (balcony analyser T200) data for the
# same period.
data_file <- "data/data_ref-LONDON.csv"
data_ref_all <- read_csv(data_file) %>%
    mutate(
        # Re-label the parsed clock times as Europe/London (the zone of the
        # downloaded data) without shifting the clock values.
        Time = lubridate::force_tz(Time, "Europe/London"),
        # Same instants expressed in UTC, to match the Airly `date` column.
        date = lubridate::with_tz(Time, "UTC")
    )
head(data_ref_all)

[1mRows: [22m[34m28512[39m [1mColumns: [22m[34m4[39m

[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m  (3): NO, NO2, NOX
[34mdttm[39m (1): Time


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.



Time,NO,NO2,NOX,date
<dttm>,<dbl>,<dbl>,<dbl>,<dttm>
2021-08-15 01:00:00,4.43,21.3,25.8,2021-08-15 00:00:00
2021-08-15 01:05:00,3.14,20.8,24.0,2021-08-15 00:05:00
2021-08-15 01:10:00,2.85,20.6,23.4,2021-08-15 00:10:00
2021-08-15 01:15:00,3.6,20.8,24.4,2021-08-15 00:15:00
2021-08-15 01:20:00,4.04,21.2,25.2,2021-08-15 00:20:00
2021-08-15 01:25:00,4.53,22.0,26.5,2021-08-15 00:25:00


In [24]:
# Sensor id used to filter the Airly data in the cells below.
ID <- 42022

In [25]:
# NO2 readings of the selected sensor, reshaped so that every `status` value
# becomes its own column keyed by `date`.
data_airly_id <- data_airly_all %>%
    filter(id == ID) %>%
    select(date, status, no2) %>%
    pivot_wider(names_from = status, values_from = no2)
head(data_airly_id)

date,RAW,AUX,PPB,FINAL
<dttm>,<dbl>,<dbl>,<dbl>,<dbl>
2021-08-15 00:00:00,297.1198,291.0501,26.0064,47.91067
2021-08-15 00:05:00,297.1906,290.9707,26.64045,49.09779
2021-08-15 00:10:00,297.3429,290.9539,27.30686,50.34218
2021-08-15 00:15:00,297.4239,290.9917,27.46796,50.65224
2021-08-15 00:20:00,297.6219,291.08,27.83964,51.33843
2021-08-15 00:25:00,297.7695,291.0375,28.59472,52.74053


In [26]:
# Temperature and humidity of the selected sensor, taken from the rows whose
# status is "RAW".
data_airly_TRH <- data_airly_all %>%
    filter(id == ID, status == "RAW") %>%
    select(date, temperature, humidity)
head(data_airly_TRH)

date,temperature,humidity
<dttm>,<dbl>,<dbl>
2021-08-15 00:00:00,24.94795,71.82511
2021-08-15 00:05:00,24.84013,72.20479
2021-08-15 00:10:00,24.74854,72.43551
2021-08-15 00:15:00,24.66797,72.64743
2021-08-15 00:20:00,24.64623,72.76068
2021-08-15 00:25:00,24.57725,73.05018


In [28]:
# Install openair only when it is not already available, so that re-running
# the notebook does not reinstall the package each time.
if (!requireNamespace("openair", quietly = TRUE)) {
    install.packages("openair")
}

# Attach openair without its `import()` function: attaching it in full masks
# reticulate::import(), which this notebook uses to load Python modules.
library(openair, exclude = "import")

also installing the dependencies ‘jpeg’, ‘latticeExtra’, ‘mapproj’


Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done


Attaching package: ‘openair’


The following object is masked from ‘package:reticulate’:

    import




In [29]:
# Hourly aggregation of the per-status NO2 table using the "min" statistic,
# with the hourly windows anchored at 2021-08-15 00:00:00.
data_base <- openair::timeAverage(
    data_airly_id,
    avg.time = "60 min",
    statistic = "min",
    start.date = "2021-08-15 00:00:00"
)

In [30]:
# Hourly mean temperature and humidity, with the hourly windows anchored at
# 2021-08-15 00:00:00.
data_base_TRH <- openair::timeAverage(
    data_airly_TRH,
    avg.time = "60 min",
    statistic = "mean",
    start.date = "2021-08-15 00:00:00"
)

In [31]:
# Combine the hourly RAW baseline with the hourly mean temperature.
# Rows are matched on `date` and columns are picked by name, so the result
# stays correct even if the two hourly tables differ in length, row order or
# column order (a positional cbind would silently misalign or fail).
data_base_comb <- data_base %>%
    select(date, RAW) %>%
    inner_join(select(data_base_TRH, date, temperature), by = "date") %>%
    na.exclude()  # drop hours where RAW or temperature is missing
head(data_base_comb)

Unnamed: 0_level_0,date,RAW,temperature
Unnamed: 0_level_1,<dttm>,<dbl>,<dbl>
1,2021-08-15 00:00:00,297.1198,24.60835
2,2021-08-15 01:00:00,296.3621,23.92445
3,2021-08-15 02:00:00,295.7771,23.37839
4,2021-08-15 03:00:00,295.4487,22.81762
5,2021-08-15 04:00:00,295.5031,22.99725
6,2021-08-15 05:00:00,294.5177,24.34073


In [32]:
# Training table for the baseline model: every column except the timestamp.
base_train <- select(data_base_comb, -date)

In [None]:
# Write the baseline training table to a local CSV file.
write_csv(base_train, file = "data/base_train.csv")

In [33]:
# Join the 5-minute NO2 table with the 5-minute temperature/humidity table.
# `date` is the only column the two tables share, so it is the merge key.
data_airly_idTRH <- merge(data_airly_id, data_airly_TRH, by = "date")
head(data_airly_idTRH)

Unnamed: 0_level_0,date,RAW,AUX,PPB,FINAL,temperature,humidity
Unnamed: 0_level_1,<dttm>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2021-08-15 00:00:00,297.1198,291.0501,26.0064,47.91067,24.94795,71.82511
2,2021-08-15 00:05:00,297.1906,290.9707,26.64045,49.09779,24.84013,72.20479
3,2021-08-15 00:10:00,297.3429,290.9539,27.30686,50.34218,24.74854,72.43551
4,2021-08-15 00:15:00,297.4239,290.9917,27.46796,50.65224,24.66797,72.64743
5,2021-08-15 00:20:00,297.6219,291.08,27.83964,51.33843,24.64623,72.76068
6,2021-08-15 00:25:00,297.7695,291.0375,28.59472,52.74053,24.57725,73.05018


In [34]:
# Split the data in time: everything before 2021-09-15 00:00 UTC (about the
# first month, ~30%) is used for training, the rest (~70%) for testing.
#
# The cutoff is an explicit UTC POSIXct value. Comparing a POSIXct column
# against a bare character string makes R parse the string in the session's
# local time zone, so the split point would depend on where the notebook runs.
split_cutoff <- as.POSIXct("2021-09-15 00:00:00", tz = "UTC")

data_airly_train <- data_airly_idTRH %>%
  filter(date < split_cutoff)

data_airly_test <- data_airly_idTRH %>%
  filter(date >= split_cutoff)

data_ref_train <- data_ref_all %>%
  filter(date < split_cutoff)

data_ref_test <- data_ref_all %>%
  filter(date >= split_cutoff)

In [36]:
# Preview the first rows of the Airly test split.
head(data_airly_test)

Unnamed: 0_level_0,date,RAW,AUX,PPB,FINAL,temperature,humidity
Unnamed: 0_level_1,<dttm>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,2021-09-15 00:00:00,297.0186,290.4469,30.05517,48.12218,17.34458,84.18478
2,2021-09-15 00:05:00,296.9539,290.4402,29.83899,47.843,17.30262,84.36512
3,2021-09-15 00:10:00,296.9006,290.423,29.70262,47.66598,17.26921,84.36981
4,2021-09-15 00:15:00,297.4361,290.4441,31.75936,50.37513,17.22725,84.76824
5,2021-09-15 00:20:00,297.4461,290.399,31.98491,50.67905,17.1681,85.36808
6,2021-09-15 00:25:00,297.3139,290.4098,31.43061,49.95518,17.1462,85.35519


In [37]:
# Hourly means of the training split for both the Airly sensor and the
# reference analyser, with the hourly windows anchored at 2021-08-15 00:00:00.
data_airly_train_1hr <- openair::timeAverage(
    data_airly_train,
    avg.time = "60 min",
    statistic = "mean",
    start.date = "2021-08-15 00:00:00"
)

data_ref_train_1hr <- openair::timeAverage(
    data_ref_train,
    avg.time = "60 min",
    statistic = "mean",
    start.date = "2021-08-15 00:00:00"
)

In [38]:
# Keep only the hours present in both the sensor and the reference tables.
data_train_comb <- data_airly_train_1hr %>%
    inner_join(data_ref_train_1hr, by = "date")

In [39]:
# Preview the first rows of the combined hourly training table.
head(data_train_comb)

date,RAW,AUX,PPB,FINAL,temperature,humidity,Time,NO,NO2,NOX
<dttm>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>,<dbl>,<dbl>,<dbl>
2021-08-15 00:00:00,297.4983,291.0624,27.46018,50.64405,24.60835,72.8412,2021-08-15 01:27:30,4.676667,21.55833,26.225
2021-08-15 01:00:00,296.7222,290.968,25.08844,46.37331,23.92445,74.40823,2021-08-15 02:27:30,4.335833,17.8,22.15
2021-08-15 02:00:00,296.2973,290.9715,23.58629,43.66986,23.37839,75.36103,2021-08-15 03:27:30,4.755,15.29167,20.05
2021-08-15 03:00:00,295.6056,290.8266,21.67852,40.20739,22.81762,75.84851,2021-08-15 04:27:30,3.353333,11.87417,15.225
2021-08-15 04:00:00,295.7697,291.0592,21.3027,39.47796,22.99725,75.73416,2021-08-15 05:27:30,2.855,10.91667,13.775
2021-08-15 05:00:00,295.5761,291.3231,19.16184,35.35252,24.34073,73.23513,2021-08-15 06:27:30,4.393333,13.33333,17.73333


In [41]:
# Training table for the gas model: reference NO2 plus the sensor's RAW signal
# and temperature, restricted to rows where all three are present.
gas_train <- na.exclude(select(data_train_comb, NO2, RAW, temperature))

In [42]:
# Write the gas training table to a local CSV file.
write_csv(gas_train, file = "data/gas_train.csv")

In [48]:
# Hourly means of the test split for both the Airly sensor and the reference
# analyser, with the hourly windows anchored at 2021-09-15 00:00:00.
data_airly_test_1hr <- openair::timeAverage(
    data_airly_test,
    avg.time = "60 min",
    statistic = "mean",
    start.date = "2021-09-15 00:00:00"
)

data_ref_test_1hr <- openair::timeAverage(
    data_ref_test,
    avg.time = "60 min",
    statistic = "mean",
    start.date = "2021-09-15 00:00:00"
)

In [49]:
# Keep only the hours present in both the sensor and the reference tables.
data_test_comb <- data_airly_test_1hr %>%
    inner_join(data_ref_test_1hr, by = "date")

In [50]:
# Preview the first rows of the combined hourly test table.
head(data_test_comb)

date,RAW,AUX,PPB,FINAL,temperature,humidity,Time,NO,NO2,NOX
<dttm>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dttm>,<dbl>,<dbl>,<dbl>
2021-09-15 00:00:00,297.1858,290.4173,30.91114,49.27974,17.10584,85.24315,2021-09-15 01:27:30,6.069167,20.91667,26.98333
2021-09-15 01:00:00,296.6088,290.4185,28.79548,46.56564,16.60335,86.91472,2021-09-15 02:27:30,5.256667,19.25,24.525
2021-09-15 02:00:00,295.9173,290.3378,26.48576,43.57857,16.1131,86.60823,2021-09-15 03:27:30,9.43,16.95,26.36667
2021-09-15 03:00:00,295.9334,290.265,26.94795,44.24168,15.67477,86.84233,2021-09-15 04:27:30,7.5525,16.06667,23.64167
2021-09-15 04:00:00,296.0547,290.2665,27.51941,45.03515,15.39307,87.85701,2021-09-15 05:27:30,20.241667,16.39167,36.60833
2021-09-15 05:00:00,296.4984,290.4345,28.72853,46.63902,15.37075,88.66242,2021-09-15 06:27:30,29.433333,17.25,46.7


In [53]:
# Test table: all columns of the combined hourly test data are kept, and any
# row with a missing value in any column is dropped.
gas_test <- na.exclude(data_test_comb)

In [54]:
# Write the test table to a local CSV file.
write_csv(gas_test, file = "data/gas_test.csv")

## Upload to S3

In [46]:
# Upload the baseline training CSV to the bucket, using the key prefix
# "<prefix>/<ID>/train/base".
session$upload_data(
    path = "data/base_train.csv",
    bucket = bucket,
    key_prefix = str_glue(prefix, ID, "train", "base", .sep = "/")
)

In [47]:
# Upload the gas training CSV to the bucket, using the key prefix
# "<prefix>/<ID>/train/gas".
session$upload_data(
    path = "data/gas_train.csv",
    bucket = bucket,
    key_prefix = str_glue(prefix, ID, "train", "gas", .sep = "/")
)