-
Notifications
You must be signed in to change notification settings - Fork 2
/
report.Rmd
527 lines (415 loc) · 25.9 KB
/
report.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
---
title: "Report"
author: "Francisco Bischoff"
date: "on Jan 15, 2022"
output:
workflowr::wflow_html:
number_sections: true
fig_caption: yes
toc: false
bibliography: ../papers/references.bib
link-citations: true
# Download your specific csl file and refer to it in the line below.
csl: ../thesis/csl/ama.csl
---
```{r setup, include=FALSE}
# knitr::opts_chunk$set(echo = FALSE, fig.align = "center", dev = "svg")
# Global chunk defaults: hide code, center figures, track cache
# dependencies, and restyle echoed code with {styler}.
knitr::opts_chunk$set(
  echo = FALSE, fig.align = "center", autodep = TRUE,
  fig.height = 5, fig.width = 10,
  tidy = "styler",
  tidy.opts = list(strict = TRUE)
)
# Vector graphics device: PDF when knitting to LaTeX, SVG for HTML.
if (knitr::is_latex_output()) {
  knitr::opts_chunk$set(dev = "pdf")
} else {
  knitr::opts_chunk$set(dev = "svg")
}
library(here)
library(visNetwork)
library(tibble)
library(kableExtra)
library(gridExtra)
library(targets)
library(ggplot2)
# Knit relative to docs/ so relative figure paths resolve.
knitr::opts_knit$set(root.dir = here("docs"), base.dir = here("docs"))
# Document-level environment and counter registry used by ref() below to
# number figures/tables across chunks.
.rmdenvir <- environment()
.refctr <- c(`_` = 0)
ref <- function(use_name) {
  # Sequentially numbers figures/tables referenced as "type:label" (e.g.
  # "fig:targets", "tab:alarms"), caching assignments in `.refctr` (kept in
  # `.rmdenvir`) so repeated references return the same number.
  #
  # use_name: string of the form "type:label"; numbering is per type.
  # Returns the counter value for `use_name` (named when already cached).
  #
  # Uses base R (`strsplit`/`grepl`) instead of `require(stringr)`: same
  # behavior, no runtime package attach inside a function.
  if (!exists(".refctr")) .refctr <- c(`_` = 0)
  # Already registered: return the cached number.
  if (use_name %in% names(.refctr)) {
    return(.refctr[use_name])
  }
  # New reference: next free number within its type ("fig", "tab", ...).
  type <- strsplit(use_name, ":", fixed = TRUE)[[1]][1]
  n_obj <- sum(grepl(type, names(.refctr)))
  use_num <- n_obj + 1
  newrefctr <- c(.refctr, use_num)
  names(newrefctr)[length(.refctr) + 1] <- use_name
  # Persist the updated registry in the document environment so later
  # chunks see it.
  assign(".refctr", newrefctr, envir = .rmdenvir)
  use_num
}
```
# Current Work Status
## Principles
This research is being conducted using the Research Compendium principles [@compendium2019]:
1. Stick with the convention of your peers;
2. Keep data, methods, and output separated;
3. Specify your computational environment as clearly as you can.
Data management follows the FAIR principle (findable, accessible, interoperable, reusable)
[@wilkinson2016]. Concerning these principles, the dataset was converted from Matlab's format to
CSV format, allowing more interoperability. Additionally, all the project, including the dataset, is
in conformity with the Codemeta Project [@CodeMeta2017].
## The data
The current dataset used is the CinC/Physionet Challenge 2015 public dataset, modified to include
only the actual data and the header files in order to be read by the pipeline and is hosted by
Zenodo [@bischoff2021] under the same license as Physionet.
The dataset is composed of 750 patients with at least five minutes records. All signals have been
resampled (using anti-alias filters) to 12 bit, 250 Hz and have had FIR band-pass (0.05 to 40Hz) and
mains notch filters applied to remove noise. Pacemaker and other artifacts are still present on the ECG
[@Clifford2015]. Furthermore, this dataset contains at least two ECG derivations and one or more
variables like arterial blood pressure, photoplethysmograph readings, and respiration movements.
The _event_ we seek to improve is the detection of a life-threatening arrhythmia as defined by
Physionet in Table `r ref("tab:alarms")`.
```{r alarms, echo=FALSE}
# Definitions of the five life-threatening alarm types from the
# CinC/Physionet Challenge 2015.
alarm_defs <- tribble(
  ~Alarm, ~Definition,
  "Asystole", "No QRS for at least 4 seconds",
  "Extreme Bradycardia", "Heart rate lower than 40 bpm for 5 consecutive beats",
  "Extreme Tachycardia", "Heart rate higher than 140 bpm for 17 consecutive beats",
  "Ventricular Tachycardia", "5 or more ventricular beats with heart rate higher than 100 bpm",
  "Ventricular Flutter/Fibrillation", "Fibrillatory, flutter, or oscillatory waveform for at least 4 seconds"
)
# Build the caption separately, then style the table step by step; the last
# expression is the rendered table.
alarm_caption <- paste("Table", ref("tab:alarms"), "- Definition of the five alarm types used in CinC/Physionet Challenge 2015 challenge.")
alarm_table <- kbl(alarm_defs, booktabs = TRUE, caption = alarm_caption, align = "ll")
alarm_table <- kable_styling(alarm_table, full_width = TRUE)
row_spec(alarm_table, 0, bold = TRUE)
```
The fifth minute is precisely where the alarm has been triggered on the original recording set. To
meet the ANSI/AAMI EC13 Cardiac Monitor Standards [@AAMI2002], the onset of the event is within 10
seconds of the alarm (i.e., between 4:50 and 5:00 of the record). That doesn't mean that there are
no other arrhythmias before, but those were not labeled.
## Workflow
All steps of the process are being managed using the R package `targets` [@landau2021] from data
extraction to the final report, as shown in Fig. `r ref("fig:targets")`.
```{r targets, echo=FALSE, out.width="100%", fig.cap=paste("Figure", ref("fig:targets"), "- Reproducible research workflow using `targets`.")}
# Static snapshot of the {targets} dependency graph (generated offline).
knitr::include_graphics("figure/targets.png")
```
The report is available on the main webpage [@franz_website], allowing inspection of previous
versions managed by the R package `workflowr`[@workflowr2021], as shown in Fig.
`r ref("fig:workflowr")`.
```{r workflowr, echo=FALSE, out.width="100%", fig.cap=paste("Figure", ref("fig:workflowr"), "- Reproducible reports using `workflowr`.")}
# Screenshot of the {workflowr} report page with version history.
knitr::include_graphics("figure/workflowr_print.png")
```
## Work in Progress
### Project start
The project started with a literature survey on the databases Scopus, Pubmed, Web of Science, and
Google Scholar with the following query (the syntax was adapted for each database):
TITLE-ABS-KEY ( algorithm OR 'point of care' OR 'signal processing' OR 'computer assisted' OR 'support vector machine' OR 'decision support system*' OR 'neural network*' OR 'automatic interpretation' OR 'machine learning') AND TITLE-ABS-KEY ( electrocardiography OR cardiography OR 'electrocardiographic tracing' OR ecg OR electrocardiogram OR cardiogram ) AND TITLE-ABS-KEY ( 'Intensive care unit' OR 'cardiologic care unit' OR 'intensive care center' OR 'cardiologic care center' )
\
The inclusion and exclusion criteria were defined as in Table `r ref("tab:criteria")`.
```{r criteria, echo=FALSE}
# Inclusion/exclusion criteria used for the literature review screening.
review_criteria <- tribble(
  ~"Inclusion criteria", ~"Exclusion criteria",
  "ECG automatic interpretation", "Manual interpretation",
  "ECG anomaly detection", "Publication older than ten years",
  "ECG context change detection", "Do not attempt to identify life-threatening arrhythmias, namely asystole, extreme bradycardia, extreme tachycardia, ventricular tachycardia, and ventricular flutter/fibrillation",
  "Online Stream ECG analysis", "No performance measurements reported",
  "Specific diagnosis (like a flutter, hyperkalemia, etc.)", ""
)
# Assemble and style the table stepwise; the final expression renders it.
criteria_caption <- paste("Table", ref("tab:criteria"), "- Literature review criteria.")
criteria_table <- kbl(review_criteria, booktabs = TRUE, caption = criteria_caption, align = "ll")
criteria_table <- kable_styling(criteria_table, full_width = TRUE)
criteria_table <- column_spec(criteria_table, 1, width = "10cm")
row_spec(criteria_table, 0, bold = TRUE)
```
The current stage of the review is on Data Extraction, from the resulting screening shown in Fig.
`r ref("fig:prisma")`.
```{r prisma, echo=FALSE, out.width="80%", fig.cap=paste("Figure", ref("fig:prisma"), "- Prisma results")}
# PRISMA flow diagram of the literature screening results.
knitr::include_graphics("figure/PRISMA.png")
```
Meanwhile, the project pipeline has been set up on GitHub, Inc. [@bischoffrepo2021] leveraging on
Github Actions [@gitactions2021] for the Continuous Integration lifecycle. The repository is
available at [@bischoffrepo2021], and the resulting report is available at [@franz_website] for
transparency while the roadmap and tasks are managed using the integrated Zenhub [@zenhub2021].
As it is known worldwide, since 2020, the measures taken to control the SARS-Cov2 pandemic
have had a great impact on any development lifecycle.
So far, this project has gone through two timeline changes. Fig. `r ref("fig:zenhub1")` shows
the initial roadmap (as of May 2020), Fig. `r ref("fig:zenhub2")` the modified roadmap (as of July
2021), and Fig. `r ref("fig:zenhub3")` (as of Jan 2022). The literature survey is currently in the extraction
phase, which includes about 74 articles from the full-text screening.
```{r zenhub1, echo=FALSE, out.width="100%", fig.cap=paste("Figure", ref("fig:zenhub1"), "- Roadmap original")}
# Original project roadmap (May 2020) exported from Zenhub.
knitr::include_graphics("figure/roadmap_original.png")
```
```{r zenhub2, echo=FALSE, out.width="100%", fig.cap=paste("Figure", ref("fig:zenhub2"), "- Roadmap updated on August 2021")}
# First roadmap revision (2021) exported from Zenhub.
knitr::include_graphics("figure/roadmap_updated.png")
```
```{r zenhub3, echo=FALSE, out.width="100%", fig.cap=paste("Figure", ref("fig:zenhub3"), "- Roadmap updated on January 2022")}
# Second roadmap revision (January 2022) exported from Zenhub.
knitr::include_graphics("figure/roadmap_jan2022.png")
```
### Preliminary Experimentations
### RAW Data
While programming the pipeline for the current dataset, a Single Lead Heart Rate Monitor
breakout from Sparkfun^TM^ [@sparkfun2021], using the AD8232 [@AnalogDevices2020] microchip
from Analog Devices Inc. and compatible with Arduino^(R)^ [@arduino2021], was acquired for an
in-house experiment (Figs. `r ref("fig:ad8232")` and `r ref("fig:fullsetup")`).
```{r ad8232, echo=FALSE, out.width="50%", fig.cap=paste("Figure", ref("fig:ad8232"), "- Single Lead Heart Rate Monitor")}
# Photo of the Sparkfun AD8232 single-lead heart rate monitor breakout.
knitr::include_graphics("figure/sparkfun.jpg")
```
```{r fullsetup, echo=FALSE, out.width="50%", fig.cap=paste("Figure", ref("fig:fullsetup"), "- Single Lead Heart Rate Monitor")}
# Photo of the full Arduino + AD8232 acquisition setup.
knitr::include_graphics("figure/FullSetup.jpg")
```
The output gives us a RAW signal as shown in Fig. `r ref("fig:rawsignal")`.
```{r rawsignal, echo=FALSE, out.width="50%", fig.cap=paste("Figure", ref("fig:rawsignal"), "- RAW output from Arduino at ~300hz")}
# RAW (unfiltered) ECG trace captured from the Arduino at ~300 Hz.
knitr::include_graphics("figure/arduino_plot.jpg")
```
After applying the same settings as the Physionet database (collecting the data at 500hz, resample
to 250hz, pass-filter, and notch filter), the signal is much better as shown in Fig.
`r ref("fig:filtersignal")`. Note: the leads were not placed on the correct location.
```{r filtersignal, echo=FALSE, out.width="100%", fig.cap=paste("Figure", ref("fig:filtersignal"), "- Gray is RAW, Red is filtered")}
# Same signal after Physionet-style resampling and filtering
# (gray = RAW, red = filtered).
knitr::include_graphics("figure/filtered_ecg.png")
```
In this way, we are able to import RAW data from other devices and build our own test dataset
in the future.
#### Detecting Regime Changes
The regime change approach will be using the *Arc Counts*, as explained elsewhere [@gharghabi2018].
The current implementation of the Matrix Profile in R, maintained by the first author of this
thesis, is being used to accomplish the computations. This package was published in R Journal
[@RJ-2020-021].
A new concept was needed to be implemented on the algorithm in order to emulate (in this first
iteration) the behavior of the real-time sensor: the search must only look for previous information
within a time constraint. Thus, both the Matrix Profile computation and the *Arc Counts* needed to
be adapted for this task.
At the same time, the ECG data needs to be "cleaned" for proper evaluation. That is different from
the initial filtering process. Several SQIs (Signal Quality Indexes) are used on literature
[@eerikainen2015], some trivial measures as *kurtosis*, *skewness*, median local noise level, other
more complex as pcaSQI (the ratio of the sum of the five largest eigenvalues associated with the
principal components over the sum of all eigenvalues obtained by principal component analysis
applied to the time-aligned ECG segments in the window). By experimentation (yet to be validated), a
simple formula that gives us the "complexity" of the signal and correlates well with noisy data is
shown in Equation $\eqref{noise}$.
$$
\sqrt{\sum_{i=1}^w((x_{i+1}-x_i)^2)}, \quad \text{where}\; w \; \text{is the window size} \tag{1} \label{noise}
$$
\
\
The Fig. `r ref("fig:sqi")` shows some SQIs.
```{r sqi, echo=FALSE, out.width="100%", fig.cap=paste("Figure", ref("fig:sqi"), "- Green line is the \"complexity\" of the signal")}
# Comparison of signal quality indexes; the green line is the
# "complexity" measure from Equation (1).
knitr::include_graphics("figure/noise.png")
```
Finally, a sample of the regime change detection is shown in Figs. `r ref("fig:regimefilter")` to
`r ref("fig:regimetrue")`.
Fig. `r ref("fig:regimefilter")` shows that noisy data (probably patient muscle movements) are marked
with a blue point and thus are ignored by the algorithm. Also, valid for the following plots, the
green and red lines on the data mark the 10 seconds window where the "event" that triggers the alarm
is supposed to happen.
```{r regimefilter, echo=FALSE, out.width="100%", fig.cap=paste("Figure", ref("fig:regimefilter"), "- Regime changes with noisy data - false alarm")}
# Regime change detection on noisy data (blue points = excluded noise);
# false alarm case.
knitr::include_graphics("figure/regime_filter.png")
```
In Fig. `r ref("fig:regimefalse")`, the data is clean; thus, nothing is excluded. Interestingly one of
the detected regime changes is inside the "green-red" window. But it is a false alarm.
```{r regimefalse, echo=FALSE, out.width="100%", fig.cap=paste("Figure", ref("fig:regimefalse"), "- Regime changes with good data - false alarm")}
# Regime change detection on clean data; false alarm case.
knitr::include_graphics("figure/regime_false.png")
```
The last plot (Fig. `r ref("fig:regimetrue")`) shows the algorithm's robustness, not excluding good data
with a wandering baseline, and the last regime change is correctly detected inside the "green-red"
window.
```{r regimetrue, echo=FALSE, out.width="100%", fig.cap=paste("Figure", ref("fig:regimetrue"), "- Regime changes with good but wandering data - true alarm")}
# Regime change detection on wandering-baseline data; true alarm case.
knitr::include_graphics("figure/regime_true.png")
```
### First General Assessment
By August 2021, it was clear that the chosen regime detection algorithm would be the (Fast Low-cost
Online Semantic Segmentation) FLOSS [@gharghabi2017]. Thus, the pipeline had to be adapted in order
to allow the careful replication of the streaming process, even though several computations are made
beforehand to keep the pipeline (i.e., the simulation) fast on exploring several parameters.
The choice of FLOSS is founded on the following arguments:
- **Domain Agnosticism:** the algorithm makes no assumptions about the data as opposed to most
available algorithms to date.
- **Streaming:** the algorithm can provide real-time information.
- **Real-World Data Suitability:** the objective is not to _explain_ all the data. Therefore, areas
marked as "don't know" areas are acceptable.
- **FLOSS is not:** a change point detection algorithm [@aminikhanghahi2016]. The interest here is
changes in the shapes of a sequence of measurements.
Other algorithms we can cite are based on Hidden Markov Models (HMM) that require at least two
parameters set by domain experts: cardinality and dimensionality reduction. The most attractive alternative
could be the Autoplait [@Matsubara2014], which is also domain agnostic and parameter-free. It segments
the time series using Minimum Description Length (MDL) and recursively tests if the region is best modeled
by one or two HMM. However, Autoplait is designed for batch operation, not streaming, and also requires
discrete data. FLOSS was demonstrated to be superior in several datasets in its original paper. In addition,
FLOSS is robust to several changes in data like downsampling, bit depth reduction, baseline wandering,
noise, smoothing, and even deleting 3% of the data and filling with simple interpolation. Finally,
the algorithm is light and suitable for low-power devices.
It is worth also mentioning the Time Series Snippets [@Imani2018], based on MPdist [@gharghabi2018b].
The latter measures the distance between two sequences considering how many similar sub-sequences they
share, no matter the order of matching. It proved to be a useful measure (not a metric) for meaningfully
clustering similar sequences. Time Series Snippets exploits MPdist properties to summarize a dataset that
contains representative sequences. This seems to be an alternative for detecting regime changes, but
it is not. The purpose of this algorithm is to find which pattern(s) explains most of the dataset.
Lastly, MPdist is quite expensive compared to the trivial Euclidean distance.
### Second General Assessment
By January 2022, the following tasks were performed:
1. Restructuring the roadmap (again)
2. Refining the main pipeline (again)
3. Preparing for modeling and parameter tuning
4. Feasibility trial
5. And others
#### Refining the main pipeline
That can also be thought of as "rethinking" the pipeline. Which also leads to the roadmap
restructuration. The new roadmap was already shown above in Fig. `r ref("fig:zenhub3")`.
It is essential not only to write a pipeline that can "autoplot" itself for fine-grain inspection
but also to design a high-level graph that can explain it "in a glance". This exercise was helpful
both ways: telling the story in a short version also reveals missing things and misleading paths
that are not so obvious when thinking "low-level".
Figs. `r ref("fig:regimedetection")`, `r ref("fig:shapelets")` and `r ref("fig:fullmodel")` show the overview
of the processes involved.
```{r regimedetection, echo=FALSE, out.width="100%", fig.cap=paste("Figure", ref("fig:regimedetection"), "- Pipeline for regime change detection")}
# Use the PDF version of the diagram for LaTeX output, SVG otherwise
# (mirrors the device selection in the setup chunk).
if (knitr::is_latex_output()) {
  knitr::include_graphics("figure/regime_detection.pdf")
} else {
  knitr::include_graphics("figure/regime_detection.svg")
}
```
```{r shapelets, echo=FALSE, out.width="60%", fig.cap=paste("Figure", ref("fig:shapelets"), "- Pipeline for TRUE and FALSE alarm classification")}
# Use the PDF version of the diagram for LaTeX output, SVG otherwise.
if (knitr::is_latex_output()) {
  knitr::include_graphics("figure/shapelets.pdf")
} else {
  knitr::include_graphics("figure/shapelets.svg")
}
```
```{r fullmodel, echo=FALSE, out.width="60%", fig.cap=paste("Figure", ref("fig:fullmodel"), "- Pipeline of the final process")}
# Use the PDF version of the diagram for LaTeX output, SVG otherwise.
if (knitr::is_latex_output()) {
  knitr::include_graphics("figure/fullmodel.pdf")
} else {
  knitr::include_graphics("figure/fullmodel.svg")
}
```
#### Preparing for modeling and parameter tuning
Here is one missing part that should have been addressed (formally) earlier. Although this work has
its purpose of being finally deployed on small hardware, this prospective phase will need several
hours of computing, tuning, evaluation, and validation of all findings.
Thus it was necessary to revisit the frameworks we are used to working on R: `caret` [@JSSv028i05]
and the newest `tidymodels` [@tidymodels2020] collection. For sure, there are other frameworks and
opinions [@Thompson2020]. Notwithstanding, this project will follow the `tidymodels` road. Two
significant arguments 1) constantly improving and constantly being re-checked for bugs; large
community contribution; 2) allows to plug in a custom modeling algorithm that, in this case, will be
the one needed for developing this work.
#### Feasibility trial
A side-project called "false.alarm.io" has been derived from this work (an unfortunate mix of
"false.alarm" and "PlatformIO" [@PlatformIO], the IDE chosen to interface the panoply of embedded
systems we can experiment with). The current results of this side-project are very enlightening and show
that the final algorithm can indeed be used in small hardware. Further data will be available in
the future.
#### And others
After this "step back" to look forward, it was time to define how the regime change algorithm would
integrate with the actual decision of triggering or not the alarm. Some hypotheses were thought out:
(1) clustering similar patterns, (2) anomaly detection, (3) classification, and (4) forecasting.
Among these methods, it was thought, in order to avoid exceeding processor capacity, an initial set
of shapelets [@Rakthanmanon2013] can be sufficient to rule in or out the `TRUE`/`FALSE` challenge.
Depending on the accuracy of this approach and the resources available, another method can be
introduced for both (1) improving the "negative"[^1] samples and (2) learning more shapelets to
improve the `TRUE`/`FALSE` alarm discrimination.
[^1]: The term "negative" does not imply that the patient has a "normal" ECG. It means that the
"negative" section is not a life-threatening condition that needs to trigger an alarm.
### Scientific Contributions
1. **On regime change detection:** in the original paper, the FLOSS algorithm assumes the _Arc
Counts_ follow a "uniform distribution" when we add a temporal constraint (not considering arcs
coming from older data), and the _Arc Counts_ of newly incoming data are truncated by the same
amount of temporal constraint. This completely prevents the detection of a regime change in the
last 10 seconds as required. This issue is overcome using the theoretical distributions as shown
in Fig. `r ref("fig:distributions")`.
2. **On the Matrix Profile:** since the first paper presenting this new concept [@Yeh2017a], lots of
investigations were made to speed up its computation. It is notable how all computations are not
dependent on the _rolling window size_ as previous works not using Matrix Profile. Aside from this, we
can see that the first STAMP [@Yeh2017a] algorithm has the time complexity of $O(n^2log{n})$
while STOMP [@zhu2016] $O(n^2)$ (a significant improvement), but STOMP lacks the "any-time"
property. Later SCRIMP [@zhu2018] solves this problem keeping the same time complexity of
$O(n^2)$. Here we are in the "exact" algorithms domain and will not extend the scope for
conciseness.
1. The main issue with the algorithms above is the dependency on a fast Fourier transform (FFT)
library. FFT has been extensively optimized and architecture/CPU bounded to exploit the most
of speed. Also, padding data to some power of 2 happens to increase the efficiency of the
algorithm. We can argue that time complexity doesn't mean "faster" when we can exploit
low-level instructions. In our case, using FFT in a low-power device is overkilling. For
example, a quick search over the internet gives us a hint that computing FFT on a 4096 data in
an ESP32 takes about 21ms (~47 computations in 1 second). This means ~79 seconds for computing
all FFT's (~3797) required for STAMP using a window of 300. Currently, we can compute a full
matrix of 5k data in about 9 seconds in an ESP32 MCU (Fig. `r ref("fig:esp32")`), and keep updating
it as fast as 1 min of data (at 250hz) in just 6 seconds.
2. Recent works using _exact_ algorithms are using an unpublished algorithm called **MPX**, which
computes the Matrix Profile using cross-correlation methods ending up faster and is easily
portable.
3. **The main contribution** of this work on this area is adding the **online** capability to
MPX, which means we can update the Matrix Profile as new data comes in.
3. **On extending the Matrix Profile:** an unexplored constraint that we could apply on building the
Matrix Profile we are calling _Similarity Threshold_ (ST). The original work outputs the
similarity values in Euclidean Distance (ED) values, while MPX naturally outputs the values in
Pearson's correlation coefficients (CC). Both ED and CC are interchangeable using the equation
$\eqref{edcc}$. However, we may argue that it is easier to compare values that do not depend on
the window size during an exploratory phase. MPX happens to naturally return values in CC, saving
some additional computation time. The ST is an interesting factor that we can use, especially when
detecting pattern changes during time. The FLOSS algorithm relies on counting references between
indexes in the time series. ST can help remove "noise" from these references since only similar
patterns above a certain threshold are referenced, and changes have more impact on these counts.
The best ST threshold is still to be determined.
\
$$
CC = 1 - \frac{ED}{(2 \times WindowSize)} \tag{2} \label{edcc}
$$
\
```{r dist data, message=FALSE, warning=FALSE, cache=TRUE, include=FALSE}
source(here("scripts", "common", "compute_floss.R"))
get_dist <- function(mp_const = 1250, floss_const = 0,
                     pro_size = 5000, n_iter = 500) {
  # Averages `n_iter` simulated 1D-IAC (arc count) curves for a profile of
  # `pro_size` points under the given temporal constraints.
  #
  # mp_const:    temporal constraint applied on the Matrix Profile.
  # floss_const: temporal constraint applied on the FLOSS arc counts.
  # pro_size:    profile length (was implicitly fixed at 5000).
  # n_iter:      number of simulated curves to average.
  # Returns a data.frame with columns `index` (1..pro_size) and `counts`.
  set.seed(2021) # fixed seed keeps the report reproducible
  iac <- vector("list", n_iter) # preallocate instead of growing the list
  for (i in seq_len(n_iter)) {
    iac[[i]] <- get_asym(pro_size, mp_const, floss_const)
  }
  iac_avg <- rowMeans(as.data.frame(iac))
  # Fix: the index was previously hard-coded as 1:5000; tie it to pro_size.
  data.frame(index = seq_len(pro_size), counts = iac_avg)
}
# Average arc-count distributions for four temporal constraints applied on
# the Matrix Profile computation (first argument)...
data_5000 <- get_dist(5000)
data_4250 <- get_dist(4250)
data_2500 <- get_dist(2500)
data_1250 <- get_dist(1250)
# ...and the same constraints applied on the FLOSS arc counting instead
# (second argument).
floss_data_5000 <- get_dist(0, 5000)
floss_data_4250 <- get_dist(0, 4250)
floss_data_2500 <- get_dist(0, 2500)
floss_data_1250 <- get_dist(0, 1250)
```
```{r distributions, echo=FALSE, fig.cap=paste("Figure", ref("fig:distributions"), "- 1D-IAC distributions for earlier temporal constraint (on Matrix Profile)"), message=FALSE, warning=FALSE}
# The four panels shared nearly identical ggplot code; factor the common
# parts into two small helpers (behavior and output are unchanged).
# NOTE(review): geom_line(size = ...) is kept as-is for compatibility with
# the ggplot2 version in use; newer versions prefer `linewidth`.

# Helper: base arc-count panel (line plot + title + small theme).
plot_counts <- function(df, title) {
  ggplot(df, aes(index, counts)) +
    geom_line(size = 0.1) +
    ggtitle(title) +
    theme_grey(base_size = 7)
}
# Helper: dashed vertical mark at `x_pos` with a small rotated label.
mark_at <- function(p, df, x_pos, label) {
  p +
    annotate("segment", y = 0, yend = max(df$counts), x = x_pos, xend = x_pos, linetype = 2, size = 0.1) +
    annotate("text", x = x_pos - 80, y = 40, label = label, color = "black", size = 2, angle = 90, hjust = 0)
}
# Helper: panel with "start"/"end" marks delimiting the uniform zone; the
# end mark sits at 5000 - 0.9 * constraint, as in the original plots.
constrained_panel <- function(df, title, constraint) {
  p <- plot_counts(df, title)
  p <- mark_at(p, df, constraint, "start")
  mark_at(p, df, 5000 - constraint * 0.9, "end")
}
floss_dist <- plot_counts(data_5000, "a) No constraint")
floss_4250 <- plot_counts(data_4250, "b) Constraint of 4250")
floss_2500 <- constrained_panel(data_2500, "c) Constraint of 2500", 2500)
floss_1250 <- constrained_panel(data_1250, "d) Constraint of 1250", 1250)
gg <- gridExtra::arrangeGrob(floss_dist, floss_4250, floss_2500, floss_1250,
  nrow = 2,
  bottom = grid::textGrob(paste("The plot a) shows the distribution used for the arc count correction when there is no time constraint.", "\n", "b) Shows a constraint of 3/4 of the total. c) 1/2 of the total. d) 1/4 of the total; here we see clearly the flat line.", "\n", "The dashed line marks the start and the end of the uniform zone."), just = "center", gp = grid::gpar(fontsize = 7))
)
grid::grid.draw(gg)
```
\
```{r esp32, echo=FALSE, out.width="40%", fig.cap=paste("Figure", ref("fig:esp32"), "- ESP32 MCU")}
# Photo of the ESP32 MCU used for the on-device Matrix Profile benchmark.
knitr::include_graphics("figure/esp32.jpg")
```