Possibility to manage statuses #11

Merged
44 commits merged on Dec 18, 2017

Commits (44)
b892468
Remove default for status timeout
dlanza1 Dec 16, 2017
b4928a7
Issue solved
dlanza1 Dec 16, 2017
3322bd1
Remove from doc and driver
dlanza1 Dec 16, 2017
9ca392b
UpdateStatusFunction
dlanza1 Dec 16, 2017
a632132
update func
dlanza1 Dec 16, 2017
e1407cb
Optional
dlanza1 Dec 16, 2017
043be57
improv
dlanza1 Dec 16, 2017
76f8858
time and new status
dlanza1 Dec 16, 2017
9983618
improv
dlanza1 Dec 16, 2017
8917162
try
dlanza1 Dec 16, 2017
276fe2e
nullpointexcep
dlanza1 Dec 16, 2017
e183c44
improv
dlanza1 Dec 16, 2017
ade85d9
Renaming
dlanza1 Dec 16, 2017
09fea23
Implement remove in status storages
dlanza1 Dec 16, 2017
29da0d7
Driver conf
dlanza1 Dec 16, 2017
4f36971
Catch exceptions an docs
dlanza1 Dec 16, 2017
d7a1c94
improv
dlanza1 Dec 16, 2017
c1e72d2
Implement StatusesKeyReceiver
dlanza1 Dec 16, 2017
4dd41a8
store
dlanza1 Dec 16, 2017
3e4ae33
Last value is not null
dlanza1 Dec 17, 2017
a1a4172
First implementation
dlanza1 Dec 15, 2017
5d980d9
improv
dlanza1 Dec 15, 2017
3c60b3b
corred
dlanza1 Dec 15, 2017
369d1bb
Generalize
dlanza1 Dec 16, 2017
9c452be
OFF logging
dlanza1 Dec 16, 2017
d34cb40
From conf file
dlanza1 Dec 17, 2017
3be409e
solve issue
dlanza1 Dec 17, 2017
cab27c0
travis notif
dlanza1 Dec 17, 2017
4185f53
filtering
dlanza1 Dec 17, 2017
15163ab
close
dlanza1 Dec 17, 2017
8f62df4
JSON print
dlanza1 Dec 17, 2017
beb9059
Filter by ID
dlanza1 Dec 18, 2017
9a7e7ab
filter by FQCN
dlanza1 Dec 18, 2017
e851bb5
http sink
dlanza1 Dec 18, 2017
1333e8a
seria
dlanza1 Dec 18, 2017
c70dc49
print
dlanza1 Dec 18, 2017
128fcf3
running
dlanza1 Dec 18, 2017
ca9ce25
issue removing
dlanza1 Dec 18, 2017
baa85fa
issue removing
dlanza1 Dec 18, 2017
71a527b
removal
dlanza1 Dec 18, 2017
9ba44f3
remove count
dlanza1 Dec 18, 2017
4ad9ea4
issues
dlanza1 Dec 18, 2017
4cdd671
move pair
dlanza1 Dec 18, 2017
3d3256c
q
dlanza1 Dec 18, 2017
7 changes: 6 additions & 1 deletion .travis.yml
@@ -1,4 +1,9 @@
 language: java
 
 jdk:
-  - oraclejdk8
+  - oraclejdk8
+
+notifications:
+  email:
+    on_success: change
+    on_failure: change
8 changes: 8 additions & 0 deletions doc/users-manual/define-metrics.md
@@ -158,6 +158,14 @@ Metrics can be grouped by (e.g. machine) with the "metrics.groupby" parameter in
 Group by can be set to ALL, in which case each metric will be treated independently.
 If group by is configured to ALL (or all attributes the metrics contain are listed), there are no attributes left to differentiate metrics and aggregate them, so aggregation is done over the historical values coming from each metric.
 
+## Status
+
+In order to perform the computation, previous activity needs to be stored. This is stored in a status.
+
+You may want to list the current statuses, or remove them in order to stop the generation of metrics for, for example, a specific host.
+
+For that, please read the [statuses management documentation](statuses-management.md).
+
 ## Examples
 
 Some examples of defined metrics can be:
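The examples themselves are truncated in this view. As an illustrative aside on the "metrics.groupby" parameter mentioned above: it is set per defined metric. A minimal sketch, assuming a `metrics.define.<defined-metric-id>` configuration prefix and metrics that carry a HOSTNAME attribute (both are assumptions, not taken from this diff):

```
# group the defined metric "cpu-usage" per machine (illustrative names)
metrics.define.cpu-usage.metrics.groupby = HOSTNAME

# or treat every incoming metric independently
metrics.define.cpu-usage.metrics.groupby = ALL
```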
11 changes: 10 additions & 1 deletion doc/users-manual/monitor.md
@@ -58,4 +58,13 @@ The value of any tag can be extracted from an attribute of the analyzed metric.
 
 ```
 <tag-key-1> = %<metric-key>
-```
+```
+
+## Status
+
+In order to perform the analysis, previous activity needs to be stored. This is stored in a status.
+
+You may want to list the current statuses in order to understand the analysis results, or remove the status of, for example, a specific host.
+
+For that, please read the [statuses management documentation](statuses-management.md).
+

2 changes: 2 additions & 0 deletions doc/users-manual/running.md
@@ -28,6 +28,8 @@ To run this application you can use the following command:
 
 ```
 $SPARK_HOME/bin/spark-submit \
+--repositories https://repository.cloudera.com/artifactory/cloudera-repos/ \
+--packages org.apache.spark:spark-streaming-kafka-0-10_2.11:2.1.0,org.reflections:reflections:0.9.9 \
 --class ch.cern.spark.metrics.Driver \
 target/metrics-monitor-VERSION.jar \
 <path_to_conf_file>
70 changes: 70 additions & 0 deletions doc/users-manual/statuses-management.md
@@ -0,0 +1,70 @@
# Statuses management

In order to perform the computation in some of the components, previous activity needs to be stored. This information is stored in an object that we call a "status".

You may want to list the current statuses in order to understand the results, or remove the status of, for example, a specific host.

Statuses are stored in an external system for easy management.

## Storage system

Statuses must be stored externally in a statuses store. The following sections describe the options.

### Single file statuses store

```
spark.cern.streaming.status.storage.type = single-file
spark.cern.streaming.status.storage.path = <path> (default: /tmp/metrics-monitor-statuses/)
```

### Kafka statuses store

The topic should be configured with [log compaction](https://kafka.apache.org/documentation/#compaction).

```
spark.cern.streaming.status.storage.type = kafka
spark.cern.streaming.status.storage.topic = <topic>
spark.cern.streaming.status.storage.timeout = <period like 1s, 1m, 3h> (default: 2s)
spark.cern.streaming.status.storage.serialization = <java or json> (default: json)
```
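As an illustration (not part of this diff), such a compacted topic could be created with the standard Kafka tooling of this era; the topic name, ZooKeeper address, partition count and replication factor below are assumptions:

```
kafka-topics.sh --create \
  --zookeeper localhost:2181 \
  --topic metrics-monitor-statuses \
  --partitions 1 \
  --replication-factor 1 \
  --config cleanup.policy=compact
```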

## Removing statuses

The application can be configured to listen to a TCP socket from which JSON documents will be collected.

Each JSON document should represent a status key; the statuses for these keys will be removed.

```
statuses.removal.socket = <host:port>
```
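For illustration only: one plausible way to feed this socket is to export keys with the manager CLI described below (its --save option writes JSON) and pipe the resulting documents to the socket with netcat. The file name is an assumption:

```
# keys-to-remove.json: status keys previously exported with --save
cat keys-to-remove.json | nc <host> <port>
```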

## Statuses management: list and remove

A command line interface is available to manage statuses: list keys, inspect values and remove statuses.

```
$SPARK_HOME/bin/spark-submit \
--master local \
--repositories https://repository.cloudera.com/artifactory/cloudera-repos/ \
--packages org.apache.spark:spark-streaming-kafka-0-10_2.11:2.1.0,org.reflections:reflections:0.9.9 \
--class ch.cern.spark.status.storage.manager.StatusesManagerCLI \
target/metrics-monitor-VERSION.jar <options>

usage: spark-statuses-manager
-c,--conf <arg> path to configuration file
-id,--id <arg> filter by status key id
-n,--fqcn <arg> filter by FQCN or alias
-p,--print <arg> print mode: java or json
-s,--save <arg> path to write result as JSON
```

--conf should be the path to the configuration file of the application.

For filtering statuses you can use:
* --fqcn: defined-metric-key, monitor-key or notificator-key
* --id: id of the defined metric or monitor; for notificators: monitor-id:notificator-id

For removing statuses, statuses.removal.socket must be configured in the application, and this command line tool must be run on the host and port that the property points to (see the example below).
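For example, a hypothetical invocation that lists the statuses of one monitor and prints them as JSON (the configuration path and monitor id are made up; all flags are from the usage above):

```
$SPARK_HOME/bin/spark-submit \
  --master local \
  --repositories https://repository.cloudera.com/artifactory/cloudera-repos/ \
  --packages org.apache.spark:spark-streaming-kafka-0-10_2.11:2.1.0,org.reflections:reflections:0.9.9 \
  --class ch.cern.spark.status.storage.manager.StatusesManagerCLI \
  target/metrics-monitor-VERSION.jar \
  -c conf/config.properties -n monitor-key -id my-monitor -p json
```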



15 changes: 11 additions & 4 deletions doc/users-manual/users-manual.md
@@ -25,14 +25,19 @@ The general structure of the configuration file is shown below.
 checkpoint.dir = <path_to_store_stateful_data> (default: /tmp/)
 spark.batch.time = <period like 1h, 3m or 45s> (default: 1m)
 
-# Data for metrics that are not coming will expire
-data.expiration = <period like 1h, 3m or 45s> (default: 30m)
-
 # Optional
 properties.source.type = <properties_source_type> (default: "file" with path to this configuration file)
 properties.source.expire = <period like 1h, 3m or 45s> (default: 1m)
 properties.source.<other_confs> = <value>
 
+# Optional
+# +info at components that store statuses: defined metrics, monitors and notificators
+statuses.removal.socket = <host:port>
+
+# Default statuses store
+spark.cern.streaming.status.storage.type = single-file
+spark.cern.streaming.status.storage.path = /tmp/metrics-monitor-statuses/
+
 # At least one source is mandatory
 metrics.source.<metric-source-id-1>.type = <metric_source_type>
 metrics.source.<metric-source-id-1>.<other_confs> = <value>
@@ -63,7 +68,7 @@ notifications.sink.<sink-id>.type = <notifications_sink_type>
 notifications.sink.<sink-id>.<other_confs> = <value>
 ```
 
-### Configuration of each component
+### Index
 
 * [Properties source](properties-source.md)
 * [Metrics source](metric-sources.md)
@@ -76,6 +81,8 @@ notifications.sink.<sink-id>.<other_confs> = <value>
 * [Analysis results sink](analysis-results-sink.md)
 * [Notifications sinks](notifications-sink.md)
 
+* [Statuses management](statuses-management.md)
+
 ### Example of a full configuration:
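The full example in the file is truncated in this view. As an illustrative sketch of how the pieces above combine — storing statuses in Kafka and enabling remote removal — with values that are assumptions, not taken from the PR:

```
spark.batch.time = 1m

# statuses kept in a compacted Kafka topic
spark.cern.streaming.status.storage.type = kafka
spark.cern.streaming.status.storage.topic = metrics-monitor-statuses

# allow statuses to be removed remotely
statuses.removal.socket = localhost:9091

# at least one metrics source
metrics.source.main.type = <metric_source_type>
metrics.source.main.<other_confs> = <value>
```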
2 changes: 1 addition & 1 deletion src/main/java/ch/cern/properties/Properties.java
@@ -19,7 +19,7 @@
 import ch.cern.components.Component.Type;
 import ch.cern.components.ComponentManager;
 import ch.cern.properties.source.PropertiesSource;
-import ch.cern.spark.Pair;
+import ch.cern.utils.Pair;
 import ch.cern.utils.TimeUtils;
 import scala.Tuple2;
 
67 changes: 46 additions & 21 deletions src/main/java/ch/cern/spark/PairStream.java
@@ -1,32 +1,33 @@
 package ch.cern.spark;
 
 import java.io.IOException;
+import java.util.Optional;
 
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.Optional;
-import org.apache.spark.api.java.function.Function4;
 import org.apache.spark.streaming.Duration;
-import org.apache.spark.streaming.State;
 import org.apache.spark.streaming.StateSpec;
-import org.apache.spark.streaming.Time;
+import org.apache.spark.streaming.api.java.JavaDStream;
 import org.apache.spark.streaming.api.java.JavaPairDStream;
 
 import ch.cern.components.Component.Type;
 import ch.cern.components.ComponentManager;
 import ch.cern.properties.ConfigurationException;
 import ch.cern.properties.Properties;
+import ch.cern.spark.status.ActionOrValue;
+import ch.cern.spark.status.ActionOrValue.Action;
 import ch.cern.spark.status.StatusKey;
 import ch.cern.spark.status.StatusStream;
 import ch.cern.spark.status.StatusValue;
+import ch.cern.spark.status.UpdateStatusFunction;
 import ch.cern.spark.status.storage.StatusesStorage;
+import scala.Option;
 import scala.Tuple2;
 
 public class PairStream<K, V> extends Stream<Tuple2<K, V>>{
 
-    public static final String CHECKPPOINT_DURATION_PARAM = "spark.cern.streaming.rdd.checkpoint.timeout";
-    public static final String CHECKPPOINT_DURATION_DEFAULT = java.time.Duration.ofMinutes(30).toString();
+    public static final String STATUSES_EXPIRATION_PERIOD_PARAM = "spark.cern.streaming.status.timeout";
 
     private PairStream(JavaPairDStream<K, V> stream) {
         super(stream.map(tuple -> tuple));
@@ -35,43 +36,67 @@ private PairStream(JavaPairDStream<K, V> stream) {
     public static<K, V> PairStream<K, V> from(JavaPairDStream<K, V> input) {
         return new PairStream<>(input);
     }
 
+    public static <K, V> PairStream<K, V> fromT(JavaDStream<Tuple2<K, V>> input) {
+        return new PairStream<>(input.mapToPair(p -> p));
+    }
+
     public static<K extends StatusKey, V, S extends StatusValue, R> StatusStream<K, V, S, R> mapWithState(
             Class<K> keyClass,
             Class<S> statusClass,
-            PairStream<K, V> input,
-            Function4<Time, K, Optional<V>, State<S>, Optional<R>> updateStatusFunction)
+            PairStream<K, V> valuesStream,
+            UpdateStatusFunction<K, V, S, R> updateStatusFunction,
+            Optional<Stream<K>> removeKeysStream)
             throws ClassNotFoundException, IOException, ConfigurationException {
 
-        JavaSparkContext context = input.getSparkContext();
+        JavaSparkContext context = valuesStream.getSparkContext();
 
-        java.util.Optional<StatusesStorage> storageOpt = getStorage(context);
+        Optional<StatusesStorage> storageOpt = getStorage(context);
         if(!storageOpt.isPresent())
-            throw new ConfigurationException("Storgae need to be configured");
+            throw new ConfigurationException("Storage needs to be configured");
         StatusesStorage storage = storageOpt.get();
 
         JavaRDD<Tuple2<K, S>> initialStates = storage.load(context, keyClass, statusClass);
 
-        StateSpec<K, V, S, R> statusSpec = StateSpec
-                .function(updateStatusFunction)
-                .initialState(initialStates.rdd())
-                .timeout(getDataExpirationPeriod(input.getSparkContext()));
+        StateSpec<K, ActionOrValue<V>, S, R> statusSpec = StateSpec
+                .function(updateStatusFunction)
+                .initialState(initialStates.rdd());
 
+        Option<Duration> timeout = getStatusExpirationPeriod(valuesStream.getSparkContext());
+        if(timeout.isDefined())
+            statusSpec = statusSpec.timeout(timeout.get());
+
-        StatusStream<K, V, S, R> statusStream = StatusStream.from(input.asJavaDStream()
-                .mapToPair(pair -> pair)
-                .mapWithState(statusSpec));
+        PairStream<K, ActionOrValue<V>> actionsAndValues = valuesStream.mapToPair(tuple -> new Tuple2<K, ActionOrValue<V>>(tuple._1, new ActionOrValue<>(tuple._2)));
+
+        if(removeKeysStream.isPresent()) {
+            actionsAndValues = actionsAndValues.union(
+                    removeKeysStream.get().mapToPair(k -> new Tuple2<K, ActionOrValue<V>>(k, new ActionOrValue<>(Action.REMOVE))));
+
+            removeKeysStream.get().foreachRDD(rdd -> storage.remove(rdd));
+        }
+
+        StatusStream<K, V, S, R> statusStream = StatusStream.from(actionsAndValues.asJavaDStream()
+                .mapToPair(pair -> pair)
+                .mapWithState(statusSpec));
 
         statusStream.getStatuses().foreachRDD((rdd, time) -> storage.save(rdd, time));
 
         return statusStream;
     }
 
-    private static Duration getDataExpirationPeriod(JavaSparkContext context) {
+    private PairStream<K, V> union(PairStream<K, V> other) {
+        return fromT(asJavaDStream().union(other.asJavaDStream()));
+    }
+
+    private static Option<Duration> getStatusExpirationPeriod(JavaSparkContext context) {
         SparkConf conf = context.getConf();
 
-        String valueString = conf.get(CHECKPPOINT_DURATION_PARAM, CHECKPPOINT_DURATION_DEFAULT);
+        Option<String> valueString = conf.getOption(STATUSES_EXPIRATION_PERIOD_PARAM);
 
-        return new Duration(java.time.Duration.parse(valueString).toMillis());
+        if(valueString.isDefined())
+            return Option.apply(new Duration(java.time.Duration.parse(valueString.get()).toMillis()));
+        else
+            return Option.empty();
     }
 
     public JavaPairDStream<K, V> asJavaPairDStream() {
15 changes: 10 additions & 5 deletions src/main/java/ch/cern/spark/Stream.java
@@ -2,18 +2,17 @@
 
 import java.io.IOException;
 import java.util.List;
+import java.util.Optional;
 
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.Optional;
 import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.Function2;
-import org.apache.spark.api.java.function.Function4;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
+import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.api.java.function.VoidFunction;
 import org.apache.spark.api.java.function.VoidFunction2;
-import org.apache.spark.streaming.State;
 import org.apache.spark.streaming.Time;
 import org.apache.spark.streaming.api.java.JavaDStream;
 
@@ -24,6 +23,7 @@
 import ch.cern.spark.status.StatusKey;
 import ch.cern.spark.status.StatusStream;
 import ch.cern.spark.status.StatusValue;
+import ch.cern.spark.status.UpdateStatusFunction;
 
 public class Stream<V> {
 
Expand All @@ -45,11 +45,12 @@ public<K extends StatusKey, S extends StatusValue, R> StatusStream<K, V, S, R> m
Class<K> keyClass,
Class<S> statusClass,
PairFlatMapFunction<V, K, V> toPairFunction,
Function4<Time, K, Optional<V>, State<S>, Optional<R>> updateStatusFunction) throws ClassNotFoundException, IOException, ConfigurationException {
Optional<Stream<K>> removeKeys,
UpdateStatusFunction<K, V, S, R> updateStatusFunction) throws ClassNotFoundException, IOException, ConfigurationException {

PairStream<K, V> keyValuePairs = toPair(toPairFunction);

return PairStream.mapWithState(keyClass, statusClass, keyValuePairs, updateStatusFunction);
return PairStream.mapWithState(keyClass, statusClass, keyValuePairs, updateStatusFunction, removeKeys);
}

public Stream<V> union(Stream<V> input) {
@@ -126,5 +127,9 @@ public<R> Stream<R> flatMap(FlatMapFunction<V, R> func) {
     public void cache() {
         stream = stream.cache();
     }
+
+    public<K, T> PairStream<K, T> mapToPair(PairFunction<V, K, T> func) {
+        return PairStream.from(stream.mapToPair(func));
+    }
 
 }
2 changes: 1 addition & 1 deletion src/main/java/ch/cern/spark/http/HTTPSink.java
@@ -133,7 +133,7 @@ public void sink(Stream<?> outputStream) {
                     thrownExceptions.add(thrownException);
 
                 if(!thrownExceptions.isEmpty())
-                    throw new IOException("Some batches could not be sent, details in logs. Exceptions: " + thrownExceptions);
+                    LOG.error(new IOException("Some batches could not be sent. Exceptions: " + thrownExceptions));
             });
         });
Expand Down