From 6d12b3a0b8d8200d0f71166bcd050a9088d1f286 Mon Sep 17 00:00:00 2001 From: Robert Lyons Date: Fri, 1 Oct 2021 07:50:22 -0400 Subject: [PATCH 1/3] refactor to use less implicits --- README.md | 2 +- project/build.properties | 2 +- .../epiphanous/flinkrunner/FlinkRunner.scala | 667 ++++++++++++++++- .../flinkrunner/FlinkRunnerFactory.scala | 47 +- .../algorithm/cardinality/HyperLogLog.scala | 80 ++- .../membership/StableBloomFilter.scala | 229 +++--- .../membership/StableBloomFilterBuilder.scala | 32 +- .../avro/AvroCodingException.scala | 6 +- .../avro/ConfluentSchemaRegistryClient.scala | 9 +- .../flinkrunner/flink/BaseFlinkJob.scala | 131 ++-- .../flinkrunner/flink/BroadcastFlinkJob.scala | 71 +- .../flink/FilterByControlJob.scala | 70 +- .../flinkrunner/flink/FlinkJob.scala | 24 +- .../flinkrunner/flink/IdentityJob.scala | 28 +- .../flinkrunner/model/ConfigToProps.scala | 46 +- .../flinkrunner/model/DataControlPeriod.scala | 2 +- .../flinkrunner/model/DataOrControl.scala | 14 +- .../flinkrunner/model/FlinkConfig.scala | 119 +--- .../flinkrunner/model/SinkConfig.scala | 4 +- .../flinkrunner/model/SourceConfig.scala | 37 +- .../flinkrunner/model/UnitMapper.scala | 1 + .../model/aggregate/Aggregate.scala | 316 ++++++--- .../flinkrunner/model/aggregate/Count.scala | 24 +- .../aggregate/ExponentialMovingAverage.scala | 45 +- .../ExponentialMovingStandardDeviation.scala | 54 +- .../aggregate/ExponentialMovingVariance.scala | 58 +- .../model/aggregate/Histogram.scala | 90 ++- .../flinkrunner/model/aggregate/Max.scala | 24 +- .../flinkrunner/model/aggregate/Mean.scala | 24 +- .../flinkrunner/model/aggregate/Min.scala | 24 +- .../model/aggregate/Percentage.scala | 41 +- .../flinkrunner/model/aggregate/Range.scala | 27 +- .../model/aggregate/StandardDeviation.scala | 27 +- .../flinkrunner/model/aggregate/Sum.scala | 24 +- .../aggregate/SumOfSquaredDeviations.scala | 26 +- .../model/aggregate/Variance.scala | 31 +- .../operator/AddToJdbcBatchFunction.scala | 4 +- .../operator/EnrichmentAsyncFunction.scala | 13 +- .../BoundedLatenessWatermarkStrategy.scala | 9 +- .../flinkrunner/util/JdbcSink.scala | 13 +- .../flinkrunner/util/StreamUtils.scala | 670 ------------------ .../util/BoundedLatenessGeneratorTest.scala | 6 +- 42 files changed, 1742 insertions(+), 1429 deletions(-) diff --git a/README.md b/README.md index a32f604..8970feb 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ ## Maven Dependency -`Flinkrunner 3` is [available on maven central](https://mvnrepository.com/artifact/io.epiphanous/flinkrunner_2.12), +`Flinkrunner 4` is [available on maven central](https://mvnrepository.com/artifact/io.epiphanous/flinkrunner_2.12), built against Flink 1.13 with Scala 2.12 and JDK 11. 
```sbtshell diff --git a/project/build.properties b/project/build.properties index dbae93b..10fd9ee 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=1.4.9 +sbt.version=1.5.5 diff --git a/src/main/scala/io/epiphanous/flinkrunner/FlinkRunner.scala b/src/main/scala/io/epiphanous/flinkrunner/FlinkRunner.scala index 010d804..7d384a3 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/FlinkRunner.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/FlinkRunner.scala @@ -1,7 +1,61 @@ package io.epiphanous.flinkrunner import com.typesafe.scalalogging.LazyLogging -import io.epiphanous.flinkrunner.model.{FlinkConfig, FlinkEvent} +import io.epiphanous.flinkrunner.model._ +import io.epiphanous.flinkrunner.operator.AddToJdbcBatchFunction +import io.epiphanous.flinkrunner.util.{ + BoundedLatenessWatermarkStrategy, + JdbcSink +} +import org.apache.flink.api.common.eventtime.WatermarkStrategy +import org.apache.flink.api.common.functions.RuntimeContext +import org.apache.flink.api.common.serialization.{ + DeserializationSchema, + Encoder, + SerializationSchema +} +import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.core.fs.Path +import org.apache.flink.streaming.api.datastream.DataStreamSink +import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.{ + BasePathBucketAssigner, + DateTimeBucketAssigner +} +import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.{ + DefaultRollingPolicy, + OnCheckpointRollingPolicy +} +import org.apache.flink.streaming.api.functions.sink.filesystem.{ + BucketAssigner, + StreamingFileSink +} +import org.apache.flink.streaming.api.scala.{DataStream, _} +import org.apache.flink.streaming.connectors.cassandra.CassandraSink +import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer +import org.apache.flink.streaming.connectors.elasticsearch7.ElasticsearchSink +import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.Semantic +import org.apache.flink.streaming.connectors.kafka.{ + FlinkKafkaConsumer, + FlinkKafkaProducer, + KafkaDeserializationSchema, + KafkaSerializationSchema +} +import org.apache.flink.streaming.connectors.kinesis.serialization.{ + KinesisDeserializationSchema, + KinesisSerializationSchema +} +import org.apache.flink.streaming.connectors.kinesis.{ + FlinkKinesisConsumer, + FlinkKinesisProducer +} +import org.apache.http.HttpHost +import org.elasticsearch.client.Requests + +import java.io.{File, FileNotFoundException} +import java.net.URL +import java.nio.charset.StandardCharsets +import scala.collection.JavaConverters._ +import scala.util.matching.Regex /** * Flink Job Invoker @@ -13,9 +67,9 @@ class FlinkRunner[ADT <: FlinkEvent]( optConfig: Option[String] = None) extends LazyLogging { - implicit val config: FlinkConfig = - new FlinkConfig(args, factory, sources, optConfig) - implicit val env: SEE = config.configureStreamExecutionEnvironment + val config: FlinkConfig[ADT] = + factory.getFlinkConfig(args, sources, optConfig) + val env: SEE = config.configureStreamExecutionEnvironment /** * An intermediate method to process main args, with optional callback to @@ -25,7 +79,7 @@ class FlinkRunner[ADT <: FlinkEvent]( * a function from an iterator to unit */ def process( - callback: PartialFunction[Stream[ADT], Unit] = { case _ => + callback: PartialFunction[List[_], Unit] = { case _ => () } ): Unit = @@ -44,7 +98,7 @@ class FlinkRunner[ADT <: FlinkEvent]( * flink job */ def process1( - callback: 
PartialFunction[Stream[ADT], Unit] = { case _ => + callback: PartialFunction[List[_], Unit] = { case _ => () } ): Unit = { @@ -53,8 +107,8 @@ class FlinkRunner[ADT <: FlinkEvent]( .exists(s => List("help", "--help", "-help", "-h").contains(s)) ) showJobHelp() else { - factory.getJobInstance(config.jobName, config).run match { - case Left(results) => callback(results.toStream) + factory.getJobInstance(config.jobName, config).run() match { + case Left(results) => callback(results) case Right(_) => () } } @@ -103,4 +157,601 @@ class FlinkRunner[ADT <: FlinkEvent]( println(usage) } + val RESOURCE_PATTERN: Regex = "resource://(.*)".r + + /** + * Generates a timestamp and watermark assigner for a stream with a given + * type of element that limits how late an element is allowed to arrive + * in event time. + * + * @tparam E + * the type of stream element + * @return + * BoundedLatenessGenerator[E] + */ + def boundedLatenessWatermarks[E <: ADT: TypeInformation]( + streamID: String + ) = + new BoundedLatenessWatermarkStrategy[E]( + config.maxLateness, + streamID + ) + + /** + * Create a bounded of order watermark strategy with idleness checking + * + * @tparam E + * the type of stream element + * @return + * BoundedLatenessGenerator[E] + */ + def boundedOutofOrdernessWatermarks[E <: ADT: TypeInformation]() + : WatermarkStrategy[E] = + WatermarkStrategy + .forBoundedOutOfOrderness(config.maxLateness) + .withIdleness(config.maxIdleness) + + /** + * Creates an ascending timestamp watermark strategy. + * @tparam E + * type of stream element + * @return + * AscendingTimestampExtractor[E] + */ + def ascendingTimestampsWatermarks[E <: ADT: TypeInformation]() + : WatermarkStrategy[E] = WatermarkStrategy.forMonotonousTimestamps() + + /** + * Assign timestamps/watermarks if we're using event time + * @param in + * the input stream to watermark + * @param env + * implicit stream execution environment + * @tparam E + * event type + * @return + * the possibly watermarked input stream + */ + def maybeAssignTimestampsAndWatermarks[E <: ADT: TypeInformation]( + in: DataStream[E], + srcConfig: SourceConfig + ): DataStream[E] = + in.assignTimestampsAndWatermarks(srcConfig.watermarkStrategy match { + case "bounded out of orderness" => + boundedOutofOrdernessWatermarks() + case "ascending timestamps" => ascendingTimestampsWatermarks() + case _ => boundedLatenessWatermarks(in.name) + }).name(s"wm:${in.name}") + .uid(s"wm:${in.name}") + + /** + * Configure stream source from configuration. + * + * @param sourceName + * the name of the source to get its configuration + * @tparam E + * stream element type + * @return + * DataStream[E] + */ + def fromSource[E <: ADT: TypeInformation]( + sourceName: String = "" + ): DataStream[E] = { + val name = + if (sourceName.isEmpty) config.getSourceNames.head else sourceName + val src = config.getSourceConfig(name) + val uid = src.label + val stream = (src match { + case src: KafkaSourceConfig => fromKafka(src) + case src: KinesisSourceConfig => fromKinesis(src) + case src: FileSourceConfig => fromFile(src) + case src: SocketSourceConfig => fromSocket(src) + case src: CollectionSourceConfig => fromCollection(src) + }).name(uid).uid(uid) + maybeAssignTimestampsAndWatermarks(stream, src) + } + + /** + * Configure stream from kafka source. 
+ * + * @param srcConfig + * a source config + * @tparam E + * stream element type + * @return + * DataStream[E] + */ + def fromKafka[E <: ADT: TypeInformation]( + srcConfig: KafkaSourceConfig + ): DataStream[E] = { + val consumer = + new FlinkKafkaConsumer[E]( + srcConfig.topic, + config + .getKafkaDeserializationSchema[E](srcConfig.name) + .asInstanceOf[KafkaDeserializationSchema[E]], + srcConfig.properties + ) + env + .addSource(consumer) + } + + /** + * Configure stream from kinesis. + * + * @param srcConfig + * a source config + * @tparam E + * stream element type + * @return + * DataStream[E] + */ + def fromKinesis[E <: ADT: TypeInformation]( + srcConfig: KinesisSourceConfig + ): DataStream[E] = { + val consumer = + new FlinkKinesisConsumer[E]( + srcConfig.stream, + config + .getKinesisDeserializationSchema(srcConfig.name) + .asInstanceOf[KinesisDeserializationSchema[E]], + srcConfig.properties + ) + env + .addSource(consumer) + .name(srcConfig.label) + } + + /** + * Configure stream from file source. + * + * @param srcConfig + * a source config + * @tparam E + * stream element type + * @return + * DataStream[E] + */ + def fromFile[E <: ADT: TypeInformation]( + srcConfig: FileSourceConfig + ): DataStream[E] = { + val path = srcConfig.path match { + case RESOURCE_PATTERN(p) => getSourceFilePath(p) + case other => other + } + val ds = config + .getDeserializationSchema(srcConfig.name) + .asInstanceOf[DeserializationSchema[E]] + env + .readTextFile(path) + .name(s"raw:${srcConfig.label}") + .uid(s"raw:${srcConfig.label}") + .map(line => ds.deserialize(line.getBytes(StandardCharsets.UTF_8))) + } + + /** + * Configure stream from socket source. + * + * @param srcConfig + * a source config + * @tparam E + * stream element type + * @return + * DataStream[E] + */ + def fromSocket[E <: ADT: TypeInformation]( + srcConfig: SocketSourceConfig + ): DataStream[E] = + env + .socketTextStream(srcConfig.host, srcConfig.port) + .name(s"raw:${srcConfig.label}") + .uid(s"raw:${srcConfig.label}") + .map(line => + config + .getDeserializationSchema(srcConfig.name) + .asInstanceOf[DeserializationSchema[E]] + .deserialize(line.getBytes(StandardCharsets.UTF_8)) + ) + + /** + * Configure stream from collection source. + * + * @param srcConfig + * a source config + * @tparam E + * stream element type + * @return + * DataStream[E] + */ + def fromCollection[E <: ADT: TypeInformation]( + srcConfig: CollectionSourceConfig + ): DataStream[E] = + env + .fromCollection[Array[Byte]]( + config.getCollectionSource(srcConfig.topic) + ) + .name(s"raw:${srcConfig.label}") + .uid(s"raw:${srcConfig.label}") + .map(bytes => + config + .getDeserializationSchema(srcConfig.name) + .asInstanceOf[DeserializationSchema[E]] + .deserialize(bytes) + ) + + /** + * Returns the actual path to a resource file named filename or + * filename.gz. 
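As a usage illustration for the source helpers above, here is a minimal sketch of obtaining a configured source through the runner rather than through implicits. The `MyADT`/`MyEvent` types and the `orders` source name are hypothetical, not part of this patch:

```scala
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.DataStream
import io.epiphanous.flinkrunner.FlinkRunner

object SourceSketch {
  // sources are looked up on the runner by name instead of via implicit config/env
  def readOrders(runner: FlinkRunner[MyADT]): DataStream[MyEvent] =
    runner.fromSource[MyEvent]("orders")
}
```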
+ * + * @param filename + * the name of file + * @return + * String + */ + @throws[FileNotFoundException] + def getSourceFilePath(filename: String): String = { + val loader = getClass + val resource = Option(loader.getResource(filename)) match { + case Some(value) => value.toURI + case None => + Option(loader.getResource(s"$filename.gz")) match { + case Some(value) => value.toURI + case None => + throw new FileNotFoundException( + s"can't load resource $filename" + ) + } + } + val file = new File(resource) + file.getAbsolutePath + } + + val runner = this + + implicit class EventStreamOps[E <: ADT: TypeInformation]( + stream: DataStream[E]) { + + def as[T <: ADT: TypeInformation]: DataStream[T] = { + val name = stream.name + stream + .filter((e: E) => e.isInstanceOf[T @unchecked]) + .name(s"filter types $name") + .uid(s"filter types $name") + .map((e: E) => e.asInstanceOf[T @unchecked]) + .name(s"cast types $name") + .uid(s"cast types $name") + } + + def toSink(sinkName: String = "") = + runner.toSink[E](stream, sinkName) + + } + + /** + * Configure stream sink from configuration. + * + * @param stream + * the data stream to send to sink + * @param sinkName + * a sink name to obtain configuration + * @param config + * implicit flink job args + * @tparam E + * stream element type + * @return + * DataStream[E] + */ + def toSink[E <: ADT: TypeInformation]( + stream: DataStream[E], + sinkName: String = "" + ) = { + val name = if (sinkName.isEmpty) config.getSinkNames.head else sinkName + config.getSinkConfig(name) match { + case s: KafkaSinkConfig => toKafka[E](stream, s) + case s: KinesisSinkConfig => toKinesis[E](stream, s) + case s: FileSinkConfig => toFile[E](stream, s) + case s: SocketSinkConfig => toSocket[E](stream, s) + case s: JdbcSinkConfig => toJdbc[E](stream, s) + case s: CassandraSinkConfig => toCassandraSink[E](stream, s) + case s: ElasticsearchSinkConfig => toElasticsearchSink[E](stream, s) + case s => + throw new IllegalArgumentException( + s"unsupported source connector: ${s.connector}" + ) + } + } + + /** + * Send stream to a kafka sink. + * + * @param stream + * the data stream + * @param sinkConfig + * a sink configuration + * @param config + * implicit job args + * @tparam E + * stream element type + * @return + * DataStreamSink[E] + */ + def toKafka[E <: ADT: TypeInformation]( + stream: DataStream[E], + sinkConfig: KafkaSinkConfig + ): DataStreamSink[E] = + stream + .addSink( + new FlinkKafkaProducer[E]( + sinkConfig.topic, + config + .getKafkaSerializationSchema(sinkConfig.name) + .asInstanceOf[KafkaSerializationSchema[E]], + sinkConfig.properties, + Semantic.AT_LEAST_ONCE + ) + ) + .uid(sinkConfig.label) + .name(sinkConfig.label) + + /** + * Send stream to a kinesis sink. + * + * @param stream + * the data stream + * @param sinkConfig + * a sink configuration + * @param config + * implicit job args + * @tparam E + * stream element type + * @return + * DataStreamSink[E] + */ + def toKinesis[E <: ADT: TypeInformation]( + stream: DataStream[E], + sinkConfig: KinesisSinkConfig + ): DataStreamSink[E] = + stream + .addSink { + val sink = + new FlinkKinesisProducer[E]( + config + .getKinesisSerializationSchema(sinkConfig.name) + .asInstanceOf[KinesisSerializationSchema[E]], + sinkConfig.properties + ) + sink.setDefaultStream(sinkConfig.stream) + sink.setFailOnError(true) + sink.setDefaultPartition("0") + sink + } + .uid(sinkConfig.label) + .name(sinkConfig.label) + + /** + * Send stream to a socket sink. 
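As a counterpart illustration on the output side, a sketch of routing a stream to a named sink, either through `runner.toSink` or the `EventStreamOps` syntax defined above; the `MyADT`/`MyEvent` types and the `orders-out` sink name are again hypothetical:

```scala
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.DataStream
import io.epiphanous.flinkrunner.FlinkRunner

object SinkSketch {
  // route a stream to a sink named in configuration, without implicit config/env
  def writeOrders(runner: FlinkRunner[MyADT], out: DataStream[MyEvent]): Unit = {
    // the EventStreamOps syntax is imported from the runner instance itself
    import runner.EventStreamOps
    out.toSink("orders-out") // equivalent to runner.toSink[MyEvent](out, "orders-out")
    ()
  }
}
```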
+ * + * @param stream + * the data stream + * @param sinkConfig + * a sink configuration + * @param config + * implicit job args + * @tparam E + * stream element type + * @return + * DataStreamSink[E] + */ + def toJdbc[E <: ADT: TypeInformation]( + stream: DataStream[E], + sinkConfig: JdbcSinkConfig + ): DataStreamSink[E] = + stream + .addSink( + new JdbcSink[E]( + sinkConfig, + config + .getAddToJdbcBatchFunction(sinkConfig.name) + .asInstanceOf[AddToJdbcBatchFunction[E]] + ) + ) + .uid(sinkConfig.label) + .name(sinkConfig.label) + + /** + * Send stream to a rolling file sink. + * + * @param stream + * the data stream + * @param sinkConfig + * a sink configuration + * @param config + * implicit job args + * @tparam E + * stream element type + * @return + * DataStreamSink[E] + */ + def toFile[E <: ADT: TypeInformation]( + stream: DataStream[E], + sinkConfig: FileSinkConfig + ): DataStreamSink[E] = { + val path = sinkConfig.path + val p = sinkConfig.properties + val bucketCheckInterval = + p.getProperty("bucket.check.interval", s"${60000}").toLong + val bucketAssigner = + p.getProperty("bucket.assigner.type", "datetime") match { + case "none" => new BasePathBucketAssigner[E]() + case "datetime" => + new DateTimeBucketAssigner[E]( + p.getProperty( + "bucket.assigner.datetime.format", + "YYYY/MM/DD/HH" + ) + ) + case "custom" => + config + .getBucketAssigner(sinkConfig.name) + .asInstanceOf[BucketAssigner[E, String]] + case other => + throw new IllegalArgumentException( + s"Unknown bucket assigner type '$other'." + ) + } + val encoderFormat = p.getProperty("encoder.format", "row") + val sink = encoderFormat match { + case "row" => + val builder = + StreamingFileSink.forRowFormat( + new Path(path), + config.getEncoder(sinkConfig.name).asInstanceOf[Encoder[E]] + ) + val rollingPolicy = + p.getProperty("bucket.rolling.policy", "default") match { + case "default" => + DefaultRollingPolicy + .builder() + .withInactivityInterval( + p.getProperty( + "bucket.rolling.policy.inactivity.interval", + s"${60000}" + ).toLong + ) + .withMaxPartSize( + p.getProperty( + "bucket.rolling.policy.max.part.size", + s"${128 * 1024 * 1024}" + ).toLong + ) + .withRolloverInterval( + p.getProperty( + "bucket.rolling.policy.rollover.interval", + s"${Long.MaxValue}" + ).toLong + ) + .build[E, String]() + case "checkpoint" => + OnCheckpointRollingPolicy.build[E, String]() + case policy => + throw new IllegalArgumentException( + s"Unknown bucket rolling policy type: '$policy'" + ) + } + builder + .withBucketAssigner(bucketAssigner) + .withRollingPolicy(rollingPolicy) + .withBucketCheckInterval(bucketCheckInterval) + .build() + case "bulk" => + throw new NotImplementedError("Bulk file sink not implemented yet") + + case _ => + throw new IllegalArgumentException( + s"Unknown file sink encoder format: '$encoderFormat'" + ) + } + stream.addSink(sink).uid(sinkConfig.label).name(sinkConfig.label) + } + + /** + * Send stream to a socket sink. + * + * @param stream + * the data stream + * @param sinkConfig + * a sink configuration + * @param config + * implicit job args + * @tparam E + * stream element type + * @return + * DataStreamSink[E] + */ + def toSocket[E <: ADT: TypeInformation]( + stream: DataStream[E], + sinkConfig: SocketSinkConfig + ): DataStreamSink[E] = + stream + .writeToSocket( + sinkConfig.host, + sinkConfig.port, + config + .getSerializationSchema(sinkConfig.name) + .asInstanceOf[SerializationSchema[E]] + ) + .uid(sinkConfig.label) + .name(sinkConfig.label) + + /** + * Send stream to a cassandra sink. 
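For reference, a sketch of the property keys the rolling file sink (`toFile`) above consults; the values shown are illustrative only, and in practice they come from the sink's configuration rather than hand-built `Properties`:

```scala
import java.util.Properties

object FileSinkPropsSketch {
  val props = new Properties()
  props.setProperty("bucket.check.interval", "30000")                  // default 60000 ms
  props.setProperty("bucket.assigner.type", "datetime")                // none | datetime | custom
  props.setProperty("bucket.assigner.datetime.format", "YYYY/MM/DD/HH")
  props.setProperty("encoder.format", "row")                           // "bulk" not implemented yet
  props.setProperty("bucket.rolling.policy", "default")                // default | checkpoint
  props.setProperty("bucket.rolling.policy.inactivity.interval", "60000")
  props.setProperty("bucket.rolling.policy.max.part.size", s"${128 * 1024 * 1024}")
  props.setProperty("bucket.rolling.policy.rollover.interval", s"${Long.MaxValue}")
}
```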
+ * + * @param stream + * the data stream + * @param sinkConfig + * a sink configuration + * @tparam E + * stream element type + * @return + * DataStreamSink[E] + */ + def toCassandraSink[E <: ADT: TypeInformation]( + stream: DataStream[E], + sinkConfig: CassandraSinkConfig) = + CassandraSink + .addSink(stream) + .setHost(sinkConfig.host) + .setQuery(sinkConfig.query) + .build() + .uid(sinkConfig.label) + .name(sinkConfig.label) + + /** + * Send stream to an elasticsearch sink. + * + * @param stream + * the data stream + * @param sinkConfig + * a sink configuration + * @tparam E + * stream element type + * @return + * DataStreamSink[E] + */ + def toElasticsearchSink[E <: ADT: TypeInformation]( + stream: DataStream[E], + sinkConfig: ElasticsearchSinkConfig + ): DataStreamSink[E] = { + val hosts = sinkConfig.transports.map { s => + val url = new URL(if (s.startsWith("http")) s else s"http://$s") + val hostname = url.getHost + val port = if (url.getPort < 0) 9200 else url.getPort + new HttpHost(hostname, port, url.getProtocol) + }.asJava + val esSink = new ElasticsearchSink.Builder[E]( + hosts, + (element: E, _: RuntimeContext, indexer: RequestIndexer) => { + val data = element.getClass.getDeclaredFields + .filterNot(f => + Seq("$id", "$key", "$timestamp", "$action").contains( + f.getName + ) + ) + .foldLeft(Map.empty[String, Any]) { case (a, f) => + f.setAccessible(true) + val name = f.getName + f.get(element) match { + case Some(v: Any) => a + (name -> v) + case None => a + case v: Any => a + (name -> v) + } + } + .asJava + val req = Requests.indexRequest(sinkConfig.index).source(data) + indexer.add(req) + } + ).build() + stream.addSink(esSink).uid(sinkConfig.label).name(sinkConfig.label) + } + } diff --git a/src/main/scala/io/epiphanous/flinkrunner/FlinkRunnerFactory.scala b/src/main/scala/io/epiphanous/flinkrunner/FlinkRunnerFactory.scala index 42abd7c..3814d1d 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/FlinkRunnerFactory.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/FlinkRunnerFactory.scala @@ -21,45 +21,54 @@ import org.apache.flink.streaming.connectors.kinesis.serialization.{ trait FlinkRunnerFactory[ADT <: FlinkEvent] { + def getFlinkConfig( + args: Array[String], + sources: Map[String, Seq[Array[Byte]]] = Map.empty, + optConfig: Option[String] = None) = + new FlinkConfig[ADT](args, this, sources, optConfig) + def getJobInstance( name: String, - config: FlinkConfig): BaseFlinkJob[_, _ <: ADT] + config: FlinkConfig[ADT]): BaseFlinkJob[_, _, ADT] - def getDeserializationSchema( + def getDeserializationSchema[E <: ADT]( name: String, - config: FlinkConfig): DeserializationSchema[ADT] = ??? + config: FlinkConfig[ADT]): DeserializationSchema[E] = ??? - def getKafkaDeserializationSchema( + def getKafkaDeserializationSchema[E <: ADT]( name: String, - config: FlinkConfig): KafkaDeserializationSchema[ADT] = + config: FlinkConfig[ADT]): KafkaDeserializationSchema[E] = ??? - def getKinesisDeserializationSchema( + def getKinesisDeserializationSchema[E <: ADT]( name: String, - config: FlinkConfig): KinesisDeserializationSchema[ADT] = ??? + config: FlinkConfig[ADT]): KinesisDeserializationSchema[E] = ??? - def getSerializationSchema( + def getSerializationSchema[E <: ADT]( name: String, - config: FlinkConfig): SerializationSchema[ADT] = ??? + config: FlinkConfig[ADT]): SerializationSchema[E] = ??? - def getKafkaSerializationSchema( + def getKafkaSerializationSchema[E <: ADT]( name: String, - config: FlinkConfig): KafkaSerializationSchema[ADT] = ??? 
+ config: FlinkConfig[ADT]): KafkaSerializationSchema[E] = ??? - def getKinesisSerializationSchema( + def getKinesisSerializationSchema[E <: ADT]( name: String, - config: FlinkConfig): KinesisSerializationSchema[ADT] = ??? + config: FlinkConfig[ADT]): KinesisSerializationSchema[E] = ??? - def getEncoder(name: String, config: FlinkConfig): Encoder[ADT] = ??? + def getEncoder[E <: ADT]( + name: String, + config: FlinkConfig[ADT]): Encoder[E] = ??? - def getAddToJdbcBatchFunction( + def getAddToJdbcBatchFunction[E <: ADT]( name: String, - config: FlinkConfig): AddToJdbcBatchFunction[ADT] = ??? + config: FlinkConfig[ADT]): AddToJdbcBatchFunction[E] = ??? - def getBucketAssigner( + def getBucketAssigner[E <: ADT]( name: String, - config: FlinkConfig): BucketAssigner[ADT, String] = + config: FlinkConfig[ADT]): BucketAssigner[E, String] = ??? - def getAvroCoder(name: String, config: FlinkConfig): AvroCoder[_] = ??? + def getAvroCoder(name: String, config: FlinkConfig[ADT]): AvroCoder[_] = + ??? } diff --git a/src/main/scala/io/epiphanous/flinkrunner/algorithm/cardinality/HyperLogLog.scala b/src/main/scala/io/epiphanous/flinkrunner/algorithm/cardinality/HyperLogLog.scala index 4e0c221..fdbded2 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/algorithm/cardinality/HyperLogLog.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/algorithm/cardinality/HyperLogLog.scala @@ -4,12 +4,12 @@ import com.google.common.hash.Funnel import com.google.common.hash.Hashing.murmur3_128 /** - * Implements hyperloglog cardinality estimate based on paper by - * P. Flajolet, È. Fusy, O. Gandouet, F. Meiunier. - * HyperLogLog: the analysis of a near-optimal - * cardinality estimation algorithm. Proceedings of Discrete Mathematics and Theoretical Computer Science. - * Pages 127-146. 2007. - */ + * Implements hyperloglog cardinality estimate based on paper by P. + * Flajolet, È. Fusy, O. Gandouet, F. Meiunier. HyperLogLog: the analysis + * of a near-optimal cardinality estimation algorithm. Proceedings of + * Discrete Mathematics and Theoretical Computer Science. Pages 127-146. + * 2007. + */ case class HyperLogLog[T](funnel: Funnel[T], b: Int) { require(b >= 4 && b <= 16, "b must be an integer in [4,16]") @@ -47,11 +47,14 @@ case class HyperLogLog[T](funnel: Funnel[T], b: Int) { def nonEmpty = cardinality > 0 /** - * Incorporates an item into the registers, updates the cardinality estimate and returns it. - * - * @param item the item to add - * @return Long - */ + * Incorporates an item into the registers, updates the cardinality + * estimate and returns it. + * + * @param item + * the item to add + * @return + * Long + */ def add(item: T) = { val x = hash(item) val j = 1 + (x & (m - 1)) @@ -61,12 +64,13 @@ case class HyperLogLog[T](funnel: Funnel[T], b: Int) { } /** - * Compute the current distinct cardinality estimate. - * - * @return Long - */ + * Compute the current distinct cardinality estimate. + * + * @return + * Long + */ private def estimateCardinality: Long = { - val E = am2 / M.map(i => 1 / math.pow(2d, i.toDouble)).sum + val E = am2 / M.map(i => 1 / math.pow(2d, i.toDouble)).sum // small range correction val Estar = if (E <= smallRange) { val V = M.count(_ == 0) @@ -83,37 +87,45 @@ case class HyperLogLog[T](funnel: Funnel[T], b: Int) { } /** - * Merge another HyperLogLog[T] instance into this instance. Note the other instance must have the same b - * parameter as this instance. - * - * @param another the other HyperLogLog[T] instance - */ + * Merge another HyperLogLog[T] instance into this instance. 
Note the + * other instance must have the same b parameter as this instance. + * + * @param another + * the other HyperLogLog[T] instance + */ def merge(another: HyperLogLog[T]) = { if (another.nonEmpty) { require(another.m == m, s"Can only merge HLL with same b=$b") - another.M.zipWithIndex.foreach { case (other, i) => if (M(i) < other) M(i) = other } + another.M.zipWithIndex.foreach { case (other, i) => + if (M(i) < other) M(i) = other + } estimateCardinality } this } /** - * Computes positive integer hash of item - * - * @param item item to hash - * @return Int - */ + * Computes positive integer hash of item + * + * @param item + * item to hash + * @return + * Int + */ private def hash(item: T): Int = { val h = hasher.hashObject(item, funnel).asInt() if (h < 0) ~h else h } /** - * Computes most significant set bit of an integer, where returned bit in [0,32]. - * - * @param i the non-negative Int to examine - * @return Int - */ + * Computes most significant set bit of an integer, where returned bit in + * [0,32]. + * + * @param i + * the non-negative Int to examine + * @return + * Int + */ private def rho(i: Int): Int = { require(i >= 0, "i must be non-negative integer") (32 - HyperLogLog.MASKS.lastIndexWhere(_ <= i)) % 33 @@ -121,8 +133,8 @@ case class HyperLogLog[T](funnel: Funnel[T], b: Int) { } object HyperLogLog { - val MASKS = Range(1, 32).map(i => 1 << (i - 1)) + val MASKS = Range(1, 32).map(i => 1 << (i - 1)) val ALPHA_M = 1 / (2 * math.log(2)) - val TWO32 = math.pow(2, 32) + val TWO32 = math.pow(2, 32) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/algorithm/membership/StableBloomFilter.scala b/src/main/scala/io/epiphanous/flinkrunner/algorithm/membership/StableBloomFilter.scala index bcc2f39..976db49 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/algorithm/membership/StableBloomFilter.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/algorithm/membership/StableBloomFilter.scala @@ -7,26 +7,38 @@ import java.nio.ByteBuffer import scala.util.Random /** - * Implements the stable bloom filter from the paper by - * F. Deng and D. Rafiei. Approximately detecting - * duplicates for streaming data using stable bloom - * filters. In SIGMOD, pages 25–36, 2006. - * - * We use heap storage (an array of Longs). - * This implies M=m*d can be set as high as about 125 giga-bits. - * - * @param funnel a Guava funnel for taking input - * @param m number of cells (see the paper, m is a Long but m/floor(63/d) - * must fit in a 32-bit Int) - * @param d bits per cell (see the paper, should lie in [1,63] but often set to 1, 2 or 3) - * @param FPR expected false positive rate (should lie in (0,1)) - * @tparam T the type of funnel used - */ -case class StableBloomFilter[T](funnel: Funnel[T], m: Long, d: Int, FPR: Double) { + * Implements the stable bloom filter from the paper by F. Deng and D. + * Rafiei. Approximately detecting duplicates for streaming data + * using stable bloom filters. In SIGMOD, pages 25–36, 2006. + * + * We use heap storage (an array of Longs). This implies M=m*d + * can be set as high as about 125 giga-bits. 
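Stepping back to `HyperLogLog` above, a brief usage sketch (the ids are made up); per its documentation, `add` incorporates an item and returns the updated cardinality estimate:

```scala
import com.google.common.hash.Funnels
import java.nio.charset.StandardCharsets
import io.epiphanous.flinkrunner.algorithm.cardinality.HyperLogLog

object HllSketch {
  // a funnel for string-like items; b = 12 gives 2^12 registers
  val hll = HyperLogLog(Funnels.stringFunnel(StandardCharsets.UTF_8), b = 12)

  // add() returns the running distinct-count estimate
  val approxDistinct: Long = Seq("u1", "u2", "u1", "u3").map(hll.add).last
}
```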
+ * + * @param funnel + * a Guava funnel for taking input + * @param m + * number of cells (see the paper, m is a Long + * but m/floor(63/d) must fit in a 32-bit Int) + * @param d + * bits per cell (see the paper, should lie in [1,63] but often set to 1, + * 2 or 3) + * @param FPR + * expected false positive rate (should lie in (0,1)) + * @tparam T + * the type of funnel used + */ +case class StableBloomFilter[T]( + funnel: Funnel[T], + m: Long, + d: Int, + FPR: Double) { import StableBloomFilter._ - require(d > 0 && d <= STORAGE_BITS, s"d must be an integer in [1,$STORAGE_BITS]") + require( + d > 0 && d <= STORAGE_BITS, + s"d must be an integer in [1,$STORAGE_BITS]" + ) /** number of bits used per unit storage */ val storedBits: Long = STORAGE_BITS.toLong / (d * d) @@ -34,7 +46,10 @@ case class StableBloomFilter[T](funnel: Funnel[T], m: Long, d: Int, FPR: Double) /** total memory required */ val M = m * d - require(M / storedBits < Int.MaxValue, s"M/$storedBits must be <= ${Int.MaxValue}") + require( + M / storedBits < Int.MaxValue, + s"M/$storedBits must be <= ${Int.MaxValue}" + ) require(FPR > 0 && FPR < 1, "FPR must be a double in (0,1)") /** cell value to set upon insertion */ @@ -59,13 +74,14 @@ case class StableBloomFilter[T](funnel: Funnel[T], m: Long, d: Int, FPR: Double) val storage = Array.fill[Long](w)(0) /** - * Insert a stream element into the filter. - * - * @param item the item to insert - * @return - */ + * Insert a stream element into the filter. + * + * @param item + * the item to insert + * @return + */ def add(item: T): Boolean = { - val cells = hash(item) + val cells = hash(item) val alreadySeen = cells.forall(i => get(i) > 0L) decrementRandomCells() cells.foreach(set) @@ -73,89 +89,104 @@ case class StableBloomFilter[T](funnel: Funnel[T], m: Long, d: Int, FPR: Double) } /** - * Return true if this SBF might contain the requested item. - * - * @param item the item to check - * @return - */ + * Return true if this SBF might contain the requested item. + * + * @param item + * the item to check + * @return + */ def mightContain(item: T): Boolean = hash(item).forall(i => get(i) > 0L) /** - * Merge another filter into this filter. - * - * @param another the other filter - * @return - */ + * Merge another filter into this filter. + * + * @param another + * the other filter + * @return + */ def merge(another: StableBloomFilter[T]): StableBloomFilter[T] = { - require(another.M == M && another.d == d && another.FPR == FPR, "Can only merge SBFs with same settings") + require( + another.M == M && another.d == d && another.FPR == FPR, + "Can only merge SBFs with same settings" + ) another.storage.zipWithIndex.foreach { case (s, i) => storage(i) |= s } this } /** - * Decrement P cells randomly. As recommended in the DR paper, we only generate a single random index, then - * decrement that cell and the next P-1 cells (wrapping around if needed). - */ + * Decrement P cells randomly. As recommended in the DR paper, we only + * generate a single random index, then decrement that cell and the next + * P-1 cells (wrapping around if needed). + */ private def decrementRandomCells(): Unit = { val p = (random.nextDouble() * m).toLong Range(0, P).map(i => (i + p) % m).foreach(decrement) } /** - * Gets the current value of the i'th cell. - * - * @param i the cell to get (in [0, m)) - * @return - */ + * Gets the current value of the i'th cell. 
+ * + * @param i + * the cell to get (in [0, m)) + * @return + */ def get(i: Long) = { val (x, j) = offset(i) getBitsValue(x, j) } /** - * Decrement a cell by one. - * - * @param i the cell to decrement (in [0,m)) - */ + * Decrement a cell by one. + * + * @param i + * the cell to decrement (in [0,m)) + */ private def decrement(i: Long): Unit = { - val (x, j) = offset(i) + val (x, j) = offset(i) val current = getBitsValue(x, j) if (current > 0) storage(x) -= (1L << j) } /** - * Set a cell's value to Max - * - * @param i the cell to set (in [0,m)) - */ + * Set a cell's value to Max + * + * @param i + * the cell to set (in [0,m)) + */ private def set(i: Long): Unit = { val (x, j) = offset(i) storage(x) |= (Max.toLong << j) } /** - * Extract the Int value of d bits (bits j to j+d-1) from stored element - * x. - * - * @param x the index into storage - * @param j the LSB to start from - * @return Int - */ + * Extract the Int value of d bits (bits j to + * j+d-1) from stored element x. + * + * @param x + * the index into storage + * @param j + * the LSB to start from + * @return + * Int + */ private def getBitsValue(x: Int, j: Int) = (storage(x) & (Max.toLong << j)) >>> j /** - * Converts a cell number into a tuple of (x:Int, j:Int), allowing other methods to get and set - * cell values. - * - * x in the integer offset within storage that contains cell i. - * j is the relative offset (in [0,63]) of the LSB of cell i within storage[x]. - * - * @param i the cell number in [0,m) - * @return (Int, Int) - */ + * Converts a cell number into a tuple of (x:Int, j:Int), + * allowing other methods to get and set cell values. + * + * x in the integer offset within storage that contains cell + * i. j is the relative offset (in [0,63]) of + * the LSB of cell i within storage[x]. + * + * @param i + * the cell number in [0,m) + * @return + * (Int, Int) + */ private def offset(i: Long): (Int, Int) = { // the cell covers d bits starting at b (within our total M bits) val b = (i - 1) * d @@ -167,21 +198,22 @@ case class StableBloomFilter[T](funnel: Funnel[T], m: Long, d: Int, FPR: Double) (x, j) } - /** Computes K hash functions of a filter item. - * - * @param item the item to hash - * @return - */ + /** + * Computes K hash functions of a filter item. + * + * @param item + * the item to hash + * @return + */ private def hash(item: T) = { val hash128 = hasher.hashObject(item, funnel).asBytes() - val hash1 = ByteBuffer.wrap(hash128, 0, 8).getLong - val hash2 = ByteBuffer.wrap(hash128, 8, 8).getLong - Range(1, K + 1).map( - i => - (hash1 + i * hash2 match { - case combined if combined < 0 => ~combined - case combined => combined - }) % m + val hash1 = ByteBuffer.wrap(hash128, 0, 8).getLong + val hash2 = ByteBuffer.wrap(hash128, 8, 8).getLong + Range(1, K + 1).map(i => + (hash1 + i * hash2 match { + case combined if combined < 0 => ~combined + case combined => combined + }) % m ) } @@ -189,21 +221,30 @@ case class StableBloomFilter[T](funnel: Funnel[T], m: Long, d: Int, FPR: Double) object StableBloomFilter { val STORAGE_BITS = java.lang.Long.SIZE - 1 - val LN2 = Math.log(2) - val LN2_SQUARED = LN2 * LN2 + val LN2 = Math.log(2) + val LN2_SQUARED = LN2 * LN2 - /** Return a builder for constructing an instance of StableBloomFilter[T] */ + /** + * Return a builder for constructing an instance of StableBloomFilter[T] + */ def builder[T](funnel: Funnel[T]) = StableBloomFilterBuilder[T](funnel) - /** Return the optimal number of cells to decrement each time a new item is inserted - * in the filter. 
This quantity is represented by the symbol P in the DR paper (eqn 17). - * - * @param m number of cells in the SBF - * @param K number of hash functions - * @param d bits per cell (Max = 2**d - 1) - * @param FPS false positive rate - * @return P optimal number of cells to decrement - */ + /** + * Return the optimal number of cells to decrement each time a new item + * is inserted in the filter. This quantity is represented by the symbol + * P in the DR paper (eqn 17). + * + * @param m + * number of cells in the SBF + * @param K + * number of hash functions + * @param d + * bits per cell (Max = 2**d - 1) + * @param FPS + * false positive rate + * @return + * P optimal number of cells to decrement + */ def optimalP(m: Long, K: Int, d: Int, FPS: Double) = { val Max = (1L << d) - 1 @@ -216,7 +257,7 @@ object StableBloomFilter { (1d / (denom1 * denom2)).toInt match { case x if x <= 0 => 1 - case x => x + case x => x } } } diff --git a/src/main/scala/io/epiphanous/flinkrunner/algorithm/membership/StableBloomFilterBuilder.scala b/src/main/scala/io/epiphanous/flinkrunner/algorithm/membership/StableBloomFilterBuilder.scala index eb789f5..1dcd9a7 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/algorithm/membership/StableBloomFilterBuilder.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/algorithm/membership/StableBloomFilterBuilder.scala @@ -3,24 +3,30 @@ package io.epiphanous.flinkrunner.algorithm.membership import com.google.common.hash.Funnel /** - * A builder interface for creating StableBloomFilter instances. - * - * @param funnel a guava funnel - * @param numCells number of cells in the filter - * @param bitsPerCell number of bits per cell in the filter - * @param falsePositiveRate desired maximum false positive rate of the filter - * @tparam T the type of item inserted into the filter - */ + * A builder interface for creating StableBloomFilter instances. 
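A brief usage sketch of the builder (the `with*` methods appear just below; the sizing values are arbitrary examples):

```scala
import com.google.common.hash.Funnels
import java.nio.charset.StandardCharsets
import io.epiphanous.flinkrunner.algorithm.membership.StableBloomFilter

object SbfSketch {
  val sbf = StableBloomFilter
    .builder(Funnels.stringFunnel(StandardCharsets.UTF_8))
    .withNumCells(1000000L)
    .withBitsPerCell(2)
    .withFalsePositiveRate(0.01)
    .build()

  // add() returns true when the item was probably seen before
  val duplicates = Seq("a", "b", "a", "c").count(sbf.add)
}
```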
+ * + * @param funnel + * a guava funnel + * @param numCells + * number of cells in the filter + * @param bitsPerCell + * number of bits per cell in the filter + * @param falsePositiveRate + * desired maximum false positive rate of the filter + * @tparam T + * the type of item inserted into the filter + */ case class StableBloomFilterBuilder[T]( - funnel: Funnel[T], - numCells: Long = 1000000, - bitsPerCell: Int = 3, - falsePositiveRate: Double = 0.01) { + funnel: Funnel[T], + numCells: Long = 1000000, + bitsPerCell: Int = 3, + falsePositiveRate: Double = 0.01) { def withNumCells(m: Long) = copy(numCells = m) def withBitsPerCell(d: Int) = copy(bitsPerCell = d) def withFalsePositiveRate(p: Double) = copy(falsePositiveRate = p) - def build() = StableBloomFilter(funnel, numCells, bitsPerCell, falsePositiveRate) + def build() = + StableBloomFilter(funnel, numCells, bitsPerCell, falsePositiveRate) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/avro/AvroCodingException.scala b/src/main/scala/io/epiphanous/flinkrunner/avro/AvroCodingException.scala index 048495c..8ddd508 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/avro/AvroCodingException.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/avro/AvroCodingException.scala @@ -1,4 +1,6 @@ package io.epiphanous.flinkrunner.avro -class AvroCodingException(message: String = "Failure during Avro coding", cause: Throwable = None.orNull) - extends Exception(message, cause) +class AvroCodingException( + message: String = "Failure during Avro coding", + cause: Throwable = None.orNull) + extends Exception(message, cause) diff --git a/src/main/scala/io/epiphanous/flinkrunner/avro/ConfluentSchemaRegistryClient.scala b/src/main/scala/io/epiphanous/flinkrunner/avro/ConfluentSchemaRegistryClient.scala index bd0dab9..cbbd517 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/avro/ConfluentSchemaRegistryClient.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/avro/ConfluentSchemaRegistryClient.scala @@ -4,9 +4,10 @@ import cats.effect.{ContextShift, IO, Resource, Timer} import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} import com.typesafe.scalalogging.LazyLogging import io.circe.Decoder -import io.epiphanous.flinkrunner.model.FlinkConfig +import io.epiphanous.flinkrunner.model.{FlinkConfig, FlinkEvent} import io.epiphanous.flinkrunner.util.StringUtils import org.apache.avro.Schema.Parser +import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.runtime.concurrent.Executors.directExecutionContext import org.http4s.EntityDecoder import org.http4s.circe.jsonOf @@ -18,8 +19,8 @@ import java.util.concurrent.TimeUnit import scala.concurrent.ExecutionContext import scala.util.{Failure, Success, Try} -class ConfluentSchemaRegistryClient()(implicit - config: FlinkConfig, +class ConfluentSchemaRegistryClient[ADT <: FlinkEvent: TypeInformation]( + config: FlinkConfig[ADT])(implicit decoder: Decoder[ConfluentSchemaRegistryResponse]) extends AvroSchemaRegistryClient[ConfluentSchemaRegistryContext] with StringUtils @@ -104,7 +105,7 @@ class ConfluentSchemaRegistryClient()(implicit .concurrencyLevel( config.getInt(s"$configPrefix.cache.concurrency.level") ) - .maximumSize(config.getInt(s"$configPrefix.cache.max.size")) + .maximumSize(config.getLong(s"$configPrefix.cache.max.size")) .expireAfterWrite(expireAfter.toMillis, TimeUnit.MILLISECONDS) // .expireAfterWrite(expireAfter) // for guava 27 if (!config.getBoolean(s"$configPrefix.cache.use.strong.keys")) diff --git 
a/src/main/scala/io/epiphanous/flinkrunner/flink/BaseFlinkJob.scala b/src/main/scala/io/epiphanous/flinkrunner/flink/BaseFlinkJob.scala index 5bf7771..53fbcb6 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/flink/BaseFlinkJob.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/flink/BaseFlinkJob.scala @@ -1,84 +1,107 @@ package io.epiphanous.flinkrunner.flink import com.typesafe.scalalogging.LazyLogging -import io.epiphanous.flinkrunner.SEE import io.epiphanous.flinkrunner.model.{FlinkConfig, FlinkEvent} -import io.epiphanous.flinkrunner.util.StreamUtils._ +import io.epiphanous.flinkrunner.util.StreamUtils.Pipe +import io.epiphanous.flinkrunner.{FlinkRunner, SEE} import org.apache.flink.api.common.JobExecutionResult import org.apache.flink.api.common.typeinfo.TypeInformation -import org.apache.flink.streaming.api.datastream.DataStreamUtils import org.apache.flink.streaming.api.scala.DataStream -import scala.collection.JavaConverters._ +import scala.util.Try /** - * An abstract flink job to transform on an input stream into an output stream. - * - * @tparam DS The type of the input stream - * @tparam OUT The type of output stream elements - */ -abstract class BaseFlinkJob[DS: TypeInformation, OUT <: FlinkEvent : TypeInformation] extends LazyLogging { + * An abstract flink job to transform on an input stream into an output + * stream. + * @param runner + * the flink runner associated with this job + * @tparam DS + * The type of the input data stream (not its elements) + * @tparam OUT + * The type of output stream elements + * @tparam ADT + * The flink runner's algebraic data type + */ +abstract class BaseFlinkJob[ + DS: TypeInformation, + OUT <: ADT: TypeInformation, + ADT <: FlinkEvent: TypeInformation](runner: FlinkRunner[ADT]) + extends LazyLogging { + + val config: FlinkConfig[ADT] = runner.config + val env: SEE = runner.env /** - * A pipeline for transforming a single stream. Passes the output of source() - * through transform() and the result of that into maybeSink(), which may pass it - * into sink() if we're not testing. Ultimately, returns the output data stream to - * facilitate testing. - * - * @param config implicit flink job config - * @return data output stream - */ - def flow()(implicit config: FlinkConfig, env: SEE): DataStream[OUT] = - source |> transform |# maybeSink + * A pipeline for transforming a single stream. Passes the output of + * source() through transform() and the result of that into maybeSink(), + * which may pass it into sink() if we're not testing. Ultimately, + * returns the output data stream to facilitate testing. 
+ * + * @return + * data output stream + */ + def flow(): DataStream[OUT] = source |> transform |# maybeSink - def run()(implicit config: FlinkConfig, env: SEE): Either[Iterator[OUT], JobExecutionResult] = { + def run(limitOpt: Option[Int] = None) + : Either[List[OUT], JobExecutionResult] = { - logger.info(s"\nSTARTING FLINK JOB: ${config.jobName} ${config.jobArgs.mkString(" ")}\n") + logger.info( + s"\nSTARTING FLINK JOB: ${config.jobName} ${config.jobArgs.mkString(" ")}\n" + ) - val stream = flow + val stream = flow() - if (config.showPlan) logger.info(s"PLAN:\n${env.getExecutionPlan}\n") + if (config.showPlan) + logger.info(s"PLAN:\n${env.getExecutionPlan}\n") - if (config.mockEdges) - Left(DataStreamUtils.collect(stream.javaStream).asScala) - else + if (config.mockEdges) { + val limit = limitOpt.getOrElse( + Try(config.getJobConfig(config.jobName).getInt("run.limit")) + .getOrElse(100) + ) + Left(stream.executeAndCollect(config.jobName, limit)) + } else Right(env.execute(config.jobName)) } /** - * Returns source data stream to pass into transform(). This must be overridden by subclasses. - * - * @return input data stream - */ - def source()(implicit config: FlinkConfig, env: SEE): DS + * Returns source data stream to pass into transform(). This must be + * overridden by subclasses. + * + * @return + * input data stream + */ + def source(): DS /** - * Primary method to transform the source data stream into the output data stream. The output of - * this method is passed into sink(). This method must be overridden by subclasses. - * - * @param in input data stream created by source() - * @param config implicit flink job config - * @return output data stream - */ - def transform(in: DS)(implicit config: FlinkConfig, env: SEE): DataStream[OUT] + * Primary method to transform the source data stream into the output + * data stream. The output of this method is passed into sink(). This + * method must be overridden by subclasses. + * + * @param in + * input data stream created by source() + * @return + * output data stream + */ + def transform(in: DS): DataStream[OUT] /** - * Writes the transformed data stream to configured output sinks. - * - * @param out a transformed stream from transform() - * @param config implicit flink job config - */ - def sink(out: DataStream[OUT])(implicit config: FlinkConfig, env: SEE): Unit = - config.getSinkNames.foreach(name => out.toSink(name)) + * Writes the transformed data stream to configured output sinks. + * + * @param out + * a transformed stream from transform() + */ + def sink(out: DataStream[OUT]): Unit = + config.getSinkNames.foreach(name => runner.toSink[OUT](out, name)) /** - * The output stream will only be passed to BaseFlinkJob.sink - * if FlinkConfig.mockEdges is false (ie, you're not testing). - * - * @param out the output data stream to pass into BaseFlinkJob.sink) - * @param config implicit flink job config - */ - def maybeSink(out: DataStream[OUT])(implicit config: FlinkConfig, env: SEE): Unit = + * The output stream will only be passed to BaseFlinkJob.sink if + * FlinkConfig.mockEdges is false (ie, you're not testing). 
+ * + * @param out + * the output data stream to pass into BaseFlinkJob.sink) + */ + def maybeSink(out: DataStream[OUT]): Unit = if (!config.mockEdges) sink(out) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/flink/BroadcastFlinkJob.scala b/src/main/scala/io/epiphanous/flinkrunner/flink/BroadcastFlinkJob.scala index 50cf18a..53b562a 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/flink/BroadcastFlinkJob.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/flink/BroadcastFlinkJob.scala @@ -1,8 +1,7 @@ package io.epiphanous.flinkrunner.flink -import io.epiphanous.flinkrunner.SEE -import io.epiphanous.flinkrunner.model.{FlinkConfig, FlinkEvent} -import io.epiphanous.flinkrunner.util.StreamUtils._ +import io.epiphanous.flinkrunner.FlinkRunner +import io.epiphanous.flinkrunner.model.FlinkEvent import org.apache.flink.api.common.state.MapStateDescriptor import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.scala._ @@ -18,18 +17,25 @@ import org.apache.flink.streaming.api.scala.{ * href="https://flink.apache.org/2019/06/26/broadcast-state.html">broadcast * stream join pattern. * + * @param runner + * the flink runner associated with this job * @tparam IN * Input stream event type * @tparam BC * Broadcast stream event type * @tparam OUT * Output stream event type + * @tparam ADT + * The flink runner's algebraic data type */ abstract class BroadcastFlinkJob[ - IN <: FlinkEvent: TypeInformation, - BC <: FlinkEvent: TypeInformation, - OUT <: FlinkEvent: TypeInformation] - extends BaseFlinkJob[BroadcastConnectedStream[IN, BC], OUT] { + IN <: ADT: TypeInformation, + BC <: ADT: TypeInformation, + OUT <: ADT: TypeInformation, + ADT <: FlinkEvent: TypeInformation](runner: FlinkRunner[ADT]) + extends BaseFlinkJob[BroadcastConnectedStream[IN, BC], OUT, ADT]( + runner + ) { import BroadcastFlinkJob._ @@ -38,12 +44,10 @@ abstract class BroadcastFlinkJob[ * output data stream from the connected broadcast + events stream. Must * be overridden by sub-classes. * - * @param config - * implicit flink config * @return * KeyedBroadcastProcessFunction[String, IN, BC, OUT] */ - def getBroadcastProcessFunction()(implicit config: FlinkConfig) + def getBroadcastProcessFunction : KeyedBroadcastProcessFunction[String, IN, BC, OUT] /** @@ -52,14 +56,12 @@ abstract class BroadcastFlinkJob[ * @param nameOpt * the name of the broadcast stream in the source configuration * (default "broadcast") - * @param config - * implicit flink config * @return * MapStateDescriptor[String, BC] */ def getBroadcastStateDescriptor( nameOpt: Option[String] = None - )(implicit config: FlinkConfig): MapStateDescriptor[String, BC] = + ): MapStateDescriptor[String, BC] = new MapStateDescriptor[String, BC]( nameOpt.getOrElse(BROADCAST_STATE_DESCRIPTOR_NAME), createTypeInformation[String], @@ -69,41 +71,26 @@ abstract class BroadcastFlinkJob[ /** * Creates the broadcast source stream. 
* - * @param config - * implicit flink config - * @param env - * implicit streaming execution environment * @return * broadcast stream */ - def broadcastSource(implicit - config: FlinkConfig, - env: SEE): BroadcastStream[BC] = - fromSource[BC](getBroadcastSourceName).broadcast( - getBroadcastStateDescriptor() - ) - - def getBroadcastSourceName()(implicit config: FlinkConfig) = - BROADCAST_SOURCE_NAME - - def getEventSourceName()(implicit config: FlinkConfig) = - EVENT_SOURCE_NAME + def broadcastSource: BroadcastStream[BC] = + runner + .fromSource[BC](getBroadcastSourceName) + .broadcast( + getBroadcastStateDescriptor() + ) /** * Creates the broadcast stream and the input event stream and connects * them * - * @param config - * implicit flink config - * @param env - * implicit streaming execution environment * @return * connected broadcast + events stream */ - override def source()(implicit - config: FlinkConfig, - env: SEE): BroadcastConnectedStream[IN, BC] = - (fromSource[IN](getEventSourceName)) + override def source(): BroadcastConnectedStream[IN, BC] = + (runner + .fromSource[IN](getEventSourceName)) .keyBy((in: IN) => in.$key) .connect(broadcastSource) @@ -122,12 +109,16 @@ abstract class BroadcastFlinkJob[ */ override def transform( in: BroadcastConnectedStream[IN, BC] - )(implicit config: FlinkConfig, env: SEE): DataStream[OUT] = { + ): DataStream[OUT] = { val name = - s"processed:${getEventSourceName()}+${getBroadcastSourceName()}" - in.process(getBroadcastProcessFunction()).name(name).uid(name) + s"processed:$getEventSourceName+$getBroadcastSourceName" + in.process(getBroadcastProcessFunction).name(name).uid(name) } + def getBroadcastSourceName: String = BROADCAST_SOURCE_NAME + + def getEventSourceName: String = EVENT_SOURCE_NAME + } object BroadcastFlinkJob { diff --git a/src/main/scala/io/epiphanous/flinkrunner/flink/FilterByControlJob.scala b/src/main/scala/io/epiphanous/flinkrunner/flink/FilterByControlJob.scala index e4ccc9e..ac1497e 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/flink/FilterByControlJob.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/flink/FilterByControlJob.scala @@ -1,12 +1,7 @@ package io.epiphanous.flinkrunner.flink -import io.epiphanous.flinkrunner.SEE -import io.epiphanous.flinkrunner.model.{ - DataOrControl, - FlinkConfig, - FlinkEvent -} -import io.epiphanous.flinkrunner.util.StreamUtils._ +import io.epiphanous.flinkrunner.FlinkRunner +import io.epiphanous.flinkrunner.model.{DataOrControl, FlinkEvent} import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.api.scala._ import org.apache.flink.streaming.api.scala.DataStream @@ -24,6 +19,8 @@ import org.apache.flink.streaming.api.scala.DataStream * * would output `d3 d4 d5`. * + * @param runner + * the flink runner associated with this job * @tparam D * the data type * @tparam C @@ -32,32 +29,30 @@ import org.apache.flink.streaming.api.scala.DataStream * the output stream element type */ abstract class FilterByControlJob[ - D <: FlinkEvent: TypeInformation, - C <: FlinkEvent: TypeInformation, - OUT <: FlinkEvent: TypeInformation] - extends FlinkJob[D, OUT] { + D <: ADT: TypeInformation, + C <: ADT: TypeInformation, + OUT <: ADT: TypeInformation, + ADT <: FlinkEvent: TypeInformation](runner: FlinkRunner[ADT]) + extends FlinkJob[D, OUT, ADT](runner) { + + import io.epiphanous.flinkrunner.flink.FilterByControlJob._ /** * A source data stream for the data events. * - * @param config - * implicit flink config * @return * a data stream of data events. 
*/ - def data(implicit config: FlinkConfig, env: SEE): DataStream[D] = - fromSource[D]("data") + def data: DataStream[D] = + runner.fromSource[D](getDataStreamName) /** * A source data stream for the control events. * - * @param config - * implicit flink config * @return * a data stream of control events. */ - def control(implicit config: FlinkConfig, env: SEE): DataStream[C] = - fromSource[C]("control") + def control: DataStream[C] = runner.fromSource[C](getControlStreamName) /** * Generate a stream of data records filtered by the control stream. This @@ -67,26 +62,31 @@ abstract class FilterByControlJob[ * when to emit the data records. It remembers the last control time and * state and updates it when the state changes. * * - * @param config - * implicit flink config * @return * data stream of data records */ - override def source()(implicit - config: FlinkConfig, - env: SEE): DataStream[D] = { + override def source(): DataStream[D] = { val controlLockoutDuration = config.getDuration("control.lockout.duration").toMillis + val name = getDataControlStreamName + + implicit val typeInformation + : TypeInformation[DataOrControl[D, C, ADT]] = + TypeInformation.of(classOf[DataOrControl[D, C, ADT]]) + val in = data .connect(control) - .map(DataOrControl.data[D, C], DataOrControl.control[D, C]) - .name("data+control") - .uid("data+control") + .map( + DataOrControl.data[D, C, ADT], + DataOrControl.control[D, C, ADT] + ) + .name(name) + .uid(name) - in.keyBy((e: DataOrControl[D, C]) => e.$key) + in.keyBy((e: DataOrControl[D, C, ADT]) => e.$key) .filterWithState[(Long, Boolean)]((dc, lastControlOpt) => { if (dc.isData) { val emit = lastControlOpt match { @@ -109,9 +109,21 @@ abstract class FilterByControlJob[ }) .name(s"filter:${in.name}") .uid(s"filter:${in.name}") - .map((x: DataOrControl[D, C]) => x.data.get) + .map((x: DataOrControl[D, C, ADT]) => x.data.get) .name("filtered:data") .uid("filtered:data") } + def getDataStreamName: String = DATA_STREAM_NAME + + def getControlStreamName: String = CONTROL_STREAM_NAME + + def getDataControlStreamName: String = + s"$getDataStreamName+$getControlStreamName" + +} + +object FilterByControlJob { + val DATA_STREAM_NAME = "data" + val CONTROL_STREAM_NAME = "control" } diff --git a/src/main/scala/io/epiphanous/flinkrunner/flink/FlinkJob.scala b/src/main/scala/io/epiphanous/flinkrunner/flink/FlinkJob.scala index 3f0655b..3f6f549 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/flink/FlinkJob.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/flink/FlinkJob.scala @@ -1,8 +1,7 @@ package io.epiphanous.flinkrunner.flink -import io.epiphanous.flinkrunner.SEE -import io.epiphanous.flinkrunner.model.{FlinkConfig, FlinkEvent} -import io.epiphanous.flinkrunner.util.StreamUtils._ +import io.epiphanous.flinkrunner.FlinkRunner +import io.epiphanous.flinkrunner.model.FlinkEvent import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.streaming.api.scala.DataStream @@ -10,24 +9,27 @@ import org.apache.flink.streaming.api.scala.DataStream * An abstract flink job to transform on a stream of events from an * algebraic data type (ADT). 
* + * @param runner + * the flink runner associated with this job * @tparam IN * The type of input stream elements * @tparam OUT * The type of output stream elements + * @tparam ADT + * The flink runner's algebraic data type */ abstract class FlinkJob[ - IN <: FlinkEvent: TypeInformation, - OUT <: FlinkEvent: TypeInformation] - extends BaseFlinkJob[DataStream[IN], OUT] { + IN <: ADT: TypeInformation, + OUT <: ADT: TypeInformation, + ADT <: FlinkEvent: TypeInformation](runner: FlinkRunner[ADT]) + extends BaseFlinkJob[DataStream[IN], OUT, ADT](runner) { /** * Return the primary event source name - * @param config - * implicit flink config * @return * primary source name */ - def getEventSourceName(implicit config: FlinkConfig): String = + def getEventSourceName: String = config.getSourceNames.headOption.getOrElse("events") /** @@ -37,7 +39,7 @@ abstract class FlinkJob[ * @return * input data stream */ - def source()(implicit config: FlinkConfig, env: SEE): DataStream[IN] = - fromSource[IN](getEventSourceName) + def source(): DataStream[IN] = + runner.fromSource[IN](getEventSourceName) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/flink/IdentityJob.scala b/src/main/scala/io/epiphanous/flinkrunner/flink/IdentityJob.scala index 98c892f..de79d6c 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/flink/IdentityJob.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/flink/IdentityJob.scala @@ -1,26 +1,32 @@ package io.epiphanous.flinkrunner.flink -import io.epiphanous.flinkrunner.SEE -import io.epiphanous.flinkrunner.model.{FlinkConfig, FlinkEvent} +import io.epiphanous.flinkrunner.FlinkRunner +import io.epiphanous.flinkrunner.model.FlinkEvent import org.apache.flink.api.common.typeinfo.TypeInformation import org.apache.flink.streaming.api.scala.DataStream -class IdentityJob[E <: FlinkEvent: TypeInformation] - extends FlinkJob[E, E] { +/** + * An identity mapper that passes through an input events to its configured + * sinks. + * @param runner + * the flink runner associated with this job + * @tparam E + * the input and output event type + * @tparam ADT + * The flink runner's algebraic data type + */ +class IdentityJob[ + E <: ADT: TypeInformation, + ADT <: FlinkEvent: TypeInformation](runner: FlinkRunner[ADT]) + extends FlinkJob[E, E, ADT](runner) { /** * Does the identity transform (passes the stream through unchanged). 
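Because jobs now receive the `FlinkRunner[ADT]` as an explicit constructor argument rather than implicit `FlinkConfig`/`SEE` parameters, a concrete job definition looks roughly like the sketch below. `MyEvent` and `Reading` are hypothetical types invented for illustration; the sketch assumes `FlinkEvent` requires only the `$id`/`$key`/`$timestamp` members seen elsewhere in this patch and that `transform` is the sole member a concrete `FlinkJob` must supply, so it will only compile against the actual traits.

```scala
import io.epiphanous.flinkrunner.FlinkRunner
import io.epiphanous.flinkrunner.flink.FlinkJob
import io.epiphanous.flinkrunner.model.FlinkEvent
import org.apache.flink.streaming.api.scala._

// Hypothetical algebraic data type, for illustration only.
sealed trait MyEvent extends FlinkEvent
final case class Reading($id: String, $key: String, $timestamp: Long, value: Double)
    extends MyEvent

// The runner is passed explicitly; no implicit FlinkConfig or SEE in scope.
class DoubleReadingJob(runner: FlinkRunner[MyEvent])
    extends FlinkJob[Reading, Reading, MyEvent](runner) {
  override def transform(in: DataStream[Reading]): DataStream[Reading] =
    in.map(r => r.copy(value = r.value * 2))
}
```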
* * @param in * input data stream created by source() - * @param config - * implicit flink job config - * @param env - * streaming execution environment * @return * output data stream */ - override def transform( - in: DataStream[E])(implicit config: FlinkConfig, env: SEE) = - in + override def transform(in: DataStream[E]): DataStream[E] = in } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/ConfigToProps.scala b/src/main/scala/io/epiphanous/flinkrunner/model/ConfigToProps.scala index f15d55c..31403a7 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/ConfigToProps.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/ConfigToProps.scala @@ -5,32 +5,34 @@ import com.typesafe.config.ConfigObject import java.util.{Properties, List => JList, Map => JMap} import scala.collection.JavaConverters._ -trait ConfigToProps { - def config: Option[ConfigObject] +object ConfigToProps { - // this flattens a hierarchical config into a string -> string properties map - val properties: Properties = { - val p = new Properties() + implicit class RichConfigObject(val config: Option[ConfigObject]) { - def flatten(key: String, value: Object): Unit = { - val pkey = if (key.isEmpty) key else s"$key." - value match { - case map: JMap[String, Object] @unchecked => - map.asScala.foreach { case (k, v) => flatten(s"$pkey$k", v) } - case list: JList[Object] @unchecked => - list.asScala.zipWithIndex.foreach { case (v, i) => - flatten(s"$pkey$i", v) - } - case v => - p.put(key, v.toString) - () // force unit return + // this flattens a hierarchical config into a string -> string properties map + def asProperties: Properties = { + val p = new Properties() + + def flatten(key: String, value: Object): Unit = { + val pkey = if (key.isEmpty) key else s"$key." + value match { + case map: JMap[String, Object] @unchecked => + map.asScala.foreach { case (k, v) => flatten(s"$pkey$k", v) } + case list: JList[Object] @unchecked => + list.asScala.zipWithIndex.foreach { case (v, i) => + flatten(s"$pkey$i", v) + } + case v => + p.put(key, v.toString) + () // force unit return + } } - } - config match { - case Some(c) => flatten("", c.unwrapped()) - case None => // noop + config match { + case Some(c) => flatten("", c.unwrapped()) + case None => // noop + } + p } - p } } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/DataControlPeriod.scala b/src/main/scala/io/epiphanous/flinkrunner/model/DataControlPeriod.scala index 6bac29d..8f09583 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/DataControlPeriod.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/DataControlPeriod.scala @@ -2,7 +2,7 @@ package io.epiphanous.flinkrunner.model import java.util.UUID -case class DataControlPeriod[D <: FlinkEvent]( +case class DataControlPeriod[D <: ADT, ADT <: FlinkEvent]( id: String = UUID.randomUUID().toString, key: String, start: Long = 0L, diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/DataOrControl.scala b/src/main/scala/io/epiphanous/flinkrunner/model/DataOrControl.scala index b6aeff3..6c3f105 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/DataOrControl.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/DataOrControl.scala @@ -1,6 +1,8 @@ package io.epiphanous.flinkrunner.model -case class DataOrControl[D <: FlinkEvent, C <: FlinkEvent]( +import org.apache.flink.api.common.typeinfo.TypeInformation + +case class DataOrControl[D <: ADT, C <: ADT, ADT <: FlinkEvent]( event: Either[D, C]) extends FlinkEvent { def $id: String = event.fold(_.$id, _.$id) @@ -23,9 
+25,11 @@ case class DataOrControl[D <: FlinkEvent, C <: FlinkEvent]( } object DataOrControl { - def data[D <: FlinkEvent, C <: FlinkEvent]( - event: D): DataOrControl[D, C] = DataOrControl[D, C](Left(event)) + def data[D <: ADT, C <: ADT, ADT <: FlinkEvent: TypeInformation]( + data: D): DataOrControl[D, C, ADT] = + DataOrControl[D, C, ADT](Left(data)) - def control[D <: FlinkEvent, C <: FlinkEvent]( - event: C): DataOrControl[D, C] = DataOrControl[D, C](Right(event)) + def control[D <: ADT, C <: ADT, ADT <: FlinkEvent: TypeInformation]( + control: C) = + DataOrControl[D, C, ADT](Right(control)) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/FlinkConfig.scala b/src/main/scala/io/epiphanous/flinkrunner/model/FlinkConfig.scala index 505ec33..17d0389 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/FlinkConfig.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/FlinkConfig.scala @@ -2,26 +2,22 @@ package io.epiphanous.flinkrunner.model import com.typesafe.config.{ConfigFactory, ConfigObject} import com.typesafe.scalalogging.LazyLogging +import io.epiphanous.flinkrunner.model.ConfigToProps.RichConfigObject import io.epiphanous.flinkrunner.{FlinkRunnerFactory, SEE} import org.apache.flink.api.java.utils.ParameterTool -import org.apache.flink.contrib.streaming.state.{ - PredefinedOptions, - RocksDBStateBackend -} -import org.apache.flink.runtime.state.filesystem.FsStateBackend -import org.apache.flink.streaming.api.TimeCharacteristic +import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment import java.io.File import java.time.Duration -import java.util.{Properties, List => JList, Map => JMap} +import java.util.Properties import scala.collection.JavaConverters._ import scala.util.{Failure, Success, Try} @SerialVersionUID(1544548116L) -class FlinkConfig( +class FlinkConfig[ADT <: FlinkEvent]( args: Array[String], - factory: FlinkRunnerFactory[_], + factory: FlinkRunnerFactory[ADT], sources: Map[String, Seq[Array[Byte]]] = Map.empty, optConfig: Option[String] = None) extends LazyLogging @@ -128,24 +124,7 @@ class FlinkConfig( case (_, p) => _config.getDuration(p) } - def getProperties(path: String): Properties = { - val p = new Properties() - - def flatten(key: String, value: Object): Unit = { - val pkey = if (key.isEmpty) key else s"$key." 
- value match { - case map: JMap[String, Object] @unchecked => - map.asScala.foreach { case (k, v) => flatten(s"$pkey$k", v) } - case list: JList[Object] @unchecked => - list.asScala.zipWithIndex.foreach { case (v, i) => - flatten(s"$pkey$i", v) - } - case v => - p.put(key, v.toString) - () // force unit return - } - } - + def getProperties(path: String): Properties = (_s(path) match { case ("a", p) => Some( @@ -155,12 +134,7 @@ class FlinkConfig( ) case (_, p) => if (_config.hasPath(p)) Some(_config.getObject(p)) else None - }) match { - case Some(c) => flatten("", c.unwrapped()) - case None => // noop - } - p - } + }).asProperties def _classInstance[T](path: String): T = Class @@ -171,33 +145,35 @@ class FlinkConfig( // def getJobInstance = factory.getJobInstance(jobName, this) - def getDeserializationSchema(name: String) = - factory.getDeserializationSchema(name, this) + def getDeserializationSchema[E <: ADT](name: String) = + factory.getDeserializationSchema[E](name, this) - def getKafkaDeserializationSchema(name: String) = - factory.getKafkaDeserializationSchema(name, this) + def getKafkaDeserializationSchema[E <: ADT](name: String) = + factory.getKafkaDeserializationSchema[E](name, this) - def getKinesisDeserializationSchema(name: String) = - factory.getKinesisDeserializationSchema(name, this) + def getKinesisDeserializationSchema[E <: ADT](name: String) = + factory.getKinesisDeserializationSchema[E](name, this) - def getSerializationSchema(name: String) = - factory.getSerializationSchema(name, this) + def getSerializationSchema[E <: ADT](name: String) = + factory.getSerializationSchema[E](name, this) - def getKafkaSerializationSchema(name: String) = - factory.getKafkaSerializationSchema(name, this) + def getKafkaSerializationSchema[E <: ADT](name: String) = + factory.getKafkaSerializationSchema[E](name, this) - def getKinesisSerializationSchema(name: String) = - factory.getKinesisSerializationSchema(name, this) + def getKinesisSerializationSchema[E <: ADT](name: String) = + factory.getKinesisSerializationSchema[E](name, this) - def getEncoder(name: String) = factory.getEncoder(name, this) + def getEncoder[E <: ADT](name: String) = + factory.getEncoder[E](name, this) - def getAddToJdbcBatchFunction(name: String) = - factory.getAddToJdbcBatchFunction(name, this) + def getAddToJdbcBatchFunction[E <: ADT](name: String) = + factory.getAddToJdbcBatchFunction[E](name, this) - def getBucketAssigner(name: String) = - factory.getBucketAssigner(name, this) + def getBucketAssigner[E <: ADT](name: String) = + factory.getBucketAssigner[E](name, this) - def getAvroCoder(name: String) = factory.getAvroCoder(name, this) + def getAvroCoder(name: String) = + factory.getAvroCoder(name, this) def getSourceConfig(name: String): SourceConfig = SourceConfig(name, this) @@ -232,9 +208,6 @@ class FlinkConfig( else StreamExecutionEnvironment.getExecutionEnvironment - // use event time - env.setStreamTimeCharacteristic(timeCharacteristic) - // set parallelism env.setParallelism(globalParallelism) @@ -250,44 +223,16 @@ class FlinkConfig( checkpointMaxConcurrent ) - val backend = if (stateBackend == "rocksdb") { - logger.info(s"Using ROCKS DB state backend at $checkpointUrl") - val rocksBackend = - new RocksDBStateBackend(checkpointUrl, checkpointIncremental) - if (checkpointFlash) - rocksBackend.setPredefinedOptions( - PredefinedOptions.FLASH_SSD_OPTIMIZED - ) - rocksBackend - } else { - logger.info(s"Using FILE SYSTEM state backend at $checkpointUrl") - new FsStateBackend(checkpointUrl) - } - /* this 
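The rewritten `getProperties` above now delegates the flattening to `RichConfigObject.asProperties` from `ConfigToProps`, which turns a nested HOCON object into dotted `Properties` keys. A self-contained sketch of the same recursion, written against the Typesafe Config API directly rather than the implicit class introduced by this patch:

```scala
import com.typesafe.config.{ConfigFactory, ConfigObject}

import java.util.{Properties, List => JList, Map => JMap}
import scala.collection.JavaConverters._

object FlattenConfigSketch {

  // Same strategy as ConfigToProps: maps recurse with a dotted prefix,
  // list elements get their index appended, leaves become strings.
  def flatten(config: ConfigObject): Properties = {
    val p = new Properties()
    def go(key: String, value: Object): Unit = {
      val prefix = if (key.isEmpty) key else s"$key."
      value match {
        case m: JMap[String, Object] @unchecked =>
          m.asScala.foreach { case (k, v) => go(s"$prefix$k", v) }
        case l: JList[Object] @unchecked =>
          l.asScala.zipWithIndex.foreach { case (v, i) => go(s"$prefix$i", v) }
        case v =>
          p.put(key, v.toString)
          ()
      }
    }
    go("", config.unwrapped())
    p
  }

  def main(args: Array[String]): Unit = {
    val cfg = ConfigFactory.parseString(
      """bootstrap.servers = "localhost:9092"
        |group { id = "my-group", auto.offset.reset = earliest }
        |""".stripMargin
    )
    println(flatten(cfg.root())) // e.g. {bootstrap.servers=localhost:9092, group.id=my-group, ...}
  }
}
```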
deprecation is annoying; its due to rocksdb's state backend - extending AbstractStateBackend which is deprecated */ - env.setStateBackend(backend) + logger.info(s"Using ROCKS DB state backend at $checkpointUrl") + env.setStateBackend( + new EmbeddedRocksDBStateBackend(checkpointIncremental) + ) + env.getCheckpointConfig.setCheckpointStorage(checkpointUrl) } env } - def getTimeCharacteristic(tc: String): TimeCharacteristic = { - tc.toLowerCase - .replaceFirst("\\s*time$", "") match { - case "event" => TimeCharacteristic.EventTime - case "processing" => TimeCharacteristic.ProcessingTime - case "ingestion" => TimeCharacteristic.IngestionTime - case unknown => - throw new RuntimeException( - s"Unknown time.characteristic setting: '$unknown'" - ) - } - } - - lazy val timeCharacteristic = getTimeCharacteristic( - getString("time.characteristic") - ) - def getWatermarkStrategy(ws: String) = ws.toLowerCase.replaceAll("[^a-z]", "") match { case "boundedlateness" => "bounded lateness" diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/SinkConfig.scala b/src/main/scala/io/epiphanous/flinkrunner/model/SinkConfig.scala index 58a73b5..f235cf4 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/SinkConfig.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/SinkConfig.scala @@ -15,7 +15,9 @@ sealed trait SinkConfig { } object SinkConfig { - def apply(name: String, config: FlinkConfig): SinkConfig = { + def apply[ADT <: FlinkEvent]( + name: String, + config: FlinkConfig[ADT]): SinkConfig = { val p = s"sinks.$name" FlinkConnectorName.withNameInsensitiveOption( config.getString(s"$p.connector") diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/SourceConfig.scala b/src/main/scala/io/epiphanous/flinkrunner/model/SourceConfig.scala index f736f53..85d081b 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/SourceConfig.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/SourceConfig.scala @@ -1,9 +1,9 @@ package io.epiphanous.flinkrunner.model import io.epiphanous.flinkrunner.model.FlinkConnectorName._ -import org.apache.flink.streaming.api.TimeCharacteristic import java.util.Properties +import scala.concurrent.duration.DurationInt import scala.util.Try sealed trait SourceConfig { @@ -13,20 +13,21 @@ sealed trait SourceConfig { def label: String = s"$connector/$name" - def timeCharacteristic: TimeCharacteristic - def watermarkStrategy: String + def maxAllowedLateness: Long + def properties: Properties } object SourceConfig { - def apply(name: String, config: FlinkConfig): SourceConfig = { + def apply[ADT <: FlinkEvent]( + name: String, + config: FlinkConfig[ADT]): SourceConfig = { val p = s"sources.$name" - val timeCharacteristic = - Try(config.getString(s"$p.time.characteristic")) - .map(config.getTimeCharacteristic) - .getOrElse(config.timeCharacteristic) + val maxAllowedLateness = Try( + config.getDuration(s"$p.max.allowed.lateness") + ).map(_.toMillis).getOrElse(5.minutes.toMillis) val watermarkStrategy = Try(config.getString(s"$p.watermark.strategy")) .map(config.getWatermarkStrategy) .getOrElse(config.watermarkStrategy) @@ -42,8 +43,8 @@ object SourceConfig { name, config.getString(s"$p.topic"), config.getBoolean(s"$p.isKeyed"), - timeCharacteristic, watermarkStrategy, + maxAllowedLateness, config.getProperties(s"$p.config") ) case Kinesis => @@ -51,8 +52,8 @@ object SourceConfig { connector, name, config.getString(s"$p.stream"), - timeCharacteristic, watermarkStrategy, + maxAllowedLateness, config.getProperties(s"$p.config") ) case File => @@ -60,8 +61,8 @@ 
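Replacing `RocksDBStateBackend`/`FsStateBackend` with `EmbeddedRocksDBStateBackend` plus an explicit checkpoint storage location follows the Flink 1.13 split between state backend and checkpoint storage. A minimal sketch of that configuration outside of `FlinkConfig`; the checkpoint interval and URL are placeholders:

```scala
import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

object StateBackendSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // Checkpoint every 30s; `true` enables incremental RocksDB checkpoints,
    // mirroring the checkpoint.incremental flag read by FlinkConfig.
    env.enableCheckpointing(30000L)
    env.setStateBackend(new EmbeddedRocksDBStateBackend(true))
    env.getCheckpointConfig.setCheckpointStorage("file:///tmp/checkpoints")
  }
}
```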
object SourceConfig { connector, name, config.getString(s"$p.path"), - timeCharacteristic, watermarkStrategy, + maxAllowedLateness, config.getProperties(s"$p.config") ) case Socket => @@ -70,8 +71,8 @@ object SourceConfig { name, config.getString(s"$p.host"), config.getInt(s"$p.port"), - timeCharacteristic, watermarkStrategy, + maxAllowedLateness, config.getProperties(s"$p.config") ) case Collection => @@ -79,8 +80,8 @@ object SourceConfig { connector, name, name, - timeCharacteristic, watermarkStrategy, + maxAllowedLateness, config.getProperties(s"$p.config") ) case other => @@ -101,8 +102,8 @@ final case class KafkaSourceConfig( name: String, topic: String, isKeyed: Boolean, - timeCharacteristic: TimeCharacteristic, watermarkStrategy: String, + maxAllowedLateness: Long, properties: Properties) extends SourceConfig @@ -110,8 +111,8 @@ final case class KinesisSourceConfig( connector: FlinkConnectorName = Kinesis, name: String, stream: String, - timeCharacteristic: TimeCharacteristic, watermarkStrategy: String, + maxAllowedLateness: Long, properties: Properties) extends SourceConfig @@ -119,8 +120,8 @@ final case class FileSourceConfig( connector: FlinkConnectorName = File, name: String, path: String, - timeCharacteristic: TimeCharacteristic, watermarkStrategy: String, + maxAllowedLateness: Long, properties: Properties) extends SourceConfig @@ -129,8 +130,8 @@ final case class SocketSourceConfig( name: String, host: String, port: Int, - timeCharacteristic: TimeCharacteristic, watermarkStrategy: String, + maxAllowedLateness: Long, properties: Properties) extends SourceConfig @@ -138,7 +139,7 @@ final case class CollectionSourceConfig( connector: FlinkConnectorName = Collection, name: String, topic: String, - timeCharacteristic: TimeCharacteristic, watermarkStrategy: String, + maxAllowedLateness: Long, properties: Properties) extends SourceConfig diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/UnitMapper.scala b/src/main/scala/io/epiphanous/flinkrunner/model/UnitMapper.scala index f210406..4dadd2c 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/UnitMapper.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/UnitMapper.scala @@ -145,6 +145,7 @@ trait UnitMapper extends LazyLogging { ) } + //noinspection ScalaUnusedSymbol def getSymbolFromString(dimension: String, unit: String): String = unit } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Aggregate.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Aggregate.scala index 37e923d..4f43ac0 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Aggregate.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Aggregate.scala @@ -32,11 +32,12 @@ trait Aggregate extends Product with Serializable with LazyLogging { // a copy constructor private def _copy( - newValue: Double, - aggregatedLastUpdated: Instant, - dependentAggregations: Map[String, Aggregate] - ): Aggregate = - Aggregate(name, + newValue: Double, + aggregatedLastUpdated: Instant, + dependentAggregations: Map[String, Aggregate] + ): Aggregate = + Aggregate( + name, dimension, outUnit, newValue, @@ -44,64 +45,96 @@ trait Aggregate extends Product with Serializable with LazyLogging { aggregatedLastUpdated, Instant.now(), dependentAggregations, - params) + params + ) /** - * Used by some subclasses to update the underlying aggregate value as a Quantity. - * When this is called, any dependent aggregations will be updated and passed into - * the depAggs parameter. 
You can find the previous dependent aggregations in - * `this.dependentAggregations` if you need them. - * - * @param current Quantity value of the aggregate - * @param quantity Quantity the new quantity to incorporate into the aggregate - * @param depAggs dependent aggregations already updated with the new quantity - * @tparam A the dimension of the quantity - * @return A - */ - def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]): A = ??? + * Used by some subclasses to update the underlying aggregate value as a + * Quantity. When this is called, any dependent aggregations will be + * updated and passed into the depAggs parameter. You can find the + * previous dependent aggregations in `this.dependentAggregations` if you + * need them. + * + * @param current + * Quantity value of the aggregate + * @param quantity + * Quantity the new quantity to incorporate into the aggregate + * @param depAggs + * dependent aggregations already updated with the new quantity + * @tparam A + * the dimension of the quantity + * @return + * A + */ + def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]): A = ??? /** - * Update dependent aggregations. - * - * @param q the quantity being added to the aggregations - * @param aggLU the instant associated with the new quantity - * @param unitMapper a unit mapper - * @tparam A the type of the quantity - * @return - */ - def updateDependents[A <: Quantity[A]](q: A, aggLU: Instant, unitMapper: UnitMapper): Map[String, Aggregate] = + * Update dependent aggregations. + * + * @param q + * the quantity being added to the aggregations + * @param aggLU + * the instant associated with the new quantity + * @param unitMapper + * a unit mapper + * @tparam A + * the type of the quantity + * @return + */ + def updateDependents[A <: Quantity[A]]( + q: A, + aggLU: Instant, + unitMapper: UnitMapper): Map[String, Aggregate] = getDependents .map(kv => kv._1 -> kv._2.update(q, aggLU, unitMapper)) .filter(_._2.nonEmpty) .map(kv => kv._1 -> kv._2.get) - def getDependents: Map[String, Aggregate] = this.dependentAggregations + def getDependents: Map[String, Aggregate] = this.dependentAggregations /** - * Update the aggregate with a Quantity. - * - * @param q Quantity[A] - * @param aggLU event timestamp of quantity - * @tparam A dimension of Quantity - * @return Aggregate - */ - def update[A <: Quantity[A]](q: A, aggLU: Instant, unitMapper: UnitMapper): Option[Aggregate] = { + * Update the aggregate with a Quantity. 
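The reformatted `updateQuantity`/`updateDependents` members above feed into the public `update` overloads that follow; the `update(value, unit, aggLU, unitMapper)` entry point returns `Option[Aggregate]`, with `None` on a dimension or unit mismatch. A hedged usage sketch: the "Length" dimension and "m" symbol are assumptions about what the default `UnitMapper` resolves, not something this patch shows.

```scala
import io.epiphanous.flinkrunner.model.aggregate.Aggregate

import java.time.Instant

object AggregateUpdateSketch {
  def main(args: Array[String]): Unit = {
    // Build a Mean aggregate through the companion factory and feed it two
    // observations; each update returns Option[Aggregate].
    val mean0 = Aggregate("Mean", "Length", "m")
    val mean2 = for {
      m1 <- mean0.update(10.0, "m", Instant.now())
      m2 <- m1.update(20.0, "m", Instant.now())
    } yield m2
    println(mean2.map(_.value)) // expected to be Some(15.0) if the units resolve
  }
}
```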
+ * + * @param q + * Quantity[A] + * @param aggLU + * event timestamp of quantity + * @tparam A + * dimension of Quantity + * @return + * Aggregate + */ + def update[A <: Quantity[A]]( + q: A, + aggLU: Instant, + unitMapper: UnitMapper): Option[Aggregate] = { if (q.dimension.name != dimension) { - logger.error(s"$name[$dimension,$unit] can not be updated with (Quantity[${q.dimension.name}]=$q)") + logger.error( + s"$name[$dimension,$unit] can not be updated with (Quantity[${q.dimension.name}]=$q)" + ) None } else { val depAggs = updateDependents(q, aggLU, unitMapper) if (depAggs.size < this.dependentAggregations.size) { - logger.error(s"$name[$dimension,$unit] dependents can not be updated with (Quantity[${q.dimension.name}]=$q)") + logger.error( + s"$name[$dimension,$unit] dependents can not be updated with (Quantity[${q.dimension.name}]=$q)" + ) None } else { unitMapper .createQuantity(q.dimension, value, unit) - .map(current => updateQuantity(current, q, depAggs) in current.unit) match { + .map(current => + updateQuantity(current, q, depAggs) in current.unit + ) match { case Some(updated) => Some(_copy(updated.value, aggLU, depAggs)) - case None => - logger.error(s"$name[$dimension,$unit] can not be updated with (Quantity[${q.dimension.name}]=$q)") + case None => + logger.error( + s"$name[$dimension,$unit] can not be updated with (Quantity[${q.dimension.name}]=$q)" + ) None } } @@ -109,20 +142,24 @@ trait Aggregate extends Product with Serializable with LazyLogging { } /** - * Most common entry point for updating aggregates. - * - * @param value Double value of quantity to update aggregate with - * @param unit String unit of quantity to update aggregate with - * @param aggLU event timestamp of value - * @param unitMapper allows caller to customize unit system mappings - * @return - */ + * Most common entry point for updating aggregates. 
+ * + * @param value + * Double value of quantity to update aggregate with + * @param unit + * String unit of quantity to update aggregate with + * @param aggLU + * event timestamp of value + * @param unitMapper + * allows caller to customize unit system mappings + * @return + */ def update( - value: Double, - unit: String, - aggLU: Instant, - unitMapper: UnitMapper = UnitMapper.defaultUnitMapper - ): Option[Aggregate] = + value: Double, + unit: String, + aggLU: Instant, + unitMapper: UnitMapper = UnitMapper.defaultUnitMapper + ): Option[Aggregate] = unitMapper.updateAggregateWith(this, value, unit, aggLU) def isEmpty: Boolean = count == BigInt(0) @@ -139,91 +176,189 @@ trait Aggregate extends Product with Serializable with LazyLogging { object Aggregate extends LazyLogging { implicit class caseOps(s: String) { - def normalize: String = "[^A-Za-z\\d]".r.replaceAllIn(s, "").toLowerCase() + def normalize: String = + "[^A-Za-z\\d]".r.replaceAllIn(s, "").toLowerCase() } def apply( - name: String, - dimension: String, - unit: String, - value: Double = 0, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map.empty[String, String] - ): Aggregate = { + name: String, + dimension: String, + unit: String, + value: Double = 0, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map.empty[String, String] + ): Aggregate = { val normalizedName = name.normalize - val initValue = if (normalizedName == "min" && count == 0 && value == 0) Double.MaxValue else value + val initValue = + if (normalizedName == "min" && count == 0 && value == 0) + Double.MaxValue + else value normalizedName match { - case "mean" => - Mean(dimension, unit, value, count, aggregatedLastUpdated, lastUpdated) - case "count" => - Count(dimension, unit, value, count, aggregatedLastUpdated, lastUpdated) + case "mean" => + Mean( + dimension, + unit, + value, + count, + aggregatedLastUpdated, + lastUpdated + ) + case "count" => + Count( + dimension, + unit, + value, + count, + aggregatedLastUpdated, + lastUpdated + ) case "exponentialmovingaverage" => - ExponentialMovingAverage(dimension, + ExponentialMovingAverage( + dimension, unit, value, count, aggregatedLastUpdated, lastUpdated, dependentAggregations, - maybeUpdateParams(params, "alpha", ExponentialMovingAverage.defaultAlpha)) + maybeUpdateParams( + params, + "alpha", + ExponentialMovingAverage.defaultAlpha + ) + ) case "exponentialmovingstandarddeviation" => - ExponentialMovingStandardDeviation(dimension, + ExponentialMovingStandardDeviation( + dimension, unit, value, count, aggregatedLastUpdated, lastUpdated, dependentAggregations, - maybeUpdateParams(params, + maybeUpdateParams( + params, "alpha", - ExponentialMovingStandardDeviation.defaultAlpha)) + ExponentialMovingStandardDeviation.defaultAlpha + ) + ) case "exponentialmovingvariance" => - ExponentialMovingVariance(dimension, + ExponentialMovingVariance( + dimension, unit, value, count, aggregatedLastUpdated, lastUpdated, dependentAggregations, - maybeUpdateParams(params, "alpha", ExponentialMovingVariance.defaultAlpha)) + maybeUpdateParams( + params, + "alpha", + ExponentialMovingVariance.defaultAlpha + ) + ) case "histogram" => - Histogram(dimension, unit, value, count, 
aggregatedLastUpdated, lastUpdated, dependentAggregations) + Histogram( + dimension, + unit, + value, + count, + aggregatedLastUpdated, + lastUpdated, + dependentAggregations + ) case "max" => - Max(dimension, unit, value, count, aggregatedLastUpdated, lastUpdated) + Max( + dimension, + unit, + value, + count, + aggregatedLastUpdated, + lastUpdated + ) case "min" => - Min(dimension, unit, initValue, count, aggregatedLastUpdated, lastUpdated) + Min( + dimension, + unit, + initValue, + count, + aggregatedLastUpdated, + lastUpdated + ) case "range" => - Range(dimension, unit, value, count, aggregatedLastUpdated, lastUpdated, dependentAggregations) + Range( + dimension, + unit, + value, + count, + aggregatedLastUpdated, + lastUpdated, + dependentAggregations + ) case "sum" => - Sum(dimension, unit, value, count, aggregatedLastUpdated, lastUpdated) + Sum( + dimension, + unit, + value, + count, + aggregatedLastUpdated, + lastUpdated + ) case "variance" => - Variance(dimension, unit, value, count, aggregatedLastUpdated, lastUpdated, dependentAggregations) + Variance( + dimension, + unit, + value, + count, + aggregatedLastUpdated, + lastUpdated, + dependentAggregations + ) case "standarddeviation" => - StandardDeviation(dimension, unit, value, count, aggregatedLastUpdated, lastUpdated, dependentAggregations) + StandardDeviation( + dimension, + unit, + value, + count, + aggregatedLastUpdated, + lastUpdated, + dependentAggregations + ) case "sumofsquareddeviations" => - SumOfSquaredDeviations(dimension, unit, value, count, aggregatedLastUpdated, lastUpdated, dependentAggregations) - case "percentage" => - Percentage(dimension, + SumOfSquaredDeviations( + dimension, + unit, + value, + count, + aggregatedLastUpdated, + lastUpdated, + dependentAggregations + ) + case "percentage" => + Percentage( + dimension, unit, value, count, aggregatedLastUpdated, lastUpdated, dependentAggregations, - maybeUpdateParams(params, "base", Percentage.defaultBase)) + maybeUpdateParams(params, "base", Percentage.defaultBase) + ) case _ => val message = s"Unknown aggregation type '$name'" @@ -232,6 +367,9 @@ object Aggregate extends LazyLogging { } } - def maybeUpdateParams(map: Map[String, String], key: String, defaultValue: String): Map[String, String] = + def maybeUpdateParams( + map: Map[String, String], + key: String, + defaultValue: String): Map[String, String] = if (map.contains(key)) map else map.updated(key, defaultValue) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Count.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Count.scala index 51ab50d..f67acf3 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Count.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Count.scala @@ -5,21 +5,25 @@ import squants.{Dimensionless, Each, Quantity} import java.time.Instant final case class Count( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map.empty[String, String]) - extends Aggregate { + dimension: String, + unit: String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map.empty[String, String]) + 
extends Aggregate { override def isDimensionless = true override def outUnit: String = Each.symbol - override def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]) = + override def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]) = current + current.unit(1) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/ExponentialMovingAverage.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/ExponentialMovingAverage.scala index 7e7bacc..d0f6c7a 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/ExponentialMovingAverage.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/ExponentialMovingAverage.scala @@ -5,19 +5,27 @@ import squants.Quantity import java.time.Instant final case class ExponentialMovingAverage( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map("alpha" -> ExponentialMovingAverage.defaultAlpha)) - extends Aggregate { - - def alpha: Double = params.getOrElse("alpha", ExponentialMovingAverage.defaultAlpha).toDouble - - override def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]) = + dimension: String, + unit: String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map( + "alpha" -> ExponentialMovingAverage.defaultAlpha + )) + extends Aggregate { + + def alpha: Double = params + .getOrElse("alpha", ExponentialMovingAverage.defaultAlpha) + .toDouble + + override def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]) = if (count == 0) quantity else current * (1 - alpha) + quantity * alpha } @@ -27,6 +35,13 @@ object ExponentialMovingAverage { def defaultAlpha = DEFAULT_ALPHA.toString - def apply(dimension: String, unit: String, alpha: Double): ExponentialMovingAverage = - ExponentialMovingAverage(dimension, unit, params = Map("alpha" -> alpha.toString)) + def apply( + dimension: String, + unit: String, + alpha: Double): ExponentialMovingAverage = + ExponentialMovingAverage( + dimension, + unit, + params = Map("alpha" -> alpha.toString) + ) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/ExponentialMovingStandardDeviation.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/ExponentialMovingStandardDeviation.scala index eab05c0..6f7b43f 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/ExponentialMovingStandardDeviation.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/ExponentialMovingStandardDeviation.scala @@ -5,24 +5,37 @@ import squants.Quantity import java.time.Instant final case class ExponentialMovingStandardDeviation( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map("alpha" -> ExponentialMovingStandardDeviation.defaultAlpha)) - extends Aggregate { + dimension: String, + unit: 
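`ExponentialMovingAverage.updateQuantity` above is the standard EMA recurrence. A standalone sketch of the same arithmetic on plain doubles (alpha = 0.9 is an arbitrary choice for the example, not the library default):

```scala
object EmaSketch {

  // ema_0 = x_0 ; ema_n = (1 - alpha) * ema_(n-1) + alpha * x_n
  def ema(values: Seq[Double], alpha: Double): Double = {
    require(values.nonEmpty && alpha > 0 && alpha <= 1)
    values.tail.foldLeft(values.head)((prev, x) => (1 - alpha) * prev + alpha * x)
  }

  def main(args: Array[String]): Unit =
    println(ema(Seq(10.0, 12.0, 11.0, 13.0), alpha = 0.9)) // recent values dominate
}
```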
String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map( + "alpha" -> ExponentialMovingStandardDeviation.defaultAlpha + )) + extends Aggregate { override def getDependents = { if (this.dependentAggregations.isEmpty) - Map("ExponentialMovingVariance" -> ExponentialMovingVariance(dimension, unit, params = params)) + Map( + "ExponentialMovingVariance" -> ExponentialMovingVariance( + dimension, + unit, + params = params + ) + ) else this.dependentAggregations } - override def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]) = { - if (count == 0) current.unit(0d) else { + override def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]) = { + if (count == 0) current.unit(0d) + else { val updatedEmv = depAggs("ExponentialMovingVariance") current.unit(Math.sqrt(updatedEmv.value)) } @@ -35,13 +48,20 @@ object ExponentialMovingStandardDeviation { def defaultAlpha = DEFAULT_ALPHA.toString - def apply(dimension: String, unit: String, alpha: Double): ExponentialMovingStandardDeviation = - ExponentialMovingStandardDeviation(dimension, + def apply( + dimension: String, + unit: String, + alpha: Double): ExponentialMovingStandardDeviation = + ExponentialMovingStandardDeviation( + dimension, unit, dependentAggregations = Map( - "ExponentialMovingVariance" -> ExponentialMovingVariance(dimension, + "ExponentialMovingVariance" -> ExponentialMovingVariance( + dimension, unit, - alpha) + alpha + ) ), - params = Map("alpha" -> alpha.toString)) + params = Map("alpha" -> alpha.toString) + ) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/ExponentialMovingVariance.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/ExponentialMovingVariance.scala index 3cd6a94..e665e0f 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/ExponentialMovingVariance.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/ExponentialMovingVariance.scala @@ -5,29 +5,44 @@ import squants.Quantity import java.time.Instant final case class ExponentialMovingVariance( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map("alpha" -> ExponentialMovingVariance.defaultAlpha)) - extends Aggregate { - - def alpha = params.getOrElse("alpha", ExponentialMovingVariance.defaultAlpha).toDouble + dimension: String, + unit: String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map( + "alpha" -> ExponentialMovingVariance.defaultAlpha + )) + extends Aggregate { + + def alpha = params + .getOrElse("alpha", ExponentialMovingVariance.defaultAlpha) + .toDouble override def getDependents = { if (this.dependentAggregations.isEmpty) - Map("ExponentialMovingAverage" -> ExponentialMovingAverage(dimension, unit, params = params)) + Map( + "ExponentialMovingAverage" -> ExponentialMovingAverage( + dimension, + unit, + params = params + ) + ) else 
this.dependentAggregations } - override def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]) = { - if (count == 0) quantity.unit(0d) else { + override def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]) = { + if (count == 0) quantity.unit(0d) + else { val currentEma = getDependents("ExponentialMovingAverage") - val q = quantity in current.unit - val delta = q - current.unit(currentEma.value) + val q = quantity in current.unit + val delta = q - current.unit(currentEma.value) (1 - alpha) * (current + delta * delta.value * alpha) } } @@ -39,7 +54,14 @@ object ExponentialMovingVariance { def defaultAlpha = DEFAULT_ALPHA.toString - def apply(dimension: String, unit: String, alpha: Double): ExponentialMovingVariance = - ExponentialMovingVariance(dimension, unit, params = Map("alpha" -> alpha.toString)) + def apply( + dimension: String, + unit: String, + alpha: Double): ExponentialMovingVariance = + ExponentialMovingVariance( + dimension, + unit, + params = Map("alpha" -> alpha.toString) + ) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Histogram.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Histogram.scala index 6b9ae3d..18552d3 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Histogram.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Histogram.scala @@ -6,68 +6,82 @@ import squants.Quantity import java.time.Instant final case class Histogram( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map.empty[String, String]) - extends Aggregate { + dimension: String, + unit: String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map.empty[String, String]) + extends Aggregate { import Histogram._ - def bin(key: String): Aggregate = this.dependentAggregations.getOrElse(key, Count(dimension, unit)) + def bin(key: String): Aggregate = + this.dependentAggregations.getOrElse(key, Count(dimension, unit)) - /** Compute a dynamic bin for the requested quantity. This picks a bin - * based on the order of magnitude of the quantity in the aggregate's preferred unit. - * If the order of magnitude is 3 (say the value is 2345) - * For instance if the quantity value is 0.00157, its order of magnitude is -3. We - * reduce that in absolute value by 1 (= -2) to compute the min and max of the bin as - * [floor(0.0157 * 10**2)/10**2 (= 0.01) and - * ceil(0.0157 * 10**2)/10**2 (= 0.02). - * - * @param q the quantity to compute a bin of - * @return - */ + /** + * Compute a dynamic bin for the requested quantity. This picks a bin + * based on the order of magnitude of the quantity in the aggregate's + * preferred unit. If the order of magnitude is 3 (say the value is 2345) + * For instance if the quantity value is 0.00157, its order of magnitude + * is -3. We reduce that in absolute value by 1 (= -2) to compute the min + * and max of the bin as [floor(0.0157 * 10**2)/10**2 (= 0.01) and + * ceil(0.0157 * 10**2)/10**2 (= 0.02). 
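The dynamic binning described in this comment (and implemented in `binOf` just below) picks a power-of-ten bin width from the value's order of magnitude. A standalone port of that arithmetic on plain doubles; the tolerance constant is an assumption, since `Histogram.TOL` is not visible in this hunk:

```scala
object BinSketch {

  val TOL = 1e-9 // assumed; the real value lives in the Histogram companion

  /** Returns the (min, max) of the power-of-ten bin containing d, following
    * the same steps as Histogram.binOf (without the unit conversion).
    */
  def binOf(d: Double): (Double, Double) = {
    val absd      = math.abs(d)
    val magnitude = math.floor(math.log10(if (absd < TOL) TOL else absd)).toInt
    val sign      = math.signum(magnitude)
    val mag       = sign * (math.abs(magnitude) - 1)
    val pow       = math.pow(10, mag.toDouble)
    (math.floor(d / pow) * pow, math.ceil(d / pow) * pow)
  }

  def main(args: Array[String]): Unit = {
    println(binOf(2345.0))  // a bin of width 100
    println(binOf(0.00157)) // a bin of width 0.01
  }
}
```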
+ * + * @param q + * the quantity to compute a bin of + * @return + */ def binOf[A <: Quantity[A]](q: A, unitMapper: UnitMapper) = { unitMapper .createQuantity(q.dimension, value, unit) .map(_.unit) .map(u => (q in u).value) - .map(d => { - val absd = math.abs(d) - val magnitude = + .map { d => + val absd = math.abs(d) + val magnitude = math.floor(math.log10(if (absd < TOL) TOL else absd)).toInt - val sign = math.signum(magnitude) - val abs = math.abs(magnitude) - val mag = sign * (abs - 1) - val pow = math.pow(10, mag.toDouble) - val min = math.floor(d / pow) * pow - val max = math.ceil(d / pow) * pow + val sign = math.signum(magnitude) + val abs = math.abs(magnitude) + val mag = sign * (abs - 1) + val pow = math.pow(10, mag.toDouble) + val min = math.floor(d / pow) * pow + val max = math.ceil(d / pow) * pow val formatString = if (abs < 8) { - val fs = s"%${if (sign < 0) "." else ""}$abs${if (sign > 0) ".0" else ""}" + val fs = + s"%${if (sign < 0) "." else ""}$abs${if (sign > 0) ".0" else ""}" s"${fs}f,${fs}f" } else { "%e,%e" } formatString.format(min, max) - }) + } } - override def update[A <: Quantity[A]](q: A, aggLU: Instant, unitMapper: UnitMapper) = + override def update[A <: Quantity[A]]( + q: A, + aggLU: Instant, + unitMapper: UnitMapper) = binOf(q, unitMapper) match { case Some(binKey) => bin(binKey) .update(q.value, q.unit.symbol, aggLU, unitMapper) match { - case Some(updatedBin) => Some(copy(dependentAggregations = dependentAggregations.updated(binKey, updatedBin))) - case None => { - logger.error(s"$name[$dimension,$unit] Quantity[$q] can't be binned") + case Some(updatedBin) => + Some( + copy(dependentAggregations = + dependentAggregations.updated(binKey, updatedBin) + ) + ) + case None => + logger.error( + s"$name[$dimension,$unit] Quantity[$q] can't be binned" + ) None - } } - case None => + case None => logger.error(s"$name[$dimension,$unit] can't be updated with $q") None } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Max.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Max.scala index 5c84f28..6848a7d 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Max.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Max.scala @@ -5,17 +5,21 @@ import squants.Quantity import java.time.Instant final case class Max( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map.empty[String, String]) - extends Aggregate { + dimension: String, + unit: String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map.empty[String, String]) + extends Aggregate { - override def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]) = + override def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]) = if (count == 0) quantity else current.max(quantity) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Mean.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Mean.scala index 4fd281c..1293d60 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Mean.scala 
+++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Mean.scala @@ -5,17 +5,21 @@ import squants.Quantity import java.time.Instant final case class Mean( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map.empty[String, String]) - extends Aggregate { + dimension: String, + unit: String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map.empty[String, String]) + extends Aggregate { - override def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]) = { + override def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]) = { val n = count.doubleValue() (current * n + quantity) / (n + 1) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Min.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Min.scala index a38b7e8..04ad344 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Min.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Min.scala @@ -5,17 +5,21 @@ import squants.Quantity import java.time.Instant final case class Min( - dimension: String, - unit: String, - value: Double = Double.MaxValue, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map.empty[String, String]) - extends Aggregate { + dimension: String, + unit: String, + value: Double = Double.MaxValue, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map.empty[String, String]) + extends Aggregate { - override def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]) = + override def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]) = if (count == 0) quantity else current.min(quantity) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Percentage.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Percentage.scala index 9fe0d5f..bea5aad 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Percentage.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Percentage.scala @@ -6,33 +6,46 @@ import squants.{Percent, Quantity} import java.time.Instant final case class Percentage( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map("base" -> Percentage.defaultBase)) - extends Aggregate { + dimension: String, + unit: String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: 
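`Mean.updateQuantity` above applies the usual incremental mean update, new mean = (mean * n + x) / (n + 1). A standalone sketch of the same recurrence on plain doubles:

```scala
object RunningMeanSketch {

  // Fold a (mean, count) pair over the observations with Mean's update rule.
  def runningMean(values: Seq[Double]): Double =
    values
      .foldLeft((0.0, 0L)) { case ((mean, n), x) =>
        ((mean * n + x) / (n + 1), n + 1)
      }
      ._1

  def main(args: Array[String]): Unit =
    println(runningMean(Seq(2.0, 4.0, 6.0))) // 4.0
}
```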
Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map("base" -> Percentage.defaultBase)) + extends Aggregate { override def isDimensionless = true override def outUnit = Percent.symbol - val baseParam: Double = params.getOrElse("base", Percentage.defaultBase).toDouble + val baseParam: Double = + params.getOrElse("base", Percentage.defaultBase).toDouble def baseQuantity[A <: Quantity[A]](q: A, unitMapper: UnitMapper) = unitMapper.createQuantity(q.dimension, baseParam, unit) - override def update[A <: Quantity[A]](q: A, aggLU: Instant, unitMapper: UnitMapper) = { + override def update[A <: Quantity[A]]( + q: A, + aggLU: Instant, + unitMapper: UnitMapper) = { val updateValue = baseQuantity(q, unitMapper).map(b => q / b) match { case Some(addValue) => addValue * 100.0 - case None => - logger.error(s"$name[$dimension,$unit] can not be updated with (Quantity[${q.dimension.name}]=$q)") + case None => + logger.error( + s"$name[$dimension,$unit] can not be updated with (Quantity[${q.dimension.name}]=$q)" + ) 0d } - Some(copy(value = this.value + updateValue, count = count + 1, aggregatedLastUpdated = aggLU)) + Some( + copy( + value = this.value + updateValue, + count = count + 1, + aggregatedLastUpdated = aggLU + ) + ) } } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Range.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Range.scala index 0fc2a49..2c234c7 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Range.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Range.scala @@ -5,15 +5,16 @@ import squants.Quantity import java.time.Instant final case class Range( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map.empty[String, String]) - extends Aggregate { + dimension: String, + unit: String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map.empty[String, String]) + extends Aggregate { override def getDependents = { if (this.dependentAggregations.isEmpty) @@ -21,6 +22,10 @@ final case class Range( else this.dependentAggregations } - override def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]) = - if (count == 0) current else current.unit(depAggs("Max").value - depAggs("Min").value) + override def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]) = + if (count == 0) current + else current.unit(depAggs("Max").value - depAggs("Min").value) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/StandardDeviation.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/StandardDeviation.scala index de104cb..17ca4ff 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/StandardDeviation.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/StandardDeviation.scala @@ -5,18 +5,23 @@ import squants.Quantity import java.time.Instant final case class StandardDeviation( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - 
lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map.empty[String, String]) - extends Aggregate { + dimension: String, + unit: String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map.empty[String, String]) + extends Aggregate { - override def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]) = { - if (count == 0) current else { + override def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]) = { + if (count == 0) current + else { val updatedVariance = depAggs("Variance") current.unit(Math.sqrt(updatedVariance.value)) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Sum.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Sum.scala index fbe1199..0d8db5a 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Sum.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Sum.scala @@ -5,17 +5,21 @@ import squants.Quantity import java.time.Instant final case class Sum( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map.empty[String, String]) - extends Aggregate { + dimension: String, + unit: String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map.empty[String, String]) + extends Aggregate { - override def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]) = + override def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]) = current + quantity } diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/SumOfSquaredDeviations.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/SumOfSquaredDeviations.scala index 42a2231..b9fbff7 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/SumOfSquaredDeviations.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/SumOfSquaredDeviations.scala @@ -5,15 +5,16 @@ import squants.Quantity import java.time.Instant final case class SumOfSquaredDeviations( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map.empty[String, String]) - extends Aggregate { + dimension: String, + unit: String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map.empty[String, String]) + extends Aggregate { override def getDependents = { if (this.dependentAggregations.isEmpty) @@ -22,8 +23,11 @@ final case 
class SumOfSquaredDeviations( } // see https://www.johndcook.com/blog/standard_deviation/ - override def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]) = { - val q = quantity in current.unit + override def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]) = { + val q = quantity in current.unit val currentMean = q.unit(getDependents("Mean").value) val updatedMean = q.unit(depAggs("Mean").value) current + (q - currentMean) * (q - updatedMean).value diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Variance.scala b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Variance.scala index 2f9663e..2186815 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Variance.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/aggregate/Variance.scala @@ -5,24 +5,31 @@ import squants.Quantity import java.time.Instant final case class Variance( - dimension: String, - unit: String, - value: Double = 0d, - count: BigInt = BigInt(0), - aggregatedLastUpdated: Instant = Instant.EPOCH, - lastUpdated: Instant = Instant.now(), - dependentAggregations: Map[String, Aggregate] = Map.empty[String, Aggregate], - params: Map[String, String] = Map.empty[String, String]) - extends Aggregate { + dimension: String, + unit: String, + value: Double = 0d, + count: BigInt = BigInt(0), + aggregatedLastUpdated: Instant = Instant.EPOCH, + lastUpdated: Instant = Instant.now(), + dependentAggregations: Map[String, Aggregate] = + Map.empty[String, Aggregate], + params: Map[String, String] = Map.empty[String, String]) + extends Aggregate { override def getDependents = { if (this.dependentAggregations.isEmpty) - Map("SumOfSquaredDeviations" -> SumOfSquaredDeviations(dimension, unit)) + Map( + "SumOfSquaredDeviations" -> SumOfSquaredDeviations(dimension, unit) + ) else this.dependentAggregations } - override def updateQuantity[A <: Quantity[A]](current: A, quantity: A, depAggs: Map[String, Aggregate]) = { - if (count < 2) current else { + override def updateQuantity[A <: Quantity[A]]( + current: A, + quantity: A, + depAggs: Map[String, Aggregate]) = { + if (count < 2) current + else { val k = count.doubleValue() val s = current.unit(depAggs("SumOfSquaredDeviations").value) s / k diff --git a/src/main/scala/io/epiphanous/flinkrunner/operator/AddToJdbcBatchFunction.scala b/src/main/scala/io/epiphanous/flinkrunner/operator/AddToJdbcBatchFunction.scala index 85758bf..b546699 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/operator/AddToJdbcBatchFunction.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/operator/AddToJdbcBatchFunction.scala @@ -1,10 +1,8 @@ package io.epiphanous.flinkrunner.operator -import io.epiphanous.flinkrunner.model.FlinkEvent - import java.sql.PreparedStatement -abstract class AddToJdbcBatchFunction[E <: FlinkEvent] { +abstract class AddToJdbcBatchFunction[E] { def addToJdbcStatement(row: E, ps: PreparedStatement): Unit diff --git a/src/main/scala/io/epiphanous/flinkrunner/operator/EnrichmentAsyncFunction.scala b/src/main/scala/io/epiphanous/flinkrunner/operator/EnrichmentAsyncFunction.scala index a3a85b8..07b27a7 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/operator/EnrichmentAsyncFunction.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/operator/EnrichmentAsyncFunction.scala @@ -4,7 +4,7 @@ import cats.effect.{ContextShift, IO, Timer} import com.google.common.cache.{CacheBuilder, CacheLoader} import com.typesafe.scalalogging.LazyLogging import 
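`SumOfSquaredDeviations` above (with `Mean` as its dependent) follows the running-variance recurrence from the linked John D. Cook article: S_k = S_(k-1) + (x_k - m_(k-1)) * (x_k - m_k). A standalone sketch on plain doubles; it uses the textbook sample-variance divisor (k - 1), whereas the divisor in `Variance` depends on how `count` is tracked at update time:

```scala
object WelfordSketch {

  // Welford's online algorithm: track (count, mean, sum of squared deviations).
  final case class Acc(n: Long = 0L, mean: Double = 0.0, s: Double = 0.0) {
    def add(x: Double): Acc = {
      val n1    = n + 1
      val mean1 = mean + (x - mean) / n1
      Acc(n1, mean1, s + (x - mean) * (x - mean1))
    }
    def sampleVariance: Double = if (n < 2) 0.0 else s / (n - 1)
  }

  def main(args: Array[String]): Unit = {
    val acc = Seq(2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0).foldLeft(Acc())(_.add(_))
    println(acc.mean)           // 5.0
    println(acc.sampleVariance) // ~4.571 (the population variance would be 4.0)
  }
}
```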
io.circe.Decoder -import io.epiphanous.flinkrunner.model.FlinkConfig +import io.epiphanous.flinkrunner.model.{FlinkConfig, FlinkEvent} import org.apache.flink.runtime.concurrent.Executors.directExecutionContext import org.apache.flink.streaming.api.scala.async.{ AsyncFunction, @@ -58,11 +58,16 @@ import scala.util.{Failure, Success, Try} * @tparam CV * the cache value type */ -abstract class EnrichmentAsyncFunction[IN, OUT, CV <: AnyRef]( +abstract class EnrichmentAsyncFunction[ + IN, + OUT, + CV <: AnyRef, + ADT <: FlinkEvent]( configPrefix: String, cacheLoaderOpt: Option[CacheLoader[String, Option[CV]]] = None, - preloaded: Map[String, CV] = Map.empty[String, CV] -)(implicit config: FlinkConfig, decoder: Decoder[CV]) + preloaded: Map[String, CV] = Map.empty[String, CV], + config: FlinkConfig[ADT] +)(implicit decoder: Decoder[CV]) extends AsyncFunction[IN, OUT] with LazyLogging { diff --git a/src/main/scala/io/epiphanous/flinkrunner/util/BoundedLatenessWatermarkStrategy.scala b/src/main/scala/io/epiphanous/flinkrunner/util/BoundedLatenessWatermarkStrategy.scala index 951c3d8..ee18fd8 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/util/BoundedLatenessWatermarkStrategy.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/util/BoundedLatenessWatermarkStrategy.scala @@ -2,16 +2,17 @@ package io.epiphanous.flinkrunner.util import io.epiphanous.flinkrunner.model.FlinkEvent import org.apache.flink.api.common.eventtime.{ - WatermarkGenerator, WatermarkGeneratorSupplier, WatermarkStrategy } +import java.time.Duration + class BoundedLatenessWatermarkStrategy[E <: FlinkEvent]( - val maxAllowedLateness: Long, + val maxAllowedLateness: Duration, val streamID: String) extends WatermarkStrategy[E] { override def createWatermarkGenerator( - context: WatermarkGeneratorSupplier.Context): WatermarkGenerator[E] = - new BoundedLatenessGenerator[E](maxAllowedLateness, streamID) + context: WatermarkGeneratorSupplier.Context) = + new BoundedLatenessGenerator(maxAllowedLateness.toMillis, streamID) } diff --git a/src/main/scala/io/epiphanous/flinkrunner/util/JdbcSink.scala b/src/main/scala/io/epiphanous/flinkrunner/util/JdbcSink.scala index 8075d88..bfa4657 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/util/JdbcSink.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/util/JdbcSink.scala @@ -1,7 +1,7 @@ package io.epiphanous.flinkrunner.util import com.typesafe.scalalogging.LazyLogging -import io.epiphanous.flinkrunner.model.FlinkEvent +import io.epiphanous.flinkrunner.model.JdbcSinkConfig import io.epiphanous.flinkrunner.operator.AddToJdbcBatchFunction import org.apache.flink.api.common.state.{ListState, ListStateDescriptor} import org.apache.flink.api.common.typeinfo.TypeInformation @@ -16,7 +16,6 @@ import org.apache.flink.streaming.api.functions.sink.SinkFunction.Context import org.apache.flink.streaming.api.scala._ import java.sql.{Connection, DriverManager, PreparedStatement} -import java.util.Properties import scala.collection.JavaConverters._ import scala.collection.mutable.ListBuffer import scala.util.{Failure, Success, Try} @@ -37,13 +36,15 @@ import scala.util.{Failure, Success, Try} * @tparam E * the class of sink elements. 
*/ -class JdbcSink[E <: FlinkEvent: TypeInformation]( - batchFunction: AddToJdbcBatchFunction[E], - props: Properties) - extends RichSinkFunction[E] +class JdbcSink[E: TypeInformation]( + sinkConfig: JdbcSinkConfig, + batchFunction: AddToJdbcBatchFunction[E] +) extends RichSinkFunction[E] with CheckpointedFunction with LazyLogging { + val props = sinkConfig.properties + val bufferSize = props.getProperty("buffer.size").toInt private val pendingRows = ListBuffer.empty[E] private var connection: Connection = _ diff --git a/src/main/scala/io/epiphanous/flinkrunner/util/StreamUtils.scala b/src/main/scala/io/epiphanous/flinkrunner/util/StreamUtils.scala index 6ae444e..3a4a810 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/util/StreamUtils.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/util/StreamUtils.scala @@ -1,64 +1,9 @@ package io.epiphanous.flinkrunner.util import com.typesafe.scalalogging.LazyLogging -import io.epiphanous.flinkrunner.SEE -import io.epiphanous.flinkrunner.model._ -import io.epiphanous.flinkrunner.operator.AddToJdbcBatchFunction -import org.apache.flink.api.common.eventtime.WatermarkStrategy -import org.apache.flink.api.common.functions.RuntimeContext -import org.apache.flink.api.common.serialization.{ - DeserializationSchema, - Encoder, - SerializationSchema -} -import org.apache.flink.api.common.typeinfo.TypeInformation -import org.apache.flink.core.fs.Path -import org.apache.flink.streaming.api.TimeCharacteristic -import org.apache.flink.streaming.api.datastream.DataStreamSink -import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.{ - BasePathBucketAssigner, - DateTimeBucketAssigner -} -import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.{ - DefaultRollingPolicy, - OnCheckpointRollingPolicy -} -import org.apache.flink.streaming.api.functions.sink.filesystem.{ - BucketAssigner, - StreamingFileSink -} -import org.apache.flink.streaming.api.scala._ -import org.apache.flink.streaming.connectors.cassandra.CassandraSink -import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer -import org.apache.flink.streaming.connectors.elasticsearch7.ElasticsearchSink -import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.Semantic -import org.apache.flink.streaming.connectors.kafka.{ - FlinkKafkaConsumer, - FlinkKafkaProducer, - KafkaDeserializationSchema, - KafkaSerializationSchema -} -import org.apache.flink.streaming.connectors.kinesis.serialization.{ - KinesisDeserializationSchema, - KinesisSerializationSchema -} -import org.apache.flink.streaming.connectors.kinesis.{ - FlinkKinesisConsumer, - FlinkKinesisProducer -} -import org.apache.http.HttpHost -import org.elasticsearch.client.Requests - -import java.io.{File, FileNotFoundException} -import java.net.URL -import java.nio.charset.StandardCharsets -import scala.collection.JavaConverters._ -import scala.util.matching.Regex object StreamUtils extends LazyLogging { - val RESOURCE_PATTERN: Regex = "resource://(.*)".r - /** * A little syntactic sugar for writing stream program. This is the pipe * operator, ala F#. @@ -91,619 +36,4 @@ object StreamUtils extends LazyLogging { } } - /** - * Generates a timestamp and watermark assigner for a stream with a given - * type of element that limits how late an element is allowed to arrive - * in event time. 
- * - * @param config - * implicitly provided job config - * @tparam E - * the type of stream element - * @return - * BoundedLatenessGenerator[E] - */ - def boundedLatenessWatermarks[E <: FlinkEvent: TypeInformation]( - streamID: String - )(implicit config: FlinkConfig) = - new BoundedLatenessWatermarkStrategy[E]( - config.maxLateness.toMillis, - streamID - ) - - /** - * Create a bounded of order watermark strategy with idleness checking - * - * @param config - * implicitly provided job config - * @tparam E - * the type of stream element - * @return - * BoundedLatenessGenerator[E] - */ - def boundedOutofOrdernessWatermarks[E <: FlinkEvent: TypeInformation]()( - implicit config: FlinkConfig): WatermarkStrategy[E] = - WatermarkStrategy - .forBoundedOutOfOrderness(config.maxLateness) - .withIdleness(config.maxIdleness) - - /** - * Creates an ascending timestamp watermark strategy. - * @tparam E - * type of stream element - * @return - * AscendingTimestampExtractor[E] - */ - def ascendingTimestampsWatermarks[E <: FlinkEvent: TypeInformation]() - : WatermarkStrategy[E] = WatermarkStrategy.forMonotonousTimestamps() - - /** - * Assign timestamps/watermarks if we're using event time - * @param in - * the input stream to watermark - * @param config - * implicit flink configuration - * @param env - * implicit stream execution environment - * @tparam E - * event type - * @return - * the possibly watermarked input stream - */ - def maybeAssignTimestampsAndWatermarks[E <: FlinkEvent: TypeInformation]( - in: DataStream[E], - srcConfig: SourceConfig - )(implicit config: FlinkConfig, env: SEE): DataStream[E] = - if (srcConfig.timeCharacteristic == TimeCharacteristic.EventTime) { - in.assignTimestampsAndWatermarks(srcConfig.watermarkStrategy match { - case "bounded out of orderness" => - boundedOutofOrdernessWatermarks() - case "ascending timestamps" => ascendingTimestampsWatermarks() - case _ => boundedLatenessWatermarks(in.name) - }).name(s"wm:${in.name}") - .uid(s"wm:${in.name}") - } else in - - /** - * Configure stream source from configuration. - * - * @param sourceName - * the name of the source to get its configuration - * @tparam E - * stream element type - * @return - * DataStream[E] - */ - def fromSource[E <: FlinkEvent: TypeInformation]( - sourceName: String = "" - )(implicit config: FlinkConfig, env: SEE): DataStream[E] = { - val name = - if (sourceName.isEmpty) config.getSourceNames.head else sourceName - val src = config.getSourceConfig(name) - val uid = src.label - val stream = (src match { - case src: KafkaSourceConfig => fromKafka(src) - case src: KinesisSourceConfig => fromKinesis(src) - case src: FileSourceConfig => fromFile(src) - case src: SocketSourceConfig => fromSocket(src) - case src: CollectionSourceConfig => fromCollection(src) - case src => - throw new IllegalArgumentException( - s"unsupported source connector: ${src.connector}" - ) - }).name(uid).uid(uid) - maybeAssignTimestampsAndWatermarks(stream, src) - } - - /** - * Configure stream from kafka source. 
- * - * @param srcConfig - * a source config - * @param config - * implicitly provided job config - * @tparam E - * stream element type - * @return - * DataStream[E] - */ - def fromKafka[E <: FlinkEvent: TypeInformation]( - srcConfig: KafkaSourceConfig - )(implicit config: FlinkConfig, env: SEE): DataStream[E] = { - val consumer = - new FlinkKafkaConsumer[E]( - srcConfig.topic, - config - .getKafkaDeserializationSchema(srcConfig.name) - .asInstanceOf[KafkaDeserializationSchema[E]], - srcConfig.properties - ) - env - .addSource(consumer) - } - - /** - * Configure stream from kinesis. - * - * @param srcConfig - * a source config - * @param config - * implicitly provided job config - * @tparam E - * stream element type - * @return - * DataStream[E] - */ - def fromKinesis[E <: FlinkEvent: TypeInformation]( - srcConfig: KinesisSourceConfig - )(implicit config: FlinkConfig, env: SEE): DataStream[E] = { - val consumer = - new FlinkKinesisConsumer[E]( - srcConfig.stream, - config - .getKinesisDeserializationSchema(srcConfig.name) - .asInstanceOf[KinesisDeserializationSchema[E]], - srcConfig.properties - ) - env - .addSource(consumer) - .name(srcConfig.label) - } - - /** - * Configure stream from file source. - * - * @param srcConfig - * a source config - * @param config - * implicitly provided job config - * @tparam E - * stream element type - * @return - * DataStream[E] - */ - def fromFile[E <: FlinkEvent: TypeInformation]( - srcConfig: FileSourceConfig - )(implicit config: FlinkConfig, env: SEE): DataStream[E] = { - val path = srcConfig.path match { - case RESOURCE_PATTERN(p) => getSourceFilePath(p) - case other => other - } - val ds = config - .getDeserializationSchema(srcConfig.name) - .asInstanceOf[DeserializationSchema[E]] - env - .readTextFile(path) - .name(s"raw:${srcConfig.label}") - .uid(s"raw:${srcConfig.label}") - .map(line => ds.deserialize(line.getBytes(StandardCharsets.UTF_8))) - } - - /** - * Configure stream from socket source. - * - * @param srcConfig - * a source config - * @param config - * implicitly provided job config - * @tparam E - * stream element type - * @return - * DataStream[E] - */ - def fromSocket[E <: FlinkEvent: TypeInformation]( - srcConfig: SocketSourceConfig - )(implicit config: FlinkConfig, env: SEE): DataStream[E] = - env - .socketTextStream(srcConfig.host, srcConfig.port) - .name(s"raw:${srcConfig.label}") - .uid(s"raw:${srcConfig.label}") - .map(line => - config - .getDeserializationSchema(srcConfig.name) - .asInstanceOf[DeserializationSchema[E]] - .deserialize(line.getBytes(StandardCharsets.UTF_8)) - ) - - /** - * Configure stream from collection source. - * - * @param srcConfig - * a source config - * @param config - * implicitly provided job config - * @tparam E - * stream element type - * @return - * DataStream[E] - */ - def fromCollection[E <: FlinkEvent: TypeInformation]( - srcConfig: CollectionSourceConfig - )(implicit config: FlinkConfig, env: SEE): DataStream[E] = - env - .fromCollection[Array[Byte]]( - config.getCollectionSource(srcConfig.topic) - ) - .name(s"raw:${srcConfig.label}") - .uid(s"raw:${srcConfig.label}") - .map(bytes => - config - .getDeserializationSchema(srcConfig.name) - .asInstanceOf[DeserializationSchema[E]] - .deserialize(bytes) - ) - - /** - * Returns the actual path to a resource file named filename or - * filename.gz. 
- * - * @param filename - * the name of file - * @return - * String - */ - @throws[FileNotFoundException] - def getSourceFilePath(filename: String): String = { - val loader = getClass - val resource = Option(loader.getResource(filename)) match { - case Some(value) => value.toURI - case None => - Option(loader.getResource(s"$filename.gz")) match { - case Some(value) => value.toURI - case None => - throw new FileNotFoundException( - s"can't load resource $filename" - ) - } - } - val file = new File(resource) - file.getAbsolutePath - } - - implicit class EventStreamOps[E <: FlinkEvent: TypeInformation]( - stream: DataStream[E]) { - - def as[T <: FlinkEvent: TypeInformation]: DataStream[T] = { - val name = stream.name - stream - .filter((e: E) => e.isInstanceOf[T @unchecked]) - .name(s"filter types $name") - .uid(s"filter types $name") - .map((e: E) => e.asInstanceOf[T @unchecked]) - .name(s"cast types $name") - .uid(s"cast types $name") - } - - def toSink(sinkName: String = "")(implicit config: FlinkConfig) = - StreamUtils.toSink[E](stream, sinkName) - - } - - /** - * Configure stream sink from configuration. - * - * @param stream - * the data stream to send to sink - * @param sinkName - * a sink name to obtain configuration - * @param config - * implicit flink job args - * @tparam E - * stream element type - * @return - * DataStream[E] - */ - def toSink[E <: FlinkEvent: TypeInformation]( - stream: DataStream[E], - sinkName: String = "" - )(implicit config: FlinkConfig) = { - val name = if (sinkName.isEmpty) config.getSinkNames.head else sinkName - config.getSinkConfig(name) match { - case s: KafkaSinkConfig => toKafka[E](stream, s) - case s: KinesisSinkConfig => toKinesis[E](stream, s) - case s: FileSinkConfig => toFile[E](stream, s) - case s: SocketSinkConfig => toSocket[E](stream, s) - case s: JdbcSinkConfig => toJdbc[E](stream, s) - case s: CassandraSinkConfig => toCassandraSink[E](stream, s) - case s: ElasticsearchSinkConfig => toElasticsearchSink[E](stream, s) - case s => - throw new IllegalArgumentException( - s"unsupported source connector: ${s.connector}" - ) - } - } - - /** - * Send stream to a kafka sink. - * - * @param stream - * the data stream - * @param sinkConfig - * a sink configuration - * @param config - * implicit job args - * @tparam E - * stream element type - * @return - * DataStreamSink[E] - */ - def toKafka[E <: FlinkEvent: TypeInformation]( - stream: DataStream[E], - sinkConfig: KafkaSinkConfig - )(implicit config: FlinkConfig): DataStreamSink[E] = - stream - .addSink( - new FlinkKafkaProducer[E]( - sinkConfig.topic, - config - .getKafkaSerializationSchema(sinkConfig.name) - .asInstanceOf[KafkaSerializationSchema[E]], - sinkConfig.properties, - Semantic.AT_LEAST_ONCE - ) - ) - .uid(sinkConfig.label) - .name(sinkConfig.label) - - /** - * Send stream to a kinesis sink. 
- * - * @param stream - * the data stream - * @param sinkConfig - * a sink configuration - * @param config - * implicit job args - * @tparam E - * stream element type - * @return - * DataStreamSink[E] - */ - def toKinesis[E <: FlinkEvent: TypeInformation]( - stream: DataStream[E], - sinkConfig: KinesisSinkConfig - )(implicit config: FlinkConfig): DataStreamSink[E] = - stream - .addSink { - val sink = - new FlinkKinesisProducer[E]( - config - .getKinesisSerializationSchema(sinkConfig.name) - .asInstanceOf[KinesisSerializationSchema[E]], - sinkConfig.properties - ) - sink.setDefaultStream(sinkConfig.stream) - sink.setFailOnError(true) - sink.setDefaultPartition("0") - sink - } - .uid(sinkConfig.label) - .name(sinkConfig.label) - - /** - * Send stream to a socket sink. - * - * @param stream - * the data stream - * @param sinkConfig - * a sink configuration - * @param config - * implicit job args - * @tparam E - * stream element type - * @return - * DataStreamSink[E] - */ - def toJdbc[E <: FlinkEvent: TypeInformation]( - stream: DataStream[E], - sinkConfig: JdbcSinkConfig - )(implicit config: FlinkConfig): DataStreamSink[E] = - stream - .addSink( - new JdbcSink( - config - .getAddToJdbcBatchFunction(sinkConfig.name) - .asInstanceOf[AddToJdbcBatchFunction[E]], - sinkConfig.properties - ) - ) - .uid(sinkConfig.label) - .name(sinkConfig.label) - - /** - * Send stream to a rolling file sink. - * - * @param stream - * the data stream - * @param sinkConfig - * a sink configuration - * @param config - * implicit job args - * @tparam E - * stream element type - * @return - * DataStreamSink[E] - */ - def toFile[E <: FlinkEvent: TypeInformation]( - stream: DataStream[E], - sinkConfig: FileSinkConfig - )(implicit config: FlinkConfig): DataStreamSink[E] = { - val path = sinkConfig.path - val p = sinkConfig.properties - val bucketCheckInterval = - p.getProperty("bucket.check.interval", s"${60000}").toLong - val bucketAssigner = - p.getProperty("bucket.assigner.type", "datetime") match { - case "none" => new BasePathBucketAssigner[E]() - case "datetime" => - new DateTimeBucketAssigner[E]( - p.getProperty( - "bucket.assigner.datetime.format", - "YYYY/MM/DD/HH" - ) - ) - case "custom" => - config - .getBucketAssigner(sinkConfig.name) - .asInstanceOf[BucketAssigner[E, String]] - case other => - throw new IllegalArgumentException( - s"Unknown bucket assigner type '$other'." 
- ) - } - val encoderFormat = p.getProperty("encoder.format", "row") - val sink = encoderFormat match { - case "row" => - val builder = - StreamingFileSink.forRowFormat( - new Path(path), - config.getEncoder(sinkConfig.name).asInstanceOf[Encoder[E]] - ) - val rollingPolicy = - p.getProperty("bucket.rolling.policy", "default") match { - case "default" => - DefaultRollingPolicy - .builder() - .withInactivityInterval( - p.getProperty( - "bucket.rolling.policy.inactivity.interval", - s"${60000}" - ).toLong - ) - .withMaxPartSize( - p.getProperty( - "bucket.rolling.policy.max.part.size", - s"${128 * 1024 * 1024}" - ).toLong - ) - .withRolloverInterval( - p.getProperty( - "bucket.rolling.policy.rollover.interval", - s"${Long.MaxValue}" - ).toLong - ) - .build[E, String]() - case "checkpoint" => - OnCheckpointRollingPolicy.build[E, String]() - case policy => - throw new IllegalArgumentException( - s"Unknown bucket rolling policy type: '$policy'" - ) - } - builder - .withBucketAssigner(bucketAssigner) - .withRollingPolicy(rollingPolicy) - .withBucketCheckInterval(bucketCheckInterval) - .build() - case "bulk" => - throw new NotImplementedError("Bulk file sink not implemented yet") - - case _ => - throw new IllegalArgumentException( - s"Unknown file sink encoder format: '$encoderFormat'" - ) - } - stream.addSink(sink).uid(sinkConfig.label).name(sinkConfig.label) - } - - /** - * Send stream to a socket sink. - * - * @param stream - * the data stream - * @param sinkConfig - * a sink configuration - * @param config - * implicit job args - * @tparam E - * stream element type - * @return - * DataStreamSink[E] - */ - def toSocket[E <: FlinkEvent: TypeInformation]( - stream: DataStream[E], - sinkConfig: SocketSinkConfig - )(implicit config: FlinkConfig): DataStreamSink[E] = - stream - .writeToSocket( - sinkConfig.host, - sinkConfig.port, - config - .getSerializationSchema(sinkConfig.name) - .asInstanceOf[SerializationSchema[E]] - ) - .uid(sinkConfig.label) - .name(sinkConfig.label) - - /** - * Send stream to a cassandra sink. - * - * @param stream - * the data stream - * @param sinkConfig - * a sink configuration - * @tparam E - * stream element type - * @return - * DataStreamSink[E] - */ - def toCassandraSink[E <: FlinkEvent: TypeInformation]( - stream: DataStream[E], - sinkConfig: CassandraSinkConfig) = - CassandraSink - .addSink(stream) - .setHost(sinkConfig.host) - .setQuery(sinkConfig.query) - .build() - .uid(sinkConfig.label) - .name(sinkConfig.label) - - /** - * Send stream to an elasticsearch sink. 
- * - * @param stream - * the data stream - * @param sinkConfig - * a sink configuration - * @tparam E - * stream element type - * @return - * DataStreamSink[E] - */ - def toElasticsearchSink[E <: FlinkEvent: TypeInformation]( - stream: DataStream[E], - sinkConfig: ElasticsearchSinkConfig - ): DataStreamSink[E] = { - val hosts = sinkConfig.transports.map { s => - val url = new URL(if (s.startsWith("http")) s else s"http://$s") - val hostname = url.getHost - val port = if (url.getPort < 0) 9200 else url.getPort - new HttpHost(hostname, port, url.getProtocol) - }.asJava - val esSink = new ElasticsearchSink.Builder[E]( - hosts, - (element: E, _: RuntimeContext, indexer: RequestIndexer) => { - val data = element.getClass.getDeclaredFields - .filterNot(f => - Seq("$id", "$key", "$timestamp", "$action").contains( - f.getName - ) - ) - .foldLeft(Map.empty[String, Any]) { case (a, f) => - f.setAccessible(true) - val name = f.getName - f.get(element) match { - case Some(v: Any) => a + (name -> v) - case None => a - case v: Any => a + (name -> v) - } - } - .asJava - val req = Requests.indexRequest(sinkConfig.index).source(data) - indexer.add(req) - } - ).build() - stream.addSink(esSink).uid(sinkConfig.label).name(sinkConfig.label) - } - } diff --git a/src/test/scala/io/epiphanous/flinkrunner/util/BoundedLatenessGeneratorTest.scala b/src/test/scala/io/epiphanous/flinkrunner/util/BoundedLatenessGeneratorTest.scala index 0448d76..429c56f 100644 --- a/src/test/scala/io/epiphanous/flinkrunner/util/BoundedLatenessGeneratorTest.scala +++ b/src/test/scala/io/epiphanous/flinkrunner/util/BoundedLatenessGeneratorTest.scala @@ -1,6 +1,7 @@ package io.epiphanous.flinkrunner.util import io.epiphanous.flinkrunner.UnitSpec +import io.epiphanous.flinkrunner.model.FlinkEvent import org.apache.flink.api.common.eventtime.{Watermark, WatermarkOutput} import java.util.UUID.randomUUID @@ -12,7 +13,10 @@ class BoundedLatenessGeneratorTest extends UnitSpec { val random = new Random() def getBlg(maxAllowedLateness: Long = 10L, streamID: String = "Test") = - new BoundedLatenessGenerator[TestEvent](maxAllowedLateness, streamID) + new BoundedLatenessGenerator[TestEvent]( + maxAllowedLateness, + streamID + ) def uuid = randomUUID().toString From ca1edc8ccd4c85ed4092c070ac8f35a49aea66d4 Mon Sep 17 00:00:00 2001 From: Robert Lyons Date: Tue, 30 Nov 2021 09:33:08 -0500 Subject: [PATCH 2/3] begin refactor of confluent avro support --- build.sbt | 77 +++++++------ .../epiphanous/flinkrunner/FlinkRunner.scala | 60 +++------- .../flinkrunner/FlinkRunnerFactory.scala | 4 +- .../avro/ConfluentSchemaRegistryClient.scala | 19 +-- .../avro/RegisteredAvroSchema.scala | 5 +- .../CirceJsonDeserializationSchema.scala | 72 ++++++++++++ .../serde/CirceJsonSerializationSchema.scala | 74 ++++++++++++ ...onfluentAvroKafkaSerializationSchema.scala | 47 ++++++++ ...roRegistryKafkaDeserializationSchema.scala | 108 ++++++++++++++++++ .../epiphanous/flinkrunner/model/MyADT.scala | 28 +++++ ...ConfluentAvroSerializationSchemaTest.scala | 14 +++ ...yADTConfluentAvroSerializationSchema.scala | 8 ++ 12 files changed, 423 insertions(+), 93 deletions(-) create mode 100644 src/main/scala/io/epiphanous/flinkrunner/serde/CirceJsonDeserializationSchema.scala create mode 100644 src/main/scala/io/epiphanous/flinkrunner/serde/CirceJsonSerializationSchema.scala create mode 100644 src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroKafkaSerializationSchema.scala create mode 100644 
src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaDeserializationSchema.scala create mode 100644 src/test/scala/io/epiphanous/flinkrunner/model/MyADT.scala create mode 100644 src/test/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroSerializationSchemaTest.scala create mode 100644 src/test/scala/io/epiphanous/flinkrunner/serde/MyADTConfluentAvroSerializationSchema.scala diff --git a/build.sbt b/build.sbt index 7f9488f..60abd1b 100644 --- a/build.sbt +++ b/build.sbt @@ -24,10 +24,11 @@ inThisBuild( Test / parallelExecution := false Test / fork := true resolvers += "Local Maven Repository" at "file://" + Path.userHome.absolutePath + "/.m2/repository" +resolvers += "Confluent Repository" at "https://packages.confluent.io/maven/" val V = new { - val flink = "1.13.2" - val logback = "1.2.6" + val flink = "1.13.3" + val logback = "1.2.7" val scalaLogging = "3.9.4" val scalaTest = "3.2.10" val scalaCheck = "1.15.4" @@ -35,53 +36,57 @@ val V = new { val http4s = "0.21.29" val enumeratum = "1.7.0" val typesafeConfig = "1.4.1" - val guava = "29.0-jre" + val guava = "31.0.1-jre" //"29.0-jre" val squants = "1.8.3" - val avro = "1.10.2" + val avro = "1.11.0" val avro4s = "4.0.11" + val schemaRegistry = "7.0.0" } -val flinkDeps = - Seq("scala", "streaming-scala", "cep-scala").map(a => - "org.apache.flink" %% s"flink-$a" % V.flink % Provided - ) ++ - Seq( - "connector-kafka", - "connector-kinesis", - "connector-cassandra", - "connector-elasticsearch7", - "statebackend-rocksdb" - ).map(a => "org.apache.flink" %% s"flink-$a" % V.flink) ++ - Seq("org.apache.flink" %% "flink-test-utils" % V.flink % Test) +val flinkDeps = + Seq( + "org.apache.flink" %% s"flink-scala" % V.flink % Provided, + "org.apache.flink" %% s"flink-streaming-scala" % V.flink % Provided, + "org.apache.flink" %% s"flink-cep-scala" % V.flink % Provided, + "org.apache.flink" %% s"flink-connector-kafka" % V.flink, + "org.apache.flink" %% s"flink-connector-kinesis" % V.flink, + "org.apache.flink" %% s"flink-connector-cassandra" % V.flink, + "org.apache.flink" %% s"flink-connector-elasticsearch7" % V.flink, + "org.apache.flink" %% s"flink-statebackend-rocksdb" % V.flink, + "org.apache.flink" % s"flink-avro-confluent-registry" % V.flink, + "org.apache.flink" %% s"flink-test-utils" % V.flink % Test + ) val loggingDeps = Seq( "ch.qos.logback" % "logback-classic" % V.logback % Provided, "com.typesafe.scala-logging" %% "scala-logging" % V.scalaLogging ) -val http4sDeps = - Seq("http4s-dsl", "http4s-client", "http4s-blaze-client", "http4s-circe") - .map("org.http4s" %% _ % V.http4s) +val http4sDeps = Seq( + "dsl", + "client", + "blaze-client", + "circe" +).map(d => "org.http4s" %% s"http4s-$d" % V.http4s) val circeDeps = Seq( - "circe-core", - "circe-generic", - "circe-generic-extras", - "circe-parser" -).map( - "io.circe" %% _ % V.circe -) + "core", + "generic", + "generic-extras", + "parser" +).map(d => "io.circe" %% s"circe-$d" % V.circe) -val otherDeps = Seq( - "com.beachape" %% "enumeratum" % V.enumeratum, - "org.apache.avro" % "avro" % V.avro, - "com.typesafe" % "config" % V.typesafeConfig, - "com.google.guava" % "guava" % V.guava, - "org.typelevel" %% "squants" % V.squants, - "com.sksamuel.avro4s" %% "avro4s-core" % V.avro4s, - "org.scalactic" %% "scalactic" % V.scalaTest % Test, - "org.scalatest" %% "scalatest" % V.scalaTest % Test, - "org.scalacheck" %% "scalacheck" % V.scalaCheck % Test +val otherDeps = Seq( + "io.confluent" % "kafka-schema-registry-client" % V.schemaRegistry, + "com.beachape" %% "enumeratum" % 
V.enumeratum, + "org.apache.avro" % "avro" % V.avro, + "com.typesafe" % "config" % V.typesafeConfig, + "com.google.guava" % "guava" % V.guava, + "org.typelevel" %% "squants" % V.squants, + "com.sksamuel.avro4s" %% "avro4s-core" % V.avro4s, + "org.scalactic" %% "scalactic" % V.scalaTest % Test, + "org.scalatest" %% "scalatest" % V.scalaTest % Test, + "org.scalacheck" %% "scalacheck" % V.scalaCheck % Test ) /** diff --git a/src/main/scala/io/epiphanous/flinkrunner/FlinkRunner.scala b/src/main/scala/io/epiphanous/flinkrunner/FlinkRunner.scala index 7d384a3..e4374e1 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/FlinkRunner.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/FlinkRunner.scala @@ -72,24 +72,9 @@ class FlinkRunner[ADT <: FlinkEvent]( val env: SEE = config.configureStreamExecutionEnvironment /** - * An intermediate method to process main args, with optional callback to - * capture output of flink job. - * - * @param callback - * a function from an iterator to unit - */ - def process( - callback: PartialFunction[List[_], Unit] = { case _ => - () - } - ): Unit = - if (config.jobName == "help") showHelp() - else process1(callback) - - /** - * Actually invoke the job based on the job name and arguments passed in. - * If the job run returns an iterator of results, pass those results to - * the callback. Otherwise, just return. The callback is for testing the + * Invoke a job based on the job name and arguments passed in. If the job + * run returns an iterator of results, pass those results to the + * callback. Otherwise, just return. The callback is for testing the * stream of results from a flink job. It will only be invoked if * --mock.edges option is on. * @@ -97,17 +82,18 @@ class FlinkRunner[ADT <: FlinkEvent]( * a function from a stream to unit that receives results from running * flink job */ - def process1( + def process( callback: PartialFunction[List[_], Unit] = { case _ => () } ): Unit = { - if ( + if (config.jobName == "help") showHelp() + else if ( config.jobArgs.headOption .exists(s => List("help", "--help", "-help", "-h").contains(s)) ) showJobHelp() else { - factory.getJobInstance(config.jobName, config).run() match { + factory.getJobInstance(config.jobName, this).run() match { case Left(results) => callback(results) case Right(_) => () } @@ -185,7 +171,7 @@ class FlinkRunner[ADT <: FlinkEvent]( * @return * BoundedLatenessGenerator[E] */ - def boundedOutofOrdernessWatermarks[E <: ADT: TypeInformation]() + def boundedOutOfOrderWatermarks[E <: ADT: TypeInformation]() : WatermarkStrategy[E] = WatermarkStrategy .forBoundedOutOfOrderness(config.maxLateness) @@ -205,8 +191,6 @@ class FlinkRunner[ADT <: FlinkEvent]( * Assign timestamps/watermarks if we're using event time * @param in * the input stream to watermark - * @param env - * implicit stream execution environment * @tparam E * event type * @return @@ -217,10 +201,10 @@ class FlinkRunner[ADT <: FlinkEvent]( srcConfig: SourceConfig ): DataStream[E] = in.assignTimestampsAndWatermarks(srcConfig.watermarkStrategy match { - case "bounded out of orderness" => - boundedOutofOrdernessWatermarks() - case "ascending timestamps" => ascendingTimestampsWatermarks() - case _ => boundedLatenessWatermarks(in.name) + case "bounded out of order" => + boundedOutOfOrderWatermarks() + case "ascending timestamps" => ascendingTimestampsWatermarks() + case _ => boundedLatenessWatermarks(in.name) }).name(s"wm:${in.name}") .uid(s"wm:${in.name}") @@ -406,7 +390,7 @@ class FlinkRunner[ADT <: FlinkEvent]( file.getAbsolutePath } - val 
runner = this + val runner: FlinkRunner[ADT] = this implicit class EventStreamOps[E <: ADT: TypeInformation]( stream: DataStream[E]) { @@ -422,7 +406,7 @@ class FlinkRunner[ADT <: FlinkEvent]( .uid(s"cast types $name") } - def toSink(sinkName: String = "") = + def toSink(sinkName: String = ""): Object = runner.toSink[E](stream, sinkName) } @@ -434,8 +418,6 @@ class FlinkRunner[ADT <: FlinkEvent]( * the data stream to send to sink * @param sinkName * a sink name to obtain configuration - * @param config - * implicit flink job args * @tparam E * stream element type * @return @@ -444,7 +426,7 @@ class FlinkRunner[ADT <: FlinkEvent]( def toSink[E <: ADT: TypeInformation]( stream: DataStream[E], sinkName: String = "" - ) = { + ): Object = { val name = if (sinkName.isEmpty) config.getSinkNames.head else sinkName config.getSinkConfig(name) match { case s: KafkaSinkConfig => toKafka[E](stream, s) @@ -468,8 +450,6 @@ class FlinkRunner[ADT <: FlinkEvent]( * the data stream * @param sinkConfig * a sink configuration - * @param config - * implicit job args * @tparam E * stream element type * @return @@ -500,8 +480,6 @@ class FlinkRunner[ADT <: FlinkEvent]( * the data stream * @param sinkConfig * a sink configuration - * @param config - * implicit job args * @tparam E * stream element type * @return @@ -535,8 +513,6 @@ class FlinkRunner[ADT <: FlinkEvent]( * the data stream * @param sinkConfig * a sink configuration - * @param config - * implicit job args * @tparam E * stream element type * @return @@ -565,8 +541,6 @@ class FlinkRunner[ADT <: FlinkEvent]( * the data stream * @param sinkConfig * a sink configuration - * @param config - * implicit job args * @tparam E * stream element type * @return @@ -661,8 +635,6 @@ class FlinkRunner[ADT <: FlinkEvent]( * the data stream * @param sinkConfig * a sink configuration - * @param config - * implicit job args * @tparam E * stream element type * @return @@ -697,7 +669,7 @@ class FlinkRunner[ADT <: FlinkEvent]( */ def toCassandraSink[E <: ADT: TypeInformation]( stream: DataStream[E], - sinkConfig: CassandraSinkConfig) = + sinkConfig: CassandraSinkConfig): CassandraSink[E] = CassandraSink .addSink(stream) .setHost(sinkConfig.host) diff --git a/src/main/scala/io/epiphanous/flinkrunner/FlinkRunnerFactory.scala b/src/main/scala/io/epiphanous/flinkrunner/FlinkRunnerFactory.scala index 3814d1d..3943eb2 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/FlinkRunnerFactory.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/FlinkRunnerFactory.scala @@ -27,9 +27,9 @@ trait FlinkRunnerFactory[ADT <: FlinkEvent] { optConfig: Option[String] = None) = new FlinkConfig[ADT](args, this, sources, optConfig) - def getJobInstance( + def getJobInstance[DS, OUT <: ADT]( name: String, - config: FlinkConfig[ADT]): BaseFlinkJob[_, _, ADT] + runner: FlinkRunner[ADT]): BaseFlinkJob[DS, OUT, ADT] def getDeserializationSchema[E <: ADT]( name: String, diff --git a/src/main/scala/io/epiphanous/flinkrunner/avro/ConfluentSchemaRegistryClient.scala b/src/main/scala/io/epiphanous/flinkrunner/avro/ConfluentSchemaRegistryClient.scala index cbbd517..dbd45c9 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/avro/ConfluentSchemaRegistryClient.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/avro/ConfluentSchemaRegistryClient.scala @@ -209,14 +209,17 @@ class ConfluentSchemaRegistryClient[ADT <: FlinkEvent: TypeInformation]( */ protected def getSubjectName[E]( event: E, - optContext: Option[ConfluentSchemaRegistryContext] = None): String = - (event.getClass.getCanonicalName.split("\\.") 
- :+ (if (optContext.getOrElse(ConfluentSchemaRegistryContext()).isKey) - "key" - else "value")) - .map(snakify) - .map(name => clean(name, replacement = "_")) - .mkString("_") + optContext: Option[ConfluentSchemaRegistryContext] = None) + : String = { + val keyOrValue = + if (optContext.getOrElse(ConfluentSchemaRegistryContext()).isKey) + "key" + else "value" + val subjectName = config.getString( + s"schema.registry.${event.getClass.getCanonicalName}" + ) + s"$subjectName-$keyOrValue" + } /** * Retrieve a schema based on its id or subject, and optionally, some diff --git a/src/main/scala/io/epiphanous/flinkrunner/avro/RegisteredAvroSchema.scala b/src/main/scala/io/epiphanous/flinkrunner/avro/RegisteredAvroSchema.scala index 0f0a335..cab791f 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/avro/RegisteredAvroSchema.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/avro/RegisteredAvroSchema.scala @@ -28,8 +28,7 @@ case class RegisteredAvroSchema( */ def decode[E: Decoder](bytes: Array[Byte]): Try[E] = { Try( - AvroInputStream - .binary[E] + AvroInputStream.binary .from(bytes) .build(schema) .iterator @@ -55,7 +54,7 @@ case class RegisteredAvroSchema( magic: Array[Byte] = Array.emptyByteArray): Try[Array[Byte]] = Try { val baos = new ByteArrayOutputStream() - val os = AvroOutputStream.binary[E].to(baos).build() + val os = AvroOutputStream.binary.to(baos).build() os.write(event) os.flush() os.close() diff --git a/src/main/scala/io/epiphanous/flinkrunner/serde/CirceJsonDeserializationSchema.scala b/src/main/scala/io/epiphanous/flinkrunner/serde/CirceJsonDeserializationSchema.scala new file mode 100644 index 0000000..a7f0d59 --- /dev/null +++ b/src/main/scala/io/epiphanous/flinkrunner/serde/CirceJsonDeserializationSchema.scala @@ -0,0 +1,72 @@ +package io.epiphanous.flinkrunner.serde + +import com.typesafe.scalalogging.LazyLogging +import io.circe.Decoder +import io.circe.parser._ +import io.epiphanous.flinkrunner.model.{ + FlinkConfig, + FlinkEvent, + SourceConfig +} +import org.apache.flink.api.common.serialization.DeserializationSchema +import org.apache.flink.api.common.typeinfo.{TypeHint, TypeInformation} + +import java.nio.charset.StandardCharsets + +/** + * @param sourceName + * the name of the source we are deserializing from + * @param config + * flink runner configuration + * @tparam ADT + * the algebraic data type of our events + */ +class CirceJsonDeserializationSchema[E <: ADT, ADT <: FlinkEvent]( + sourceName: String, + config: FlinkConfig[ADT])(implicit + circeDecoder: Decoder[E], + ev: Null <:< E) + extends DeserializationSchema[E] + with LazyLogging { + + val sourceConfig: SourceConfig = config.getSourceConfig(sourceName) + + /** + * Deserialize a json byte array into an ADT event instance or return + * null if the byte array can't be successfully deserialized + * @param bytes + * a json-encoded byte array + * @return + * an instance of an ADT event type + */ + override def deserialize(bytes: Array[Byte]): E = { + val payload = new String(bytes, StandardCharsets.UTF_8) + decode[E](payload).toOption match { + case Some(event) => event + case other => + logger.error( + s"Failed to deserialize JSON payload from source $sourceName: $payload" + ) + other.orNull + } + } + + /** + * Determine if the next event is the end of the stream or not. We always + * return false since we assume the stream never ends. 
+ * @param nextEvent + * the next event + * @return + * false + */ + override def isEndOfStream(nextEvent: E): Boolean = false + + /** + * Compute the produced type when deserializing a byte array + * @return + * TypeInformation[E] + */ + override def getProducedType: TypeInformation[E] = + TypeInformation.of(new TypeHint[E] {}) + +} diff --git a/src/main/scala/io/epiphanous/flinkrunner/serde/CirceJsonSerializationSchema.scala b/src/main/scala/io/epiphanous/flinkrunner/serde/CirceJsonSerializationSchema.scala new file mode 100644 index 0000000..48b8796 --- /dev/null +++ b/src/main/scala/io/epiphanous/flinkrunner/serde/CirceJsonSerializationSchema.scala @@ -0,0 +1,74 @@ +package io.epiphanous.flinkrunner.serde + +import com.typesafe.scalalogging.LazyLogging +import io.circe.Encoder +import io.circe.syntax._ +import io.epiphanous.flinkrunner.model.{ + FlinkConfig, + FlinkEvent, + SinkConfig +} +import org.apache.flink.api.common.serialization.SerializationSchema + +import java.nio.charset.StandardCharsets + +/** + * A JSON serialization schema that uses the circe json library. + * + * @param sinkName + * name of the sink we're serializing to + * @param config + * a flink runner config + * @param circeEncoder + * an implicit circe encoder + * @tparam E + * the ADT member type we're serializing + * @tparam ADT + * the flink runner ADT + */ +class CirceJsonSerializationSchema[E <: ADT, ADT <: FlinkEvent]( + sinkName: String, + config: FlinkConfig[ADT])(implicit circeEncoder: Encoder[E]) + extends SerializationSchema[E] + with LazyLogging { + + val sourceConfig: SinkConfig = config.getSinkConfig(sinkName) + val configPretty: Boolean = + sourceConfig.properties.getProperty("pretty", "false").toBoolean + val configSort: Boolean = + sourceConfig.properties.getProperty("sort", "false").toBoolean + + /** + * Serialize an ADT event into json byte array + * @param event + * an instance of an ADT event type + * @return + * a json encoded byte array + */ + override def serialize(event: E): Array[Byte] = + toJson(event).getBytes(StandardCharsets.UTF_8) + + /** + * Utility method to convert an event into a JSON string with options for + * pretty-printing and sorting keys + * @param event + * the ADT event instance to encode + * @param pretty + * true to encode with lines and 2 space indentation + * @param sortKeys + * true to sort the json keys + * @return + * a json-encoded string + */ + def toJson( + event: E, + pretty: Boolean = configPretty, + sortKeys: Boolean = configSort): String = { + val j = event.asJson + if (pretty) { + if (sortKeys) j.spaces2SortKeys else j.spaces2 + } else { + if (sortKeys) j.noSpacesSortKeys else j.noSpaces + } + } +} diff --git a/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroKafkaSerializationSchema.scala b/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroKafkaSerializationSchema.scala new file mode 100644 index 0000000..84b81fa --- /dev/null +++ b/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroKafkaSerializationSchema.scala @@ -0,0 +1,47 @@ +package io.epiphanous.flinkrunner.serde + +import com.typesafe.scalalogging.LazyLogging +import io.epiphanous.flinkrunner.model.{ + FlinkConfig, + FlinkEvent, + KafkaSinkConfig, + KafkaSourceConfig +} +import org.apache.avro.specific.SpecificRecord +import org.apache.flink.api.common.serialization.SerializationSchema +import org.apache.flink.formats.avro.registry.confluent.ConfluentRegistryAvroSerializationSchema +import org.apache.flink.streaming.connectors.kafka.KafkaSerializationSchema +import 
org.apache.kafka.clients.producer.ProducerRecord + +import java.lang + +/** + * A schema to serialize an ADT event using a confluent avro schema + * registry. An implementing class must provide a Flink + * [[ConfluentRegistryAvroSerializationSchema]] to interface with the + * schema registry. That registry is specific to a type that implements + * Avro's [[SpecificRecord]] interface. + * @param sinkName + * name of the sink stream + * @param config + * flink runner config + * @tparam E + * the event type we are serializing from, which is a member of the ADT + * @tparam ADT + * the flink runner ADT + */ +abstract class ConfluentAvroKafkaSerializationSchema[ + E <: ADT, + ADT <: FlinkEvent]( + sinkName: String, + config: FlinkConfig[ADT] +) extends KafkaSerializationSchema[E] + with LazyLogging { + + val sinkConfig: KafkaSinkConfig = + config.getSinkConfig(sinkName).asInstanceOf[KafkaSinkConfig] + + override def serialize( + element: E, + timestamp: lang.Long): ProducerRecord[Array[Byte], Array[Byte]] = ??? +} diff --git a/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaDeserializationSchema.scala b/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaDeserializationSchema.scala new file mode 100644 index 0000000..3c18fbb --- /dev/null +++ b/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaDeserializationSchema.scala @@ -0,0 +1,108 @@ +package io.epiphanous.flinkrunner.serde + +import com.typesafe.scalalogging.LazyLogging +import io.epiphanous.flinkrunner.model.{ + FlinkConfig, + FlinkEvent, + KafkaSourceConfig +} +import org.apache.avro.specific.SpecificRecord +import org.apache.flink.api.common.serialization.DeserializationSchema +import org.apache.flink.api.common.typeinfo.{TypeHint, TypeInformation} +import org.apache.flink.formats.avro.registry.confluent.ConfluentRegistryAvroDeserializationSchema +import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema +import org.apache.kafka.clients.consumer.ConsumerRecord + +/** + * A schema to deserialize bytes from kafka into an ADT event using a + * confluent avro schema registry. An implementing class must provide a + * Flink [[ConfluentRegistryAvroDeserializationSchema]] to interface with + * the schema registry. That registry is specific to a type that implements + * Avro's [[SpecificRecord]] interface. The implementing class must also + * provide a [[deserializeSpecificRecord]] method that deserializes an + * array of bytes into a specific record type as well as a + * [[fromSpecificRecord]] method that converts that type into a type that + * is a member of the ADT. + * + * @param sourceName + * name of the source stream + * @param config + * flink runner config + * @tparam E + * the event type we are producing here, which is a member of the ADT + * @tparam ADT + * the flink runner ADT + */ +abstract class ConfluentAvroRegistryKafkaDeserializationSchema[ + E <: ADT, + ADT <: FlinkEvent +]( + sourceName: String, + config: FlinkConfig[ADT] +) extends KafkaDeserializationSchema[E] + with LazyLogging { + + val sourceConfig: KafkaSourceConfig = + config.getSourceConfig(sourceName).asInstanceOf[KafkaSourceConfig] + + /** + * Implementing classes must provide a confluent schema registry + * deserialization schema for specific records of type K. 
+ * @tparam K + * specific record type + * @return + * ConfluentRegistryAvroDeserializationSchema[K] + */ + def schemaRegistryKeyDeserializer[K] + : ConfluentRegistryAvroDeserializationSchema[K] + + /** + * A helper method to use the provided schema registry deserialization + * schema to deserialize a kafka message into a specific record instance. + * @param message + * the kafka message + * @return + * an instance of specific record type T + */ + def deserializeSpecificRecord[T <: SpecificRecord]( + message: Array[Byte], + isKey: Boolean = false): T = ??? + ///schemaRegistryDeserializer.deserialize(message) + + /** + * Convert a deserialized specific record instance into an instance of + * our produced event type. Must be defined by implementing classes. + * @param key + * an optional key of type K + * @param value + * a value of specific record type V + * @tparam K + * the type of the key + * @tparam V + * the type of the value, subtype of avro specific record + * @return + * an instance of the flink runner ADT + */ + def fromSpecificRecord[K, V <: SpecificRecord]( + key: Option[K], + value: V): E + + def deserializeKey[K](key: Array[Byte]): K + + override def deserialize( + record: ConsumerRecord[Array[Byte], Array[Byte]]): E = ??? +// { +// val key = +// if (sourceConfig.isKeyed) Some(deserializeKey(record.key())) +// else None +// fromSpecificRecord( +// key, +// schemaRegistryDeserializer.deserialize(record.value()) +// ) +// } + + override def isEndOfStream(nextElement: E): Boolean = false + + override def getProducedType: TypeInformation[E] = + TypeInformation.of(new TypeHint[E] {}) +} diff --git a/src/test/scala/io/epiphanous/flinkrunner/model/MyADT.scala b/src/test/scala/io/epiphanous/flinkrunner/model/MyADT.scala new file mode 100644 index 0000000..ec1383e --- /dev/null +++ b/src/test/scala/io/epiphanous/flinkrunner/model/MyADT.scala @@ -0,0 +1,28 @@ +package io.epiphanous.flinkrunner.model + +import java.time.Instant +import java.util.UUID + +sealed trait MyADT extends FlinkEvent + +case class A( + id: String = UUID.randomUUID().toString, + a: String = "A", + value: Int = 0, + modified: Instant) + extends MyADT { + override def $id: String = id + override def $key: String = a + override def $timestamp: Long = modified.toEpochMilli +} + +case class B( + id: String = UUID.randomUUID().toString, + b: String = "B", + value: Double = 0d, + modified: Instant) + extends MyADT { + override def $id: String = id + override def $key: String = b + override def $timestamp: Long = modified.toEpochMilli +} diff --git a/src/test/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroSerializationSchemaTest.scala b/src/test/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroSerializationSchemaTest.scala new file mode 100644 index 0000000..dbe4e54 --- /dev/null +++ b/src/test/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroSerializationSchemaTest.scala @@ -0,0 +1,14 @@ +package io.epiphanous.flinkrunner.serde + +import io.epiphanous.flinkrunner.UnitSpec +import io.epiphanous.flinkrunner.model.FlinkEvent + +class ConfluentAvroSerializationSchemaTest extends UnitSpec { + +// val ss = new ConfluentAvroSerializationSchema() + + behavior of "ConfluentAvroSerializationSchema" + + it should "" + +} diff --git a/src/test/scala/io/epiphanous/flinkrunner/serde/MyADTConfluentAvroSerializationSchema.scala b/src/test/scala/io/epiphanous/flinkrunner/serde/MyADTConfluentAvroSerializationSchema.scala new file mode 100644 index 0000000..6f0446d --- /dev/null +++ 
b/src/test/scala/io/epiphanous/flinkrunner/serde/MyADTConfluentAvroSerializationSchema.scala @@ -0,0 +1,8 @@ +package io.epiphanous.flinkrunner.serde + +import io.epiphanous.flinkrunner.model.{FlinkConfig, MyADT} + +class MyADTConfluentAvroSerializationSchema( + name: String, + config: FlinkConfig[MyADT]) +// extends ConfluentAvroSerializationSchema {} From 4890d748213be647fc19fec9d1a2ce76517f9d5f Mon Sep 17 00:00:00 2001 From: Robert Lyons Date: Thu, 2 Dec 2021 13:42:36 -0500 Subject: [PATCH 3/3] implement serdes and upgrade flink to 1.14 --- build.sbt | 23 ++-- .../epiphanous/flinkrunner/FlinkRunner.scala | 111 +++++++++-------- .../flinkrunner/FlinkRunnerFactory.scala | 10 ++ .../membership/StableBloomFilter.scala | 2 +- .../avro/ConfluentSchemaRegistryClient.scala | 7 +- .../flinkrunner/model/FlinkConfig.scala | 8 ++ .../operator/EnrichmentAsyncFunction.scala | 5 +- ...onfluentAvroKafkaSerializationSchema.scala | 47 -------- ...roRegistryKafkaDeserializationSchema.scala | 108 ----------------- ...stryKafkaRecordDeserializationSchema.scala | 112 ++++++++++++++++++ ...gistryKafkaRecordSerializationSchema.scala | 109 +++++++++++++++++ .../util/BoundedLatenessGeneratorTest.scala | 26 ++-- 12 files changed, 327 insertions(+), 241 deletions(-) delete mode 100644 src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroKafkaSerializationSchema.scala delete mode 100644 src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaDeserializationSchema.scala create mode 100644 src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaRecordDeserializationSchema.scala create mode 100644 src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaRecordSerializationSchema.scala diff --git a/build.sbt b/build.sbt index 60abd1b..c565166 100644 --- a/build.sbt +++ b/build.sbt @@ -27,7 +27,7 @@ resolvers += "Local Maven Repository" at "file://" + Path.userHome.absolutePath resolvers += "Confluent Repository" at "https://packages.confluent.io/maven/" val V = new { - val flink = "1.13.3" + val flink = "1.14.0" val logback = "1.2.7" val scalaLogging = "3.9.4" val scalaTest = "3.2.10" @@ -77,16 +77,17 @@ val circeDeps = Seq( ).map(d => "io.circe" %% s"circe-$d" % V.circe) val otherDeps = Seq( - "io.confluent" % "kafka-schema-registry-client" % V.schemaRegistry, - "com.beachape" %% "enumeratum" % V.enumeratum, - "org.apache.avro" % "avro" % V.avro, - "com.typesafe" % "config" % V.typesafeConfig, - "com.google.guava" % "guava" % V.guava, - "org.typelevel" %% "squants" % V.squants, - "com.sksamuel.avro4s" %% "avro4s-core" % V.avro4s, - "org.scalactic" %% "scalactic" % V.scalaTest % Test, - "org.scalatest" %% "scalatest" % V.scalaTest % Test, - "org.scalacheck" %% "scalacheck" % V.scalaCheck % Test +// "io.confluent" % "kafka-schema-registry-client" % V.schemaRegistry, + "io.confluent" % "kafka-streams-avro-serde" % "7.0.0", + "com.beachape" %% "enumeratum" % V.enumeratum, +// "org.apache.avro" % "avro" % V.avro, + "com.typesafe" % "config" % V.typesafeConfig, + "com.google.guava" % "guava" % V.guava, + "org.typelevel" %% "squants" % V.squants, + "com.sksamuel.avro4s" %% "avro4s-core" % V.avro4s, + "org.scalactic" %% "scalactic" % V.scalaTest % Test, + "org.scalatest" %% "scalatest" % V.scalaTest % Test, + "org.scalacheck" %% "scalacheck" % V.scalaCheck % Test ) /** diff --git a/src/main/scala/io/epiphanous/flinkrunner/FlinkRunner.scala b/src/main/scala/io/epiphanous/flinkrunner/FlinkRunner.scala index e4374e1..1192bdb 100644 --- 
a/src/main/scala/io/epiphanous/flinkrunner/FlinkRunner.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/FlinkRunner.scala @@ -15,6 +15,8 @@ import org.apache.flink.api.common.serialization.{ SerializationSchema } import org.apache.flink.api.common.typeinfo.TypeInformation +import org.apache.flink.connector.kafka.sink.KafkaSink +import org.apache.flink.connector.kafka.source.KafkaSource import org.apache.flink.core.fs.Path import org.apache.flink.streaming.api.datastream.DataStreamSink import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.{ @@ -33,17 +35,7 @@ import org.apache.flink.streaming.api.scala.{DataStream, _} import org.apache.flink.streaming.connectors.cassandra.CassandraSink import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer import org.apache.flink.streaming.connectors.elasticsearch7.ElasticsearchSink -import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.Semantic -import org.apache.flink.streaming.connectors.kafka.{ - FlinkKafkaConsumer, - FlinkKafkaProducer, - KafkaDeserializationSchema, - KafkaSerializationSchema -} -import org.apache.flink.streaming.connectors.kinesis.serialization.{ - KinesisDeserializationSchema, - KinesisSerializationSchema -} +import org.apache.flink.streaming.connectors.kinesis.serialization.KinesisSerializationSchema import org.apache.flink.streaming.connectors.kinesis.{ FlinkKinesisConsumer, FlinkKinesisProducer @@ -247,18 +239,27 @@ class FlinkRunner[ADT <: FlinkEvent]( */ def fromKafka[E <: ADT: TypeInformation]( srcConfig: KafkaSourceConfig - ): DataStream[E] = { - val consumer = - new FlinkKafkaConsumer[E]( - srcConfig.topic, - config - .getKafkaDeserializationSchema[E](srcConfig.name) - .asInstanceOf[KafkaDeserializationSchema[E]], - srcConfig.properties - ) + ): DataStream[E] = env - .addSource(consumer) - } + .fromSource( + KafkaSource + .builder[E]() + .setProperties(srcConfig.properties) + .setDeserializer( + config + .getKafkaRecordDeserializationSchema[E]( + srcConfig.name + ) + ) + .build(), + srcConfig.watermarkStrategy match { + case "bounded out of order" => + boundedOutOfOrderWatermarks[E]() + case "ascending timestamps" => ascendingTimestampsWatermarks[E]() + case _ => boundedLatenessWatermarks[E](srcConfig.name) + }, + srcConfig.label + ) /** * Configure stream from kinesis. 
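// A minimal sketch (not part of this patch) of a KafkaRecordDeserializationSchema
// that the KafkaSource wiring above could consume. It reuses the
// CirceJsonDeserializationSchema and the MyADT test model (member A) added earlier
// in this series. The source name "my-kafka-source", the object name, and the
// helper method are assumptions for illustration only; a concrete factory would
// normally supply this via getKafkaRecordDeserializationSchema.
import io.circe.generic.auto._
import io.epiphanous.flinkrunner.model.{A, FlinkConfig, MyADT}
import io.epiphanous.flinkrunner.serde.CirceJsonDeserializationSchema
import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema

object KafkaDeserializerSketch {
  // Wrap the circe JSON deserializer as a value-only record deserializer;
  // record keys are ignored in this sketch.
  def aRecordDeserializer(
      config: FlinkConfig[MyADT]): KafkaRecordDeserializationSchema[A] =
    KafkaRecordDeserializationSchema.valueOnly(
      new CirceJsonDeserializationSchema[A, MyADT]("my-kafka-source", config)
    )
}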
@@ -277,13 +278,11 @@ class FlinkRunner[ADT <: FlinkEvent]( new FlinkKinesisConsumer[E]( srcConfig.stream, config - .getKinesisDeserializationSchema(srcConfig.name) - .asInstanceOf[KinesisDeserializationSchema[E]], + .getKinesisDeserializationSchema[E](srcConfig.name), srcConfig.properties ) env .addSource(consumer) - .name(srcConfig.label) } /** @@ -304,8 +303,7 @@ class FlinkRunner[ADT <: FlinkEvent]( case other => other } val ds = config - .getDeserializationSchema(srcConfig.name) - .asInstanceOf[DeserializationSchema[E]] + .getDeserializationSchema[E](srcConfig.name) env .readTextFile(path) .name(s"raw:${srcConfig.label}") @@ -427,15 +425,22 @@ class FlinkRunner[ADT <: FlinkEvent]( stream: DataStream[E], sinkName: String = "" ): Object = { - val name = if (sinkName.isEmpty) config.getSinkNames.head else sinkName - config.getSinkConfig(name) match { - case s: KafkaSinkConfig => toKafka[E](stream, s) - case s: KinesisSinkConfig => toKinesis[E](stream, s) - case s: FileSinkConfig => toFile[E](stream, s) - case s: SocketSinkConfig => toSocket[E](stream, s) - case s: JdbcSinkConfig => toJdbc[E](stream, s) - case s: CassandraSinkConfig => toCassandraSink[E](stream, s) - case s: ElasticsearchSinkConfig => toElasticsearchSink[E](stream, s) + val name = if (sinkName.isEmpty) config.getSinkNames.head else sinkName + val sinkConfig = config.getSinkConfig(name) + val label = sinkConfig.label + sinkConfig match { + case s: KafkaSinkConfig => + toKafka[E](stream, s).uid(label).name(label) + case s: KinesisSinkConfig => + toKinesis[E](stream, s).uid(label).name(label) + case s: FileSinkConfig => toFile[E](stream, s).uid(label).name(label) + case s: SocketSinkConfig => + toSocket[E](stream, s).uid(label).name(label) + case s: JdbcSinkConfig => toJdbc[E](stream, s).uid(label).name(label) + case s: CassandraSinkConfig => + toCassandraSink[E](stream, s).uid(label).name(label) + case s: ElasticsearchSinkConfig => + toElasticsearchSink[E](stream, s).uid(label).name(label) case s => throw new IllegalArgumentException( s"unsupported source connector: ${s.connector}" @@ -460,18 +465,18 @@ class FlinkRunner[ADT <: FlinkEvent]( sinkConfig: KafkaSinkConfig ): DataStreamSink[E] = stream - .addSink( - new FlinkKafkaProducer[E]( - sinkConfig.topic, - config - .getKafkaSerializationSchema(sinkConfig.name) - .asInstanceOf[KafkaSerializationSchema[E]], - sinkConfig.properties, - Semantic.AT_LEAST_ONCE - ) + .sinkTo( + KafkaSink + .builder() + .setKafkaProducerConfig(sinkConfig.properties) + .setRecordSerializer( + config + .getKafkaRecordSerializationSchema[E]( + sinkConfig.name + ) + ) + .build() ) - .uid(sinkConfig.label) - .name(sinkConfig.label) /** * Send stream to a kinesis sink. @@ -503,8 +508,6 @@ class FlinkRunner[ADT <: FlinkEvent]( sink.setDefaultPartition("0") sink } - .uid(sinkConfig.label) - .name(sinkConfig.label) /** * Send stream to a socket sink. @@ -531,8 +534,6 @@ class FlinkRunner[ADT <: FlinkEvent]( .asInstanceOf[AddToJdbcBatchFunction[E]] ) ) - .uid(sinkConfig.label) - .name(sinkConfig.label) /** * Send stream to a rolling file sink. @@ -625,7 +626,7 @@ class FlinkRunner[ADT <: FlinkEvent]( s"Unknown file sink encoder format: '$encoderFormat'" ) } - stream.addSink(sink).uid(sinkConfig.label).name(sinkConfig.label) + stream.addSink(sink) } /** @@ -652,8 +653,6 @@ class FlinkRunner[ADT <: FlinkEvent]( .getSerializationSchema(sinkConfig.name) .asInstanceOf[SerializationSchema[E]] ) - .uid(sinkConfig.label) - .name(sinkConfig.label) /** * Send stream to a cassandra sink. 
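// A minimal sketch (not part of this patch) of a KafkaRecordSerializationSchema
// for the KafkaSink wiring above, reusing the CirceJsonSerializationSchema added
// earlier in this series for the MyADT test member A. The topic "my-topic", the
// sink name "my-kafka-sink", and the object/method names are assumptions for
// illustration; a concrete factory would normally supply this via
// getKafkaRecordSerializationSchema and take the topic from its sink config.
import io.circe.generic.auto._
import io.epiphanous.flinkrunner.model.{A, FlinkConfig, MyADT}
import io.epiphanous.flinkrunner.serde.CirceJsonSerializationSchema
import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema

object KafkaSerializerSketch {
  // Build a value-only record serializer; no key serializer is set in this sketch.
  def aRecordSerializer(
      config: FlinkConfig[MyADT]): KafkaRecordSerializationSchema[A] =
    KafkaRecordSerializationSchema
      .builder[A]()
      .setTopic("my-topic")
      .setValueSerializationSchema(
        new CirceJsonSerializationSchema[A, MyADT]("my-kafka-sink", config)
      )
      .build()
}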
@@ -675,8 +674,6 @@ class FlinkRunner[ADT <: FlinkEvent]( .setHost(sinkConfig.host) .setQuery(sinkConfig.query) .build() - .uid(sinkConfig.label) - .name(sinkConfig.label) /** * Send stream to an elasticsearch sink. @@ -723,7 +720,7 @@ class FlinkRunner[ADT <: FlinkEvent]( indexer.add(req) } ).build() - stream.addSink(esSink).uid(sinkConfig.label).name(sinkConfig.label) + stream.addSink(esSink) } } diff --git a/src/main/scala/io/epiphanous/flinkrunner/FlinkRunnerFactory.scala b/src/main/scala/io/epiphanous/flinkrunner/FlinkRunnerFactory.scala index 3943eb2..1623da8 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/FlinkRunnerFactory.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/FlinkRunnerFactory.scala @@ -9,6 +9,8 @@ import org.apache.flink.api.common.serialization.{ Encoder, SerializationSchema } +import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema +import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema import org.apache.flink.streaming.api.functions.sink.filesystem.BucketAssigner import org.apache.flink.streaming.connectors.kafka.{ KafkaDeserializationSchema, @@ -40,6 +42,14 @@ trait FlinkRunnerFactory[ADT <: FlinkEvent] { config: FlinkConfig[ADT]): KafkaDeserializationSchema[E] = ??? + def getKafkaRecordSerializationSchema[E <: ADT]( + name: String, + config: FlinkConfig[ADT]): KafkaRecordSerializationSchema[E] = ??? + + def getKafkaRecordDeserializationSchema[E <: ADT]( + name: String, + config: FlinkConfig[ADT]): KafkaRecordDeserializationSchema[E] = ??? + def getKinesisDeserializationSchema[E <: ADT]( name: String, config: FlinkConfig[ADT]): KinesisDeserializationSchema[E] = ??? diff --git a/src/main/scala/io/epiphanous/flinkrunner/algorithm/membership/StableBloomFilter.scala b/src/main/scala/io/epiphanous/flinkrunner/algorithm/membership/StableBloomFilter.scala index 26db8d1..7a10fde 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/algorithm/membership/StableBloomFilter.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/algorithm/membership/StableBloomFilter.scala @@ -41,7 +41,7 @@ case class StableBloomFilter[T]( ) /** number of cells per unit storage */ - val storedCells: Int = Math.floor(STORAGE_BITS / d).toInt + val storedCells: Int = STORAGE_BITS / d /** number of bits used per unit storage */ val storedBits: Int = storedCells * d diff --git a/src/main/scala/io/epiphanous/flinkrunner/avro/ConfluentSchemaRegistryClient.scala b/src/main/scala/io/epiphanous/flinkrunner/avro/ConfluentSchemaRegistryClient.scala index dbd45c9..b41c16b 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/avro/ConfluentSchemaRegistryClient.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/avro/ConfluentSchemaRegistryClient.scala @@ -8,7 +8,7 @@ import io.epiphanous.flinkrunner.model.{FlinkConfig, FlinkEvent} import io.epiphanous.flinkrunner.util.StringUtils import org.apache.avro.Schema.Parser import org.apache.flink.api.common.typeinfo.TypeInformation -import org.apache.flink.runtime.concurrent.Executors.directExecutionContext +import org.apache.flink.util.concurrent.Executors import org.http4s.EntityDecoder import org.http4s.circe.jsonOf import org.http4s.client.Client @@ -16,7 +16,7 @@ import org.http4s.client.blaze.BlazeClientBuilder import java.nio.ByteBuffer import java.util.concurrent.TimeUnit -import scala.concurrent.ExecutionContext +import scala.concurrent.{ExecutionContext, ExecutionContextExecutor} import scala.util.{Failure, Success, Try} class ConfluentSchemaRegistryClient[ADT <: FlinkEvent: 
TypeInformation]( @@ -37,7 +37,8 @@ class ConfluentSchemaRegistryClient[ADT <: FlinkEvent: TypeInformation]( jsonOf[IO, ConfluentSchemaRegistryResponse] @transient - lazy implicit val ec: ExecutionContext = directExecutionContext() + lazy implicit val ec: ExecutionContextExecutor = + ExecutionContext.fromExecutor(Executors.directExecutor()) @transient lazy implicit val cs: ContextShift[IO] = IO.contextShift(ec) diff --git a/src/main/scala/io/epiphanous/flinkrunner/model/FlinkConfig.scala b/src/main/scala/io/epiphanous/flinkrunner/model/FlinkConfig.scala index 17d0389..72d5685 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/model/FlinkConfig.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/model/FlinkConfig.scala @@ -5,6 +5,7 @@ import com.typesafe.scalalogging.LazyLogging import io.epiphanous.flinkrunner.model.ConfigToProps.RichConfigObject import io.epiphanous.flinkrunner.{FlinkRunnerFactory, SEE} import org.apache.flink.api.java.utils.ParameterTool +import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment @@ -151,6 +152,10 @@ class FlinkConfig[ADT <: FlinkEvent]( def getKafkaDeserializationSchema[E <: ADT](name: String) = factory.getKafkaDeserializationSchema[E](name, this) + def getKafkaRecordDeserializationSchema[E <: ADT]( + name: String): KafkaRecordDeserializationSchema[E] = + factory.getKafkaRecordDeserializationSchema[E](name, this) + def getKinesisDeserializationSchema[E <: ADT](name: String) = factory.getKinesisDeserializationSchema[E](name, this) @@ -160,6 +165,9 @@ class FlinkConfig[ADT <: FlinkEvent]( def getKafkaSerializationSchema[E <: ADT](name: String) = factory.getKafkaSerializationSchema[E](name, this) + def getKafkaRecordSerializationSchema[E <: ADT](name: String) = + factory.getKafkaRecordSerializationSchema[E](name, this) + def getKinesisSerializationSchema[E <: ADT](name: String) = factory.getKinesisSerializationSchema[E](name, this) diff --git a/src/main/scala/io/epiphanous/flinkrunner/operator/EnrichmentAsyncFunction.scala b/src/main/scala/io/epiphanous/flinkrunner/operator/EnrichmentAsyncFunction.scala index 07b27a7..8c54587 100644 --- a/src/main/scala/io/epiphanous/flinkrunner/operator/EnrichmentAsyncFunction.scala +++ b/src/main/scala/io/epiphanous/flinkrunner/operator/EnrichmentAsyncFunction.scala @@ -5,7 +5,7 @@ import com.google.common.cache.{CacheBuilder, CacheLoader} import com.typesafe.scalalogging.LazyLogging import io.circe.Decoder import io.epiphanous.flinkrunner.model.{FlinkConfig, FlinkEvent} -import org.apache.flink.runtime.concurrent.Executors.directExecutionContext +import org.apache.flink.util.concurrent.Executors import org.apache.flink.streaming.api.scala.async.{ AsyncFunction, ResultFuture @@ -75,7 +75,8 @@ abstract class EnrichmentAsyncFunction[ lazy implicit val entityDecoder: EntityDecoder[IO, CV] = jsonOf[IO, CV] @transient - lazy implicit val ec: ExecutionContext = directExecutionContext() + lazy implicit val ec: ExecutionContext = + ExecutionContext.fromExecutor(Executors.directExecutor()) @transient lazy implicit val cs: ContextShift[IO] = IO.contextShift(ec) diff --git a/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroKafkaSerializationSchema.scala b/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroKafkaSerializationSchema.scala deleted file mode 100644 index 84b81fa..0000000 --- 
a/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroKafkaSerializationSchema.scala +++ /dev/null @@ -1,47 +0,0 @@ -package io.epiphanous.flinkrunner.serde - -import com.typesafe.scalalogging.LazyLogging -import io.epiphanous.flinkrunner.model.{ - FlinkConfig, - FlinkEvent, - KafkaSinkConfig, - KafkaSourceConfig -} -import org.apache.avro.specific.SpecificRecord -import org.apache.flink.api.common.serialization.SerializationSchema -import org.apache.flink.formats.avro.registry.confluent.ConfluentRegistryAvroSerializationSchema -import org.apache.flink.streaming.connectors.kafka.KafkaSerializationSchema -import org.apache.kafka.clients.producer.ProducerRecord - -import java.lang - -/** - * A schema to serialize an ADT event using a confluent avro schema - * registry. An implementing class must provide a Flink - * [[ConfluentRegistryAvroSerializationSchema]] to interface with the - * schema registry. That registry is specific to a type that implements - * Avro's [[SpecificRecord]] interface type. - * @param sinkName - * name of the sink stream - * @param config - * flink runner config - * @tparam E - * the event type we are serializing from, which is a member of the ADT - * @tparam ADT - * the flink runner ADT - */ -abstract class ConfluentAvroKafkaSerializationSchema[ - E <: ADT, - ADT <: FlinkEvent]( - sinkName: String, - config: FlinkConfig[ADT] -) extends KafkaSerializationSchema[E] - with LazyLogging { - - val sinkConfig: KafkaSinkConfig = - config.getSourceConfig(sinkName).asInstanceOf[KafkaSinkConfig] - - override def serialize( - element: E, - timestamp: lang.Long): ProducerRecord[Array[Byte], Array[Byte]] = ??? -} diff --git a/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaDeserializationSchema.scala b/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaDeserializationSchema.scala deleted file mode 100644 index 3c18fbb..0000000 --- a/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaDeserializationSchema.scala +++ /dev/null @@ -1,108 +0,0 @@ -package io.epiphanous.flinkrunner.serde - -import com.typesafe.scalalogging.LazyLogging -import io.epiphanous.flinkrunner.model.{ - FlinkConfig, - FlinkEvent, - KafkaSourceConfig -} -import org.apache.avro.specific.SpecificRecord -import org.apache.flink.api.common.serialization.DeserializationSchema -import org.apache.flink.api.common.typeinfo.{TypeHint, TypeInformation} -import org.apache.flink.formats.avro.registry.confluent.ConfluentRegistryAvroDeserializationSchema -import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema -import org.apache.kafka.clients.consumer.ConsumerRecord - -/** - * A schema to deserialize bytes from kafka into an ADT event using a - * confluent avro schema registry. An implementing class must provide a - * Flink [[ConfluentRegistryAvroDeserializationSchema]] to interface with - * the schema registry. That registry is specific to a type that implements - * Avro's [[SpecificRecord]] interface. The implementing class must also - * provide a [[deserializeSpecificRecord]] method that deserializes an - * array of bytes into a specific record type as well as a - * [[fromSpecificRecord]] method that converts that type into a type that - * is a member of the ADT. 
- * - * @param sourceName - * name of the source stream - * @param config - * flink runner config - * @tparam E - * the event type we are producing here, which is a member of the ADT - * @tparam ADT - * the flink runner ADT - */ -abstract class ConfluentAvroRegistryKafkaDeserializationSchema[ - E <: ADT, - ADT <: FlinkEvent -]( - sourceName: String, - config: FlinkConfig[ADT] -) extends KafkaDeserializationSchema[E] - with LazyLogging { - - val sourceConfig: KafkaSourceConfig = - config.getSourceConfig(sourceName).asInstanceOf[KafkaSourceConfig] - - /** - * Implementing classes must provide a a confluent schema registry - * deserialization schema for specific records of type T. - * @tparam K - * specific record type - * @return - * ConfluentRegistryAvroDeserializationSchema[K] - */ - def schemaRegistryKeyDeserializer[K] - : ConfluentRegistryAvroDeserializationSchema[K] - - /** - * A helper method to use the provided schema registry deserialization - * schema to deserialize a kafka message into a specific record instance. - * @param message - * the kafka message - * @return - * an instance of specific record type T - */ - def deserializeSpecificRecord[T <: SpecificRecord]( - message: Array[Byte], - isKey: Boolean = false): T = ??? - ///schemaRegistryDeserializer.deserialize(message) - - /** - * Convert a deserialized specific record instance into an instance of - * our produced event type. Must be defined by implementing classes. - * @param key - * an optional key of type K - * @param value - * a value of specific record type V - * @tparam K - * the type of the key - * @tparam V - * the type of the value, subtype of avro specific record - * @return - * an instance of the flink runner ADT - */ - def fromSpecificRecord[K, V <: SpecificRecord]( - key: Option[K], - value: V): E - - def deserializeKey[K](key: Array[Byte]): K - - override def deserialize( - record: ConsumerRecord[Array[Byte], Array[Byte]]): E = ??? 
-// { -// val key = -// if (sourceConfig.isKeyed) Some(deserializeKey(record.key())) -// else None -// fromSpecificRecord( -// key, -// schemaRegistryDeserializer.deserialize(record.value()) -// ) -// } - - override def isEndOfStream(nextElement: E): Boolean = false - - override def getProducedType: TypeInformation[E] = - TypeInformation.of(new TypeHint[E] {}) -} diff --git a/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaRecordDeserializationSchema.scala b/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaRecordDeserializationSchema.scala new file mode 100644 index 0000000..6e2ac0b --- /dev/null +++ b/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaRecordDeserializationSchema.scala @@ -0,0 +1,112 @@ +package io.epiphanous.flinkrunner.serde + +import com.typesafe.scalalogging.LazyLogging +import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient +import io.confluent.kafka.serializers.{ + KafkaAvroDeserializer, + KafkaAvroDeserializerConfig +} +import io.epiphanous.flinkrunner.model.{ + FlinkConfig, + FlinkEvent, + KafkaSourceConfig +} +import io.epiphanous.flinkrunner.serde.ConfluentAvroRegistryKafkaRecordDeserializationSchema.DEFAULT_CACHE_CAPACITY +import org.apache.avro.specific.SpecificRecord +import org.apache.flink.api.common.typeinfo.{TypeHint, TypeInformation} +import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema +import org.apache.flink.formats.avro.registry.confluent.ConfluentRegistryAvroDeserializationSchema +import org.apache.flink.util.Collector +import org.apache.kafka.clients.consumer.ConsumerRecord + +import java.util +import scala.collection.JavaConverters.mapAsJavaMapConverter +import scala.collection.mutable + +/** + * A schema to deserialize bytes from kafka into an ADT event using a + * confluent avro schema registry. 
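+ *
+ * A concrete subclass only needs to implement [[fromConsumerRecord]].
+ * The following is a hypothetical sketch (the `MyEvent` and `MyADT`
+ * types and the `MyEvent.fromAvro` helper are user-defined and not
+ * part of flinkrunner):
+ * {{{
+ * class MyEventDeserializationSchema(config: FlinkConfig[MyADT])
+ *     extends ConfluentAvroRegistryKafkaRecordDeserializationSchema[
+ *       MyEvent,
+ *       MyADT]("my-kafka-source", config) {
+ *   override def fromConsumerRecord(
+ *       record: ConsumerRecord[Array[Byte], Array[Byte]]): MyEvent = {
+ *     val key =
+ *       if (sourceConfig.isKeyed)
+ *         Option(deserializers("key").deserialize(topic, record.key()))
+ *       else None
+ *     val value =
+ *       deserializers("value").deserialize(topic, record.value())
+ *     MyEvent.fromAvro(key, value)
+ *   }
+ * }
+ * }}}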
+ * + * @param sourceName + * name of the source stream + * @param config + * flink runner config + * @tparam E + * the event type we are producing here, which is a member of the ADT + * @tparam ADT + * the flink runner ADT + */ +abstract class ConfluentAvroRegistryKafkaRecordDeserializationSchema[ + E <: ADT, + ADT <: FlinkEvent +]( + sourceName: String, + config: FlinkConfig[ADT] +) extends KafkaRecordDeserializationSchema[E] + with LazyLogging { + + val sourceConfig: KafkaSourceConfig = + config.getSourceConfig(sourceName).asInstanceOf[KafkaSourceConfig] + + val topic: String = sourceConfig.topic + + val url: String = + sourceConfig.properties.getProperty("schema.registry.url") + val cacheCapacity: Int = sourceConfig.properties + .getProperty("schema.registry.cache.capacity", DEFAULT_CACHE_CAPACITY) + .toInt + val useSpecificAvroReader: Boolean = sourceConfig.properties + .getProperty("use.specific.avro.reader", "true") + .toBoolean + val useLogicalTypes: Boolean = sourceConfig.properties + .getProperty("use.logical.type.converters", "true") + .toBoolean + + /** create deserializer config */ + val deserializerConfig: util.Map[String, Boolean] = Map( + KafkaAvroDeserializerConfig.SPECIFIC_AVRO_READER_CONFIG -> useSpecificAvroReader, + KafkaAvroDeserializerConfig.AVRO_USE_LOGICAL_TYPE_CONVERTERS_CONFIG -> useLogicalTypes + ).asJava + + /** our schema registry client */ + val schemaRegistryClient = + new CachedSchemaRegistryClient(url, cacheCapacity) + + /** map to store the value, and optionally, key deserializers */ + val deserializers: mutable.Map[String, KafkaAvroDeserializer] = + mutable.Map( + "value" -> new KafkaAvroDeserializer( + schemaRegistryClient, + deserializerConfig + ) + ) + + /** add the key deserializer if needed */ + if (sourceConfig.isKeyed) { + val keyDeserializer = new KafkaAvroDeserializer(schemaRegistryClient) + keyDeserializer.configure(deserializerConfig, true) + deserializers += ("key" -> keyDeserializer) + } + + /** + * Convert a kafka consumer record instance into an instance of our + * produced event type. Must be defined by implementing classes. 
+ * @param record
+ *   a kafka consumer record
+ * @return
+ *   an instance of the flink runner ADT
+ */
+  def fromConsumerRecord(
+      record: ConsumerRecord[Array[Byte], Array[Byte]]): E
+
+  override def deserialize(
+      record: ConsumerRecord[Array[Byte], Array[Byte]],
+      out: Collector[E]): Unit =
+    out.collect(fromConsumerRecord(record))
+
+  override def getProducedType: TypeInformation[E] =
+    TypeInformation.of(new TypeHint[E] {})
+}
+
+object ConfluentAvroRegistryKafkaRecordDeserializationSchema {
+  val DEFAULT_CACHE_CAPACITY = "1000"
+}
diff --git a/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaRecordSerializationSchema.scala b/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaRecordSerializationSchema.scala
new file mode 100644
index 0000000..120a2c0
--- /dev/null
+++ b/src/main/scala/io/epiphanous/flinkrunner/serde/ConfluentAvroRegistryKafkaRecordSerializationSchema.scala
@@ -0,0 +1,109 @@
+package io.epiphanous.flinkrunner.serde
+
+import com.typesafe.scalalogging.LazyLogging
+import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient
+import io.confluent.kafka.serializers.{
+  KafkaAvroSerializer,
+  KafkaAvroSerializerConfig
+}
+import io.epiphanous.flinkrunner.model.{
+  FlinkConfig,
+  FlinkEvent,
+  KafkaSinkConfig
+}
+import io.epiphanous.flinkrunner.serde.ConfluentAvroRegistryKafkaRecordSerializationSchema.DEFAULT_CACHE_CAPACITY
+import org.apache.avro.specific.SpecificRecord
+import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema
+import org.apache.flink.formats.avro.registry.confluent.ConfluentRegistryAvroSerializationSchema
+import org.apache.kafka.clients.producer.ProducerRecord
+
+import java.{lang, util}
+import scala.collection.JavaConverters.mapAsJavaMapConverter
+import scala.collection.mutable
+
+/**
+ * A schema to serialize an ADT event using a confluent avro schema
+ * registry. An implementing class must provide a [[toProducerRecord]]
+ * method that converts the event into a kafka [[ProducerRecord]] of
+ * byte arrays, typically using the [[KafkaAvroSerializer]] instances
+ * configured in [[serializers]].
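+ *
+ * A concrete subclass only needs to implement [[toProducerRecord]].
+ * The following is a hypothetical sketch (the `MyEvent` and `MyADT`
+ * types and the `toAvro` helper are user-defined and not part of
+ * flinkrunner):
+ * {{{
+ * class MyEventSerializationSchema(config: FlinkConfig[MyADT])
+ *     extends ConfluentAvroRegistryKafkaRecordSerializationSchema[
+ *       MyEvent,
+ *       MyADT]("my-kafka-sink", config) {
+ *   override def toProducerRecord(
+ *       element: MyEvent): ProducerRecord[Array[Byte], Array[Byte]] = {
+ *     val topic = sinkConfig.topic
+ *     val value = serializers("value").serialize(topic, element.toAvro)
+ *     new ProducerRecord[Array[Byte], Array[Byte]](topic, value)
+ *   }
+ * }
+ * }}}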
+ * @param sinkName
+ *   name of the sink stream
+ * @param config
+ *   flink runner config
+ * @tparam E
+ *   the event type we are serializing from, which is a member of the ADT
+ * @tparam ADT
+ *   the flink runner ADT
+ */
+abstract class ConfluentAvroRegistryKafkaRecordSerializationSchema[
+    E <: ADT,
+    ADT <: FlinkEvent](
+    sinkName: String,
+    config: FlinkConfig[ADT]
+) extends KafkaRecordSerializationSchema[E]
+    with LazyLogging {
+
+  val sinkConfig: KafkaSinkConfig =
+    config.getSinkConfig(sinkName).asInstanceOf[KafkaSinkConfig]
+
+  val url: String =
+    sinkConfig.properties.getProperty("schema.registry.url")
+  val cacheCapacity: Int = sinkConfig.properties
+    .getProperty("schema.registry.cache.capacity", DEFAULT_CACHE_CAPACITY)
+    .toInt
+  val removeJavaProps: Boolean = sinkConfig.properties
+    .getProperty("serializer.remove.java.props", "true")
+    .toBoolean
+  val useLogicalTypes: Boolean = sinkConfig.properties
+    .getProperty("serializer.use.logical.type.converters", "true")
+    .toBoolean
+
+  /** create serializer config */
+  val serializerConfig: util.Map[String, Boolean] = Map(
+    KafkaAvroSerializerConfig.AVRO_REMOVE_JAVA_PROPS_CONFIG -> removeJavaProps,
+    KafkaAvroSerializerConfig.AVRO_USE_LOGICAL_TYPE_CONVERTERS_CONFIG -> useLogicalTypes
+  ).asJava
+
+  /** our schema registry client */
+  val schemaRegistryClient =
+    new CachedSchemaRegistryClient(url, cacheCapacity)
+
+  /** map to store the value, and optionally, key serializers */
+  val serializers: mutable.Map[String, KafkaAvroSerializer] =
+    mutable.Map(
+      "value" -> new KafkaAvroSerializer(
+        schemaRegistryClient,
+        serializerConfig
+      )
+    )
+
+  /** add the key serializer if needed */
+  if (sinkConfig.isKeyed) {
+    val keySerializer = new KafkaAvroSerializer(schemaRegistryClient)
+    keySerializer.configure(serializerConfig, true)
+    serializers += ("key" -> keySerializer)
+  }
+
+  /**
+   * Convert an element into a producer record of byte arrays. Must be
+   * defined by implementing classes.
+ * @param element + * an instance of the flinkrunner ADT + * @return + * ProducerRecord of bytes + */ + def toProducerRecord( + element: E): ProducerRecord[Array[Byte], Array[Byte]] + + override def serialize( + element: E, + context: KafkaRecordSerializationSchema.KafkaSinkContext, + timestamp: lang.Long): ProducerRecord[Array[Byte], Array[Byte]] = + toProducerRecord(element) + +} + +object ConfluentAvroRegistryKafkaRecordSerializationSchema { + val DEFAULT_CACHE_CAPACITY = "1000" +} diff --git a/src/test/scala/io/epiphanous/flinkrunner/util/BoundedLatenessGeneratorTest.scala b/src/test/scala/io/epiphanous/flinkrunner/util/BoundedLatenessGeneratorTest.scala index 429c56f..27c459a 100644 --- a/src/test/scala/io/epiphanous/flinkrunner/util/BoundedLatenessGeneratorTest.scala +++ b/src/test/scala/io/epiphanous/flinkrunner/util/BoundedLatenessGeneratorTest.scala @@ -1,16 +1,16 @@ package io.epiphanous.flinkrunner.util import io.epiphanous.flinkrunner.UnitSpec -import io.epiphanous.flinkrunner.model.FlinkEvent import org.apache.flink.api.common.eventtime.{Watermark, WatermarkOutput} +import org.scalatest.Assertion import java.util.UUID.randomUUID import scala.util.Random class BoundedLatenessGeneratorTest extends UnitSpec { - val now = System.currentTimeMillis() - val random = new Random() + val now: Long = System.currentTimeMillis() + val random = new Random() def getBlg(maxAllowedLateness: Long = 10L, streamID: String = "Test") = new BoundedLatenessGenerator[TestEvent]( @@ -18,13 +18,14 @@ class BoundedLatenessGeneratorTest extends UnitSpec { streamID ) - def uuid = randomUUID().toString + def uuid: String = randomUUID().toString - def nextEvent(ts: Long) = TestEvent(uuid, ts) + def nextEvent(ts: Long): TestEvent = TestEvent(uuid, ts) - def ascending(space: Long = 5) = (prev: Long) => prev + space + def ascending(space: Long = 5): Long => Long = (prev: Long) => + prev + space - def randomWalk(minSpace: Long = -20, maxSpace: Long = 40) = + def randomWalk(minSpace: Long = -20, maxSpace: Long = 40): Long => Long = (prev: Long) => prev + Math.ceil(minSpace + random.nextDouble() * maxSpace).toLong @@ -33,7 +34,7 @@ class BoundedLatenessGeneratorTest extends UnitSpec { progress: Long => Long = randomWalk(), probSpike: Double = 0, spikeSize: Long = 50000 - ) = + ): Stream[TestEvent] = Stream .iterate((TestEvent(uuid, start), start)) { case (_, timeline) => val spike = if (random.nextDouble() < probSpike) spikeSize else 0L @@ -46,7 +47,7 @@ class BoundedLatenessGeneratorTest extends UnitSpec { start: Long = now - 1000L, space: Long = 5, probSpike: Double = 0, - spikeSize: Long = 50000) = + spikeSize: Long = 50000): Stream[TestEvent] = events(start, ascending(space), probSpike, spikeSize) def randomEvents( @@ -55,7 +56,7 @@ class BoundedLatenessGeneratorTest extends UnitSpec { maxSpace: Long = 40, probSpike: Double = 0, spikeSize: Long = 50000 - ) = + ): Stream[TestEvent] = events(start, randomWalk(minSpace, maxSpace), probSpike, spikeSize) def randomEventsWithSpike( @@ -64,7 +65,7 @@ class BoundedLatenessGeneratorTest extends UnitSpec { maxSpace: Long = 40, probSpike: Double = .20, spikeSize: Long = 50000 - ) = + ): Stream[TestEvent] = randomEvents(start, minSpace, maxSpace, probSpike, spikeSize) behavior of "BoundedLatenessGenerator" @@ -88,13 +89,14 @@ class BoundedLatenessGeneratorTest extends UnitSpec { assert(seenSpaces.head === space) } - def watermarkTest(testEvents: Stream[TestEvent]) = { + def watermarkTest(testEvents: Stream[TestEvent]): Assertion = { var prevTs = -1L val maxLateness = 10L 
val blg = getBlg(maxLateness) val wmo = new WatermarkOutput { override def emitWatermark(watermark: Watermark): Unit = {} override def markIdle(): Unit = {} + override def markActive(): Unit = {} } var maxTs = 0L val result = testEvents