From 453e6d5b1531afb69d880d5996bb8e706f3bc353 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Thu, 5 Nov 2015 23:35:55 +0100
Subject: [PATCH 01/80] Start of work to support many-to-many
 sources/sinks/mappings.

Changes include:
 - A new configuration layout for supporting multiple sources, sinks and mappings.
 - No more enqueuing delay. Queues tend to either be full or empty, and are really
   only effective at smoothing jitter. Spinning to delay doesn't really help, so we
   just drop the events immediately instead of blocking.
 - No more session-binning strategy for HDFS. In practice we didn't use this, and
   it's complicated to do properly when multiple sources (with their own idea of
   sessions) might be feeding into a sink.

For now, the code only supports the existing single browser-based source and
HDFS/Kafka sinks. Configuration validation is also not yet implemented.
---
 build.gradle | 1 +
 .../server/IncomingRequestProcessingPool.java | 15 +-
 .../server/IncomingRequestProcessor.java | 30 +-
 .../io/divolte/server/MappingTestServer.java | 2 +-
 src/main/java/io/divolte/server/Server.java | 26 +-
 .../config/BrowserSourceConfiguration.java | 59 +++
 .../server/config/DivolteConfiguration.java | 90 +++-
 .../config/FileStrategyConfiguration.java | 74 ++-
 .../server/config/GlobalConfiguration.java | 37 ++
 .../server/config/HdfsConfiguration.java | 26 +-
 .../config/HdfsFlusherConfiguration.java | 38 --
 .../server/config/HdfsSinkConfiguration.java | 29 ++
 ...IncomingRequestProcessorConfiguration.java | 38 --
 .../config/JavascriptConfiguration.java | 35 +-
 .../server/config/KafkaConfiguration.java | 32 ++
 .../config/KafkaFlusherConfiguration.java | 39 --
 .../server/config/KafkaSinkConfiguration.java | 24 +
 .../server/config/MapperConfiguration.java | 41 ++
 .../server/config/MappingConfiguration.java | 54 ++
 .../config/SchemaMappingConfiguration.java | 23 -
 .../server/config/ServerConfiguration.java | 18 +-
 ...ssionBinningFileStrategyConfiguration.java | 37 --
 ...impleRollingFileStrategyConfiguration.java | 28 --
 .../server/config/SinkConfiguration.java | 24 +
 .../server/config/SinkTypeConfiguration.java | 31 ++
 .../server/config/SourceConfiguration.java | 23 +
 .../server/config/TrackingConfiguration.java | 48 --
 .../server/config/UaParserConfiguration.java | 23 -
 .../config/UserAgentParserConfiguration.java | 37 ++
 .../server/config/ValidatedConfiguration.java | 19 +-
 .../hdfs/FileCreateAndSyncStrategy.java | 17 +-
 .../io/divolte/server/hdfs/HdfsFlusher.java | 59 +--
 .../divolte/server/hdfs/HdfsFlushingPool.java | 8 +-
 .../hdfs/SessionBinningFileStrategy.java | 435 ----------------
 .../hdfs/SimpleRollingFileStrategy.java | 31 +-
 .../server/js/TrackingJavaScriptResource.java | 34 +-
 .../io/divolte/server/kafka/KafkaFlusher.java | 27 +-
 .../server/kafka/KafkaFlushingPool.java | 10 +-
 .../server/processing/ProcessingPool.java | 21 +-
 .../server/recordmapping/DslRecordMapper.java | 4 -
 .../UserAgentParserAndCache.java | 13 +-
 src/main/resources/reference.conf | 468 +++++-------------
 .../divolte/server/DslRecordMapperTest.java | 17 +-
 .../io/divolte/server/ServerTestUtils.java | 21 +-
 .../config/ValidatedConfigurationTest.java | 13 +-
 .../divolte/server/hdfs/HdfsFlusherTest.java | 195 ++++----
 .../hdfs/SessionBinningFileStrategyTest.java | 304 ------------
 .../checksum-discard-corrupt-test.conf | 2 +-
 src/test/resources/checksum-test.conf | 2 +-
 src/test/resources/dsl-mapping-test.conf | 4 +-
 src/test/resources/duplicates-test.conf | 2 +-
 .../resources/hdfs-flusher-binning-test.conf | 42 -
src/test/resources/hdfs-flusher-test.conf | 29 +- src/test/resources/reference-test.conf | 47 +- src/test/resources/x-forwarded-for-test.conf | 2 +- 55 files changed, 956 insertions(+), 1852 deletions(-) create mode 100644 src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java create mode 100644 src/main/java/io/divolte/server/config/GlobalConfiguration.java delete mode 100644 src/main/java/io/divolte/server/config/HdfsFlusherConfiguration.java create mode 100644 src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java delete mode 100644 src/main/java/io/divolte/server/config/IncomingRequestProcessorConfiguration.java create mode 100644 src/main/java/io/divolte/server/config/KafkaConfiguration.java delete mode 100644 src/main/java/io/divolte/server/config/KafkaFlusherConfiguration.java create mode 100644 src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java create mode 100644 src/main/java/io/divolte/server/config/MapperConfiguration.java create mode 100644 src/main/java/io/divolte/server/config/MappingConfiguration.java delete mode 100644 src/main/java/io/divolte/server/config/SchemaMappingConfiguration.java delete mode 100644 src/main/java/io/divolte/server/config/SessionBinningFileStrategyConfiguration.java delete mode 100644 src/main/java/io/divolte/server/config/SimpleRollingFileStrategyConfiguration.java create mode 100644 src/main/java/io/divolte/server/config/SinkConfiguration.java create mode 100644 src/main/java/io/divolte/server/config/SinkTypeConfiguration.java create mode 100644 src/main/java/io/divolte/server/config/SourceConfiguration.java delete mode 100644 src/main/java/io/divolte/server/config/TrackingConfiguration.java delete mode 100644 src/main/java/io/divolte/server/config/UaParserConfiguration.java create mode 100644 src/main/java/io/divolte/server/config/UserAgentParserConfiguration.java delete mode 100644 src/main/java/io/divolte/server/hdfs/SessionBinningFileStrategy.java delete mode 100644 src/test/java/io/divolte/server/hdfs/SessionBinningFileStrategyTest.java delete mode 100644 src/test/resources/hdfs-flusher-binning-test.conf diff --git a/build.gradle b/build.gradle index 0aef39cc..983b060a 100644 --- a/build.gradle +++ b/build.gradle @@ -114,6 +114,7 @@ dependencies { compile group: 'com.jayway.jsonpath', name: 'json-path', version: '2.0.0' compile group: 'com.fasterxml.jackson.core', name:'jackson-databind', version: '2.6.3' compile group: 'com.fasterxml.jackson.datatype', name:'jackson-datatype-jdk8', version: '2.6.3' + compile group: 'com.fasterxml.jackson.datatype', name:'jackson-datatype-guava', version: '2.6.3' compile group: 'com.fasterxml.jackson.module', name:'jackson-module-parameter-names', version: '2.6.3' compile group: 'com.jasonclawson', name: 'jackson-dataformat-hocon', version: '1.1.0' diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java index dabd7135..f7716208 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java @@ -46,13 +46,12 @@ final class IncomingRequestProcessingPool extends ProcessingPool new IncomingRequestProcessor(vc, kafkaFlushingPool, hdfsFlushingPool, geoipLookupService, schema, listener)); @@ -80,7 +77,7 @@ public IncomingRequestProcessingPool( } private static Schema schemaFromConfig(final ValidatedConfiguration vc) { - return vc.configuration().tracking.schemaFile + return 
vc.configuration().incomingRequestProcessor.schemaFile .map((schemaFileName) -> { final Parser parser = new Schema.Parser(); logger.info("Using Avro schema from configuration: {}", schemaFileName); @@ -99,7 +96,7 @@ private static Schema schemaFromConfig(final ValidatedConfiguration vc) { @Nullable private static LookupService lookupServiceFromConfig(final ValidatedConfiguration vc) { - return vc.configuration().tracking.ip2geoDatabase + return vc.configuration().global.mapper.ip2geoDatabase .map((path) -> { try { return new ExternalDatabaseLookupService(Paths.get(path)); diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessor.java b/src/main/java/io/divolte/server/IncomingRequestProcessor.java index 0ccf77e9..6525dd3a 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessor.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessor.java @@ -77,30 +77,18 @@ public IncomingRequestProcessor(final ValidatedConfiguration vc, keepCorrupted = !vc.configuration().incomingRequestProcessor.discardCorrupted; - memory = new ShortTermDuplicateMemory(vc.configuration().incomingRequestProcessor.duplicateMemorySize); + memory = new ShortTermDuplicateMemory(vc.configuration().global.mapper.duplicateMemorySize); keepDuplicates = !vc.configuration().incomingRequestProcessor.discardDuplicates; - mapper = vc.configuration().tracking.schemaMapping - .map((smc) -> { - final int version = smc.version; - switch(version) { - case 1: - logger.error("Version 1 configuration version had been deprecated and is no longer supported."); - throw new RuntimeException("Unsupported schema mapping config version: " + version); - case 2: + mapper = vc.configuration().incomingRequestProcessor.mappingScriptFile + .map((mappingScriptFile) -> { logger.info("Using script based schema mapping."); - return new DslRecordMapper( - vc, - Objects.requireNonNull(schema), - Optional.ofNullable(geoipLookupService)); - default: - throw new RuntimeException("Unsupported schema mapping config version: " + version); - } - }) - .orElseGet(() -> { - logger.info("Using built in default schema mapping."); - return new DslRecordMapper(DefaultEventRecord.getClassSchema(), defaultRecordMapping(vc)); - }); + return new DslRecordMapper(vc, mappingScriptFile, Objects.requireNonNull(schema), Optional.ofNullable(geoipLookupService)); + + }).orElseGet(() -> { + logger.info("Using built in default schema mapping."); + return new DslRecordMapper(DefaultEventRecord.getClassSchema(), defaultRecordMapping(vc)); + }); } private DslRecordMapping defaultRecordMapping(final ValidatedConfiguration vc) { diff --git a/src/main/java/io/divolte/server/MappingTestServer.java b/src/main/java/io/divolte/server/MappingTestServer.java index ca755793..7bc96d9f 100644 --- a/src/main/java/io/divolte/server/MappingTestServer.java +++ b/src/main/java/io/divolte/server/MappingTestServer.java @@ -94,7 +94,7 @@ private Schema loadSchema(final String schemaFilename) throws IOException { @Nullable private static LookupService lookupServiceFromConfig(final ValidatedConfiguration vc) { - return vc.configuration().tracking.ip2geoDatabase + return vc.configuration().global.mapper.ip2geoDatabase .map((path) -> { try { return new ExternalDatabaseLookupService(Paths.get(path)); diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index ef8f1de0..ad618642 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -16,6 +16,7 @@ package io.divolte.server; 
+import com.typesafe.config.ConfigFactory; import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.js.TrackingJavaScriptResource; import io.undertow.Undertow; @@ -31,18 +32,14 @@ import io.undertow.server.handlers.resource.ResourceManager; import io.undertow.util.Headers; import io.undertow.util.Methods; - -import java.io.IOException; -import java.time.Duration; -import java.util.Optional; - -import javax.annotation.ParametersAreNonnullByDefault; - import org.apache.hadoop.fs.FileSystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.typesafe.config.ConfigFactory; +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.time.Duration; +import java.util.Optional; @ParametersAreNonnullByDefault public final class Server implements Runnable { @@ -61,8 +58,8 @@ public Server(final ValidatedConfiguration vc) { } Server(final ValidatedConfiguration vc, final IncomingRequestListener listener) { - host = vc.configuration().server.host; - port = vc.configuration().server.port; + host = vc.configuration().global.server.host; + port = vc.configuration().global.server.port; processingPool = new IncomingRequestProcessingPool(vc, listener); final ClientSideCookieEventHandler clientSideCookieEventHandler = @@ -71,19 +68,20 @@ public Server(final ValidatedConfiguration vc) { final HttpHandler javascriptHandler = new AllowedMethodsHandler(new JavaScriptHandler(trackingJavaScript), Methods.GET); final PathHandler handler = new PathHandler(); - handler.addExactPath("/csc-event", + handler.addExactPath(vc.configuration().browserSourceConfiguration.prefix + "csc-event", new AllowedMethodsHandler(clientSideCookieEventHandler, Methods.GET)); - handler.addExactPath('/' + trackingJavaScript.getScriptName(), javascriptHandler); + handler.addExactPath(vc.configuration().browserSourceConfiguration.prefix + trackingJavaScript.getScriptName(), javascriptHandler); handler.addExactPath("/ping", PingHandler::handlePingRequest); - if (vc.configuration().server.serveStaticResources) { + if (vc.configuration().global.server.serveStaticResources) { // Catch-all handler; must be last if present. + // XXX: Our static resources assume the default 'browser' endpoint. handler.addPrefixPath("/", createStaticResourceHandler()); } final SetHeaderHandler headerHandler = new SetHeaderHandler(handler, Headers.SERVER_STRING, "divolte"); final HttpHandler canonicalPathHandler = new CanonicalPathHandler(headerHandler); final GracefulShutdownHandler rootHandler = new GracefulShutdownHandler( - vc.configuration().server.useXForwardedFor ? + vc.configuration().global.server.useXForwardedFor ? 
new ProxyAdjacentPeerAddressHandler(canonicalPathHandler) : canonicalPathHandler ); diff --git a/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java b/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java new file mode 100644 index 00000000..1d83154f --- /dev/null +++ b/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java @@ -0,0 +1,59 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; + +import javax.annotation.ParametersAreNonnullByDefault; +import javax.validation.Valid; +import java.time.Duration; +import java.util.Objects; +import java.util.Optional; + +@ParametersAreNonnullByDefault +public class BrowserSourceConfiguration extends SourceConfiguration { + private static final String DEFAULT_PREFIX = "/"; + private static final String DEFAULT_PARTY_COOKIE = "_dvp"; + private static final Duration DEFAULT_PARTY_TIMEOUT = Duration.ofDays(730); + private static final String DEFAULT_SESSION_COOKIE = "_dvs"; + private static final Duration DEFAULT_SESSION_TIMEOUT = Duration.ofMinutes(30); + + public final String prefix; + + public final Optional cookieDomain; + public final String partyCookie; + public final Duration partyTimeout; + public final String sessionCookie; + public final Duration sessionTimeout; + + @Valid + public final JavascriptConfiguration javascript; + + @JsonCreator + BrowserSourceConfiguration(final Optional prefix, + final Optional cookieDomain, + final Optional partyCookie, + final Optional partyTimeout, + final Optional sessionCookie, + final Optional sessionTimeout, + final Optional javascript) { + this.prefix = prefix.orElse(DEFAULT_PREFIX); + this.cookieDomain = Objects.requireNonNull(cookieDomain); + this.partyCookie = partyCookie.orElse(DEFAULT_PARTY_COOKIE); + this.partyTimeout = partyTimeout.orElse(DEFAULT_PARTY_TIMEOUT); + this.sessionCookie = sessionCookie.orElse(DEFAULT_SESSION_COOKIE); + this.sessionTimeout = sessionTimeout.orElse(DEFAULT_SESSION_TIMEOUT); + this.javascript = javascript.orElse(JavascriptConfiguration.DEFAULT_JAVASCRIPT_CONFIGURATION); + } + + @Override + protected MoreObjects.ToStringHelper toStringHelper() { + return super.toStringHelper() + .add("prefix", prefix) + .add("cookieDomain", cookieDomain) + .add("partyCookie", partyCookie) + .add("partyTimeout", partyTimeout) + .add("sessionCookie", sessionCookie) + .add("sessionTimeout", sessionTimeout) + .add("javascript", javascript); + } +} diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index 7a9b2421..fd05a7ae 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -1,37 +1,87 @@ package io.divolte.server.config; import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import com.google.common.collect.Iterators; import javax.annotation.ParametersAreNonnullByDefault; import javax.validation.Valid; +import java.util.Objects; +import java.util.Optional; @ParametersAreNonnullByDefault public final class DivolteConfiguration { - @Valid public final ServerConfiguration server; - @Valid public final TrackingConfiguration tracking; - @Valid public final JavascriptConfiguration javascript; - 
@Valid public final IncomingRequestProcessorConfiguration incomingRequestProcessor; - @Valid public final KafkaFlusherConfiguration kafkaFlusher; - @Valid public final HdfsFlusherConfiguration hdfsFlusher; + @Valid public final GlobalConfiguration global; + @Valid public final ImmutableMap sources; + @Valid public final ImmutableMap sinks; + @Valid public final ImmutableMap mappings; + + /** @deprecated */ + public final MappingConfiguration incomingRequestProcessor; + /** @deprecated */ + public final BrowserSourceConfiguration browserSourceConfiguration; + /** @deprecated */ + public final KafkaSinkConfiguration kafkaFlusher; + /** @deprecated */ + public final HdfsSinkConfiguration hdfsFlusher; @JsonCreator - private DivolteConfiguration( - final ServerConfiguration server, - final TrackingConfiguration tracking, - final JavascriptConfiguration javascript, - final IncomingRequestProcessorConfiguration incomingRequestProcessor, - final KafkaFlusherConfiguration kafkaFlusher, - final HdfsFlusherConfiguration hdfsFlusher) { - this.server = server; - this.tracking = tracking; - this.javascript = javascript; - this.incomingRequestProcessor = incomingRequestProcessor; - this.kafkaFlusher = kafkaFlusher; - this.hdfsFlusher = hdfsFlusher; + DivolteConfiguration(final GlobalConfiguration global, + final Optional> sources, + final Optional> sinks, + final Optional> mappings) { + this.sources = sources.orElseGet(DivolteConfiguration::defaultSourceConfigurations); + this.sinks = sinks.orElseGet(DivolteConfiguration::defaultSinkConfigurations); + this.mappings = mappings.orElseGet(() -> defaultMappingConfigurations(this.sources.keySet(), this.sinks.keySet())); + this.global = Objects.requireNonNull(global); + // Temporary interop + this.incomingRequestProcessor = Iterables.getOnlyElement(this.mappings.values()); + this.browserSourceConfiguration = (BrowserSourceConfiguration) Iterables.getOnlyElement(this.sources.values()); + this.kafkaFlusher = (KafkaSinkConfiguration) Iterators.getOnlyElement(this.sinks.values().stream().filter((sink) -> sink instanceof KafkaSinkConfiguration).iterator()); + this.hdfsFlusher = (HdfsSinkConfiguration) Iterators.getOnlyElement(this.sinks.values().stream().filter((sink) -> sink instanceof HdfsSinkConfiguration).iterator()); + // TODO: Validate that the mappings refer to defined sources and sinks. + // TODO: Validate that all mappings that refer to a sink have the same schema. + + // TODO: Optimizations: + // - Elide HDFS and Kafka sinks if they are globally disabled. + // - Elide unreferenced sources and sinks. 
+ } + + private static ImmutableMap defaultSourceConfigurations() { + return ImmutableMap.of("browser", new BrowserSourceConfiguration(Optional.empty(), + Optional.empty(), + Optional.empty(), + Optional.empty(), + Optional.empty(), + Optional.empty(), + Optional.empty())); + } + + private static ImmutableMap defaultSinkConfigurations() { + return ImmutableMap.of("hdfs", new HdfsSinkConfiguration(Optional.empty(), Optional.empty()), + "kafka", new KafkaSinkConfiguration(Optional.empty())); + } + + private static ImmutableMap defaultMappingConfigurations(final ImmutableSet sourceNames, + final ImmutableSet sinkNames) { + return ImmutableMap.of("default", new MappingConfiguration(Optional.empty(), + Optional.empty(), + sourceNames, + sinkNames, + Optional.empty(), + Optional.empty())); } @Override public String toString() { - return "DivolteConfiguration [server=" + server + ", tracking=" + tracking + ", javascript=" + javascript + ", incomingRequestProcessor=" + incomingRequestProcessor + ", kafkaFlusher=" + kafkaFlusher + ", hdfsFlusher=" + hdfsFlusher + "]"; + return MoreObjects.toStringHelper(this) + .add("global", global) + .add("sources", sources) + .add("sinks", sinks) + .add("mappings", mappings) + .toString(); } } diff --git a/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java b/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java index b2accace..361284f8 100644 --- a/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java +++ b/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java @@ -1,55 +1,49 @@ package io.divolte.server.config; -import com.fasterxml.jackson.annotation.JsonSubTypes; -import com.fasterxml.jackson.annotation.JsonSubTypes.Type; -import com.fasterxml.jackson.annotation.JsonTypeInfo; -import com.google.common.base.Preconditions; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; import javax.annotation.ParametersAreNonnullByDefault; import java.time.Duration; -import java.util.Objects; +import java.util.Optional; -@JsonTypeInfo(use=JsonTypeInfo.Id.NAME, include=JsonTypeInfo.As.PROPERTY, property = "type") -@JsonSubTypes({ - @Type(value=SimpleRollingFileStrategyConfiguration.class, name = "SIMPLE_ROLLING_FILE"), - @Type(value=SessionBinningFileStrategyConfiguration.class, name = "SESSION_BINNING") -}) @ParametersAreNonnullByDefault -public abstract class FileStrategyConfiguration { - public final FileStrategyConfiguration.Types type; +public class FileStrategyConfiguration { + private static final int DEFAULT_SYNC_FILE_AFTER_RECORDS = 1000; + private static final Duration DEFAULT_SYNC_FILE_AFTER_DURATION = Duration.ofSeconds(30); + private static final String DEFAULT_WORKING_DIR = "/tmp"; + private static final String DEFAULT_PUBLISH_DIR = "/tmp"; + private static final Duration DEFAULT_ROLL_EVERY = Duration.ofHours(1); + + static final FileStrategyConfiguration DEFAULT_FILE_STRATEGY_CONFIGURATION = + new FileStrategyConfiguration(Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty()); + public final int syncFileAfterRecords; public final Duration syncFileAfterDuration; public final String workingDir; public final String publishDir; - - protected FileStrategyConfiguration ( - final FileStrategyConfiguration.Types type, - final int syncFileAfterRecords, - final Duration syncFileAfterDuration, - final String workingDir, - final String publishDir) { - this.type = Objects.requireNonNull(type); - this.syncFileAfterRecords = 
Objects.requireNonNull(syncFileAfterRecords); - this.syncFileAfterDuration = Objects.requireNonNull(syncFileAfterDuration); - this.workingDir = Objects.requireNonNull(workingDir); - this.publishDir = Objects.requireNonNull(publishDir); - } - - @ParametersAreNonnullByDefault - public enum Types { - SIMPLE_ROLLING_FILE(SimpleRollingFileStrategyConfiguration.class), - SESSION_BINNING(SessionBinningFileStrategyConfiguration.class); - - public final Class clazz; - - Types(final Class clazz) { - this.clazz = Objects.requireNonNull(clazz); - } + public final Duration rollEvery; + + @JsonCreator + FileStrategyConfiguration(final Optional rollEvery, + final Optional syncFileAfterRecords, + final Optional syncFileAfterDuration, + final Optional workingDir, + final Optional publishDir) { + this.rollEvery = rollEvery.orElse(DEFAULT_ROLL_EVERY); + this.syncFileAfterRecords = syncFileAfterRecords.orElse(DEFAULT_SYNC_FILE_AFTER_RECORDS); + this.syncFileAfterDuration = syncFileAfterDuration.orElse(DEFAULT_SYNC_FILE_AFTER_DURATION); + this.workingDir = workingDir.orElse(DEFAULT_WORKING_DIR); + this.publishDir = publishDir.orElse(DEFAULT_PUBLISH_DIR); } - public T as(Class target) { - Preconditions.checkState(type.clazz.equals(target), - "Attempt to cast FileStrategyConfiguration to wrong type."); - return target.cast(this); + @Override + public final String toString() { + return MoreObjects.toStringHelper(this) + .add("rollEvery", rollEvery) + .add("syncFileAfterRecords", syncFileAfterRecords) + .add("syncFileAfterDuration", syncFileAfterDuration) + .add("workingDir", workingDir) + .add("publishDir", publishDir).toString(); } } diff --git a/src/main/java/io/divolte/server/config/GlobalConfiguration.java b/src/main/java/io/divolte/server/config/GlobalConfiguration.java new file mode 100644 index 00000000..7819f677 --- /dev/null +++ b/src/main/java/io/divolte/server/config/GlobalConfiguration.java @@ -0,0 +1,37 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; + +import javax.annotation.ParametersAreNonnullByDefault; +import javax.validation.Valid; +import java.util.Objects; + +@ParametersAreNonnullByDefault +public class GlobalConfiguration { + @Valid public final ServerConfiguration server; + @Valid public final MapperConfiguration mapper; + @Valid public final HdfsConfiguration hdfs; + @Valid public final KafkaConfiguration kafka; + + @JsonCreator + GlobalConfiguration(final ServerConfiguration server, + final MapperConfiguration mapper, + final HdfsConfiguration hdfs, + final KafkaConfiguration kafka) { + this.server = Objects.requireNonNull(server); + this.mapper = Objects.requireNonNull(mapper); + this.hdfs = Objects.requireNonNull(hdfs); + this.kafka = Objects.requireNonNull(kafka); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("server", server) + .add("mapper", mapper) + .add("hdfs", hdfs) + .add("kafka", kafka) + .toString(); + } +} diff --git a/src/main/java/io/divolte/server/config/HdfsConfiguration.java b/src/main/java/io/divolte/server/config/HdfsConfiguration.java index 666410f7..ceaf62f5 100644 --- a/src/main/java/io/divolte/server/config/HdfsConfiguration.java +++ b/src/main/java/io/divolte/server/config/HdfsConfiguration.java @@ -1,24 +1,32 @@ package io.divolte.server.config; import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; import javax.annotation.ParametersAreNonnullByDefault; -import java.util.Objects; 
import java.util.Optional; +import java.util.Properties; @ParametersAreNonnullByDefault -public final class HdfsConfiguration { - public final Optional uri; - public final short replication; +public final class HdfsConfiguration extends SinkTypeConfiguration { + + private final Optional client; @JsonCreator - private HdfsConfiguration(final Optional uri, final short replication) { - this.uri = Objects.requireNonNull(uri); - this.replication = replication; + HdfsConfiguration(final boolean enabled, final int bufferSize, final int threads, final Optional client) { + super(bufferSize, threads, enabled); + // Defensive copy: ensure our copy remains immutable. + this.client = client.map(properties -> (Properties) properties.clone()); } @Override - public String toString() { - return "HdfsConfiguration [uri=" + uri + ", replication=" + replication + "]"; + protected MoreObjects.ToStringHelper toStringHelper() { + return super.toStringHelper() + .add("client", client); + } + + public Optional getClient() { + // Defensive copy: we can't stop callers from modifying what we return. + return client.map(properties -> (Properties) properties.clone()); } } diff --git a/src/main/java/io/divolte/server/config/HdfsFlusherConfiguration.java b/src/main/java/io/divolte/server/config/HdfsFlusherConfiguration.java deleted file mode 100644 index 5ae91756..00000000 --- a/src/main/java/io/divolte/server/config/HdfsFlusherConfiguration.java +++ /dev/null @@ -1,38 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; -import java.util.Objects; - -@ParametersAreNonnullByDefault -public final class HdfsFlusherConfiguration { - public final boolean enabled; - public final int threads; - public final int maxWriteQueue; - public final Duration maxEnqueueDelay; - public final HdfsConfiguration hdfs; - public final FileStrategyConfiguration fileStrategy; - - @JsonCreator - private HdfsFlusherConfiguration( - final boolean enabled, - final int threads, - final int maxWriteQueue, - final Duration maxEnqueueDelay, - final HdfsConfiguration hdfs, - final FileStrategyConfiguration fileStrategy) { - this.enabled = enabled; - this.threads = threads; - this.maxWriteQueue = maxWriteQueue; - this.maxEnqueueDelay = Objects.requireNonNull(maxEnqueueDelay); - this.hdfs = Objects.requireNonNull(hdfs); - this.fileStrategy = Objects.requireNonNull(fileStrategy); - } - - @Override - public String toString() { - return "HdfsFlusherConfiguration [enabled=" + enabled + ", threads=" + threads + ", maxWriteQueue=" + maxWriteQueue + ", maxEnqueueDelay=" + maxEnqueueDelay + ", hdfs=" + hdfs + ", fileStrategy=" + fileStrategy + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java b/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java new file mode 100644 index 00000000..4e4112b2 --- /dev/null +++ b/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java @@ -0,0 +1,29 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.util.Optional; + +@ParametersAreNonnullByDefault +public class HdfsSinkConfiguration extends SinkConfiguration { + private static final short DEFAULT_REPLICATION = 3; + + public final short replication; + public final FileStrategyConfiguration fileStrategy; + + @JsonCreator + 
HdfsSinkConfiguration(final Optional replication, + final Optional fileStrategy) { + this.replication = replication.orElse(DEFAULT_REPLICATION); + this.fileStrategy = fileStrategy.orElse(FileStrategyConfiguration.DEFAULT_FILE_STRATEGY_CONFIGURATION); + } + + @Override + protected MoreObjects.ToStringHelper toStringHelper() { + return super.toStringHelper() + .add("replication", replication) + .add("fileStrategy", fileStrategy); + } +} diff --git a/src/main/java/io/divolte/server/config/IncomingRequestProcessorConfiguration.java b/src/main/java/io/divolte/server/config/IncomingRequestProcessorConfiguration.java deleted file mode 100644 index 2866fae4..00000000 --- a/src/main/java/io/divolte/server/config/IncomingRequestProcessorConfiguration.java +++ /dev/null @@ -1,38 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; -import java.util.Objects; - -@ParametersAreNonnullByDefault -public final class IncomingRequestProcessorConfiguration { - public final int threads; - public final int maxWriteQueue; - public final Duration maxEnqueueDelay; - public final boolean discardCorrupted; - public final int duplicateMemorySize; - public final boolean discardDuplicates; - - @JsonCreator - private IncomingRequestProcessorConfiguration( - final int threads, - final int maxWriteQueue, - final Duration maxEnqueueDelay, - final boolean discardCorrupted, - final int duplicateMemorySize, - final boolean discardDuplicates) { - this.threads = threads; - this.maxWriteQueue = maxWriteQueue; - this.maxEnqueueDelay = Objects.requireNonNull(maxEnqueueDelay); - this.discardCorrupted = discardCorrupted; - this.duplicateMemorySize = duplicateMemorySize; - this.discardDuplicates = discardDuplicates; - } - - @Override - public String toString() { - return "IncomingRequestProcessorConfiguration [threads=" + threads + ", maxWriteQueue=" + maxWriteQueue + ", maxEnqueueDelay=" + maxEnqueueDelay + ", discardCorrupted=" + discardCorrupted + ", duplicateMemorySize=" + duplicateMemorySize + ", discardDuplicates=" + discardDuplicates + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/JavascriptConfiguration.java b/src/main/java/io/divolte/server/config/JavascriptConfiguration.java index f8f7648c..0d58bdd8 100644 --- a/src/main/java/io/divolte/server/config/JavascriptConfiguration.java +++ b/src/main/java/io/divolte/server/config/JavascriptConfiguration.java @@ -1,15 +1,24 @@ package io.divolte.server.config; import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; import org.hibernate.validator.constraints.NotEmpty; import javax.annotation.ParametersAreNonnullByDefault; import javax.validation.constraints.NotNull; import javax.validation.constraints.Pattern; -import java.util.Objects; +import java.util.Optional; @ParametersAreNonnullByDefault public final class JavascriptConfiguration { + private static final String DEFAULT_NAME = "divolte.js"; + private static final boolean DEFAULT_LOGGING = false; + private static final boolean DEFAULT_DEBUG = false; + private static final boolean DEFAULT_AUTO_PAGE_VIEW_EVENT = false; + + static final JavascriptConfiguration DEFAULT_JAVASCRIPT_CONFIGURATION = + new JavascriptConfiguration(Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty()); + @NotNull @NotEmpty @Pattern(regexp="^[A-Za-z0-9_-]+\\.js$") public final String name; @@ -18,19 +27,23 @@ public final class JavascriptConfiguration 
{ public final boolean autoPageViewEvent; @JsonCreator - private JavascriptConfiguration( - final String name, - final boolean logging, - final boolean debug, - final boolean autoPageViewEvent) { - this.name = Objects.requireNonNull(name); - this.logging = logging; - this.debug = debug; - this.autoPageViewEvent = autoPageViewEvent; + JavascriptConfiguration(final Optional name, + final Optional logging, + final Optional debug, + final Optional autoPageViewEvent) { + this.name = name.orElse(DEFAULT_NAME); + this.logging = logging.orElse(DEFAULT_LOGGING); + this.debug = debug.orElse(DEFAULT_DEBUG); + this.autoPageViewEvent = autoPageViewEvent.orElse(DEFAULT_AUTO_PAGE_VIEW_EVENT); } @Override public String toString() { - return "JavascriptConfiguration [name=" + name + ", logging=" + logging + ", debug=" + debug + ", autoPageViewEvent=" + autoPageViewEvent + "]"; + return MoreObjects.toStringHelper(this) + .add("name", name) + .add("logging", logging) + .add("debug", debug) + .add("autoPageViewEvent", autoPageViewEvent) + .toString(); } } diff --git a/src/main/java/io/divolte/server/config/KafkaConfiguration.java b/src/main/java/io/divolte/server/config/KafkaConfiguration.java new file mode 100644 index 00000000..80a4bf04 --- /dev/null +++ b/src/main/java/io/divolte/server/config/KafkaConfiguration.java @@ -0,0 +1,32 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.util.Objects; +import java.util.Properties; + +@ParametersAreNonnullByDefault +public class KafkaConfiguration extends SinkTypeConfiguration { + + private final Properties producer; + + @JsonCreator + KafkaConfiguration(final int bufferSize, final int threads, final boolean enabled, final Properties producer) { + super(bufferSize, threads, enabled); + // Defensive copy: ensure our copy remains immutable. + this.producer = Objects.requireNonNull((Properties) producer.clone()); + } + + @Override + protected MoreObjects.ToStringHelper toStringHelper() { + return super.toStringHelper() + .add("producer", producer); + } + + public Properties getProducer() { + // Defensive copy: we can't stop callers from modifying what we return. 
+ return (Properties)producer.clone(); + } +} diff --git a/src/main/java/io/divolte/server/config/KafkaFlusherConfiguration.java b/src/main/java/io/divolte/server/config/KafkaFlusherConfiguration.java deleted file mode 100644 index e476a630..00000000 --- a/src/main/java/io/divolte/server/config/KafkaFlusherConfiguration.java +++ /dev/null @@ -1,39 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; -import java.util.Objects; -import java.util.Properties; - -@ParametersAreNonnullByDefault -public final class KafkaFlusherConfiguration { - public final boolean enabled; - public final int threads; - public final int maxWriteQueue; - public final Duration maxEnqueueDelay; - public final String topic; - public final Properties producer; - - @JsonCreator - private KafkaFlusherConfiguration( - final boolean enabled, - final int threads, - final int maxWriteQueue, - final Duration maxEnqueueDelay, - final String topic, - final Properties producer) { - this.enabled = enabled; - this.threads = threads; - this.maxWriteQueue = maxWriteQueue; - this.maxEnqueueDelay = Objects.requireNonNull(maxEnqueueDelay); - this.topic = Objects.requireNonNull(topic); - this.producer = Objects.requireNonNull(producer); - } - - @Override - public String toString() { - return "KafkaFlusherConfiguration [enabled=" + enabled + ", threads=" + threads + ", maxWriteQueue=" + maxWriteQueue + ", maxEnqueueDelay=" + maxEnqueueDelay + ", topic=" + topic + ", producer=" + producer + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java b/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java new file mode 100644 index 00000000..8b2a481e --- /dev/null +++ b/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java @@ -0,0 +1,24 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.util.Optional; + +@ParametersAreNonnullByDefault +public class KafkaSinkConfiguration extends SinkConfiguration { + private static final String DEFAULT_TOPIC = "divolte"; + + public final String topic; + + @JsonCreator + KafkaSinkConfiguration(final Optional topic) { + this.topic = topic.orElse(DEFAULT_TOPIC); + } + + @Override + protected MoreObjects.ToStringHelper toStringHelper() { + return super.toStringHelper().add("topic", topic); + } +} diff --git a/src/main/java/io/divolte/server/config/MapperConfiguration.java b/src/main/java/io/divolte/server/config/MapperConfiguration.java new file mode 100644 index 00000000..90f60dea --- /dev/null +++ b/src/main/java/io/divolte/server/config/MapperConfiguration.java @@ -0,0 +1,41 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.util.Objects; +import java.util.Optional; + +@ParametersAreNonnullByDefault +public class MapperConfiguration { + public final int bufferSize; + public final int threads; + public final int duplicateMemorySize; + public final UserAgentParserConfiguration userAgentParser; + public final Optional ip2geoDatabase; + + @JsonCreator + MapperConfiguration(final int bufferSize, + final int threads, + final int duplicateMemorySize, + final UserAgentParserConfiguration userAgentParser, + final 
Optional ip2geoDatabase) { + this.bufferSize = bufferSize; + this.threads = threads; + this.duplicateMemorySize = duplicateMemorySize; + this.userAgentParser = Objects.requireNonNull(userAgentParser); + this.ip2geoDatabase = Objects.requireNonNull(ip2geoDatabase); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("bufferSize", bufferSize) + .add("threads", threads) + .add("duplicateMemorySize", duplicateMemorySize) + .add("userAgentParses", userAgentParser) + .add("ip2geoDatabase", ip2geoDatabase) + .toString(); + } +} diff --git a/src/main/java/io/divolte/server/config/MappingConfiguration.java b/src/main/java/io/divolte/server/config/MappingConfiguration.java new file mode 100644 index 00000000..c0483ae8 --- /dev/null +++ b/src/main/java/io/divolte/server/config/MappingConfiguration.java @@ -0,0 +1,54 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.MoreObjects; +import com.google.common.collect.ImmutableSet; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.util.Objects; +import java.util.Optional; + +@ParametersAreNonnullByDefault +public class MappingConfiguration { + private static final boolean DEFAULT_DISCARD_CORRUPTED = false; + private static final boolean DEFAULT_DISCARD_DUPLICATES = false; + + public final Optional schemaFile; + public final Optional mappingScriptFile; + + public final ImmutableSet sources; + public final ImmutableSet sinks; + + public final boolean discardCorrupted; + public final boolean discardDuplicates; + + @JsonCreator + MappingConfiguration(final Optional schemaFile, + final Optional mappingScriptFile, + @JsonProperty(required = true) + final ImmutableSet sources, + @JsonProperty(required = true) + final ImmutableSet sinks, + final Optional discardCorrupted, + final Optional discardDuplicates) { + this.schemaFile = Objects.requireNonNull(schemaFile); + this.mappingScriptFile = Objects.requireNonNull(mappingScriptFile); + this.sources = Objects.requireNonNull(sources); + this.sinks = Objects.requireNonNull(sinks); + this.discardCorrupted = discardCorrupted.orElse(DEFAULT_DISCARD_CORRUPTED); + this.discardDuplicates = discardDuplicates.orElse(DEFAULT_DISCARD_DUPLICATES); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("schemaFile", schemaFile) + .add("mappingScriptFile", mappingScriptFile) + .add("sources", sources) + .add("sinks", sinks) + .add("discardCorrupted", discardCorrupted) + .add("discardDuplicates", discardDuplicates) + .toString(); + } +} diff --git a/src/main/java/io/divolte/server/config/SchemaMappingConfiguration.java b/src/main/java/io/divolte/server/config/SchemaMappingConfiguration.java deleted file mode 100644 index 1b4f2fc9..00000000 --- a/src/main/java/io/divolte/server/config/SchemaMappingConfiguration.java +++ /dev/null @@ -1,23 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.util.Objects; - -@ParametersAreNonnullByDefault -public final class SchemaMappingConfiguration { - public final int version; - public final String mappingScriptFile; - - @JsonCreator - private SchemaMappingConfiguration(final int version, final String mappingScriptFile) { - this.version = version; - this.mappingScriptFile = Objects.requireNonNull(mappingScriptFile); - } - - @Override - public String 
toString() { - return "SchemaMappingConfiguration [version=" + version + ", mappingScriptFile=" + mappingScriptFile + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/ServerConfiguration.java b/src/main/java/io/divolte/server/config/ServerConfiguration.java index ad6c00fc..d1b29ca3 100644 --- a/src/main/java/io/divolte/server/config/ServerConfiguration.java +++ b/src/main/java/io/divolte/server/config/ServerConfiguration.java @@ -2,6 +2,7 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.MoreObjects; import javax.annotation.ParametersAreNonnullByDefault; import java.util.Objects; @@ -16,18 +17,23 @@ public final class ServerConfiguration { public final boolean serveStaticResources; @JsonCreator - private ServerConfiguration(final Optional host, - final int port, - @JsonProperty("use_x_forwarded_for") final boolean useXForwardedFor, - final boolean serveStaticResources) { + ServerConfiguration(final Optional host, + final int port, + @JsonProperty("use_x_forwarded_for") final boolean useXForwardedFor, + final boolean serveStaticResources) { this.host = Objects.requireNonNull(host); this.port = port; this.useXForwardedFor = useXForwardedFor; - this.serveStaticResources = Objects.requireNonNull(serveStaticResources, "Cannot be null."); + this.serveStaticResources = serveStaticResources; } @Override public String toString() { - return "ServerConfiguration [host=" + host + ", port=" + port + ", useXForwardedFor=" + useXForwardedFor + ", serveStaticResources=" + serveStaticResources + "]"; + return MoreObjects.toStringHelper(this) + .add("host", host) + .add("port", port) + .add("useXForwardedFor", useXForwardedFor) + .add("serverStaticResources", serveStaticResources) + .toString(); } } diff --git a/src/main/java/io/divolte/server/config/SessionBinningFileStrategyConfiguration.java b/src/main/java/io/divolte/server/config/SessionBinningFileStrategyConfiguration.java deleted file mode 100644 index 87e2ae8e..00000000 --- a/src/main/java/io/divolte/server/config/SessionBinningFileStrategyConfiguration.java +++ /dev/null @@ -1,37 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; - -@ParametersAreNonnullByDefault -public final class SessionBinningFileStrategyConfiguration extends FileStrategyConfiguration { - @JsonCreator - private SessionBinningFileStrategyConfiguration( - final int syncFileAfterRecords, - final Duration syncFileAfterDuration, - final String workingDir, - final String publishDir, - /* - * Nasty hack here! We need to have a roll_every property on this object - * in order to support the default configuration without breaking when - * overriding to the session binning strategy vs. the file binning one. - * - * This will be fixed when we either drop support for session binning - * or we'll move to a new config setup with separation in sources, mappings - * and sinks, where there is no default setup anymore. - * - * This makes it valid configuration to declare roll_every on a configuration - * for session binning flushing, although it has no effect. 
- */ - @SuppressWarnings("unused") - final Duration rollEvery) { - super(Types.SESSION_BINNING, syncFileAfterRecords, syncFileAfterDuration, workingDir, publishDir); - } - - @Override - public String toString() { - return "SessionBinningFileStrategyConfiguration [type=" + type + ", syncFileAfterRecords=" + syncFileAfterRecords + ", syncFileAfterDuration=" + syncFileAfterDuration + ", workingDir=" + workingDir + ", publishDir=" + publishDir + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/SimpleRollingFileStrategyConfiguration.java b/src/main/java/io/divolte/server/config/SimpleRollingFileStrategyConfiguration.java deleted file mode 100644 index a942eac4..00000000 --- a/src/main/java/io/divolte/server/config/SimpleRollingFileStrategyConfiguration.java +++ /dev/null @@ -1,28 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; -import java.util.Objects; - -@ParametersAreNonnullByDefault -public final class SimpleRollingFileStrategyConfiguration extends FileStrategyConfiguration { - public final Duration rollEvery; - - @JsonCreator - private SimpleRollingFileStrategyConfiguration( - final Duration rollEvery, - final int syncFileAfterRecords, - final Duration syncFileAfterDuration, - final String workingDir, - final String publishDir) { - super(Types.SIMPLE_ROLLING_FILE, syncFileAfterRecords, syncFileAfterDuration, workingDir, publishDir); - this.rollEvery = Objects.requireNonNull(rollEvery); - } - - @Override - public String toString() { - return "SimpleRollingFileStrategyConfiguration [rollEvery=" + rollEvery + ", type=" + type + ", syncFileAfterRecords=" + syncFileAfterRecords + ", syncFileAfterDuration=" + syncFileAfterDuration + ", workingDir=" + workingDir + ", publishDir=" + publishDir + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/SinkConfiguration.java b/src/main/java/io/divolte/server/config/SinkConfiguration.java new file mode 100644 index 00000000..afb22d54 --- /dev/null +++ b/src/main/java/io/divolte/server/config/SinkConfiguration.java @@ -0,0 +1,24 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import com.google.common.base.MoreObjects; + +import javax.annotation.ParametersAreNonnullByDefault; + +@JsonTypeInfo(use=JsonTypeInfo.Id.NAME, include=JsonTypeInfo.As.PROPERTY, property = "type") +@JsonSubTypes({ + @JsonSubTypes.Type(value=HdfsSinkConfiguration.class, name = "hdfs"), + @JsonSubTypes.Type(value=KafkaSinkConfiguration.class, name = "kafka"), +}) +@ParametersAreNonnullByDefault +public abstract class SinkConfiguration { + protected MoreObjects.ToStringHelper toStringHelper() { + return MoreObjects.toStringHelper(this); + } + + @Override + public final String toString() { + return toStringHelper().toString(); + } +} diff --git a/src/main/java/io/divolte/server/config/SinkTypeConfiguration.java b/src/main/java/io/divolte/server/config/SinkTypeConfiguration.java new file mode 100644 index 00000000..70d8a423 --- /dev/null +++ b/src/main/java/io/divolte/server/config/SinkTypeConfiguration.java @@ -0,0 +1,31 @@ +package io.divolte.server.config; + +import com.google.common.base.MoreObjects; + +import javax.annotation.ParametersAreNonnullByDefault; + +@ParametersAreNonnullByDefault +public abstract class SinkTypeConfiguration { + + public final boolean enabled; + public final int bufferSize; + public final int 
threads; + + protected SinkTypeConfiguration(final int bufferSize, final int threads, final boolean enabled) { + this.bufferSize = bufferSize; + this.threads = threads; + this.enabled = enabled; + } + + protected MoreObjects.ToStringHelper toStringHelper() { + return MoreObjects.toStringHelper(this) + .add("enabled", enabled) + .add("bufferSize", bufferSize) + .add("threads", threads); + } + + @Override + public final String toString() { + return toStringHelper().toString(); + } +} diff --git a/src/main/java/io/divolte/server/config/SourceConfiguration.java b/src/main/java/io/divolte/server/config/SourceConfiguration.java new file mode 100644 index 00000000..114a84db --- /dev/null +++ b/src/main/java/io/divolte/server/config/SourceConfiguration.java @@ -0,0 +1,23 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import com.google.common.base.MoreObjects; + +import javax.annotation.ParametersAreNonnullByDefault; + +@JsonTypeInfo(use=JsonTypeInfo.Id.NAME, include=JsonTypeInfo.As.PROPERTY, property = "type") +@JsonSubTypes({ + @JsonSubTypes.Type(value=BrowserSourceConfiguration.class, name = "browser"), +}) +@ParametersAreNonnullByDefault +public abstract class SourceConfiguration { + protected MoreObjects.ToStringHelper toStringHelper() { + return MoreObjects.toStringHelper(this); + } + + @Override + public final String toString() { + return toStringHelper().toString(); + } +} diff --git a/src/main/java/io/divolte/server/config/TrackingConfiguration.java b/src/main/java/io/divolte/server/config/TrackingConfiguration.java deleted file mode 100644 index b390fe22..00000000 --- a/src/main/java/io/divolte/server/config/TrackingConfiguration.java +++ /dev/null @@ -1,48 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; -import java.util.Objects; -import java.util.Optional; - -@ParametersAreNonnullByDefault -public final class TrackingConfiguration { - public final String partyCookie; - public final Duration partyTimeout; - public final String sessionCookie; - public final Duration sessionTimeout; - public final Optional cookieDomain; - public final UaParserConfiguration uaParser; - public final Optional ip2geoDatabase; - public final Optional schemaFile; - public final Optional schemaMapping; - - @JsonCreator - private TrackingConfiguration( - final String partyCookie, - final Duration partyTimeout, - final String sessionCookie, - final Duration sessionTimeout, - final Optional cookieDomain, - final UaParserConfiguration uaParser, - final Optional ip2geoDatabase, - final Optional schemaFile, - final Optional schemaMapping) { - this.partyCookie = Objects.requireNonNull(partyCookie); - this.partyTimeout = Objects.requireNonNull(partyTimeout); - this.sessionCookie = Objects.requireNonNull(sessionCookie); - this.sessionTimeout = Objects.requireNonNull(sessionTimeout); - this.cookieDomain = Objects.requireNonNull(cookieDomain); - this.uaParser = Objects.requireNonNull(uaParser); - this.ip2geoDatabase = Objects.requireNonNull(ip2geoDatabase); - this.schemaFile = Objects.requireNonNull(schemaFile); - this.schemaMapping = Objects.requireNonNull(schemaMapping); - } - - @Override - public String toString() { - return "TrackingConfiguration [partyCookie=" + partyCookie + ", partyTimeout=" + partyTimeout + ", sessionCookie=" + sessionCookie + ", sessionTimeout=" + sessionTimeout 
+ ", cookieDomain=" + cookieDomain + ", uaParser=" + uaParser + ", ip2geoDatabase=" + ip2geoDatabase + ", schemaFile=" + schemaFile + ", schemaMapping=" + schemaMapping + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/UaParserConfiguration.java b/src/main/java/io/divolte/server/config/UaParserConfiguration.java deleted file mode 100644 index ddf6af3b..00000000 --- a/src/main/java/io/divolte/server/config/UaParserConfiguration.java +++ /dev/null @@ -1,23 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.util.Objects; - -@ParametersAreNonnullByDefault -public final class UaParserConfiguration { - public final String type; - public final int cacheSize; - - @JsonCreator - private UaParserConfiguration(final String type, final int cacheSize) { - this.type = Objects.requireNonNull(type); - this.cacheSize = cacheSize; - } - - @Override - public String toString() { - return "UaParserConfiguration [type=" + type + ", cacheSize=" + cacheSize + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/UserAgentParserConfiguration.java b/src/main/java/io/divolte/server/config/UserAgentParserConfiguration.java new file mode 100644 index 00000000..ba2e05a1 --- /dev/null +++ b/src/main/java/io/divolte/server/config/UserAgentParserConfiguration.java @@ -0,0 +1,37 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.util.Locale; +import java.util.Objects; + +@ParametersAreNonnullByDefault +public final class UserAgentParserConfiguration { + public final ParserType type; + public final int cacheSize; + + @JsonCreator + UserAgentParserConfiguration(final ParserType type, final int cacheSize) { + this.type = Objects.requireNonNull(type); + this.cacheSize = cacheSize; + } + + @Override + public String toString() { + return "UserAgentParserConfiguration [type=" + type + ", cacheSize=" + cacheSize + "]"; + } + + @ParametersAreNonnullByDefault + public enum ParserType { + NON_UPDATING, + ONLINE_UPDATING, + CACHING_AND_UPDATING; + + // Ensure that enumeration names are case-insensitive when parsing JSON. 
+ @JsonCreator + static ParserType fromJson(final String value) { + return ParserType.valueOf(value.toUpperCase(Locale.ROOT)); + } + } +} diff --git a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java index 6500e81e..5c7c3892 100644 --- a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java +++ b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java @@ -20,6 +20,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.PropertyNamingStrategy; import com.fasterxml.jackson.databind.module.SimpleModule; +import com.fasterxml.jackson.datatype.guava.GuavaModule; import com.fasterxml.jackson.datatype.jdk8.Jdk8Module; import com.fasterxml.jackson.module.paramnames.ParameterNamesModule; import com.google.common.base.Preconditions; @@ -31,17 +32,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.annotation.Nullable; import javax.annotation.ParametersAreNonnullByDefault; import javax.validation.ConstraintViolation; import javax.validation.Validation; import javax.validation.Validator; import java.io.IOException; import java.time.Duration; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; -import java.util.Set; +import java.util.*; import java.util.function.Supplier; /** @@ -59,9 +56,8 @@ public final class ValidatedConfiguration { private final static Logger logger = LoggerFactory.getLogger(ValidatedConfiguration.class); - private final List configurationErrors; - @Nullable - private final DivolteConfiguration divolteConfiguration; + private final ImmutableList configurationErrors; + private final Optional divolteConfiguration; /** * Creates an instance of a validated configuration. The underlying @@ -95,7 +91,7 @@ public ValidatedConfiguration(final Supplier configLoader) { } this.configurationErrors = ImmutableList.copyOf(configurationErrors); - this.divolteConfiguration = divolteConfiguration; + this.divolteConfiguration = Optional.ofNullable(divolteConfiguration); } private void validate(final List configurationErrors, final DivolteConfiguration divolteConfiguration) { @@ -129,6 +125,7 @@ private static DivolteConfiguration mapped(final Config input) throws IOExceptio mapper.registerModules( new Jdk8Module(), // JDK8 types (Optional, etc.) + new GuavaModule(), // Guava types (immutable collections) new ParameterNamesModule(), // Support JDK8 parameter name discovery module // Register custom deserializers module ); @@ -146,9 +143,9 @@ private static DivolteConfiguration mapped(final Config input) throws IOExceptio * When validation errors exist. 
*/ public DivolteConfiguration configuration() { - Preconditions.checkState(null != divolteConfiguration && configurationErrors.isEmpty(), + Preconditions.checkState(configurationErrors.isEmpty(), "Attempt to access invalid configuration."); - return divolteConfiguration; + return divolteConfiguration.get(); } /** diff --git a/src/main/java/io/divolte/server/hdfs/FileCreateAndSyncStrategy.java b/src/main/java/io/divolte/server/hdfs/FileCreateAndSyncStrategy.java index bcaf81b0..ff2e7813 100644 --- a/src/main/java/io/divolte/server/hdfs/FileCreateAndSyncStrategy.java +++ b/src/main/java/io/divolte/server/hdfs/FileCreateAndSyncStrategy.java @@ -17,11 +17,8 @@ package io.divolte.server.hdfs; import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.config.FileStrategyConfiguration.Types; -import io.divolte.server.config.ValidatedConfiguration; -import org.apache.avro.Schema; -import org.apache.hadoop.fs.FileSystem; +import javax.annotation.ParametersAreNonnullByDefault; /* * Used by the HdfsFlusher to actually flush events to HDFS. Different implementation @@ -31,23 +28,13 @@ * heartbeat() when no events are available. When either append(...) or heartbeat return FAILURE, * clients MUST NOT call append(...) any more, until a call to heartbeat() has returned SUCCESS. */ +@ParametersAreNonnullByDefault interface FileCreateAndSyncStrategy { HdfsOperationResult setup(); HdfsOperationResult heartbeat(); HdfsOperationResult append(final AvroRecordBuffer record); void cleanup(); - static FileCreateAndSyncStrategy create(final ValidatedConfiguration vc, final FileSystem fs, final short hdfsReplication, final Schema schema) { - if (vc.configuration().hdfsFlusher.fileStrategy.type == Types.SESSION_BINNING) { - return new SessionBinningFileStrategy(vc, fs, hdfsReplication, schema); - } else if (vc.configuration().hdfsFlusher.fileStrategy.type == Types.SIMPLE_ROLLING_FILE) { - return new SimpleRollingFileStrategy(vc, fs, hdfsReplication, schema); - } else { - // Should not occur with a validate configuration. 
- throw new RuntimeException("No valid HDFS file flushing strategy was configured."); - } - } - enum HdfsOperationResult { SUCCESS, FAILURE diff --git a/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java b/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java index 8e0f6f13..2d755ec4 100644 --- a/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java +++ b/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java @@ -16,27 +16,25 @@ package io.divolte.server.hdfs; -import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.*; -import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; import io.divolte.server.AvroRecordBuffer; import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult; import io.divolte.server.processing.ItemProcessor; - -import java.io.IOException; -import java.net.URI; -import java.net.URISyntaxException; -import java.util.Objects; - -import javax.annotation.ParametersAreNonnullByDefault; -import javax.annotation.concurrent.NotThreadSafe; - import org.apache.avro.Schema; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.concurrent.NotThreadSafe; +import java.io.IOException; +import java.util.Objects; + +import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.SUCCESS; +import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.CONTINUE; +import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.PAUSE; + @ParametersAreNonnullByDefault @NotThreadSafe public final class HdfsFlusher implements ItemProcessor { @@ -48,43 +46,38 @@ public final class HdfsFlusher implements ItemProcessor { public HdfsFlusher(final ValidatedConfiguration vc, final Schema schema) { Objects.requireNonNull(vc); - final FileSystem hadoopFs; - final Configuration hdfsConfiguration = new Configuration(); - final short hdfsReplication = vc.configuration().hdfsFlusher.hdfs.replication; - + final Configuration hdfsConfiguration = vc.configuration().global.hdfs.getClient() + .map(clientProperties -> { + final Configuration configuration = new Configuration(false); + for (final String propertyName : clientProperties.stringPropertyNames()) { + configuration.set(propertyName, clientProperties.getProperty(propertyName)); + } + return configuration; + }) + .orElse(new Configuration()); /* * The HDFS client creates a JVM shutdown hook, which interferes with our own server shutdown hook. * This config option disabled the built in shutdown hook. We call FileSystem.closeAll() ourselves * in the server shutdown hook instead. */ hdfsConfiguration.setBoolean("fs.automatic.close", false); + + final FileSystem hadoopFs; try { - hadoopFs = vc.configuration().hdfsFlusher.hdfs.uri.map(uri -> { - try { - return FileSystem.get(new URI(uri), hdfsConfiguration); - } catch (IOException | URISyntaxException e) { - /* - * It is possible to create a FileSystem instance when HDFS is not available (e.g. NameNode down). - * This exception only occurs when there is a configuration error in the URI (e.g. wrong scheme). - * So we fail to start up in this case. Below we create the actual HDFS connection, by opening - * files. If that fails, we do startup and initiate the regular retry cycle. 
- */ - logger.error("Could not initialize HDFS filesystem.", e); - throw new RuntimeException("Could not initialize HDFS filesystem", e); - } - }).orElse(FileSystem.get(hdfsConfiguration)); - } catch (IOException ioe) { + hadoopFs = FileSystem.get(hdfsConfiguration); + } catch (final IOException e) { /* * It is possible to create a FileSystem instance when HDFS is not available (e.g. NameNode down). * This exception only occurs when there is a configuration error in the URI (e.g. wrong scheme). * So we fail to start up in this case. Below we create the actual HDFS connection, by opening * files. If that fails, we do startup and initiate the regular retry cycle. */ - logger.error("Could not initialize HDFS filesystem.", ioe); - throw new RuntimeException("Could not initialize HDFS filesystem", ioe); + logger.error("Could not initialize HDFS filesystem.", e); + throw new RuntimeException("Could not initialize HDFS filesystem", e); } + final short hdfsReplication = vc.configuration().hdfsFlusher.replication; - fileStrategy = FileCreateAndSyncStrategy.create(vc, hadoopFs, hdfsReplication, Objects.requireNonNull(schema)); + fileStrategy = new SimpleRollingFileStrategy(vc, hadoopFs, hdfsReplication, Objects.requireNonNull(schema)); lastHdfsResult = fileStrategy.setup(); } diff --git a/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java b/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java index bac80ccf..8815b772 100644 --- a/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java +++ b/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java @@ -32,17 +32,15 @@ public HdfsFlushingPool(final ValidatedConfiguration vc, final Schema schema) { this( Objects.requireNonNull(vc), Objects.requireNonNull(schema), - vc.configuration().hdfsFlusher.threads, - vc.configuration().hdfsFlusher.maxWriteQueue, - vc.configuration().hdfsFlusher.maxEnqueueDelay.toMillis() + vc.configuration().global.hdfs.threads, + vc.configuration().global.hdfs.bufferSize ); } - public HdfsFlushingPool(final ValidatedConfiguration vc, final Schema schema, final int numThreads, final int maxQueueSize, final long maxEnqueueDelay) { + public HdfsFlushingPool(final ValidatedConfiguration vc, final Schema schema, final int numThreads, final int maxQueueSize) { super( numThreads, maxQueueSize, - maxEnqueueDelay, "Hdfs Flusher", () -> new HdfsFlusher(vc, schema)); } diff --git a/src/main/java/io/divolte/server/hdfs/SessionBinningFileStrategy.java b/src/main/java/io/divolte/server/hdfs/SessionBinningFileStrategy.java deleted file mode 100644 index b5ee7bef..00000000 --- a/src/main/java/io/divolte/server/hdfs/SessionBinningFileStrategy.java +++ /dev/null @@ -1,435 +0,0 @@ -/* - * Copyright 2014 GoDataDriven B.V. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.divolte.server.hdfs; - -import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.*; -import static java.util.Calendar.*; -import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.config.SessionBinningFileStrategyConfiguration; -import io.divolte.server.config.ValidatedConfiguration; - -import java.io.IOException; -import java.net.InetAddress; -import java.net.UnknownHostException; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.GregorianCalendar; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Optional; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; - -import javax.annotation.concurrent.NotThreadSafe; - -import org.apache.avro.Schema; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.collect.Maps; - -/* - * The general idea of this file strategy is to provide a best effort to put events that belong to the same session in the same file. - * - * The session binning file strategy assigns event to files as such: - * - each timestamp is assigned to a round, defined as timestamp_in_millis / session_timeout_in_millis - * - we open a file for a round as time passes - * - all events for a session are stored in the file with the round marked by the session start time - * - a file for a round is kept open for at least three times the session duration *in absence of failures* - * - during this entire process, we use the event timestamp for events that come off the queue as a logical clock signal - * - only in the case of an empty queue, we use the actual system time as clock signal (receiving heartbeats in a state of normal operation means an empty queue) - * - when a file for a round is closed, but events that should be in that file still arrive, they are stored in the oldest open file - * - this happens for exceptionally long sessions - * - * The above mechanics allow for the following guarantee: if a file is properly opened, used for flushing and closed without intermediate failures, - * all sessions that start within that file and last less than the session timeout duration, will be fully contained in that file. - * - * In case of failure, we close all open files. This means that files that were closed as a result of such a failure *DO NOT* provide above guarantee. 
- */ -@NotThreadSafe -public class SessionBinningFileStrategy implements FileCreateAndSyncStrategy { - private final static Logger logger = LoggerFactory.getLogger(SessionBinningFileStrategy.class); - - private final static long HDFS_RECONNECT_DELAY_MILLIS = 15000; - private final static long FILE_TIME_TO_LIVE_IN_SESSION_DURATIONS = 3; - - private final static AtomicInteger INSTANCE_COUNTER = new AtomicInteger(); - private final int instanceNumber; - private final String hostString; - - - private final FileSystem hdfs; - private final short hdfsReplication; - - private final Schema schema; - - private final long sessionTimeoutMillis; - - private final Map openFiles; - private final String hdfsWorkingDir; - private final String hdfsPublishDir; - private final long syncEveryMillis; - private final int syncEveryRecords; - - private boolean isHdfsAlive; - private long lastFixAttempt; - private long timeSignal; - - private long lastSyncTime; - private int recordsSinceLastSync; - - - public SessionBinningFileStrategy(final ValidatedConfiguration vc, final FileSystem hdfs, final short hdfsReplication, final Schema schema) { - sessionTimeoutMillis = vc.configuration().tracking.sessionTimeout.toMillis(); - - hostString = findLocalHostName(); - instanceNumber = INSTANCE_COUNTER.incrementAndGet(); - final SessionBinningFileStrategyConfiguration fileStrategyConfiguration = vc.configuration().hdfsFlusher.fileStrategy.as(SessionBinningFileStrategyConfiguration.class); - hdfsWorkingDir = fileStrategyConfiguration.workingDir; - hdfsPublishDir = fileStrategyConfiguration.publishDir; - - syncEveryMillis = fileStrategyConfiguration.syncFileAfterDuration.toMillis(); - syncEveryRecords = fileStrategyConfiguration.syncFileAfterRecords; - - this.hdfs = hdfs; - this.hdfsReplication = hdfsReplication; - - this.schema = schema; - - openFiles = Maps.newHashMapWithExpectedSize(10); - - throwsIoException(() -> { - if (!hdfs.isDirectory(new Path(hdfsWorkingDir))) { - throw new IOException("Working directory for in-flight AVRO records does not exist: " + hdfsWorkingDir); - } - if (!hdfs.isDirectory(new Path(hdfsPublishDir))) { - throw new IOException("Working directory for publishing AVRO records does not exist: " + hdfsPublishDir); - } - }).ifPresent((e) -> { throw new RuntimeException("Configuration error", e); }); - } - - private static String findLocalHostName() { - try { - return InetAddress.getLocalHost().getHostName(); - } catch (final UnknownHostException e) { - return "localhost"; - } - } - - @Override - public HdfsOperationResult setup() { - /* - * On setup, we assume everything to work, as we cannot open - * any files before receiving any events. This is because the - * events are used as a clock signal. 
- */ - isHdfsAlive = true; - lastFixAttempt = 0; - - lastSyncTime = 0; - recordsSinceLastSync = 0; - - return SUCCESS; - } - - @Override - public HdfsOperationResult heartbeat() { - if (isHdfsAlive) { - // queue is empty, so logical time == current system time - timeSignal = System.currentTimeMillis(); - return throwsIoException(this::possiblySyncAndOrClose) - .map((ioe) -> { - logger.warn("Failed to sync HDFS file.", ioe); - hdfsDied(); - return FAILURE; - }) - .orElse(SUCCESS); - } else { - // queue may or may not be empty, just attempt a reconnect - return possiblyFixHdfsConnection(); - } - } - - @Override - public HdfsOperationResult append(final AvroRecordBuffer record) { - if (!isHdfsAlive) { - throw new IllegalStateException("Append attempt while HDFS connection is not alive."); - } - - timeSignal = record.getEventTime(); - return writeRecord(record); - } - - private HdfsOperationResult writeRecord(final AvroRecordBuffer record) { - return throwsIoException(() -> { - final RoundHdfsFile file = fileForSessionStartTime(record.getSessionId().timestamp - record.getCookieUtcOffset()); - file.writer.appendEncoded(record.getByteBuffer()); - file.recordsSinceLastSync += 1; - recordsSinceLastSync += 1; - possiblySyncAndOrClose(); - }) - .map((ioe) -> { - logger.warn("Error while flushing event to HDFS.", ioe); - hdfsDied(); - return FAILURE; - }) - .orElse(SUCCESS); - } - - @Override - public void cleanup() { - openFiles.values().forEach((file) -> throwsIoException(() -> file.close(false)) - .ifPresent((ioe) -> logger.warn("Failed to properly close HDFS file: " + file.path, ioe))); - openFiles.clear(); - } - - private void possiblySyncAndOrClose() { - try { - final long time = System.currentTimeMillis(); - - if ( - recordsSinceLastSync >= syncEveryRecords || - time - lastSyncTime >= syncEveryMillis && recordsSinceLastSync > 0) { - - openFiles - .values() - .stream() - .filter((f) -> f.recordsSinceLastSync > 0) // only sync files that have pending records - .forEach((file) -> { - try { - logger.debug("Syncing file: {}", file.path); - file.writer.sync(); // Forces the Avro file to write a block - file.stream.hsync(); // Forces a (HDFS) sync on the underlying stream - file.recordsSinceLastSync = 0; - } catch (final IOException e) { - throw new WrappedIOException(e); - } - }); - - recordsSinceLastSync = 0; - lastSyncTime = time; - } else if (recordsSinceLastSync == 0) { - lastSyncTime = time; - } - } finally { - possiblyCloseAndCleanup(); - } - } - - private void possiblyCloseAndCleanup() { - final long oldestAllowedRound = (timeSignal / sessionTimeoutMillis) - (FILE_TIME_TO_LIVE_IN_SESSION_DURATIONS - 1); - - final List> entriesToBeClosed = openFiles - .entrySet() - .stream() - .filter((e) -> e.getValue().round < oldestAllowedRound) - .collect(Collectors.toList()); - - entriesToBeClosed - .stream() - .map(Entry::getValue) - .distinct() - .forEach((file) -> { - logger.debug("Closing HDFS file: {}", file.path); - throwsIoException(() -> file.close(true)) - .ifPresent((ioe) -> logger.warn("Failed to cleanly close HDFS file: " + file.path, ioe)); - }); - - entriesToBeClosed - .forEach((e) -> openFiles.remove(e.getKey())); - } - - private HdfsOperationResult possiblyFixHdfsConnection() { - if (isHdfsAlive) { - throw new IllegalStateException("HDFS connection repair attempt while not broken."); - } - - final long time = System.currentTimeMillis(); - if (time - lastFixAttempt > HDFS_RECONNECT_DELAY_MILLIS) { - return throwsIoException(() -> openFiles.put(timeSignal / sessionTimeoutMillis, new 
RoundHdfsFile(timeSignal))) - .map((ioe) -> { - logger.warn("Could not reconnect to HDFS after failure."); - lastFixAttempt = time; - return FAILURE; - }) - .orElseGet(() -> { - logger.info("Recovered HDFS connection."); - isHdfsAlive = true; - lastFixAttempt = 0; - return SUCCESS; - }); - } else { - return FAILURE; - } - } - - private void hdfsDied() { - /* - * On HDFS connection / access failure, we abandon everything and periodically try to reconnect, - * by re-creating a file for the round that caused the failure. Other files will be re-created - * as records for specific files arrive. - */ - isHdfsAlive = false; - openFiles.values().forEach((file) -> throwsIoException(() -> file.close(false))); - openFiles.clear(); - - logger.warn("HDFS failure. Closing all files and going into connect retry cycle."); - } - - private RoundHdfsFile fileForSessionStartTime(final long sessionStartTime) { - final long requestedRound = sessionStartTime / sessionTimeoutMillis; - // return the first open file for which the round >= the requested round - // or create a new file if no such file is present - return openFiles.computeIfAbsent(requestedRound, (ignored) -> openFiles - .values() - .stream() - .sorted((left, right) -> Long.compare(left.round, right.round)) - .filter((f) -> f.round >= requestedRound) - .findFirst() - .orElseGet(() -> - // if the requested round is greater than the current round + 1, - // we return the file for the current round, as probably this is - // a result of a very skewed client side clock, or a fake request - requestedRound > timeSignal / sessionTimeoutMillis + 1 - ? fileForSessionStartTime(timeSignal) - : new RoundHdfsFile(sessionStartTime) - )); - } - - private final class RoundHdfsFile { - private static final String INFLIGHT_EXTENSION = ".partial"; - private static final int MAX_AVRO_SYNC_INTERVAL = 1 << 30; - private final DateFormat format = new SimpleDateFormat("HH.mm.ss.SSS"); - - final Path path; - final long round; - final FSDataOutputStream stream; - final DataFileWriter writer; - - int recordsSinceLastSync; - - RoundHdfsFile(final long time) { - final long requestedRound = time / sessionTimeoutMillis; - final long oldestAllowedRound = (timeSignal / sessionTimeoutMillis) - (FILE_TIME_TO_LIVE_IN_SESSION_DURATIONS - 1); - this.round = Math.max(requestedRound, oldestAllowedRound); - - this.path = new Path(hdfsWorkingDir, - String.format("%s-divolte-tracking-%s-%s-%d.avro" + INFLIGHT_EXTENSION, - hostString, // add host name, differentiates when deploying multiple collector instances - roundString(round * sessionTimeoutMillis), // composed of the round start date + round number within the day - format.format(new Date()), // additionally, we add a timestamp, because after failures, a file for a round can be created multiple times - instanceNumber)); // add instance number, so different threads cannot try to create the exact same file - - try { - stream = hdfs.create(path, hdfsReplication); - writer = new DataFileWriter(new GenericDatumWriter<>(schema)).create(schema, stream); - writer.setSyncInterval(MAX_AVRO_SYNC_INTERVAL); // since we manually sync at chosen intervals - writer.setFlushOnEveryBlock(true); - - // Sync the file on open to make sure the - // connection actually works, because - // HDFS allows file creation even with no - // datanodes available - stream.hsync(); - recordsSinceLastSync = 0; - - logger.debug("Created new HDFS file: {}", path); - } catch (final IOException e) { - logger.warn("Failed HDFS file creation: {}", path); - // we may have created the 
file, but failed to sync, so we attempt a delete - // this happens when the NN responds successfully, but there are no DNs available - throwsIoException(() -> hdfs.delete(path, false)); - throw new WrappedIOException(e); - } - } - - private String roundString(final long roundStartTime) { - /* - * The round string in the filename is constructed from the current date - * in the form YYYYmmdd-RR. Where RR is the 0-padded number of session length - * intervals since midnight on the current day. This uses the system timezone. - * Note that if the system is in a timezone that supports DST, the number of - * session length intervals per day is not equal for all days. - */ - final GregorianCalendar gc = new GregorianCalendar(); - gc.setTimeInMillis(roundStartTime); - gc.set(HOUR_OF_DAY, 0); - gc.set(MINUTE, 0); - gc.set(SECOND, 0); - gc.set(MILLISECOND, 0); - - return String.format("%d%02d%02d-%02d", - gc.get(YEAR), - gc.get(MONTH) + 1, - gc.get(DAY_OF_MONTH), - (roundStartTime - gc.getTimeInMillis()) / sessionTimeoutMillis); - } - - private Path getPublishDestination() { - final String pathName = path.getName(); - return new Path(hdfsPublishDir, pathName.substring(0, pathName.length() - INFLIGHT_EXTENSION.length())); - } - - public void close(final boolean publish) { - try { - writer.close(); - if (publish) { - final Path publishDestination = getPublishDestination(); - logger.debug("Moving HDFS file: {} -> {}", path, publishDestination); - if (!hdfs.rename(path, publishDestination)) { - throw new IOException("Could not rename HDFS file: " + path + " -> " + publishDestination); - } - } - } catch (final IOException e) { - throw new WrappedIOException(e); - } - } - } - - @SuppressWarnings("serial") - private static final class WrappedIOException extends RuntimeException { - final IOException wrappedIOException; - - private WrappedIOException(final IOException ioe) { - this.wrappedIOException = ioe; - } - } - - @FunctionalInterface - private interface IOExceptionThrower { - void run() throws IOException; - } - - private static Optional throwsIoException(final IOExceptionThrower r) { - try { - r.run(); - return Optional.empty(); - } catch (final IOException ioe) { - return Optional.of(ioe); - } catch (final WrappedIOException wioe) { - return Optional.of(wioe.wrappedIOException); - } - } -} diff --git a/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java b/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java index 257dd12f..21aab583 100644 --- a/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java +++ b/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java @@ -16,11 +16,22 @@ package io.divolte.server.hdfs; -import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.*; import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.config.SimpleRollingFileStrategyConfiguration; +import io.divolte.server.config.FileStrategyConfiguration; import io.divolte.server.config.ValidatedConfiguration; +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import javax.annotation.Nonnull; +import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.concurrent.NotThreadSafe; import java.io.IOException; 
import java.net.InetAddress; import java.net.UnknownHostException; @@ -31,18 +42,8 @@ import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; -import javax.annotation.ParametersAreNonnullByDefault; -import javax.annotation.concurrent.NotThreadSafe; - -import org.apache.avro.Schema; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.FAILURE; +import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.SUCCESS; @NotThreadSafe @ParametersAreNonnullByDefault @@ -77,7 +78,7 @@ public SimpleRollingFileStrategy(final ValidatedConfiguration vc, final FileSyst Objects.requireNonNull(vc); this.schema = Objects.requireNonNull(schema); - final SimpleRollingFileStrategyConfiguration fileStrategyConfiguration = vc.configuration().hdfsFlusher.fileStrategy.as(SimpleRollingFileStrategyConfiguration.class); + final FileStrategyConfiguration fileStrategyConfiguration = vc.configuration().hdfsFlusher.fileStrategy; syncEveryMillis = fileStrategyConfiguration.syncFileAfterDuration.toMillis(); syncEveryRecords = fileStrategyConfiguration.syncFileAfterRecords; newFileEveryMillis = fileStrategyConfiguration.rollEvery.toMillis(); diff --git a/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java b/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java index e047efba..d5f91f64 100644 --- a/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java +++ b/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java @@ -16,17 +16,15 @@ package io.divolte.server.js; +import com.google.common.collect.ImmutableMap; +import io.divolte.server.config.BrowserSourceConfiguration; import io.divolte.server.config.ValidatedConfiguration; - -import java.io.IOException; -import java.time.temporal.ChronoUnit; - -import javax.annotation.ParametersAreNonnullByDefault; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.collect.ImmutableMap; +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.time.temporal.ChronoUnit; @ParametersAreNonnullByDefault public class TrackingJavaScriptResource extends JavaScriptResource { @@ -35,7 +33,9 @@ public class TrackingJavaScriptResource extends JavaScriptResource { private static final String SCRIPT_CONSTANT_NAME = "SCRIPT_NAME"; public TrackingJavaScriptResource(final ValidatedConfiguration vc) throws IOException { - super("divolte.js", createScriptConstants(vc), vc.configuration().javascript.debug); + super(vc.configuration().browserSourceConfiguration.javascript.name, + createScriptConstants(vc), + vc.configuration().browserSourceConfiguration.javascript.debug); } public String getScriptName() { @@ -43,16 +43,16 @@ public String getScriptName() { } private static ImmutableMap createScriptConstants(final ValidatedConfiguration vc) { + final BrowserSourceConfiguration browserSourceConfiguration = vc.configuration().browserSourceConfiguration; final ImmutableMap.Builder builder = ImmutableMap.builder(); - builder.put("PARTY_COOKIE_NAME", vc.configuration().tracking.partyCookie); - builder.put("PARTY_ID_TIMEOUT_SECONDS", 
trimLongToMaxInt(vc.configuration().tracking.partyTimeout.get(ChronoUnit.SECONDS))); - builder.put("SESSION_COOKIE_NAME", vc.configuration().tracking.sessionCookie); - builder.put("SESSION_ID_TIMEOUT_SECONDS", trimLongToMaxInt(vc.configuration().tracking.sessionTimeout.get(ChronoUnit.SECONDS))); - vc.configuration().tracking.cookieDomain - .ifPresent((v) -> builder.put("COOKIE_DOMAIN", v)); - builder.put("LOGGING", vc.configuration().javascript.logging); - builder.put(SCRIPT_CONSTANT_NAME, vc.configuration().javascript.name); - builder.put("AUTO_PAGE_VIEW_EVENT", vc.configuration().javascript.autoPageViewEvent); + builder.put("PARTY_COOKIE_NAME", browserSourceConfiguration.partyCookie); + builder.put("PARTY_ID_TIMEOUT_SECONDS", trimLongToMaxInt(browserSourceConfiguration.partyTimeout.get(ChronoUnit.SECONDS))); + builder.put("SESSION_COOKIE_NAME", browserSourceConfiguration.sessionCookie); + builder.put("SESSION_ID_TIMEOUT_SECONDS", trimLongToMaxInt(browserSourceConfiguration.sessionTimeout.get(ChronoUnit.SECONDS))); + browserSourceConfiguration.cookieDomain.ifPresent((v) -> builder.put("COOKIE_DOMAIN", v)); + builder.put("LOGGING", browserSourceConfiguration.javascript.logging); + builder.put(SCRIPT_CONSTANT_NAME, browserSourceConfiguration.javascript.name); + builder.put("AUTO_PAGE_VIEW_EVENT", browserSourceConfiguration.javascript.autoPageViewEvent); return builder.build(); } diff --git a/src/main/java/io/divolte/server/kafka/KafkaFlusher.java b/src/main/java/io/divolte/server/kafka/KafkaFlusher.java index c575e062..dfd256a1 100644 --- a/src/main/java/io/divolte/server/kafka/KafkaFlusher.java +++ b/src/main/java/io/divolte/server/kafka/KafkaFlusher.java @@ -16,31 +16,26 @@ package io.divolte.server.kafka; -import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; import io.divolte.server.AvroRecordBuffer; import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.processing.ItemProcessor; - -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.List; -import java.util.Objects; -import java.util.Optional; -import java.util.Queue; -import java.util.stream.Collectors; - -import javax.annotation.ParametersAreNonnullByDefault; -import javax.annotation.concurrent.NotThreadSafe; - import kafka.common.FailedToSendMessageException; import kafka.javaapi.producer.Producer; import kafka.producer.KeyedMessage; import kafka.producer.ProducerConfig; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.concurrent.NotThreadSafe; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.stream.Collectors; + +import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.CONTINUE; +import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.PAUSE; + @ParametersAreNonnullByDefault @NotThreadSafe public final class KafkaFlusher implements ItemProcessor { @@ -55,7 +50,7 @@ public final class KafkaFlusher implements ItemProcessor { public KafkaFlusher(final ValidatedConfiguration vc) { Objects.requireNonNull(vc); - final ProducerConfig producerConfig = new ProducerConfig(vc.configuration().kafkaFlusher.producer); + final ProducerConfig producerConfig = new ProducerConfig(vc.configuration().global.kafka.getProducer()); topic = vc.configuration().kafkaFlusher.topic; producer = new Producer<>(producerConfig); } diff --git 
a/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java b/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java index ea4fa088..918a01b9 100644 --- a/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java +++ b/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java @@ -29,14 +29,12 @@ public class KafkaFlushingPool extends ProcessingPool new KafkaFlusher(vc)); + public KafkaFlushingPool(ValidatedConfiguration vc, int numThreads, int maxWriteQueue) { + super(numThreads, maxWriteQueue, "Kafka Flusher", () -> new KafkaFlusher(vc)); } public void enqueueRecord(final AvroRecordBuffer record) { diff --git a/src/main/java/io/divolte/server/processing/ProcessingPool.java b/src/main/java/io/divolte/server/processing/ProcessingPool.java index 6d423417..b4fc512b 100644 --- a/src/main/java/io/divolte/server/processing/ProcessingPool.java +++ b/src/main/java/io/divolte/server/processing/ProcessingPool.java @@ -50,7 +50,6 @@ public class ProcessingPool, E> { private final ExecutorService executorService; private final List> queues; - private final long maxEnqueueDelay; private volatile boolean running; @@ -60,7 +59,6 @@ public class ProcessingPool, E> { public ProcessingPool( final int numThreads, final int maxQueueSize, - final long maxEnqueueDelay, final String threadBaseName, final Supplier processorSupplier) { @@ -73,8 +71,6 @@ public ProcessingPool( final ThreadFactory factory = createThreadFactory(threadGroup, threadBaseName + " - %d"); executorService = Executors.newFixedThreadPool(numThreads, factory); - this.maxEnqueueDelay = maxEnqueueDelay; - this.queues = Stream.> generate(() -> new ArrayBlockingQueue<>(maxQueueSize)) .limit(numThreads) @@ -89,12 +85,8 @@ public ProcessingPool( public void enqueue(String key, E e) { // We mask the hash-code to ensure we always get a positive bucket index. - if (!offerQuietly( - queues.get((key.hashCode() & Integer.MAX_VALUE) % queues.size()), - e, - maxEnqueueDelay, - TimeUnit.MILLISECONDS)) { - logger.warn("Failed to enqueue item for {} ms. Dropping event.", maxEnqueueDelay); + if (!queues.get((key.hashCode() & Integer.MAX_VALUE) % queues.size()).offer(e)) { + logger.warn("Failed to enqueue item. 
Dropping event."); } } @@ -172,15 +164,6 @@ private static E pollQuietly(final BlockingQueue queue, final long timeou } } - private static boolean offerQuietly(final BlockingQueue queue, final E item, final long timeout, final TimeUnit unit) { - try { - return queue.offer(item, timeout, unit); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - return false; - } - } - private static ThreadFactory createThreadFactory(final ThreadGroup group, final String nameFormat) { return new ThreadFactoryBuilder() .setNameFormat(nameFormat) diff --git a/src/main/java/io/divolte/server/recordmapping/DslRecordMapper.java b/src/main/java/io/divolte/server/recordmapping/DslRecordMapper.java index 2bc417bc..a04d6811 100644 --- a/src/main/java/io/divolte/server/recordmapping/DslRecordMapper.java +++ b/src/main/java/io/divolte/server/recordmapping/DslRecordMapper.java @@ -55,10 +55,6 @@ public class DslRecordMapper implements RecordMapper { private final Schema schema; private final List actions; - public DslRecordMapper(final ValidatedConfiguration vc, final Schema schema, final Optional geoipService) { - this(vc, vc.configuration().tracking.schemaMapping.get().mappingScriptFile, schema, geoipService); - } - public DslRecordMapper(final ValidatedConfiguration vc, final String groovyFile, final Schema schema, final Optional geoipService) { this.schema = Objects.requireNonNull(schema); diff --git a/src/main/java/io/divolte/server/recordmapping/UserAgentParserAndCache.java b/src/main/java/io/divolte/server/recordmapping/UserAgentParserAndCache.java index eb97da26..0826b810 100644 --- a/src/main/java/io/divolte/server/recordmapping/UserAgentParserAndCache.java +++ b/src/main/java/io/divolte/server/recordmapping/UserAgentParserAndCache.java @@ -1,5 +1,6 @@ package io.divolte.server.recordmapping; +import io.divolte.server.config.UserAgentParserConfiguration; import io.divolte.server.config.ValidatedConfiguration; import java.util.Optional; @@ -23,8 +24,8 @@ public final class UserAgentParserAndCache { private final LoadingCache cache; public UserAgentParserAndCache(final ValidatedConfiguration vc) { - final UserAgentStringParser parser = parserBasedOnTypeConfig(vc.configuration().tracking.uaParser.type); - this.cache = sizeBoundCacheFromLoadingFunction(parser::parse, vc.configuration().tracking.uaParser.cacheSize); + final UserAgentStringParser parser = parserBasedOnTypeConfig(vc.configuration().global.mapper.userAgentParser.type); + this.cache = sizeBoundCacheFromLoadingFunction(parser::parse, vc.configuration().global.mapper.userAgentParser.cacheSize); logger.info("User agent parser data version: {}", parser.getDataVersion()); } @@ -37,15 +38,15 @@ public Optional tryParse(final String userAgentString) { } } - private static UserAgentStringParser parserBasedOnTypeConfig(String type) { + private static UserAgentStringParser parserBasedOnTypeConfig(UserAgentParserConfiguration.ParserType type) { switch (type) { - case "caching_and_updating": + case CACHING_AND_UPDATING: logger.info("Using caching and updating user agent parser."); return UADetectorServiceFactory.getCachingAndUpdatingParser(); - case "online_updating": + case ONLINE_UPDATING: logger.info("Using online updating user agent parser."); return UADetectorServiceFactory.getOnlineUpdatingParser(); - case "non_updating": + case NON_UPDATING: logger.info("Using non-updating (resource module based) user agent parser."); return UADetectorServiceFactory.getResourceModuleParser(); default: diff --git a/src/main/resources/reference.conf 
b/src/main/resources/reference.conf index 34c4127e..d93e2386 100644 --- a/src/main/resources/reference.conf +++ b/src/main/resources/reference.conf @@ -16,361 +16,133 @@ // This is the default configuration. divolte { - server { - // The host to which the server binds. - // Set to a specific IP address to selectively listen on that interface. - // If not present, a loopback-only address will be bound. - //host = 0.0.0.0 - // The bind host can be overridden using the DIVOLTE_HOST environment variable. - //host = ${?DIVOLTE_HOST} - - // The port on which the sever listens. - port = 8290 - // Server port can be overridden using the DIVOLTE_PORT environment variable. - port = ${?DIVOLTE_PORT} - - // Whether to use the X-Forwarded-For header HTTP header - // for determining the source IP of a request if present. - // When a X-Forwared-For header is present, the rightmost - // IP address of the value is used as source IP when - // when multiple IP addresses are separated by a comma. - // When the header is present more than once, the last - // value will be used. - // E.g. - // "X-Forwarded-For: 10.200.13.28, 11.45.82.30" ==> 11.45.82.30 - // - // "X-Forwarded-For: 10.200.13.28" - // "X-Forwarded-For: 11.45.82.30" ==> 11.45.82.30 - use_x_forwarded_for = false - - // When true Divolte Collector serves a static test page at /. - serve_static_resources = true - } - - // The tracking section controls the settings related to the tracking - // JavaScript. This script is compiled using the closure compiler - // (https://developers.google.com/closure/compiler/) on startup of the - // server. During compilation the values from the settings are substituted - // in the JavaScript and thus appear as hard-coded on the client side. - tracking { - // The name of the cookie used for setting a party ID - party_cookie = _dvp - // The expiry time for the party ID cookie - party_timeout = 730 days - - // The name of the cookie used tracking the session ID - session_cookie = _dvs - - // The expiry time for a session - session_timeout = 30 minutes - - // The cookie domain that is assigned to the cookies. - // When left empty, the cookie will have no domain - // explicitly associated with it, which effectively - // sets it to the website domain of the page that - // contains the Divolte Collector JavaScript. - // cookie_domain = '' - - - // This section controls the user agent parsing settings. The user agent - // parsing is based on this library (https://github.com/before/uadetector), - // which allows for dynamic reloading of the backing database if a internet - // connection is available. The parser type controls this behavior. - // Possible values are: - // - non_updating: Uses a local database, bundled - // with Divolte Collector. - // - online_updating: Uses a online database only, never falls back - // to the local database. - // - caching_and_updating: Uses a cached version of the online database - // and periodically checks for new version at the - // remote location. Updates are downloaded - // automatically and cached locally. - - ua_parser { - // The parser type. - type = non_updating - - // User agent parsing is a relatively expensive operation that requires - // many regular expression evaluations. Very often the same user agent - // will make consecutive requests and many clients will have the exact - // same user agent as well. It therefore makes sense to cache the - // parsing results in memory and do a lookup before trying a parse. 
- // This setting determines how many unique user agent strings will be
- // cached.
- cache_size = 1000
+ global {
+ server {
+ // The host to which the server binds.
+ // Set to a specific IP address to selectively listen on that interface.
+ // If not present, a loopback-only address will be bound.
+ //host = 0.0.0.0
+ // The bind host can be overridden using the DIVOLTE_HOST environment variable.
+ //host = ${?DIVOLTE_HOST}
+
+ // The port on which the server listens.
+ port = 8290
+ // Server port can be overridden using the DIVOLTE_PORT environment variable.
+ port = ${?DIVOLTE_PORT}
+
+ // Whether to use the X-Forwarded-For HTTP header
+ // for determining the source IP of a request if present.
+ // When an X-Forwarded-For header is present, the rightmost
+ // IP address of the value is used as source IP
+ // when multiple IP addresses are separated by a comma.
+ // When the header is present more than once, the last
+ // value will be used.
+ // E.g.
+ // "X-Forwarded-For: 10.200.13.28, 11.45.82.30" ==> 11.45.82.30
+ //
+ // "X-Forwarded-For: 10.200.13.28"
+ // "X-Forwarded-For: 11.45.82.30" ==> 11.45.82.30
+ use_x_forwarded_for = false
+
+ // When true Divolte Collector serves a static test page at /.
+ serve_static_resources = true
}
-
- // This configures the ip2geo database for geo lookups. A ip2geo database
- can be obtained from MaxMind (https://www.maxmind.com/en/geoip2-databases).
- // Both a free version and a more accurate paid version are available.
- //
- // By default, no ip2geo database is configured. When this setting is
- // absent, no attempt will be made to lookup geo-coordinates for IP
- // addresses. If configured, Divolte Collector will keep a filesystem
- // watch on the database file. If the file is changed on the filesystem
- // the database will be reloaded at runtime without requireing a restart.
- // ip2geo_database = /path/to/dabase/file.db
-
- // By default, Divolte Collector will use a built-in Avro schema for
- // writing data and a default mapping, which is documented in the
- // Mapping section of the user documentation. The default schema
- // can be found here: https://github.com/divolte/divolte-schema
- //
- // Typically, users will configure their own schema, usually with
- // fields specific to their domain and custom events and other
- // mappings. When using a user defined schema, it is also
- // required to provide a mapping script. See the user documentation
- // for further reference.
-
- // schema_file = /Users/friso/code/divolte-examples/avro-schema/src/main/resources/JavadocEventRecord.avsc
- // schema_mapping {
- // The version of the mapping dialect to use. The current latest
- // version is 2. Version 1 has been deprecated and removed from
- // Divolte Collector since release 0.2
- // version = 2
-
- // The groovy script file to use as mapping definition.
- // mapping_script_file = "/Users/friso/code/divolte-examples/avro-schema/mapping.groovy"
- // }
- }
-
- // The javascript section controls settings related to the way
- // the JavaScript file is compiled.
- javascript {
- // Name of the script file. This changes the divolte.js part in
- // the script url: http://www.domain.tld/divolte.js
- name = divolte.js
-
- // Enable or disable the logging on the JavaScript console in
- // the browser
- logging = false
-
- // When true, the served JavaScript will be compiled, but not
- // minified, improving readability when debugging in the browser.
- debug = false - - // When false, divolte.js will not automatically send a pageView - // event after being loaded. This way clients can send a initial - // event themselves and have full control over the event type and - // the custom parameters that are sent with the initial event. - auto_page_view_event = true - } - - // This section controls settings related to the processing of incoming - // requests after they have been responded to by the server. Incoming - // requests in Divolte Collector are initially handled by a pool of - // HTTP threads, which immediately respond with a HTTP code 200 and send - // the response payload (a 1x1 pixel transparent GIF image). After - // responding, the request data is passed onto the incoming request - // processing thread pool. This is the incoming request processor. - incoming_request_processor { - // Number of threads to use for processing incoming requests - threads = 2 - - // The maximum queue of incoming requests to keep - // before starting to drop incoming requests. Note - // that when this queue is full, requests are dropped - // and a warning is logged. No errors are reported to - // the client side. Divolte Collector will always respond - // with a HTTP 200 status code and the image payload. - // Note that the queue size is per thread. - max_write_queue = 100000 - - // The maximum delay to block before an incoming request - // is dropped in case of a full queue. - max_enqueue_delay = 1 second - - // The incoming request handler attempts to parse out all - // relevant information from the request as passed by the - // JavaScript. If the incoming request appears corrupt, - // for example because of a truncated URL or incorrect - // data in the fields, the request is flagged as corrupt. - // The detection of corrupt requests is enforced by appending - // a hash of all fields to the request from the JavaScript. - // This hash is validated on the server side. - // If this setting is true, events that are flagged as corrupt - // will be dropped from the stream, instead of processed further. - // It is common not to drop the corrupt events, but instead - // include them for later analysis. - discard_corrupted = false - - // Browsers and other clients (e.g. anti-virus software, proxies) - // will sometimes send the exact same request twice. Divolte - // Collector attempts to flag these duplicate events, by using - // a internal probabilistic data structure with a finite memory - // size. The memory consists internally of an array of 64 bit - // integers. This the memory required in bytes is the memory size - // times 8 (8 megabytes for 1 million entries). - // Note that the memory size is per thread. - duplicate_memory_size = 1000000 - - // If this setting is true, events that are flagged as duplicate - // will be dropped from the stream, instead of processed further. - // It is common not to drop the duplicate events, but instead - // include them for later analysis. - discard_duplicates = false - } - - // This section controls settings related to flushing the event stream - // to a Apache Kafka topic. - kafka_flusher { - // If true, flushing to Kafka is enabled. - enabled = false - - // Number of threads to use for flushing events to Kafka - threads = 2 - - // The maximum queue of incoming requests to keep - // before starting to drop incoming requests. Note - // that when this queue is full, requests are dropped - // and a warning is logged. No errors are reported to - // the client side. 
Divolte Collector will always respond
- with a HTTP 200 status code and the image payload.
- // Note that the queue size is per thread.
- max_write_queue = 200000
-
- // The maximum delay to block before an incoming request
- // is dropped in case of a full queue.
- max_enqueue_delay = 1 second
-
- // The Kafka topic onto which events are published.
- topic = "divolte"
- // The topic can be overridden by setting the
- // DIVOLTE_KAFKA_TOPIC environment variable.
- topic = ${?DIVOLTE_KAFKA_TOPIC}
-
- // All settings in here are used as-is to configure
- // the Kafka producer.
- // See: http://kafka.apache.org/documentation.html#producerconfigs
- producer = {
- metadata.broker.list = ["localhost:9092"]
- metadata.broker.list = ${?DIVOLTE_KAFKA_BROKER_LIST}
-
- client.id = divolte.collector
- client.id = ${?DIVOLTE_KAFKA_CLIENT_ID}
-
- request.required.acks = 0
- message.send.max.retries = 5
- retry.backoff.ms = 200
+ mapper {
+ // Size of the buffer used by each mapper to hold the incoming
+ // events that need to be mapped. This is rounded up to the
+ // nearest power of two.
+ buffer_size = 1048576
+
+ // The number of threads each configured mapper should use to
+ // process the events.
+ threads = 1
+
+ // The amount of memory that each mapper thread should use for
+ // detecting duplicate events.
+ duplicate_memory_size = 1000000
+
+ // This section controls the user agent parsing settings. The user agent
+ // parsing is based on this library (https://github.com/before/uadetector),
+ // which allows for dynamic reloading of the backing database if an internet
+ // connection is available.
+ user_agent_parser {
+ // The parser type. Possible values are:
+ // - non_updating: Uses a local database, bundled
+ // with Divolte Collector.
+ // - online_updating: Uses an online database only, never falls back
+ // to the local database.
+ // - caching_and_updating: Uses a cached version of the online database
+ // and periodically checks for new versions at the
+ // remote location. Updates are downloaded
+ // automatically and cached locally.
+ type = non_updating
+
+ // User agent parsing is a relatively expensive operation that requires
+ // many regular expression evaluations. Very often the same user agent
+ // will make consecutive requests and many clients will have the exact
+ // same user agent as well. It therefore makes sense to cache the
+ // parsing results in memory and do a lookup before trying a parse.
+ // This setting determines how many unique user agent strings will be
+ // cached.
+ cache_size = 1000
+ }
}
- }
-
- // This section controls settings related to flushing the event stream
- // to HDFS.
- hdfs_flusher {
- // If true, flushing to HDFS is enabled.
- enabled = true
-
- // Number of threads to use for flushing events to HDFS.
- // Each thread creates its own files on HDFS. Depending
- // on the flushing strategy, multiple concurrent files
- // could be kept open per thread.
- threads = 2
-
- // The maximum queue of incoming requests to keep
- // before starting to drop incoming requests. Note
- // that when this queue is full, requests are dropped
- // and a warning is logged. No errors are reported to
- // the client side. Divolte Collector will always respond
- // with a HTTP 200 status code and the image payload.
- // Note that the queue size is per thread.
- max_write_queue = 100000
- // The maximum delay to block before an incoming request
- // is dropped in case of a full queue.
- max_enqueue_delay = 1 second
-
-
- // HDFS specific settings.
Although it's possible to configure - // a HDFS URI here, it is more advisable to configure HDFS - // settings by specifying a HADOOP_CONF_DIR environment variable - // which will be added to the classpath on startup and as such - // configure the HDFS client automatically. hdfs { - // default nonexistant: Use HADOOP_CONF_DIR on the classpath. - // If not present empty config results in local filesystem being used. - // uri = "file:///" - // uri = ${?DIVOLTE_HDFS_URI} - - // The HDFS replication factor to use when creating - // files. - replication = 1 - - // The replication factor can be overridden by setting the - // DIVOLTE_HDFS_REPLICATION environment variable. - replication = ${?DIVOLTE_HDFS_REPLICATION} + // If true, flushing to HDFS is enabled. + enabled = true + + // Number of threads to use for flushing events to HDFS. + // Each thread creates its own files on HDFS. Depending + // on the flushing strategy, multiple concurrent files + // could be kept open per thread. + threads = 2 + + // The maximum queue of mapped events to buffer before + // starting to drop new ones. Note that when this buffer is full, + // events are dropped and a warning is logged. No errors are reported + // to the source of the events. A single buffer is shared between all + // threads, and its size will be rounded up to the nearest power of 2. + buffer_size = 1048576 + + // Arbitrary HDFS client properties. + // If absent, hdfs-site.xml from the classpath will be used. + //client {} } - // Divolte Collector has two strategies for creating files - // on HDFS and flushing data. By default, a simple rolling - // file strategy is employed. This opens one file per thread - // and rolls on to a new file after a configurable interval. - // Files that are being written to, have a extension of - // .avro.partial and are written the the directory configured - // in the working_dir setting. When a file is closed, it - // will be renamed to have a .avro extension and is moved to - // the directory configured in the publish_dir settins. This - // happens in a single (atomic) filesystem move operation. - file_strategy { - // File strategy type - type = SIMPLE_ROLLING_FILE - - // Roll over files on HDFS after this amount of time. - roll_every = 60 minutes - - // Issue a hsync against files each time this number of - // records has been flushed to it. - sync_file_after_records = 1000 - - // If no records are being flushed, issue a hsync when - // this amount of time passes, regardless of how much - // data was written. - sync_file_after_duration = 30 seconds - - // Directory where files are created and kept while being - // written to. - working_dir = /tmp - - // Directory where files are moved to, after they are closed. - publish_dir = /tmp + kafka { + // If true, flushing to Kafka is enabled. + enabled = false + + // Number of threads to use for flushing events to Kafka + threads = 2 + + // The maximum queue of mapped events to buffer before + // starting to drop new ones. Note that when this buffer is full, + // events are dropped and a warning is logged. No errors are reported + // to the source of the events. A single buffer is shared between all + // threads, and its size will be rounded up to the nearest power of 2. + buffer_size = 1048576 + + // All settings in here are used as-is to configure + // the Kafka producer. 
+ // See: http://kafka.apache.org/documentation.html#producerconfigs + producer = { + metadata.broker.list = ["localhost:9092"] + metadata.broker.list = ${?DIVOLTE_KAFKA_BROKER_LIST} + + client.id = divolte.collector + client.id = ${?DIVOLTE_KAFKA_CLIENT_ID} + + request.required.acks = 0 + message.send.max.retries = 5 + retry.backoff.ms = 200 + } } - - // Next to the rolling file strategy, there is a more complex - // strategy called session binning file strategy. The general - // idea of this strategy is to provide a best effort to put - // events that belong to the same session in the same file. - // - // This strategy assigns event to files as such: - // - Each event is assigned to a round based on timestamp, - // defined as timestamp_in_millis / session_timeout_in_millis. - // - A file is opened for each round as time passes. - // - All events for a session are stored in the file with the - // round marked by the session start time. - // - A file for a round is kept open for at least three times the - // session duration *in absence of failures*. - // - During this entire process, the event timestamp is used for - // events that come off the queue as a logical clock signal. - // - Only in the case of an empty queue, the actual system - // time is used as clock signal. - // - When a file for a round is closed, but events that should be - // in that file still arrive, they are stored in the oldest open - // file. - // - This happens for exceptionally long sessions - // - // This strategy attempts to write events that belong to the same - // session to the same file. Do note that in case of failures, - // this guarantee not longer holds. For this reason, in failure - // scenario's or at shutdown, this strategy DOES NOT move files - // to the publish directory. Users have to setup a separate process - // to periodically move these files out of the way. - -// file_strategy { -// type = SESSION_BINNING -// sync_file_after_records = 1000 -// sync_file_after_duration = 30 seconds -// working_dir = /tmp -// publish_dir = /tmp -// } - } + + // Sources, sinks and mappings are provided only if the user hasn't + // specified anything. Due to the merging rules for configuration, + // defaults are not present here: this is handled in code. 
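+
+ // For illustration only (a sketch, not shipped defaults): the tests elsewhere in this
+ // patch refer to keys such as divolte.sources.browser.javascript.name,
+ // divolte.mappings.test.schema_file and divolte.mappings.test.mapping_script_file, so an
+ // explicit user configuration might look roughly like the commented example below. The
+ // "type", "sources" and "sinks" keys and the file paths are assumptions made for the
+ // example; they are not defined anywhere in this change.
+ //
+ // sources {
+ //   browser {
+ //     type = browser
+ //   }
+ // }
+ //
+ // sinks {
+ //   hdfs {
+ //     type = hdfs
+ //   }
+ //   kafka {
+ //     type = kafka
+ //   }
+ // }
+ //
+ // mappings {
+ //   test {
+ //     schema_file = /path/to/MyEventRecord.avsc
+ //     mapping_script_file = /path/to/mapping.groovy
+ //     sources = [browser]
+ //     sinks = [hdfs, kafka]
+ //   }
+ // }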
} diff --git a/src/test/java/io/divolte/server/DslRecordMapperTest.java b/src/test/java/io/divolte/server/DslRecordMapperTest.java index 74821531..c26ce01b 100644 --- a/src/test/java/io/divolte/server/DslRecordMapperTest.java +++ b/src/test/java/io/divolte/server/DslRecordMapperTest.java @@ -37,6 +37,9 @@ import javax.annotation.ParametersAreNonnullByDefault; +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import io.divolte.server.config.ValidatedConfiguration; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; @@ -52,12 +55,9 @@ import com.google.common.collect.ImmutableMap; import com.google.common.io.Resources; import com.maxmind.geoip2.model.CityResponse; -import com.typesafe.config.Config; -import com.typesafe.config.ConfigFactory; import io.divolte.server.ServerTestUtils.EventPayload; import io.divolte.server.ServerTestUtils.TestServer; -import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.ip2geo.LookupService; import io.divolte.server.ip2geo.LookupService.ClosedServiceException; import io.divolte.server.recordmapping.DslRecordMapper; @@ -361,9 +361,9 @@ public void shouldMapAllGeoIpFields() throws IOException, InterruptedException, copyResourceToFile("geo-mapping.groovy", geoMappingFile); final ImmutableMap mappingConfig = ImmutableMap.of( - "divolte.tracking.schema_mapping.mapping_script_file", geoMappingFile.getAbsolutePath(), - "divolte.tracking.schema_file", avroFile.getAbsolutePath() - ); + "divolte.mappings.test.mapping_script_file", geoMappingFile.getAbsolutePath(), + "divolte.mappings.test.schema_file", avroFile.getAbsolutePath() + ); final Config geoConfig = ConfigFactory.parseMap(mappingConfig) .withFallback(ConfigFactory.parseResources("dsl-mapping-test.conf")) @@ -378,6 +378,7 @@ public void shouldMapAllGeoIpFields() throws IOException, InterruptedException, final DslRecordMapper mapper = new DslRecordMapper( vc, + geoMappingFile.getAbsolutePath(), new Schema.Parser().parse(Resources.toString(Resources.getResource("TestRecord.avsc"), StandardCharsets.UTF_8)), Optional.of(mockLookupService)); @@ -545,8 +546,8 @@ private void setupServer(final String mapping) throws IOException { copyResourceToFile("TestRecord.avsc", avroFile); final ImmutableMap mappingConfig = ImmutableMap.of( - "divolte.tracking.schema_mapping.mapping_script_file", mappingFile.getAbsolutePath(), - "divolte.tracking.schema_file", avroFile.getAbsolutePath() + "divolte.mappings.test.mapping_script_file", mappingFile.getAbsolutePath(), + "divolte.mappings.test.schema_file", avroFile.getAbsolutePath() ); server = new TestServer("dsl-mapping-test.conf", mappingConfig); diff --git a/src/test/java/io/divolte/server/ServerTestUtils.java b/src/test/java/io/divolte/server/ServerTestUtils.java index 944316e5..eda83104 100644 --- a/src/test/java/io/divolte/server/ServerTestUtils.java +++ b/src/test/java/io/divolte/server/ServerTestUtils.java @@ -16,6 +16,13 @@ package io.divolte.server; +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import com.typesafe.config.ConfigValueFactory; +import io.divolte.server.config.ValidatedConfiguration; +import org.apache.avro.generic.GenericRecord; + +import javax.annotation.ParametersAreNonnullByDefault; import java.io.IOException; import java.net.ServerSocket; import java.util.Map; @@ -25,16 +32,6 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.TimeUnit; -import 
javax.annotation.ParametersAreNonnullByDefault; - -import org.apache.avro.generic.GenericRecord; - -import com.typesafe.config.Config; -import com.typesafe.config.ConfigFactory; -import com.typesafe.config.ConfigValueFactory; - -import io.divolte.server.config.ValidatedConfiguration; - public final class ServerTestUtils { /* * Theoretically, this is prone to race conditions, @@ -83,7 +80,7 @@ public TestServer(final String configResource) { public TestServer(final String configResource, final Map extraConfig) { this( findFreePort(), - ConfigFactory.parseMap(extraConfig) + ConfigFactory.parseMap(extraConfig, "Test-specific overrides") .withFallback(ConfigFactory.parseResources(configResource)) .withFallback(ConfigFactory.parseResources("reference-test.conf")) ); @@ -91,7 +88,7 @@ public TestServer(final String configResource, final Map extraCon private TestServer(final int port, final Config config) { this.port = port; - this.config = config.withValue("divolte.server.port", ConfigValueFactory.fromAnyRef(port)); + this.config = config.withValue("divolte.global.server.port", ConfigValueFactory.fromAnyRef(port)); events = new ArrayBlockingQueue<>(100); final ValidatedConfiguration vc = new ValidatedConfiguration(() -> this.config); diff --git a/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java b/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java index c031f875..09205949 100644 --- a/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java +++ b/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java @@ -2,6 +2,7 @@ import static org.junit.Assert.*; +import com.google.common.collect.ImmutableMap; import org.junit.Test; import com.typesafe.config.Config; @@ -19,14 +20,16 @@ public void shouldNotThrowExceptionsOnInvalidConfiguration() { @Test public void shouldValidateJavaScriptName() { - final Config config = - ConfigFactory.parseString( - "divolte.javascript.name = 404.exe\n") - .withFallback(ConfigFactory.parseResources("reference-test.conf")); + final String propertyName = "divolte.sources.browser.javascript.name"; + final String invalidValue = "404.exe"; + final Config config = ConfigFactory.parseMap(ImmutableMap.of(propertyName, invalidValue)) + .withFallback(ConfigFactory.parseResources("reference-test.conf")); final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); assertFalse(vc.errors().isEmpty()); - assertEquals("Property 'divolte.javascript.name' must match \"^[A-Za-z0-9_-]+\\.js$\". Found: '404.exe'.", vc.errors().get(0)); + final String reportedPropertyName = propertyName.replace(".sources.browser.", ".sources[browser]."); + assertEquals("Property '" + reportedPropertyName + "' must match \"^[A-Za-z0-9_-]+\\.js$\". 
Found: '" + invalidValue + "'.", + vc.errors().get(0)); } @Test(expected = IllegalStateException.class) diff --git a/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java b/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java index 546235d2..38d86844 100644 --- a/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java +++ b/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java @@ -16,21 +16,12 @@ package io.divolte.server.hdfs; -import static org.junit.Assert.*; +import com.google.common.collect.ImmutableMap; +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; import io.divolte.server.AvroRecordBuffer; import io.divolte.server.DivolteIdentifier; import io.divolte.server.config.ValidatedConfiguration; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.LongStream; -import java.util.stream.StreamSupport; - import org.apache.avro.Schema; import org.apache.avro.file.DataFileReader; import org.apache.avro.generic.GenericData.Record; @@ -44,89 +35,79 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.typesafe.config.Config; -import com.typesafe.config.ConfigFactory; +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.LongStream; +import java.util.stream.StreamSupport; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +@ParametersAreNonnullByDefault public class HdfsFlusherTest { private static final Logger logger = LoggerFactory.getLogger(HdfsFlusherTest.class); @SuppressWarnings("PMD.AvoidUsingHardCodedIP") private static final String ARBITRARY_IP = "8.8.8.8"; + private Schema schema; private Path tempInflightDir; private Path tempPublishDir; + private List records; + private HdfsFlusher flusher; + @Before - public void setupTempDir() throws IOException { + public void setup() throws IOException { + schema = schemaFromClassPath("/MinimalRecord.avsc"); tempInflightDir = Files.createTempDirectory("hdfs-flusher-test-inflight"); tempPublishDir = Files.createTempDirectory("hdfs-flusher-test-publish"); } @After - public void cleanupTempDir() throws IOException { + public void teardown() throws IOException { + schema = null; + Files.walk(tempInflightDir) .filter((p) -> !p.equals(tempInflightDir)) .forEach(this::deleteQuietly); deleteQuietly(tempInflightDir); + tempInflightDir = null; + Files.walk(tempPublishDir) .filter((p) -> !p.equals(tempPublishDir)) .forEach(this::deleteQuietly); deleteQuietly(tempPublishDir); + tempPublishDir = null; + + flusher = null; + records = null; + flusher = null; } @Test public void shouldCreateAndPopulateFileWithSimpleStrategy() throws IOException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SIMPLE_ROLLING_FILE\n" - + "divolte.hdfs_flusher.file_strategy.roll_every = 1 day\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-test.conf")); - final 
ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - final List records = LongStream.range(0, 10) - .mapToObj((time) -> new GenericRecordBuilder(schema) - .set("ts", time) - .set("remoteHost", ARBITRARY_IP) - .build()) - .collect(Collectors.toList()); - - records.forEach((record) -> flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.generate(), System.currentTimeMillis(), 0, record))); + setupFlusher("1 day", 10); + processRecords(); flusher.cleanup(); Files.walk(tempPublishDir) - .filter((p) -> p.toString().endsWith(".avro")) - .findFirst() - .ifPresent((p) -> verifyAvroFile(records, schema, p)); + .filter((p) -> p.toString().endsWith(".avro")) + .findFirst() + .ifPresent((p) -> verifyAvroFile(records, schema, p)); } @Test public void shouldWriteInProgressFilesWithNonAvroExtension() throws IOException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SIMPLE_ROLLING_FILE\n" - + "divolte.hdfs_flusher.file_strategy.roll_every = 1 day\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - final List records = LongStream.range(0, 10) - .mapToObj((time) -> new GenericRecordBuilder(schema) - .set("ts", time) - .set("remoteHost", ARBITRARY_IP) - .build()) - .collect(Collectors.toList()); - - records.forEach((record) -> flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.generate(), System.currentTimeMillis(), 0, record))); + setupFlusher("1 day", 10); + processRecords(); assertTrue(Files.walk(tempInflightDir) .filter((p) -> p.toString().endsWith(".avro.partial")) @@ -136,90 +117,82 @@ public void shouldWriteInProgressFilesWithNonAvroExtension() throws IOException @Test public void shouldRollFilesWithSimpleStrategy() throws IOException, InterruptedException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SIMPLE_ROLLING_FILE\n" - + "divolte.hdfs_flusher.file_strategy.roll_every = 1 second\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final List records = LongStream.range(0, 5) - .mapToObj((time) -> new GenericRecordBuilder(schema) - .set("ts", time) - .set("remoteHost", ARBITRARY_IP) - .build()) - .collect(Collectors.toList()); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - records.forEach((record) -> flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.generate(), System.currentTimeMillis(), 0, record))); + setupFlusher("1 second", 5); + processRecords(); for (int c = 0; c < 2; c++) { Thread.sleep(500); flusher.heartbeat(); } - records.forEach((record) -> 
flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.generate(), System.currentTimeMillis(), 0, record))); + processRecords(); flusher.cleanup(); final MutableInt count = new MutableInt(0); Files.walk(tempPublishDir) - .filter((p) -> p.toString().endsWith(".avro")) - .forEach((p) -> { - verifyAvroFile(records, schema, p); - count.increment(); - }); + .filter((p) -> p.toString().endsWith(".avro")) + .forEach((p) -> { + verifyAvroFile(records, schema, p); + count.increment(); + }); assertEquals(2, count.intValue()); } @Test public void shouldNotCreateEmptyFiles() throws IOException, InterruptedException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SIMPLE_ROLLING_FILE\n" - + "divolte.hdfs_flusher.file_strategy.roll_every = 100 millisecond\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); + setupFlusher("100 millisecond", 5); - final List records = LongStream.range(0, 5) - .mapToObj((time) -> new GenericRecordBuilder(schema) - .set("ts", time) - .set("remoteHost", ARBITRARY_IP) - .build()) - .collect(Collectors.toList()); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - records.forEach((record) -> flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.generate(), System.currentTimeMillis(), 0, record))); + processRecords(); for (int c = 0; c < 4; c++) { Thread.sleep(500); flusher.heartbeat(); } - records.forEach((record) -> flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.generate(), System.currentTimeMillis(), 0, record))); + processRecords(); flusher.cleanup(); final MutableInt count = new MutableInt(0); Files.walk(tempPublishDir) - .filter((p) -> p.toString().endsWith(".avro")) - .forEach((p) -> { - verifyAvroFile(records, schema, p); - count.increment(); - }); - + .filter((p) -> p.toString().endsWith(".avro")) + .forEach((p) -> { + verifyAvroFile(records, schema, p); + count.increment(); + }); assertEquals(2, count.intValue()); } + private void setupFlusher(final String rollEvery, final int recordCount) throws IOException { + final Config config = ConfigFactory + .parseMap(ImmutableMap.of( + "divolte.sinks.hdfs.file_strategy.roll_every", rollEvery, + "divolte.sinks.hdfs.file_strategy.working_dir", tempInflightDir.toString(), + "divolte.sinks.hdfs.file_strategy.publish_dir", tempPublishDir.toString())) + .withFallback(ConfigFactory.parseResources("hdfs-flusher-test.conf")); + final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); + + records = LongStream.range(0, recordCount) + .mapToObj((time) -> + new GenericRecordBuilder(schema) + .set("ts", time) + .set("remoteHost", ARBITRARY_IP) + .build()) + .collect(Collectors.toList()); + + flusher = new HdfsFlusher(vc, schema); + } + + private void processRecords() { + records.forEach((record) -> + flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), + DivolteIdentifier.generate(), + System.currentTimeMillis(), + 0, + record))); + } private void deleteQuietly(Path p) { try { diff --git a/src/test/java/io/divolte/server/hdfs/SessionBinningFileStrategyTest.java 
b/src/test/java/io/divolte/server/hdfs/SessionBinningFileStrategyTest.java deleted file mode 100644 index ab40730c..00000000 --- a/src/test/java/io/divolte/server/hdfs/SessionBinningFileStrategyTest.java +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Copyright 2014 GoDataDriven B.V. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.divolte.server.hdfs; - -import static org.junit.Assert.*; - -import com.google.common.collect.ImmutableList; -import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.DivolteIdentifier; -import io.divolte.server.config.ValidatedConfiguration; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.LongStream; -import java.util.stream.StreamSupport; - -import org.apache.avro.Schema; -import org.apache.avro.file.DataFileReader; -import org.apache.avro.generic.GenericData.Record; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericRecordBuilder; -import org.apache.avro.io.DatumReader; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.typesafe.config.Config; -import com.typesafe.config.ConfigFactory; - -public class SessionBinningFileStrategyTest { - private static final Logger logger = LoggerFactory.getLogger(SessionBinningFileStrategyTest.class); - - @SuppressWarnings("PMD.AvoidUsingHardCodedIP") - private static final String ARBITRARY_IP = "8.8.8.8"; - - private Path tempInflightDir; - private Path tempPublishDir; - - @Before - public void setupTempDir() throws IOException { - tempInflightDir = Files.createTempDirectory("hdfs-flusher-test-inflight"); - tempPublishDir = Files.createTempDirectory("hdfs-flusher-test-publish"); - } - - @After - public void cleanupTempDir() throws IOException { - Files.walk(tempInflightDir) - .filter((p) -> !p.equals(tempInflightDir)) - .forEach(this::deleteQuietly); - deleteQuietly(tempInflightDir); - Files.walk(tempPublishDir) - .filter((p) -> !p.equals(tempPublishDir)) - .forEach(this::deleteQuietly); - deleteQuietly(tempPublishDir); - } - - @Test - public void shouldCreateFilePerRound() throws IOException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SESSION_BINNING\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-binning-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - final List records = LongStream.range(0, 5) - .mapToObj((time) -> new 
GenericRecordBuilder(schema) - .set("ts", time * 1000 + 100) - .set("remoteHost", ARBITRARY_IP) - .build()) - .collect(Collectors.toList()); - - records.forEach( - (record) -> flusher.process( - AvroRecordBuffer.fromRecord( - DivolteIdentifier.generate((Long) record.get("ts")), - DivolteIdentifier.generate((Long) record.get("ts")), - (Long) record.get("ts"), - 0, - record))); - - flusher.cleanup(); - - final List inflightFiles = Files.walk(tempInflightDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro.partial")) - .collect(Collectors.toList()); - final List publishedFiles = Files.walk(tempPublishDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro")) - .collect(Collectors.toList()); - - /* - * We created 5 events, each in a different round. On each sync event, we evaluate - * which open files can be closed because their 3-session span has elapsed. So: - * a) On the 4th event, the 1st span is completed. - * b) On the 5th event, the 2nd span is completed. - * c) The last 3 spans remain in-flight. - */ - assertEquals(3, inflightFiles.size()); - assertEquals(2 ,publishedFiles.size()); - verifyAvroFile(ImmutableList.of(records.get(0)), schema, publishedFiles.get(0)); - verifyAvroFile(ImmutableList.of(records.get(1)), schema, publishedFiles.get(1)); - verifyAvroFile(ImmutableList.of(records.get(2)), schema, inflightFiles.get(0)); - verifyAvroFile(ImmutableList.of(records.get(3)), schema, inflightFiles.get(1)); - verifyAvroFile(ImmutableList.of(records.get(4)), schema, inflightFiles.get(2)); - } - - @Test - public void eventsShouldStickWithSessionStartTimeRound() throws IOException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SESSION_BINNING\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-binning-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - final List records = LongStream.range(0, 2) - .mapToObj((time) -> new GenericRecordBuilder(schema) - .set("ts", time * 1000 + 100) - .set("remoteHost", ARBITRARY_IP) - .build()) - .collect(Collectors.toList()); - - records.forEach( - (record) -> flusher.process( - AvroRecordBuffer.fromRecord( - DivolteIdentifier.generate((Long) record.get("ts")), - DivolteIdentifier.generate((Long) record.get("ts")), - (Long) record.get("ts"), - 0, - record))); - - records.forEach( - (record) -> flusher.process( - AvroRecordBuffer.fromRecord( - DivolteIdentifier.generate((Long) record.get("ts")), - DivolteIdentifier.generate((Long) record.get("ts")), - (Long) record.get("ts"), - 0, - record))); - - flusher.cleanup(); - - final List avroFiles = Files.walk(tempInflightDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro.partial")) - .collect(Collectors.toList()); - - assertEquals(2, avroFiles.size()); - verifyAvroFile(Arrays.asList(records.get(0), records.get(0)), schema, avroFiles.get(0)); - verifyAvroFile(Arrays.asList(records.get(1), 
records.get(1)), schema, avroFiles.get(1)); - } - - @Test - public void eventsShouldMoveToNextRoundFileIfSessionStartTimeRoundFileIsNoLongerOpen() throws IOException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SESSION_BINNING\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-binning-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - final List records = Arrays.asList( - new GenericRecordBuilder(schema).set("ts", 100L).set("session", DivolteIdentifier.generate(100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 1100L).set("session", DivolteIdentifier.generate(1100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 2100L).set("session", DivolteIdentifier.generate(2100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 3100L).set("session", DivolteIdentifier.generate(3100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 3150L).set("session", DivolteIdentifier.generate(100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 3160L).set("session", DivolteIdentifier.generate(1100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 3170L).set("session", DivolteIdentifier.generate(2100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 3180L).set("session", DivolteIdentifier.generate(3100).value).set("remoteHost", ARBITRARY_IP).build() - ); - - final List buffers = records - .stream() - .map((r) -> AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.tryParse((String) r.get("session")).get(), (Long) r.get("ts"), 0, r)) - .collect(Collectors.toList()); - - buffers.forEach(flusher::process); - flusher.cleanup(); - - final List inflightFiles = Files.walk(tempInflightDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro.partial")) - .collect(Collectors.toList()); - final List publishedFiles = Files.walk(tempPublishDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro")) - .collect(Collectors.toList()); - - assertEquals(1, publishedFiles.size()); - assertEquals(3, inflightFiles.size()); - - verifyAvroFile(Arrays.asList(records.get(0)), schema, publishedFiles.get(0)); - verifyAvroFile(Arrays.asList(records.get(1), records.get(4), records.get(5)), schema, inflightFiles.get(0)); - verifyAvroFile(Arrays.asList(records.get(2), records.get(6)), schema, inflightFiles.get(1)); - verifyAvroFile(Arrays.asList(records.get(3), records.get(7)), schema, inflightFiles.get(2)); - } - - @Test - public void shouldNotPublishInflightFilesOnCleanup() throws IOException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SESSION_BINNING\n" - + 
"divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-binning-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - final Record record = new GenericRecordBuilder(schema) - .set("ts", 100L) - .set("remoteHost", ARBITRARY_IP) - .build(); - - flusher.process(AvroRecordBuffer.fromRecord( - DivolteIdentifier.generate((Long) record.get("ts")), - DivolteIdentifier.generate((Long) record.get("ts")), - (Long) record.get("ts"), - 0, - record)); - flusher.cleanup(); - - final List inflightFiles = Files.walk(tempInflightDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro.partial")) - .collect(Collectors.toList()); - final List publishedFiles = Files.walk(tempPublishDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro")) - .collect(Collectors.toList()); - - assertEquals(1, inflightFiles.size()); - assertEquals(0 ,publishedFiles.size()); - } - - private void deleteQuietly(Path p) { - try { - Files.delete(p); - } catch (final Exception e) { - logger.info("Ignoring failure while deleting file: " + p, e); - } - } - - private void verifyAvroFile(List expected, Schema schema, Path avroFile) { - final List result = StreamSupport - .stream(readAvroFile(schema, avroFile.toFile()).spliterator(), false) - .collect(Collectors.toList()); - - assertEquals(expected, result); - } - - private DataFileReader readAvroFile(Schema schema, File file) { - final DatumReader dr = new GenericDatumReader<>(schema); - try { - return new DataFileReader<>(file, dr); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private Schema schemaFromClassPath(final String resource) throws IOException { - try (final InputStream resourceStream = this.getClass().getResourceAsStream(resource)) { - return new Schema.Parser().parse(resourceStream); - } - } -} diff --git a/src/test/resources/checksum-discard-corrupt-test.conf b/src/test/resources/checksum-discard-corrupt-test.conf index 440cc3dd..8e86f26c 100644 --- a/src/test/resources/checksum-discard-corrupt-test.conf +++ b/src/test/resources/checksum-discard-corrupt-test.conf @@ -15,5 +15,5 @@ // divolte { - incoming_request_processor.discard_corrupted = true + mappings.test.discard_corrupted = false } diff --git a/src/test/resources/checksum-test.conf b/src/test/resources/checksum-test.conf index 639dc399..8e86f26c 100644 --- a/src/test/resources/checksum-test.conf +++ b/src/test/resources/checksum-test.conf @@ -15,5 +15,5 @@ // divolte { - incoming_request_processor.discard_corrupted = false + mappings.test.discard_corrupted = false } diff --git a/src/test/resources/dsl-mapping-test.conf b/src/test/resources/dsl-mapping-test.conf index ac8ba8af..966e4072 100644 --- a/src/test/resources/dsl-mapping-test.conf +++ b/src/test/resources/dsl-mapping-test.conf @@ -14,6 +14,4 @@ // limitations under the License. // -divolte.tracking.schema_mapping { - version = 2 -} +// Nothing needed here. 
diff --git a/src/test/resources/duplicates-test.conf b/src/test/resources/duplicates-test.conf index 3a4de335..d6b586c9 100644 --- a/src/test/resources/duplicates-test.conf +++ b/src/test/resources/duplicates-test.conf @@ -16,5 +16,5 @@ divolte { // Configure 2 slots, so we can control things easily. - incoming_request_processor.duplicate_memory_size = 2 + global.mapper.duplicate_memory_size = 2 } diff --git a/src/test/resources/hdfs-flusher-binning-test.conf b/src/test/resources/hdfs-flusher-binning-test.conf deleted file mode 100644 index 17da524a..00000000 --- a/src/test/resources/hdfs-flusher-binning-test.conf +++ /dev/null @@ -1,42 +0,0 @@ -// -// Copyright 2014 GoDataDriven B.V. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -include classpath("reference-test.conf") - -divolte { - tracking { - session_timeout = 1 second - } - - hdfs_flusher { - enabled = true - - max_write_queue = 10 - max_enqueue_delay = 1 second - threads = 1 - - file_strategy { - type = SESSION_BINNING - sync_file_after_records = 1 - sync_file_after_duration = 1 hour - } - - hdfs { - uri = "file:///" - replication = 1 - } - } -} diff --git a/src/test/resources/hdfs-flusher-test.conf b/src/test/resources/hdfs-flusher-test.conf index 42aaaafa..71e758e5 100644 --- a/src/test/resources/hdfs-flusher-test.conf +++ b/src/test/resources/hdfs-flusher-test.conf @@ -17,22 +17,21 @@ include classpath("reference-test.conf") divolte { - hdfs_flusher { - enabled = true - - max_write_queue = 10 - max_enqueue_delay = 1 second - threads = 1 - - file_strategy { - type = SIMPLE_ROLLING_FILE - sync_file_after_records = 1 - sync_file_after_duration = 1 seconds - } - + global { + // By default the flushers are disabled. Instead events are placed on + // a special queue for the tests to collect. hdfs { - uri = "file:///" - replication = 1 + enabled = true + buffer_size = 16 + threads = 1 + } + } + sinks { + hdfs = { + file_strategy { + sync_file_after_records = 1 + sync_file_after_duration = 1 seconds + } } } } diff --git a/src/test/resources/reference-test.conf b/src/test/resources/reference-test.conf index 00bcc532..1942f87d 100644 --- a/src/test/resources/reference-test.conf +++ b/src/test/resources/reference-test.conf @@ -17,20 +17,41 @@ include classpath("reference.conf") divolte { - // The test server should only listen on loopback by default. - // The port number is a free ephemeral port determined at runtime. - server.host = 127.0.0.1 + global { + server.host = 127.0.0.1 - // For tests we generally want single-threaded processing with a small - // buffer. - incoming_request_processor { - threads = 1 - max_write_queue = 10 - max_enqueue_delay = 1 second + mapper { + // For tests we generally want single-threaded processing with a small + // buffer. + buffer_size = 16 + threads = 1 + } + + // By default the flushers are disabled. Instead events are placed on + // a special queue for the tests to collect. + hdfs.enabled = false + kafka.enabled = false } - // By default the flushers are disabled. 
Instead events are placed on - // a special queue for the tests to collect. - kafka_flusher.enabled = false - hdfs_flusher.enabled = false + // Explicitly specify the default sinks and sources, so that tests can merge properties in. + sources { + browser = { + type = browser + } + } + sinks { + hdfs = { + type = hdfs + } + kafka = { + type = kafka + } + } + + mappings { + test = { + sources = [browser] + sinks = [hdfs,kafka] + } + } } diff --git a/src/test/resources/x-forwarded-for-test.conf b/src/test/resources/x-forwarded-for-test.conf index 1485d6fb..5481b62a 100644 --- a/src/test/resources/x-forwarded-for-test.conf +++ b/src/test/resources/x-forwarded-for-test.conf @@ -16,5 +16,5 @@ divolte { // This is what we're testing. - server.use_x_forwarded_for = true + global.server.use_x_forwarded_for = true } From f8ad26c47c2063d41d8cdc007a892a6a0de7995f Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 6 Nov 2015 20:58:27 +0100 Subject: [PATCH 02/80] Fix the default value for automatic page-view events. --- .../java/io/divolte/server/config/JavascriptConfiguration.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/io/divolte/server/config/JavascriptConfiguration.java b/src/main/java/io/divolte/server/config/JavascriptConfiguration.java index 0d58bdd8..45083074 100644 --- a/src/main/java/io/divolte/server/config/JavascriptConfiguration.java +++ b/src/main/java/io/divolte/server/config/JavascriptConfiguration.java @@ -14,7 +14,7 @@ public final class JavascriptConfiguration { private static final String DEFAULT_NAME = "divolte.js"; private static final boolean DEFAULT_LOGGING = false; private static final boolean DEFAULT_DEBUG = false; - private static final boolean DEFAULT_AUTO_PAGE_VIEW_EVENT = false; + private static final boolean DEFAULT_AUTO_PAGE_VIEW_EVENT = true; static final JavascriptConfiguration DEFAULT_JAVASCRIPT_CONFIGURATION = new JavascriptConfiguration(Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty()); From 0acf80e652c815c6684ee8f14502a1334801aec8 Mon Sep 17 00:00:00 2001 From: Friso van Vollenhoven Date: Mon, 23 Nov 2015 11:28:06 +0100 Subject: [PATCH 03/80] Favor ImmutableProperties over getter with defensive copy. 
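In rough terms, the change replaces a defensive copy in the getter with an immutable Properties subclass, so the field can be exposed directly. A usage sketch (not part of the patch; it assumes a ValidatedConfiguration instance named vc, as used elsewhere in the code base):

    // Before this patch: every read paid for a defensive clone of the Properties.
    final Properties producerBefore = vc.configuration().global.kafka.getProducer();

    // After this patch: the public field holds an ImmutableProperties, so no copy is
    // needed and accidental mutation fails fast.
    final Properties producerAfter = vc.configuration().global.kafka.producer;
    producerAfter.setProperty("request.required.acks", "1"); // throws UnsupportedOperationException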
--- .../server/config/HdfsConfiguration.java | 19 +++--- .../server/config/ImmutableProperties.java | 62 +++++++++++++++++++ .../server/config/KafkaConfiguration.java | 18 ++---- .../io/divolte/server/hdfs/HdfsFlusher.java | 29 ++++----- .../io/divolte/server/kafka/KafkaFlusher.java | 31 ++++++---- 5 files changed, 108 insertions(+), 51 deletions(-) create mode 100644 src/main/java/io/divolte/server/config/ImmutableProperties.java diff --git a/src/main/java/io/divolte/server/config/HdfsConfiguration.java b/src/main/java/io/divolte/server/config/HdfsConfiguration.java index ceaf62f5..41cd9c4b 100644 --- a/src/main/java/io/divolte/server/config/HdfsConfiguration.java +++ b/src/main/java/io/divolte/server/config/HdfsConfiguration.java @@ -1,22 +1,22 @@ package io.divolte.server.config; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.google.common.base.MoreObjects; - -import javax.annotation.ParametersAreNonnullByDefault; import java.util.Optional; import java.util.Properties; +import javax.annotation.ParametersAreNonnullByDefault; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; + @ParametersAreNonnullByDefault public final class HdfsConfiguration extends SinkTypeConfiguration { - private final Optional client; + public final Optional client; @JsonCreator HdfsConfiguration(final boolean enabled, final int bufferSize, final int threads, final Optional client) { super(bufferSize, threads, enabled); - // Defensive copy: ensure our copy remains immutable. - this.client = client.map(properties -> (Properties) properties.clone()); + this.client = client.map(ImmutableProperties::fromSource); } @Override @@ -24,9 +24,4 @@ protected MoreObjects.ToStringHelper toStringHelper() { return super.toStringHelper() .add("client", client); } - - public Optional getClient() { - // Defensive copy: we can't stop callers from modifying what we return. 
- return client.map(properties -> (Properties) properties.clone()); - } } diff --git a/src/main/java/io/divolte/server/config/ImmutableProperties.java b/src/main/java/io/divolte/server/config/ImmutableProperties.java new file mode 100644 index 00000000..8245e4f8 --- /dev/null +++ b/src/main/java/io/divolte/server/config/ImmutableProperties.java @@ -0,0 +1,62 @@ +package io.divolte.server.config; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.InvalidPropertiesFormatException; +import java.util.Map; +import java.util.Properties; + +public class ImmutableProperties extends Properties { + private static final long serialVersionUID = 1333087762733134653L; + + public static ImmutableProperties fromSource(final Properties source) { + final ImmutableProperties result = new ImmutableProperties(); + source.forEach(result::set); + return result; + } + + private void set(final Object key, final Object value) { + super.put(key, value); + } + + @Override + public synchronized void load(final InputStream inStream) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized void load(final Reader reader) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized void loadFromXML(final InputStream in) throws IOException, InvalidPropertiesFormatException { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized Object setProperty(final String key, final String value) { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized Object put(final Object key, final Object value) { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized void putAll(final Map t) { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized Object remove(final Object key) { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized void clear() { + throw new UnsupportedOperationException(); + } +} diff --git a/src/main/java/io/divolte/server/config/KafkaConfiguration.java b/src/main/java/io/divolte/server/config/KafkaConfiguration.java index 80a4bf04..9f3302c9 100644 --- a/src/main/java/io/divolte/server/config/KafkaConfiguration.java +++ b/src/main/java/io/divolte/server/config/KafkaConfiguration.java @@ -1,22 +1,21 @@ package io.divolte.server.config; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.google.common.base.MoreObjects; +import java.util.Properties; import javax.annotation.ParametersAreNonnullByDefault; -import java.util.Objects; -import java.util.Properties; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; @ParametersAreNonnullByDefault public class KafkaConfiguration extends SinkTypeConfiguration { - private final Properties producer; + public final Properties producer; @JsonCreator KafkaConfiguration(final int bufferSize, final int threads, final boolean enabled, final Properties producer) { super(bufferSize, threads, enabled); - // Defensive copy: ensure our copy remains immutable. - this.producer = Objects.requireNonNull((Properties) producer.clone()); + this.producer = ImmutableProperties.fromSource(producer); } @Override @@ -24,9 +23,4 @@ protected MoreObjects.ToStringHelper toStringHelper() { return super.toStringHelper() .add("producer", producer); } - - public Properties getProducer() { - // Defensive copy: we can't stop callers from modifying what we return. 
- return (Properties)producer.clone(); - } } diff --git a/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java b/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java index 2d755ec4..5a9bb3ac 100644 --- a/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java +++ b/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java @@ -16,24 +16,25 @@ package io.divolte.server.hdfs; -import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.config.ValidatedConfiguration; -import io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult; -import io.divolte.server.processing.ItemProcessor; +import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.*; +import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; + +import java.io.IOException; +import java.util.Objects; + +import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.concurrent.NotThreadSafe; + import org.apache.avro.Schema; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.annotation.ParametersAreNonnullByDefault; -import javax.annotation.concurrent.NotThreadSafe; -import java.io.IOException; -import java.util.Objects; - -import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.SUCCESS; -import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.CONTINUE; -import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.PAUSE; +import io.divolte.server.AvroRecordBuffer; +import io.divolte.server.config.ValidatedConfiguration; +import io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult; +import io.divolte.server.processing.ItemProcessor; @ParametersAreNonnullByDefault @NotThreadSafe @@ -46,7 +47,7 @@ public final class HdfsFlusher implements ItemProcessor { public HdfsFlusher(final ValidatedConfiguration vc, final Schema schema) { Objects.requireNonNull(vc); - final Configuration hdfsConfiguration = vc.configuration().global.hdfs.getClient() + final Configuration hdfsConfiguration = vc.configuration().global.hdfs.client .map(clientProperties -> { final Configuration configuration = new Configuration(false); for (final String propertyName : clientProperties.stringPropertyNames()) { @@ -87,7 +88,7 @@ public void cleanup() { } @Override - public ProcessingDirective process(AvroRecordBuffer record) { + public ProcessingDirective process(final AvroRecordBuffer record) { if (lastHdfsResult == SUCCESS) { return (lastHdfsResult = fileStrategy.append(record)) == SUCCESS ? 
CONTINUE : PAUSE; } else { diff --git a/src/main/java/io/divolte/server/kafka/KafkaFlusher.java b/src/main/java/io/divolte/server/kafka/KafkaFlusher.java index dfd256a1..ef153751 100644 --- a/src/main/java/io/divolte/server/kafka/KafkaFlusher.java +++ b/src/main/java/io/divolte/server/kafka/KafkaFlusher.java @@ -16,6 +16,23 @@ package io.divolte.server.kafka; +import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.Queue; +import java.util.stream.Collectors; + +import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.concurrent.NotThreadSafe; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import io.divolte.server.AvroRecordBuffer; import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.processing.ItemProcessor; @@ -23,18 +40,6 @@ import kafka.javaapi.producer.Producer; import kafka.producer.KeyedMessage; import kafka.producer.ProducerConfig; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.ParametersAreNonnullByDefault; -import javax.annotation.concurrent.NotThreadSafe; -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; -import java.util.*; -import java.util.stream.Collectors; - -import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.CONTINUE; -import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.PAUSE; @ParametersAreNonnullByDefault @NotThreadSafe @@ -50,7 +55,7 @@ public final class KafkaFlusher implements ItemProcessor { public KafkaFlusher(final ValidatedConfiguration vc) { Objects.requireNonNull(vc); - final ProducerConfig producerConfig = new ProducerConfig(vc.configuration().global.kafka.getProducer()); + final ProducerConfig producerConfig = new ProducerConfig(vc.configuration().global.kafka.producer); topic = vc.configuration().kafkaFlusher.topic; producer = new Producer<>(producerConfig); } From d32f4ea78e7d33422311798d7a8c8437ae9a069c Mon Sep 17 00:00:00 2001 From: Friso van Vollenhoven Date: Tue, 24 Nov 2015 11:42:16 +0100 Subject: [PATCH 04/80] Use deprecation annotation instead of comment. 
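The Javadoc-comment form is replaced by the annotation, which is retained in the compiled class file and therefore visible to the compiler, IDEs and reflection. A possible follow-up (not part of this patch) is to carry both, with the Javadoc tag documenting the replacement, for example:

    /** @deprecated Use the {@code sources}, {@code sinks} and {@code mappings} maps instead. */
    @Deprecated
    public final MappingConfiguration incomingRequestProcessor;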
--- .../io/divolte/server/config/DivolteConfiguration.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index fd05a7ae..12de732e 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -19,13 +19,13 @@ public final class DivolteConfiguration { @Valid public final ImmutableMap sinks; @Valid public final ImmutableMap mappings; - /** @deprecated */ + @Deprecated public final MappingConfiguration incomingRequestProcessor; - /** @deprecated */ + @Deprecated public final BrowserSourceConfiguration browserSourceConfiguration; - /** @deprecated */ + @Deprecated public final KafkaSinkConfiguration kafkaFlusher; - /** @deprecated */ + @Deprecated public final HdfsSinkConfiguration hdfsFlusher; @JsonCreator From e51eabba0281d0ce55c4d1fb227b5793518b6eca Mon Sep 17 00:00:00 2001 From: Friso van Vollenhoven Date: Tue, 24 Nov 2015 11:54:09 +0100 Subject: [PATCH 05/80] Do not throw RuntimeException on MappingException, but instead catch mapping exceptions, turn them into useful error messages and leave the configuration invalid. --- src/main/java/io/divolte/server/Server.java | 4 +- .../server/config/ValidatedConfiguration.java | 60 +++++++++++++++---- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index ad618642..b3cdeb9a 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -146,8 +146,8 @@ public void shutdown() { public static void main(final String[] args) { final ValidatedConfiguration vc = new ValidatedConfiguration(ConfigFactory::load); if (!vc.isValid()) { - System.err.println("There are configuration errors. Details:"); - vc.errors().forEach(System.err::println); + vc.errors().forEach(logger::error); + logger.error("There are configuration errors. 
Exiting server."); System.exit(1); } diff --git a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java index 5c7c3892..15d48db5 100644 --- a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java +++ b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java @@ -16,9 +16,31 @@ package io.divolte.server.config; +import java.io.IOException; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.Properties; +import java.util.Set; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +import javax.annotation.ParametersAreNonnullByDefault; +import javax.validation.ConstraintViolation; +import javax.validation.Validation; +import javax.validation.Validator; + +import org.hibernate.validator.HibernateValidator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.JsonMappingException; +import com.fasterxml.jackson.databind.JsonMappingException.Reference; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.PropertyNamingStrategy; +import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException; import com.fasterxml.jackson.databind.module.SimpleModule; import com.fasterxml.jackson.datatype.guava.GuavaModule; import com.fasterxml.jackson.datatype.jdk8.Jdk8Module; @@ -28,18 +50,6 @@ import com.jasonclawson.jackson.dataformat.hocon.HoconTreeTraversingParser; import com.typesafe.config.Config; import com.typesafe.config.ConfigException; -import org.hibernate.validator.HibernateValidator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.ParametersAreNonnullByDefault; -import javax.validation.ConstraintViolation; -import javax.validation.Validation; -import javax.validation.Validator; -import java.io.IOException; -import java.time.Duration; -import java.util.*; -import java.util.function.Supplier; /** * Container for a validated configuration loaded from a {@code Config} @@ -85,6 +95,32 @@ public ValidatedConfiguration(final Supplier configLoader) { logger.debug("Configuration error caught during validation.", e); configurationErrors.add(e.getMessage()); divolteConfiguration = null; + } catch (final UnrecognizedPropertyException e) { + // Add a special case for unknown property as we add the list of available properties to the message. + logger.debug("Configuration error. Exception while mapping.", e); + final String message = String.format( + "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'.%n\tAvailable properties: %s.", + e.getOriginalMessage(), + e.getLocation().getSourceRef(), + e.getPath().stream() + .map(Reference::getFieldName) + .collect(Collectors.joining(".")), + e.getKnownPropertyIds().stream() + .map(Object::toString).map(s -> "'" + s + "'") + .collect(Collectors.joining(", "))); + configurationErrors.add(message); + divolteConfiguration = null; + } catch (final JsonMappingException e) { + logger.debug("Configuration error. 
Exception while mapping.", e); + final String message = String.format( + "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'.", + e.getOriginalMessage(), + e.getLocation().getSourceRef(), + e.getPath().stream() + .map(Reference::getFieldName) + .collect(Collectors.joining("."))); + configurationErrors.add(message); + divolteConfiguration = null; } catch (final IOException e) { logger.error("Error while reading configuration!", e); throw new RuntimeException(e); From c00ed4e96d7f82adb08e1e9cda381dbfb25c9756 Mon Sep 17 00:00:00 2001 From: Friso van Vollenhoven Date: Tue, 24 Nov 2015 11:57:14 +0100 Subject: [PATCH 06/80] Extract some methods for readability. --- .../server/config/ValidatedConfiguration.java | 44 ++++++++++++------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java index 15d48db5..a053a8f0 100644 --- a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java +++ b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java @@ -98,27 +98,12 @@ public ValidatedConfiguration(final Supplier configLoader) { } catch (final UnrecognizedPropertyException e) { // Add a special case for unknown property as we add the list of available properties to the message. logger.debug("Configuration error. Exception while mapping.", e); - final String message = String.format( - "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'.%n\tAvailable properties: %s.", - e.getOriginalMessage(), - e.getLocation().getSourceRef(), - e.getPath().stream() - .map(Reference::getFieldName) - .collect(Collectors.joining(".")), - e.getKnownPropertyIds().stream() - .map(Object::toString).map(s -> "'" + s + "'") - .collect(Collectors.joining(", "))); + final String message = messageForUnrecognizedPropertyException(e); configurationErrors.add(message); divolteConfiguration = null; } catch (final JsonMappingException e) { logger.debug("Configuration error. 
Exception while mapping.", e); - final String message = String.format( - "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'.", - e.getOriginalMessage(), - e.getLocation().getSourceRef(), - e.getPath().stream() - .map(Reference::getFieldName) - .collect(Collectors.joining("."))); + final String message = messageForMappingException(e); configurationErrors.add(message); divolteConfiguration = null; } catch (final IOException e) { @@ -130,6 +115,31 @@ public ValidatedConfiguration(final Supplier configLoader) { this.divolteConfiguration = Optional.ofNullable(divolteConfiguration); } + private String messageForMappingException(final JsonMappingException e) { + final String message = String.format( + "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'.", + e.getOriginalMessage(), + e.getLocation().getSourceRef(), + e.getPath().stream() + .map(Reference::getFieldName) + .collect(Collectors.joining("."))); + return message; + } + + private static String messageForUnrecognizedPropertyException(final UnrecognizedPropertyException e) { + final String message = String.format( + "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'.%n\tAvailable properties: %s.", + e.getOriginalMessage(), + e.getLocation().getSourceRef(), + e.getPath().stream() + .map(Reference::getFieldName) + .collect(Collectors.joining(".")), + e.getKnownPropertyIds().stream() + .map(Object::toString).map(s -> "'" + s + "'") + .collect(Collectors.joining(", "))); + return message; + } + private void validate(final List configurationErrors, final DivolteConfiguration divolteConfiguration) { final Validator validator = Validation .byProvider(HibernateValidator.class) From 04d9b2ba71ca94828f27b50e8f7b09d64ee514b5 Mon Sep 17 00:00:00 2001 From: Friso van Vollenhoven Date: Fri, 4 Dec 2015 16:42:08 +0100 Subject: [PATCH 07/80] Validators for DivolteConfiguration. 
--- .../server/config/DivolteConfiguration.java | 81 ++++++++++++++++--- .../server/config/ValidatedConfiguration.java | 29 ++++--- .../MappingSourceSinkReferencesMustExist.java | 36 +++++++++ .../config/constraint/OneSchemaPerSink.java | 36 +++++++++ .../SourceAndSinkNamesCannotCollide.java | 36 +++++++++ .../config/ValidatedConfigurationTest.java | 36 +++++++++ src/test/resources/missing-sources-sinks.conf | 57 +++++++++++++ .../resources/multiple-schemas-one-sink.conf | 66 +++++++++++++++ .../resources/source-sink-collisions.conf | 72 +++++++++++++++++ 9 files changed, 430 insertions(+), 19 deletions(-) create mode 100644 src/main/java/io/divolte/server/config/constraint/MappingSourceSinkReferencesMustExist.java create mode 100644 src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java create mode 100644 src/main/java/io/divolte/server/config/constraint/SourceAndSinkNamesCannotCollide.java create mode 100644 src/test/resources/missing-sources-sinks.conf create mode 100644 src/test/resources/multiple-schemas-one-sink.conf create mode 100644 src/test/resources/source-sink-collisions.conf diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index 12de732e..c3baf52c 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -1,18 +1,35 @@ package io.divolte.server.config; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import javax.annotation.ParametersAreNonnullByDefault; +import javax.validation.Valid; + import com.fasterxml.jackson.annotation.JsonCreator; import com.google.common.base.MoreObjects; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Iterators; +import com.google.common.collect.Sets; -import javax.annotation.ParametersAreNonnullByDefault; -import javax.validation.Valid; -import java.util.Objects; -import java.util.Optional; +import io.divolte.server.config.constraint.MappingSourceSinkReferencesMustExist; +import io.divolte.server.config.constraint.OneSchemaPerSink; +import io.divolte.server.config.constraint.SourceAndSinkNamesCannotCollide; @ParametersAreNonnullByDefault +@MappingSourceSinkReferencesMustExist +@SourceAndSinkNamesCannotCollide +@OneSchemaPerSink public final class DivolteConfiguration { @Valid public final GlobalConfiguration global; @Valid public final ImmutableMap sources; @@ -37,19 +54,63 @@ public final class DivolteConfiguration { this.sinks = sinks.orElseGet(DivolteConfiguration::defaultSinkConfigurations); this.mappings = mappings.orElseGet(() -> defaultMappingConfigurations(this.sources.keySet(), this.sinks.keySet())); this.global = Objects.requireNonNull(global); + // Temporary interop - this.incomingRequestProcessor = Iterables.getOnlyElement(this.mappings.values()); - this.browserSourceConfiguration = (BrowserSourceConfiguration) Iterables.getOnlyElement(this.sources.values()); - this.kafkaFlusher = (KafkaSinkConfiguration) Iterators.getOnlyElement(this.sinks.values().stream().filter((sink) -> sink instanceof KafkaSinkConfiguration).iterator()); - this.hdfsFlusher = (HdfsSinkConfiguration) 
Iterators.getOnlyElement(this.sinks.values().stream().filter((sink) -> sink instanceof HdfsSinkConfiguration).iterator()); - // TODO: Validate that the mappings refer to defined sources and sinks. - // TODO: Validate that all mappings that refer to a sink have the same schema. + this.incomingRequestProcessor = Iterables.get(this.mappings.values(), 0); + this.browserSourceConfiguration = (BrowserSourceConfiguration) Iterables.get(this.sources.values(), 0); + this.kafkaFlusher = (KafkaSinkConfiguration) Iterators.get(this.sinks.values().stream().filter((sink) -> sink instanceof KafkaSinkConfiguration).iterator(), 0); + this.hdfsFlusher = (HdfsSinkConfiguration) Iterators.get(this.sinks.values().stream().filter((sink) -> sink instanceof HdfsSinkConfiguration).iterator(), 0); // TODO: Optimizations: // - Elide HDFS and Kafka sinks if they are globally disabled. // - Elide unreferenced sources and sinks. } + /* + * Validation support methods here. + * + * As bean validation uses expression language for rendering error messages, + * substitutions need to be available for some of these. EL doesn't allow for + * access to attributes, just getters/setters and methods. Hence, here are a + * number of methods that are used to render validation messages. The results + * of these methods can also be used for actual validation. + */ + public Set missingSourcesSinks() { + final Set defined = new HashSet<>(); + defined.addAll(sources.keySet()); + defined.addAll(sinks.keySet()); + + final Set used = mappings + .values() + .stream() + .flatMap(mc -> Stream.concat( + mc.sources.stream(), + mc.sinks.stream())) + .collect(Collectors.toSet()); + + return Sets.difference(used, defined); + } + + public Set collidingSourceAndSinkNames() { + return Sets.intersection(sources.keySet(), sinks.keySet()); + } + + public Set sinksWithMultipleSchemas() { + final Map> sinkSchemas = new HashMap<>(); + for (final MappingConfiguration mc : mappings.values()) { + for (final String s : mc.sinks) { + sinkSchemas.computeIfAbsent(s, i -> new ArrayList<>()).add(mc.schemaFile.orElse("")); + } + } + + return sinkSchemas.entrySet() + .stream() + .filter(e -> e.getValue().size() > 1) + .map(Map.Entry::getKey) + .collect(Collectors.toSet()); + } + + // Defaults private static ImmutableMap defaultSourceConfigurations() { return ImmutableMap.of("browser", new BrowserSourceConfiguration(Optional.empty(), Optional.empty(),
Logger logger = LoggerFactory.getLogger(ValidatedConfiguration.class); + private final static Joiner DOT_JOINER = Joiner.on('.'); + private final ImmutableList configurationErrors; private final Optional divolteConfiguration; @@ -90,7 +94,7 @@ public ValidatedConfiguration(final Supplier configLoader) { */ final Config config = configLoader.get(); divolteConfiguration = mapped(config.getConfig("divolte")); - validate(configurationErrors, divolteConfiguration); + configurationErrors.addAll(validate(divolteConfiguration)); } catch(final ConfigException e) { logger.debug("Configuration error caught during validation.", e); configurationErrors.add(e.getMessage()); @@ -116,13 +120,14 @@ public ValidatedConfiguration(final Supplier configLoader) { } private String messageForMappingException(final JsonMappingException e) { + final String pathToError = e.getPath().stream() + .map(Reference::getFieldName) + .collect(Collectors.joining(".")); final String message = String.format( "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'.", e.getOriginalMessage(), - e.getLocation().getSourceRef(), - e.getPath().stream() - .map(Reference::getFieldName) - .collect(Collectors.joining("."))); + Optional.ofNullable(e.getLocation()).map(JsonLocation::getSourceRef).orElse(""), + "".equals(pathToError) ? "" : pathToError); return message; } @@ -140,7 +145,7 @@ private static String messageForUnrecognizedPropertyException(final Unrecognized return message; } - private void validate(final List configurationErrors, final DivolteConfiguration divolteConfiguration) { + private List validate(final DivolteConfiguration divolteConfiguration) { final Validator validator = Validation .byProvider(HibernateValidator.class) .configure() @@ -149,9 +154,15 @@ private void validate(final List configurationErrors, final DivolteConfi final Set> validationErrors = validator.validate(divolteConfiguration); - validationErrors.forEach((e) -> configurationErrors.add( - String.format("Property 'divolte.%s' %s. Found: '%s'.", e.getPropertyPath(), e.getMessage(), e.getInvalidValue()) - )); + return validationErrors + .stream() + .map( + (e) -> String.format( + "Property '%s' %s. 
Found: '%s'.", + DOT_JOINER.join("divolte", e.getPropertyPath()), + e.getMessage(), + e.getInvalidValue())) + .collect(Collectors.toList()); } private static DivolteConfiguration mapped(final Config input) throws IOException { diff --git a/src/main/java/io/divolte/server/config/constraint/MappingSourceSinkReferencesMustExist.java b/src/main/java/io/divolte/server/config/constraint/MappingSourceSinkReferencesMustExist.java new file mode 100644 index 00000000..4a96cb0c --- /dev/null +++ b/src/main/java/io/divolte/server/config/constraint/MappingSourceSinkReferencesMustExist.java @@ -0,0 +1,36 @@ +package io.divolte.server.config.constraint; + +import static java.lang.annotation.ElementType.*; +import static java.lang.annotation.RetentionPolicy.*; + +import java.lang.annotation.Documented; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import javax.validation.Constraint; +import javax.validation.ConstraintValidator; +import javax.validation.ConstraintValidatorContext; +import javax.validation.Payload; + +import io.divolte.server.config.DivolteConfiguration; + +@Target({ TYPE }) +@Retention(RUNTIME) +@Constraint(validatedBy = MappingSourceSinkReferencesMustExist.Validator.class) +@Documented +public @interface MappingSourceSinkReferencesMustExist { + String message() default "The following sources and/or sinks were used in a mapping but never defined: ${validatedValue.missingSourcesSinks()}."; + Class[] groups() default {}; + Class[] payload() default {}; + + public static final class Validator implements ConstraintValidator{ + @Override + public void initialize(final MappingSourceSinkReferencesMustExist constraintAnnotation) { + } + + @Override + public boolean isValid(final DivolteConfiguration value, final ConstraintValidatorContext context) { + return value.missingSourcesSinks().isEmpty(); + } + } +} diff --git a/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java b/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java new file mode 100644 index 00000000..a207532c --- /dev/null +++ b/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java @@ -0,0 +1,36 @@ +package io.divolte.server.config.constraint; + +import static java.lang.annotation.ElementType.*; +import static java.lang.annotation.RetentionPolicy.*; + +import java.lang.annotation.Documented; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import javax.validation.Constraint; +import javax.validation.ConstraintValidator; +import javax.validation.ConstraintValidatorContext; +import javax.validation.Payload; + +import io.divolte.server.config.DivolteConfiguration; + +@Target({ TYPE }) +@Retention(RUNTIME) +@Constraint(validatedBy=OneSchemaPerSink.Validator.class) +@Documented +public @interface OneSchemaPerSink { + String message() default "Any sink can only use one schema. 
The following sinks have multiple mappings with different schema's linked to them: ${validatedValue.sinksWithMultipleSchemas()}."; + Class[] groups() default {}; + Class[] payload() default {}; + + public static class Validator implements ConstraintValidator { + @Override + public void initialize(final OneSchemaPerSink constraintAnnotation) { + } + + @Override + public boolean isValid(final DivolteConfiguration value, final ConstraintValidatorContext context) { + return value.sinksWithMultipleSchemas().size() == 0; + } + } +} diff --git a/src/main/java/io/divolte/server/config/constraint/SourceAndSinkNamesCannotCollide.java b/src/main/java/io/divolte/server/config/constraint/SourceAndSinkNamesCannotCollide.java new file mode 100644 index 00000000..d084619a --- /dev/null +++ b/src/main/java/io/divolte/server/config/constraint/SourceAndSinkNamesCannotCollide.java @@ -0,0 +1,36 @@ +package io.divolte.server.config.constraint; + +import static java.lang.annotation.ElementType.*; +import static java.lang.annotation.RetentionPolicy.*; + +import java.lang.annotation.Documented; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import javax.validation.Constraint; +import javax.validation.ConstraintValidator; +import javax.validation.ConstraintValidatorContext; +import javax.validation.Payload; + +import io.divolte.server.config.DivolteConfiguration; + +@Target({ TYPE }) +@Retention(RUNTIME) +@Constraint(validatedBy=SourceAndSinkNamesCannotCollide.Validator.class) +@Documented +public @interface SourceAndSinkNamesCannotCollide { + String message() default "Source and sink names cannot collide (must be globally unique). The following names were both used as source and as sink: ${validatedValue.collidingSourceAndSinkNames()}."; + Class[] groups() default {}; + Class[] payload() default {}; + + public static class Validator implements ConstraintValidator { + @Override + public void initialize(final SourceAndSinkNamesCannotCollide constraintAnnotation) { + } + + @Override + public boolean isValid(final DivolteConfiguration value, final ConstraintValidatorContext context) { + return value.collidingSourceAndSinkNames().isEmpty(); + } + } +} diff --git a/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java b/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java index 09205949..d94e6fe3 100644 --- a/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java +++ b/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java @@ -55,4 +55,40 @@ public void shouldMapReferenceConfig() { final ValidatedConfiguration vc = new ValidatedConfiguration(ConfigFactory::load); assertTrue(vc.errors().isEmpty()); } + + @Test + public void shouldReportMissingSourcesAndSinks() { + final ValidatedConfiguration vc = new ValidatedConfiguration(() -> ConfigFactory.parseResources("missing-sources-sinks.conf")); + + assertFalse(vc.isValid()); + assertEquals(1, vc.errors().size()); + assertTrue( + vc.errors() + .get(0) + .startsWith("Property 'divolte.' The following sources and/or sinks were used in a mapping but never defined: [missing-sink, missing-source]..")); + } + + @Test + public void sourceAndSinkNamesCannotCollide() { + final ValidatedConfiguration vc = new ValidatedConfiguration(() -> ConfigFactory.parseResources("source-sink-collisions.conf")); + + assertFalse(vc.isValid()); + assertEquals(1, vc.errors().size()); + assertTrue( + vc.errors() + .get(0) + .startsWith("Property 'divolte.' 
Source and sink names cannot collide (must be globally unique). The following names were both used as source and as sink: [foo, bar]..")); + } + + @Test + public void sinksCanOnlyHaveOneSchema() { + final ValidatedConfiguration vc = new ValidatedConfiguration(() -> ConfigFactory.parseResources("multiple-schemas-one-sink.conf")); + + assertFalse(vc.isValid()); + assertEquals(1, vc.errors().size()); + assertTrue( + vc.errors() + .get(0) + .startsWith("Property 'divolte.' Any sink can only use one schema. The following sinks have multiple mappings with different schema's linked to them: [kafka]..")); + } } diff --git a/src/test/resources/missing-sources-sinks.conf b/src/test/resources/missing-sources-sinks.conf new file mode 100644 index 00000000..4e23c623 --- /dev/null +++ b/src/test/resources/missing-sources-sinks.conf @@ -0,0 +1,57 @@ +// +// Copyright 2014 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +include classpath("reference.conf") + +divolte { + global { + server.host = 127.0.0.1 + + mapper { + // For tests we generally want single-threaded processing with a small + // buffer. + buffer_size = 16 + threads = 1 + } + + // By default the flushers are disabled. Instead events are placed on + // a special queue for the tests to collect. + hdfs.enabled = false + kafka.enabled = false + } + + // Explicitly specify the default sinks and sources, so that tests can merge properties in. + sources { + browser = { + type = browser + } + } + sinks { + hdfs = { + type = hdfs + } + kafka = { + type = kafka + } + } + + mappings { + test = { + sources = [browser,missing-source] + sinks = [hdfs,kafka,missing-sink] + } + } +} diff --git a/src/test/resources/multiple-schemas-one-sink.conf b/src/test/resources/multiple-schemas-one-sink.conf new file mode 100644 index 00000000..1a2d54e0 --- /dev/null +++ b/src/test/resources/multiple-schemas-one-sink.conf @@ -0,0 +1,66 @@ +// +// Copyright 2014 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +include classpath("reference.conf") + +divolte { + global { + server.host = 127.0.0.1 + + mapper { + // For tests we generally want single-threaded processing with a small + // buffer. + buffer_size = 16 + threads = 1 + } + + // By default the flushers are disabled. Instead events are placed on + // a special queue for the tests to collect. + hdfs.enabled = false + kafka.enabled = false + } + + // Explicitly specify the default sinks and sources, so that tests can merge properties in. 
+ sources { + browser { + type = browser + } + } + + sinks { + hdfs = { + type = hdfs + } + + kafka = { + type = kafka + } + } + + mappings { + foo = { + sources = [browser] + sinks = [kafka] + schema_file = bar.avsc + } + + bar = { + sources = [browser] + sinks = [hdfs,kafka] + schema_file = bar.avsc + } + } +} diff --git a/src/test/resources/source-sink-collisions.conf b/src/test/resources/source-sink-collisions.conf new file mode 100644 index 00000000..7d93a4d6 --- /dev/null +++ b/src/test/resources/source-sink-collisions.conf @@ -0,0 +1,72 @@ +// +// Copyright 2014 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +include classpath("reference.conf") + +divolte { + global { + server.host = 127.0.0.1 + + mapper { + // For tests we generally want single-threaded processing with a small + // buffer. + buffer_size = 16 + threads = 1 + } + + // By default the flushers are disabled. Instead events are placed on + // a special queue for the tests to collect. + hdfs.enabled = false + kafka.enabled = false + } + + // Explicitly specify the default sinks and sources, so that tests can merge properties in. + sources { + browser { + type = browser + } + + foo { + type = browser + } + + bar = { + type = browser + } + } + + sinks { + hdfs = { + type = hdfs + } + kafka = { + type = kafka + } + foo = { + type = hdfs + } + bar = { + type = hdfs + } + } + + mappings { + test = { + sources = [browser] + sinks = [hdfs,kafka] + } + } +} From 7fa5f73309e95686cc527f40eab37ec4d89c38a6 Mon Sep 17 00:00:00 2001 From: Friso van Vollenhoven Date: Mon, 7 Dec 2015 16:09:45 +0100 Subject: [PATCH 08/80] Reordering of methods. --- .../io/divolte/server/BaseEventHandler.java | 134 ------------------ .../server/config/DivolteConfiguration.java | 80 +++++------ 2 files changed, 40 insertions(+), 174 deletions(-) delete mode 100644 src/main/java/io/divolte/server/BaseEventHandler.java diff --git a/src/main/java/io/divolte/server/BaseEventHandler.java b/src/main/java/io/divolte/server/BaseEventHandler.java deleted file mode 100644 index 835c1449..00000000 --- a/src/main/java/io/divolte/server/BaseEventHandler.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright 2014 GoDataDriven B.V. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.divolte.server; - -import com.google.common.base.Strings; -import com.google.common.io.Resources; -import io.undertow.server.HttpHandler; -import io.undertow.server.HttpServerExchange; -import io.undertow.util.ETag; -import io.undertow.util.ETagUtils; -import io.undertow.util.Headers; -import io.undertow.util.StatusCodes; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.io.IOException; -import java.net.InetSocketAddress; -import java.nio.ByteBuffer; -import java.util.Deque; -import java.util.Objects; -import java.util.Optional; - -@ParametersAreNonnullByDefault -public abstract class BaseEventHandler implements HttpHandler { - private static final Logger logger = LoggerFactory.getLogger(BaseEventHandler.class); - - private final static ETag SENTINEL_ETAG = new ETag(false, "6b3edc43-20ec-4078-bc47-e965dd76b88a"); - private final static String SENTINEL_ETAG_VALUE = SENTINEL_ETAG.toString(); - - private final ByteBuffer transparentImage; - protected final IncomingRequestProcessingPool processingPool; - - public BaseEventHandler(final IncomingRequestProcessingPool processingPool) { - this.processingPool = Objects.requireNonNull(processingPool); - - try { - this.transparentImage = ByteBuffer.wrap( - Resources.toByteArray(Resources.getResource("transparent1x1.gif")) - ).asReadOnlyBuffer(); - } catch (final IOException e) { - // Should throw something more specific than this. - throw new RuntimeException("Could not load transparent image resource.", e); - } - } - - @Override - public void handleRequest(final HttpServerExchange exchange) { - /* - * The source address can be fetched on-demand from the peer connection, which may - * no longer be available after the response has been sent. So we materialize it here - * to ensure it's available further down the chain. - */ - final InetSocketAddress sourceAddress = exchange.getSourceAddress(); - exchange.setSourceAddress(sourceAddress); - - /* - * Set up the headers that we always send as a response, irrespective of what type it - * will be. Note that the client is responsible for ensuring that ensures that each request - * is unique. - * The cache-related headers are intended to prevent spurious reloads for an event. - * (Being a GET request, agents are free to re-issue the request at will. We don't want this.) - * As a last resort, we try to detect duplicates via the ETag header. - */ - exchange.getResponseHeaders() - .put(Headers.CONTENT_TYPE, "image/gif") - .put(Headers.ETAG, SENTINEL_ETAG_VALUE) - .put(Headers.CACHE_CONTROL, "private, no-cache, proxy-revalidate") - .put(Headers.PRAGMA, "no-cache") - .put(Headers.EXPIRES, "Fri, 14 Apr 1995 11:30:00 GMT"); - - // If an ETag is present, this is a duplicate event. - if (ETagUtils.handleIfNoneMatch(exchange, SENTINEL_ETAG, true)) { - /* - * Subclasses are responsible to logging events. - * We just ensure the pixel is always returned, no matter what. - */ - try { - logEvent(exchange); - } finally { - // Default status code what we want: 200 OK. 
- exchange.getResponseSender().send(transparentImage.slice()); - } - } else { - if (logger.isDebugEnabled()) { - logger.debug("Ignoring duplicate event from {}: {}", sourceAddress, getFullUrl(exchange)); - } - exchange.setStatusCode(StatusCodes.NOT_MODIFIED); - exchange.endExchange(); - } - } - - private static String getFullUrl(HttpServerExchange exchange) { - final String queryString = exchange.getQueryString(); - final String requestUrl = exchange.getRequestURL(); - return Strings.isNullOrEmpty(queryString) - ? requestUrl - : requestUrl + '?' + queryString; - } - - static Optional queryParamFromExchange(final HttpServerExchange exchange, final String param) { - return Optional.ofNullable(exchange.getQueryParameters().get(param)).map(Deque::getFirst); - } - - /** - * Log this event. - * - * The subclass is responsible for extracting all information from the request and - * handing it off. The client is still waiting at this point; the subclass should hand - * further processing of as expediently as possible. When it returns (or throws an - * exception) the pixel response will be sent. (The subclass must never complete the - * request.) - * @param exchange the HTTP exchange from which event data can be extracted. - */ - protected abstract void logEvent(final HttpServerExchange exchange); - - protected static class IncompleteRequestException extends Exception { - private static final long serialVersionUID = 1L; - } -} diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index c3baf52c..20347566 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -66,6 +66,42 @@ public final class DivolteConfiguration { // - Elide unreferenced sources and sinks. } + // Defaults + private static ImmutableMap defaultSourceConfigurations() { + return ImmutableMap.of("browser", new BrowserSourceConfiguration(Optional.empty(), + Optional.empty(), + Optional.empty(), + Optional.empty(), + Optional.empty(), + Optional.empty(), + Optional.empty())); + } + + private static ImmutableMap defaultSinkConfigurations() { + return ImmutableMap.of("hdfs", new HdfsSinkConfiguration(Optional.empty(), Optional.empty()), + "kafka", new KafkaSinkConfiguration(Optional.empty())); + } + + private static ImmutableMap defaultMappingConfigurations(final ImmutableSet sourceNames, + final ImmutableSet sinkNames) { + return ImmutableMap.of("default", new MappingConfiguration(Optional.empty(), + Optional.empty(), + sourceNames, + sinkNames, + Optional.empty(), + Optional.empty())); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("global", global) + .add("sources", sources) + .add("sinks", sinks) + .add("mappings", mappings) + .toString(); + } + /* * Validation support methods here. 
* @@ -104,45 +140,9 @@ public Set sinksWithMultipleSchemas() { } return sinkSchemas.entrySet() - .stream() - .filter(e -> e.getValue().size() > 1) - .map(Map.Entry::getKey) - .collect(Collectors.toSet()); - } - - // Defaults - private static ImmutableMap defaultSourceConfigurations() { - return ImmutableMap.of("browser", new BrowserSourceConfiguration(Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty())); - } - - private static ImmutableMap defaultSinkConfigurations() { - return ImmutableMap.of("hdfs", new HdfsSinkConfiguration(Optional.empty(), Optional.empty()), - "kafka", new KafkaSinkConfiguration(Optional.empty())); - } - - private static ImmutableMap defaultMappingConfigurations(final ImmutableSet sourceNames, - final ImmutableSet sinkNames) { - return ImmutableMap.of("default", new MappingConfiguration(Optional.empty(), - Optional.empty(), - sourceNames, - sinkNames, - Optional.empty(), - Optional.empty())); - } - - @Override - public String toString() { - return MoreObjects.toStringHelper(this) - .add("global", global) - .add("sources", sources) - .add("sinks", sinks) - .add("mappings", mappings) - .toString(); + .stream() + .filter(e -> e.getValue().size() > 1) + .map(Map.Entry::getKey) + .collect(Collectors.toSet()); } } From 4ffeff7d17ff7226ff32e77c37f48ebb382c72e1 Mon Sep 17 00:00:00 2001 From: Friso van Vollenhoven Date: Mon, 7 Dec 2015 16:21:55 +0100 Subject: [PATCH 09/80] Flatten ClientSideCookieHandler and super class into one class. --- .../server/ClientSideCookieEventHandler.java | 96 ++++++++++++++++++- 1 file changed, 91 insertions(+), 5 deletions(-) diff --git a/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java b/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java index cb08c67f..7117b5a0 100644 --- a/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java +++ b/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java @@ -18,9 +18,11 @@ import java.io.IOException; import java.net.InetSocketAddress; +import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.Deque; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.SortedMap; import java.util.TreeMap; @@ -33,15 +35,29 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; +import com.google.common.base.Strings; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; +import com.google.common.io.Resources; import io.divolte.server.mincode.MincodeFactory; +import io.undertow.server.HttpHandler; import io.undertow.server.HttpServerExchange; +import io.undertow.util.ETag; +import io.undertow.util.ETagUtils; +import io.undertow.util.Headers; +import io.undertow.util.StatusCodes; @ParametersAreNonnullByDefault -public final class ClientSideCookieEventHandler extends BaseEventHandler { +public final class ClientSideCookieEventHandler implements HttpHandler { private static final Logger logger = LoggerFactory.getLogger(ClientSideCookieEventHandler.class); + + private final static ETag SENTINEL_ETAG = new ETag(false, "6b3edc43-20ec-4078-bc47-e965dd76b88a"); + private final static String SENTINEL_ETAG_VALUE = SENTINEL_ETAG.toString(); + + private final ByteBuffer transparentImage; + protected final IncomingRequestProcessingPool processingPool; + private static final String TRUE_STRING = "t"; private static final String PARTY_ID_QUERY_PARAM = "p"; @@ 
-66,12 +82,20 @@ public final class ClientSideCookieEventHandler extends BaseEventHandler { static final String EVENT_SOURCE_NAME = "browser"; - public ClientSideCookieEventHandler(final IncomingRequestProcessingPool pool) { - super(pool); + public ClientSideCookieEventHandler(final IncomingRequestProcessingPool processingPool) { + this.processingPool = Objects.requireNonNull(processingPool); + + try { + this.transparentImage = ByteBuffer.wrap( + Resources.toByteArray(Resources.getResource("transparent1x1.gif")) + ).asReadOnlyBuffer(); + } catch (final IOException e) { + // Should throw something more specific than this. + throw new RuntimeException("Could not load transparent image resource.", e); + } } - @Override - protected void logEvent(final HttpServerExchange exchange) { + private void logEvent(final HttpServerExchange exchange) { try { handleRequestIfComplete(exchange); } catch (final IncompleteRequestException ire) { @@ -80,6 +104,68 @@ protected void logEvent(final HttpServerExchange exchange) { } } + @Override + public void handleRequest(final HttpServerExchange exchange) { + /* + * The source address can be fetched on-demand from the peer connection, which may + * no longer be available after the response has been sent. So we materialize it here + * to ensure it's available further down the chain. + */ + final InetSocketAddress sourceAddress = exchange.getSourceAddress(); + exchange.setSourceAddress(sourceAddress); + + /* + * Set up the headers that we always send as a response, irrespective of what type it + * will be. Note that the client is responsible for ensuring that each request + * is unique. + * The cache-related headers are intended to prevent spurious reloads for an event. + * (Being a GET request, agents are free to re-issue the request at will. We don't want this.) + * As a last resort, we try to detect duplicates via the ETag header. + */ + exchange.getResponseHeaders() + .put(Headers.CONTENT_TYPE, "image/gif") + .put(Headers.ETAG, SENTINEL_ETAG_VALUE) + .put(Headers.CACHE_CONTROL, "private, no-cache, proxy-revalidate") + .put(Headers.PRAGMA, "no-cache") + .put(Headers.EXPIRES, "Fri, 14 Apr 1995 11:30:00 GMT"); + + // If an ETag is present, this is a duplicate event. + if (ETagUtils.handleIfNoneMatch(exchange, SENTINEL_ETAG, true)) { + /* + * Logging the event is delegated to logEvent() below. + * We just ensure the pixel is always returned, no matter what. + */ + try { + logEvent(exchange); + } finally { + // Default status code is what we want: 200 OK. + exchange.getResponseSender().send(transparentImage.slice()); + } + } else { + if (logger.isDebugEnabled()) { + logger.debug("Ignoring duplicate event from {}: {}", sourceAddress, getFullUrl(exchange)); + } + exchange.setStatusCode(StatusCodes.NOT_MODIFIED); + exchange.endExchange(); + } + } + + private static String getFullUrl(final HttpServerExchange exchange) { + final String queryString = exchange.getQueryString(); + final String requestUrl = exchange.getRequestURL(); + return Strings.isNullOrEmpty(queryString) + ? requestUrl + : requestUrl + '?'
+ queryString; + } + + static Optional queryParamFromExchange(final HttpServerExchange exchange, final String param) { + return Optional.ofNullable(exchange.getQueryParameters().get(param)).map(Deque::getFirst); + } + + public static class IncompleteRequestException extends Exception { + private static final long serialVersionUID = 1L; + } + private void handleRequestIfComplete(final HttpServerExchange exchange) throws IncompleteRequestException { final boolean corrupt = !isRequestChecksumCorrect(exchange); final DivolteIdentifier partyId = queryParamFromExchange(exchange, PARTY_ID_QUERY_PARAM).flatMap(DivolteIdentifier::tryParse).orElseThrow(IncompleteRequestException::new); From aaedd8511544b6fbadf33da9cab9643f8f9be2bf Mon Sep 17 00:00:00 2001 From: Friso van Vollenhoven Date: Tue, 22 Dec 2015 11:35:50 +0100 Subject: [PATCH 10/80] Remove Optional<...> from *Configuration constructors where actual property is not optional. This is to make the constructor args match the reality of the config params, such that we can later use JavaDoc for the documentation. Default values are set using the defaultValue annotation, which is however not interpreted by the deserializer. We will later fix this using a custom deserializer in Jackson. --- .../config/BrowserSourceConfiguration.java | 51 +++++++++++-------- .../server/config/DivolteConfiguration.java | 18 +++---- .../server/config/DurationDeserializer.java | 31 ++++++++--- .../config/DurationFormatException.java | 9 ++++ .../config/FileStrategyConfiguration.java | 43 +++++++++------- .../server/config/HdfsSinkConfiguration.java | 18 +++++-- .../config/JavascriptConfiguration.java | 39 +++++++------- .../server/config/KafkaSinkConfiguration.java | 11 ++-- .../server/config/MappingConfiguration.java | 13 ++--- .../server/config/ValidatedConfiguration.java | 2 +- .../hdfs/SimpleRollingFileStrategy.java | 41 +++++++-------- 11 files changed, 164 insertions(+), 112 deletions(-) create mode 100644 src/main/java/io/divolte/server/config/DurationFormatException.java diff --git a/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java b/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java index 1d83154f..60a03747 100644 --- a/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java +++ b/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java @@ -1,21 +1,31 @@ package io.divolte.server.config; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.google.common.base.MoreObjects; +import java.time.Duration; +import java.util.Optional; import javax.annotation.ParametersAreNonnullByDefault; import javax.validation.Valid; -import java.time.Duration; -import java.util.Objects; -import java.util.Optional; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.MoreObjects; @ParametersAreNonnullByDefault public class BrowserSourceConfiguration extends SourceConfiguration { private static final String DEFAULT_PREFIX = "/"; private static final String DEFAULT_PARTY_COOKIE = "_dvp"; - private static final Duration DEFAULT_PARTY_TIMEOUT = Duration.ofDays(730); + private static final String DEFAULT_PARTY_TIMEOUT = "730 days"; private static final String DEFAULT_SESSION_COOKIE = "_dvs"; - private static final Duration DEFAULT_SESSION_TIMEOUT = Duration.ofMinutes(30); + private static final String DEFAULT_SESSION_TIMEOUT = "30 minutes"; + + public static final BrowserSourceConfiguration 
DEFAULT_BROWSER_SOURCE_CONFIGURATION = new BrowserSourceConfiguration( + DEFAULT_PREFIX, + Optional.empty(), + DEFAULT_PARTY_COOKIE, + DurationDeserializer.parseDuration(DEFAULT_PARTY_TIMEOUT), + DEFAULT_SESSION_COOKIE, + DurationDeserializer.parseDuration(DEFAULT_SESSION_TIMEOUT), + JavascriptConfiguration.DEFAULT_JAVASCRIPT_CONFIGURATION); public final String prefix; @@ -29,20 +39,21 @@ public class BrowserSourceConfiguration extends SourceConfiguration { public final JavascriptConfiguration javascript; @JsonCreator - BrowserSourceConfiguration(final Optional prefix, + BrowserSourceConfiguration(@JsonProperty(defaultValue=DEFAULT_PREFIX) final String prefix, final Optional cookieDomain, - final Optional partyCookie, - final Optional partyTimeout, - final Optional sessionCookie, - final Optional sessionTimeout, - final Optional javascript) { - this.prefix = prefix.orElse(DEFAULT_PREFIX); - this.cookieDomain = Objects.requireNonNull(cookieDomain); - this.partyCookie = partyCookie.orElse(DEFAULT_PARTY_COOKIE); - this.partyTimeout = partyTimeout.orElse(DEFAULT_PARTY_TIMEOUT); - this.sessionCookie = sessionCookie.orElse(DEFAULT_SESSION_COOKIE); - this.sessionTimeout = sessionTimeout.orElse(DEFAULT_SESSION_TIMEOUT); - this.javascript = javascript.orElse(JavascriptConfiguration.DEFAULT_JAVASCRIPT_CONFIGURATION); + @JsonProperty(defaultValue=DEFAULT_PARTY_COOKIE) final String partyCookie, + @JsonProperty(defaultValue=DEFAULT_PARTY_TIMEOUT) final Duration partyTimeout, + @JsonProperty(defaultValue=DEFAULT_SESSION_COOKIE) final String sessionCookie, + @JsonProperty(defaultValue=DEFAULT_SESSION_TIMEOUT) final Duration sessionTimeout, + final JavascriptConfiguration javascript) { + // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this + this.prefix = prefix == null ? DEFAULT_PREFIX : prefix; + this.cookieDomain = cookieDomain; + this.partyCookie = partyCookie == null ? DEFAULT_PARTY_COOKIE : partyCookie; + this.partyTimeout = partyTimeout == null ? DurationDeserializer.parseDuration(DEFAULT_PARTY_TIMEOUT) : partyTimeout; + this.sessionCookie = sessionCookie == null ? DEFAULT_SESSION_COOKIE : sessionCookie; + this.sessionTimeout = sessionTimeout == null ? DurationDeserializer.parseDuration(DEFAULT_SESSION_TIMEOUT) : sessionTimeout; + this.javascript = Optional.ofNullable(javascript).orElse(JavascriptConfiguration.DEFAULT_JAVASCRIPT_CONFIGURATION); } @Override diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index 20347566..b5808ead 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -66,20 +66,14 @@ public final class DivolteConfiguration { // - Elide unreferenced sources and sinks. 
} - // Defaults + // Defaults; these will eventually disappear private static ImmutableMap defaultSourceConfigurations() { - return ImmutableMap.of("browser", new BrowserSourceConfiguration(Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty(), - Optional.empty())); + return ImmutableMap.of("browser", BrowserSourceConfiguration.DEFAULT_BROWSER_SOURCE_CONFIGURATION); } private static ImmutableMap defaultSinkConfigurations() { - return ImmutableMap.of("hdfs", new HdfsSinkConfiguration(Optional.empty(), Optional.empty()), - "kafka", new KafkaSinkConfiguration(Optional.empty())); + return ImmutableMap.of("hdfs", new HdfsSinkConfiguration((short) 1, FileStrategyConfiguration.DEFAULT_FILE_STRATEGY_CONFIGURATION), + "kafka", new KafkaSinkConfiguration(null)); } private static ImmutableMap defaultMappingConfigurations(final ImmutableSet sourceNames, @@ -88,8 +82,8 @@ private static ImmutableMap defaultMappingConfigura Optional.empty(), sourceNames, sinkNames, - Optional.empty(), - Optional.empty())); + false, + false)); } @Override diff --git a/src/main/java/io/divolte/server/config/DurationDeserializer.java b/src/main/java/io/divolte/server/config/DurationDeserializer.java index f0263066..e2b7f8b2 100644 --- a/src/main/java/io/divolte/server/config/DurationDeserializer.java +++ b/src/main/java/io/divolte/server/config/DurationDeserializer.java @@ -6,14 +6,14 @@ import java.time.Duration; import java.util.concurrent.TimeUnit; +import javax.annotation.ParametersAreNonnullByDefault; + import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.deser.std.StdScalarDeserializer; import com.typesafe.config.impl.ConfigImplUtil; -import javax.annotation.ParametersAreNonnullByDefault; - @ParametersAreNonnullByDefault public class DurationDeserializer extends StdScalarDeserializer { private static final long serialVersionUID = 1L; @@ -28,11 +28,23 @@ public Duration deserialize(final JsonParser p, if (VALUE_STRING != p.getCurrentToken()) { throw ctx.mappingException("Expected string value for Duration mapping."); } - return Duration.ofNanos(parseDuration(p.getText(), ctx)); + return Duration.ofNanos(parse(p.getText(), ctx)); + } + + private static long parse(final String input, final DeserializationContext context) throws JsonMappingException { + try { + return parse(input); + } catch(final DurationFormatException de) { + throw new JsonMappingException(de.getMessage(), context.getParser().getCurrentLocation(), de); + } + } + + public static Duration parseDuration(final String input) { + return Duration.ofNanos(parse(input)); } // Inspired by Typesafe Config parseDuration(...) - private static long parseDuration(final String input, final DeserializationContext context) throws JsonMappingException { + private static long parse(final String input) { final String s = ConfigImplUtil.unicodeTrim(input); final String originalUnitString = getUnits(s); String unitString = originalUnitString; @@ -41,7 +53,8 @@ private static long parseDuration(final String input, final DeserializationConte // this would be caught later anyway, but the error message // is more helpful if we check it here. 
if (numberString.isEmpty()) { - throw context.mappingException(String.format("No number in duration value '%s'", input)); + final String msg = String.format("No number in duration value '%s'", input); + throw new DurationFormatException(msg); } // All units longer than 2 characters are accepted in singular or plural form. @@ -86,7 +99,8 @@ private static long parseDuration(final String input, final DeserializationConte units = TimeUnit.MINUTES; break; default: - throw context.mappingException(String.format("Could not parse time unit '%s' (try ns, us, ms, s, m, h, d)", originalUnitString)); + final String msg = String.format("Could not parse time unit '%s' (try ns, us, ms, s, m, h, d)", originalUnitString); + throw new DurationFormatException(msg); } try { @@ -96,14 +110,15 @@ private static long parseDuration(final String input, final DeserializationConte ? units.toNanos(Long.parseLong(numberString)) : (long) (Double.parseDouble(numberString) * units.toNanos(1)); } catch (final NumberFormatException e) { - throw context.mappingException(String.format("Could not parse duration number '%s'", numberString)); + final String msg = String.format("Could not parse duration number '%s'", numberString); + throw new DurationFormatException(msg); } } private static String getUnits(final String s) { int i = s.length() - 1; while (i >= 0) { - char c = s.charAt(i); + final char c = s.charAt(i); if (!Character.isLetter(c)) { break; } diff --git a/src/main/java/io/divolte/server/config/DurationFormatException.java b/src/main/java/io/divolte/server/config/DurationFormatException.java new file mode 100644 index 00000000..2c8a6770 --- /dev/null +++ b/src/main/java/io/divolte/server/config/DurationFormatException.java @@ -0,0 +1,9 @@ +package io.divolte.server.config; + +public class DurationFormatException extends RuntimeException { + private static final long serialVersionUID = 8475209646046838380L; + + public DurationFormatException(final String message) { + super(message); + } +} diff --git a/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java b/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java index 361284f8..df1da0c3 100644 --- a/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java +++ b/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java @@ -1,22 +1,28 @@ package io.divolte.server.config; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.google.common.base.MoreObjects; +import java.time.Duration; import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; -import java.util.Optional; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.MoreObjects; @ParametersAreNonnullByDefault public class FileStrategyConfiguration { - private static final int DEFAULT_SYNC_FILE_AFTER_RECORDS = 1000; - private static final Duration DEFAULT_SYNC_FILE_AFTER_DURATION = Duration.ofSeconds(30); + private static final String DEFAULT_SYNC_FILE_AFTER_RECORDS = "1000"; + private static final String DEFAULT_SYNC_FILE_AFTER_DURATION = "30 seconds"; private static final String DEFAULT_WORKING_DIR = "/tmp"; private static final String DEFAULT_PUBLISH_DIR = "/tmp"; - private static final Duration DEFAULT_ROLL_EVERY = Duration.ofHours(1); + private static final String DEFAULT_ROLL_EVERY = "1 hour"; static final FileStrategyConfiguration DEFAULT_FILE_STRATEGY_CONFIGURATION = - new FileStrategyConfiguration(Optional.empty(), 
Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty()); + new FileStrategyConfiguration( + DurationDeserializer.parseDuration(DEFAULT_ROLL_EVERY), + Integer.parseInt(DEFAULT_SYNC_FILE_AFTER_RECORDS), + DurationDeserializer.parseDuration(DEFAULT_SYNC_FILE_AFTER_DURATION), + DEFAULT_WORKING_DIR, + DEFAULT_PUBLISH_DIR); public final int syncFileAfterRecords; public final Duration syncFileAfterDuration; @@ -25,16 +31,17 @@ public class FileStrategyConfiguration { public final Duration rollEvery; @JsonCreator - FileStrategyConfiguration(final Optional rollEvery, - final Optional syncFileAfterRecords, - final Optional syncFileAfterDuration, - final Optional workingDir, - final Optional publishDir) { - this.rollEvery = rollEvery.orElse(DEFAULT_ROLL_EVERY); - this.syncFileAfterRecords = syncFileAfterRecords.orElse(DEFAULT_SYNC_FILE_AFTER_RECORDS); - this.syncFileAfterDuration = syncFileAfterDuration.orElse(DEFAULT_SYNC_FILE_AFTER_DURATION); - this.workingDir = workingDir.orElse(DEFAULT_WORKING_DIR); - this.publishDir = publishDir.orElse(DEFAULT_PUBLISH_DIR); + FileStrategyConfiguration(@JsonProperty(defaultValue=DEFAULT_ROLL_EVERY) final Duration rollEvery, + @JsonProperty(defaultValue=DEFAULT_SYNC_FILE_AFTER_RECORDS) final Integer syncFileAfterRecords, + @JsonProperty(defaultValue=DEFAULT_SYNC_FILE_AFTER_DURATION) final Duration syncFileAfterDuration, + @JsonProperty(defaultValue=DEFAULT_WORKING_DIR) final String workingDir, + @JsonProperty(defaultValue=DEFAULT_PUBLISH_DIR) final String publishDir) { + // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this + this.rollEvery = rollEvery == null ? DurationDeserializer.parseDuration(DEFAULT_ROLL_EVERY) : rollEvery; + this.syncFileAfterRecords = syncFileAfterRecords == null ? Integer.valueOf(DEFAULT_SYNC_FILE_AFTER_RECORDS) : syncFileAfterRecords; + this.syncFileAfterDuration = syncFileAfterDuration == null ? DurationDeserializer.parseDuration(DEFAULT_SYNC_FILE_AFTER_DURATION) : syncFileAfterDuration; + this.workingDir = workingDir == null ? DEFAULT_WORKING_DIR : workingDir; + this.publishDir = publishDir == null ? 
DEFAULT_PUBLISH_DIR : publishDir; } @Override diff --git a/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java b/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java index 4e4112b2..ef6097b5 100644 --- a/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java +++ b/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java @@ -1,6 +1,7 @@ package io.divolte.server.config; import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.MoreObjects; import javax.annotation.ParametersAreNonnullByDefault; @@ -8,16 +9,23 @@ @ParametersAreNonnullByDefault public class HdfsSinkConfiguration extends SinkConfiguration { - private static final short DEFAULT_REPLICATION = 3; + private static final String DEFAULT_REPLICATION = "3"; public final short replication; public final FileStrategyConfiguration fileStrategy; @JsonCreator - HdfsSinkConfiguration(final Optional replication, - final Optional fileStrategy) { - this.replication = replication.orElse(DEFAULT_REPLICATION); - this.fileStrategy = fileStrategy.orElse(FileStrategyConfiguration.DEFAULT_FILE_STRATEGY_CONFIGURATION); + HdfsSinkConfiguration(@JsonProperty(defaultValue=DEFAULT_REPLICATION) final Short replication, + final FileStrategyConfiguration fileStrategy) { + // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this + this.replication = replication == null ? Short.valueOf(DEFAULT_REPLICATION) : replication; + /* + * Passing a null defaults to the default strategy. Reason for not making the parameter Optional<...> is + * that this way, we can at some point use a tool to automatically document the configuration objects + * including types. This type of defaults could then be documented through the parameter specific JavaDoc + * for that param. 
+ */ + this.fileStrategy = Optional.ofNullable(fileStrategy).orElse(FileStrategyConfiguration.DEFAULT_FILE_STRATEGY_CONFIGURATION); } @Override diff --git a/src/main/java/io/divolte/server/config/JavascriptConfiguration.java b/src/main/java/io/divolte/server/config/JavascriptConfiguration.java index 45083074..6353f2e1 100644 --- a/src/main/java/io/divolte/server/config/JavascriptConfiguration.java +++ b/src/main/java/io/divolte/server/config/JavascriptConfiguration.java @@ -1,23 +1,27 @@ package io.divolte.server.config; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.google.common.base.MoreObjects; -import org.hibernate.validator.constraints.NotEmpty; - import javax.annotation.ParametersAreNonnullByDefault; import javax.validation.constraints.NotNull; import javax.validation.constraints.Pattern; -import java.util.Optional; + +import org.hibernate.validator.constraints.NotEmpty; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.MoreObjects; @ParametersAreNonnullByDefault public final class JavascriptConfiguration { private static final String DEFAULT_NAME = "divolte.js"; - private static final boolean DEFAULT_LOGGING = false; - private static final boolean DEFAULT_DEBUG = false; - private static final boolean DEFAULT_AUTO_PAGE_VIEW_EVENT = true; + private static final String DEFAULT_LOGGING = "false"; + private static final String DEFAULT_DEBUG = "false"; + private static final String DEFAULT_AUTO_PAGE_VIEW_EVENT = "true"; static final JavascriptConfiguration DEFAULT_JAVASCRIPT_CONFIGURATION = - new JavascriptConfiguration(Optional.empty(), Optional.empty(), Optional.empty(), Optional.empty()); + new JavascriptConfiguration(DEFAULT_NAME, + Boolean.parseBoolean(DEFAULT_LOGGING), + Boolean.parseBoolean(DEFAULT_DEBUG), + Boolean.parseBoolean(DEFAULT_AUTO_PAGE_VIEW_EVENT)); @NotNull @NotEmpty @Pattern(regexp="^[A-Za-z0-9_-]+\\.js$") public final String name; @@ -27,14 +31,15 @@ public final class JavascriptConfiguration { public final boolean autoPageViewEvent; @JsonCreator - JavascriptConfiguration(final Optional name, - final Optional logging, - final Optional debug, - final Optional autoPageViewEvent) { - this.name = name.orElse(DEFAULT_NAME); - this.logging = logging.orElse(DEFAULT_LOGGING); - this.debug = debug.orElse(DEFAULT_DEBUG); - this.autoPageViewEvent = autoPageViewEvent.orElse(DEFAULT_AUTO_PAGE_VIEW_EVENT); + JavascriptConfiguration(@JsonProperty(defaultValue=DEFAULT_NAME) final String name, + @JsonProperty(defaultValue=DEFAULT_LOGGING) final Boolean logging, + @JsonProperty(defaultValue=DEFAULT_DEBUG) final Boolean debug, + @JsonProperty(defaultValue=DEFAULT_AUTO_PAGE_VIEW_EVENT) final Boolean autoPageViewEvent) { + // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this + this.name = name == null ? DEFAULT_NAME : name; + this.logging = logging == null ? Boolean.valueOf(DEFAULT_LOGGING) : logging; + this.debug = debug == null ? Boolean.valueOf(DEFAULT_DEBUG) : debug; + this.autoPageViewEvent = autoPageViewEvent == null ? 
Boolean.valueOf(DEFAULT_AUTO_PAGE_VIEW_EVENT) : autoPageViewEvent; } @Override diff --git a/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java b/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java index 8b2a481e..13c83f4e 100644 --- a/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java +++ b/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java @@ -1,11 +1,11 @@ package io.divolte.server.config; +import javax.annotation.ParametersAreNonnullByDefault; + import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.MoreObjects; -import javax.annotation.ParametersAreNonnullByDefault; -import java.util.Optional; - @ParametersAreNonnullByDefault public class KafkaSinkConfiguration extends SinkConfiguration { private static final String DEFAULT_TOPIC = "divolte"; @@ -13,8 +13,9 @@ public class KafkaSinkConfiguration extends SinkConfiguration { public final String topic; @JsonCreator - KafkaSinkConfiguration(final Optional topic) { - this.topic = topic.orElse(DEFAULT_TOPIC); + KafkaSinkConfiguration(@JsonProperty(defaultValue=DEFAULT_TOPIC) final String topic) { + // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this + this.topic = topic == null ? DEFAULT_TOPIC : topic; } @Override diff --git a/src/main/java/io/divolte/server/config/MappingConfiguration.java b/src/main/java/io/divolte/server/config/MappingConfiguration.java index c0483ae8..c5656acd 100644 --- a/src/main/java/io/divolte/server/config/MappingConfiguration.java +++ b/src/main/java/io/divolte/server/config/MappingConfiguration.java @@ -11,8 +11,8 @@ @ParametersAreNonnullByDefault public class MappingConfiguration { - private static final boolean DEFAULT_DISCARD_CORRUPTED = false; - private static final boolean DEFAULT_DISCARD_DUPLICATES = false; + private static final String DEFAULT_DISCARD_CORRUPTED = "false"; + private static final String DEFAULT_DISCARD_DUPLICATES = "false"; public final Optional schemaFile; public final Optional mappingScriptFile; @@ -30,14 +30,15 @@ public class MappingConfiguration { final ImmutableSet sources, @JsonProperty(required = true) final ImmutableSet sinks, - final Optional discardCorrupted, - final Optional discardDuplicates) { + @JsonProperty(defaultValue=DEFAULT_DISCARD_CORRUPTED) final Boolean discardCorrupted, + @JsonProperty(defaultValue=DEFAULT_DISCARD_DUPLICATES) final Boolean discardDuplicates) { this.schemaFile = Objects.requireNonNull(schemaFile); this.mappingScriptFile = Objects.requireNonNull(mappingScriptFile); this.sources = Objects.requireNonNull(sources); this.sinks = Objects.requireNonNull(sinks); - this.discardCorrupted = discardCorrupted.orElse(DEFAULT_DISCARD_CORRUPTED); - this.discardDuplicates = discardDuplicates.orElse(DEFAULT_DISCARD_DUPLICATES); + // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this + this.discardCorrupted = discardCorrupted == null ? Boolean.valueOf(DEFAULT_DISCARD_CORRUPTED) : discardCorrupted; + this.discardDuplicates = discardDuplicates == null ? 
Boolean.valueOf(DEFAULT_DISCARD_DUPLICATES) : discardDuplicates; } @Override diff --git a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java index eff1087f..900dbbc1 100644 --- a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java +++ b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java @@ -93,7 +93,7 @@ public ValidatedConfiguration(final Supplier configLoader) { * errors to the resulting list of error messages. */ final Config config = configLoader.get(); - divolteConfiguration = mapped(config.getConfig("divolte")); + divolteConfiguration = mapped(config.getConfig("divolte").resolve()); configurationErrors.addAll(validate(divolteConfiguration)); } catch(final ConfigException e) { logger.debug("Configuration error caught during validation.", e); diff --git a/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java b/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java index 21aab583..42da1a6b 100644 --- a/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java +++ b/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java @@ -16,22 +16,8 @@ package io.divolte.server.hdfs; -import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.config.FileStrategyConfiguration; -import io.divolte.server.config.ValidatedConfiguration; -import org.apache.avro.Schema; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.*; -import javax.annotation.Nonnull; -import javax.annotation.ParametersAreNonnullByDefault; -import javax.annotation.concurrent.NotThreadSafe; import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; @@ -42,8 +28,22 @@ import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; -import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.FAILURE; -import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.SUCCESS; +import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.concurrent.NotThreadSafe; + +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import io.divolte.server.AvroRecordBuffer; +import io.divolte.server.config.FileStrategyConfiguration; +import io.divolte.server.config.ValidatedConfiguration; @NotThreadSafe @ParametersAreNonnullByDefault @@ -109,7 +109,7 @@ private Path newFilePath() { private static String findLocalHostName() { try { return InetAddress.getLocalHost().getHostName(); - } catch (UnknownHostException e) { + } catch (final UnknownHostException e) { return "localhost"; } } @@ -203,7 +203,7 @@ private void possiblyRollFile(final long time) throws IOException { final Path newFilePath = newFilePath(); try { currentFile = openNewFile(newFilePath); - } catch (IOException e) { + } catch (final IOException e) { 
throwsIoException(() -> hdfs.delete(newFilePath, false)); throw e; } @@ -250,7 +250,7 @@ private final class HadoopFile implements AutoCloseable { long totalRecords; @SuppressWarnings("resource") - public HadoopFile(Path path) throws IOException { + public HadoopFile(final Path path) throws IOException { this.path = path; this.stream = hdfs.create(path, hdfsReplication); @@ -275,6 +275,7 @@ private Path getPublishDestination() { return new Path(hdfsPublishDir, pathName.substring(0, pathName.length() - INFLIGHT_EXTENSION.length())); } + @Override public void close() throws IOException { totalRecords += recordsSinceLastSync; writer.close(); From 15d057b47d85cf281863c4c22c2b8fb4171b4d76 Mon Sep 17 00:00:00 2001 From: Friso van Vollenhoven Date: Tue, 22 Dec 2015 15:22:28 +0100 Subject: [PATCH 11/80] Store source ID and affinity in items on the processing queues. --- .../server/ClientSideCookieEventHandler.java | 3 +- .../server/IncomingRequestProcessingPool.java | 4 -- .../server/IncomingRequestProcessor.java | 12 ++-- .../io/divolte/server/hdfs/HdfsFlusher.java | 4 +- .../divolte/server/hdfs/HdfsFlushingPool.java | 4 -- .../io/divolte/server/kafka/KafkaFlusher.java | 7 ++- .../server/kafka/KafkaFlushingPool.java | 6 +- .../io/divolte/server/processing/Item.java | 36 +++++++++++ .../server/processing/ItemProcessor.java | 8 +-- .../server/processing/ProcessingPool.java | 25 ++++---- .../divolte/server/hdfs/HdfsFlusherTest.java | 60 ++++++++++--------- 11 files changed, 103 insertions(+), 66 deletions(-) create mode 100644 src/main/java/io/divolte/server/processing/Item.java diff --git a/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java b/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java index 7117b5a0..aeb453db 100644 --- a/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java +++ b/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java @@ -41,6 +41,7 @@ import com.google.common.io.Resources; import io.divolte.server.mincode.MincodeFactory; +import io.divolte.server.processing.Item; import io.undertow.server.HttpHandler; import io.undertow.server.HttpServerExchange; import io.undertow.util.ETag; @@ -182,7 +183,7 @@ private void handleRequestIfComplete(final HttpServerExchange exchange) throws I isNewPartyId, isFirstInSession, exchange); logger.debug("Enqueuing event (client generated cookies): {}/{}/{}/{}", partyId, sessionId, pageViewId, eventId); - processingPool.enqueueIncomingExchangeForProcessing(partyId, event); + processingPool.enqueue(Item.of(0, partyId.value, event)); } static DivolteEvent buildBrowserEventData(final boolean corruptEvent, diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java index f7716208..dfcd4280 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java @@ -108,10 +108,6 @@ private static LookupService lookupServiceFromConfig(final ValidatedConfiguratio .orElse(null); } - public void enqueueIncomingExchangeForProcessing(final DivolteIdentifier partyId, final DivolteEvent event) { - enqueue(partyId.value, event); - } - @Override public void stop() { super.stop(); diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessor.java b/src/main/java/io/divolte/server/IncomingRequestProcessor.java index 6525dd3a..ab96a98e 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessor.java +++ 
b/src/main/java/io/divolte/server/IncomingRequestProcessor.java @@ -36,6 +36,7 @@ import io.divolte.server.ip2geo.LookupService; import io.divolte.server.kafka.KafkaFlusher; import io.divolte.server.kafka.KafkaFlushingPool; +import io.divolte.server.processing.Item; import io.divolte.server.processing.ItemProcessor; import io.divolte.server.processing.ProcessingPool; import io.divolte.server.recordmapping.DslRecordMapper; @@ -124,7 +125,8 @@ private DslRecordMapping defaultRecordMapping(final ValidatedConfiguration vc) { } @Override - public ProcessingDirective process(final DivolteEvent event) { + public ProcessingDirective process(final Item item) { + final DivolteEvent event = item.payload; if (!event.corruptEvent || keepCorrupted) { /* * Note: we cannot use the actual query string here, @@ -145,20 +147,20 @@ public ProcessingDirective process(final DivolteEvent event) { event.clientUtcOffset, avroRecord); listener.incomingRequest(event, avroBuffer, avroRecord); - doProcess(avroBuffer); + doProcess(item, avroBuffer); } } return CONTINUE; } - private void doProcess(final AvroRecordBuffer avroBuffer) { + private void doProcess(final Item sourceItem, final AvroRecordBuffer avroBuffer) { if (null != kafkaFlushingPool) { - kafkaFlushingPool.enqueue(avroBuffer.getPartyId().value, avroBuffer); + kafkaFlushingPool.enqueue(Item.withCopiedAffinity(0, sourceItem, avroBuffer)); } if (null != hdfsFlushingPool) { - hdfsFlushingPool.enqueue(avroBuffer.getPartyId().value, avroBuffer); + hdfsFlushingPool.enqueue(Item.withCopiedAffinity(0, sourceItem, avroBuffer)); } } } diff --git a/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java b/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java index 5a9bb3ac..2d065d36 100644 --- a/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java +++ b/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java @@ -34,6 +34,7 @@ import io.divolte.server.AvroRecordBuffer; import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult; +import io.divolte.server.processing.Item; import io.divolte.server.processing.ItemProcessor; @ParametersAreNonnullByDefault @@ -88,7 +89,8 @@ public void cleanup() { } @Override - public ProcessingDirective process(final AvroRecordBuffer record) { + public ProcessingDirective process(final Item item) { + final AvroRecordBuffer record = item.payload; if (lastHdfsResult == SUCCESS) { return (lastHdfsResult = fileStrategy.append(record)) == SUCCESS ? 
CONTINUE : PAUSE; } else { diff --git a/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java b/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java index 8815b772..238322f4 100644 --- a/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java +++ b/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java @@ -44,8 +44,4 @@ public HdfsFlushingPool(final ValidatedConfiguration vc, final Schema schema, fi "Hdfs Flusher", () -> new HdfsFlusher(vc, schema)); } - - public void enqueueRecordsForFlushing(final AvroRecordBuffer record) { - enqueue(record.getPartyId().value, record); - } } diff --git a/src/main/java/io/divolte/server/kafka/KafkaFlusher.java b/src/main/java/io/divolte/server/kafka/KafkaFlusher.java index ef153751..865954d8 100644 --- a/src/main/java/io/divolte/server/kafka/KafkaFlusher.java +++ b/src/main/java/io/divolte/server/kafka/KafkaFlusher.java @@ -35,6 +35,7 @@ import io.divolte.server.AvroRecordBuffer; import io.divolte.server.config.ValidatedConfiguration; +import io.divolte.server.processing.Item; import io.divolte.server.processing.ItemProcessor; import kafka.common.FailedToSendMessageException; import kafka.javaapi.producer.Producer; @@ -61,7 +62,8 @@ public KafkaFlusher(final ValidatedConfiguration vc) { } @Override - public ProcessingDirective process(final AvroRecordBuffer record) { + public ProcessingDirective process(final Item item) { + final AvroRecordBuffer record = item.payload; logger.debug("Processing individual record.", record); return send(() -> { producer.send(buildMessage(record)); @@ -70,7 +72,7 @@ public ProcessingDirective process(final AvroRecordBuffer record) { } @Override - public ProcessingDirective process(final Queue batch) { + public ProcessingDirective process(final Queue> batch) { final int batchSize = batch.size(); final ProcessingDirective result; switch (batchSize) { @@ -85,6 +87,7 @@ public ProcessingDirective process(final Queue batch) { logger.debug("Processing batch of {} records.", batchSize); final List> kafkaMessages = batch.stream() + .map(i -> i.payload) .map(this::buildMessage) .collect(Collectors.toCollection(() -> new ArrayList<>(batchSize))); // Clear the messages now; on failure they'll be retried as part of our diff --git a/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java b/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java index 918a01b9..16849b5a 100644 --- a/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java +++ b/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java @@ -33,11 +33,7 @@ public KafkaFlushingPool(final ValidatedConfiguration vc) { vc.configuration().global.kafka.bufferSize); } - public KafkaFlushingPool(ValidatedConfiguration vc, int numThreads, int maxWriteQueue) { + public KafkaFlushingPool(final ValidatedConfiguration vc, final int numThreads, final int maxWriteQueue) { super(numThreads, maxWriteQueue, "Kafka Flusher", () -> new KafkaFlusher(vc)); } - - public void enqueueRecord(final AvroRecordBuffer record) { - enqueue(record.getPartyId().value, record); - } } diff --git a/src/main/java/io/divolte/server/processing/Item.java b/src/main/java/io/divolte/server/processing/Item.java new file mode 100644 index 00000000..33017d87 --- /dev/null +++ b/src/main/java/io/divolte/server/processing/Item.java @@ -0,0 +1,36 @@ +package io.divolte.server.processing; + +import java.nio.charset.StandardCharsets; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; + +public final class Item { + public final int sourceId; + public final 
int affinityHash; + public final E payload; + + private static final HashFunction hasher = Hashing.murmur3_32(42); + + + private Item(final int sourceId, final int affinityHash, final E payload) { + this.sourceId = sourceId; + this.affinityHash = affinityHash; + this.payload = payload; + } + + public static Item of(final int sourceId, final String key, final E payload) { + return new Item<>( + sourceId, + // making sure the hash result is non-negative by masking with max int + hasher.hashString(key, StandardCharsets.UTF_8).asInt() & Integer.MAX_VALUE, + payload); + } + + public static Item withCopiedAffinity(final int sourceId, final Item affinitySource,final E payload) { + return new Item<>( + sourceId, + affinitySource.affinityHash, + payload); + } +} diff --git a/src/main/java/io/divolte/server/processing/ItemProcessor.java b/src/main/java/io/divolte/server/processing/ItemProcessor.java index 17cc1dde..97f3c314 100644 --- a/src/main/java/io/divolte/server/processing/ItemProcessor.java +++ b/src/main/java/io/divolte/server/processing/ItemProcessor.java @@ -16,14 +16,14 @@ package io.divolte.server.processing; -import java.util.Queue; - import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; +import java.util.Queue; + public interface ItemProcessor { - ProcessingDirective process(E e); + ProcessingDirective process(Item e); - default ProcessingDirective process(final Queue batch) { + default ProcessingDirective process(final Queue> batch) { ProcessingDirective directive; do { // Note: processing should not throw an unchecked diff --git a/src/main/java/io/divolte/server/processing/ProcessingPool.java b/src/main/java/io/divolte/server/processing/ProcessingPool.java index b4fc512b..7032b42f 100644 --- a/src/main/java/io/divolte/server/processing/ProcessingPool.java +++ b/src/main/java/io/divolte/server/processing/ProcessingPool.java @@ -17,7 +17,6 @@ package io.divolte.server.processing; import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; -import io.divolte.server.processing.ItemProcessor.ProcessingDirective; import java.util.ArrayDeque; import java.util.ArrayList; @@ -42,6 +41,8 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder; +import io.divolte.server.processing.ItemProcessor.ProcessingDirective; + @ParametersAreNonnullByDefault public class ProcessingPool, E> { private static final Logger logger = LoggerFactory.getLogger(ProcessingPool.class); @@ -49,7 +50,7 @@ public class ProcessingPool, E> { private static final int MAX_BATCH_SIZE = 128; private final ExecutorService executorService; - private final List> queues; + private final List>> queues; private volatile boolean running; @@ -71,7 +72,7 @@ public ProcessingPool( final ThreadFactory factory = createThreadFactory(threadGroup, threadBaseName + " - %d"); executorService = Executors.newFixedThreadPool(numThreads, factory); - this.queues = Stream.> + this.queues = Stream.>> generate(() -> new ArrayBlockingQueue<>(maxQueueSize)) .limit(numThreads) .collect(Collectors.toCollection(() -> new ArrayList<>(numThreads))); @@ -83,9 +84,9 @@ public ProcessingPool( } - public void enqueue(String key, E e) { - // We mask the hash-code to ensure we always get a positive bucket index. - if (!queues.get((key.hashCode() & Integer.MAX_VALUE) % queues.size()).offer(e)) { + public void enqueue(final Item item) { + final BlockingQueue> queue = queues.get(item.affinityHash % queues.size()); + if (!queue.offer(item)) { logger.warn("Failed to enqueue item. 
Dropping event."); } } @@ -95,12 +96,12 @@ public void stop() { running = false; executorService.shutdown(); executorService.awaitTermination(1, TimeUnit.HOURS); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Thread.currentThread().interrupt(); } } - private void scheduleQueueReader(final ExecutorService es, final BlockingQueue queue, final ItemProcessor processor) { + private void scheduleQueueReader(final ExecutorService es, final BlockingQueue> queue, final ItemProcessor processor) { CompletableFuture.runAsync(microBatchingQueueDrainerWithHeartBeat(queue, processor), es).whenComplete((voidValue, error) -> { processor.cleanup(); @@ -114,12 +115,12 @@ private void scheduleQueueReader(final ExecutorService es, final BlockingQueue queue, + final BlockingQueue> queue, final ItemProcessor processor) { return () -> { // The default item processor implementation removes items one-by-one as they // are processed. Using a Queue ensures that this is efficient. - final Queue batch = new ArrayDeque<>(MAX_BATCH_SIZE); + final Queue> batch = new ArrayDeque<>(MAX_BATCH_SIZE); while (!queue.isEmpty() || running) { ProcessingDirective directive; @@ -150,7 +151,7 @@ private Runnable microBatchingQueueDrainerWithHeartBeat( private static void sleepOneSecond() { try { Thread.sleep(1000); - } catch(InterruptedException e) { + } catch(final InterruptedException e) { Thread.currentThread().interrupt(); } } @@ -158,7 +159,7 @@ private static void sleepOneSecond() { private static E pollQuietly(final BlockingQueue queue, final long timeout, final TimeUnit unit) { try { return queue.poll(timeout, unit); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Thread.currentThread().interrupt(); return null; } diff --git a/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java b/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java index 38d86844..b6904e78 100644 --- a/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java +++ b/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java @@ -16,12 +16,20 @@ package io.divolte.server.hdfs; -import com.google.common.collect.ImmutableMap; -import com.typesafe.config.Config; -import com.typesafe.config.ConfigFactory; -import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.DivolteIdentifier; -import io.divolte.server.config.ValidatedConfiguration; +import static org.junit.Assert.*; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.LongStream; +import java.util.stream.StreamSupport; + +import javax.annotation.ParametersAreNonnullByDefault; + import org.apache.avro.Schema; import org.apache.avro.file.DataFileReader; import org.apache.avro.generic.GenericData.Record; @@ -35,19 +43,14 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.annotation.ParametersAreNonnullByDefault; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.LongStream; -import java.util.stream.StreamSupport; +import com.google.common.collect.ImmutableMap; +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import io.divolte.server.AvroRecordBuffer; 
+import io.divolte.server.DivolteIdentifier; +import io.divolte.server.config.ValidatedConfiguration; +import io.divolte.server.processing.Item; @ParametersAreNonnullByDefault public class HdfsFlusherTest { @@ -186,15 +189,16 @@ private void setupFlusher(final String rollEvery, final int recordCount) throws } private void processRecords() { - records.forEach((record) -> - flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), - DivolteIdentifier.generate(), - System.currentTimeMillis(), - 0, - record))); + records.stream().map( + (record) -> AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), + DivolteIdentifier.generate(), + System.currentTimeMillis(), + 0, + record)) + .forEach((arb) -> flusher.process(Item.of(0, arb.getPartyId().value, arb))); } - private void deleteQuietly(Path p) { + private void deleteQuietly(final Path p) { try { Files.delete(p); } catch (final Exception e) { @@ -202,7 +206,7 @@ private void deleteQuietly(Path p) { } } - private void verifyAvroFile(List expected, Schema schema, Path avroFile) { + private void verifyAvroFile(final List expected, final Schema schema, final Path avroFile) { final List result = StreamSupport .stream(readAvroFile(schema, avroFile.toFile()).spliterator(), false) @@ -210,11 +214,11 @@ private void verifyAvroFile(List expected, Schema schema, Path avroFile) assertEquals(expected, result); } - private DataFileReader readAvroFile(Schema schema, File file) { + private DataFileReader readAvroFile(final Schema schema, final File file) { final DatumReader dr = new GenericDatumReader<>(schema); try { return new DataFileReader<>(file, dr); - } catch (IOException e) { + } catch (final IOException e) { throw new RuntimeException(e); } } From b3d0423c56061df34b21f1595b0d9c0455b92a49 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 23 Dec 2015 11:40:48 +0100 Subject: [PATCH 12/80] Use .isEmpty() instead of comparing size to 0. --- .../io/divolte/server/config/constraint/OneSchemaPerSink.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java b/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java index a207532c..c4a04ccb 100644 --- a/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java +++ b/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java @@ -30,7 +30,7 @@ public void initialize(final OneSchemaPerSink constraintAnnotation) { @Override public boolean isValid(final DivolteConfiguration value, final ConstraintValidatorContext context) { - return value.sinksWithMultipleSchemas().size() == 0; + return value.sinksWithMultipleSchemas().isEmpty(); } } } From 166f6b3900ecc4ffb9a385f54e9e5a114adbb361 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 10:51:05 +0100 Subject: [PATCH 13/80] During tests, fail early if the test server configuration isn't valid. 
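For illustration only (not part of the patch): a minimal sketch of the early-failure behaviour this guard introduces, using ValidatedConfiguration#isValid() and #errors() exactly as in the change below. The invalid property value is the one exercised by ValidatedConfigurationTest and is purely an example.

    import com.google.common.base.Preconditions;
    import com.google.common.collect.ImmutableMap;
    import com.typesafe.config.Config;
    import com.typesafe.config.ConfigFactory;
    import io.divolte.server.config.ValidatedConfiguration;

    // Sketch: fail before the server is even constructed if the configuration is invalid.
    final Config broken = ConfigFactory
            .parseMap(ImmutableMap.of("divolte.sources.browser.javascript.name", "404.exe"))
            .withFallback(ConfigFactory.parseResources("reference-test.conf"));
    final ValidatedConfiguration vc = new ValidatedConfiguration(() -> broken);
    Preconditions.checkArgument(vc.isValid(),
            "Invalid test server configuration: %s", vc.errors());
    // Throws IllegalArgumentException listing every validation error up front,
    // instead of the test failing later for a far less obvious reason.
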
--- src/test/java/io/divolte/server/ServerTestUtils.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/test/java/io/divolte/server/ServerTestUtils.java b/src/test/java/io/divolte/server/ServerTestUtils.java index eda83104..2d68efda 100644 --- a/src/test/java/io/divolte/server/ServerTestUtils.java +++ b/src/test/java/io/divolte/server/ServerTestUtils.java @@ -16,6 +16,7 @@ package io.divolte.server; +import com.google.common.base.Preconditions; import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; import com.typesafe.config.ConfigValueFactory; @@ -92,6 +93,8 @@ private TestServer(final int port, final Config config) { events = new ArrayBlockingQueue<>(100); final ValidatedConfiguration vc = new ValidatedConfiguration(() -> this.config); + Preconditions.checkArgument(vc.isValid(), + "Invalid test server configuration: %s", vc.errors()); server = new Server(vc, (event, buffer, record) -> events.add(new EventPayload(event, buffer, record))); } From 18e83a6bba1eb3634e5dd778b0e4e051ce961289 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 10:54:18 +0100 Subject: [PATCH 14/80] Start the test server automatically after construction. All users of this class did this anyway. --- src/test/java/io/divolte/server/DslRecordMapperTest.java | 1 - .../io/divolte/server/ProxyAdjacentPeerAddressHandlerTest.java | 1 - src/test/java/io/divolte/server/RequestChecksumTest.java | 3 --- src/test/java/io/divolte/server/SeleniumTestBase.java | 1 - .../io/divolte/server/ServerSideCookieEventHandlerTest.java | 1 - src/test/java/io/divolte/server/ServerTestUtils.java | 1 + .../java/io/divolte/server/ShortTermDuplicateMemoryTest.java | 1 - 7 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/test/java/io/divolte/server/DslRecordMapperTest.java b/src/test/java/io/divolte/server/DslRecordMapperTest.java index c26ce01b..6b43f95b 100644 --- a/src/test/java/io/divolte/server/DslRecordMapperTest.java +++ b/src/test/java/io/divolte/server/DslRecordMapperTest.java @@ -551,7 +551,6 @@ private void setupServer(final String mapping) throws IOException { ); server = new TestServer("dsl-mapping-test.conf", mappingConfig); - server.server.run(); } private static void copyResourceToFile(final String resourceName, final File file) throws IOException { diff --git a/src/test/java/io/divolte/server/ProxyAdjacentPeerAddressHandlerTest.java b/src/test/java/io/divolte/server/ProxyAdjacentPeerAddressHandlerTest.java index 68e196ab..f4d9d461 100644 --- a/src/test/java/io/divolte/server/ProxyAdjacentPeerAddressHandlerTest.java +++ b/src/test/java/io/divolte/server/ProxyAdjacentPeerAddressHandlerTest.java @@ -105,7 +105,6 @@ public void shouldAllowMultipleXffHeaders() throws IOException, InterruptedExcep @Before public void setUp() { server = new TestServer("x-forwarded-for-test.conf"); - server.server.run(); } @After diff --git a/src/test/java/io/divolte/server/RequestChecksumTest.java b/src/test/java/io/divolte/server/RequestChecksumTest.java index c9ad7608..cc90c142 100644 --- a/src/test/java/io/divolte/server/RequestChecksumTest.java +++ b/src/test/java/io/divolte/server/RequestChecksumTest.java @@ -185,9 +185,6 @@ private void setServer(@Nullable final TestServer newServer) { oldServer.server.shutdown(); } this.server = newServer; - if (null != newServer) { - newServer.server.run(); - } } } } diff --git a/src/test/java/io/divolte/server/SeleniumTestBase.java b/src/test/java/io/divolte/server/SeleniumTestBase.java index b7dbfa4e..ccc995fe 100644 --- 
a/src/test/java/io/divolte/server/SeleniumTestBase.java +++ b/src/test/java/io/divolte/server/SeleniumTestBase.java @@ -146,7 +146,6 @@ protected void doSetUp(final String configFileName) throws Exception { } server = new TestServer(configFileName); - server.server.run(); } private void setupBrowserStack() throws MalformedURLException { diff --git a/src/test/java/io/divolte/server/ServerSideCookieEventHandlerTest.java b/src/test/java/io/divolte/server/ServerSideCookieEventHandlerTest.java index af579dd8..88a71b23 100644 --- a/src/test/java/io/divolte/server/ServerSideCookieEventHandlerTest.java +++ b/src/test/java/io/divolte/server/ServerSideCookieEventHandlerTest.java @@ -49,7 +49,6 @@ public void shouldRegisterServerSideCookieEvent() throws IOException, RuntimeExc @Before public void setUp() { server = new TestServer("server-side-cookies-test.conf"); - server.server.run(); } @After diff --git a/src/test/java/io/divolte/server/ServerTestUtils.java b/src/test/java/io/divolte/server/ServerTestUtils.java index 2d68efda..ce3cac54 100644 --- a/src/test/java/io/divolte/server/ServerTestUtils.java +++ b/src/test/java/io/divolte/server/ServerTestUtils.java @@ -96,6 +96,7 @@ private TestServer(final int port, final Config config) { Preconditions.checkArgument(vc.isValid(), "Invalid test server configuration: %s", vc.errors()); server = new Server(vc, (event, buffer, record) -> events.add(new EventPayload(event, buffer, record))); + server.run(); } public EventPayload waitForEvent() throws InterruptedException { diff --git a/src/test/java/io/divolte/server/ShortTermDuplicateMemoryTest.java b/src/test/java/io/divolte/server/ShortTermDuplicateMemoryTest.java index db141d5d..bd5cdfb2 100644 --- a/src/test/java/io/divolte/server/ShortTermDuplicateMemoryTest.java +++ b/src/test/java/io/divolte/server/ShortTermDuplicateMemoryTest.java @@ -138,7 +138,6 @@ private void request(final int which) throws IOException { @Before public void setUp() { server = new TestServer("duplicates-test.conf"); - server.server.run(); } @After From 262562fc662434ef6e18fdb9c3989d47238adac2 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 10:54:49 +0100 Subject: [PATCH 15/80] Avoid empty blocks. --- .../io/divolte/server/config/constraint/OneSchemaPerSink.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java b/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java index c4a04ccb..0b0fe333 100644 --- a/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java +++ b/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java @@ -26,6 +26,7 @@ public static class Validator implements ConstraintValidator { @Override public void initialize(final OneSchemaPerSink constraintAnnotation) { + // Nothing needed here. } @Override From d52b8328d751f946ad06236719e653b71921a7ca Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 11:52:08 +0100 Subject: [PATCH 16/80] Refactor test configurations. - Eliminate a bunch of empty configuration files. - Change test layout to let tests run against the default configuration with the test server, or a base convenience configuration. 
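As a rough sketch (illustrative, not part of the patch) of how tests select a configuration after this change, based on the TestServer constructors and the override key used by RequestChecksumTest further down:

    import com.google.common.collect.ImmutableMap;
    import io.divolte.server.ServerTestUtils.TestServer;

    // Plain defaults: reference-test.conf plus the built-in source/sink/mapping defaults.
    final TestServer defaults = new TestServer();

    // The shared convenience configuration with an explicit source, sinks and mapping.
    final TestServer base = new TestServer("base-test-server.conf");

    // The same base configuration with a test-specific property merged on top.
    final TestServer custom = new TestServer("base-test-server.conf",
            ImmutableMap.of("divolte.mappings.test.discard_corrupted", true));
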
--- .../divolte/server/DslRecordMapperTest.java | 4 +- .../divolte/server/RequestChecksumTest.java | 40 ++++++++++--------- .../server/SeleniumJavaScriptTest.java | 2 +- .../io/divolte/server/SeleniumTestBase.java | 10 ++++- .../ServerSideCookieEventHandlerTest.java | 2 +- .../io/divolte/server/ServerTestUtils.java | 22 +++++----- .../config/ValidatedConfigurationTest.java | 1 + .../divolte/server/hdfs/HdfsFlusherTest.java | 3 +- ...orrupt-test.conf => base-test-server.conf} | 13 +++++- src/test/resources/checksum-test.conf | 19 --------- src/test/resources/dsl-mapping-test.conf | 17 -------- src/test/resources/hdfs-flusher-test.conf | 2 +- src/test/resources/reference-test.conf | 22 ---------- src/test/resources/selenium-test-config.conf | 17 -------- .../resources/server-side-cookies-test.conf | 17 -------- 15 files changed, 61 insertions(+), 130 deletions(-) rename src/test/resources/{checksum-discard-corrupt-test.conf => base-test-server.conf} (71%) delete mode 100644 src/test/resources/checksum-test.conf delete mode 100644 src/test/resources/dsl-mapping-test.conf delete mode 100644 src/test/resources/selenium-test-config.conf delete mode 100644 src/test/resources/server-side-cookies-test.conf diff --git a/src/test/java/io/divolte/server/DslRecordMapperTest.java b/src/test/java/io/divolte/server/DslRecordMapperTest.java index 6b43f95b..61c09860 100644 --- a/src/test/java/io/divolte/server/DslRecordMapperTest.java +++ b/src/test/java/io/divolte/server/DslRecordMapperTest.java @@ -366,7 +366,7 @@ public void shouldMapAllGeoIpFields() throws IOException, InterruptedException, ); final Config geoConfig = ConfigFactory.parseMap(mappingConfig) - .withFallback(ConfigFactory.parseResources("dsl-mapping-test.conf")) + .withFallback(ConfigFactory.parseResources("base-test-server.conf")) .withFallback(ConfigFactory.parseResources("reference-test.conf")); final ValidatedConfiguration vc = new ValidatedConfiguration(() -> geoConfig); @@ -550,7 +550,7 @@ private void setupServer(final String mapping) throws IOException { "divolte.mappings.test.schema_file", avroFile.getAbsolutePath() ); - server = new TestServer("dsl-mapping-test.conf", mappingConfig); + server = new TestServer("base-test-server.conf", mappingConfig); } private static void copyResourceToFile(final String resourceName, final File file) throws IOException { diff --git a/src/test/java/io/divolte/server/RequestChecksumTest.java b/src/test/java/io/divolte/server/RequestChecksumTest.java index cc90c142..1158037c 100644 --- a/src/test/java/io/divolte/server/RequestChecksumTest.java +++ b/src/test/java/io/divolte/server/RequestChecksumTest.java @@ -16,23 +16,22 @@ package io.divolte.server; -import static org.junit.Assert.*; - -import java.io.IOException; -import java.net.HttpURLConnection; -import java.net.URL; - -import javax.annotation.Nullable; -import javax.annotation.ParametersAreNonnullByDefault; - +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; +import io.divolte.server.ServerTestUtils.EventPayload; +import io.divolte.server.ServerTestUtils.TestServer; import org.junit.After; import org.junit.Before; import org.junit.Test; -import com.google.common.base.Preconditions; +import javax.annotation.Nullable; +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.util.Map; -import io.divolte.server.ServerTestUtils.EventPayload; -import io.divolte.server.ServerTestUtils.TestServer; +import static 
org.junit.Assert.*; @ParametersAreNonnullByDefault public class RequestChecksumTest { @@ -93,10 +92,12 @@ public class RequestChecksumTest { + "t=sentinelEvent&" + "x=-y99lem"; - private String serverConfigurationResourceName; + private boolean discardCorruptEvents; @Nullable private TestServer server; + @Nullable + private ImmutableMap serverProperties; @Test public void shouldFlagCorrectChecksumAsNotCorrupted() throws IOException, InterruptedException { @@ -144,7 +145,7 @@ public void shouldChecksumCorrectlyWithNonAsciiParameters() throws IOException, @Test public void shouldDiscardCorruptedEventsIfConfigured() throws InterruptedException, IOException { - serverConfigurationResourceName = "checksum-discard-corrupt-test.conf"; + discardCorruptEvents = true; request(URL_QUERY_CHECKSUM_BAD); request(URL_QUERY_SENTINEL); Preconditions.checkState(null != server); @@ -155,22 +156,23 @@ public void shouldDiscardCorruptedEventsIfConfigured() throws InterruptedExcepti } private void request(final String queryString) throws IOException { - setServerConf(serverConfigurationResourceName); + setServerConf(ImmutableMap.of("divolte.mappings.test.discard_corrupted", discardCorruptEvents)); Preconditions.checkState(null != server); final URL url = new URL(String.format(URL_STRING, server.port) + queryString); final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); assertEquals(200, conn.getResponseCode()); } - private void setServerConf(final String configurationResourceName) { - if (null == server || !configurationResourceName.equals(server.config.origin().resource())) { - setServer(new TestServer(configurationResourceName)); + private void setServerConf(final Map configurationProperties) { + if (null == server || !configurationProperties.equals(serverProperties)) { + serverProperties = ImmutableMap.copyOf(configurationProperties); + setServer(new TestServer("base-test-server.conf", serverProperties)); } } @Before public void setUp() { - serverConfigurationResourceName = "checksum-test.conf"; + discardCorruptEvents = false; } @After diff --git a/src/test/java/io/divolte/server/SeleniumJavaScriptTest.java b/src/test/java/io/divolte/server/SeleniumJavaScriptTest.java index 0d392e68..1e69a9b6 100644 --- a/src/test/java/io/divolte/server/SeleniumJavaScriptTest.java +++ b/src/test/java/io/divolte/server/SeleniumJavaScriptTest.java @@ -258,6 +258,6 @@ public void shouldPickupProvidedPageViewIdFromHash() throws RuntimeException, In @Before public void setup() throws Exception { - doSetUp("selenium-test-config.conf"); + doSetUp(); } } diff --git a/src/test/java/io/divolte/server/SeleniumTestBase.java b/src/test/java/io/divolte/server/SeleniumTestBase.java index ccc995fe..489a7023 100644 --- a/src/test/java/io/divolte/server/SeleniumTestBase.java +++ b/src/test/java/io/divolte/server/SeleniumTestBase.java @@ -127,6 +127,14 @@ protected String urlOf(final TEST_PAGES page) { } protected void doSetUp(final String configFileName) throws Exception { + doSetUp(Optional.of(configFileName)); + } + + protected void doSetUp() throws Exception { + doSetUp(Optional.empty()); + } + + protected void doSetUp(final Optional configFileName) throws Exception { final String driverName = System.getenv().getOrDefault(DRIVER_ENV_VAR, PHANTOMJS_DRIVER); switch (driverName) { @@ -145,7 +153,7 @@ protected void doSetUp(final String configFileName) throws Exception { break; } - server = new TestServer(configFileName); + server = configFileName.map(TestServer::new).orElseGet(TestServer::new); } private void 
setupBrowserStack() throws MalformedURLException { diff --git a/src/test/java/io/divolte/server/ServerSideCookieEventHandlerTest.java b/src/test/java/io/divolte/server/ServerSideCookieEventHandlerTest.java index 88a71b23..ab7e294b 100644 --- a/src/test/java/io/divolte/server/ServerSideCookieEventHandlerTest.java +++ b/src/test/java/io/divolte/server/ServerSideCookieEventHandlerTest.java @@ -48,7 +48,7 @@ public void shouldRegisterServerSideCookieEvent() throws IOException, RuntimeExc @Before public void setUp() { - server = new TestServer("server-side-cookies-test.conf"); + server = new TestServer(); } @After diff --git a/src/test/java/io/divolte/server/ServerTestUtils.java b/src/test/java/io/divolte/server/ServerTestUtils.java index ce3cac54..b13dad18 100644 --- a/src/test/java/io/divolte/server/ServerTestUtils.java +++ b/src/test/java/io/divolte/server/ServerTestUtils.java @@ -70,21 +70,21 @@ public static final class TestServer { final Server server; final BlockingQueue events; + public TestServer() { + this(findFreePort(), ConfigFactory.parseResources("reference-test.conf")); + } + public TestServer(final String configResource) { - this( - findFreePort(), - ConfigFactory.parseResources(configResource) - .withFallback(ConfigFactory.parseResources("reference-test.conf")) - ); + this(findFreePort(), + ConfigFactory.parseResources(configResource) + .withFallback(ConfigFactory.parseResources("reference-test.conf"))); } public TestServer(final String configResource, final Map extraConfig) { - this( - findFreePort(), - ConfigFactory.parseMap(extraConfig, "Test-specific overrides") - .withFallback(ConfigFactory.parseResources(configResource)) - .withFallback(ConfigFactory.parseResources("reference-test.conf")) - ); + this(findFreePort(), + ConfigFactory.parseMap(extraConfig, "Test-specific overrides") + .withFallback(ConfigFactory.parseResources(configResource)) + .withFallback(ConfigFactory.parseResources("reference-test.conf"))); } private TestServer(final int port, final Config config) { diff --git a/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java b/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java index d94e6fe3..8ef60c02 100644 --- a/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java +++ b/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java @@ -23,6 +23,7 @@ public void shouldValidateJavaScriptName() { final String propertyName = "divolte.sources.browser.javascript.name"; final String invalidValue = "404.exe"; final Config config = ConfigFactory.parseMap(ImmutableMap.of(propertyName, invalidValue)) + .withFallback(ConfigFactory.parseResources("base-test-server.conf")) .withFallback(ConfigFactory.parseResources("reference-test.conf")); final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); diff --git a/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java b/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java index b6904e78..4ab21911 100644 --- a/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java +++ b/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java @@ -174,7 +174,8 @@ private void setupFlusher(final String rollEvery, final int recordCount) throws "divolte.sinks.hdfs.file_strategy.roll_every", rollEvery, "divolte.sinks.hdfs.file_strategy.working_dir", tempInflightDir.toString(), "divolte.sinks.hdfs.file_strategy.publish_dir", tempPublishDir.toString())) - .withFallback(ConfigFactory.parseResources("hdfs-flusher-test.conf")); + 
.withFallback(ConfigFactory.parseResources("hdfs-flusher-test.conf")) + .withFallback(ConfigFactory.parseResources("reference-test.conf")); final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); records = LongStream.range(0, recordCount) diff --git a/src/test/resources/checksum-discard-corrupt-test.conf b/src/test/resources/base-test-server.conf similarity index 71% rename from src/test/resources/checksum-discard-corrupt-test.conf rename to src/test/resources/base-test-server.conf index 8e86f26c..bf6d36fd 100644 --- a/src/test/resources/checksum-discard-corrupt-test.conf +++ b/src/test/resources/base-test-server.conf @@ -14,6 +14,17 @@ // limitations under the License. // +// Specify a basic source/sink/mapping configuration that tests can use. divolte { - mappings.test.discard_corrupted = false + sources.browser.type = browser + + sinks { + hdfs.type = hdfs + kafka.type = kafka + } + + mappings.test = { + sources = [browser] + sinks = [hdfs, kafka] + } } diff --git a/src/test/resources/checksum-test.conf b/src/test/resources/checksum-test.conf deleted file mode 100644 index 8e86f26c..00000000 --- a/src/test/resources/checksum-test.conf +++ /dev/null @@ -1,19 +0,0 @@ -// -// Copyright 2014 GoDataDriven B.V. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -divolte { - mappings.test.discard_corrupted = false -} diff --git a/src/test/resources/dsl-mapping-test.conf b/src/test/resources/dsl-mapping-test.conf deleted file mode 100644 index 966e4072..00000000 --- a/src/test/resources/dsl-mapping-test.conf +++ /dev/null @@ -1,17 +0,0 @@ -// -// Copyright 2014 GoDataDriven B.V. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -// Nothing needed here. diff --git a/src/test/resources/hdfs-flusher-test.conf b/src/test/resources/hdfs-flusher-test.conf index 71e758e5..c6829adb 100644 --- a/src/test/resources/hdfs-flusher-test.conf +++ b/src/test/resources/hdfs-flusher-test.conf @@ -14,7 +14,7 @@ // limitations under the License. // -include classpath("reference-test.conf") +include classpath("base-test-server.conf") divolte { global { diff --git a/src/test/resources/reference-test.conf b/src/test/resources/reference-test.conf index 1942f87d..15733c3b 100644 --- a/src/test/resources/reference-test.conf +++ b/src/test/resources/reference-test.conf @@ -32,26 +32,4 @@ divolte { hdfs.enabled = false kafka.enabled = false } - - // Explicitly specify the default sinks and sources, so that tests can merge properties in. 
- sources { - browser = { - type = browser - } - } - sinks { - hdfs = { - type = hdfs - } - kafka = { - type = kafka - } - } - - mappings { - test = { - sources = [browser] - sinks = [hdfs,kafka] - } - } } diff --git a/src/test/resources/selenium-test-config.conf b/src/test/resources/selenium-test-config.conf deleted file mode 100644 index e252edc2..00000000 --- a/src/test/resources/selenium-test-config.conf +++ /dev/null @@ -1,17 +0,0 @@ -// -// Copyright 2014 GoDataDriven B.V. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -// Nothing needed here; the default test server configuration suffices. diff --git a/src/test/resources/server-side-cookies-test.conf b/src/test/resources/server-side-cookies-test.conf deleted file mode 100644 index e252edc2..00000000 --- a/src/test/resources/server-side-cookies-test.conf +++ /dev/null @@ -1,17 +0,0 @@ -// -// Copyright 2014 GoDataDriven B.V. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -// Nothing needed here; the default test server configuration suffices. From df316457794717117dba3dafee1f45f29c28b9d1 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 11:54:00 +0100 Subject: [PATCH 17/80] Add a test for the /ping endpoint. --- .../io/divolte/server/ServerPingTest.java | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 src/test/java/io/divolte/server/ServerPingTest.java diff --git a/src/test/java/io/divolte/server/ServerPingTest.java b/src/test/java/io/divolte/server/ServerPingTest.java new file mode 100644 index 00000000..7b88c6e9 --- /dev/null +++ b/src/test/java/io/divolte/server/ServerPingTest.java @@ -0,0 +1,64 @@ +/* + * Copyright 2014 GoDataDriven B.V. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.divolte.server; + +import com.google.common.io.ByteStreams; +import io.divolte.server.ServerTestUtils.TestServer; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Optional; + +import static org.junit.Assert.assertEquals; + +@ParametersAreNonnullByDefault +public class ServerPingTest { + + private Optional testServer = Optional.empty(); + + @Before + public void setup() { + testServer = Optional.of(new TestServer()); + } + + @Test + public void shouldRespondToPingWithPong() throws IOException { + final URL url = new URL(String.format("http://localhost:%d/ping", testServer.get().port)); + final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + try { + conn.setRequestMethod("GET"); + assertEquals(200, conn.getResponseCode()); + assertEquals("text/plain; charset=utf-8", conn.getContentType()); + final String body = new String(ByteStreams.toByteArray(conn.getInputStream()), StandardCharsets.UTF_8); + assertEquals("pong", body); + } finally { + conn.disconnect(); + } + } + + @After + public void tearDown() { + testServer.ifPresent(testServer -> testServer.server.shutdown()); + testServer = Optional.empty(); + } +} From 8c88f760abdad795215275a3d1fbc0a8c11073c6 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 11:55:43 +0100 Subject: [PATCH 18/80] Ensure the prefix for a browser source has a trailing '/'. The browser source requires this or the tracking script won't correctly locate the event end-point. --- .../io/divolte/server/config/BrowserSourceConfiguration.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java b/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java index 60a03747..50e1db49 100644 --- a/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java +++ b/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java @@ -47,7 +47,8 @@ public class BrowserSourceConfiguration extends SourceConfiguration { @JsonProperty(defaultValue=DEFAULT_SESSION_TIMEOUT) final Duration sessionTimeout, final JavascriptConfiguration javascript) { // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this - this.prefix = prefix == null ? DEFAULT_PREFIX : prefix; + final String rawPrefix = prefix == null ? DEFAULT_PREFIX : prefix; + this.prefix = rawPrefix.endsWith("/") ? rawPrefix : rawPrefix + '/'; this.cookieDomain = cookieDomain; this.partyCookie = partyCookie == null ? DEFAULT_PARTY_COOKIE : partyCookie; this.partyTimeout = partyTimeout == null ? DurationDeserializer.parseDuration(DEFAULT_PARTY_TIMEOUT) : partyTimeout; From f3beb9b28b97488f095664a9e54427dad1f883c5 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 11:56:40 +0100 Subject: [PATCH 19/80] Tweak log messages. No trailing period (.) required. 
--- .../java/io/divolte/server/config/ValidatedConfiguration.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java index 900dbbc1..58b3780d 100644 --- a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java +++ b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java @@ -124,7 +124,7 @@ private String messageForMappingException(final JsonMappingException e) { .map(Reference::getFieldName) .collect(Collectors.joining(".")); final String message = String.format( - "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'.", + "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'", e.getOriginalMessage(), Optional.ofNullable(e.getLocation()).map(JsonLocation::getSourceRef).orElse(""), "".equals(pathToError) ? "" : pathToError); @@ -133,7 +133,7 @@ private String messageForMappingException(final JsonMappingException e) { private static String messageForUnrecognizedPropertyException(final UnrecognizedPropertyException e) { final String message = String.format( - "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'.%n\tAvailable properties: %s.", + "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'%n\tAvailable properties: %s.", e.getOriginalMessage(), e.getLocation().getSourceRef(), e.getPath().stream() From bcfff984209b4d5c4b76bcfabee69c0fc5d4313c Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 12:00:21 +0100 Subject: [PATCH 20/80] Start supporting multiple browser endpoints. --- src/main/java/io/divolte/server/Server.java | 31 +++-- .../server/config/DivolteConfiguration.java | 12 +- .../server/js/TrackingJavaScriptResource.java | 19 ++- .../server/ServerSourceConfigurationTest.java | 116 ++++++++++++++++++ .../js/TrackingJavaScriptResourceTest.java | 2 +- .../resources/browser-source-explicit.conf | 30 +++++ .../resources/browser-source-long-prefix.conf | 29 +++++ .../resources/browser-source-multiple.conf | 34 +++++ 8 files changed, 251 insertions(+), 22 deletions(-) create mode 100644 src/test/java/io/divolte/server/ServerSourceConfigurationTest.java create mode 100644 src/test/resources/browser-source-explicit.conf create mode 100644 src/test/resources/browser-source-long-prefix.conf create mode 100644 src/test/resources/browser-source-multiple.conf diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index 400a8f18..d741ab36 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -17,6 +17,7 @@ package io.divolte.server; import com.typesafe.config.ConfigFactory; +import io.divolte.server.config.BrowserSourceConfiguration; import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.js.TrackingJavaScriptResource; import io.undertow.Undertow; @@ -62,15 +63,21 @@ public Server(final ValidatedConfiguration vc) { port = vc.configuration().global.server.port; processingPool = new IncomingRequestProcessingPool(vc, listener); - final ClientSideCookieEventHandler clientSideCookieEventHandler = - new ClientSideCookieEventHandler(processingPool); - final TrackingJavaScriptResource trackingJavaScript = loadTrackingJavaScript(vc); - final HttpHandler javascriptHandler = new AllowedMethodsHandler(new JavaScriptHandler(trackingJavaScript), Methods.GET); - - final PathHandler handler = new PathHandler(); - 
handler.addExactPath(vc.configuration().browserSourceConfiguration.prefix + "csc-event", - new AllowedMethodsHandler(clientSideCookieEventHandler, Methods.GET)); - handler.addExactPath(vc.configuration().browserSourceConfiguration.prefix + trackingJavaScript.getScriptName(), javascriptHandler); + PathHandler handler = new PathHandler(); + for (final String name : vc.configuration().sources.keySet()) { + final ClientSideCookieEventHandler clientSideCookieEventHandler = + new ClientSideCookieEventHandler(processingPool); + final TrackingJavaScriptResource trackingJavaScript = loadTrackingJavaScript(vc, name); + final HttpHandler javascriptHandler = new AllowedMethodsHandler(new JavaScriptHandler(trackingJavaScript), Methods.GET); + final BrowserSourceConfiguration browserSourceConfiguration = vc.configuration().getBrowserSourceConfiguration(name); + final String eventPath = browserSourceConfiguration.prefix + "csc-event"; + final String scriptPath = browserSourceConfiguration.prefix + trackingJavaScript.getScriptName(); + handler = handler.addExactPath(eventPath, new AllowedMethodsHandler(clientSideCookieEventHandler, Methods.GET)); + handler = handler.addExactPath(scriptPath, javascriptHandler); + logger.info("Registered source[{}] script location: {}", name, scriptPath); + logger.info("Registered source[{}] event handler: {}", name, eventPath); + } + handler.addExactPath("/ping", PingHandler::handlePingRequest); if (vc.configuration().global.server.serveStaticResources) { // Catch-all handler; must be last if present. @@ -92,11 +99,11 @@ public Server(final ValidatedConfiguration vc) { .build(); } - private TrackingJavaScriptResource loadTrackingJavaScript(final ValidatedConfiguration vc) { + private static TrackingJavaScriptResource loadTrackingJavaScript(final ValidatedConfiguration vc, final String sourceName) { try { - return new TrackingJavaScriptResource(vc); + return TrackingJavaScriptResource.create(vc, sourceName); } catch (final IOException e) { - throw new RuntimeException("Could not precompile tracking JavaScript.", e); + throw new RuntimeException("Could not precompile tracking JavaScript for source: " + sourceName, e); } } diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index b5808ead..58707ed0 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -16,6 +16,7 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.google.common.base.MoreObjects; +import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; @@ -39,8 +40,6 @@ public final class DivolteConfiguration { @Deprecated public final MappingConfiguration incomingRequestProcessor; @Deprecated - public final BrowserSourceConfiguration browserSourceConfiguration; - @Deprecated public final KafkaSinkConfiguration kafkaFlusher; @Deprecated public final HdfsSinkConfiguration hdfsFlusher; @@ -57,7 +56,6 @@ public final class DivolteConfiguration { // Temporary interop this.incomingRequestProcessor = Iterables.get(this.mappings.values(), 0); - this.browserSourceConfiguration = (BrowserSourceConfiguration) Iterables.get(this.sources.values(), 0); this.kafkaFlusher = (KafkaSinkConfiguration) Iterators.get(this.sinks.values().stream().filter((sink) -> sink instanceof KafkaSinkConfiguration).iterator(), 0); 
this.hdfsFlusher = (HdfsSinkConfiguration) Iterators.get(this.sinks.values().stream().filter((sink) -> sink instanceof HdfsSinkConfiguration).iterator(), 0); @@ -66,6 +64,14 @@ public final class DivolteConfiguration { // - Elide unreferenced sources and sinks. } + public BrowserSourceConfiguration getBrowserSourceConfiguration(final String sourceName) { + final SourceConfiguration sourceConfiguration = sources.get(sourceName); + Objects.requireNonNull(sourceConfiguration, () -> "No source configuration with name: " + sourceName); + Preconditions.checkArgument(sourceConfiguration instanceof BrowserSourceConfiguration, + "Source configuration '%s' is not a browser source", sourceName); + return (BrowserSourceConfiguration)sourceConfiguration; + } + // Defaults; these will eventually disappear private static ImmutableMap defaultSourceConfigurations() { return ImmutableMap.of("browser", BrowserSourceConfiguration.DEFAULT_BROWSER_SOURCE_CONFIGURATION); diff --git a/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java b/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java index d5f91f64..e195f989 100644 --- a/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java +++ b/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java @@ -32,18 +32,17 @@ public class TrackingJavaScriptResource extends JavaScriptResource { private static final String SCRIPT_CONSTANT_NAME = "SCRIPT_NAME"; - public TrackingJavaScriptResource(final ValidatedConfiguration vc) throws IOException { - super(vc.configuration().browserSourceConfiguration.javascript.name, - createScriptConstants(vc), - vc.configuration().browserSourceConfiguration.javascript.debug); + public TrackingJavaScriptResource(final String resourceName, + final ImmutableMap scriptConstants, + final boolean debugMode) throws IOException { + super(resourceName, scriptConstants, debugMode); } public String getScriptName() { return (String)getScriptConstants().get(SCRIPT_CONSTANT_NAME); } - private static ImmutableMap createScriptConstants(final ValidatedConfiguration vc) { - final BrowserSourceConfiguration browserSourceConfiguration = vc.configuration().browserSourceConfiguration; + private static ImmutableMap createScriptConstants(final BrowserSourceConfiguration browserSourceConfiguration) { final ImmutableMap.Builder builder = ImmutableMap.builder(); builder.put("PARTY_COOKIE_NAME", browserSourceConfiguration.partyCookie); builder.put("PARTY_ID_TIMEOUT_SECONDS", trimLongToMaxInt(browserSourceConfiguration.partyTimeout.get(ChronoUnit.SECONDS))); @@ -67,4 +66,12 @@ private static int trimLongToMaxInt(long duration) { } return result; } + + public static TrackingJavaScriptResource create(final ValidatedConfiguration vc, + final String sourceName) throws IOException { + final BrowserSourceConfiguration browserSourceConfiguration = vc.configuration().getBrowserSourceConfiguration(sourceName); + return new TrackingJavaScriptResource(browserSourceConfiguration.javascript.name, + createScriptConstants(browserSourceConfiguration), + browserSourceConfiguration.javascript.debug); + } } diff --git a/src/test/java/io/divolte/server/ServerSourceConfigurationTest.java b/src/test/java/io/divolte/server/ServerSourceConfigurationTest.java new file mode 100644 index 00000000..7aa322b2 --- /dev/null +++ b/src/test/java/io/divolte/server/ServerSourceConfigurationTest.java @@ -0,0 +1,116 @@ +/* + * Copyright 2014 GoDataDriven B.V. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.divolte.server; + +import io.divolte.server.ServerTestUtils.TestServer; +import org.junit.After; +import org.junit.Test; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.util.Optional; + +import static org.junit.Assert.assertEquals; + +@ParametersAreNonnullByDefault +public class ServerSourceConfigurationTest { + + private static final String BROWSER_EVENT_URL_TEMPLATE = + "http://localhost:%d%s/csc-event?" + + "p=0%%3Ai1t84hgy%%3A5AF359Zjq5kUy98u4wQjlIZzWGhN~GlG&" + + "s=0%%3Ai1t84hgy%%3A95CbiPCYln_1e0a6rFvuRkDkeNnc6KC8&" + + "v=0%%3A1fF6GFGjDOQiEx_OxnTm_tl4BH91eGLF&" + + "e=0%%3A1fF6GFGjDOQiEx_OxnTm_tl4BH91eGLF0&" + + "c=i1t8q2b6&" + + "n=f&" + + "f=f&" + + "l=http%%3A%%2F%%2Flocalhost%%3A8290%%2F&" + + "i=1ak&" + + "j=sj&" + + "k=2&" + + "w=uq&" + + "h=qd&" + + "t=pageView&" + + "x=si9804"; + + private Optional testServer = Optional.empty(); + + private void startServer(final Optional configResource) { + stopServer(); + final TestServer newServer = configResource.map(TestServer::new).orElseGet(TestServer::new); + testServer = Optional.of(newServer); + } + + public void stopServer() { + testServer.ifPresent(testServer -> testServer.server.shutdown()); + testServer = Optional.empty(); + } + + private void request(final String sourcePrefix) throws IOException { + request(sourcePrefix, 200); + } + + private void request(final String sourcePrefix, final int expectedResponseCode) throws IOException { + final URL url = new URL(String.format(BROWSER_EVENT_URL_TEMPLATE, + testServer.get().port, + sourcePrefix)); + final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + assertEquals(expectedResponseCode, conn.getResponseCode()); + } + + @Test + public void shouldRegisterDefaultBrowserSource() throws IOException, InterruptedException { + // Test the default browser source that should be present by default. + startServer(Optional.empty()); + request(""); + testServer.get().waitForEvent(); + } + + @Test + public void shouldRegisterExplicitSourceOnly() throws IOException, InterruptedException { + // Test that if an explicit source is supplied, the builtin defaults are not present. + startServer(Optional.of("browser-source-explicit.conf")); + request("/a-prefix"); + testServer.get().waitForEvent(); + request("", 404); + } + + @Test + public void shouldSupportLongPaths() throws IOException, InterruptedException { + // Test that the browser sources work with different types of path. + startServer(Optional.of("browser-source-long-prefix.conf")); + request("/a/multi/component/prefix"); + testServer.get().waitForEvent(); + } + + @Test + public void shouldSupportMultipleBrowserSources() throws IOException, InterruptedException { + // Test that multiple browser sources are supported. 
+ startServer(Optional.of("browser-source-multiple.conf")); + request("/path1"); + request("/path2"); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + } + + @After + public void tearDown() { + stopServer(); + } +} diff --git a/src/test/java/io/divolte/server/js/TrackingJavaScriptResourceTest.java b/src/test/java/io/divolte/server/js/TrackingJavaScriptResourceTest.java index 72b16875..07a291c8 100644 --- a/src/test/java/io/divolte/server/js/TrackingJavaScriptResourceTest.java +++ b/src/test/java/io/divolte/server/js/TrackingJavaScriptResourceTest.java @@ -43,7 +43,7 @@ public class TrackingJavaScriptResourceTest { public void setup() throws IOException { // Essential test to ensure at build-time that our JavaScript can be compiled. final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - trackingJavaScript = new TrackingJavaScriptResource(vc); + trackingJavaScript = TrackingJavaScriptResource.create(vc, "browser"); } @After diff --git a/src/test/resources/browser-source-explicit.conf b/src/test/resources/browser-source-explicit.conf new file mode 100644 index 00000000..7559f866 --- /dev/null +++ b/src/test/resources/browser-source-explicit.conf @@ -0,0 +1,30 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify a single explicit browser source. +divolte { + sources.test-browser-source { + type = browser + // Specify a prefix, so we can differentiate this from the default one. + prefix = /a-prefix + } + + mappings.test = { + sources = [test-browser-source] + // Need at least one sink. + sinks = [hdfs] + } +} diff --git a/src/test/resources/browser-source-long-prefix.conf b/src/test/resources/browser-source-long-prefix.conf new file mode 100644 index 00000000..21ce7afa --- /dev/null +++ b/src/test/resources/browser-source-long-prefix.conf @@ -0,0 +1,29 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +divolte { + sources.test-browser-source { + type = browser + // Specify a multi-component prefix, to check that / doesn't need to be encoded. + prefix = /a/multi/component/prefix + } + + mappings.test = { + sources = [test-browser-source] + // Need at least one sink. 
+ sinks = [hdfs, kafka] + } +} diff --git a/src/test/resources/browser-source-multiple.conf b/src/test/resources/browser-source-multiple.conf new file mode 100644 index 00000000..beda10ab --- /dev/null +++ b/src/test/resources/browser-source-multiple.conf @@ -0,0 +1,34 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +divolte { + sources { + test-browser-source-1 { + type = browser + prefix = /path1 + } + test-browser-source-2 { + type = browser + prefix = /path2 + } + } + + mappings.test = { + sources = [test-browser-source-1, test-browser-source-2] + // Need at least one sink. + sinks = [hdfs, kafka] + } +} From bcbce8053de7c8fcadba796a1027e7ebaf79add7 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 12:01:37 +0100 Subject: [PATCH 21/80] Convert method to be static. It's invoked by the constructor, and constructors shouldn't normally invoke instance methods because they might not expect the instance to be partially constructed. --- src/main/java/io/divolte/server/Server.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index d741ab36..c067e1f8 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -107,9 +107,9 @@ private static TrackingJavaScriptResource loadTrackingJavaScript(final Validated } } - private HttpHandler createStaticResourceHandler() { + private static HttpHandler createStaticResourceHandler() { final ResourceManager staticResources = - new ClassPathResourceManager(getClass().getClassLoader(), "static"); + new ClassPathResourceManager(Server.class.getClassLoader(), "static"); // Cache tuning is copied from Undertow unit tests. final ResourceManager cachedResources = new CachingResourceManager(100, 65536, From 238dc12e494a44ecfc0c3d39830bdda08ab9d720 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 16:30:23 +0100 Subject: [PATCH 22/80] Minor optimisations during flushing: pre-size the arrays we're streaming into. 
--- src/main/java/io/divolte/server/kafka/KafkaFlusher.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/java/io/divolte/server/kafka/KafkaFlusher.java b/src/main/java/io/divolte/server/kafka/KafkaFlusher.java index 629985c7..fe1543ab 100644 --- a/src/main/java/io/divolte/server/kafka/KafkaFlusher.java +++ b/src/main/java/io/divolte/server/kafka/KafkaFlusher.java @@ -18,6 +18,7 @@ import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; +import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.Queue; @@ -87,7 +88,7 @@ public ProcessingDirective process(final Queue> batch) { batch.stream() .map(i -> i.payload) .map(this::buildRecord) - .collect(Collectors.toList()); + .collect(Collectors.toCollection(() -> new ArrayList<>(batchSize))); // Clear the messages now; on failure they'll be retried as part of our // pending operation. batch.clear(); @@ -123,10 +124,11 @@ private ProcessingDirective flush(final List> sendBatch(final List> batch) throws InterruptedException { // First start sending the messages. // (This will serialize them, determine the partition and then assign them to a per-partition buffer.) + final int batchSize = batch.size(); final List> sendResults = batch.stream() .map(producer::send) - .collect(Collectors.toList()); + .collect(Collectors.toCollection(() -> new ArrayList<>(batchSize))); // The producer will send the messages in the background. As of 0.8.x we can't // flush, but have to wait for that to occur based on the producer configuration. // (By default it will immediately flush, but users can override this.) @@ -137,7 +139,6 @@ private ImmutableList> sendBa // - A fatal error occurred. // (In addition, we can be interrupted due to shutdown.) final ImmutableList.Builder> remaining = ImmutableList.builder(); - final int batchSize = batch.size(); for (int i = 0; i < batchSize; ++i) { final Future result = sendResults.get(i); try { From 6a5cda9616471f47de99c1d1726e488d3141c145 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 16:31:07 +0100 Subject: [PATCH 23/80] Nullability annotations and enforcement. --- src/main/java/io/divolte/server/processing/Item.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/java/io/divolte/server/processing/Item.java b/src/main/java/io/divolte/server/processing/Item.java index 33017d87..96ab23c2 100644 --- a/src/main/java/io/divolte/server/processing/Item.java +++ b/src/main/java/io/divolte/server/processing/Item.java @@ -1,10 +1,14 @@ package io.divolte.server.processing; import java.nio.charset.StandardCharsets; +import java.util.Objects; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; +import javax.annotation.ParametersAreNonnullByDefault; + +@ParametersAreNonnullByDefault public final class Item { public final int sourceId; public final int affinityHash; @@ -16,7 +20,7 @@ public final class Item { private Item(final int sourceId, final int affinityHash, final E payload) { this.sourceId = sourceId; this.affinityHash = affinityHash; - this.payload = payload; + this.payload = Objects.requireNonNull(payload); } public static Item of(final int sourceId, final String key, final E payload) { From 28f5b0c46b36cbe51a5f87b9b9e8c503378f7204 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 16:31:30 +0100 Subject: [PATCH 24/80] Static references are normally UPPER_CASE. 
--- src/main/java/io/divolte/server/processing/Item.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/io/divolte/server/processing/Item.java b/src/main/java/io/divolte/server/processing/Item.java index 96ab23c2..aa9d6e68 100644 --- a/src/main/java/io/divolte/server/processing/Item.java +++ b/src/main/java/io/divolte/server/processing/Item.java @@ -14,8 +14,7 @@ public final class Item { public final int affinityHash; public final E payload; - private static final HashFunction hasher = Hashing.murmur3_32(42); - + private static final HashFunction HASHER = Hashing.murmur3_32(42); private Item(final int sourceId, final int affinityHash, final E payload) { this.sourceId = sourceId; @@ -27,7 +26,7 @@ public static Item of(final int sourceId, final String key, final E paylo return new Item<>( sourceId, // making sure the hash result is non-negative by masking with max int - hasher.hashString(key, StandardCharsets.UTF_8).asInt() & Integer.MAX_VALUE, + HASHER.hashString(key, StandardCharsets.UTF_8).asInt() & Integer.MAX_VALUE, payload); } From 98c55b206e58d933d1bf3b3c4c7280957803418b Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 24 Dec 2015 16:31:45 +0100 Subject: [PATCH 25/80] Whitespace. --- src/main/java/io/divolte/server/processing/Item.java | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/main/java/io/divolte/server/processing/Item.java b/src/main/java/io/divolte/server/processing/Item.java index aa9d6e68..1124d48e 100644 --- a/src/main/java/io/divolte/server/processing/Item.java +++ b/src/main/java/io/divolte/server/processing/Item.java @@ -30,10 +30,7 @@ public static Item of(final int sourceId, final String key, final E paylo payload); } - public static Item withCopiedAffinity(final int sourceId, final Item affinitySource,final E payload) { - return new Item<>( - sourceId, - affinitySource.affinityHash, - payload); + public static Item withCopiedAffinity(final int sourceId, final Item affinitySource, final E payload) { + return new Item<>(sourceId, affinitySource.affinityHash, payload); } } From bb4cecdbe06239931cae0b90e297c277e4650a10 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 29 Dec 2015 15:00:30 +0100 Subject: [PATCH 26/80] No need for either of these tests to reference the Kafka sink. --- src/test/resources/browser-source-long-prefix.conf | 2 +- src/test/resources/browser-source-multiple.conf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/resources/browser-source-long-prefix.conf b/src/test/resources/browser-source-long-prefix.conf index 21ce7afa..4527cab1 100644 --- a/src/test/resources/browser-source-long-prefix.conf +++ b/src/test/resources/browser-source-long-prefix.conf @@ -24,6 +24,6 @@ divolte { mappings.test = { sources = [test-browser-source] // Need at least one sink. - sinks = [hdfs, kafka] + sinks = [hdfs] } } diff --git a/src/test/resources/browser-source-multiple.conf b/src/test/resources/browser-source-multiple.conf index beda10ab..54307237 100644 --- a/src/test/resources/browser-source-multiple.conf +++ b/src/test/resources/browser-source-multiple.conf @@ -29,6 +29,6 @@ divolte { mappings.test = { sources = [test-browser-source-1, test-browser-source-2] // Need at least one sink. - sinks = [hdfs, kafka] + sinks = [hdfs] } } From 46951c7f9287737029019d6b42ab997d9d6aa0dc Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 29 Dec 2015 15:42:07 +0100 Subject: [PATCH 27/80] Support for multiple HDFS and Kafka sinks. 
Changes include: - Sink configurations now need to supply a factory for creating the sink. - The optional GeoIP service is now passed around as such. It's a bit weird why it was nullable in some places before. - There's now a central point for loading schemas. This is needed for schema inference with sinks and ensures schemas are only loaded once. - Some utilities for collecting streams into Guava's immutable collections. - Processing pools are now shutdown by the central server, instead of recursively via the mapper. --- .../server/ClientSideCookieEventHandler.java | 40 ++++---- .../io/divolte/server/EventForwarder.java | 21 +++++ .../server/IncomingRequestProcessingPool.java | 93 +++++++------------ .../server/IncomingRequestProcessor.java | 57 ++++-------- .../io/divolte/server/MoreCollectors.java | 35 +++++++ .../io/divolte/server/SchemaRegistry.java | 92 ++++++++++++++++++ src/main/java/io/divolte/server/Server.java | 44 ++++++++- .../server/config/DivolteConfiguration.java | 47 +++++----- .../server/config/HdfsSinkConfiguration.java | 6 ++ .../server/config/KafkaSinkConfiguration.java | 6 ++ .../server/config/SinkConfiguration.java | 14 +++ .../io/divolte/server/hdfs/HdfsFlusher.java | 9 +- .../divolte/server/hdfs/HdfsFlushingPool.java | 31 ++++--- .../hdfs/SimpleRollingFileStrategy.java | 10 +- .../server/kafka/KafkaFlushingPool.java | 27 +++--- .../divolte/server/hdfs/HdfsFlusherTest.java | 2 +- 16 files changed, 356 insertions(+), 178 deletions(-) create mode 100644 src/main/java/io/divolte/server/EventForwarder.java create mode 100644 src/main/java/io/divolte/server/MoreCollectors.java create mode 100644 src/main/java/io/divolte/server/SchemaRegistry.java diff --git a/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java b/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java index aeb453db..8d30f591 100644 --- a/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java +++ b/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java @@ -16,30 +16,13 @@ package io.divolte.server; -import java.io.IOException; -import java.net.InetSocketAddress; -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; -import java.util.Deque; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.SortedMap; -import java.util.TreeMap; - -import javax.annotation.Nullable; -import javax.annotation.ParametersAreNonnullByDefault; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; import com.google.common.base.Strings; +import com.google.common.collect.ImmutableList; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import com.google.common.io.Resources; - import io.divolte.server.mincode.MincodeFactory; import io.divolte.server.processing.Item; import io.undertow.server.HttpHandler; @@ -48,6 +31,16 @@ import io.undertow.util.ETagUtils; import io.undertow.util.Headers; import io.undertow.util.StatusCodes; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.net.InetSocketAddress; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.*; @ParametersAreNonnullByDefault public final class ClientSideCookieEventHandler implements HttpHandler { @@ -57,7 +50,7 @@ public final class 
ClientSideCookieEventHandler implements HttpHandler { private final static String SENTINEL_ETAG_VALUE = SENTINEL_ETAG.toString(); private final ByteBuffer transparentImage; - protected final IncomingRequestProcessingPool processingPool; + private final EventForwarder processingPools; private static final String TRUE_STRING = "t"; @@ -83,8 +76,13 @@ public final class ClientSideCookieEventHandler implements HttpHandler { static final String EVENT_SOURCE_NAME = "browser"; + @Deprecated public ClientSideCookieEventHandler(final IncomingRequestProcessingPool processingPool) { - this.processingPool = Objects.requireNonNull(processingPool); + this(new EventForwarder<>(ImmutableList.of(processingPool))); + } + + public ClientSideCookieEventHandler(final EventForwarder processingPools) { + this.processingPools = Objects.requireNonNull(processingPools); try { this.transparentImage = ByteBuffer.wrap( @@ -183,7 +181,7 @@ private void handleRequestIfComplete(final HttpServerExchange exchange) throws I isNewPartyId, isFirstInSession, exchange); logger.debug("Enqueuing event (client generated cookies): {}/{}/{}/{}", partyId, sessionId, pageViewId, eventId); - processingPool.enqueue(Item.of(0, partyId.value, event)); + processingPools.forward(Item.of(0, partyId.value, event)); } static DivolteEvent buildBrowserEventData(final boolean corruptEvent, diff --git a/src/main/java/io/divolte/server/EventForwarder.java b/src/main/java/io/divolte/server/EventForwarder.java new file mode 100644 index 00000000..b9b7935f --- /dev/null +++ b/src/main/java/io/divolte/server/EventForwarder.java @@ -0,0 +1,21 @@ +package io.divolte.server; + +import com.google.common.collect.ImmutableList; +import io.divolte.server.processing.Item; +import io.divolte.server.processing.ProcessingPool; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.util.Objects; + +@ParametersAreNonnullByDefault +public final class EventForwarder { + private final ImmutableList> receivers; + + public EventForwarder(final ImmutableList> receivers) { + this.receivers = Objects.requireNonNull(receivers); + } + + public void forward(final Item event) { + receivers.forEach(receiver -> receiver.enqueue(event)); + } +} diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java index dfcd4280..45095222 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java @@ -16,86 +16,68 @@ package io.divolte.server; -import java.io.File; -import java.io.IOException; -import java.nio.file.Paths; -import java.util.Optional; - -import javax.annotation.Nullable; -import javax.annotation.ParametersAreNonnullByDefault; - -import org.apache.avro.Schema; -import org.apache.avro.Schema.Parser; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import io.divolte.record.DefaultEventRecord; +import com.google.common.collect.ImmutableSet; import io.divolte.server.config.ValidatedConfiguration; -import io.divolte.server.hdfs.HdfsFlushingPool; import io.divolte.server.ip2geo.ExternalDatabaseLookupService; import io.divolte.server.ip2geo.LookupService; -import io.divolte.server.kafka.KafkaFlushingPool; import io.divolte.server.processing.ProcessingPool; +import org.apache.avro.Schema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.nio.file.Paths; +import 
java.util.Optional; +import java.util.function.Function; @ParametersAreNonnullByDefault final class IncomingRequestProcessingPool extends ProcessingPool { private final static Logger logger = LoggerFactory.getLogger(IncomingRequestProcessingPool.class); - private final Optional kafkaPool; - private final Optional hdfsPool; - - public IncomingRequestProcessingPool(final ValidatedConfiguration vc, final IncomingRequestListener listener) { + public IncomingRequestProcessingPool(final ValidatedConfiguration vc, + final String name, + final SchemaRegistry schemaRegistry, + final Function>> sinkProvider, + final IncomingRequestListener listener) { this ( vc.configuration().global.mapper.threads, vc.configuration().global.mapper.bufferSize, vc, - schemaFromConfig(vc), - vc.configuration().global.kafka.enabled ? new KafkaFlushingPool(vc) : null, - vc.configuration().global.hdfs.enabled ? new HdfsFlushingPool(vc, schemaFromConfig(vc)) : null, + schemaRegistry.getSchemaByMappingName(name), + buildSinksForwarder(sinkProvider, vc.configuration().mappings.get(name).sinks), lookupServiceFromConfig(vc), listener ); } + private static EventForwarder buildSinksForwarder(final Function>> sinkProvider, + final ImmutableSet sinkNames) { + // Some sinks may not be available via the provider: these have been globally disabled. + return new EventForwarder<>(sinkNames.stream() + .map(sinkProvider::apply) + .filter(Optional::isPresent) + .map(Optional::get) + .collect(MoreCollectors.toImmutableList())); + } + public IncomingRequestProcessingPool( final int numThreads, final int maxQueueSize, final ValidatedConfiguration vc, final Schema schema, - @Nullable final KafkaFlushingPool kafkaFlushingPool, - @Nullable final HdfsFlushingPool hdfsFlushingPool, - @Nullable final LookupService geoipLookupService, + final EventForwarder flushingPools, + final Optional geoipLookupService, final IncomingRequestListener listener) { super( numThreads, maxQueueSize, "Incoming Request Processor", - () -> new IncomingRequestProcessor(vc, kafkaFlushingPool, hdfsFlushingPool, geoipLookupService, schema, listener)); - - this.kafkaPool = Optional.ofNullable(kafkaFlushingPool); - this.hdfsPool = Optional.ofNullable(hdfsFlushingPool); + () -> new IncomingRequestProcessor(vc, flushingPools, geoipLookupService, schema, listener)); } - private static Schema schemaFromConfig(final ValidatedConfiguration vc) { - return vc.configuration().incomingRequestProcessor.schemaFile - .map((schemaFileName) -> { - final Parser parser = new Schema.Parser(); - logger.info("Using Avro schema from configuration: {}", schemaFileName); - try { - return parser.parse(new File(schemaFileName)); - } catch(final IOException ioe) { - logger.error("Failed to load Avro schema file."); - throw new RuntimeException("Failed to load Avro schema file.", ioe); - } - }) - .orElseGet(() -> { - logger.info("Using built in default Avro schema."); - return DefaultEventRecord.getClassSchema(); - }); - } - - @Nullable - private static LookupService lookupServiceFromConfig(final ValidatedConfiguration vc) { + private static Optional lookupServiceFromConfig(final ValidatedConfiguration vc) { + // XXX: This service should be a singleton, instead of per-pool. 
return vc.configuration().global.mapper.ip2geoDatabase .map((path) -> { try { @@ -104,15 +86,6 @@ private static LookupService lookupServiceFromConfig(final ValidatedConfiguratio logger.error("Failed to configure GeoIP database: " + path, e); throw new RuntimeException("Failed to configure GeoIP lookup service.", e); } - }) - .orElse(null); - } - - @Override - public void stop() { - super.stop(); - - kafkaPool.ifPresent(KafkaFlushingPool::stop); - hdfsPool.ifPresent(HdfsFlushingPool::stop); + }); } } diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessor.java b/src/main/java/io/divolte/server/IncomingRequestProcessor.java index ab96a98e..4c99be71 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessor.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessor.java @@ -16,34 +16,26 @@ package io.divolte.server; -import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; - -import java.util.Objects; -import java.util.Optional; - -import javax.annotation.Nullable; -import javax.annotation.ParametersAreNonnullByDefault; - -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import io.divolte.record.DefaultEventRecord; import io.divolte.server.config.ValidatedConfiguration; -import io.divolte.server.hdfs.HdfsFlusher; -import io.divolte.server.hdfs.HdfsFlushingPool; import io.divolte.server.ip2geo.LookupService; -import io.divolte.server.kafka.KafkaFlusher; -import io.divolte.server.kafka.KafkaFlushingPool; import io.divolte.server.processing.Item; import io.divolte.server.processing.ItemProcessor; -import io.divolte.server.processing.ProcessingPool; import io.divolte.server.recordmapping.DslRecordMapper; import io.divolte.server.recordmapping.DslRecordMapping; import io.divolte.server.recordmapping.RecordMapper; import io.divolte.server.recordmapping.UserAgentParserAndCache; import io.undertow.util.AttachmentKey; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.util.Objects; +import java.util.Optional; + +import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.CONTINUE; @ParametersAreNonnullByDefault public final class IncomingRequestProcessor implements ItemProcessor { @@ -51,10 +43,7 @@ public final class IncomingRequestProcessor implements ItemProcessor DUPLICATE_EVENT_KEY = AttachmentKey.create(Boolean.class); - @Nullable - private final ProcessingPool kafkaFlushingPool; - @Nullable - private final ProcessingPool hdfsFlushingPool; + private final EventForwarder flushingPools; private final IncomingRequestListener listener; @@ -66,14 +55,11 @@ public final class IncomingRequestProcessor implements ItemProcessor flushingPools, + final Optional geoipLookupService, final Schema schema, final IncomingRequestListener listener) { - - this.kafkaFlushingPool = kafkaFlushingPool; - this.hdfsFlushingPool = hdfsFlushingPool; + this.flushingPools = flushingPools; this.listener = Objects.requireNonNull(listener); keepCorrupted = !vc.configuration().incomingRequestProcessor.discardCorrupted; @@ -84,7 +70,7 @@ public IncomingRequestProcessor(final ValidatedConfiguration vc, mapper = vc.configuration().incomingRequestProcessor.mappingScriptFile .map((mappingScriptFile) -> { logger.info("Using script based schema mapping."); - return new DslRecordMapper(vc, mappingScriptFile, 
Objects.requireNonNull(schema), Optional.ofNullable(geoipLookupService)); + return new DslRecordMapper(vc, mappingScriptFile, Objects.requireNonNull(schema), geoipLookupService); }).orElseGet(() -> { logger.info("Using built in default schema mapping."); @@ -147,20 +133,9 @@ public ProcessingDirective process(final Item item) { event.clientUtcOffset, avroRecord); listener.incomingRequest(event, avroBuffer, avroRecord); - doProcess(item, avroBuffer); + flushingPools.forward(Item.withCopiedAffinity(0, item, avroBuffer)); } } - return CONTINUE; } - - private void doProcess(final Item sourceItem, final AvroRecordBuffer avroBuffer) { - - if (null != kafkaFlushingPool) { - kafkaFlushingPool.enqueue(Item.withCopiedAffinity(0, sourceItem, avroBuffer)); - } - if (null != hdfsFlushingPool) { - hdfsFlushingPool.enqueue(Item.withCopiedAffinity(0, sourceItem, avroBuffer)); - } - } } diff --git a/src/main/java/io/divolte/server/MoreCollectors.java b/src/main/java/io/divolte/server/MoreCollectors.java new file mode 100644 index 00000000..11626322 --- /dev/null +++ b/src/main/java/io/divolte/server/MoreCollectors.java @@ -0,0 +1,35 @@ +package io.divolte.server; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; + +import java.util.Map; +import java.util.stream.Collector; + +public class MoreCollectors { + private MoreCollectors() { + // Prevent external instantiation. + } + + public static Collector, ImmutableList> toImmutableList() { + return Collector.of(ImmutableList.Builder::new, + ImmutableList.Builder::add, + (l, r) -> l.addAll(r.build()), + ImmutableList.Builder::build); + } + + public static Collector, ImmutableSet> toImmutableSet() { + return Collector.of(ImmutableSet.Builder::new, + ImmutableSet.Builder::add, + (l, r) -> l.addAll(r.build()), + ImmutableSet.Builder::build); + } + + public static Collector, ImmutableMap.Builder, ImmutableMap> toImmutableMap() { + return Collector.of(ImmutableMap.Builder::new, + ImmutableMap.Builder::put, + (l, r) -> l.putAll(r.build()), + ImmutableMap.Builder::build); + } +} diff --git a/src/main/java/io/divolte/server/SchemaRegistry.java b/src/main/java/io/divolte/server/SchemaRegistry.java new file mode 100644 index 00000000..0e92cf33 --- /dev/null +++ b/src/main/java/io/divolte/server/SchemaRegistry.java @@ -0,0 +1,92 @@ +package io.divolte.server; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import io.divolte.record.DefaultEventRecord; +import io.divolte.server.config.MappingConfiguration; +import io.divolte.server.config.ValidatedConfiguration; +import org.apache.avro.Schema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.File; +import java.io.IOException; +import java.util.Optional; + +@ParametersAreNonnullByDefault +public class SchemaRegistry { + private static final Logger logger = LoggerFactory.getLogger(SchemaRegistry.class); + + private final ImmutableMap schemasByMappingName; + private final ImmutableMap schemasBySinkName; + + public SchemaRegistry(final ValidatedConfiguration vc) { + final ImmutableMap mappings = vc.configuration().mappings; + + // Build a mapping of the schema location for each mapping. + final ImmutableMap> schemaLocationsByMapping = + ImmutableMap.copyOf(Maps.transformValues(mappings, config -> config.schemaFile)); + + // Load the actual schemas. Once. 
+ logger.debug("Loading schemas for mappings: {}", schemaLocationsByMapping.keySet()); + final ImmutableMap,Schema> schemasByLocation = + schemaLocationsByMapping.values() + .stream() + .distinct() + .map(schemaLocation -> + Maps.immutableEntry(schemaLocation, loadSchema(schemaLocation))) + .collect(MoreCollectors.toImmutableMap()); + + // Store the schema for each mapping. + schemasByMappingName = + ImmutableMap.copyOf(Maps.transformValues(schemaLocationsByMapping, schemasByLocation::get)); + logger.info("Loaded schemas used for mappings: {}", schemasByMappingName.keySet()); + + // Also calculate an inverse mapping by sink name. + // (Validation will ensure that multiple mappings for each sink have the same value.) + schemasBySinkName = + mappings.values() + .stream() + .flatMap(config -> config.sinks + .stream() + .map(sink -> + Maps.immutableEntry(sink, + schemasByLocation.get(config.schemaFile)))) + .collect(MoreCollectors.toImmutableMap()); + logger.info("Inferred schemas used for sinks: {}", schemasBySinkName.keySet()); + } + + public Schema getSchemaByMappingName(final String mappingName) { + final Schema schema = schemasByMappingName.get(mappingName); + Preconditions.checkArgument(null != schema, "Illegal mapping name: %s", mappingName); + return schema; + } + + public Schema getSchemaBySinkName(final String sinkName) { + final Schema schema = schemasBySinkName.get(sinkName); + // This means that the sink either doesn't exist, or isn't associated with a mapping. + // (Without a mapping, we can't infer the schema.) + Preconditions.checkArgument(null != schema, "Illegal sink name: %s", sinkName); + return schema; + } + + private static Schema loadSchema(final Optional schemaLocation) { + return schemaLocation + .map(filename -> { + final Schema.Parser parser = new Schema.Parser(); + logger.info("Loading Avro schema from path: {}", filename); + try { + return parser.parse(new File(filename)); + } catch(final IOException ioe) { + logger.error("Failed to load Avro schema file."); + throw new RuntimeException("Failed to load Avro schema file.", ioe); + } + }) + .orElseGet(() -> { + logger.info("Using builtin default Avro schema."); + return DefaultEventRecord.getClassSchema(); + }); + } +} diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index c067e1f8..2b73e9fd 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -16,10 +16,17 @@ package io.divolte.server; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import com.google.common.collect.Maps; import com.typesafe.config.ConfigFactory; import io.divolte.server.config.BrowserSourceConfiguration; +import io.divolte.server.config.HdfsSinkConfiguration; +import io.divolte.server.config.KafkaSinkConfiguration; import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.js.TrackingJavaScriptResource; +import io.divolte.server.processing.ProcessingPool; import io.undertow.Undertow; import io.undertow.server.HttpHandler; import io.undertow.server.handlers.CanonicalPathHandler; @@ -40,6 +47,7 @@ import javax.annotation.ParametersAreNonnullByDefault; import java.io.IOException; import java.time.Duration; +import java.util.Map; import java.util.Optional; @ParametersAreNonnullByDefault @@ -49,6 +57,8 @@ public final class Server implements Runnable { private final Undertow undertow; private final GracefulShutdownHandler 
shutdownHandler; + private final ImmutableMap> sinks; + private final IncomingRequestProcessingPool processingPool; private final Optional host; @@ -62,7 +72,37 @@ public Server(final ValidatedConfiguration vc) { host = vc.configuration().global.server.host; port = vc.configuration().global.server.port; - processingPool = new IncomingRequestProcessingPool(vc, listener); + // First thing we need to do is load all the schemas: the sinks need these, but they come from the + // mappings. + final SchemaRegistry schemaRegistry = new SchemaRegistry(vc); + + // Build a set of referenced sinks. These are the only ones we need to instantiate. + final ImmutableSet referencedSinkNames = + vc.configuration().mappings.values() + .stream() + .flatMap(mc -> mc.sinks.stream()) + .collect(MoreCollectors.toImmutableSet()); + + // Instantiate the active sinks: + // - As a practical matter, unreferenced sinks have no associated schema, which means they + // can't be initialized. + // - This is also where we check whether HDFS and Kafka are globally enabled/disabled. + logger.debug("Initializing active sinks..."); + sinks = vc.configuration().sinks.entrySet() + .stream() + .filter(sink -> referencedSinkNames.contains(sink.getKey())) + .filter(sink -> vc.configuration().global.hdfs.enabled || !(sink.getValue() instanceof HdfsSinkConfiguration)) + .filter(sink -> vc.configuration().global.kafka.enabled || !(sink.getValue() instanceof KafkaSinkConfiguration)) + .>>map(sink -> + Maps.immutableEntry(sink.getKey(), + sink.getValue() + .getFactory() + .create(vc, sink.getKey(), schemaRegistry))) + .collect(MoreCollectors.toImmutableMap()); + logger.info("Initialized sinks: {}", sinks.keySet()); + + final String mappingName = Iterables.get(vc.configuration().mappings.keySet(), 0); + processingPool = new IncomingRequestProcessingPool(vc, mappingName, schemaRegistry, name -> Optional.ofNullable(sinks.get(name)), listener); PathHandler handler = new PathHandler(); for (final String name : vc.configuration().sources.keySet()) { final ClientSideCookieEventHandler clientSideCookieEventHandler = @@ -140,7 +180,9 @@ public void shutdown() { } logger.info("Stopping thread pools."); + // Stop the mappings before the sinks to ensure work in progress doesn't get stranded. 
processingPool.stop(); + sinks.values().forEach(ProcessingPool::stop); logger.info("Closing HDFS filesystem connection."); try { diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index 58707ed0..8c093c1a 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -1,67 +1,56 @@ package io.divolte.server.config; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import javax.annotation.ParametersAreNonnullByDefault; -import javax.validation.Valid; - import com.fasterxml.jackson.annotation.JsonCreator; import com.google.common.base.MoreObjects; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; -import com.google.common.collect.Iterators; import com.google.common.collect.Sets; - import io.divolte.server.config.constraint.MappingSourceSinkReferencesMustExist; import io.divolte.server.config.constraint.OneSchemaPerSink; import io.divolte.server.config.constraint.SourceAndSinkNamesCannotCollide; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.ParametersAreNonnullByDefault; +import javax.validation.Valid; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; @ParametersAreNonnullByDefault @MappingSourceSinkReferencesMustExist @SourceAndSinkNamesCannotCollide @OneSchemaPerSink public final class DivolteConfiguration { + private static Logger logger = LoggerFactory.getLogger(MappingConfiguration.class); + @Valid public final GlobalConfiguration global; + + // Mappings, sources and sinks are all keyed by their name. + @Valid public final ImmutableMap mappings; @Valid public final ImmutableMap sources; @Valid public final ImmutableMap sinks; - @Valid public final ImmutableMap mappings; @Deprecated public final MappingConfiguration incomingRequestProcessor; - @Deprecated - public final KafkaSinkConfiguration kafkaFlusher; - @Deprecated - public final HdfsSinkConfiguration hdfsFlusher; @JsonCreator DivolteConfiguration(final GlobalConfiguration global, final Optional> sources, final Optional> sinks, final Optional> mappings) { + this.global = Objects.requireNonNull(global); this.sources = sources.orElseGet(DivolteConfiguration::defaultSourceConfigurations); this.sinks = sinks.orElseGet(DivolteConfiguration::defaultSinkConfigurations); this.mappings = mappings.orElseGet(() -> defaultMappingConfigurations(this.sources.keySet(), this.sinks.keySet())); - this.global = Objects.requireNonNull(global); // Temporary interop this.incomingRequestProcessor = Iterables.get(this.mappings.values(), 0); - this.kafkaFlusher = (KafkaSinkConfiguration) Iterators.get(this.sinks.values().stream().filter((sink) -> sink instanceof KafkaSinkConfiguration).iterator(), 0); - this.hdfsFlusher = (HdfsSinkConfiguration) Iterators.get(this.sinks.values().stream().filter((sink) -> sink instanceof HdfsSinkConfiguration).iterator(), 0); // TODO: Optimizations: // - Elide HDFS and Kafka sinks if they are globally disabled. - // - Elide unreferenced sources and sinks. 
} public BrowserSourceConfiguration getBrowserSourceConfiguration(final String sourceName) { @@ -72,6 +61,14 @@ public BrowserSourceConfiguration getBrowserSourceConfiguration(final String sou return (BrowserSourceConfiguration)sourceConfiguration; } + public T getSinkConfiguration(final String sinkName, final Class sinkClass) { + final SinkConfiguration sinkConfiguration = sinks.get(sinkName); + Objects.requireNonNull(sinkConfiguration, () -> "No sink configuration with name: " + sinkName); + Preconditions.checkArgument(sinkClass.isInstance(sinkConfiguration), + "Sink configuration '%s' is not a %s sink", sinkName, sinkClass.getSimpleName()); + return sinkClass.cast(sinkConfiguration); + } + // Defaults; these will eventually disappear private static ImmutableMap defaultSourceConfigurations() { return ImmutableMap.of("browser", BrowserSourceConfiguration.DEFAULT_BROWSER_SOURCE_CONFIGURATION); diff --git a/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java b/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java index ef6097b5..0088d2f3 100644 --- a/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java +++ b/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java @@ -3,6 +3,7 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.MoreObjects; +import io.divolte.server.hdfs.HdfsFlushingPool; import javax.annotation.ParametersAreNonnullByDefault; import java.util.Optional; @@ -34,4 +35,9 @@ protected MoreObjects.ToStringHelper toStringHelper() { .add("replication", replication) .add("fileStrategy", fileStrategy); } + + @Override + public SinkFactory getFactory() { + return HdfsFlushingPool::new; + } } diff --git a/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java b/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java index 13c83f4e..843ed7c1 100644 --- a/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java +++ b/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java @@ -5,6 +5,7 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.MoreObjects; +import io.divolte.server.kafka.KafkaFlushingPool; @ParametersAreNonnullByDefault public class KafkaSinkConfiguration extends SinkConfiguration { @@ -22,4 +23,9 @@ public class KafkaSinkConfiguration extends SinkConfiguration { protected MoreObjects.ToStringHelper toStringHelper() { return super.toStringHelper().add("topic", topic); } + + @Override + public SinkFactory getFactory() { + return KafkaFlushingPool::new; + } } diff --git a/src/main/java/io/divolte/server/config/SinkConfiguration.java b/src/main/java/io/divolte/server/config/SinkConfiguration.java index afb22d54..f9398d34 100644 --- a/src/main/java/io/divolte/server/config/SinkConfiguration.java +++ b/src/main/java/io/divolte/server/config/SinkConfiguration.java @@ -1,8 +1,12 @@ package io.divolte.server.config; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonSubTypes; import com.fasterxml.jackson.annotation.JsonTypeInfo; import com.google.common.base.MoreObjects; +import io.divolte.server.AvroRecordBuffer; +import io.divolte.server.SchemaRegistry; +import io.divolte.server.processing.ProcessingPool; import javax.annotation.ParametersAreNonnullByDefault; @@ -21,4 +25,14 @@ protected MoreObjects.ToStringHelper toStringHelper() { public final String toString() { return 
toStringHelper().toString(); } + + @JsonIgnore + public abstract SinkFactory getFactory(); + + @FunctionalInterface + public interface SinkFactory { + ProcessingPool create(ValidatedConfiguration configuration, + String sinkName, + SchemaRegistry schemaRegistry); + } } diff --git a/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java b/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java index 2d065d36..f530459b 100644 --- a/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java +++ b/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java @@ -25,6 +25,7 @@ import javax.annotation.ParametersAreNonnullByDefault; import javax.annotation.concurrent.NotThreadSafe; +import io.divolte.server.config.HdfsSinkConfiguration; import org.apache.avro.Schema; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -45,7 +46,7 @@ public final class HdfsFlusher implements ItemProcessor { private final FileCreateAndSyncStrategy fileStrategy; private HdfsOperationResult lastHdfsResult; - public HdfsFlusher(final ValidatedConfiguration vc, final Schema schema) { + public HdfsFlusher(final ValidatedConfiguration vc, final String name, final Schema schema) { Objects.requireNonNull(vc); final Configuration hdfsConfiguration = vc.configuration().global.hdfs.client @@ -77,9 +78,11 @@ public HdfsFlusher(final ValidatedConfiguration vc, final Schema schema) { logger.error("Could not initialize HDFS filesystem.", e); throw new RuntimeException("Could not initialize HDFS filesystem", e); } - final short hdfsReplication = vc.configuration().hdfsFlusher.replication; + final short hdfsReplication = + vc.configuration() + .getSinkConfiguration(Objects.requireNonNull(name), HdfsSinkConfiguration.class).replication; - fileStrategy = new SimpleRollingFileStrategy(vc, hadoopFs, hdfsReplication, Objects.requireNonNull(schema)); + fileStrategy = new SimpleRollingFileStrategy(vc, name, hadoopFs, hdfsReplication, Objects.requireNonNull(schema)); lastHdfsResult = fileStrategy.setup(); } diff --git a/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java b/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java index 238322f4..47c4c733 100644 --- a/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java +++ b/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java @@ -17,6 +17,7 @@ package io.divolte.server.hdfs; import io.divolte.server.AvroRecordBuffer; +import io.divolte.server.SchemaRegistry; import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.processing.ProcessingPool; @@ -28,20 +29,24 @@ @ParametersAreNonnullByDefault public final class HdfsFlushingPool extends ProcessingPool{ - public HdfsFlushingPool(final ValidatedConfiguration vc, final Schema schema) { - this( - Objects.requireNonNull(vc), - Objects.requireNonNull(schema), - vc.configuration().global.hdfs.threads, - vc.configuration().global.hdfs.bufferSize - ); + public HdfsFlushingPool(final ValidatedConfiguration vc, + final String name, + final SchemaRegistry schemaRegistry) { + this(vc, + name, + schemaRegistry.getSchemaBySinkName(name), + vc.configuration().global.hdfs.threads, + vc.configuration().global.hdfs.bufferSize); } - public HdfsFlushingPool(final ValidatedConfiguration vc, final Schema schema, final int numThreads, final int maxQueueSize) { - super( - numThreads, - maxQueueSize, - "Hdfs Flusher", - () -> new HdfsFlusher(vc, schema)); + public HdfsFlushingPool(final ValidatedConfiguration vc, + final String name, + final Schema schema, + final int numThreads, + final int 
maxQueueSize) { + super(numThreads, + maxQueueSize, + String.format("Hdfs Flusher [%s]", Objects.requireNonNull(name)), + () -> new HdfsFlusher(Objects.requireNonNull(vc), name, Objects.requireNonNull(schema))); } } diff --git a/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java b/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java index 42da1a6b..01ae78cb 100644 --- a/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java +++ b/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java @@ -31,6 +31,7 @@ import javax.annotation.ParametersAreNonnullByDefault; import javax.annotation.concurrent.NotThreadSafe; +import io.divolte.server.config.HdfsSinkConfiguration; import org.apache.avro.Schema; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericDatumWriter; @@ -74,11 +75,16 @@ public class SimpleRollingFileStrategy implements FileCreateAndSyncStrategy { private boolean isHdfsAlive; private long lastFixAttempt; - public SimpleRollingFileStrategy(final ValidatedConfiguration vc, final FileSystem fs, final short hdfsReplication, final Schema schema) { + public SimpleRollingFileStrategy(final ValidatedConfiguration vc, + final String name, + final FileSystem fs, + final short hdfsReplication, + final Schema schema) { Objects.requireNonNull(vc); this.schema = Objects.requireNonNull(schema); - final FileStrategyConfiguration fileStrategyConfiguration = vc.configuration().hdfsFlusher.fileStrategy; + final FileStrategyConfiguration fileStrategyConfiguration = + vc.configuration().getSinkConfiguration(name, HdfsSinkConfiguration.class).fileStrategy; syncEveryMillis = fileStrategyConfiguration.syncFileAfterDuration.toMillis(); syncEveryRecords = fileStrategyConfiguration.syncFileAfterRecords; newFileEveryMillis = fileStrategyConfiguration.rollEvery.toMillis(); diff --git a/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java b/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java index 2396a9a0..0d02e10b 100644 --- a/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java +++ b/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java @@ -16,39 +16,44 @@ package io.divolte.server.kafka; -import java.util.Objects; - -import javax.annotation.ParametersAreNonnullByDefault; - -import org.apache.kafka.clients.producer.KafkaProducer; -import org.apache.kafka.clients.producer.Producer; - import io.divolte.server.AvroRecordBuffer; import io.divolte.server.DivolteIdentifier; +import io.divolte.server.SchemaRegistry; +import io.divolte.server.config.KafkaSinkConfiguration; import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.processing.ProcessingPool; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.Producer; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.util.Objects; @ParametersAreNonnullByDefault public class KafkaFlushingPool extends ProcessingPool { private final Producer producer; - public KafkaFlushingPool(final ValidatedConfiguration vc) { + public KafkaFlushingPool(final ValidatedConfiguration vc, final String name, final SchemaRegistry ignored) { this( + name, vc.configuration().global.kafka.threads, vc.configuration().global.kafka.bufferSize, - vc.configuration().kafkaFlusher.topic, + vc.configuration().getSinkConfiguration(name, KafkaSinkConfiguration.class).topic, new KafkaProducer<>(vc.configuration().global.kafka.producer, new DivolteIdentifierSerializer(), new 
AvroRecordBufferSerializer()) ); } - public KafkaFlushingPool(final int numThreads, + public KafkaFlushingPool(final String name, + final int numThreads, final int maxWriteQueue, final String topic, final Producer producer ) { - super(numThreads, maxWriteQueue, "Kafka Flusher", () -> new KafkaFlusher(topic, producer)); + super(numThreads, + maxWriteQueue, + String.format("Kafka Flusher [%s]", Objects.requireNonNull(name)), + () -> new KafkaFlusher(topic, producer)); this.producer = Objects.requireNonNull(producer); } diff --git a/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java b/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java index 4ab21911..820f4de9 100644 --- a/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java +++ b/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java @@ -186,7 +186,7 @@ private void setupFlusher(final String rollEvery, final int recordCount) throws .build()) .collect(Collectors.toList()); - flusher = new HdfsFlusher(vc, schema); + flusher = new HdfsFlusher(vc, "hdfs", schema); } private void processRecords() { From f2523a585cb0434f0d4bf29772d969f7577d1483 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 29 Dec 2015 20:16:41 +0100 Subject: [PATCH 28/80] Fix test configurations, now that we elide disabled sinks. --- src/test/resources/base-test-server.conf | 7 +++++++ src/test/resources/reference-test.conf | 5 ----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/test/resources/base-test-server.conf b/src/test/resources/base-test-server.conf index bf6d36fd..a95655d1 100644 --- a/src/test/resources/base-test-server.conf +++ b/src/test/resources/base-test-server.conf @@ -16,6 +16,13 @@ // Specify a basic source/sink/mapping configuration that tests can use. divolte { + global { + // By default the flushers are disabled. Instead events are placed on + // a special queue for the tests to collect. + hdfs.enabled = false + kafka.enabled = false + } + sources.browser.type = browser sinks { diff --git a/src/test/resources/reference-test.conf b/src/test/resources/reference-test.conf index 15733c3b..5bee2d5a 100644 --- a/src/test/resources/reference-test.conf +++ b/src/test/resources/reference-test.conf @@ -26,10 +26,5 @@ divolte { buffer_size = 16 threads = 1 } - - // By default the flushers are disabled. Instead events are placed on - // a special queue for the tests to collect. - hdfs.enabled = false - kafka.enabled = false } } From a50bbab3909eb66295219f4aba7ef66d6026082a Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 29 Dec 2015 20:19:01 +0100 Subject: [PATCH 29/80] Some basic tests for multiple sinks. --- .../server/ServerSinkConfigurationTest.java | 229 ++++++++++++++++++ src/test/resources/hdfs-sink-explicit.conf | 33 +++ src/test/resources/hdfs-sink-multiple.conf | 40 +++ 3 files changed, 302 insertions(+) create mode 100644 src/test/java/io/divolte/server/ServerSinkConfigurationTest.java create mode 100644 src/test/resources/hdfs-sink-explicit.conf create mode 100644 src/test/resources/hdfs-sink-multiple.conf diff --git a/src/test/java/io/divolte/server/ServerSinkConfigurationTest.java b/src/test/java/io/divolte/server/ServerSinkConfigurationTest.java new file mode 100644 index 00000000..dfb50a8f --- /dev/null +++ b/src/test/java/io/divolte/server/ServerSinkConfigurationTest.java @@ -0,0 +1,229 @@ +/* + * Copyright 2015 GoDataDriven B.V. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.divolte.server; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.divolte.server.ServerTestUtils.TestServer; +import org.apache.avro.file.DataFileReader; +import org.apache.avro.file.FileReader; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.junit.After; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; +import java.util.function.Supplier; +import java.util.stream.Stream; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +@ParametersAreNonnullByDefault +public class ServerSinkConfigurationTest { + + private static final String BROWSER_EVENT_URL_TEMPLATE = + "http://localhost:%d/csc-event?" + + "p=0%%3Ai1t84hgy%%3A5AF359Zjq5kUy98u4wQjlIZzWGhN~GlG&" + + "s=0%%3Ai1t84hgy%%3A95CbiPCYln_1e0a6rFvuRkDkeNnc6KC8&" + + "v=0%%3A1fF6GFGjDOQiEx_OxnTm_tl4BH91eGLF&" + + "e=0%%3A1fF6GFGjDOQiEx_OxnTm_tl4BH91eGLF0&" + + "c=i1t8q2b6&" + + "n=f&" + + "f=f&" + + "l=http%%3A%%2F%%2Flocalhost%%3A8290%%2F&" + + "i=1ak&" + + "j=sj&" + + "k=2&" + + "w=uq&" + + "h=qd&" + + "t=pageView&" + + "x=si9804"; + + private final Set tempDirectories = new HashSet<>(); + private Optional testServer = Optional.empty(); + + private void startServer(final String configResource, + final ImmutableMap extraProperties) { + startServer(() -> new TestServer(configResource, extraProperties)); + } + + private void startServer() { + startServer(TestServer::new); + } + + private void startServer(final Supplier supplier) { + stopServer(); + testServer = Optional.of(supplier.get()); + } + + public void stopServer() { + testServer.ifPresent(testServer -> testServer.server.shutdown()); + testServer = Optional.empty(); + } + + public Path createTempDirectory() throws IOException { + final Path newTempDirectory = Files.createTempDirectory("divolte-test"); + tempDirectories.add(newTempDirectory); + return newTempDirectory; + } + + public void cleanupTempDirectories() { + tempDirectories.forEach(ServerSinkConfigurationTest::deleteRecursively); + tempDirectories.clear(); + } + + private void request() throws IOException { + final URL url = new URL(String.format(BROWSER_EVENT_URL_TEMPLATE, testServer.get().port)); + final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + assertEquals(200, conn.getResponseCode()); + } + + @ParametersAreNonnullByDefault + private static class AvroFileLocator { + private static final Logger logger = LoggerFactory.getLogger(AvroFileLocator.class); + + private final Path directory; + private final ImmutableSet existingFiles; + + private AvroFileLocator(final Path directory) throws IOException { + this.directory = 
Objects.requireNonNull(directory); + existingFiles = Files.list(directory) + .filter(AvroFileLocator::isAvroFile) + .collect(MoreCollectors.toImmutableSet()); + } + + private static boolean isAvroFile(final Path p) { + return p.toString().endsWith(".avro"); + } + + private static Stream listRecords(final Path avroFile) { + final GenericDatumReader datumReader = new GenericDatumReader<>(); + logger.debug("Reading records from new Avro file: {}", avroFile); + try (final FileReader fileReader = DataFileReader.openReader(avroFile.toFile(), datumReader)) { + final ImmutableList records = ImmutableList.copyOf(fileReader.iterator()); + logger.info("Read {} record(s) from new Avro file: {}", records.size(), avroFile); + return records.stream(); + } catch (final IOException e) { + throw new UncheckedIOException("Error reading records from file: " + avroFile, e); + } + } + + public Stream listNewRecords() throws IOException { + return Files.list(directory) + .filter(candidate -> isAvroFile(candidate) && !existingFiles.contains(candidate)) + .flatMap(AvroFileLocator::listRecords); + } + } + + @Test + public void shouldRegisterDefaultHdfsSink() throws IOException, InterruptedException { + // Test the default hdfs source that should be present by default. + startServer(); + final AvroFileLocator avroFileLocator = new AvroFileLocator(Paths.get("/tmp")); + request(); + testServer.get().waitForEvent(); + // Stopping the server flushes the HDFS files. + stopServer(); + // Now we can check the number of events that turned up in new files in /tmp. + assertEquals("Wrong number of new events logged to /tmp", + 1, avroFileLocator.listNewRecords().count()); + } + + @Test + public void shouldRegisterExplicitSinkOnly() throws IOException, InterruptedException { + // Test that if an explicit sink is supplied, the builtin defaults are not present. + final AvroFileLocator defaultAvroFileLocator = new AvroFileLocator(Paths.get("/tmp")); + final Path avroDirectory = createTempDirectory(); + startServer("hdfs-sink-explicit.conf", ImmutableMap.of( + "divolte.sinks.test-hdfs-sink.file_strategy.working_dir", avroDirectory.toString(), + "divolte.sinks.test-hdfs-sink.file_strategy.publish_dir", avroDirectory.toString() + )); + final AvroFileLocator explicitAvroFileLocator = new AvroFileLocator(avroDirectory); + request(); + testServer.get().waitForEvent(); + // Stopping the server flushes any HDFS files. + stopServer(); + // Now we can check: + // - The default location (/tmp) shouldn't have anything new. + // - Our explicit location should have a single record. + assertFalse("Default location (/tmp) shouldn't have any new logged events.", + defaultAvroFileLocator.listNewRecords().findFirst().isPresent()); + assertEquals("Wrong number of new events logged", + 1, explicitAvroFileLocator.listNewRecords().count()); + } + + @Test + public void shouldSupportMultipleSinks() throws IOException, InterruptedException { + // Test that multiple hdfs sinks are supported. 
+ final AvroFileLocator defaultAvroFileLocator = new AvroFileLocator(Paths.get("/tmp")); + final Path avroDirectory1 = createTempDirectory(); + final Path avroDirectory2 = createTempDirectory(); + startServer("hdfs-sink-multiple.conf", ImmutableMap.of( + "divolte.sinks.test-hdfs-sink-1.file_strategy.working_dir", avroDirectory1.toString(), + "divolte.sinks.test-hdfs-sink-1.file_strategy.publish_dir", avroDirectory1.toString(), + "divolte.sinks.test-hdfs-sink-2.file_strategy.working_dir", avroDirectory2.toString(), + "divolte.sinks.test-hdfs-sink-2.file_strategy.publish_dir", avroDirectory2.toString() + )); + final AvroFileLocator explicitAvroFileLocator1 = new AvroFileLocator(avroDirectory1); + final AvroFileLocator explicitAvroFileLocator2 = new AvroFileLocator(avroDirectory2); + request(); + testServer.get().waitForEvent(); + // Stopping the server flushes any HDFS files. + stopServer(); + // Now we can check: + // - The default location (/tmp) shouldn't have anything new. + // - Our locations should both have a single record. + assertFalse("Default location (/tmp) shouldn't have any new logged events.", + defaultAvroFileLocator.listNewRecords().findFirst().isPresent()); + assertEquals("Wrong number of new events logged in first location", + 1, explicitAvroFileLocator1.listNewRecords().count()); + assertEquals("Wrong number of new events logged in second location", + 1, explicitAvroFileLocator2.listNewRecords().count()); + } + + @After + public void tearDown() throws IOException { + stopServer(); + cleanupTempDirectories(); + } + + private static void deleteRecursively(final Path p) { + try (final Stream files = Files.walk(p).sorted(Comparator.reverseOrder())) { + files.forEachOrdered(path -> { + try { + Files.delete(path); + } catch (final IOException e) { + throw new UncheckedIOException("Error deleting file: " + path, e); + } + }); + } catch (final IOException e) { + throw new UncheckedIOException("Error recursively deleting directory: " + p, e); + } + } +} diff --git a/src/test/resources/hdfs-sink-explicit.conf b/src/test/resources/hdfs-sink-explicit.conf new file mode 100644 index 00000000..cfad7c46 --- /dev/null +++ b/src/test/resources/hdfs-sink-explicit.conf @@ -0,0 +1,33 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify a single explicit hdfs sink. +divolte { + sources.test-browser-source.type = browser + + sinks.test-hdfs-sink { + type = hdfs + file_strategy = { + // working_directory: supplied by test. + // publish_directory: supplied by test. + } + } + + mappings.test = { + sources = [test-browser-source] + sinks = [test-hdfs-sink] + } +} diff --git a/src/test/resources/hdfs-sink-multiple.conf b/src/test/resources/hdfs-sink-multiple.conf new file mode 100644 index 00000000..db32577e --- /dev/null +++ b/src/test/resources/hdfs-sink-multiple.conf @@ -0,0 +1,40 @@ +// +// Copyright 2015 GoDataDriven B.V. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify multiple HDFS sinks for a mapping. +divolte { + sources.test-browser-source.type = browser + + sinks.test-hdfs-sink-1 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + sinks.test-hdfs-sink-2 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + + mappings.test = { + sources = [test-browser-source] + sinks = [test-hdfs-sink-1, test-hdfs-sink-2] + } +} From 756b9bc0277d27d02922f4f32a708b3ad32043ec Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 29 Dec 2015 20:29:13 +0100 Subject: [PATCH 30/80] Mark utility class as final. --- src/main/java/io/divolte/server/MoreCollectors.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/io/divolte/server/MoreCollectors.java b/src/main/java/io/divolte/server/MoreCollectors.java index 11626322..f8115562 100644 --- a/src/main/java/io/divolte/server/MoreCollectors.java +++ b/src/main/java/io/divolte/server/MoreCollectors.java @@ -7,7 +7,7 @@ import java.util.Map; import java.util.stream.Collector; -public class MoreCollectors { +public final class MoreCollectors { private MoreCollectors() { // Prevent external instantiation. } From 4f71734e98bf60d205e4857825fb2d122416c1e2 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 29 Dec 2015 22:58:11 +0100 Subject: [PATCH 31/80] Remove an unused logger. --- .../java/io/divolte/server/config/DivolteConfiguration.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index 8c093c1a..66ae9f93 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -10,8 +10,6 @@ import io.divolte.server.config.constraint.MappingSourceSinkReferencesMustExist; import io.divolte.server.config.constraint.OneSchemaPerSink; import io.divolte.server.config.constraint.SourceAndSinkNamesCannotCollide; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import javax.annotation.ParametersAreNonnullByDefault; import javax.validation.Valid; @@ -24,8 +22,6 @@ @SourceAndSinkNamesCannotCollide @OneSchemaPerSink public final class DivolteConfiguration { - private static Logger logger = LoggerFactory.getLogger(MappingConfiguration.class); - @Valid public final GlobalConfiguration global; // Mappings, sources and sinks are all keyed by their name. From 62115693b68f3255a5d9e116bc3eed740a14e85f Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 29 Dec 2015 23:02:47 +0100 Subject: [PATCH 32/80] Fix incompatible-schema detection for shared sinks. The associated tests have also been updated; the fixture for the previous test was not correct. 
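As an aside for reviewers: the rule being enforced is that a sink referenced by
several mappings is only acceptable when all of those mappings declare the same
schema file. A minimal standalone sketch of that check (hypothetical class and
method names, but the same distinct-then-count approach as the change below):

    import com.google.common.collect.*;
    import java.util.*;
    import java.util.stream.Collectors;

    // Sketch only: a sink is flagged when more than one distinct (sink, schema)
    // pair refers to it, i.e. the mappings sharing it disagree on the schema.
    final class SharedSinkSchemaCheck {
        static Set<String> sinksWithMultipleSchemas(final Multimap<String, String> sinksByMapping,
                                                    final Map<String, String> schemaByMapping) {
            final Map<String, Long> distinctSchemasPerSink = sinksByMapping.entries().stream()
                .map(e -> Maps.immutableEntry(e.getValue(), schemaByMapping.get(e.getKey())))
                .distinct()
                .collect(Collectors.groupingBy(Map.Entry::getKey, Collectors.counting()));
            return Maps.filterValues(distinctSchemasPerSink, count -> count > 1L).keySet();
        }

        public static void main(final String[] args) {
            final Multimap<String, String> sinks = ImmutableMultimap.of(
                "foo", "kafka",
                "bar", "hdfs",
                "bar", "kafka");
            // Same schema everywhere: nothing is flagged.
            System.out.println(sinksWithMultipleSchemas(sinks,
                ImmutableMap.of("foo", "foobar.avsc", "bar", "foobar.avsc")));
            // Different schemas writing to the shared kafka sink: [kafka] is flagged.
            System.out.println(sinksWithMultipleSchemas(sinks,
                ImmutableMap.of("foo", "foo.avsc", "bar", "bar.avsc")));
        }
    }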
--- .../server/config/DivolteConfiguration.java | 25 ++++------- .../config/ValidatedConfigurationTest.java | 10 ++++- ...appings-different-schema-shared-sink.conf} | 41 +++++------------ ...iple-mappings-same-schema-shared-sink.conf | 45 +++++++++++++++++++ 4 files changed, 72 insertions(+), 49 deletions(-) rename src/test/resources/{multiple-schemas-one-sink.conf => multiple-mappings-different-schema-shared-sink.conf} (54%) create mode 100644 src/test/resources/multiple-mappings-same-schema-shared-sink.conf diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index 66ae9f93..47d7d45f 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -3,10 +3,7 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.google.common.base.MoreObjects; import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; +import com.google.common.collect.*; import io.divolte.server.config.constraint.MappingSourceSinkReferencesMustExist; import io.divolte.server.config.constraint.OneSchemaPerSink; import io.divolte.server.config.constraint.SourceAndSinkNamesCannotCollide; @@ -125,17 +122,13 @@ public Set collidingSourceAndSinkNames() { } public Set sinksWithMultipleSchemas() { - final Map> sinkSchemas = new HashMap<>(); - for (final MappingConfiguration mc : mappings.values()) { - for (final String s : mc.sinks) { - sinkSchemas.computeIfAbsent(s, i -> new ArrayList<>()).add(mc.schemaFile.orElse("")); - } - } - - return sinkSchemas.entrySet() - .stream() - .filter(e -> e.getValue().size() > 1) - .map(Map.Entry::getKey) - .collect(Collectors.toSet()); + final Map countsBySink = + mappings.values() + .stream() + .flatMap(config -> config.sinks.stream() + .map(sink -> Maps.immutableEntry(sink, config.schemaFile))) + .distinct() + .collect(Collectors.groupingBy(Map.Entry::getKey, Collectors.counting())); + return Maps.filterValues(countsBySink, count -> count > 1L).keySet(); } } diff --git a/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java b/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java index 8ef60c02..b65d9065 100644 --- a/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java +++ b/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java @@ -82,8 +82,14 @@ public void sourceAndSinkNamesCannotCollide() { } @Test - public void sinksCanOnlyHaveOneSchema() { - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> ConfigFactory.parseResources("multiple-schemas-one-sink.conf")); + public void sharedSinksAllowedWithSameSchema() { + final ValidatedConfiguration vc = new ValidatedConfiguration(() -> ConfigFactory.parseResources("multiple-mappings-same-schema-shared-sink.conf")); + assertTrue(vc.isValid()); + } + + @Test + public void sharedSinksCannotHaveDifferentSchemas() { + final ValidatedConfiguration vc = new ValidatedConfiguration(() -> ConfigFactory.parseResources("multiple-mappings-different-schema-shared-sink.conf")); assertFalse(vc.isValid()); assertEquals(1, vc.errors().size()); diff --git a/src/test/resources/multiple-schemas-one-sink.conf b/src/test/resources/multiple-mappings-different-schema-shared-sink.conf similarity index 54% rename from 
src/test/resources/multiple-schemas-one-sink.conf rename to src/test/resources/multiple-mappings-different-schema-shared-sink.conf index 1a2d54e0..881357bd 100644 --- a/src/test/resources/multiple-schemas-one-sink.conf +++ b/src/test/resources/multiple-mappings-different-schema-shared-sink.conf @@ -1,5 +1,5 @@ // -// Copyright 2014 GoDataDriven B.V. +// Copyright 2015 GoDataDriven B.V. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,47 +16,26 @@ include classpath("reference.conf") +// A shared sink between two mappings, but not allowed because the mappings have different schemas. divolte { - global { - server.host = 127.0.0.1 - mapper { - // For tests we generally want single-threaded processing with a small - // buffer. - buffer_size = 16 - threads = 1 - } - - // By default the flushers are disabled. Instead events are placed on - // a special queue for the tests to collect. - hdfs.enabled = false - kafka.enabled = false - } + // Need a source for the mappings. + sources.browser.type = browser - // Explicitly specify the default sinks and sources, so that tests can merge properties in. - sources { - browser { - type = browser - } - } - + // Two sinks, the latter shared. sinks { - hdfs = { - type = hdfs - } - - kafka = { - type = kafka - } + hdfs.type = hdfs + kafka.type = kafka } + // Our mappings: The kafka sink is shared, but that's okay. mappings { foo = { sources = [browser] sinks = [kafka] - schema_file = bar.avsc + schema_file = foo.avsc } - + bar = { sources = [browser] sinks = [hdfs,kafka] diff --git a/src/test/resources/multiple-mappings-same-schema-shared-sink.conf b/src/test/resources/multiple-mappings-same-schema-shared-sink.conf new file mode 100644 index 00000000..f19329f7 --- /dev/null +++ b/src/test/resources/multiple-mappings-same-schema-shared-sink.conf @@ -0,0 +1,45 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +include classpath("reference.conf") + +// A shared sink between two mappings, allowed because the mappings have the same schema. +divolte { + + // Need a source for the mappings. + sources.browser.type = browser + + // Two sinks, the latter shared. + sinks { + hdfs.type = hdfs + kafka.type = kafka + } + + // Our mappings: The kafka sink is shared, but that's okay. + mappings { + foo = { + sources = [browser] + sinks = [kafka] + schema_file = foobar.avsc + } + + bar = { + sources = [browser] + sinks = [hdfs,kafka] + schema_file = foobar.avsc + } + } +} From 18f55aece778590332c9967b8dab024a652ef98d Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 29 Dec 2015 23:09:26 +0100 Subject: [PATCH 33/80] Remove some deprecated code. 
--- .../io/divolte/server/ClientSideCookieEventHandler.java | 6 ------ src/main/java/io/divolte/server/Server.java | 9 +++------ 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java b/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java index 8d30f591..1ce175d8 100644 --- a/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java +++ b/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java @@ -19,7 +19,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; import com.google.common.base.Strings; -import com.google.common.collect.ImmutableList; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import com.google.common.io.Resources; @@ -76,11 +75,6 @@ public final class ClientSideCookieEventHandler implements HttpHandler { static final String EVENT_SOURCE_NAME = "browser"; - @Deprecated - public ClientSideCookieEventHandler(final IncomingRequestProcessingPool processingPool) { - this(new EventForwarder<>(ImmutableList.of(processingPool))); - } - public ClientSideCookieEventHandler(final EventForwarder processingPools) { this.processingPools = Objects.requireNonNull(processingPools); diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index 2b73e9fd..100efbec 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -16,10 +16,7 @@ package io.divolte.server; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Iterables; -import com.google.common.collect.Maps; +import com.google.common.collect.*; import com.typesafe.config.ConfigFactory; import io.divolte.server.config.BrowserSourceConfiguration; import io.divolte.server.config.HdfsSinkConfiguration; @@ -103,10 +100,10 @@ public Server(final ValidatedConfiguration vc) { final String mappingName = Iterables.get(vc.configuration().mappings.keySet(), 0); processingPool = new IncomingRequestProcessingPool(vc, mappingName, schemaRegistry, name -> Optional.ofNullable(sinks.get(name)), listener); + final EventForwarder processingPoolForwarder = new EventForwarder<>(ImmutableList.of(processingPool)); PathHandler handler = new PathHandler(); for (final String name : vc.configuration().sources.keySet()) { - final ClientSideCookieEventHandler clientSideCookieEventHandler = - new ClientSideCookieEventHandler(processingPool); + final ClientSideCookieEventHandler clientSideCookieEventHandler = new ClientSideCookieEventHandler(processingPoolForwarder); final TrackingJavaScriptResource trackingJavaScript = loadTrackingJavaScript(vc, name); final HttpHandler javascriptHandler = new AllowedMethodsHandler(new JavaScriptHandler(trackingJavaScript), Methods.GET); final BrowserSourceConfiguration browserSourceConfiguration = vc.configuration().getBrowserSourceConfiguration(name); From ec10591b98150b22a7699b925e60343b98f61e3d Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 29 Dec 2015 23:09:55 +0100 Subject: [PATCH 34/80] Additional logging during startup. 
--- src/main/java/io/divolte/server/Server.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index 100efbec..5c526ade 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -98,8 +98,12 @@ public Server(final ValidatedConfiguration vc) { .collect(MoreCollectors.toImmutableMap()); logger.info("Initialized sinks: {}", sinks.keySet()); + logger.debug("Initializing first mapping..."); final String mappingName = Iterables.get(vc.configuration().mappings.keySet(), 0); processingPool = new IncomingRequestProcessingPool(vc, mappingName, schemaRegistry, name -> Optional.ofNullable(sinks.get(name)), listener); + logger.info("Initialized mapping: {}", mappingName); + + logger.debug("Initializing sources..."); final EventForwarder processingPoolForwarder = new EventForwarder<>(ImmutableList.of(processingPool)); PathHandler handler = new PathHandler(); for (final String name : vc.configuration().sources.keySet()) { @@ -114,6 +118,7 @@ public Server(final ValidatedConfiguration vc) { logger.info("Registered source[{}] script location: {}", name, scriptPath); logger.info("Registered source[{}] event handler: {}", name, eventPath); } + logger.info("Initialized sources: {}", vc.configuration().sources.keySet()); handler.addExactPath("/ping", PingHandler::handlePingRequest); if (vc.configuration().global.server.serveStaticResources) { From a784a15c82250376cd0faf77228d0b74ee6d660f Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 29 Dec 2015 23:15:50 +0100 Subject: [PATCH 35/80] Remove superseded todo. --- .../java/io/divolte/server/config/DivolteConfiguration.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index 47d7d45f..0967d62e 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -41,9 +41,6 @@ public final class DivolteConfiguration { // Temporary interop this.incomingRequestProcessor = Iterables.get(this.mappings.values(), 0); - - // TODO: Optimizations: - // - Elide HDFS and Kafka sinks if they are globally disabled. } public BrowserSourceConfiguration getBrowserSourceConfiguration(final String sourceName) { From 1907a66a43ab0029ad61ea237b946701eac7d7a5 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 30 Dec 2015 00:18:10 +0100 Subject: [PATCH 36/80] Specialise event forwarding for common cases. 
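The zero- and one-receiver cases are by far the most common once sinks can be
disabled or attached per source, so it is worth avoiding a per-event iteration
over a list. The shape of the change, as a simplified Consumer-based sketch
(hypothetical names; the real code below operates on processing pools and
queued items rather than plain consumers): the implementation is chosen once,
up front, based on how many receivers there are.

    import com.google.common.collect.ImmutableList;
    import java.util.function.Consumer;

    final class Forwarders {
        static <E> Consumer<E> create(final ImmutableList<Consumer<E>> receivers) {
            switch (receivers.size()) {
                case 0:  return event -> { };              // no receivers: nothing to do
                case 1:  return receivers.get(0);          // single receiver: forward directly
                default: return event -> receivers.forEach(r -> r.accept(event)); // fan out
            }
        }
    }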
--- .../io/divolte/server/EventForwarder.java | 55 +++++++++++++++++-- .../server/IncomingRequestProcessingPool.java | 10 ++-- src/main/java/io/divolte/server/Server.java | 2 +- 3 files changed, 55 insertions(+), 12 deletions(-) diff --git a/src/main/java/io/divolte/server/EventForwarder.java b/src/main/java/io/divolte/server/EventForwarder.java index b9b7935f..87410db5 100644 --- a/src/main/java/io/divolte/server/EventForwarder.java +++ b/src/main/java/io/divolte/server/EventForwarder.java @@ -1,6 +1,7 @@ package io.divolte.server; import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; import io.divolte.server.processing.Item; import io.divolte.server.processing.ProcessingPool; @@ -8,14 +9,56 @@ import java.util.Objects; @ParametersAreNonnullByDefault -public final class EventForwarder { - private final ImmutableList> receivers; +public abstract class EventForwarder { + public abstract void forward(final Item event); - public EventForwarder(final ImmutableList> receivers) { - this.receivers = Objects.requireNonNull(receivers); + private static EventForwarder EMPTY_FORWARDER = new NoopEventForwarder<>(); + + @ParametersAreNonnullByDefault + private final static class NoopEventForwarder extends EventForwarder { + @Override + public void forward(final Item event) { + // Nothing to do. + } + } + + @ParametersAreNonnullByDefault + private final static class SingleReceiverEventForwarder extends EventForwarder { + private final ProcessingPool receiver; + + private SingleReceiverEventForwarder(final ProcessingPool receiver) { + this.receiver = Objects.requireNonNull(receiver); + } + + @Override + public void forward(final Item event) { + receiver.enqueue(event); + } + } + + private final static class MultipleReceiverEventForwarder extends EventForwarder { + private final ImmutableList> receivers; + + private MultipleReceiverEventForwarder(final ImmutableList> receivers) { + this.receivers = Objects.requireNonNull(receivers); + } + + @Override + public void forward(final Item event) { + receivers.forEach(receiver -> receiver.enqueue(event)); + } } - public void forward(final Item event) { - receivers.forEach(receiver -> receiver.enqueue(event)); + static EventForwarder create(final ImmutableList> receivers) { + switch (receivers.size()) { + case 0: + @SuppressWarnings("unchecked") + final EventForwarder emptyForwarder = (EventForwarder) EMPTY_FORWARDER; + return emptyForwarder; + case 1: + return new SingleReceiverEventForwarder<>(Iterables.getOnlyElement(receivers)); + default: + return new MultipleReceiverEventForwarder<>(receivers); + } } } diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java index 45095222..a5388ee2 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java @@ -54,11 +54,11 @@ public IncomingRequestProcessingPool(final ValidatedConfiguration vc, private static EventForwarder buildSinksForwarder(final Function>> sinkProvider, final ImmutableSet sinkNames) { // Some sinks may not be available via the provider: these have been globally disabled. 
- return new EventForwarder<>(sinkNames.stream() - .map(sinkProvider::apply) - .filter(Optional::isPresent) - .map(Optional::get) - .collect(MoreCollectors.toImmutableList())); + return EventForwarder.create(sinkNames.stream() + .map(sinkProvider::apply) + .filter(Optional::isPresent) + .map(Optional::get) + .collect(MoreCollectors.toImmutableList())); } public IncomingRequestProcessingPool( diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index 5c526ade..acfdadd5 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -104,7 +104,7 @@ public Server(final ValidatedConfiguration vc) { logger.info("Initialized mapping: {}", mappingName); logger.debug("Initializing sources..."); - final EventForwarder processingPoolForwarder = new EventForwarder<>(ImmutableList.of(processingPool)); + final EventForwarder processingPoolForwarder = EventForwarder.create(ImmutableList.of(processingPool)); PathHandler handler = new PathHandler(); for (final String name : vc.configuration().sources.keySet()) { final ClientSideCookieEventHandler clientSideCookieEventHandler = new ClientSideCookieEventHandler(processingPoolForwarder); From 6ed6dbfb77a986e7786c1be6251aa261d361f239 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 30 Dec 2015 00:26:24 +0100 Subject: [PATCH 37/80] Instantiate all mappings during startup, not just one of them. --- .../server/IncomingRequestProcessingPool.java | 4 ++- .../server/IncomingRequestProcessor.java | 9 ++++--- src/main/java/io/divolte/server/Server.java | 25 +++++++++++++------ .../server/config/DivolteConfiguration.java | 12 ++++----- 4 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java index a5388ee2..219dc559 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java @@ -44,6 +44,7 @@ public IncomingRequestProcessingPool(final ValidatedConfiguration vc, vc.configuration().global.mapper.threads, vc.configuration().global.mapper.bufferSize, vc, + name, schemaRegistry.getSchemaByMappingName(name), buildSinksForwarder(sinkProvider, vc.configuration().mappings.get(name).sinks), lookupServiceFromConfig(vc), @@ -65,6 +66,7 @@ public IncomingRequestProcessingPool( final int numThreads, final int maxQueueSize, final ValidatedConfiguration vc, + final String name, final Schema schema, final EventForwarder flushingPools, final Optional geoipLookupService, @@ -73,7 +75,7 @@ public IncomingRequestProcessingPool( numThreads, maxQueueSize, "Incoming Request Processor", - () -> new IncomingRequestProcessor(vc, flushingPools, geoipLookupService, schema, listener)); + () -> new IncomingRequestProcessor(vc, name, flushingPools, geoipLookupService, schema, listener)); } private static Optional lookupServiceFromConfig(final ValidatedConfiguration vc) { diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessor.java b/src/main/java/io/divolte/server/IncomingRequestProcessor.java index 4c99be71..45c2cdce 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessor.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessor.java @@ -17,6 +17,7 @@ package io.divolte.server; import io.divolte.record.DefaultEventRecord; +import io.divolte.server.config.MappingConfiguration; import 
io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.ip2geo.LookupService; import io.divolte.server.processing.Item; @@ -55,6 +56,7 @@ public final class IncomingRequestProcessor implements ItemProcessor flushingPools, final Optional geoipLookupService, final Schema schema, @@ -62,12 +64,13 @@ public IncomingRequestProcessor(final ValidatedConfiguration vc, this.flushingPools = flushingPools; this.listener = Objects.requireNonNull(listener); - keepCorrupted = !vc.configuration().incomingRequestProcessor.discardCorrupted; + final MappingConfiguration mappingConfiguration = vc.configuration().getMappingConfiguration(name); + keepCorrupted = !mappingConfiguration.discardCorrupted; memory = new ShortTermDuplicateMemory(vc.configuration().global.mapper.duplicateMemorySize); - keepDuplicates = !vc.configuration().incomingRequestProcessor.discardDuplicates; + keepDuplicates = !mappingConfiguration.discardDuplicates; - mapper = vc.configuration().incomingRequestProcessor.mappingScriptFile + mapper = mappingConfiguration.mappingScriptFile .map((mappingScriptFile) -> { logger.info("Using script based schema mapping."); return new DslRecordMapper(vc, mappingScriptFile, Objects.requireNonNull(schema), geoipLookupService); diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index acfdadd5..4920c935 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -46,6 +46,7 @@ import java.time.Duration; import java.util.Map; import java.util.Optional; +import java.util.function.Function; @ParametersAreNonnullByDefault public final class Server implements Runnable { @@ -55,8 +56,7 @@ public final class Server implements Runnable { private final GracefulShutdownHandler shutdownHandler; private final ImmutableMap> sinks; - - private final IncomingRequestProcessingPool processingPool; + private final ImmutableMap mappingProcessors; private final Optional host; private final int port; @@ -98,12 +98,23 @@ public Server(final ValidatedConfiguration vc) { .collect(MoreCollectors.toImmutableMap()); logger.info("Initialized sinks: {}", sinks.keySet()); - logger.debug("Initializing first mapping..."); - final String mappingName = Iterables.get(vc.configuration().mappings.keySet(), 0); - processingPool = new IncomingRequestProcessingPool(vc, mappingName, schemaRegistry, name -> Optional.ofNullable(sinks.get(name)), listener); - logger.info("Initialized mapping: {}", mappingName); + logger.debug("Initializing mappings..."); + final Function>> schemaProvider = + sinkName -> Optional.ofNullable(sinks.get(sinkName)); + mappingProcessors = + ImmutableMap.copyOf(Maps.transformEntries(vc.configuration().mappings, + (mappingName, config) -> + new IncomingRequestProcessingPool(vc, + mappingName, + schemaRegistry, + schemaProvider, + listener))); + logger.info("Initialized mappings: {}", mappingProcessors.keySet()); logger.debug("Initializing sources..."); + // TODO: Implement sources for all mappings, not just one of them. + @Deprecated + final IncomingRequestProcessingPool processingPool = Iterables.get(mappingProcessors.values(), 0); final EventForwarder processingPoolForwarder = EventForwarder.create(ImmutableList.of(processingPool)); PathHandler handler = new PathHandler(); for (final String name : vc.configuration().sources.keySet()) { @@ -183,7 +194,7 @@ public void shutdown() { logger.info("Stopping thread pools."); // Stop the mappings before the sinks to ensure work in progress doesn't get stranded. 
- processingPool.stop(); + mappingProcessors.values().forEach(ProcessingPool::stop); sinks.values().forEach(ProcessingPool::stop); logger.info("Closing HDFS filesystem connection."); diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index 0967d62e..216a1964 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -26,9 +26,6 @@ public final class DivolteConfiguration { @Valid public final ImmutableMap sources; @Valid public final ImmutableMap sinks; - @Deprecated - public final MappingConfiguration incomingRequestProcessor; - @JsonCreator DivolteConfiguration(final GlobalConfiguration global, final Optional> sources, @@ -38,9 +35,6 @@ public final class DivolteConfiguration { this.sources = sources.orElseGet(DivolteConfiguration::defaultSourceConfigurations); this.sinks = sinks.orElseGet(DivolteConfiguration::defaultSinkConfigurations); this.mappings = mappings.orElseGet(() -> defaultMappingConfigurations(this.sources.keySet(), this.sinks.keySet())); - - // Temporary interop - this.incomingRequestProcessor = Iterables.get(this.mappings.values(), 0); } public BrowserSourceConfiguration getBrowserSourceConfiguration(final String sourceName) { @@ -51,6 +45,12 @@ public BrowserSourceConfiguration getBrowserSourceConfiguration(final String sou return (BrowserSourceConfiguration)sourceConfiguration; } + public MappingConfiguration getMappingConfiguration(final String mappingName) { + final MappingConfiguration mappingConfiguration = mappings.get(mappingName); + Objects.requireNonNull(mappingConfiguration, () -> "No mapping configuration with name: " + mappingName); + return mappingConfiguration; + } + public T getSinkConfiguration(final String sinkName, final Class sinkClass) { final SinkConfiguration sinkConfiguration = sinks.get(sinkName); Objects.requireNonNull(sinkConfiguration, () -> "No sink configuration with name: " + sinkName); From 55513411330a8511f6ff0e4a50931e141e678989 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 30 Dec 2015 14:42:12 +0100 Subject: [PATCH 38/80] Support many-to-many mappings between sources and mappings. 
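For context, the kind of configuration this is meant to support looks roughly
like the following (illustrative names only; the per-source path prefixes are
an assumption). Every source forwards its events to each mapping that lists it,
and every mapping writes to each of its sinks:

    divolte {
      sources {
        web    = { type = browser, prefix = "/web" }
        mobile = { type = browser, prefix = "/mobile" }
      }

      sinks {
        hdfs.type  = hdfs
        kafka.type = kafka
      }

      mappings {
        // Reads from both sources and writes to both sinks.
        all-traffic = {
          sources     = [web, mobile]
          sinks       = [hdfs, kafka]
          schema_file = events.avsc
        }
        // Shares the web source with the mapping above, but only feeds Kafka.
        // Note the shared kafka sink keeps a single schema across both mappings.
        web-only = {
          sources     = [web]
          sinks       = [kafka]
          schema_file = events.avsc
        }
      }
    }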
--- .../io/divolte/server/EventForwarder.java | 8 +++---- .../io/divolte/server/MoreCollectors.java | 8 +++++++ src/main/java/io/divolte/server/Server.java | 21 ++++++++++++++----- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/src/main/java/io/divolte/server/EventForwarder.java b/src/main/java/io/divolte/server/EventForwarder.java index 87410db5..d6c0c16f 100644 --- a/src/main/java/io/divolte/server/EventForwarder.java +++ b/src/main/java/io/divolte/server/EventForwarder.java @@ -1,6 +1,6 @@ package io.divolte.server; -import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableCollection; import com.google.common.collect.Iterables; import io.divolte.server.processing.Item; import io.divolte.server.processing.ProcessingPool; @@ -37,9 +37,9 @@ public void forward(final Item event) { } private final static class MultipleReceiverEventForwarder extends EventForwarder { - private final ImmutableList> receivers; + private final ImmutableCollection> receivers; - private MultipleReceiverEventForwarder(final ImmutableList> receivers) { + private MultipleReceiverEventForwarder(final ImmutableCollection> receivers) { this.receivers = Objects.requireNonNull(receivers); } @@ -49,7 +49,7 @@ public void forward(final Item event) { } } - static EventForwarder create(final ImmutableList> receivers) { + static EventForwarder create(final ImmutableCollection> receivers) { switch (receivers.size()) { case 0: @SuppressWarnings("unchecked") diff --git a/src/main/java/io/divolte/server/MoreCollectors.java b/src/main/java/io/divolte/server/MoreCollectors.java index f8115562..c3a962dc 100644 --- a/src/main/java/io/divolte/server/MoreCollectors.java +++ b/src/main/java/io/divolte/server/MoreCollectors.java @@ -2,6 +2,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultimap; import com.google.common.collect.ImmutableSet; import java.util.Map; @@ -32,4 +33,11 @@ public static Collector, ImmutableMap.Builder, Immutab (l, r) -> l.putAll(r.build()), ImmutableMap.Builder::build); } + + public static Collector, ImmutableMultimap.Builder, ImmutableMultimap> toImmutableMultimap() { + return Collector.of(ImmutableMultimap.Builder::new, + ImmutableMultimap.Builder::put, + (l, r) -> l.putAll(r.build()), + ImmutableMultimap.Builder::build); + } } diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index 4920c935..d69808a2 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -112,13 +112,24 @@ public Server(final ValidatedConfiguration vc) { logger.info("Initialized mappings: {}", mappingProcessors.keySet()); logger.debug("Initializing sources..."); - // TODO: Implement sources for all mappings, not just one of them. - @Deprecated - final IncomingRequestProcessingPool processingPool = Iterables.get(mappingProcessors.values(), 0); - final EventForwarder processingPoolForwarder = EventForwarder.create(ImmutableList.of(processingPool)); + // First build a list of which mappings are used by each source. 
+ final ImmutableMultimap mappingProcessorsBySource = + vc.configuration().mappings.entrySet() + .stream() + .flatMap(mappingConfig -> { + final IncomingRequestProcessingPool mappingProcessor = mappingProcessors.get(mappingConfig.getKey()); + return mappingConfig.getValue() + .sources + .stream() + .map(source -> Maps.immutableEntry(source, mappingProcessor)); + }) + .collect(MoreCollectors.toImmutableMultimap()); PathHandler handler = new PathHandler(); + // Now instantiate all the sources. for (final String name : vc.configuration().sources.keySet()) { - final ClientSideCookieEventHandler clientSideCookieEventHandler = new ClientSideCookieEventHandler(processingPoolForwarder); + final ImmutableCollection mappingProcessors = mappingProcessorsBySource.get(name); + final EventForwarder processingPoolsForwarder = EventForwarder.create(mappingProcessors); + final ClientSideCookieEventHandler clientSideCookieEventHandler = new ClientSideCookieEventHandler(processingPoolsForwarder); final TrackingJavaScriptResource trackingJavaScript = loadTrackingJavaScript(vc, name); final HttpHandler javascriptHandler = new AllowedMethodsHandler(new JavaScriptHandler(trackingJavaScript), Methods.GET); final BrowserSourceConfiguration browserSourceConfiguration = vc.configuration().getBrowserSourceConfiguration(name); From bf18049400e891360e2148de2738e2b318bdf65e Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 30 Dec 2015 16:09:19 +0100 Subject: [PATCH 39/80] Remove some redundant test dependencies. The main code already depends on these artefacts. --- build.gradle | 3 --- 1 file changed, 3 deletions(-) diff --git a/build.gradle b/build.gradle index 1471806d..0f52c8ff 100644 --- a/build.gradle +++ b/build.gradle @@ -133,9 +133,6 @@ dependencies { testCompile group: 'junit', name: 'junit', version: '4.12' testCompile group: 'org.hamcrest', name: 'hamcrest-all', version: '1.3' testCompile group: 'org.mockito', name: 'mockito-all', version: '1.10.19' - testCompile group: 'com.fasterxml.jackson.core', name:'jackson-databind', version: '2.6.3' - testCompile group: 'com.fasterxml.jackson.module', name:'jackson-module-parameter-names', version: '2.6.3' - testCompile group: 'com.fasterxml.jackson.datatype', name:'jackson-datatype-jdk8', version: '2.6.3' testCompile group: 'com.saucelabs', name:'sauce_junit', version: '2.1.20' // Warning: SauceLabs doesn't work properly with Selenium 2.44. From adaee4cd1f351ea386fc26e101087d0f40dda404 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 30 Dec 2015 16:09:45 +0100 Subject: [PATCH 40/80] Factor out versions where we have multiple artefacts that use the same version. --- build.gradle | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/build.gradle b/build.gradle index 0f52c8ff..cc57a6c4 100644 --- a/build.gradle +++ b/build.gradle @@ -75,11 +75,17 @@ repositories { } dependencies { + // Define some key versions for components that we use lots of artifacts from. 
+ def avroVersion = '1.7.7' + def hadoopVersion = '2.7.1' + def jacksonVersion = '2.6.4' + def slf4jVersion = '1.7.13' + compile group: 'io.divolte', name: 'divolte-schema', version: version compile group: 'io.undertow', name: 'undertow-core', version: '1.3.11.Final' compile group: 'com.typesafe', name: 'config', version: '1.3.0' compile group: 'com.google.guava', name: 'guava', version: '19.0' - compile group: 'org.apache.avro', name: 'avro', version: '1.7.7' + compile group: 'org.apache.avro', name: 'avro', version: avroVersion /* * We package the Avro Tools to provide an easy way to view Avro files @@ -90,14 +96,14 @@ dependencies { * run and have a working tojson command. The other commands weren't fully * tested with these deps. */ - compile group: 'org.apache.avro', name: 'avro-tools', version: '1.7.7', classifier: 'nodeps' - compile group: 'org.apache.avro', name: 'trevni-core', version: '1.7.7' - compile group: 'org.apache.avro', name: 'avro-mapred', version: '1.7.7' + compile group: 'org.apache.avro', name: 'avro-tools', version: avroVersion, classifier: 'nodeps' + compile group: 'org.apache.avro', name: 'trevni-core', version: avroVersion + compile group: 'org.apache.avro', name: 'avro-mapred', version: avroVersion - compile (group: 'org.apache.hadoop', name:'hadoop-common', version: '2.7.1') { + compile (group: 'org.apache.hadoop', name:'hadoop-common', version: hadoopVersion) { exclude group: 'jline', module: 'jline' } - compile group: 'org.apache.hadoop', name:'hadoop-hdfs', version: '2.7.1' + compile group: 'org.apache.hadoop', name:'hadoop-hdfs', version: hadoopVersion compile (group: 'net.sf.uadetector', name: 'uadetector-core', version: '0.9.22') { exclude group: 'com.google.code.findbugs', module: 'jsr305' } @@ -114,10 +120,10 @@ dependencies { compile group: 'org.codehaus.groovy', name:'groovy', version: '2.4.5', classifier: 'indy' compile group: 'net.sf.jopt-simple', name:'jopt-simple', version: '4.9' compile group: 'com.jayway.jsonpath', name: 'json-path', version: '2.1.0' - compile group: 'com.fasterxml.jackson.core', name:'jackson-databind', version: '2.6.4' - compile group: 'com.fasterxml.jackson.datatype', name:'jackson-datatype-jdk8', version: '2.6.4' - compile group: 'com.fasterxml.jackson.datatype', name:'jackson-datatype-guava', version: '2.6.4' - compile group: 'com.fasterxml.jackson.module', name:'jackson-module-parameter-names', version: '2.6.4' + compile group: 'com.fasterxml.jackson.core', name:'jackson-databind', version: jacksonVersion + compile group: 'com.fasterxml.jackson.datatype', name:'jackson-datatype-jdk8', version: jacksonVersion + compile group: 'com.fasterxml.jackson.datatype', name:'jackson-datatype-guava', version: jacksonVersion + compile group: 'com.fasterxml.jackson.module', name:'jackson-module-parameter-names', version: jacksonVersion compile group: 'com.jasonclawson', name: 'jackson-dataformat-hocon', version: '1.1.0' // Used for configuration validation @@ -126,9 +132,9 @@ dependencies { // We use the SLF4J API. At runtime, this is LogBack. // (We also force any dependencies that use Log4J to go via SLF4J.) 
- compile group: 'org.slf4j', name: 'slf4j-api', version: '1.7.13' + compile group: 'org.slf4j', name: 'slf4j-api', version: slf4jVersion runtime group: 'ch.qos.logback', name: 'logback-classic', version: '1.1.3' - runtime group: 'org.slf4j', name: 'log4j-over-slf4j', version: '1.7.13' + runtime group: 'org.slf4j', name: 'log4j-over-slf4j', version: slf4jVersion testCompile group: 'junit', name: 'junit', version: '4.12' testCompile group: 'org.hamcrest', name: 'hamcrest-all', version: '1.3' From 7346c90772f1ce43782283e286264651b9aef7ec Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 30 Dec 2015 16:29:48 +0100 Subject: [PATCH 41/80] Merge the test suites for testing various source and sink scenarios. --- ...=> ServerSinkSourceConfigurationTest.java} | 57 ++++++++- .../server/ServerSourceConfigurationTest.java | 116 ------------------ 2 files changed, 52 insertions(+), 121 deletions(-) rename src/test/java/io/divolte/server/{ServerSinkConfigurationTest.java => ServerSinkSourceConfigurationTest.java} (83%) delete mode 100644 src/test/java/io/divolte/server/ServerSourceConfigurationTest.java diff --git a/src/test/java/io/divolte/server/ServerSinkConfigurationTest.java b/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java similarity index 83% rename from src/test/java/io/divolte/server/ServerSinkConfigurationTest.java rename to src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java index dfb50a8f..fe0ff489 100644 --- a/src/test/java/io/divolte/server/ServerSinkConfigurationTest.java +++ b/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java @@ -45,10 +45,10 @@ import static org.junit.Assert.assertFalse; @ParametersAreNonnullByDefault -public class ServerSinkConfigurationTest { +public class ServerSinkSourceConfigurationTest { private static final String BROWSER_EVENT_URL_TEMPLATE = - "http://localhost:%d/csc-event?" + "http://localhost:%d%s/csc-event?" 
+ "p=0%%3Ai1t84hgy%%3A5AF359Zjq5kUy98u4wQjlIZzWGhN~GlG&" + "s=0%%3Ai1t84hgy%%3A95CbiPCYln_1e0a6rFvuRkDkeNnc6KC8&" + "v=0%%3A1fF6GFGjDOQiEx_OxnTm_tl4BH91eGLF&" @@ -73,6 +73,10 @@ private void startServer(final String configResource, startServer(() -> new TestServer(configResource, extraProperties)); } + private void startServer(final String configResource) { + startServer(() -> new TestServer(configResource)); + } + private void startServer() { startServer(TestServer::new); } @@ -94,14 +98,22 @@ public Path createTempDirectory() throws IOException { } public void cleanupTempDirectories() { - tempDirectories.forEach(ServerSinkConfigurationTest::deleteRecursively); + tempDirectories.forEach(ServerSinkSourceConfigurationTest::deleteRecursively); tempDirectories.clear(); } private void request() throws IOException { - final URL url = new URL(String.format(BROWSER_EVENT_URL_TEMPLATE, testServer.get().port)); + request(""); + } + + private void request(final String sourcePrefix) throws IOException { + request(sourcePrefix, 200); + } + + private void request(final String sourcePrefix, final int expectedResponseCode) throws IOException { + final URL url = new URL(String.format(BROWSER_EVENT_URL_TEMPLATE, testServer.get().port, sourcePrefix)); final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); - assertEquals(200, conn.getResponseCode()); + assertEquals(expectedResponseCode, conn.getResponseCode()); } @ParametersAreNonnullByDefault @@ -141,6 +153,41 @@ public Stream listNewRecords() throws IOException { } } + @Test + public void shouldRegisterDefaultBrowserSource() throws IOException, InterruptedException { + // Test the default browser source that should be present by default. + startServer(); + request(); + testServer.get().waitForEvent(); + } + + @Test + public void shouldRegisterExplicitSourceOnly() throws IOException, InterruptedException { + // Test that if an explicit source is supplied, the builtin defaults are not present. + startServer("browser-source-explicit.conf"); + request("/a-prefix"); + testServer.get().waitForEvent(); + request("", 404); + } + + @Test + public void shouldSupportLongSourcePaths() throws IOException, InterruptedException { + // Test that the browser sources work with different types of path. + startServer("browser-source-long-prefix.conf"); + request("/a/multi/component/prefix"); + testServer.get().waitForEvent(); + } + + @Test + public void shouldSupportMultipleBrowserSources() throws IOException, InterruptedException { + // Test that multiple browser sources are supported. + startServer("browser-source-multiple.conf"); + request("/path1"); + request("/path2"); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + } + @Test public void shouldRegisterDefaultHdfsSink() throws IOException, InterruptedException { // Test the default hdfs source that should be present by default. diff --git a/src/test/java/io/divolte/server/ServerSourceConfigurationTest.java b/src/test/java/io/divolte/server/ServerSourceConfigurationTest.java deleted file mode 100644 index 7aa322b2..00000000 --- a/src/test/java/io/divolte/server/ServerSourceConfigurationTest.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright 2014 GoDataDriven B.V. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.divolte.server; - -import io.divolte.server.ServerTestUtils.TestServer; -import org.junit.After; -import org.junit.Test; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.io.IOException; -import java.net.HttpURLConnection; -import java.net.URL; -import java.util.Optional; - -import static org.junit.Assert.assertEquals; - -@ParametersAreNonnullByDefault -public class ServerSourceConfigurationTest { - - private static final String BROWSER_EVENT_URL_TEMPLATE = - "http://localhost:%d%s/csc-event?" - + "p=0%%3Ai1t84hgy%%3A5AF359Zjq5kUy98u4wQjlIZzWGhN~GlG&" - + "s=0%%3Ai1t84hgy%%3A95CbiPCYln_1e0a6rFvuRkDkeNnc6KC8&" - + "v=0%%3A1fF6GFGjDOQiEx_OxnTm_tl4BH91eGLF&" - + "e=0%%3A1fF6GFGjDOQiEx_OxnTm_tl4BH91eGLF0&" - + "c=i1t8q2b6&" - + "n=f&" - + "f=f&" - + "l=http%%3A%%2F%%2Flocalhost%%3A8290%%2F&" - + "i=1ak&" - + "j=sj&" - + "k=2&" - + "w=uq&" - + "h=qd&" - + "t=pageView&" - + "x=si9804"; - - private Optional testServer = Optional.empty(); - - private void startServer(final Optional configResource) { - stopServer(); - final TestServer newServer = configResource.map(TestServer::new).orElseGet(TestServer::new); - testServer = Optional.of(newServer); - } - - public void stopServer() { - testServer.ifPresent(testServer -> testServer.server.shutdown()); - testServer = Optional.empty(); - } - - private void request(final String sourcePrefix) throws IOException { - request(sourcePrefix, 200); - } - - private void request(final String sourcePrefix, final int expectedResponseCode) throws IOException { - final URL url = new URL(String.format(BROWSER_EVENT_URL_TEMPLATE, - testServer.get().port, - sourcePrefix)); - final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); - assertEquals(expectedResponseCode, conn.getResponseCode()); - } - - @Test - public void shouldRegisterDefaultBrowserSource() throws IOException, InterruptedException { - // Test the default browser source that should be present by default. - startServer(Optional.empty()); - request(""); - testServer.get().waitForEvent(); - } - - @Test - public void shouldRegisterExplicitSourceOnly() throws IOException, InterruptedException { - // Test that if an explicit source is supplied, the builtin defaults are not present. - startServer(Optional.of("browser-source-explicit.conf")); - request("/a-prefix"); - testServer.get().waitForEvent(); - request("", 404); - } - - @Test - public void shouldSupportLongPaths() throws IOException, InterruptedException { - // Test that the browser sources work with different types of path. - startServer(Optional.of("browser-source-long-prefix.conf")); - request("/a/multi/component/prefix"); - testServer.get().waitForEvent(); - } - - @Test - public void shouldSupportMultipleBrowserSources() throws IOException, InterruptedException { - // Test that multiple browser sources are supported. 
- startServer(Optional.of("browser-source-multiple.conf")); - request("/path1"); - request("/path2"); - testServer.get().waitForEvent(); - testServer.get().waitForEvent(); - } - - @After - public void tearDown() { - stopServer(); - } -} From 6e83889353b6323c2a808d283c9b97ef1d594a9a Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 30 Dec 2015 21:55:53 +0100 Subject: [PATCH 42/80] Fix a bug where the schema registry threw an exception while handling shared sinks. --- src/main/java/io/divolte/server/SchemaRegistry.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/io/divolte/server/SchemaRegistry.java b/src/main/java/io/divolte/server/SchemaRegistry.java index 0e92cf33..79685221 100644 --- a/src/main/java/io/divolte/server/SchemaRegistry.java +++ b/src/main/java/io/divolte/server/SchemaRegistry.java @@ -54,6 +54,7 @@ public SchemaRegistry(final ValidatedConfiguration vc) { .map(sink -> Maps.immutableEntry(sink, schemasByLocation.get(config.schemaFile)))) + .distinct() .collect(MoreCollectors.toImmutableMap()); logger.info("Inferred schemas used for sinks: {}", schemasBySinkName.keySet()); } From 1d21d7204dc50c5b772cb842912ba1f8cb0ee96f Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 30 Dec 2015 22:03:15 +0100 Subject: [PATCH 43/80] Add some tests for various source/mapping/sink configurations. --- .../ServerSinkSourceConfigurationTest.java | 145 +++++++++++++++++- src/test/resources/browser-source-unused.conf | 25 +++ ...nf => mapping-configuration-explicit.conf} | 2 +- .../mapping-configuration-independent.conf | 57 +++++++ .../mapping-configuration-interdependent.conf | 88 +++++++++++ .../mapping-configuration-shared-sink.conf | 50 ++++++ .../mapping-configuration-shared-source.conf | 48 ++++++ 7 files changed, 408 insertions(+), 7 deletions(-) create mode 100644 src/test/resources/browser-source-unused.conf rename src/test/resources/{hdfs-sink-explicit.conf => mapping-configuration-explicit.conf} (95%) create mode 100644 src/test/resources/mapping-configuration-independent.conf create mode 100644 src/test/resources/mapping-configuration-interdependent.conf create mode 100644 src/test/resources/mapping-configuration-shared-sink.conf create mode 100644 src/test/resources/mapping-configuration-shared-source.conf diff --git a/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java b/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java index fe0ff489..ddd3276e 100644 --- a/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java +++ b/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java @@ -189,8 +189,15 @@ public void shouldSupportMultipleBrowserSources() throws IOException, Interrupte } @Test - public void shouldRegisterDefaultHdfsSink() throws IOException, InterruptedException { - // Test the default hdfs source that should be present by default. + public void shouldSupportUnusedSource() throws IOException { + // Test that an unused source is still reachable. + startServer("browser-source-unused.conf"); + request("/unused"); + } + + @Test + public void shouldSupportDefaultSourceMappingSink() throws IOException, InterruptedException { + // Test that with an out-of-the-box default configuration the default source, mapping and sink are present. 
startServer(); final AvroFileLocator avroFileLocator = new AvroFileLocator(Paths.get("/tmp")); request(); @@ -203,11 +210,11 @@ public void shouldRegisterDefaultHdfsSink() throws IOException, InterruptedExcep } @Test - public void shouldRegisterExplicitSinkOnly() throws IOException, InterruptedException { - // Test that if an explicit sink is supplied, the builtin defaults are not present. + public void shouldOnlyRegisterExplicitSourceMappingSink() throws IOException, InterruptedException { + // Test that if an explicit source-mapping-sink is supplied, the builtin defaults are not present. final AvroFileLocator defaultAvroFileLocator = new AvroFileLocator(Paths.get("/tmp")); final Path avroDirectory = createTempDirectory(); - startServer("hdfs-sink-explicit.conf", ImmutableMap.of( + startServer("mapping-configuration-explicit.conf", ImmutableMap.of( "divolte.sinks.test-hdfs-sink.file_strategy.working_dir", avroDirectory.toString(), "divolte.sinks.test-hdfs-sink.file_strategy.publish_dir", avroDirectory.toString() )); @@ -227,7 +234,7 @@ public void shouldRegisterExplicitSinkOnly() throws IOException, InterruptedExce @Test public void shouldSupportMultipleSinks() throws IOException, InterruptedException { - // Test that multiple hdfs sinks are supported. + // Test that multiple hdfs sinks are supported for a single mapping. final AvroFileLocator defaultAvroFileLocator = new AvroFileLocator(Paths.get("/tmp")); final Path avroDirectory1 = createTempDirectory(); final Path avroDirectory2 = createTempDirectory(); @@ -254,6 +261,132 @@ public void shouldSupportMultipleSinks() throws IOException, InterruptedExceptio 1, explicitAvroFileLocator2.listNewRecords().count()); } + @Test + public void shouldSupportMultipleMappings() throws IOException, InterruptedException { + // Test that multiple independent mappings are supported. + final Path avroDirectory1 = createTempDirectory(); + final Path avroDirectory2 = createTempDirectory(); + startServer("mapping-configuration-independent.conf", ImmutableMap.of( + "divolte.sinks.sink-1.file_strategy.working_dir", avroDirectory1.toString(), + "divolte.sinks.sink-1.file_strategy.publish_dir", avroDirectory1.toString(), + "divolte.sinks.sink-2.file_strategy.working_dir", avroDirectory2.toString(), + "divolte.sinks.sink-2.file_strategy.publish_dir", avroDirectory2.toString() + )); + final AvroFileLocator explicitAvroFileLocator1 = new AvroFileLocator(avroDirectory1); + final AvroFileLocator explicitAvroFileLocator2 = new AvroFileLocator(avroDirectory2); + request("/source-1"); + request("/source-2"); + request("/source-2"); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + // Stopping the server flushes any HDFS files. + stopServer(); + // Now we can check: + // - One source should have a single event. + // - The other should have a two events. + assertEquals("Wrong number of new events logged in first location", + 1, explicitAvroFileLocator1.listNewRecords().count()); + assertEquals("Wrong number of new events logged in second location", + 2, explicitAvroFileLocator2.listNewRecords().count()); + } + + @Test + public void shouldSupportMultipleMappingsPerSource() throws IOException, InterruptedException { + // Test that a single source can send events to multiple mappings. 
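+        // (mapping-configuration-shared-source.conf routes the single default-path source through two
+        //  mappings, each writing to its own HDFS sink, so one request should yield one event per sink.)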
+        final Path avroDirectory1 = createTempDirectory();
+        final Path avroDirectory2 = createTempDirectory();
+        startServer("mapping-configuration-shared-source.conf", ImmutableMap.of(
+                "divolte.sinks.sink-1.file_strategy.working_dir", avroDirectory1.toString(),
+                "divolte.sinks.sink-1.file_strategy.publish_dir", avroDirectory1.toString(),
+                "divolte.sinks.sink-2.file_strategy.working_dir", avroDirectory2.toString(),
+                "divolte.sinks.sink-2.file_strategy.publish_dir", avroDirectory2.toString()
+        ));
+        final AvroFileLocator explicitAvroFileLocator1 = new AvroFileLocator(avroDirectory1);
+        final AvroFileLocator explicitAvroFileLocator2 = new AvroFileLocator(avroDirectory2);
+        request();
+        testServer.get().waitForEvent();
+        testServer.get().waitForEvent();
+        // Stopping the server flushes any HDFS files.
+        stopServer();
+        // Now we can check:
+        //  - Both sinks should have a single event.
+        assertEquals("Wrong number of new events logged in first location",
+                     1, explicitAvroFileLocator1.listNewRecords().count());
+        assertEquals("Wrong number of new events logged in second location",
+                     1, explicitAvroFileLocator2.listNewRecords().count());
+    }
+
+    @Test
+    public void shouldSupportMultipleMappingsPerSink() throws IOException, InterruptedException {
+        // Test that multiple mappings can send events to the same sink.
+        final Path avroDirectory = createTempDirectory();
+        startServer("mapping-configuration-shared-sink.conf", ImmutableMap.of(
+                "divolte.sinks.only-sink.file_strategy.working_dir", avroDirectory.toString(),
+                "divolte.sinks.only-sink.file_strategy.publish_dir", avroDirectory.toString()
+        ));
+        final AvroFileLocator explicitAvroFileLocator = new AvroFileLocator(avroDirectory);
+        request("/source-1");
+        request("/source-2");
+        testServer.get().waitForEvent();
+        testServer.get().waitForEvent();
+        // Stopping the server flushes any HDFS files.
+        stopServer();
+        // Now we can check:
+        //  - The single location should have received both events.
+        assertEquals("Wrong number of new events logged",
+                     2, explicitAvroFileLocator.listNewRecords().count());
+    }
+
+    @Test
+    public void shouldSupportComplexSourceMappingSinkConfigurations() throws IOException, InterruptedException {
+        // Test that a complex source-mapping-sink configuration is possible.
+        // (This includes combinations of shared and non-shared sources and sinks.)
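+        // Expected routing, per mapping-configuration-interdependent.conf:
+        //   mapping-1: source-1, source-2 -> sink-1
+        //   mapping-2: source-1, source-2 -> sink-2, sink-3
+        //   mapping-3: source-3           -> sink-3
+        //   mapping-4: source-1, source-4 -> sink-3, sink-4
+        // With one event sent to each source, sinks 1 through 4 should therefore see 2, 2, 5 and 2 events.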
+ final Path avroDirectory1 = createTempDirectory(); + final Path avroDirectory2 = createTempDirectory(); + final Path avroDirectory3 = createTempDirectory(); + final Path avroDirectory4 = createTempDirectory(); + startServer("mapping-configuration-interdependent.conf", new ImmutableMap.Builder() + .put("divolte.sinks.sink-1.file_strategy.working_dir", avroDirectory1.toString()) + .put("divolte.sinks.sink-1.file_strategy.publish_dir", avroDirectory1.toString()) + .put("divolte.sinks.sink-2.file_strategy.working_dir", avroDirectory2.toString()) + .put("divolte.sinks.sink-2.file_strategy.publish_dir", avroDirectory2.toString()) + .put("divolte.sinks.sink-3.file_strategy.working_dir", avroDirectory3.toString()) + .put("divolte.sinks.sink-3.file_strategy.publish_dir", avroDirectory3.toString()) + .put("divolte.sinks.sink-4.file_strategy.working_dir", avroDirectory4.toString()) + .put("divolte.sinks.sink-4.file_strategy.publish_dir", avroDirectory4.toString()) + .build() + ); + final AvroFileLocator explicitAvroFileLocator1 = new AvroFileLocator(avroDirectory1); + final AvroFileLocator explicitAvroFileLocator2 = new AvroFileLocator(avroDirectory2); + final AvroFileLocator explicitAvroFileLocator3 = new AvroFileLocator(avroDirectory3); + final AvroFileLocator explicitAvroFileLocator4 = new AvroFileLocator(avroDirectory4); + request("/source-1"); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + request("/source-2"); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + request("/source-3"); + testServer.get().waitForEvent(); + request("/source-4"); + testServer.get().waitForEvent(); + // Stopping the server flushes any HDFS files. + stopServer(); + // Now we can check: + // - Each sink should have a specific number of events in it. + assertEquals("Wrong number of new events logged in first location", + 2, explicitAvroFileLocator1.listNewRecords().count()); + assertEquals("Wrong number of new events logged in second location", + 2, explicitAvroFileLocator2.listNewRecords().count()); + assertEquals("Wrong number of new events logged in third location", + 5, explicitAvroFileLocator3.listNewRecords().count()); + assertEquals("Wrong number of new events logged in fourth location", + 2, explicitAvroFileLocator4.listNewRecords().count()); + } + @After public void tearDown() throws IOException { stopServer(); diff --git a/src/test/resources/browser-source-unused.conf b/src/test/resources/browser-source-unused.conf new file mode 100644 index 00000000..a52731bc --- /dev/null +++ b/src/test/resources/browser-source-unused.conf @@ -0,0 +1,25 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify a single browser source that isn't used by any mappings. 
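+// (The server still accepts events on this source's endpoint; the corresponding test only checks
+// that the request succeeds, since there is no mapping or sink to deliver the events to.)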
+divolte { + sources.unused-source { + type = browser + prefix = /unused + } + mappings {} + sinks {} +} diff --git a/src/test/resources/hdfs-sink-explicit.conf b/src/test/resources/mapping-configuration-explicit.conf similarity index 95% rename from src/test/resources/hdfs-sink-explicit.conf rename to src/test/resources/mapping-configuration-explicit.conf index cfad7c46..4d2dee5f 100644 --- a/src/test/resources/hdfs-sink-explicit.conf +++ b/src/test/resources/mapping-configuration-explicit.conf @@ -14,7 +14,7 @@ // limitations under the License. // -// Specify a single explicit hdfs sink. +// Specify an explicit source-mapping-sink. divolte { sources.test-browser-source.type = browser diff --git a/src/test/resources/mapping-configuration-independent.conf b/src/test/resources/mapping-configuration-independent.conf new file mode 100644 index 00000000..6f5d96c7 --- /dev/null +++ b/src/test/resources/mapping-configuration-independent.conf @@ -0,0 +1,57 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify multiple independent source-mapping-sink chains. +divolte { + sources { + source-1 { + type = browser + prefix = /source-1 + } + source-2 { + type = browser + prefix = /source-2 + } + } + + mappings { + mapping-1 = { + sources = [source-1] + sinks = [sink-1] + } + mapping-2 = { + sources = [source-2] + sinks = [sink-2] + } + } + + sinks { + sink-1 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + sink-2 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + } +} diff --git a/src/test/resources/mapping-configuration-interdependent.conf b/src/test/resources/mapping-configuration-interdependent.conf new file mode 100644 index 00000000..125ac189 --- /dev/null +++ b/src/test/resources/mapping-configuration-interdependent.conf @@ -0,0 +1,88 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify a reasonably complex set of interdependent mappings with multiple +// sources, mapping and sinks with some shared and some not. 
+divolte { + sources { + source-1 { + type = browser + prefix = /source-1 + } + source-2 { + type = browser + prefix = /source-2 + } + source-3 { + type = browser + prefix = /source-3 + } + source-4 { + type = browser + prefix = /source-4 + } + } + + mappings { + mapping-1 = { + sources = [source-1, source-2] + sinks = [sink-1] + } + mapping-2 = { + sources = [source-1, source-2] + sinks = [sink-2, sink-3] + } + mapping-3 = { + sources = [source-3] + sinks = [sink-3] + } + mapping-4 = { + sources = [source-1, source-4] + sinks = [sink-3, sink-4] + } + } + + sinks { + sink-1 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + sink-2 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + sink-3 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + sink-4 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + } +} diff --git a/src/test/resources/mapping-configuration-shared-sink.conf b/src/test/resources/mapping-configuration-shared-sink.conf new file mode 100644 index 00000000..699657a2 --- /dev/null +++ b/src/test/resources/mapping-configuration-shared-sink.conf @@ -0,0 +1,50 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify multiple mappings (with their own sources) that end up on the same sink. +divolte { + sources { + source-1 { + type = browser + prefix = /source-1 + } + source-2 { + type = browser + prefix = /source-2 + } + } + + mappings { + mapping-1 = { + sources = [source-1] + sinks = [only-sink] + } + mapping-2 = { + sources = [source-2] + sinks = [only-sink] + } + } + + sinks { + only-sink { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + } +} diff --git a/src/test/resources/mapping-configuration-shared-source.conf b/src/test/resources/mapping-configuration-shared-source.conf new file mode 100644 index 00000000..03e3f6d2 --- /dev/null +++ b/src/test/resources/mapping-configuration-shared-source.conf @@ -0,0 +1,48 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify a single source with multiple mappings (and sinks). 
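+// (The source only declares its type; with no prefix configured it is served from the default
+// path, which is why the corresponding test sends its event without a source prefix.)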
+divolte { + sources.only-source.type = browser + + mappings { + mapping-1 = { + sources = [only-source] + sinks = [sink-1] + } + mapping-2 = { + sources = [only-source] + sinks = [sink-2] + } + } + + sinks { + sink-1 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + sink-2 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + } +} From 56322448594ce6d230728ce1c2410c78823d5c2a Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 30 Dec 2015 22:28:22 +0100 Subject: [PATCH 44/80] Add missing copyright header. --- .../java/io/divolte/server/EventForwarder.java | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/main/java/io/divolte/server/EventForwarder.java b/src/main/java/io/divolte/server/EventForwarder.java index d6c0c16f..92b960cb 100644 --- a/src/main/java/io/divolte/server/EventForwarder.java +++ b/src/main/java/io/divolte/server/EventForwarder.java @@ -1,3 +1,19 @@ +/* + * Copyright 2015 GoDataDriven B.V. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package io.divolte.server; import com.google.common.collect.ImmutableCollection; From a4030e87754bf469b09765b3dde0393a49365093 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 30 Dec 2015 23:12:46 +0100 Subject: [PATCH 45/80] Factor out most of the browser-source initialisation from the server class. This allows source initialisation to take place in parallel, a win because it tends to be fairly slow. --- .../java/io/divolte/server/BrowserSource.java | 82 +++++++++++++++++++ src/main/java/io/divolte/server/Server.java | 51 +++++------- 2 files changed, 102 insertions(+), 31 deletions(-) create mode 100644 src/main/java/io/divolte/server/BrowserSource.java diff --git a/src/main/java/io/divolte/server/BrowserSource.java b/src/main/java/io/divolte/server/BrowserSource.java new file mode 100644 index 00000000..57801ba1 --- /dev/null +++ b/src/main/java/io/divolte/server/BrowserSource.java @@ -0,0 +1,82 @@ +/* + * Copyright 2015 GoDataDriven B.V. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.divolte.server; + +import com.google.common.collect.ImmutableCollection; +import io.divolte.server.config.ValidatedConfiguration; +import io.divolte.server.js.TrackingJavaScriptResource; +import io.undertow.server.HttpHandler; +import io.undertow.server.handlers.PathHandler; +import io.undertow.util.Methods; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Objects; + +@ParametersAreNonnullByDefault +public class BrowserSource { + private static final Logger logger = LoggerFactory.getLogger(BrowserSource.class); + + private final String sourceName; + private final String pathPrefix; + private final String javascriptName; + private final HttpHandler javascriptHandler; + private final HttpHandler eventHandler; + + public BrowserSource(final ValidatedConfiguration vc, + final String sourceName, + final ImmutableCollection mappingProcessors) { + this(sourceName, + vc.configuration().getBrowserSourceConfiguration(sourceName).prefix, + loadTrackingJavaScript(vc, sourceName), + mappingProcessors); + } + + private BrowserSource(final String sourceName, + final String pathPrefix, + final TrackingJavaScriptResource trackingJavascript, + final ImmutableCollection mappingProcessors) { + this.sourceName = Objects.requireNonNull(sourceName); + this.pathPrefix = Objects.requireNonNull(pathPrefix); + javascriptName = trackingJavascript.getScriptName(); + javascriptHandler = new AllowedMethodsHandler(new JavaScriptHandler(trackingJavascript), Methods.GET); + final EventForwarder processingPoolsForwarder = EventForwarder.create(mappingProcessors); + final ClientSideCookieEventHandler clientSideCookieEventHandler = new ClientSideCookieEventHandler(processingPoolsForwarder); + eventHandler = new AllowedMethodsHandler(clientSideCookieEventHandler, Methods.GET); + } + + public PathHandler attachToPathHandler(PathHandler pathHandler) { + final String javascriptPath = pathPrefix + javascriptName; + pathHandler = pathHandler.addExactPath(javascriptPath, javascriptHandler); + logger.info("Registered source[{}] script location: {}", sourceName, javascriptPath); + final String eventPath = pathPrefix + "csc-event"; + pathHandler = pathHandler.addExactPath(eventPath, eventHandler); + logger.info("Registered source[{}] event handler: {}", sourceName, eventPath); + return pathHandler; + } + + private static TrackingJavaScriptResource loadTrackingJavaScript(final ValidatedConfiguration vc, final String sourceName) { + try { + return TrackingJavaScriptResource.create(vc, sourceName); + } catch (final IOException e) { + throw new UncheckedIOException("Could not precompile tracking JavaScript for source: " + sourceName, e); + } + } +} diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index d69808a2..c9ed9a14 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -16,13 +16,14 @@ package io.divolte.server; -import com.google.common.collect.*; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultimap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Maps; import com.typesafe.config.ConfigFactory; -import io.divolte.server.config.BrowserSourceConfiguration; import io.divolte.server.config.HdfsSinkConfiguration; import io.divolte.server.config.KafkaSinkConfiguration; import 
io.divolte.server.config.ValidatedConfiguration; -import io.divolte.server.js.TrackingJavaScriptResource; import io.divolte.server.processing.ProcessingPool; import io.undertow.Undertow; import io.undertow.server.HttpHandler; @@ -36,7 +37,6 @@ import io.undertow.server.handlers.resource.ResourceHandler; import io.undertow.server.handlers.resource.ResourceManager; import io.undertow.util.Headers; -import io.undertow.util.Methods; import org.apache.hadoop.fs.FileSystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -124,32 +124,29 @@ public Server(final ValidatedConfiguration vc) { .map(source -> Maps.immutableEntry(source, mappingProcessor)); }) .collect(MoreCollectors.toImmutableMultimap()); - PathHandler handler = new PathHandler(); - // Now instantiate all the sources. - for (final String name : vc.configuration().sources.keySet()) { - final ImmutableCollection mappingProcessors = mappingProcessorsBySource.get(name); - final EventForwarder processingPoolsForwarder = EventForwarder.create(mappingProcessors); - final ClientSideCookieEventHandler clientSideCookieEventHandler = new ClientSideCookieEventHandler(processingPoolsForwarder); - final TrackingJavaScriptResource trackingJavaScript = loadTrackingJavaScript(vc, name); - final HttpHandler javascriptHandler = new AllowedMethodsHandler(new JavaScriptHandler(trackingJavaScript), Methods.GET); - final BrowserSourceConfiguration browserSourceConfiguration = vc.configuration().getBrowserSourceConfiguration(name); - final String eventPath = browserSourceConfiguration.prefix + "csc-event"; - final String scriptPath = browserSourceConfiguration.prefix + trackingJavaScript.getScriptName(); - handler = handler.addExactPath(eventPath, new AllowedMethodsHandler(clientSideCookieEventHandler, Methods.GET)); - handler = handler.addExactPath(scriptPath, javascriptHandler); - logger.info("Registered source[{}] script location: {}", name, scriptPath); - logger.info("Registered source[{}] event handler: {}", name, eventPath); + // Now instantiate all the sources. We do this in parallel because instantiation can be quite slow. + final ImmutableMap sources = + vc.configuration().sources.keySet() + .parallelStream() + .map(name -> + Maps.immutableEntry(name, new BrowserSource(vc, name, mappingProcessorsBySource.get(name)))) + .collect(MoreCollectors.toImmutableMap()); + logger.debug("Attaching sources: {}", sources.keySet()); + // Once all created we can attach them to the server. This has to be done sequentially. + PathHandler pathHandler = new PathHandler(); + for (final BrowserSource browserSource : sources.values()) { + pathHandler = browserSource.attachToPathHandler(pathHandler); } - logger.info("Initialized sources: {}", vc.configuration().sources.keySet()); + logger.info("Initialized sources: {}", sources.keySet()); - handler.addExactPath("/ping", PingHandler::handlePingRequest); + pathHandler.addExactPath("/ping", PingHandler::handlePingRequest); if (vc.configuration().global.server.serveStaticResources) { // Catch-all handler; must be last if present. // XXX: Our static resources assume the default 'browser' endpoint. 
- handler.addPrefixPath("/", createStaticResourceHandler()); + pathHandler.addPrefixPath("/", createStaticResourceHandler()); } final SetHeaderHandler headerHandler = - new SetHeaderHandler(handler, Headers.SERVER_STRING, "divolte"); + new SetHeaderHandler(pathHandler, Headers.SERVER_STRING, "divolte"); final HttpHandler canonicalPathHandler = new CanonicalPathHandler(headerHandler); final GracefulShutdownHandler rootHandler = new GracefulShutdownHandler( vc.configuration().global.server.useXForwardedFor ? @@ -163,14 +160,6 @@ public Server(final ValidatedConfiguration vc) { .build(); } - private static TrackingJavaScriptResource loadTrackingJavaScript(final ValidatedConfiguration vc, final String sourceName) { - try { - return TrackingJavaScriptResource.create(vc, sourceName); - } catch (final IOException e) { - throw new RuntimeException("Could not precompile tracking JavaScript for source: " + sourceName, e); - } - } - private static HttpHandler createStaticResourceHandler() { final ResourceManager staticResources = new ClassPathResourceManager(Server.class.getClassLoader(), "static"); From d07b71961a78528d75af828a0791e4b793bb3250 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 31 Dec 2015 11:22:41 +0100 Subject: [PATCH 46/80] Update the example configuration to use the new format. Somewhat embarrassingly, the mapping hadn't been converted to groovy yet. (The mapping hasn't been tested.) --- examples/divolte-collector.conf | 31 +++++--- examples/schema-mapping.conf | 129 ------------------------------- examples/schema-mapping.groovy | 130 ++++++++++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 140 deletions(-) delete mode 100644 examples/schema-mapping.conf create mode 100644 examples/schema-mapping.groovy diff --git a/examples/divolte-collector.conf b/examples/divolte-collector.conf index 9ee0cb8d..4f44ee58 100644 --- a/examples/divolte-collector.conf +++ b/examples/divolte-collector.conf @@ -1,5 +1,5 @@ // -// Copyright 2014 GoDataDriven B.V. +// Copyright 2015 GoDataDriven B.V. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,17 +15,26 @@ // divolte { - tracking { - include "schema-mapping.conf" - schema_file = /some/dir/MySchema.avsc - } + sources { + my_source = { + type = browser + } + + my_sink = { + type = hdfs + file_strategy { + sync_file_after_records = 1000 + sync_file_after_duration = 30 seconds + working_dir = /tmp + publish_dir = /tmp + } + } - hdfs_flusher { - session_binning_file_strategy { - sync_file_after_records = 1000 - sync_file_after_duration = 30 seconds - working_dir = /tmp - publish_dir = /tmp + my_mapping = { + schema_file = /some/dir/MySchema.avsc + mapping_script_file = schema-mapping.groovy + sources = [my_source] + sinks = [my_sink] } } } diff --git a/examples/schema-mapping.conf b/examples/schema-mapping.conf deleted file mode 100644 index 0a24c810..00000000 --- a/examples/schema-mapping.conf +++ /dev/null @@ -1,129 +0,0 @@ -// -// Copyright 2014 GoDataDriven B.V. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. -// - -// to be included under config key: 'divolte.tracking' -schema_mapping { - version = 1 - - regexes { - // Matches the home page - // e.g. http://www.example.com/ - // e.g. http://www.example.com/index.html - home = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/(?:index.html)?$" - - // Match different levels of taxonomy pages (up to three levels deep) - // URL layout is: http://www.example.com/shop/section/category/ - // e.g. http://www.example.com/fashion/jeans/regular/ - taxonomy = "^(?:http|https):\\/\\/[a-z0-9\\.\\-:]+\\/(?:(?[a-z0-9\\-]+)\\/)(?:(?
[a-z0-9\\-]+)\\/)?(?:(?[a-z0-9\\-]+)\\/)?$" - shop = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/[a-z0-9\\-]+\\/$" - section = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/[a-z0-9\\-]+\\/[a-z0-9\\-]+\\/$" - category = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/[a-z0-9\\-]+\\/[a-z0-9\\-]+\\/[a-z0-9\\-]+\\/$" - - // http://www.example.com/products/311381 - product_detail = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/products\\/(?[0-9]{6})$" - - // http://www.example.com/basket - basket = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/basket$" - - // http://www.example.com/search?q=search+phrase - search = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/search\\?q=(?.*)$" - - // http://www.example.com/checkout - checkout = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/checkout$" - } - - fields { - // Simple field mappings - // For fields that are potentially not set, - // make sure that the Avro record field is nullable - firstInSession = firstInSession - timestamp = timestamp - remoteHost = remoteHost - referer = referer - location = location - viewportPixelWidth = viewportPixelWidth - viewportPixelHeight = viewportPixelHeight - screenPixelWidth = screenPixelWidth - screenPixelHeight = screenPixelHeight - devicePixelRatio = devicePixelRatio - partyId = partyId - sessionId = sessionId - pageViewId = pageViewId - - userAgentString = userAgent - userAgentName = userAgentName - userAgentFamily = userAgentFamily - userAgentVendor = userAgentVendor - userAgentType = userAgentType - userAgentVersion = userAgentVersion - userAgentDevicesection = userAgentDevicesection - userAgentOsFamily = userAgentOsFamily - userAgentOsVersion = userAgentOsVersion - userAgentOsVendor = userAgentOsVendor - - // pageType field will be set to the name of the first - // regex in the list that matches the location, or is - // not set if no regex matches (must be nullable in this - // case) - pageType { - type = regex_name - regexes = [home, category, section, shop, product_detail, basket, search, checkout] - field = location - } - - // productId will be set to the named capture group 'product' from - // the regex named product_detail or will not be set if the regex - // does not match the location (must be nullable in this case) - productId { - type = regex_group - regex = product_detail - field = location - group = product - } - - // Similar to productId - shop { - type = regex_group - regex = taxonomy - field = location - group = shop - } - - // Similar to productId - section { - type = regex_group - regex = taxonomy - field = location - group = section - } - - // Similar to productId - category { - type = regex_group - regex = taxonomy - field = location - group = category - } - - // In case of search, capture the search phrase - searchPhrase { - type = regex_group - regex = search - field = location - group = phrase - } - } -} diff --git a/examples/schema-mapping.groovy b/examples/schema-mapping.groovy new file mode 100644 index 00000000..f6c9bcef --- /dev/null +++ b/examples/schema-mapping.groovy @@ -0,0 +1,130 @@ +/* + * Copyright 2015 GoDataDriven B.V. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +mapping { + // Simple field mappings. + // For fields that are potentially not set, + // make sure that the Avro record field is nullable + map firstInSession() onto 'firstInSession' + map timestamp() onto 'ts' + map remoteHost() onto 'remoteHost' + + map referer() onto 'referer' + map location() onto 'location' + map viewportPixelWidth() onto 'viewportPixelWidth' + map viewportPixelHeight() onto 'viewportPixelHeight' + map screenPixelWidth() onto 'screenPixelWidth' + map screenPixelHeight() onto 'screenPixelHeight' + map devicePixelRatio() onto 'devicePixelRatio' + map partyId() onto 'partyId' + map sessionId() onto 'sessionId' + map pageViewId() onto 'pageViewId' + + map userAgentString() onto 'userAgent' + def ua = userAgent() + map ua.name() onto 'userAgentName' + map ua.family() onto 'userAgentFamily' + map ua.vendor() onto 'userAgentVendor' + map ua.type() onto 'userAgentType' + map ua.version() onto 'userAgentVersion' + map ua.deviceCategory() onto 'userAgentDeviceCategory' + map ua.osFamily() onto 'userAgentOsFamily' + map ua.osVersion() onto 'userAgentOsVersion' + map ua.osVendor() onto 'userAgentOsVendor' + + section { + // Pagetype detection + + // Extract the location path; we don't care about the domain. + def locationUri = parse location() to uri + def locationPath = locationUri.path() + + // Matches the home page + // e.g. / + // e.g. /index.html + def homepageMatcher = match /^\/(?:index\.html)?$/ against locationPath + when homepageMatcher.matches apply { + map 'home' onto 'pageType' + exit() + } + + // Viewing product details + // e.g. /products/311381 + def productDetailMatcher = match /^\/product\/([0-9]{6})$/ against locationPath + when productDetailMatcher.matches apply { + map 'product_detail' onto 'pageType' + map productDetailMatcher.group(1) onto 'productId' + exit() + } + + // Search results. + // e.g. /search?q=search+phrase + when locationPath.equalTo('/search') apply { + map 'searchResults' onto 'pageType' + map locationUri.query().value('q') onto 'searchPhrase' + exit() + } + + // Viewing basket + // e.g. /basket + when locationPath.equalTo('/basket') apply { + map 'basket' onto 'pageType' + exit() + } + + // Checkout funnel + // e.g. /checkout + when locationPath.equalTo('/checkout') apply { + map 'checkout' onto 'pageType' + exit() + } + + // Match different levels of taxonomy pages (up to three levels deep) + // URL layout is: http://www.example.com/shop/section/category/ + // e.g. http://www.example.com/fashion/jeans/regular/ + // (These are last due to ambiguity with the special URLs above.) + + // Category + // e.g. /fashion/jeans/regular/ + def categoryMatcher = match /^\/([a-z0-9\-]+)\/([a-z0-9\-]+)\/([a-z0-9\-]+)\/$/ against locationPath + when categoryMatcher.matches() apply { + map 'category' onto 'pageType' + map categoryMatcher.group(1) onto 'shop' + map categoryMatcher.group(2) onto 'section' + map categoryMatcher.group(3) onto 'category' + exit() + } + + // Section + // e.g. /fashion/jeans/ + def sectionMatcher = match /^\/([a-z0-9\-]+)\/([a-z0-9\-]+)\/$/ against locationPath + when sectionMatcher.matches() apply { + map 'section' onto 'pageType' + map sectionMatcher.group(1) onto 'shop' + map sectionMatcher.group(2) onto 'section' + exit() + } + + // Stop + // e.g. 
/fashion/jeans/ + def shopMatcher = match /^\/([a-z0-9\-]+)\/$/ against locationPath + when shopMatcher.matches() apply { + map 'section' onto 'pageType' + map shopMatcher.group(1) onto 'shop' + exit() + } + } +} From 66ca03034c3e9a4504bba32095ffe01260c034ed Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 31 Dec 2015 14:48:22 +0100 Subject: [PATCH 47/80] Refactor the way source configurations are retrieved from the divolte configuration for consistency. Retrieving source configurations is now the same as for sinks. In addition, we now universally throw IllegalArgumentException if the the wrong name and/or type is used. (This is a programming mistake, not a user error, as the javadoc now indicates.) --- .../java/io/divolte/server/BrowserSource.java | 3 +- .../server/config/DivolteConfiguration.java | 58 ++++++++++++++++--- .../server/js/TrackingJavaScriptResource.java | 3 +- 3 files changed, 54 insertions(+), 10 deletions(-) diff --git a/src/main/java/io/divolte/server/BrowserSource.java b/src/main/java/io/divolte/server/BrowserSource.java index 57801ba1..ea8d4a7b 100644 --- a/src/main/java/io/divolte/server/BrowserSource.java +++ b/src/main/java/io/divolte/server/BrowserSource.java @@ -17,6 +17,7 @@ package io.divolte.server; import com.google.common.collect.ImmutableCollection; +import io.divolte.server.config.BrowserSourceConfiguration; import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.js.TrackingJavaScriptResource; import io.undertow.server.HttpHandler; @@ -44,7 +45,7 @@ public BrowserSource(final ValidatedConfiguration vc, final String sourceName, final ImmutableCollection mappingProcessors) { this(sourceName, - vc.configuration().getBrowserSourceConfiguration(sourceName).prefix, + vc.configuration().getSourceConfiguration(sourceName, BrowserSourceConfiguration.class).prefix, loadTrackingJavaScript(vc, sourceName), mappingProcessors); } diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index 216a1964..345ab574 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -3,7 +3,10 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.google.common.base.MoreObjects; import com.google.common.base.Preconditions; -import com.google.common.collect.*; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; import io.divolte.server.config.constraint.MappingSourceSinkReferencesMustExist; import io.divolte.server.config.constraint.OneSchemaPerSink; import io.divolte.server.config.constraint.SourceAndSinkNamesCannotCollide; @@ -37,23 +40,62 @@ public final class DivolteConfiguration { this.mappings = mappings.orElseGet(() -> defaultMappingConfigurations(this.sources.keySet(), this.sinks.keySet())); } - public BrowserSourceConfiguration getBrowserSourceConfiguration(final String sourceName) { + /** + * Retrieve the configuration for the source with the given name, casting it to an expected type. + * + * It is an error to request a source that doesn't exist or is of the wrong type: the caller is + * responsible for knowing the name is valid and the type of source. + * + * @param sourceName the name of the source whose configuration should be retrieved. + * @param sourceClass the class of the source configuration to retrieve. 
+     * @param <T>         the type of the source configuration to retrieve.
+     * @return the configuration for the given source.
+     * @throws IllegalArgumentException
+     *         if no configuration exists for the given source or its type is different
+     *         to that expected.
+     */
+    public <T extends SourceConfiguration> T getSourceConfiguration(final String sourceName, final Class<T> sourceClass) {
         final SourceConfiguration sourceConfiguration = sources.get(sourceName);
-        Objects.requireNonNull(sourceConfiguration, () -> "No source configuration with name: " + sourceName);
-        Preconditions.checkArgument(sourceConfiguration instanceof BrowserSourceConfiguration,
-                                    "Source configuration '%s' is not a browser source", sourceName);
-        return (BrowserSourceConfiguration)sourceConfiguration;
+        Preconditions.checkArgument(null != sourceConfiguration, "No source configuration with name: %s", sourceName);
+        Preconditions.checkArgument(sourceClass.isInstance(sourceConfiguration),
+                                    "Source configuration '%s' is not a %s source", sourceName, sourceClass.getSimpleName());
+        return sourceClass.cast(sourceConfiguration);
     }
 
+    /**
+     * Retrieve the configuration for the mapping with the given name.
+     *
+     * It is an error to request a mapping that doesn't exist: the caller is responsible for knowing
+     * the name is valid.
+     *
+     * @param mappingName the name of the mapping whose configuration should be retrieved.
+     * @return the configuration for the given mapping.
+     * @throws IllegalArgumentException
+     *         if no configuration exists for the given mapping.
+     */
     public MappingConfiguration getMappingConfiguration(final String mappingName) {
         final MappingConfiguration mappingConfiguration = mappings.get(mappingName);
-        Objects.requireNonNull(mappingConfiguration, () -> "No mapping configuration with name: " + mappingName);
+        Preconditions.checkArgument(null != mappingConfiguration, "No mapping configuration with name: %s", mappingName);
         return mappingConfiguration;
     }
 
+    /**
+     * Retrieve the configuration for the sink with the given name, casting it to an expected type.
+     *
+     * It is an error to request a sink that doesn't exist or is of the wrong type: the caller is
+     * responsible for knowing the name is valid and the type of sink.
+     *
+     * @param sinkName  the name of the sink whose configuration should be retrieved.
+     * @param sinkClass the class of the sink configuration to retrieve.
+     * @param <T>       the type of the sink configuration to retrieve.
+     * @return the configuration for the given sink.
+     * @throws IllegalArgumentException
+     *         if no configuration exists for the given sink or its type is different
+     *         to that expected.
+ */ public T getSinkConfiguration(final String sinkName, final Class sinkClass) { final SinkConfiguration sinkConfiguration = sinks.get(sinkName); - Objects.requireNonNull(sinkConfiguration, () -> "No sink configuration with name: " + sinkName); + Preconditions.checkArgument(null != sinkConfiguration, "No sink configuration with name: %s", sinkName); Preconditions.checkArgument(sinkClass.isInstance(sinkConfiguration), "Sink configuration '%s' is not a %s sink", sinkName, sinkClass.getSimpleName()); return sinkClass.cast(sinkConfiguration); diff --git a/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java b/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java index e195f989..33a5b211 100644 --- a/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java +++ b/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java @@ -69,7 +69,8 @@ private static int trimLongToMaxInt(long duration) { public static TrackingJavaScriptResource create(final ValidatedConfiguration vc, final String sourceName) throws IOException { - final BrowserSourceConfiguration browserSourceConfiguration = vc.configuration().getBrowserSourceConfiguration(sourceName); + final BrowserSourceConfiguration browserSourceConfiguration = + vc.configuration().getSourceConfiguration(sourceName, BrowserSourceConfiguration.class); return new TrackingJavaScriptResource(browserSourceConfiguration.javascript.name, createScriptConstants(browserSourceConfiguration), browserSourceConfiguration.javascript.debug); From bf0677c3dd2a048e79d3b3df6c162c6b6b0c0967 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 31 Dec 2015 16:50:31 +0100 Subject: [PATCH 48/80] Annotate some methods whose overrides should invoke super. --- src/main/java/io/divolte/server/config/SinkConfiguration.java | 2 ++ .../java/io/divolte/server/config/SinkTypeConfiguration.java | 2 ++ src/main/java/io/divolte/server/config/SourceConfiguration.java | 2 ++ 3 files changed, 6 insertions(+) diff --git a/src/main/java/io/divolte/server/config/SinkConfiguration.java b/src/main/java/io/divolte/server/config/SinkConfiguration.java index f9398d34..4019bb9d 100644 --- a/src/main/java/io/divolte/server/config/SinkConfiguration.java +++ b/src/main/java/io/divolte/server/config/SinkConfiguration.java @@ -8,6 +8,7 @@ import io.divolte.server.SchemaRegistry; import io.divolte.server.processing.ProcessingPool; +import javax.annotation.OverridingMethodsMustInvokeSuper; import javax.annotation.ParametersAreNonnullByDefault; @JsonTypeInfo(use=JsonTypeInfo.Id.NAME, include=JsonTypeInfo.As.PROPERTY, property = "type") @@ -17,6 +18,7 @@ }) @ParametersAreNonnullByDefault public abstract class SinkConfiguration { + @OverridingMethodsMustInvokeSuper protected MoreObjects.ToStringHelper toStringHelper() { return MoreObjects.toStringHelper(this); } diff --git a/src/main/java/io/divolte/server/config/SinkTypeConfiguration.java b/src/main/java/io/divolte/server/config/SinkTypeConfiguration.java index 70d8a423..b8d52fe0 100644 --- a/src/main/java/io/divolte/server/config/SinkTypeConfiguration.java +++ b/src/main/java/io/divolte/server/config/SinkTypeConfiguration.java @@ -2,6 +2,7 @@ import com.google.common.base.MoreObjects; +import javax.annotation.OverridingMethodsMustInvokeSuper; import javax.annotation.ParametersAreNonnullByDefault; @ParametersAreNonnullByDefault @@ -17,6 +18,7 @@ protected SinkTypeConfiguration(final int bufferSize, final int threads, final b this.enabled = enabled; } + @OverridingMethodsMustInvokeSuper protected 
MoreObjects.ToStringHelper toStringHelper() { return MoreObjects.toStringHelper(this) .add("enabled", enabled) diff --git a/src/main/java/io/divolte/server/config/SourceConfiguration.java b/src/main/java/io/divolte/server/config/SourceConfiguration.java index 114a84db..a6894200 100644 --- a/src/main/java/io/divolte/server/config/SourceConfiguration.java +++ b/src/main/java/io/divolte/server/config/SourceConfiguration.java @@ -4,6 +4,7 @@ import com.fasterxml.jackson.annotation.JsonTypeInfo; import com.google.common.base.MoreObjects; +import javax.annotation.OverridingMethodsMustInvokeSuper; import javax.annotation.ParametersAreNonnullByDefault; @JsonTypeInfo(use=JsonTypeInfo.Id.NAME, include=JsonTypeInfo.As.PROPERTY, property = "type") @@ -12,6 +13,7 @@ }) @ParametersAreNonnullByDefault public abstract class SourceConfiguration { + @OverridingMethodsMustInvokeSuper protected MoreObjects.ToStringHelper toStringHelper() { return MoreObjects.toStringHelper(this); } From ed789fad675faaf4eaef33c710ee26c044270f24 Mon Sep 17 00:00:00 2001 From: Friso van Vollenhoven Date: Mon, 1 Feb 2016 17:59:55 +0100 Subject: [PATCH 49/80] Move all mapping onto a single thread pool. --- .../java/io/divolte/server/BrowserSource.java | 29 ++- .../server/ClientSideCookieEventHandler.java | 14 +- .../io/divolte/server/EventForwarder.java | 80 ------ .../server/IncomingRequestProcessingPool.java | 54 ++-- .../server/IncomingRequestProcessor.java | 236 ++++++++++-------- src/main/java/io/divolte/server/Mapping.java | 112 +++++++++ src/main/java/io/divolte/server/Server.java | 53 ++-- .../server/config/DivolteConfiguration.java | 57 ++++- .../server/config/ValidatedConfiguration.java | 2 +- 9 files changed, 358 insertions(+), 279 deletions(-) delete mode 100644 src/main/java/io/divolte/server/EventForwarder.java create mode 100644 src/main/java/io/divolte/server/Mapping.java diff --git a/src/main/java/io/divolte/server/BrowserSource.java b/src/main/java/io/divolte/server/BrowserSource.java index 57801ba1..41102dc2 100644 --- a/src/main/java/io/divolte/server/BrowserSource.java +++ b/src/main/java/io/divolte/server/BrowserSource.java @@ -16,19 +16,20 @@ package io.divolte.server; -import com.google.common.collect.ImmutableCollection; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Objects; + +import javax.annotation.ParametersAreNonnullByDefault; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.js.TrackingJavaScriptResource; import io.undertow.server.HttpHandler; import io.undertow.server.handlers.PathHandler; import io.undertow.util.Methods; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Objects; @ParametersAreNonnullByDefault public class BrowserSource { @@ -40,25 +41,27 @@ public class BrowserSource { private final HttpHandler javascriptHandler; private final HttpHandler eventHandler; + public BrowserSource(final ValidatedConfiguration vc, final String sourceName, - final ImmutableCollection mappingProcessors) { + final IncomingRequestProcessingPool processingPool) { this(sourceName, vc.configuration().getBrowserSourceConfiguration(sourceName).prefix, loadTrackingJavaScript(vc, sourceName), - mappingProcessors); + processingPool, + vc.configuration().sourceIndex(sourceName)); } private BrowserSource(final String sourceName, final 
String pathPrefix, final TrackingJavaScriptResource trackingJavascript, - final ImmutableCollection mappingProcessors) { + final IncomingRequestProcessingPool processingPool, + final int sourceIndex) { this.sourceName = Objects.requireNonNull(sourceName); this.pathPrefix = Objects.requireNonNull(pathPrefix); javascriptName = trackingJavascript.getScriptName(); javascriptHandler = new AllowedMethodsHandler(new JavaScriptHandler(trackingJavascript), Methods.GET); - final EventForwarder processingPoolsForwarder = EventForwarder.create(mappingProcessors); - final ClientSideCookieEventHandler clientSideCookieEventHandler = new ClientSideCookieEventHandler(processingPoolsForwarder); + final ClientSideCookieEventHandler clientSideCookieEventHandler = new ClientSideCookieEventHandler(processingPool, sourceIndex); eventHandler = new AllowedMethodsHandler(clientSideCookieEventHandler, Methods.GET); } diff --git a/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java b/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java index 1ce175d8..2010b17e 100644 --- a/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java +++ b/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java @@ -49,7 +49,8 @@ public final class ClientSideCookieEventHandler implements HttpHandler { private final static String SENTINEL_ETAG_VALUE = SENTINEL_ETAG.toString(); private final ByteBuffer transparentImage; - private final EventForwarder processingPools; + private final IncomingRequestProcessingPool processingPool; + private final int sourceIndex; private static final String TRUE_STRING = "t"; @@ -75,8 +76,9 @@ public final class ClientSideCookieEventHandler implements HttpHandler { static final String EVENT_SOURCE_NAME = "browser"; - public ClientSideCookieEventHandler(final EventForwarder processingPools) { - this.processingPools = Objects.requireNonNull(processingPools); + public ClientSideCookieEventHandler(final IncomingRequestProcessingPool processingPool, final int sourceIndex) { + this.sourceIndex = sourceIndex; + this.processingPool = Objects.requireNonNull(processingPool); try { this.transparentImage = ByteBuffer.wrap( @@ -124,10 +126,6 @@ public void handleRequest(final HttpServerExchange exchange) { // If an ETag is present, this is a duplicate event. if (ETagUtils.handleIfNoneMatch(exchange, SENTINEL_ETAG, true)) { - /* - * Subclasses are responsible to logging events. - * We just ensure the pixel is always returned, no matter what. - */ try { logEvent(exchange); } finally { @@ -175,7 +173,7 @@ private void handleRequestIfComplete(final HttpServerExchange exchange) throws I isNewPartyId, isFirstInSession, exchange); logger.debug("Enqueuing event (client generated cookies): {}/{}/{}/{}", partyId, sessionId, pageViewId, eventId); - processingPools.forward(Item.of(0, partyId.value, event)); + processingPool.enqueue(Item.of(sourceIndex, partyId.value, event)); } static DivolteEvent buildBrowserEventData(final boolean corruptEvent, diff --git a/src/main/java/io/divolte/server/EventForwarder.java b/src/main/java/io/divolte/server/EventForwarder.java deleted file mode 100644 index 92b960cb..00000000 --- a/src/main/java/io/divolte/server/EventForwarder.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2015 GoDataDriven B.V. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.divolte.server; - -import com.google.common.collect.ImmutableCollection; -import com.google.common.collect.Iterables; -import io.divolte.server.processing.Item; -import io.divolte.server.processing.ProcessingPool; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.util.Objects; - -@ParametersAreNonnullByDefault -public abstract class EventForwarder { - public abstract void forward(final Item event); - - private static EventForwarder EMPTY_FORWARDER = new NoopEventForwarder<>(); - - @ParametersAreNonnullByDefault - private final static class NoopEventForwarder extends EventForwarder { - @Override - public void forward(final Item event) { - // Nothing to do. - } - } - - @ParametersAreNonnullByDefault - private final static class SingleReceiverEventForwarder extends EventForwarder { - private final ProcessingPool receiver; - - private SingleReceiverEventForwarder(final ProcessingPool receiver) { - this.receiver = Objects.requireNonNull(receiver); - } - - @Override - public void forward(final Item event) { - receiver.enqueue(event); - } - } - - private final static class MultipleReceiverEventForwarder extends EventForwarder { - private final ImmutableCollection> receivers; - - private MultipleReceiverEventForwarder(final ImmutableCollection> receivers) { - this.receivers = Objects.requireNonNull(receivers); - } - - @Override - public void forward(final Item event) { - receivers.forEach(receiver -> receiver.enqueue(event)); - } - } - - static EventForwarder create(final ImmutableCollection> receivers) { - switch (receivers.size()) { - case 0: - @SuppressWarnings("unchecked") - final EventForwarder emptyForwarder = (EventForwarder) EMPTY_FORWARDER; - return emptyForwarder; - case 1: - return new SingleReceiverEventForwarder<>(Iterables.getOnlyElement(receivers)); - default: - return new MultipleReceiverEventForwarder<>(receivers); - } - } -} diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java index 219dc559..f30d9333 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java @@ -16,66 +16,50 @@ package io.divolte.server; -import com.google.common.collect.ImmutableSet; +import java.io.IOException; +import java.nio.file.Paths; +import java.util.Optional; + +import javax.annotation.ParametersAreNonnullByDefault; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.collect.ImmutableMap; + import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.ip2geo.ExternalDatabaseLookupService; import io.divolte.server.ip2geo.LookupService; import io.divolte.server.processing.ProcessingPool; -import org.apache.avro.Schema; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.io.IOException; -import java.nio.file.Paths; -import java.util.Optional; -import java.util.function.Function; @ParametersAreNonnullByDefault final class 
IncomingRequestProcessingPool extends ProcessingPool { private final static Logger logger = LoggerFactory.getLogger(IncomingRequestProcessingPool.class); public IncomingRequestProcessingPool(final ValidatedConfiguration vc, - final String name, final SchemaRegistry schemaRegistry, - final Function>> sinkProvider, + final ImmutableMap> sinksByName, final IncomingRequestListener listener) { this ( - vc.configuration().global.mapper.threads, - vc.configuration().global.mapper.bufferSize, vc, - name, - schemaRegistry.getSchemaByMappingName(name), - buildSinksForwarder(sinkProvider, vc.configuration().mappings.get(name).sinks), + schemaRegistry, + sinksByName, lookupServiceFromConfig(vc), listener ); } - private static EventForwarder buildSinksForwarder(final Function>> sinkProvider, - final ImmutableSet sinkNames) { - // Some sinks may not be available via the provider: these have been globally disabled. - return EventForwarder.create(sinkNames.stream() - .map(sinkProvider::apply) - .filter(Optional::isPresent) - .map(Optional::get) - .collect(MoreCollectors.toImmutableList())); - } - public IncomingRequestProcessingPool( - final int numThreads, - final int maxQueueSize, final ValidatedConfiguration vc, - final String name, - final Schema schema, - final EventForwarder flushingPools, + final SchemaRegistry schemaRegistry, + final ImmutableMap> sinksByName, final Optional geoipLookupService, final IncomingRequestListener listener) { super( - numThreads, - maxQueueSize, + vc.configuration().global.mapper.threads, + vc.configuration().global.mapper.bufferSize, "Incoming Request Processor", - () -> new IncomingRequestProcessor(vc, name, flushingPools, geoipLookupService, schema, listener)); + () -> new IncomingRequestProcessor(vc, sinksByName, geoipLookupService, schemaRegistry, listener)); } private static Optional lookupServiceFromConfig(final ValidatedConfiguration vc) { diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessor.java b/src/main/java/io/divolte/server/IncomingRequestProcessor.java index 45c2cdce..03cf24a3 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessor.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessor.java @@ -16,27 +16,29 @@ package io.divolte.server; -import io.divolte.record.DefaultEventRecord; -import io.divolte.server.config.MappingConfiguration; +import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; + +import java.util.ArrayList; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import javax.annotation.ParametersAreNonnullByDefault; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; + import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.ip2geo.LookupService; import io.divolte.server.processing.Item; import io.divolte.server.processing.ItemProcessor; -import io.divolte.server.recordmapping.DslRecordMapper; -import io.divolte.server.recordmapping.DslRecordMapping; -import io.divolte.server.recordmapping.RecordMapper; -import io.divolte.server.recordmapping.UserAgentParserAndCache; +import io.divolte.server.processing.ProcessingPool; import io.undertow.util.AttachmentKey; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import 
javax.annotation.ParametersAreNonnullByDefault; -import java.util.Objects; -import java.util.Optional; - -import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.CONTINUE; @ParametersAreNonnullByDefault public final class IncomingRequestProcessor implements ItemProcessor { @@ -44,101 +46,139 @@ public final class IncomingRequestProcessor implements ItemProcessor DUPLICATE_EVENT_KEY = AttachmentKey.create(Boolean.class); - private final EventForwarder flushingPools; - - private final IncomingRequestListener listener; - - private final RecordMapper mapper; - - private final boolean keepCorrupted; - private final ShortTermDuplicateMemory memory; - private final boolean keepDuplicates; + + private final ImmutableList> mappingsBySourceIndex; + private final ImmutableList>> sinksByMappingIndex; public IncomingRequestProcessor(final ValidatedConfiguration vc, - final String name, - final EventForwarder flushingPools, + final ImmutableMap> sinksByName, final Optional geoipLookupService, - final Schema schema, + final SchemaRegistry schemaRegistry, final IncomingRequestListener listener) { - this.flushingPools = flushingPools; - this.listener = Objects.requireNonNull(listener); - - final MappingConfiguration mappingConfiguration = vc.configuration().getMappingConfiguration(name); - keepCorrupted = !mappingConfiguration.discardCorrupted; memory = new ShortTermDuplicateMemory(vc.configuration().global.mapper.duplicateMemorySize); - keepDuplicates = !mappingConfiguration.discardDuplicates; - - mapper = mappingConfiguration.mappingScriptFile - .map((mappingScriptFile) -> { - logger.info("Using script based schema mapping."); - return new DslRecordMapper(vc, mappingScriptFile, Objects.requireNonNull(schema), geoipLookupService); - }).orElseGet(() -> { - logger.info("Using built in default schema mapping."); - return new DslRecordMapper(DefaultEventRecord.getClassSchema(), defaultRecordMapping(vc)); - }); - } - - private DslRecordMapping defaultRecordMapping(final ValidatedConfiguration vc) { - final DslRecordMapping result = new DslRecordMapping(DefaultEventRecord.getClassSchema(), new UserAgentParserAndCache(vc), Optional.empty()); - result.map("detectedCorruption", result.corrupt()); - result.map("detectedDuplicate", result.duplicate()); - result.map("firstInSession", result.firstInSession()); - result.map("timestamp", result.timestamp()); - result.map("clientTimestamp", result.clientTimestamp()); - result.map("remoteHost", result.remoteHost()); - result.map("referer", result.referer()); - result.map("location", result.location()); - result.map("viewportPixelWidth", result.viewportPixelWidth()); - result.map("viewportPixelHeight", result.viewportPixelHeight()); - result.map("screenPixelWidth", result.screenPixelWidth()); - result.map("screenPixelHeight", result.screenPixelHeight()); - result.map("partyId", result.partyId()); - result.map("sessionId", result.sessionId()); - result.map("pageViewId", result.pageViewId()); - result.map("eventType", result.eventType()); - result.map("userAgentString", result.userAgentString()); - final DslRecordMapping.UserAgentValueProducer userAgent = result.userAgent(); - result.map("userAgentName", userAgent.name()); - result.map("userAgentFamily", userAgent.family()); - result.map("userAgentVendor", userAgent.vendor()); - result.map("userAgentType", userAgent.type()); - result.map("userAgentVersion", userAgent.version()); - result.map("userAgentDeviceCategory", userAgent.deviceCategory()); - result.map("userAgentOsFamily", userAgent.osFamily()); - 
result.map("userAgentOsVersion", userAgent.osVersion()); - result.map("userAgentOsVendor", userAgent.osVendor()); - return result; + /* + * Create all Mapping instances based on their config. + */ + final Map mappingsByName = vc.configuration() + .mappings + .entrySet() + .stream() + .collect(Collectors.toMap( + (kv) -> kv.getKey(), + (kv) -> new Mapping( + vc, + kv.getKey(), + geoipLookupService, + schemaRegistry, + listener) + )); + + /* + * Create a mapping from source index to a list of Mapping's that apply + * to events generated from that source index. Finally, we use a + * ImmutableList> as result, not a + * Map> because that way the backing + * data structure is effectively a two-dimensional array and no hashing + * is required for retrieval (list indexes are ints already). + */ + final ArrayList> sourceMappingResult = // temporary mutable container for the result + IntStream.range(0, vc.configuration().sources.size()) + .>mapToObj((ignored) -> ImmutableList.of()) // initialized with empty lists per default + .collect(Collectors.toCollection(ArrayList::new)); + + vc.configuration() + .mappings + .entrySet() + .stream() // stream of entries (mapping_name, mapping_configuration) + .flatMap( + (kv) -> kv.getValue() + .sources + .stream() + .map( + s -> Maps.immutableEntry( + vc.configuration().sourceIndex(s), + kv.getKey()))) // Results in stream of (source_index, mapping_name) + .collect(Collectors.groupingBy( + (e) -> e.getKey(), + Collectors.mapping( + e -> mappingsByName.get(e.getValue()), + MoreCollectors.toImmutableList()) + )) // Results in a Map> where the key is the source index + .forEach((idx, m) -> sourceMappingResult.set(idx, m)); // Populate the temporary result in ArrayList> + + mappingsBySourceIndex = ImmutableList.copyOf(sourceMappingResult); // Make immutable copy + + /* + * Create a mapping from mapping index to a list of sinks (ProcessingPools) + * that apply for events that came from the given mapping. Similar as above, + * we transform the result into a list of lists, instead of a map in order + * to make sure the underlying lookups are array index lookups instead of + * hash map lookups. + * + * Note that we need to know the sinks for a mapping here, instead of on the + * sink thread side, since we have one pool per sink at this moment. Later + * we'll likely move to one pool per sink type (i.e. Kafka, HDFS) and leave + * it to that pool to multiplex events to different sinks destinations (HDFS + * files or Kafka topics), which should move this code elsewhere. + */ + final ArrayList>> mappingMappingResult = // temporary mutable container for the result + IntStream.range(0, vc.configuration().mappings.size()) + .>>mapToObj((ignored) -> ImmutableList.of()) // initialized with empty lists per default + .collect(Collectors.toCollection(ArrayList::new)); + + /* + * Without the intermediate variable (collected), The Eclipse compiler's type + * inference doesn't know how to handle this. Don't know about Oracle Java compiler. 
+ */ + final Map>> collected = vc.configuration() + .mappings + .entrySet() + .stream() + .flatMap( + (kv) -> kv.getValue() + .sinks + .stream() + .map( + s -> Maps.immutableEntry( + vc.configuration().mappingIndex(kv.getKey()), + s + ))) + .filter(e -> sinksByName.containsKey(e.getValue())) + .collect(Collectors.groupingBy( + (e) -> e.getKey(), + Collectors.mapping( + e -> sinksByName.get(e.getValue()), + MoreCollectors.toImmutableList() + ) + )); + collected.forEach((idx, s) -> mappingMappingResult.set(idx, s)); + + sinksByMappingIndex = ImmutableList.copyOf(mappingMappingResult); } @Override public ProcessingDirective process(final Item item) { final DivolteEvent event = item.payload; - if (!event.corruptEvent || keepCorrupted) { - /* - * Note: we cannot use the actual query string here, - * as the incoming request processor is agnostic of - * that sort of thing. The request may have come from - * an endpoint that doesn't require a query string, - * but rather generates these IDs on the server side. - */ - final boolean duplicate = memory.isProbableDuplicate(event.partyCookie.value, event.sessionCookie.value, event.eventId); - event.exchange.putAttachment(DUPLICATE_EVENT_KEY, duplicate); - - if (!duplicate || keepDuplicates) { - final GenericRecord avroRecord = mapper.newRecordFromExchange(event); - final AvroRecordBuffer avroBuffer = AvroRecordBuffer.fromRecord( - event.partyCookie, - event.sessionCookie, - event.requestStartTime, - event.clientUtcOffset, - avroRecord); - listener.incomingRequest(event, avroBuffer, avroRecord); - flushingPools.forward(Item.withCopiedAffinity(0, item, avroBuffer)); - } - } + + final boolean duplicate = memory.isProbableDuplicate(event.partyCookie.value, event.sessionCookie.value, event.eventId); + event.exchange.putAttachment(DUPLICATE_EVENT_KEY, duplicate); + + mappingsBySourceIndex.get(item.sourceId) + .stream() // For each mapping that applies to this source + .map(mapping -> mapping.map(item, duplicate)) + .filter(optionalBufferItem -> optionalBufferItem.isPresent()) // Filter discarded for duplication or corruption + .map(Optional::get) + .forEach( + bufferItem -> { + sinksByMappingIndex.get(bufferItem.sourceId) + .stream() // For each sink that applies to this mapping + .forEach(sink -> { + sink.enqueue(bufferItem); + }); + }); return CONTINUE; } } diff --git a/src/main/java/io/divolte/server/Mapping.java b/src/main/java/io/divolte/server/Mapping.java new file mode 100644 index 00000000..97ee87af --- /dev/null +++ b/src/main/java/io/divolte/server/Mapping.java @@ -0,0 +1,112 @@ +package io.divolte.server; + +import java.util.Optional; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import io.divolte.record.DefaultEventRecord; +import io.divolte.server.config.MappingConfiguration; +import io.divolte.server.config.ValidatedConfiguration; +import io.divolte.server.ip2geo.LookupService; +import io.divolte.server.processing.Item; +import io.divolte.server.recordmapping.DslRecordMapper; +import io.divolte.server.recordmapping.DslRecordMapping; +import io.divolte.server.recordmapping.RecordMapper; +import io.divolte.server.recordmapping.UserAgentParserAndCache; + +public class Mapping { + private static final Logger logger = LoggerFactory.getLogger(Mapping.class); + + private final RecordMapper mapper; + private final boolean keepCorrupted; + private final boolean keepDuplicates; + private final int mappingIndex; + + private final IncomingRequestListener 
listener; + + public Mapping( + final ValidatedConfiguration vc, + final String mappingName, + final Optional geoipLookupService, + final SchemaRegistry schemaRegistry, + final IncomingRequestListener listener) { + this.listener = listener; + + final MappingConfiguration mappingConfiguration = vc.configuration().mappings.get(mappingName); + final Schema schema = schemaRegistry.getSchemaByMappingName(mappingName); + + this.mappingIndex = vc.configuration().mappingIndex(mappingName); + this.keepCorrupted = !mappingConfiguration.discardCorrupted; + this.keepDuplicates = !mappingConfiguration.discardDuplicates; + + this.mapper = mappingConfiguration.mappingScriptFile + .map((mappingScriptFile) -> { + logger.info("Using script based schema mapping."); + return new DslRecordMapper(vc, mappingScriptFile, schema, geoipLookupService); + }).orElseGet(() -> { + logger.info("Using built in default schema mapping."); + return new DslRecordMapper(DefaultEventRecord.getClassSchema(), defaultRecordMapping(vc)); + }); + } + + private DslRecordMapping defaultRecordMapping(final ValidatedConfiguration vc) { + final DslRecordMapping result = new DslRecordMapping(DefaultEventRecord.getClassSchema(), new UserAgentParserAndCache(vc), Optional.empty()); + result.map("detectedCorruption", result.corrupt()); + result.map("detectedDuplicate", result.duplicate()); + result.map("firstInSession", result.firstInSession()); + result.map("timestamp", result.timestamp()); + result.map("clientTimestamp", result.clientTimestamp()); + result.map("remoteHost", result.remoteHost()); + result.map("referer", result.referer()); + result.map("location", result.location()); + result.map("viewportPixelWidth", result.viewportPixelWidth()); + result.map("viewportPixelHeight", result.viewportPixelHeight()); + result.map("screenPixelWidth", result.screenPixelWidth()); + result.map("screenPixelHeight", result.screenPixelHeight()); + result.map("partyId", result.partyId()); + result.map("sessionId", result.sessionId()); + result.map("pageViewId", result.pageViewId()); + result.map("eventType", result.eventType()); + result.map("userAgentString", result.userAgentString()); + final DslRecordMapping.UserAgentValueProducer userAgent = result.userAgent(); + result.map("userAgentName", userAgent.name()); + result.map("userAgentFamily", userAgent.family()); + result.map("userAgentVendor", userAgent.vendor()); + result.map("userAgentType", userAgent.type()); + result.map("userAgentVersion", userAgent.version()); + result.map("userAgentDeviceCategory", userAgent.deviceCategory()); + result.map("userAgentOsFamily", userAgent.osFamily()); + result.map("userAgentOsVersion", userAgent.osVersion()); + result.map("userAgentOsVendor", userAgent.osVendor()); + return result; + } + + public Optional> map(final Item item, final boolean duplicate) { + final DivolteEvent event = item.payload; + if ( + (keepDuplicates || !duplicate) && + (keepCorrupted || !event.corruptEvent)) { + final GenericRecord avroRecord = mapper.newRecordFromExchange(event); + final AvroRecordBuffer avroBuffer = AvroRecordBuffer.fromRecord( + event.partyCookie, + event.sessionCookie, + event.requestStartTime, + event.clientUtcOffset, + avroRecord); + + /* + * We should really think of a way to get rid of this and test the + * mapping process in isolation of the server. + * In the many-to-many setup, this call is potentially amplified. 
+ */ + listener.incomingRequest(event, avroBuffer, avroRecord); + + return Optional.of(Item.withCopiedAffinity(mappingIndex, item, avroBuffer)); + } else { + return Optional.empty(); + } + } +} diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index c9ed9a14..51d0f0f9 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -16,11 +16,22 @@ package io.divolte.server; +import java.io.IOException; +import java.time.Duration; +import java.util.Map; +import java.util.Optional; + +import javax.annotation.ParametersAreNonnullByDefault; + +import org.apache.hadoop.fs.FileSystem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableMultimap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Maps; import com.typesafe.config.ConfigFactory; + import io.divolte.server.config.HdfsSinkConfiguration; import io.divolte.server.config.KafkaSinkConfiguration; import io.divolte.server.config.ValidatedConfiguration; @@ -37,16 +48,6 @@ import io.undertow.server.handlers.resource.ResourceHandler; import io.undertow.server.handlers.resource.ResourceManager; import io.undertow.util.Headers; -import org.apache.hadoop.fs.FileSystem; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.io.IOException; -import java.time.Duration; -import java.util.Map; -import java.util.Optional; -import java.util.function.Function; @ParametersAreNonnullByDefault public final class Server implements Runnable { @@ -56,7 +57,7 @@ public final class Server implements Runnable { private final GracefulShutdownHandler shutdownHandler; private final ImmutableMap> sinks; - private final ImmutableMap mappingProcessors; + private final IncomingRequestProcessingPool incomingRequestProcessingPool; private final Optional host; private final int port; @@ -99,37 +100,15 @@ public Server(final ValidatedConfiguration vc) { logger.info("Initialized sinks: {}", sinks.keySet()); logger.debug("Initializing mappings..."); - final Function>> schemaProvider = - sinkName -> Optional.ofNullable(sinks.get(sinkName)); - mappingProcessors = - ImmutableMap.copyOf(Maps.transformEntries(vc.configuration().mappings, - (mappingName, config) -> - new IncomingRequestProcessingPool(vc, - mappingName, - schemaRegistry, - schemaProvider, - listener))); - logger.info("Initialized mappings: {}", mappingProcessors.keySet()); + incomingRequestProcessingPool = new IncomingRequestProcessingPool(vc, schemaRegistry, sinks, listener); logger.debug("Initializing sources..."); - // First build a list of which mappings are used by each source. - final ImmutableMultimap mappingProcessorsBySource = - vc.configuration().mappings.entrySet() - .stream() - .flatMap(mappingConfig -> { - final IncomingRequestProcessingPool mappingProcessor = mappingProcessors.get(mappingConfig.getKey()); - return mappingConfig.getValue() - .sources - .stream() - .map(source -> Maps.immutableEntry(source, mappingProcessor)); - }) - .collect(MoreCollectors.toImmutableMultimap()); // Now instantiate all the sources. We do this in parallel because instantiation can be quite slow. 
final ImmutableMap sources = vc.configuration().sources.keySet() .parallelStream() .map(name -> - Maps.immutableEntry(name, new BrowserSource(vc, name, mappingProcessorsBySource.get(name)))) + Maps.immutableEntry(name, new BrowserSource(vc, name, incomingRequestProcessingPool))) .collect(MoreCollectors.toImmutableMap()); logger.debug("Attaching sources: {}", sources.keySet()); // Once all created we can attach them to the server. This has to be done sequentially. @@ -194,7 +173,7 @@ public void shutdown() { logger.info("Stopping thread pools."); // Stop the mappings before the sinks to ensure work in progress doesn't get stranded. - mappingProcessors.values().forEach(ProcessingPool::stop); + incomingRequestProcessingPool.stop(); sinks.values().forEach(ProcessingPool::stop); logger.info("Closing HDFS filesystem connection."); diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index 216a1964..9a2b83f3 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -1,19 +1,29 @@ package io.divolte.server.config; +import java.util.HashSet; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import javax.annotation.ParametersAreNonnullByDefault; +import javax.validation.Valid; + import com.fasterxml.jackson.annotation.JsonCreator; import com.google.common.base.MoreObjects; import com.google.common.base.Preconditions; -import com.google.common.collect.*; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + import io.divolte.server.config.constraint.MappingSourceSinkReferencesMustExist; import io.divolte.server.config.constraint.OneSchemaPerSink; import io.divolte.server.config.constraint.SourceAndSinkNamesCannotCollide; -import javax.annotation.ParametersAreNonnullByDefault; -import javax.validation.Valid; -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; - @ParametersAreNonnullByDefault @MappingSourceSinkReferencesMustExist @SourceAndSinkNamesCannotCollide @@ -37,6 +47,39 @@ public final class DivolteConfiguration { this.mappings = mappings.orElseGet(() -> defaultMappingConfigurations(this.sources.keySet(), this.sinks.keySet())); } + /* + * This performs a linear search over the map. Only use in startup code; + * avoid in inner loops. + */ + private static int position(final T key, final ImmutableMap map) { + final ImmutableList keyList = map.keySet().asList(); + return keyList.indexOf(key); + } + + /** + * This performs a linear search over the map. Only use in startup code; + * avoid in inner loops. + */ + public int sourceIndex(final String name) { + return position(name, sources); + } + + /** + * This performs a linear search over the map. Only use in startup code; + * avoid in inner loops. + */ + public int sinkIndex(final String name) { + return position(name, sinks); + } + + /** + * This performs a linear search over the map. Only use in startup code; + * avoid in inner loops. 
+ */ + public int mappingIndex(final String name) { + return position(name, mappings); + } + public BrowserSourceConfiguration getBrowserSourceConfiguration(final String sourceName) { final SourceConfiguration sourceConfiguration = sources.get(sourceName); Objects.requireNonNull(sourceConfiguration, () -> "No source configuration with name: " + sourceName); diff --git a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java index 58b3780d..b151189b 100644 --- a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java +++ b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java @@ -176,7 +176,7 @@ private static DivolteConfiguration mapped(final Config input) throws IOExceptio mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true); // Deserialization for Duration - final SimpleModule module= new SimpleModule("Configuration Deserializers"); + final SimpleModule module = new SimpleModule("Configuration Deserializers"); module.addDeserializer(Duration.class, new DurationDeserializer()); module.addDeserializer(Properties.class, new PropertiesDeserializer()); From 7eeb399481cf1fe6ed9be45a5cb74a57be7b55a5 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Mar 2016 16:42:20 +0100 Subject: [PATCH 50/80] Reflow some of the streaming. Also substitute some method handles. --- .../server/IncomingRequestProcessor.java | 89 +++++++------------ 1 file changed, 32 insertions(+), 57 deletions(-) diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessor.java b/src/main/java/io/divolte/server/IncomingRequestProcessor.java index 03cf24a3..e75c5017 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessor.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessor.java @@ -26,9 +26,6 @@ import javax.annotation.ParametersAreNonnullByDefault; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; @@ -42,13 +39,13 @@ @ParametersAreNonnullByDefault public final class IncomingRequestProcessor implements ItemProcessor { - private static final Logger logger = LoggerFactory.getLogger(IncomingRequestProcessor.class); - public static final AttachmentKey DUPLICATE_EVENT_KEY = AttachmentKey.create(Boolean.class); private final ShortTermDuplicateMemory memory; + // Given a source index, which mappings do we need to apply. private final ImmutableList> mappingsBySourceIndex; + // Given a mapping index, which sinks do we need to send it to. private final ImmutableList>> sinksByMappingIndex; public IncomingRequestProcessor(final ValidatedConfiguration vc, @@ -66,15 +63,12 @@ public IncomingRequestProcessor(final ValidatedConfiguration vc, .mappings .entrySet() .stream() - .collect(Collectors.toMap( - (kv) -> kv.getKey(), - (kv) -> new Mapping( - vc, - kv.getKey(), - geoipLookupService, - schemaRegistry, - listener) - )); + .collect(Collectors.toMap(Map.Entry::getKey, + kv -> new Mapping(vc, + kv.getKey(), + geoipLookupService, + schemaRegistry, + listener))); /* * Create a mapping from source index to a list of Mapping's that apply @@ -84,30 +78,25 @@ public IncomingRequestProcessor(final ValidatedConfiguration vc, * data structure is effectively a two-dimensional array and no hashing * is required for retrieval (list indexes are ints already). 
*/ - final ArrayList> sourceMappingResult = // temporary mutable container for the result + final ArrayList> sourceMappingResult = // temporary mutable container for the result IntStream.range(0, vc.configuration().sources.size()) - .>mapToObj((ignored) -> ImmutableList.of()) // initialized with empty lists per default + .>mapToObj(ignored -> ImmutableList.of()) // initialized with empty lists per default .collect(Collectors.toCollection(ArrayList::new)); vc.configuration() .mappings .entrySet() .stream() // stream of entries (mapping_name, mapping_configuration) - .flatMap( - (kv) -> kv.getValue() - .sources - .stream() - .map( - s -> Maps.immutableEntry( - vc.configuration().sourceIndex(s), - kv.getKey()))) // Results in stream of (source_index, mapping_name) - .collect(Collectors.groupingBy( - (e) -> e.getKey(), - Collectors.mapping( - e -> mappingsByName.get(e.getValue()), - MoreCollectors.toImmutableList()) + .flatMap(kv -> kv.getValue() + .sources + .stream() + .map(s -> Maps.immutableEntry(vc.configuration().sourceIndex(s), + kv.getKey()))) // Results in stream of (source_index, mapping_name) + .collect(Collectors.groupingBy(Map.Entry::getKey, + Collectors.mapping(e -> mappingsByName.get(e.getValue()), + MoreCollectors.toImmutableList()) )) // Results in a Map> where the key is the source index - .forEach((idx, m) -> sourceMappingResult.set(idx, m)); // Populate the temporary result in ArrayList> + .forEach(sourceMappingResult::set); // Populate the temporary result in ArrayList> mappingsBySourceIndex = ImmutableList.copyOf(sourceMappingResult); // Make immutable copy @@ -126,7 +115,7 @@ public IncomingRequestProcessor(final ValidatedConfiguration vc, */ final ArrayList>> mappingMappingResult = // temporary mutable container for the result IntStream.range(0, vc.configuration().mappings.size()) - .>>mapToObj((ignored) -> ImmutableList.of()) // initialized with empty lists per default + .>>mapToObj(ignored -> ImmutableList.of()) // initialized with empty lists per default .collect(Collectors.toCollection(ArrayList::new)); /* @@ -137,24 +126,15 @@ public IncomingRequestProcessor(final ValidatedConfiguration vc, .mappings .entrySet() .stream() - .flatMap( - (kv) -> kv.getValue() - .sinks - .stream() - .map( - s -> Maps.immutableEntry( - vc.configuration().mappingIndex(kv.getKey()), - s - ))) + .flatMap(kv->kv.getValue() + .sinks + .stream() + .map(s -> Maps.immutableEntry(vc.configuration().mappingIndex(kv.getKey()), s))) .filter(e -> sinksByName.containsKey(e.getValue())) - .collect(Collectors.groupingBy( - (e) -> e.getKey(), - Collectors.mapping( - e -> sinksByName.get(e.getValue()), - MoreCollectors.toImmutableList() - ) - )); - collected.forEach((idx, s) -> mappingMappingResult.set(idx, s)); + .collect(Collectors.groupingBy(Map.Entry::getKey, + Collectors.mapping(e -> sinksByName.get(e.getValue()), + MoreCollectors.toImmutableList()))); + collected.forEach(mappingMappingResult::set); sinksByMappingIndex = ImmutableList.copyOf(mappingMappingResult); } @@ -169,16 +149,11 @@ public ProcessingDirective process(final Item item) { mappingsBySourceIndex.get(item.sourceId) .stream() // For each mapping that applies to this source .map(mapping -> mapping.map(item, duplicate)) - .filter(optionalBufferItem -> optionalBufferItem.isPresent()) // Filter discarded for duplication or corruption + .filter(Optional::isPresent) // Filter discarded for duplication or corruption .map(Optional::get) - .forEach( - bufferItem -> { - sinksByMappingIndex.get(bufferItem.sourceId) - .stream() // For each sink 
that applies to this mapping - .forEach(sink -> { - sink.enqueue(bufferItem); - }); - }); + .forEach(bufferItem -> sinksByMappingIndex.get(bufferItem.sourceId) + .stream() // For each sink that applies to this mapping + .forEach(sink -> sink.enqueue(bufferItem))); return CONTINUE; } } From e07c0e0a8322340d9d3021e52ee91a3d26eb4898 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 16 Jun 2016 14:33:28 +0200 Subject: [PATCH 51/80] Consistent null annotation and default handling during configuration initialisation. --- .../config/BrowserSourceConfiguration.java | 21 ++++++++++++------- .../config/FileStrategyConfiguration.java | 15 +++++++------ .../server/config/HdfsSinkConfiguration.java | 12 ++++------- .../config/JavascriptConfiguration.java | 14 ++++++++----- .../server/config/KafkaSinkConfiguration.java | 11 ++++++---- .../server/config/MappingConfiguration.java | 15 ++++++++----- 6 files changed, 52 insertions(+), 36 deletions(-) diff --git a/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java b/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java index 50e1db49..c6fcfe4b 100644 --- a/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java +++ b/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java @@ -1,9 +1,12 @@ package io.divolte.server.config; import java.time.Duration; +import java.util.Objects; import java.util.Optional; +import javax.annotation.Nonnull; import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.ParametersAreNullableByDefault; import javax.validation.Valid; import com.fasterxml.jackson.annotation.JsonCreator; @@ -39,21 +42,23 @@ public class BrowserSourceConfiguration extends SourceConfiguration { public final JavascriptConfiguration javascript; @JsonCreator + @ParametersAreNullableByDefault BrowserSourceConfiguration(@JsonProperty(defaultValue=DEFAULT_PREFIX) final String prefix, - final Optional cookieDomain, + @Nonnull final Optional cookieDomain, @JsonProperty(defaultValue=DEFAULT_PARTY_COOKIE) final String partyCookie, @JsonProperty(defaultValue=DEFAULT_PARTY_TIMEOUT) final Duration partyTimeout, @JsonProperty(defaultValue=DEFAULT_SESSION_COOKIE) final String sessionCookie, @JsonProperty(defaultValue=DEFAULT_SESSION_TIMEOUT) final Duration sessionTimeout, final JavascriptConfiguration javascript) { - // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this - final String rawPrefix = prefix == null ? DEFAULT_PREFIX : prefix; + super(); + // TODO: register a custom deserializer with Jackson that uses the defaultValue property from the annotation to fix this + final String rawPrefix = Optional.ofNullable(prefix).map((p) -> p.endsWith("/") ? p : p + '/').orElse(DEFAULT_PREFIX); this.prefix = rawPrefix.endsWith("/") ? rawPrefix : rawPrefix + '/'; - this.cookieDomain = cookieDomain; - this.partyCookie = partyCookie == null ? DEFAULT_PARTY_COOKIE : partyCookie; - this.partyTimeout = partyTimeout == null ? DurationDeserializer.parseDuration(DEFAULT_PARTY_TIMEOUT) : partyTimeout; - this.sessionCookie = sessionCookie == null ? DEFAULT_SESSION_COOKIE : sessionCookie; - this.sessionTimeout = sessionTimeout == null ? 
DurationDeserializer.parseDuration(DEFAULT_SESSION_TIMEOUT) : sessionTimeout; + this.cookieDomain = Objects.requireNonNull(cookieDomain); + this.partyCookie = Optional.ofNullable(partyCookie).orElse(DEFAULT_PARTY_COOKIE); + this.partyTimeout = Optional.ofNullable(partyTimeout).orElseGet(() -> DurationDeserializer.parseDuration(DEFAULT_PARTY_TIMEOUT)); + this.sessionCookie = Optional.ofNullable(sessionCookie).orElse(DEFAULT_SESSION_COOKIE); + this.sessionTimeout = Optional.ofNullable(sessionTimeout).orElseGet(() -> DurationDeserializer.parseDuration(DEFAULT_SESSION_TIMEOUT)); this.javascript = Optional.ofNullable(javascript).orElse(JavascriptConfiguration.DEFAULT_JAVASCRIPT_CONFIGURATION); } diff --git a/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java b/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java index df1da0c3..5733725d 100644 --- a/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java +++ b/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java @@ -1,8 +1,10 @@ package io.divolte.server.config; import java.time.Duration; +import java.util.Optional; import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.ParametersAreNullableByDefault; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; @@ -31,17 +33,18 @@ public class FileStrategyConfiguration { public final Duration rollEvery; @JsonCreator + @ParametersAreNullableByDefault FileStrategyConfiguration(@JsonProperty(defaultValue=DEFAULT_ROLL_EVERY) final Duration rollEvery, @JsonProperty(defaultValue=DEFAULT_SYNC_FILE_AFTER_RECORDS) final Integer syncFileAfterRecords, @JsonProperty(defaultValue=DEFAULT_SYNC_FILE_AFTER_DURATION) final Duration syncFileAfterDuration, @JsonProperty(defaultValue=DEFAULT_WORKING_DIR) final String workingDir, @JsonProperty(defaultValue=DEFAULT_PUBLISH_DIR) final String publishDir) { - // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this - this.rollEvery = rollEvery == null ? DurationDeserializer.parseDuration(DEFAULT_ROLL_EVERY) : rollEvery; - this.syncFileAfterRecords = syncFileAfterRecords == null ? Integer.valueOf(DEFAULT_SYNC_FILE_AFTER_RECORDS) : syncFileAfterRecords; - this.syncFileAfterDuration = syncFileAfterDuration == null ? DurationDeserializer.parseDuration(DEFAULT_SYNC_FILE_AFTER_DURATION) : syncFileAfterDuration; - this.workingDir = workingDir == null ? DEFAULT_WORKING_DIR : workingDir; - this.publishDir = publishDir == null ? 
DEFAULT_PUBLISH_DIR : publishDir; + // TODO: register a custom deserializer with Jackson that uses the defaultValue property from the annotation to fix this + this.rollEvery = Optional.ofNullable(rollEvery).orElseGet(() -> DurationDeserializer.parseDuration(DEFAULT_ROLL_EVERY)); + this.syncFileAfterRecords = Optional.ofNullable(syncFileAfterRecords).orElseGet(() -> Integer.valueOf(DEFAULT_SYNC_FILE_AFTER_RECORDS)); + this.syncFileAfterDuration = Optional.ofNullable(syncFileAfterDuration).orElseGet(() -> DurationDeserializer.parseDuration(DEFAULT_SYNC_FILE_AFTER_DURATION)); + this.workingDir = Optional.ofNullable(workingDir).orElse(DEFAULT_WORKING_DIR); + this.publishDir = Optional.ofNullable(publishDir).orElse(DEFAULT_PUBLISH_DIR); } @Override diff --git a/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java b/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java index 0088d2f3..bcc1e8cf 100644 --- a/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java +++ b/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java @@ -6,6 +6,7 @@ import io.divolte.server.hdfs.HdfsFlushingPool; import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.ParametersAreNullableByDefault; import java.util.Optional; @ParametersAreNonnullByDefault @@ -16,16 +17,11 @@ public class HdfsSinkConfiguration extends SinkConfiguration { public final FileStrategyConfiguration fileStrategy; @JsonCreator + @ParametersAreNullableByDefault HdfsSinkConfiguration(@JsonProperty(defaultValue=DEFAULT_REPLICATION) final Short replication, final FileStrategyConfiguration fileStrategy) { - // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this - this.replication = replication == null ? Short.valueOf(DEFAULT_REPLICATION) : replication; - /* - * Passing a null defaults to the default strategy. Reason for not making the parameter Optional<...> is - * that this way, we can at some point use a tool to automatically document the configuration objects - * including types. This type of defaults could then be documented through the parameter specific JavaDoc - * for that param. 
- */ + // TODO: register a custom deserializer with Jackson that uses the defaultValue property from the annotation to fix this + this.replication = Optional.ofNullable(replication).orElseGet(() -> Short.valueOf(DEFAULT_REPLICATION)); this.fileStrategy = Optional.ofNullable(fileStrategy).orElse(FileStrategyConfiguration.DEFAULT_FILE_STRATEGY_CONFIGURATION); } diff --git a/src/main/java/io/divolte/server/config/JavascriptConfiguration.java b/src/main/java/io/divolte/server/config/JavascriptConfiguration.java index 6353f2e1..ecdf89f1 100644 --- a/src/main/java/io/divolte/server/config/JavascriptConfiguration.java +++ b/src/main/java/io/divolte/server/config/JavascriptConfiguration.java @@ -1,6 +1,7 @@ package io.divolte.server.config; import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.ParametersAreNullableByDefault; import javax.validation.constraints.NotNull; import javax.validation.constraints.Pattern; @@ -10,6 +11,8 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.MoreObjects; +import java.util.Optional; + @ParametersAreNonnullByDefault public final class JavascriptConfiguration { private static final String DEFAULT_NAME = "divolte.js"; @@ -31,15 +34,16 @@ public final class JavascriptConfiguration { public final boolean autoPageViewEvent; @JsonCreator + @ParametersAreNullableByDefault JavascriptConfiguration(@JsonProperty(defaultValue=DEFAULT_NAME) final String name, @JsonProperty(defaultValue=DEFAULT_LOGGING) final Boolean logging, @JsonProperty(defaultValue=DEFAULT_DEBUG) final Boolean debug, @JsonProperty(defaultValue=DEFAULT_AUTO_PAGE_VIEW_EVENT) final Boolean autoPageViewEvent) { - // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this - this.name = name == null ? DEFAULT_NAME : name; - this.logging = logging == null ? Boolean.valueOf(DEFAULT_LOGGING) : logging; - this.debug = debug == null ? Boolean.valueOf(DEFAULT_DEBUG) : debug; - this.autoPageViewEvent = autoPageViewEvent == null ? 
Boolean.valueOf(DEFAULT_AUTO_PAGE_VIEW_EVENT) : autoPageViewEvent; + // TODO: register a custom deserializer with Jackson that uses the defaultValue property from the annotation to fix this + this.name = Optional.ofNullable(name).orElse(DEFAULT_NAME); + this.logging = Optional.ofNullable(logging).orElseGet(() -> Boolean.valueOf(DEFAULT_LOGGING)); + this.debug = Optional.ofNullable(debug).orElseGet(() -> Boolean.valueOf(DEFAULT_DEBUG)); + this.autoPageViewEvent = Optional.ofNullable(autoPageViewEvent).orElseGet(() -> Boolean.valueOf(DEFAULT_AUTO_PAGE_VIEW_EVENT)); } @Override diff --git a/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java b/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java index 843ed7c1..04f45c8d 100644 --- a/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java +++ b/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java @@ -1,12 +1,14 @@ package io.divolte.server.config; -import javax.annotation.ParametersAreNonnullByDefault; - import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.MoreObjects; import io.divolte.server.kafka.KafkaFlushingPool; +import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.ParametersAreNullableByDefault; +import java.util.Optional; + @ParametersAreNonnullByDefault public class KafkaSinkConfiguration extends SinkConfiguration { private static final String DEFAULT_TOPIC = "divolte"; @@ -14,9 +16,10 @@ public class KafkaSinkConfiguration extends SinkConfiguration { public final String topic; @JsonCreator + @ParametersAreNullableByDefault KafkaSinkConfiguration(@JsonProperty(defaultValue=DEFAULT_TOPIC) final String topic) { - // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this - this.topic = topic == null ? 
DEFAULT_TOPIC : topic; + // TODO: register a custom deserializer with Jackson that uses the defaultValue property from the annotation to fix this + this.topic = Optional.ofNullable(topic).orElse(DEFAULT_TOPIC); } @Override diff --git a/src/main/java/io/divolte/server/config/MappingConfiguration.java b/src/main/java/io/divolte/server/config/MappingConfiguration.java index c5656acd..e92c09af 100644 --- a/src/main/java/io/divolte/server/config/MappingConfiguration.java +++ b/src/main/java/io/divolte/server/config/MappingConfiguration.java @@ -5,6 +5,7 @@ import com.google.common.base.MoreObjects; import com.google.common.collect.ImmutableSet; +import javax.annotation.Nullable; import javax.annotation.ParametersAreNonnullByDefault; import java.util.Objects; import java.util.Optional; @@ -30,15 +31,19 @@ public class MappingConfiguration { final ImmutableSet sources, @JsonProperty(required = true) final ImmutableSet sinks, - @JsonProperty(defaultValue=DEFAULT_DISCARD_CORRUPTED) final Boolean discardCorrupted, - @JsonProperty(defaultValue=DEFAULT_DISCARD_DUPLICATES) final Boolean discardDuplicates) { + @JsonProperty(defaultValue=DEFAULT_DISCARD_CORRUPTED) + @Nullable + final Boolean discardCorrupted, + @JsonProperty(defaultValue=DEFAULT_DISCARD_DUPLICATES) + @Nullable + final Boolean discardDuplicates) { this.schemaFile = Objects.requireNonNull(schemaFile); this.mappingScriptFile = Objects.requireNonNull(mappingScriptFile); this.sources = Objects.requireNonNull(sources); this.sinks = Objects.requireNonNull(sinks); - // TODO: register a custom deserializer with Jackson that uses the defaultValue proprty from the annotation to fix this - this.discardCorrupted = discardCorrupted == null ? Boolean.valueOf(DEFAULT_DISCARD_CORRUPTED) : discardCorrupted; - this.discardDuplicates = discardDuplicates == null ? Boolean.valueOf(DEFAULT_DISCARD_DUPLICATES) : discardDuplicates; + // TODO: register a custom deserializer with Jackson that uses the defaultValue property from the annotation to fix this + this.discardCorrupted = Optional.ofNullable(discardCorrupted).orElseGet(() -> Boolean.valueOf(DEFAULT_DISCARD_CORRUPTED)); + this.discardDuplicates = Optional.ofNullable(discardDuplicates).orElseGet(() -> Boolean.valueOf(DEFAULT_DISCARD_DUPLICATES)); } @Override From 4231060be7777d05d44c530aea15b1790b98f040 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 16 Jun 2016 14:33:59 +0200 Subject: [PATCH 52/80] Fix a typo in the .toString() implementation. --- src/main/java/io/divolte/server/config/MapperConfiguration.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/io/divolte/server/config/MapperConfiguration.java b/src/main/java/io/divolte/server/config/MapperConfiguration.java index 90f60dea..db8f9305 100644 --- a/src/main/java/io/divolte/server/config/MapperConfiguration.java +++ b/src/main/java/io/divolte/server/config/MapperConfiguration.java @@ -34,7 +34,7 @@ public String toString() { .add("bufferSize", bufferSize) .add("threads", threads) .add("duplicateMemorySize", duplicateMemorySize) - .add("userAgentParses", userAgentParser) + .add("userAgentParser", userAgentParser) .add("ip2geoDatabase", ip2geoDatabase) .toString(); } From 7cd5d0f4f693bf399dd42af45869a5dd0d77c3f1 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 16 Jun 2016 14:35:05 +0200 Subject: [PATCH 53/80] Speed up tests by disabling HDFS support unless it's needed for the test. 
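
For context, the idea in a minimal, hypothetical sketch (the helper class and its name are illustrative, not part of this change): tests that do not exercise HDFS layer an override on top of the reference configuration so the HDFS sink machinery never starts, while HDFS-specific test configurations flip the flag back on explicitly.

    import com.typesafe.config.Config;
    import com.typesafe.config.ConfigFactory;

    // Hypothetical helper, for illustration only.
    final class TestConfigs {
        // Most tests never touch HDFS, so layer an override over the reference
        // configuration; only HDFS-specific tests enable it again in their own config.
        static Config withoutHdfs() {
            return ConfigFactory.parseString("divolte.global.hdfs.enabled = false")
                                .withFallback(ConfigFactory.defaultReference())
                                .resolve();
        }

        public static void main(final String[] args) {
            // Prints "false": a server started from this config skips the HDFS flushing pool.
            System.out.println(TestConfigs.withoutHdfs()
                                          .getBoolean("divolte.global.hdfs.enabled"));
        }
    }
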
--- .../io/divolte/server/ServerSinkSourceConfigurationTest.java | 2 +- src/test/java/io/divolte/server/ServerTestUtils.java | 4 ++++ src/test/resources/hdfs-sink-multiple.conf | 1 + src/test/resources/mapping-configuration-explicit.conf | 1 + src/test/resources/mapping-configuration-independent.conf | 2 ++ src/test/resources/mapping-configuration-interdependent.conf | 2 ++ src/test/resources/mapping-configuration-shared-sink.conf | 2 ++ src/test/resources/mapping-configuration-shared-source.conf | 1 + src/test/resources/reference-test.conf | 2 ++ 9 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java b/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java index ddd3276e..1a30cf7f 100644 --- a/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java +++ b/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java @@ -198,7 +198,7 @@ public void shouldSupportUnusedSource() throws IOException { @Test public void shouldSupportDefaultSourceMappingSink() throws IOException, InterruptedException { // Test that with an out-of-the-box default configuration the default source, mapping and sink are present. - startServer(); + startServer(TestServer::createTestServerWithDefaultNonTestConfiguration); final AvroFileLocator avroFileLocator = new AvroFileLocator(Paths.get("/tmp")); request(); testServer.get().waitForEvent(); diff --git a/src/test/java/io/divolte/server/ServerTestUtils.java b/src/test/java/io/divolte/server/ServerTestUtils.java index b13dad18..c476c803 100644 --- a/src/test/java/io/divolte/server/ServerTestUtils.java +++ b/src/test/java/io/divolte/server/ServerTestUtils.java @@ -99,6 +99,10 @@ private TestServer(final int port, final Config config) { server.run(); } + static TestServer createTestServerWithDefaultNonTestConfiguration() { + return new TestServer(findFreePort(), ConfigFactory.defaultReference()); + } + public EventPayload waitForEvent() throws InterruptedException { // SauceLabs can take quite a while to fire up everything. return Optional.ofNullable(events.poll(5, TimeUnit.SECONDS)).orElseThrow(() -> new RuntimeException("Timed out while waiting for server side event to occur.")); diff --git a/src/test/resources/hdfs-sink-multiple.conf b/src/test/resources/hdfs-sink-multiple.conf index db32577e..e49e04ad 100644 --- a/src/test/resources/hdfs-sink-multiple.conf +++ b/src/test/resources/hdfs-sink-multiple.conf @@ -16,6 +16,7 @@ // Specify multiple HDFS sinks for a mapping. divolte { + global.hdfs.enabled = true sources.test-browser-source.type = browser sinks.test-hdfs-sink-1 { diff --git a/src/test/resources/mapping-configuration-explicit.conf b/src/test/resources/mapping-configuration-explicit.conf index 4d2dee5f..c9ab85ad 100644 --- a/src/test/resources/mapping-configuration-explicit.conf +++ b/src/test/resources/mapping-configuration-explicit.conf @@ -16,6 +16,7 @@ // Specify an explicit source-mapping-sink. divolte { + global.hdfs.enabled = true sources.test-browser-source.type = browser sinks.test-hdfs-sink { diff --git a/src/test/resources/mapping-configuration-independent.conf b/src/test/resources/mapping-configuration-independent.conf index 6f5d96c7..d4afcf6f 100644 --- a/src/test/resources/mapping-configuration-independent.conf +++ b/src/test/resources/mapping-configuration-independent.conf @@ -16,6 +16,8 @@ // Specify multiple independent source-mapping-sink chains. 
divolte { + global.hdfs.enabled = true + sources { source-1 { type = browser diff --git a/src/test/resources/mapping-configuration-interdependent.conf b/src/test/resources/mapping-configuration-interdependent.conf index 125ac189..e43db9ea 100644 --- a/src/test/resources/mapping-configuration-interdependent.conf +++ b/src/test/resources/mapping-configuration-interdependent.conf @@ -17,6 +17,8 @@ // Specify a reasonably complex set of interdependent mappings with multiple // sources, mapping and sinks with some shared and some not. divolte { + global.hdfs.enabled = true + sources { source-1 { type = browser diff --git a/src/test/resources/mapping-configuration-shared-sink.conf b/src/test/resources/mapping-configuration-shared-sink.conf index 699657a2..1c65bcee 100644 --- a/src/test/resources/mapping-configuration-shared-sink.conf +++ b/src/test/resources/mapping-configuration-shared-sink.conf @@ -16,6 +16,8 @@ // Specify multiple mappings (with their own sources) that end up on the same sink. divolte { + global.hdfs.enabled = true + sources { source-1 { type = browser diff --git a/src/test/resources/mapping-configuration-shared-source.conf b/src/test/resources/mapping-configuration-shared-source.conf index 03e3f6d2..5a581bc2 100644 --- a/src/test/resources/mapping-configuration-shared-source.conf +++ b/src/test/resources/mapping-configuration-shared-source.conf @@ -16,6 +16,7 @@ // Specify a single source with multiple mappings (and sinks). divolte { + global.hdfs.enabled = true sources.only-source.type = browser mappings { diff --git a/src/test/resources/reference-test.conf b/src/test/resources/reference-test.conf index 5bee2d5a..db5b77e4 100644 --- a/src/test/resources/reference-test.conf +++ b/src/test/resources/reference-test.conf @@ -26,5 +26,7 @@ divolte { buffer_size = 16 threads = 1 } + + hdfs.enabled = false } } From 7c88f892f8f458224d98df355fdcca64518bd8c4 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 16 Jun 2016 14:46:02 +0200 Subject: [PATCH 54/80] Refactor to use a builder for the list of errors. --- .../io/divolte/server/config/ValidatedConfiguration.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java index 221fcf5e..f4ba6a25 100644 --- a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java +++ b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.time.Duration; -import java.util.ArrayList; import java.util.List; import java.util.Optional; import java.util.Properties; @@ -83,7 +82,7 @@ public final class ValidatedConfiguration { * Supplier of the underlying {@code Config} instance. 
*/ public ValidatedConfiguration(final Supplier configLoader) { - final List configurationErrors = new ArrayList<>(); + final ImmutableList.Builder configurationErrors = ImmutableList.builder(); DivolteConfiguration divolteConfiguration; try { @@ -112,10 +111,10 @@ public ValidatedConfiguration(final Supplier configLoader) { divolteConfiguration = null; } catch (final IOException e) { logger.error("Error while reading configuration!", e); - throw new RuntimeException(e); + throw new RuntimeException("Error while reading configuration.", e); } - this.configurationErrors = ImmutableList.copyOf(configurationErrors); + this.configurationErrors = configurationErrors.build(); this.divolteConfiguration = Optional.ofNullable(divolteConfiguration); } From d4bc1a37a8f515a09efb89116a7e6814f9c5ce66 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 16 Jun 2016 14:46:29 +0200 Subject: [PATCH 55/80] Return the value instead of assigning to intermediate reference. --- .../io/divolte/server/config/ValidatedConfiguration.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java index f4ba6a25..c4ce7f69 100644 --- a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java +++ b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java @@ -122,16 +122,15 @@ private String messageForMappingException(final JsonMappingException e) { final String pathToError = e.getPath().stream() .map(Reference::getFieldName) .collect(Collectors.joining(".")); - final String message = String.format( + return String.format( "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'", e.getOriginalMessage(), Optional.ofNullable(e.getLocation()).map(JsonLocation::getSourceRef).orElse(""), "".equals(pathToError) ? "" : pathToError); - return message; } private static String messageForUnrecognizedPropertyException(final UnrecognizedPropertyException e) { - final String message = String.format( + return String.format( "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'%n\tAvailable properties: %s.", e.getOriginalMessage(), e.getLocation().getSourceRef(), @@ -141,7 +140,6 @@ private static String messageForUnrecognizedPropertyException(final Unrecognized e.getKnownPropertyIds().stream() .map(Object::toString).map(s -> "'" + s + "'") .collect(Collectors.joining(", "))); - return message; } private List validate(final DivolteConfiguration divolteConfiguration) { From e079f7e9e1a8c2ef3965bbdc9eb528a7cb8133e0 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 16 Jun 2016 14:47:00 +0200 Subject: [PATCH 56/80] Remove unnecessary exceptions from throw specification. 
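
As a minimal illustration (a hypothetical class, not taken from this change): a throws clause that lists both an exception and its superclass is redundant, because the superclass declaration already covers it.

    import java.io.IOException;
    import java.io.InputStream;
    import java.util.InvalidPropertiesFormatException;

    // Hypothetical example, for illustration only.
    final class ThrowsClauseExample {
        // Redundant: InvalidPropertiesFormatException already extends IOException.
        static void loadVerbose(final InputStream in) throws IOException, InvalidPropertiesFormatException {
            loadTerse(in);
        }

        // Equivalent and simpler: the superclass alone covers both cases.
        static void loadTerse(final InputStream in) throws IOException {
            throw new InvalidPropertiesFormatException("XML properties are not supported.");
        }
    }
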
--- .../java/io/divolte/server/config/ImmutableProperties.java | 3 +-- .../java/io/divolte/server/config/PropertiesDeserializer.java | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/java/io/divolte/server/config/ImmutableProperties.java b/src/main/java/io/divolte/server/config/ImmutableProperties.java index 8245e4f8..eaa5f4dc 100644 --- a/src/main/java/io/divolte/server/config/ImmutableProperties.java +++ b/src/main/java/io/divolte/server/config/ImmutableProperties.java @@ -3,7 +3,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.Reader; -import java.util.InvalidPropertiesFormatException; import java.util.Map; import java.util.Properties; @@ -31,7 +30,7 @@ public synchronized void load(final Reader reader) throws IOException { } @Override - public synchronized void loadFromXML(final InputStream in) throws IOException, InvalidPropertiesFormatException { + public synchronized void loadFromXML(final InputStream in) throws IOException { throw new UnsupportedOperationException(); } diff --git a/src/main/java/io/divolte/server/config/PropertiesDeserializer.java b/src/main/java/io/divolte/server/config/PropertiesDeserializer.java index 15aba723..7ed89e16 100644 --- a/src/main/java/io/divolte/server/config/PropertiesDeserializer.java +++ b/src/main/java/io/divolte/server/config/PropertiesDeserializer.java @@ -9,7 +9,6 @@ import java.util.Properties; import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonDeserializer; @@ -23,7 +22,7 @@ public class PropertiesDeserializer extends JsonDeserializer { private final static Joiner COMMA_JOINER = Joiner.on(','); @Override - public Properties deserialize(JsonParser p, DeserializationContext ctx) throws IOException, JsonProcessingException { + public Properties deserialize(JsonParser p, DeserializationContext ctx) throws IOException { if (START_OBJECT == p.getCurrentToken()) { final Properties properties = new Properties(); final Deque stack = new ArrayDeque<>(); From f3131092dacc48342d14f9ae11594d28c8e7fde3 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 16 Jun 2016 14:47:17 +0200 Subject: [PATCH 57/80] Remove redundant type bounds. --- src/main/java/io/divolte/server/config/ImmutableProperties.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/io/divolte/server/config/ImmutableProperties.java b/src/main/java/io/divolte/server/config/ImmutableProperties.java index eaa5f4dc..26c8a3bc 100644 --- a/src/main/java/io/divolte/server/config/ImmutableProperties.java +++ b/src/main/java/io/divolte/server/config/ImmutableProperties.java @@ -45,7 +45,7 @@ public synchronized Object put(final Object key, final Object value) { } @Override - public synchronized void putAll(final Map t) { + public synchronized void putAll(final Map t) { throw new UnsupportedOperationException(); } From 2a8476d1c480684fb8d48e6809b2c5fc85d7e293 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 16 Jun 2016 14:47:55 +0200 Subject: [PATCH 58/80] Avoid Optional.get(); the intent here is that the reference is available, and if it's not that's exceptional. 
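
In sketch form (hypothetical names, for illustration only): a bare Optional.get() fails with an uninformative NoSuchElementException, whereas orElseThrow states the violated expectation when the value turns out to be absent.

    import java.util.Optional;

    // Hypothetical holder, for illustration only.
    final class ConfigurationHolder {
        private final Optional<String> schemaLocation;

        ConfigurationHolder(final Optional<String> schemaLocation) {
            this.schemaLocation = schemaLocation;
        }

        String schemaLocation() {
            // Discouraged: schemaLocation.get() throws a bare NoSuchElementException.
            // Preferred: name the broken precondition when the value is missing.
            return schemaLocation.orElseThrow(
                    () -> new IllegalStateException("Schema location not available."));
        }
    }
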
--- .../java/io/divolte/server/config/ValidatedConfiguration.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java index c4ce7f69..dcf9d769 100644 --- a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java +++ b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java @@ -199,7 +199,7 @@ private static DivolteConfiguration mapped(final Config input) throws IOExceptio public DivolteConfiguration configuration() { Preconditions.checkState(configurationErrors.isEmpty(), "Attempt to access invalid configuration."); - return divolteConfiguration.get(); + return divolteConfiguration.orElseThrow(() -> new IllegalStateException("Configuration not available.")); } /** From 8b700bf28eac638ad0c2baea492697f682a8c556 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 3 Aug 2016 12:05:15 +0200 Subject: [PATCH 59/80] Update the configuration file we bundle with the RPM to use the new layout. --- rpm/SOURCES/divolte-collector.conf | 71 ++++++++++++++---------------- 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/rpm/SOURCES/divolte-collector.conf b/rpm/SOURCES/divolte-collector.conf index dc94c98b..7ffd6b93 100644 --- a/rpm/SOURCES/divolte-collector.conf +++ b/rpm/SOURCES/divolte-collector.conf @@ -1,49 +1,44 @@ # This is the configuration for the Divolte collector. divolte { - server { - # The address of the interface on which to bind and listen. - # (Alternatively, you can set the DIVOLTE_HOST environment variable.) - # - # Default: localhost - # - # Uncomment to listen on all interfaces. - #host=0.0.0.0 - - # The TCP port on which to listen. - # (Alternatively, you can set the DIVOLTE_PORT environment variable.) - # - # Default: 8290 - # - #port=8290 - } - - # Custom URL mappings are possible if you wish to extract parts of the URL - # into the click-stream events. - # - #tracking { - # include "schema-mapping.conf" - #} - - # By default, we flush to local HDFS. - hdfs_flusher { - - hdfs { - # The URI of the HDFS where events should be stored. - # (Alternatively, you can set the DIVOLTE_HDFS_URI environment variable.) + global { + server { + # The address of the interface on which to bind and listen. + # (Alternatively, you can set the DIVOLTE_HOST environment variable.) # - # Default: "file:///" (local filesystem) + # Default: localhost # - #uri = "file:///" + # Uncomment to listen on all interfaces. + #host=0.0.0.0 - # The replication factor that should be used for events stored to HDFS. - # (Alternatively, you can set the DIVOLTE_HDFS_REPLICATION environment - # variable.) - # For production this would normally be 3. + # The TCP port on which to listen. + # (Alternatively, you can set the DIVOLTE_PORT environment variable.) # - # Default: 1 + # Default: 8290 # - #replication = 3 + #port=8290 } } + + # Custom sources, mappings and sinks. + # If anything is configured, these all needed to be configured. The default + # configuration is: + # - A single browser-based source (/divolte.js) + # - A default mapping that produces events that conform to the default schema. + # - A single HDFS-based sink that writes to /tmp. + # (If you have not configured HDFS, this will be the local filesystem.) + # + # Refer to the Divolte documentation for more information. 
+ # + # sources { + # + # } + # + # mappings { + # + # } + # + # sinks { + # + # } } From 64febccecc25a1e813ab94f13db886c038251e06 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 3 Aug 2016 13:25:55 +0200 Subject: [PATCH 60/80] Update the 'Getting Started' documentation to match the new configuration format. --- docs/getting_started.rst | 128 ++++++++++++++++++++++----------------- 1 file changed, 74 insertions(+), 54 deletions(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 289c218f..46c46f06 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -190,11 +190,12 @@ Finally, we need to configure Divolte Collector to use our custom schema and map .. code-block:: none divolte { - tracking { - schema_file = "/path/to/divolte-collector/conf/MyEventRecord.avsc" - schema_mapping { - version = 2 + mappings { + my_mapping = { + schema_file = "/path/to/divolte-collector/conf/MyEventRecord.avsc" mapping_script_file = "/path/to/divolte-collector/conf/mapping.groovy" + sources = [browser] + sinks = [hdfs] } } } @@ -303,55 +304,65 @@ First, we'll change the configuration to write files to HDFS. Add the following .. code-block:: none divolte { - hdfs_flusher { - // Enable the HDFS flushing - enabled = true - - // Use multiple threads to write to HDFS - threads = 2 - - // Use a simple strategy of rolling files after a certain period of time. - // For other strategies, have a look at the configuration documentation. - simple_rolling_file_strategy { - // Create a new file every hour - roll_every = 1 hour - - // Perform a hsync call on the HDFS files after every 1000 record written or - // after every 5 seconds, whichever happens first. - - // Performing a hsync call periodically prevents data loss incase of failure - // scenarios. - sync_file_after_records = 1000 - sync_file_after_duration = 5 seconds - - // Files that are being written will be created in a working directory. - // Once a file is closed, Divolte Collector will move the file to a - // publish directory. The working and publish directories are allowed - // to be the same, but this is not recommended. - working_dir = "/divolte/inflight" - publish_dir = "/divolte/published" + global { + hdfs { + // Enable HDFS sinks. + enabled = true + + // Use multiple threads to write to HDFS. + threads = 2 + } + } + + sinks { + // The name of the sink. (It's referred to by the mapping.) + hdfs { + type = hdfs + + // For HDFS sinks we can control how the files are created. + file_strategy { + // Create a new file every hour + roll_every = 1 hour + + // Perform a hsync call on the HDFS files after every 1000 records are written + // or every 5 seconds, whichever happens first. + + // Performing a hsync call periodically can prevent data loss in the case of + // some failure scenarios. + sync_file_after_records = 1000 + sync_file_after_duration = 5 seconds + + // Files that are being written will be created in a working directory. + // Once a file is closed, Divolte Collector will move the file to the + // publish directory. The working and publish directories are allowed + // to be the same, but this is not recommended. + working_dir = "/divolte/inflight" + publish_dir = "/divolte/published" + } + + // Set the replication factor for created files. + replication = 3 } } } -Note that you need to create these directories on HDFS prior to starting Divolte Collector. It will not startup if the directories do not exist. +Note that you need to create these directories prior to starting Divolte Collector. 
It will not startup if the directories do not exist. -If you have a working HDFS setup and a directory with the appropriate configuration files, Divolte Collector will use them automatically if a ``HADOOP_CONF_DIR`` environment variable is set pointing to that directory. Otherwise, it is possible to tell Divolte Collector directly about your HDFS location from the configuration: +If you have a working HDFS setup and a directory with the appropriate configuration files, Divolte Collector will use them automatically if a ``HADOOP_CONF_DIR`` environment variable is set pointing to that directory. Alternatively, HDFS client properties can be provided in the configuration: .. code-block:: none divolte { - hdfs_flusher { + global { hdfs { - uri = "hdfs://192.168.100.128:8020/" - replication = 1 + client { + fs.defaultFS = "hdfs://192.168.100.128:8020/" + } } } } -Do note that in this scenario it is not possible to set additional HDFS client configuration, as you can do when using the ``HADOOP_CONF_DIR`` environment variable. Also, when your HDFS NameNode is setup redundantly you can configure only one using the Divolte Collector configuration. This is why it is recommended to use a ``HADOOP_CONF_DIR``. - -With everything in place, start Divolte Collector again, create some events and see verify that files are being created on HDFS: +With everything in place, start Divolte Collector again, create some events and verify that files are being created on HDFS: .. code-block:: console @@ -375,21 +386,30 @@ Configuring Divolte Collector to write data to a Kafka topic is quite similar to .. code-block:: none divolte { - kafka_flusher { - // Enable Kafka flushing - enabled = true - - // This is the name of the topic that data will be produced on - topic = divolte-data - - // The properties under the producer key in this - // configuration are used to create a Properties object - // which is passed to Kafka as is. At the very least, - // configure the broker list here. For more options - // that can be passed to a Kafka producer, see this link: - // http://kafka.apache.org/documentation.html#producerconfigs - producer = { - bootstrap.servers = "10.200.8.55:9092,10.200.8.53:9092,10.200.8.54:9092" + global { + kafka { + // Enable Kafka flushing + enabled = true + + // The properties under the producer key in this + // configuration are used to create a Properties object + // which is passed to Kafka as is. At the very least, + // configure the broker list here. For more options + // that can be passed to a Kafka producer, see this link: + // http://kafka.apache.org/082/documentation.html#newproducerconfigs + producer = { + bootstrap.servers = "10.200.8.55:9092,10.200.8.53:9092,10.200.8.54:9092" + } + } + } + + sinks { + // The name of the sink. (It's referred to by the mapping.) + kafka { + type = kafka + + // This is the name of the topic that data will be produced on + topic = divolte-data } } } From 0f2df5da67297c36c90525197f34212a5a6ccdac Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 4 Aug 2016 15:25:31 +0200 Subject: [PATCH 61/80] Ask sphinx to be picky about problems. --- docs/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 8b0f4c23..2ae787ba 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -103,6 +103,8 @@ # If true, keep warnings as "system message" paragraphs in the built documents. #keep_warnings = False +# If true, Sphinx will warn about all references where the target cannot be found. 
+nitpicky = True # -- Options for HTML output ---------------------------------------------- From 288e3ba070f9f397fdb7f6d35355f4059335ee9a Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 4 Aug 2016 15:26:04 +0200 Subject: [PATCH 62/80] Update link in comment to refer to the documentation for the specific version of Kafka that we're using. --- src/main/resources/reference.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/reference.conf b/src/main/resources/reference.conf index 281f028b..7e40a31f 100644 --- a/src/main/resources/reference.conf +++ b/src/main/resources/reference.conf @@ -127,7 +127,7 @@ divolte { // All settings in here are used as-is to configure // the Kafka producer. - // See: http://kafka.apache.org/documentation.html#producerconfigs + // See: http://kafka.apache.org/082/documentation.html#newproducerconfigs producer = { bootstrap.servers = ["localhost:9092"] bootstrap.servers = ${?DIVOLTE_KAFKA_BROKER_LIST} From 44d3a073eea28835784b01bb93710c512c6849fd Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 4 Aug 2016 15:26:40 +0200 Subject: [PATCH 63/80] Introduce event-flows, and mention that we now support many-to-many mappings in Divolte. --- docs/getting_started.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 46c46f06..325e4b99 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -418,6 +418,21 @@ Data in Kafka ------------- Avro files on HDFS are written with the schema in the header. Unfortunately Kafka doesn't really have a clear way of passing along the schema. For the messages on Kafka queues we expect the consumer to know the schema in advance, meaning that *the messages that are passed onto the queue only contain the raw bytes of the serialized Avro record without any metadata*. The key of each message is the party ID that for the event. Divolte Collector provides a small helper library to easily create Kafka consumers in Java using Avro's code generation support. There is an example Kafka consumer with step by step instruction on getting it up and running in our usage examples repository here: `https://github.com/divolte/divolte-examples/tree/master/tcp-kafka-consumer `_. +Event Flows +=========== + +So far we've seen a single source of events being mapped to HDFS, and Kafka if you tried this. However Divolte can be +configured with multiple: + +- *Sources* of events, which is where Divolte events arrive. +- *Sinks* (destinations) where Avro records can be written after they have been produced by mapping Divolte events. +- *Mappings* between sources and sinks, which controls which sources are connected to which sinks, and how the events + are converted to Avro records. + +Events flow from sources to sinks, via an intermediate mapping. Allowing multiple sources, sinks and mappings allows Divolte to support multiple sites and domains, each of which may require independent mapping. Note, however, that a sink can only support a single Avro schema: all mappings which refer to it must be configured to produce records conforming to the same Avro schema. + +An event flow imposes a partial ordering on the events it receives: events from a source that have the same party identifier will be written to sinks in the same order that they were received in. (This doesn't apply to events received across different sources: even if they share the same party identifier their relative ordering is not guaranteed.) + What's next? 
============ * Once you are collecting data to either HDFS or Kafka, see our `examples `_ to learn how to use your clickstream data in tools like Apache Spark, Apache Hive or Impala or build near real-time consumers for Apache Kafka with your Divolte Collector data. From 8eb2b626e96ce925c6959b65354c86b4eee0a3e5 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 4 Aug 2016 15:27:01 +0200 Subject: [PATCH 64/80] Bump documentation copyright. --- docs/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 2ae787ba..2feab510 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2014 GoDataDriven B.V. +# Copyright 2016 GoDataDriven B.V. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -53,7 +53,7 @@ # General information about the project. project = u'Divolte' -copyright = u'2015, GoDataDriven' +copyright = u'2016, GoDataDriven' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the From 29ecdb9e2a9e93702f580b40acba5acf7d5abcd0 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 4 Aug 2016 15:27:34 +0200 Subject: [PATCH 65/80] Update the configuration reference to describe the new way Divolte is configured. --- docs/configuration.rst | 868 +++++++++++++++++++++++------------------ 1 file changed, 484 insertions(+), 384 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 3567a851..2cd07a23 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -5,11 +5,11 @@ This chapter describes the configuration mechanisms and available options for Di Configuration files =================== -The configuration for Divolte Collector consists of three files: +The main configuration for Divolte Collector consists of three files: - ``divolte-env.sh``: a shell script that is included in the startup script to set environment variables and JVM startup arguments. -- ``divolte-collector.conf``: the main configuration file for Divolte Collector. - ``logback.xml``: the logging configuration. +- ``divolte-collector.conf``: the main configuration file for Divolte Collector. Configuration directory ----------------------- @@ -22,7 +22,7 @@ This shell script is run by the startup script prior to starting the Divolte Col HADOOP_CONF_DIR ^^^^^^^^^^^^^^^ :Description: - Directory where Hadoop / HDFS configuration files are to be found. This directory is included in the classpath on startup, which causes the HDFS client to load the configuration files. + Directory where Hadoop/HDFS configuration files are to be found. This directory is included in the classpath on startup, which causes the HDFS client to load the configuration files. :Example: @@ -33,7 +33,7 @@ HADOOP_CONF_DIR JAVA_HOME ^^^^^^^^^ :Description: - The directory where the JRE/JDK is located. Divolte Collector will use ``$JAVA_HOME/bin/java`` as Java executable for startup. If this is not set, Divolte Collector will attempt to find a suitable JDK in a number of common Java installation locations on Linux systems. It is however not recommended to rely on this mechanism for production use. + The directory where the JRE/JDK is located. Divolte Collector will use ``$JAVA_HOME/bin/java`` as the Java executable during startup. 
If this is not set, Divolte Collector will attempt to find a suitable JDK in a number of common Java installation locations on Linux systems. It is however not recommended to rely on this mechanism for production use. :Example: @@ -52,6 +52,10 @@ DIVOLTE_JAVA_OPTS DIVOLTE_JAVA_OPTS="-XX:+UseG1GC -Djava.awt.headless=true -XX:+HeapDumpOnOutOfMemoryError" +logback.xml +----------- +Divolte Collector uses the `Logback Project `_ as its logging provider. This provider is configured through the ``logback.xml`` file in the configuration directory. For more information about the settings in this file, review the `Configuration chapter in the Logback Manual `_. + divolte-collector.conf ---------------------- This is the main configuration file for Divolte Collector. For configuration, Divolte Collector uses the `Typesafe Config library `_. The dialect of the configuration file is a JSON superset called HOCON (for *Human-Optimized Config Object Notation*). HOCON has a nested structure, like JSON, but is slightly less verbose and doesn't require escaping and quoting of strings in many cases. Here we outline some basic features of HOCON. @@ -62,22 +66,26 @@ Nesting and dot separated namespacing can be used interchangeably: // This: divolte { - server { - host = 127.0.0.1 + global { + server { + host = 127.0.0.1 + } } } // Is the same as this: - divolte.server.host = 127.0.0.1 + divolte.global.server.host = 127.0.0.1 -Environment variable overrides can be used. In this example the ``divolte.server.port`` setting defaults to 8290, unless the ``DIVOLTE_PORT`` environment variable is set: +Environment variable overrides can be used. In this example the ``divolte.global.server.port`` setting defaults to 8290, unless the ``DIVOLTE_PORT`` environment variable is set: .. code-block:: none divolte { - server { - port = 8290 - port = ${?DIVOLTE_PORT} + global { + server { + port = 8290 + port = ${?DIVOLTE_PORT} + } } } @@ -87,71 +95,85 @@ Objects are merged: // This configuration divolte { - server { - host = 0.0.0.0 + global { + server { + host = 0.0.0.0 + } } } - divolte.server { + divolte.global.server { port = 8290 } // Will result in this: - divolte.server.host = 0.0.0.0 - divolte.server.port = 8290 + divolte.global.server.host = 0.0.0.0 + divolte.global.server.port = 8290 For a full overview please refer to the `HOCON features and specification `_. .. warning:: - Be careful when enclosing values in quotes. Quotes are optional, but if present they must be JSON-style double-quotes (``"``). - This can easily lead to confusion: + Be careful when enclosing values in quotes. Quotes are optional, but if present they must be JSON-style double-quotes (``"``). This can easily lead to confusion: .. code-block:: none // This ... - divolte.tracking.cookie_domain = '.example.com' + divolte.sources.browser.cookie_domain = '.example.com' // ... is really equivalent to: - divolte.tracking.cookie_domain = "'.example.com'" + divolte.sources.browser.cookie_domain = "'.example.com'" Configuration reference ======================= -The following sections and settings are available in the ``divolte-collector.conf`` file. Note that in this documentation the path notation for configuration options is used (e.g. ``divolte.server``) but in examples the path and nested notation is used interchangeably. -divolte.server --------------- +The main configuration is read from ``divolte-collector.conf``, which consists of several sections: + +- *Global* (``divolte.global``): Global settings that affect the entire service. 
+- *Sources* (``divolte.sources``): Configured sources for Divolte Collector events. +- *Mappings* (``divolte.mappings``): Configured mappings between sources and sinks. +- *Sinks* (``divolte.sinks``): Configured sinks, where Avro events are written. + +This documentation uses the path notation for configuration options (e.g. ``divolte.global.server``) but in examples the path and nested notations are used interchangeably. + +Global Settings (``divolte.global``) +------------------------------------ + +This section contains settings which are global in nature. All settings have default values. + +HTTP Server Settings (``divolte.global.server``) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This section controls the settings for the internal HTTP server of Divolte Collector. -divolte.server.host -^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.server.host`` +"""""""""""""""""""""""""""""""""""""""" :Description: - The address to which the server binds. Set to a specific IP address to selectively listen on that interface. + The address to which the server binds. Set to a specific IP address to selectively listen on that interface, or `0.0.0.0` to listen on all interfaces. :Default: - ``0.0.0.0`` + The address of a loopback interface. :Example: .. code-block:: none - divolte.server { + divolte.global.server { host = 0.0.0.0 } -divolte.server.port -^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.server.port`` +"""""""""""""""""""""""""""""""""""""""" :Description: The TCP port on which the server listens. :Default: - ``8290`` + ``8290``, or the content of the ``DIVOLTE_PORT`` environment variable if set. :Example: .. code-block:: none - divolte.server { + divolte.global.server { port = 8290 } -divolte.server.use_x_forwarded_for -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.server.use_x_forwarded_for`` +""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: Whether to use the ``X-Forwarded-For`` HTTP header for determining the source IP of a request, if present. If multiple values are present, the last value is used. @@ -167,12 +189,12 @@ divolte.server.use_x_forwarded_for .. code-block:: none - divolte.server { + divolte.global.server { use_x_forwarded_for = true } -divolte.server.serve_static_resources -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.server.serve_static_resources`` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: When true Divolte Collector serves a static test page at ``/``. :Default: @@ -181,87 +203,58 @@ divolte.server.serve_static_resources .. code-block:: none - divolte.server { + divolte.global.server { serve_static_resources = false } -divolte.tracking ----------------- -This section controls the tracking mechanism for Divolte Collector, covering areas such as the cookies and session timeouts, user agent parsing and ip2geo lookups. - -divolte.tracking.party_cookie -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:Description: - The name of the cookie used for setting a party ID. -:Default: - ``_dvp`` -:Example: - - .. code-block:: none - - divolte.tracking { - party_cookie = _pid - } - -divolte.tracking.party_timeout -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:Description: - The expiry timeout for the party identifier. If no events occur for this duration, the party identifier is discarded. - Any subsequent events will be assigned a new party identifier. -:Default: - 730 days -:Example: - - .. 
code-block:: none - - divolte.tracking { - party_timeout = 1000 days - } +Global Mapper Settings (``divolte.global.mapper``) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +This section controls global settings related to the processing of incoming requests after they have been received by the server. Incoming requests for Divolte Collector are responded to as quickly as possible, with mapping and flushing occurring in the background. -divolte.tracking.session_cookie -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.mapper.threads`` +""""""""""""""""""""""""""""""""""""""""""" :Description: - The name of the cookie used for tracking the session ID. + The number of threads that each mapper should use to process events. :Default: - ``_dvs`` + 1 :Example: .. code-block:: none - divolte.tracking { - session_cookie = _sid + divolte.global.mapper { + threads = 4 } -divolte.tracking.session_timeout -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.mapper.buffer_size`` +""""""""""""""""""""""""""""""""""""""""""""""" :Description: - The expiry timeout for a session. A session lapses if no events occur for this duration. + The maximum number of incoming events, rounded up to the nearest power of 2, to queue for processing *per mapper thread* before starting to drop incoming events. While this buffer is full new events are dropped and a warning is logged. (Dropped requests are not reported to the client: Divolte Collector always responds to clients immediately once minimal validation has taken place.) :Default: - 30 minutes + 1048576 :Example: .. code-block:: none - divolte.tracking { - session_timeout = 1 hour + divolte.global.mapper { + buffer_size = 10M } -divolte.tracking.cookie_domain -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.mapper.duplicate_memory_size`` +""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - The cookie domain that is assigned to the cookies. When left empty, the cookies will have no domain explicitly associated with them, which effectively sets it to the website domain of the page that loaded the Divolte Collector JavaScript. + Clients will sometimes deliver an event multiple times, normally within a short period of time. Divolte Collector contains a probabilistic filter which can detect this, trading off memory for improved results. This setting configures the size of the filter *per mapper thread*, and is multiplied by 8 to yield the actual memory usage. :Default: - *Empty* + 1000000 :Example: .. code-block:: none - divolte.tracking { - cookie_domain = ".example.com" + divolte.global.mapper { + duplicate_memory_size = 10000000 } -divolte.tracking.ip2geo_database -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.mapper.ip2geo_database`` +""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: This configures the ip2geo database for geographic lookups. An ip2geo database can be obtained from `MaxMind `_. (Free 'lite' versions and commercial versions are available.) @@ -272,16 +265,16 @@ divolte.tracking.ip2geo_database .. code-block:: none - divolte.tracking { + divolte.global.mapper { ip2geo_database = "/etc/divolte/ip2geo/GeoLite2-City.mmdb" } -divolte.tracking.ua_parser --------------------------- +Property: ``divolte.global.mapper.user_agent_parser`` +""""""""""""""""""""""""""""""""""""""""""""""""""""" This section controls the user agent parsing settings. 
The user agent parsing is based on an `open source parsing library `_ and supports dynamic reloading of the backing database if an internet connection is available. -divolte.tracking.ua_parser.type -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.mapper.user_agent_parser.type`` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: This setting controls the updating behavior of the user agent parser. @@ -298,12 +291,12 @@ divolte.tracking.ua_parser.type .. code-block:: none - divolte.tracking.ua_parser { - type = caching_and_updating + divolte.global.mapper.user_agent_parser { + type = caching_and_updating } -divolte.tracking.ua_parser.cache_size -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.mapper.user_agent_parser.cache_size`` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: User agent parsing is a relatively expensive operation that requires many regular expression evaluations. Very often the same user agent will make consecutive requests and many clients will have the exact same user agent as well. It therefore makes sense to cache the parsing results for re-use in subsequent requests. This setting determines how many unique user agent strings will be cached. :Default: @@ -312,557 +305,664 @@ divolte.tracking.ua_parser.cache_size .. code-block:: none - divolte.tracking.ua_parser { + divolte.global.mapper.user_agent_parser { cache_size = 10000 } -divolte.tracking.schema_file -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:Description: - By default, Divolte Collector will use a built-in Avro schema for writing data and a default mapping, which is documented in the Mapping section of the user documentation. If not set, a `default built-in schema `_ will be used. +Global HDFS Settings (``divolte.global.hdfs``) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +This section controls global HDFS settings shared by all HDFS sinks. - Typically, users will configure their own schema, usually with fields specific to their domain and custom events and other mappings. When using a user defined schema, it is also required to provide a mapping script. See :doc:`mapping_reference` for further reference. +Property: ``divolte.global.hdfs.enabled`` +""""""""""""""""""""""""""""""""""""""""" +:Description: + Whether or not HDFS support is enabled or not. If disabled all HDFS sinks are ignored. :Default: - *`Built-in schema `_* + ``true`` :Example: .. code-block:: none - divolte.tracking { - schema_file = /etc/divolte/MyEventRecord.avsc + divolte.global.hdfs { + enabled = false } -divolte.tracking.schema_mapping -------------------------------- -This section controls the schema mapping to use. Schema mapping is an important feature of Divolte Collector, as it allows users to map incoming requests onto custom Avro schemas in non-trivial ways. See :doc:`mapping_reference` for details about this process and the internal mapping DSL used for defining mappings. - -divolte.tracking.schema_mapping.version -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.hdfs.threads`` +""""""""""""""""""""""""""""""""""""""""" :Description: - Prior versions of Divolte Collector supported an alternative mapping DSL. The current version is 2, and this is the only - value supported if the built-in mapping is not being used. + Number of threads to use per HDFS sink for writing events. Each thread creates its own files on HDFS. :Default: - *Not set (for built-in mapping)* + 2 :Example: .. 
code-block:: none - divolte.tracking.schema_mapping { - version = 2 + divolte.global.hdfs { + threads = 1 } -divolte.tracking.schema_mapping.mapping_script_file -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.hdfs.buffer_size`` +""""""""""""""""""""""""""""""""""""""""""""" :Description: - The location of the Groovy script that defines the how events will be mapped to Avro records. If unset, a default built-in mapping will be used. + The maximum number of mapped events to queue internally *per sink thread* for HDFS before starting to drop them. This value will be rounded up to the nearest power of 2. :Default: - *Built-in mapping* + 1048576 :Example: .. code-block:: none - divolte.tracking.schema_mapping { - mapping_script_file = /etc/divolte/my-mapping.groovy + divolte.global.hdfs.buffer_size { + max_write_queue = 10M } -divolte.javascript ------------------- -This section controls various aspects of the JavaScript tag that will be loaded. - -divolte.javascript.name -^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.hdfs.client`` +"""""""""""""""""""""""""""""""""""""""" :Description: - The path with which the JavaScript is served. This changes the ``divolte.js`` part in the script url: http://example.com/divolte.js. + Properties that will be used to configure the HDFS client used by HDFS sinks. If set, these properties will be used *instead of* the settings from ``hdfs-site.xml`` in the directory specified by the ``HADOOP_CONF_DIR``. Although it is possible to configure all settings here instead of in ``HADOOP_CONF_DIR`` this is not recommended. :Default: - ``divolte.js`` + *Not set* :Example: .. code-block:: none - divolte.javascript { - name = tracking.js + divolte.global.hdfs.client { + fs.defaultFS = "file:///var/log/divolte/" } -divolte.javascript.logging -^^^^^^^^^^^^^^^^^^^^^^^^^^ +Global Kafka Settings (``divolte.global.kafka``) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +This section controls global Kafka settings shared by all Kafka sinks. At present Divolte Collector only supports connecting to a single Kafka cluster. + +Property: ``divolte.global.kafka.enabled`` +"""""""""""""""""""""""""""""""""""""""""" :Description: - Enable or disable the logging on the JavaScript console in the browser. + This controls whether flushing to Kafka is enabled or not. If disabled all Kafka sinks are ignored. (This is disabled by default because the producer configuration for Kafka is normally site-specific.) :Default: ``false`` :Example: .. code-block:: none - divolte.javascript { - logging = true + divolte.global.kafka { + enabled = true } -divolte.javascript.debug -^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.kafka.threads`` +"""""""""""""""""""""""""""""""""""""""""" :Description: - When enabled, the served JavaScript will be less compact and *slightly* easier to debug. This setting is mainly intended - to help track down problems in either the minification process used to reduce the size of the tracking script, or in the - behaviour of specific browser versions. + Number of threads to use per Kafka sink for flushing events to Kafka. :Default: - ``false`` + 2 :Example: .. 
code-block:: none - divolte.javascript { - debug = true + divolte.global.kafka { + threads = 1 } -divolte.javascript.auto_page_view_event -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.kafka.buffer_size`` +"""""""""""""""""""""""""""""""""""""""""""""" :Description: - When enabled the JavaScript tag automatically generates a ``pageView`` event when loaded, simplifying site integration. - If sites wish to control all events (including the initial ``pageView`` event) this can be disabled. + The maximum number of mapped events to queue internally *per sink thread* for Kafka before starting to drop them. This value will be rounded up to the nearest power of 2. :Default: - ``true`` + 1048576 :Example: .. code-block:: none - divolte.javascript { - auto_page_view_event = false + divolte.global.kafka.buffer_size { + max_write_queue = 10M } - -divolte.incoming_request_processor ----------------------------------- -This section controls settings related to the processing of incoming requests after they have been received by the server. Incoming requests for Divolte Collector are responded to as quickly as possible, with mapping and flushing occurring in the - background. Only minimal validation is performed before issuing a HTTP `200 OK` response that contains a transparent 1x1 pixel GIF image.containing a handled by a pool of HTTP threads, which immediately respond with a HTTP code 200 and send the response payload (a 1x1 pixel transparent GIF image). The background mapping and processing is performed by the incoming request processor and configured in this section. - -divolte.incoming_request_processor.threads -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.kafka.producer`` +""""""""""""""""""""""""""""""""""""""""""" :Description: - Number of threads to use for processing incoming requests. All requests for a single party are processed on the same thread. + The configuration to use for Kafka producers. All settings are used as-is to configure the Kafka producer; refer to the `Kafka Documentation `_ for further details. :Default: - ``2`` -:Example: .. code-block:: none - divolte.incoming_request_processor { - threads = 1 + { + bootstrap.servers = ["localhost:9092"] + bootstrap.servers = ${?DIVOLTE_KAFKA_BROKER_LIST} + client.id = divolte.collector + client.id = ${?DIVOLTE_KAFKA_CLIENT_ID} + + acks = 1 + retries = 0 + compression.type = lz4 + max.in.flight.requests.per.connection = 1 } -divolte.incoming_request_processor.max_write_queue -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:Description: - The maximum number of incoming requests to queue for processing *per thread* before starting to drop incoming requests. While this queue is full new requests are dropped and a warning is logged. (Dropped requests are not reported to the client: Divolte Collector will always respond with a HTTP 200 status code once minimal validation has taken place.). -:Default: - ``100000`` :Example: .. code-block:: none - divolte.incoming_request_processor { - max_write_queue = 1000000 + divolte.global.kafka.producer = { + metadata.broker.list = ["broker1:9092", "broker2:9092", "broker3:9092"] + client.id = divolte.collector + + acks = 0 + retries = 5 } -divolte.incoming_request_processor.max_enqueue_delay -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Sources (``divolte.sources``) +----------------------------- + +Sources are endpoints that can receive events. Each source has a name used to identify it when configuring a mapper that uses the source. 
A source cannot have the same name as a sink (and vice versa). Sources are configured in sections using their name as the configuration path. (Due to the `HOCON merging rules `_, it's not possible to configure multiple sources with the same name.) + +Each source has a type configured via a mandatory ``type`` property. At present the only supported type is ``browser``. + +For example: + +.. code-block:: none + + divolte.sources { + // The name of the source is 'my_source' + my_source = { + // This is a browser source. + type = browser + } + } + +If no sources are specified a single implicit browser source is created that is equivalent to: + +.. code-block:: none + + divolte.sources { + // The name of the implicit source is 'browser' + browser = { + type = browser + } + } + +If *any* sources are configured this implicit source is not present and all sources must be explicitly specified. + +Browser Sources +^^^^^^^^^^^^^^^ + +A browser source is intended to receive tracking events from a browser. Each browser source serves up a tracking tag (JavaScript). This tag must be integrated into a website for Divolte Collector to receive tracking events. Each page of a website needs to include this: + +.. code-block:: html + + + +The URL will need to use the domain name where you are hosting Divolte Collector, and ``divolte.js`` needs to match the ``javascript.name`` setting of the browser source. + +By default loading the tag will trigger a ``pageView`` event. The tag also provides an API for issuing custom +events: + +.. code-block:: html + + + +The first argument to the ``divolte.signal(...)`` function is the type of event, while the second argument is an arbitrary object containing custom parameters associated with the event. Storing the event and its parameters into the configured Avro schema is controlled via mapping; see the :doc:`mapping_reference` chapter for details. + +Browser sources are able to detect some cases of corruption in the event data. The most common source of this is due to URLs being truncated, but there are also other sources of corruption between the client and the server. Corrupted events are flagged as such but still made available for mapping. (Mappings may choose to discard corrupted events, but by default they are processed normally.) + +Within the namespace for a browser source properties are used to configure it. + +Browser source property: ``prefix`` +""""""""""""""""""""""""""""""""""" :Description: - The maximum time to wait if the queue is full before dropping an event. + The path prefix under which the tracking tag is available. Each browser source must have a unique prefix. A trailing slash (``/``) is automatically appended if not specified. :Default: - 1 second + ``/`` :Example: .. code-block:: none - divolte.incoming_request_processor { - max_enqueue_delay = 20 seconds + divolte.sources.a_source { + type = browser + prefix = /tracking } -divolte.incoming_request_processor.discard_corrupted -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + In this case the tracking tag could be included using: + + .. code-block:: html + + + +Browser source property: ``party_cookie`` +""""""""""""""""""""""""""""""""""""""""" :Description: - Events from the JavaScript tag contain a checksum to detect corrupted events. (The most common source of this is - URLs being truncated, but there are also other sources of corruption between the client and the server.) If enabled, - corrupt events will be discarded and not subject to mapping and further processing. 
If disabled, a best effort will - be made to map and process the event as if it was normal. + The name of the cookie used for setting a party identifier. :Default: - ``false`` + ``_dvp`` :Example: .. code-block:: none - divolte.incoming_request_processor { - discard_corrupted = true + divolte.sources.a_source { + type = browser + party_cookie = _pid } -divolte.incoming_request_processor.duplicate_memory_size -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``party_timeout`` +"""""""""""""""""""""""""""""""""""""""""" :Description: - Browsers and other clients will sometimes deliver an event to the Divolte Collector multiple times, normally - within a short period of time. Divolte Collector contains a probabilistic filter which can detect this, trading - off memory for improved results. This setting configures the size of the filter *per thread*, and is multuplied - by 8 to yield the actual memory usage. + The expiry timeout for the party identifier. If no events occur for this duration, the party identifier is discarded by the browser. Any subsequent events will be cause a new party identifier to be assigned to the browser. :Default: - ``1000000`` + 730 days :Example: .. code-block:: none - divolte.incoming_request_processor { - duplicate_memory_size = 10000000 + divolte.sources.a_source { + type = browser + party_timeout = 1000 days } -divolte.incoming_request_processor.discard_duplicates -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``session_cookie`` +""""""""""""""""""""""""""""""""""""""""""" :Description: - Browsers and other clients will sometimes deliver an event to the Divolte Collector multiple times, normally - within a short period of time. Divolte Collector contains a probabilistic filter which can detect this, and - when this setting is enabled events considered duplicates will be discarded without further mapping or processing. + The name of the cookie used for tracking the session identifier. :Default: - ``false`` + ``_dvs`` :Example: .. code-block:: none - divolte.incoming_request_processor { - discard_duplicates = true + divolte.sources.a_source { + type = browser + session_cookie = _sid } -divolte.kafka_flusher ---------------------- -This section controls settings related to forwarding the event stream to a Apache Kafka topic. Events for Kafka topics -are keyed by their party identifier. - -divolte.kafka_flusher.enabled -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``session_timeout`` +"""""""""""""""""""""""""""""""""""""""""""" :Description: - This controls whether flushing to Kafka is enabled or not. (This is disabled by default because the producer configuration for Kafka is normally site-specific.) + The expiry timeout for a session. A session lapses if no events occur for this duration. :Default: - ``false`` + 30 minutes :Example: .. code-block:: none - divolte.kafka_flusher { - enabled = true + divolte.sources.a_source { + type = browser + session_timeout = 1 hour } -divolte.kafka_flusher.threads -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``cookie_domain`` +"""""""""""""""""""""""""""""""""""""""""" :Description: - Number of threads to use for flushing events to Kafka. + The cookie domain that is assigned to the cookies. When left empty, the cookies will have no domain explicitly associated with them, which effectively sets it to the website domain of the page that loaded the tag. :Default: - ``2`` + *Empty* :Example: .. 
code-block:: none - divolte.kafka_flusher { - threads = 1 + divolte.sources.a_source { + type = browser + cookie_domain = ".example.com" } -divolte.kafka_flusher.max_write_queue -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``javascript.name`` +"""""""""""""""""""""""""""""""""""""""""""" :Description: - The maximum number of mapped events to queue internally *per thread* for Kafka before starting to drop them. + The name of the JavaScript loaded as the tag. This is appended to the value of the ``prefix`` property to form the complete path of the tag in the URL. :Default: - ``200000`` + ``divolte.js`` :Example: .. code-block:: none - divolte.kafka_flusher { - max_write_queue = 1000000 + divolte.sources.a_source { + type = browser + javascript.name = tracking.js } -divolte.kafka_flusher.max_enqueue_delay -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + In this case the tracking tag could be included using: + + .. code-block:: html + + + +Browser source property: ``javascript.logging`` +""""""""""""""""""""""""""""""""""""""""""""""" :Description: - The maximum time to wait before dropping the event if the internal queue for one of the Kafka threads is full. + Enable or disable the logging to the JavaScript console in the browser. :Default: - 1 second + ``false`` :Example: .. code-block:: none - divolte.kafka_flusher { - max_enqueue_delay = 20 seconds + divolte.sources.a_source { + type = browser + javascript.logging = true } -divolte.kafka_flusher.topic -^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``javascript.debug`` +""""""""""""""""""""""""""""""""""""""""""""" :Description: - The Kafka topic onto which events are published. + When enabled, the served JavaScript will be less compact and *slightly* easier to debug. This setting is mainly intended to help track down problems in either the minification process used to reduce the size of the tracking script, or in the behaviour of specific browser versions. :Default: - ``divolte`` + ``false`` :Example: .. code-block:: none - divolte.kafka_flusher { - topic = clickevents + divolte.sources.a_source { + type = browser + javascript.debug = true } -divolte.kafka_flusher.producer -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``javascript.auto_page_view_event`` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - The producer configuration. All settings are used as-is to configure the Kafka producer; refer to the `Kafka Documentation `_ for further details. + When enabled the JavaScript tag automatically generates a ``pageView`` event when loaded, simplifying site integration. If sites wish to control all events (including the initial ``pageView`` event) this can be disabled. :Default: + ``true`` +:Example: .. code-block:: none - producer = { - bootstrap.servers = "localhost:9092" - bootstrap.servers = ${?DIVOLTE_KAFKA_BROKER_LIST} + divolte.sources.a_source { + type = browser + javascript.auto_page_view_event = false + } - client.id = divolte.collector - client.id = ${?DIVOLTE_KAFKA_CLIENT_ID} +Mappings (``divolte.mappings``) +------------------------------- - acks = 0 - retries = 5 - retry.backoff.ms = 200 - } +Mappings are used to specify event flows between sources and sinks, along with the transformation ("mapping") required to convert events into Avro records that conform to a schema. Schema mapping is an important feature of Divolte Collector as it allows incoming events to be mapped onto custom Avro schemas in non-trivial ways. 
See :doc:`mapping_reference` for details about this process and the internal mapping DSL used for defining mappings. -:Example: +Each configured mapping has a name and produces homogenous records conforming to an Avro schema. It may consume events from multiple sources, and the resulting records may be sent to multiple sinks. Sources and sinks may be shared between multiple mappings. If multiple mappings produce records for the same sink, all mappings must use the same Avro schema. - .. code-block:: none +An example mapping configuration could be: - divolte.kafka_flusher.producer = { - metadata.broker.list = ["broker1:9092", "broker2:9092", "broker3:9092"] - client.id = divolte.collector +.. code-block:: none - request.required.acks = 0 - message.send.max.retries = 5 - retry.backoff.ms = 200 + divolte.mappings { + // The name of the mapping is 'a_mapping' + a_mapping = { + schema_file = /some/dir/MySchema.avsc + mapping_script_file = schema-mapping.groovy + sources = [browser] + sinks = [hdfs,kafka] } + } + +If no mappings are specified a single implicit mapping is created that is equivalent to: + +.. code-block:: none + + divolte.mappings { + // The name of the implicit mapping is 'default' + default = { + sources = [ /* All configured sources */ ] + sinks = [ /* All configured sinks */ ] + } + } + +If *any* mappings are configured this implicit mapping is not present and all mappings must be explicitly specified. -divolte.hdfs_flusher --------------------- -This section controls settings related to flushing the event stream. +Mapping properties +^^^^^^^^^^^^^^^^^^ -divolte.hdfs_flusher.enabled -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Within the namespace for a mapping properties are used to configure it. At a minimum the ``sources`` and ``sinks`` should be specified; without these a mapping has no work to do. + +Mapping property: ``sources`` +""""""""""""""""""""""""""""" :Description: - This controls whether flushing to HDFS is enabled. Note that in absence of further HDFS configuration events will be written to the local filesystem. + A list of the names of the sources that this mapping should consume events from. A source may be shared by multiple mappings; each mapping will process every event from the source. :Default: - ``true`` + *Not specified* :Example: .. code-block:: none - divolte.hdfs_flusher { - enabled = false + divolte.mappings.a_mapping { + sources = [site1, site2] } -divolte.hdfs_flusher.threads -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Mapping property: ``sinks`` +""""""""""""""""""""""""""" :Description: - Number of threads to use for flushing events to HDFS. Each thread creates its own files on HDFS. Depending on the flushing strategy, multiple concurrent files can be kept open per thread. + A list of the names of the sinks that this mapping should write produced Avro records to. Each produced record is written to all sinks. A sink may be shared by multiple mappings; in this case all mappings must produce records conforming to the same Avro schema. :Default: - ``2`` + *Not specified* :Example: .. code-block:: none - divolte.hdfs_flusher { - threads = 1 + divolte.mappings.a_mapping { + sinks = [hdfs, kafka] } -divolte.hdfs_flusher.max_write_queue -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Mapping property: ``schema_file`` +""""""""""""""""""""""""""""""""" :Description: - The maximum number of mapped events to queue internally *per thread* for HDFS before starting to drop them. + By default a mapping will produce records that conform to a `built-in Avro schema `_. 
However, a custom schema that contains fields specific to the domain and custom events usually makes sense.
 :Default:
-    100000
+    |Built-in schema|_
 :Example:
 
   .. code-block:: none
 
-    divolte.hdfs_flusher {
-      max_write_queue = 1000000
+    divolte.mappings.a_mapping {
+      schema_file = /etc/divolte/MyEventRecord.avsc
     }
 
+.. |Built-in schema| replace:: *Built-in schema*
+.. _Built-in schema: https://github.com/divolte/divolte-schema
+
+Mapping property: ``mapping_script_file``
+"""""""""""""""""""""""""""""""""""""""""
 :Description:
-    The maximum time to wait before dropping the event if the internal queue for one of the HDFS threads is full.
+    The location of the Groovy script that defines how events from sources will be mapped to Avro records that are written to sinks. If unset, a default built-in mapping will be used. See the :doc:`mapping_reference` for details on mapping events.
 :Default:
-    1 second
+    *Built-in mapping*
 :Example:
 
   .. code-block:: none
 
-    divolte.hdfs_flusher {
-      max_enqueue_delay = 20 seconds
+    divolte.mappings.a_mapping {
+      mapping_script_file = /etc/divolte/my-mapping.groovy
     }
 
-divolte.hdfs_flusher.hdfs
--------------------------
-HDFS specific settings. Although it is possible to configure a HDFS URI here, it is more advisable to configure HDFS settings by specifying a ``HADOOP_CONF_DIR`` environment variable which will be added to the classpath on startup.
-
-divolte.hdfs_flusher.hdfs.uri
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Mapping property: ``discard_corrupted``
+"""""""""""""""""""""""""""""""""""""""
 :Description:
-    The filesystem URI to configure the HDFS client with. When absent, the URI is not set. When using ``HADOOP_CONF_DIR`` this should not be set.
+    Events contain a flag indicating whether the source detected corruption in the event data. If this property is enabled, corrupt events will be discarded and not subject to mapping and further processing. Otherwise a best effort will be made to map and process the event as if it was normal.
 :Default:
-    *Not set*
+    ``false``
 :Example:
 
   .. code-block:: none
 
-    divolte.hdfs_flusher.hdfs {
-      uri = "file:///"
+    divolte.mappings.a_mapping {
+      discard_corrupted = true
     }
 
-divolte.hdfs_flusher.hdfs.replication
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Mapping property: ``discard_duplicates``
+""""""""""""""""""""""""""""""""""""""""
 :Description:
-    The HDFS replication factor to use when creating files.
+    Clients sometimes deliver events to sources multiple times, normally within a short period of time. Sources contain a probabilistic filter which can detect this and set a flag on the event. If this property is enabled, events flagged as duplicates will be discarded without further mapping or processing.
 :Default:
-    ``1``
+    ``false``
 :Example:
 
   .. code-block:: none
 
-    divolte.hdfs_flusher.hdfs {
-      replication = 3
+    divolte.mappings.a_mapping {
+      discard_duplicates = true
+    }
+
+Sinks (``divolte.sinks``)
+-------------------------
+
+Sinks are used to write Avro records that have been mapped from received events. Each sink has a name used to identify it when configuring a mapper that produces records for the sink. A sink cannot have the same name as a source (and vice versa). Sinks are configured in sections using their name as the configuration path. (Due to the `HOCON merging rules `_, it's not possible to configure multiple sinks with the same name.)
+
+Each sink has a type configured via a mandatory ``type`` property.
The supported types are:
+
+- ``hdfs``
+- ``kafka``
+
+For example:
+
+.. code-block:: none
+
+  divolte.sinks {
+    // The name of the sink is 'my_sink'
+    my_sink = {
+      // This is a HDFS sink.
+      type = hdfs
+    }
+  }
+
+If no sinks are specified two implicit sinks are created that are equivalent to:
+
+.. code-block:: none
+
+  divolte.sinks {
+    // The names of the implicit sinks are 'hdfs' and 'kafka'.
+    hdfs = {
+      type = hdfs
+      replication_factor = 1
+    }
+    kafka = {
+      type = kafka
+    }
+  }
+
+If *any* sinks are configured these implicit sinks are not present and all sinks must be explicitly specified.
+
+
-divolte.hdfs.file_strategy
---------------------------
-Divolte Collector has two strategies for creating files on HDFS and flushing data. One of these must be configured, but not both. Which strategy to use is set using the `type` property of this configuration; accepted values are either ``SIMPLE_ROLLING_FILE` (default) or ``SESSION_BINNING``.
+HDFS Sinks
+^^^^^^^^^^
 
-Simple rolling file strategy
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-By default a simple rolling file strategy is employed. This opens one file per thread and rolls over to a new file after a configurable interval. Files that are being written to have an extension of ``.avro.partial`` and are created in the the directory configured in the ``working_dir`` setting. When a file is closed, it is renamed to have an ``.avro`` extension and moved to the directory configured in the ``publish_dir`` setting. This happens in a single (atomic) filesystem move operation.
+A HDFS sink uses a HDFS client to write `Avro files `_ containing records produced by mapping. The schema of the Avro file is the schema of the mapping producing the records. If multiple mappings produce records for a sink they must all use the same schema.
 
-Session binning file strategy
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-A more complex strategy is the session binning strategy. This strategy attempts to place events that belong to the same session in the same file.
+The HDFS client used to write files is configured according to the global HDFS settings. Depending on the HDFS client version in use, HDFS sinks can write to various locations:
 
-Events are assigned to files using the following rules:
+- Native HDFS in a Hadoop cluster.
+- A local filesystem.
+- S3 in Amazon Web Services (AWS). (See `here `_ for details.)
 
-- The strategy always has a 'current' open file to which events will be written.
-- When a session starts, its events are assigned to the current file and will be written there for as long as possible.
-- When a period of time the length of the configured session timeout has elapsed, a new file is opened and designed 'current'.
-- The previously current file remains open for a further period of time equal to twice the session timeout. During this
-  period events for sessions assigned to that file will be written there.
-- If an event arrives assigned to file that has been closed, the session's events will be reassigned to the oldest open
-  file.
+A HDFS sink uses multiple threads to write the records as they are produced. Each thread writes to its own Avro file, flushing regularly. Periodically the Avro files are closed and new ones started. Files are initially created in the configured working directory and have an extension of ``.avro.partial`` while open and being written to. When closed, they are renamed to have an extension of ``.avro`` and moved to the publish directory.
This happens in a single (atomic) move operation, so long as the underlying storage supports this. -.. note:: +Records produced from events with the same party identifier are always written to the same Avro file, and in the order they were received by the originating source. (The relative ordering of records produced from events with the same party identifier is undefined if they originated from different sources, although they will still be written to the same Avro file.) - If the Divolte Collector is shutdown or fails, open files are not moved into the published directory. Instead they - remain in the working directory and need to be manually processed. +Within the namespace for a HDFS sink properties are used to configure it. -divolte.hdfs.file_strategy.type -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFS Sink Property: ``replication`` +""""""""""""""""""""""""""""""""""" :Description: - Identify which strategy to use for flushing HDFS files. Type can be either `SIMPLE_ROLLING_FILE` or `SESSION_BINNING` for the respective strategies. + The HDFS replication factor to use when creating files. :Default: - ``SIMPLE_ROLLING_FILE`` + 3 :Example: .. code-block:: none - divolte.hdfs.file_strategy { - type = SESSION_BINNING + divolte.sinks.a_sink { + type = hdfs + replication = 1 } -divolte.hdfs.file_strategy.sync_file_after_records -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFS Sink Property: ``file_strategy.working_dir`` +""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - How often a ``hsync()`` should be issued to flush HDFS data based on the number of records that have been written since the last flush. + Directory where files are created and kept while being written to. Files being written have a ``.avro.partial`` extension. :Default: - ``1000`` + ``/tmp`` :Example: .. code-block:: none - divolte.hdfs.file_strategy { - sync_file_after_records = 100 + divolte.sinks.a_sink { + type = hdfs + file_strategy.working_dir = /webdata/inflight } -divolte.hdfs.file_strategy.sync_file_after_duration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFS Sink Property: ``file_strategy.publish_dir`` +""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - How often a ``hsync()`` should be issued to flush HDFS data based on how long it is since the last flush. + Directory where files are moved to after they are closed. Files when closed have a ``.avro`` extension. :Default: - 30 seconds + ``/tmp`` :Example: .. code-block:: none - divolte.hdfs.file_strategy { - sync_file_after_duration = 1 minute + divolte.sinks.a_sink { + type = hdfs + file_strategy.publish_dir = /webdata/published } -divolte.hdfs.file_strategy.working_dir -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFS Sink Property: ``file_strategy.roll_every`` +"""""""""""""""""""""""""""""""""""""""""""""""" :Description: - Directory where files are created and kept while being written to. + Roll over files on HDFS after this amount of time. (If the working file doesn't contain any records it will be discarded.) :Default: - ``/tmp`` + 1 hour :Example: .. code-block:: none - divolte.hdfs.file_strategy { - working_dir = /webdata/inflight + divolte.sinks.a_sink { + type = hdfs + file_strategy.roll_every = 15 minutes } -divolte.hdfs.file_strategy.publish_dir -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFS Sink Property: ``file_strategy.sync_file_after_records`` +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - Directory where files are moved to after they are closed. 
+ The maximum number of records that should be written to the working file since the last flush before flushing again. Flushing is performed by issuing a ``hsync()`` call to flush HDFS data. :Default: - ``/tmp`` + 1000 :Example: .. code-block:: none - divolte.hdfs.file_strategy { - publish_dir = /webdata/published + divolte.sinks.a_sink { + type = hdfs + file_strategy.sync_file_after_records = 100 } -divolte.hdfs.file_strategy.roll_every *(simple rolling strategy only)* -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFS Sink Property: ``file_strategy.sync_file_after_duration`` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - Roll over files on HDFS after this amount of time. + The maximum time that may elapse after a record is written to the working file before it is flushed. Flushing is performed by issuing a ``hsync()`` call to flush HDFS data. :Default: - 60 minutes + 30 seconds :Example: .. code-block:: none - divolte.hdfs.file_strategy { - roll_every = 15 minutes + divolte.sinks.a_sink { + type = hdfs + file_strategy.sync_file_after_duration = 10 seconds } -logback.xml ------------ -Divolte Collector uses the `Logback Project `_ as its logging provider. This provider is configured through the ``logback.xml`` file in the configuration directory. For more information about the settings in this file, review the `Configuration chapter in the Logback Manual `_. +Kafka Sinks +^^^^^^^^^^^ -Website integration -=================== -Next to the server side configuration, Divolte Collector needs to be integrated into a website in order to log events. The minimum integration involves adding a single tag to collect pageviews. This can be extended with custom events for tracking specific user interactions. +A Kafka sink uses a Kafka producer to write Avro records as individual messages on a Kafka topic. The producer is configured according to the global Kafka settings. -The tag -------- -The tag for Divolte Collector to include in each page of a website is this: +Records produced from events with the same party identifier are queued on a topic in the same order they were received by the originating source. (The relative ordering across sources is not guaranteed.) The messages are keyed by their party identifier meaning that Kafka will preserve the relative ordering between messages with the same party identifier. -.. code-block:: html +The body of each Kafka message contains a single Avro record, serialised using Avro's `binary encoding `_. The schema is not included or referenced in the message. Because Avro's binary encoding is not self-describing, a topic consumer must be independently configured to use a *write schema* that corresponds to the schema used by the mapper that produced the record. - +Within the namespace for a Kafka sink properties are used to configure it. -The URL will need to use the domain name where you are hosting Divolte Collector, and ``divolte.js`` needs to match the ``divolte.javascript.name`` configuration setting. - -Custom events -------------- -The tracking tag provides an API for pages to fire custom events: - -.. code-block:: html +Kafka sink property: ``topic`` +"""""""""""""""""""""""""""""" +:Description: + The Kafka topic onto which events are published. +:Default: + ``divolte`` +:Example: - + .. code-block:: none -The first argument to the ``divolte.signal(...)`` function is the event type parameter. The second argument is a arbitrary object with custom event parameters. 
Storing the event parameter and the custom event parameters into the configured Avro data is achieved through the mapping. See the :doc:`mapping_reference` chapter for details. + divolte.sinks.a_sink { + type = kafka + topic = clickevents + } From 49146e054950cb7a17b422da571fd11ce5da79e7 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 4 Aug 2016 15:50:50 +0200 Subject: [PATCH 66/80] Update links to refer to the Avro specification version that we use. --- docs/configuration.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 2cd07a23..0ceb7b99 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -837,7 +837,7 @@ If *any* sinks are configured these implicit sinks are not present and all sinks HDFS Sinks ^^^^^^^^^^ -A HDFS sink uses a HDFS client to write `Avro files `_ containing records produced by mapping. The schema of the Avro file is the schema of the mapping producing the records. If multiple mappings produce records for a sink they must all use the same schema. +A HDFS sink uses a HDFS client to write `Avro files `_ containing records produced by mapping. The schema of the Avro file is the schema of the mapping producing the records. If multiple mappings produce records for a sink they must all use the same schema. The HDFS client used to write files is configured according to the global HDFS settings. Depending on the HDFS client version in use, HDFS sinks can write to various locations: @@ -948,7 +948,7 @@ A Kafka sink uses a Kafka producer to write Avro records as individual messages Records produced from events with the same party identifier are queued on a topic in the same order they were received by the originating source. (The relative ordering across sources is not guaranteed.) The messages are keyed by their party identifier meaning that Kafka will preserve the relative ordering between messages with the same party identifier. -The body of each Kafka message contains a single Avro record, serialised using Avro's `binary encoding `_. The schema is not included or referenced in the message. Because Avro's binary encoding is not self-describing, a topic consumer must be independently configured to use a *write schema* that corresponds to the schema used by the mapper that produced the record. +The body of each Kafka message contains a single Avro record, serialised using Avro's `binary encoding `_. The schema is not included or referenced in the message. Because Avro's binary encoding is not self-describing, a topic consumer must be independently configured to use a *write schema* that corresponds to the schema used by the mapper that produced the record. Within the namespace for a Kafka sink properties are used to configure it. From 7e2fc06e9f42d750f5171eebdb6beb141bb5fbe4 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 4 Aug 2016 17:52:15 +0200 Subject: [PATCH 67/80] Capitalize list items. --- docs/introduction.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/introduction.rst b/docs/introduction.rst index dca4398b..691f69d4 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -79,10 +79,10 @@ In addition to collecting click events, Divolte Collector provides a number of w * Corrupt request detection for similar issues as above. 
* Generates unique identifiers: - * party ID: a long lived cookie that is set on the client - * session ID: a cookie that expires after 30 minutes of inactivity - * pageview ID: a unique identifier for each pageview and subsequent custom events fired from the same page - * event ID: a unique identifier for each event + * Party ID: a long lived cookie that is set on the client + * Session ID: a cookie that expires after 30 minutes of inactivity + * Pageview ID: a unique identifier for each pageview and subsequent custom events fired from the same page + * Event ID: a unique identifier for each event * User agent parsing: the user agent string is parsed on the fly and the resulting fields (e.g. operating system, browser type, device type) can be mapped onto the schema. * On the fly geolocation lookup based on IP address can be done using the `Maxmind databases `_. From 6342043ba60a9fcd863473daa06470ff01cb21e3 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 4 Aug 2016 17:52:28 +0200 Subject: [PATCH 68/80] Render HTML as code block. --- docs/introduction.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/introduction.rst b/docs/introduction.rst index 691f69d4..e283f659 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -69,7 +69,12 @@ Features ======== In addition to collecting click events, Divolte Collector provides a number of welcome features: -* Single line JavaScript deployment: +* Single line JavaScript deployment: + + .. code-block:: html + + + * Mapping clickstream data onto a domain specific (Avro) schema; on the fly parsing * Comes with a built in default schema and mapping for basic, zero-config deployment From 469f1fba4f4dc6013ac88f2616eeef151d53e1b4 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 4 Aug 2016 17:54:32 +0200 Subject: [PATCH 69/80] Use Sphinx semantic markup where possible. --- docs/conf.py | 2 +- docs/configuration.rst | 56 +++++++++++++++++++++------------------- docs/getting_started.rst | 28 ++++++++++---------- 3 files changed, 44 insertions(+), 42 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 2feab510..5c9c702b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -104,7 +104,7 @@ #keep_warnings = False # If true, Sphinx will warn about all references where the target cannot be found. -nitpicky = True +#nitpicky = False # -- Options for HTML output ---------------------------------------------- diff --git a/docs/configuration.rst b/docs/configuration.rst index 0ceb7b99..e0df6a6d 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -7,13 +7,13 @@ Configuration files =================== The main configuration for Divolte Collector consists of three files: -- ``divolte-env.sh``: a shell script that is included in the startup script to set environment variables and JVM startup arguments. -- ``logback.xml``: the logging configuration. -- ``divolte-collector.conf``: the main configuration file for Divolte Collector. +- :file:`divolte-env.sh`: a shell script that is included in the startup script to set environment variables and JVM startup arguments. +- :file:`logback.xml`: the logging configuration. +- :file:`divolte-collector.conf`: the main configuration file for Divolte Collector. Configuration directory ----------------------- -Divolte Collector will try to find configuration files at startup in the configuration directory. Typically this is the ``conf/`` directory nested under the Divolte Collector installation. 
Divolte Collector will try to locate the configuration directory at ``../conf`` relative to the startup script. The configuration directory can be overridden by setting the ``DIVOLTE_CONF_DIR`` environment variable. If set, the value will be used as configuration directory. If you have installed Divolte Collector from a RPM, the init script will set this variable to ``/etc/divolte-collector``. +Divolte Collector will try to find configuration files at startup in the configuration directory. Typically this is the :file:`conf/` directory nested under the Divolte Collector installation. Divolte Collector will try to locate the configuration directory at :file:`../conf` relative to the startup script. The configuration directory can be overridden by setting the :envvar:`DIVOLTE_CONF_DIR` environment variable. If set, the value will be used as configuration directory. If you have installed Divolte Collector from a RPM, the init script will set this variable to :file:`/etc/divolte-collector`. divolte-env.sh -------------- @@ -33,7 +33,7 @@ HADOOP_CONF_DIR JAVA_HOME ^^^^^^^^^ :Description: - The directory where the JRE/JDK is located. Divolte Collector will use ``$JAVA_HOME/bin/java`` as the Java executable during startup. If this is not set, Divolte Collector will attempt to find a suitable JDK in a number of common Java installation locations on Linux systems. It is however not recommended to rely on this mechanism for production use. + The directory where the JRE/JDK is located. Divolte Collector will use :command:`$JAVA_HOME/bin/java` as the Java executable during startup. If this is not set, Divolte Collector will attempt to find a suitable JDK in a number of common Java installation locations on Linux systems. It is however not recommended to rely on this mechanism for production use. :Example: @@ -44,7 +44,7 @@ JAVA_HOME DIVOLTE_JAVA_OPTS ^^^^^^^^^^^^^^^^^ :Description: - Additional arguments passed to the Java Virtual Machine on startup. If not set, by default Divolte Collector will start the JVM with ``-XX:+UseG1GC -Djava.awt.headless=true``. It is recommended to use the G1 garbage collector. For light and medium traffic, the defaults tend to work fine. *If this setting is set, Divolte Collector will not add any arguments by itself; this setting overrides the defaults.* + Additional arguments passed to the Java Virtual Machine on startup. If not set, by default Divolte Collector will start the JVM with :code:`-XX:+UseG1GC -Djava.awt.headless=true`. It is recommended to use the G1 garbage collector. For light and medium traffic, the defaults tend to work fine. *If this setting is set, Divolte Collector will not add any arguments by itself; this setting overrides the defaults.* :Example: @@ -54,7 +54,7 @@ DIVOLTE_JAVA_OPTS logback.xml ----------- -Divolte Collector uses the `Logback Project `_ as its logging provider. This provider is configured through the ``logback.xml`` file in the configuration directory. For more information about the settings in this file, review the `Configuration chapter in the Logback Manual `_. +Divolte Collector uses the `Logback Project `_ as its logging provider. This provider is configured through the :file:`logback.xml` file in the configuration directory. For more information about the settings in this file, review the `Configuration chapter in the Logback Manual `_. 
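As an illustration, a minimal :file:`logback.xml` might look like the sketch below. This is not the file shipped with Divolte Collector; the appender, log level and pattern are placeholders to adapt to your environment.

.. code-block:: xml

    <?xml version="1.0" encoding="UTF-8"?>
    <configuration>
      <!-- Send all log output to the console. -->
      <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
          <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level [%logger{0}]: %msg%n</pattern>
        </encoder>
      </appender>

      <!-- Default log level; individual loggers can override this. -->
      <root level="INFO">
        <appender-ref ref="STDOUT" />
      </root>
    </configuration>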
divolte-collector.conf ---------------------- @@ -76,7 +76,7 @@ Nesting and dot separated namespacing can be used interchangeably: // Is the same as this: divolte.global.server.host = 127.0.0.1 -Environment variable overrides can be used. In this example the ``divolte.global.server.port`` setting defaults to 8290, unless the ``DIVOLTE_PORT`` environment variable is set: +Environment variable overrides can be used. In this example the ``divolte.global.server.port`` setting defaults to 8290, unless the :envvar:`DIVOLTE_PORT` environment variable is set: .. code-block:: none @@ -126,7 +126,7 @@ For a full overview please refer to the `HOCON features and specification -The first argument to the ``divolte.signal(...)`` function is the type of event, while the second argument is an arbitrary object containing custom parameters associated with the event. Storing the event and its parameters into the configured Avro schema is controlled via mapping; see the :doc:`mapping_reference` chapter for details. +The first argument to the :samp:`divolte.signal({...})` function is the type of event, while the second argument is an arbitrary object containing custom parameters associated with the event. Storing the event and its parameters into the configured Avro schema is controlled via mapping; see the :doc:`mapping_reference` chapter for details. Browser sources are able to detect some cases of corruption in the event data. The most common source of this is due to URLs being truncated, but there are also other sources of corruption between the client and the server. Corrupted events are flagged as such but still made available for mapping. (Mappings may choose to discard corrupted events, but by default they are processed normally.) @@ -627,7 +629,7 @@ Browser source property: ``javascript.logging`` :Description: Enable or disable the logging to the JavaScript console in the browser. :Default: - ``false`` + :code:`false` :Example: .. code-block:: none @@ -642,7 +644,7 @@ Browser source property: ``javascript.debug`` :Description: When enabled, the served JavaScript will be less compact and *slightly* easier to debug. This setting is mainly intended to help track down problems in either the minification process used to reduce the size of the tracking script, or in the behaviour of specific browser versions. :Default: - ``false`` + :code:`false` :Example: .. code-block:: none @@ -657,7 +659,7 @@ Browser source property: ``javascript.auto_page_view_event`` :Description: When enabled the JavaScript tag automatically generates a ``pageView`` event when loaded, simplifying site integration. If sites wish to control all events (including the initial ``pageView`` event) this can be disabled. :Default: - ``true`` + :code:`true` :Example: .. code-block:: none @@ -771,7 +773,7 @@ Mapping property: ``discard_corrupted`` :Description: Events contain a flag indicating whether the source detected corruption in the event data. If this property is enabled corrupt events will be discarded and not subject to mapping and further processing. Otherwise a best effort will be made to map and process the event as if it was normal. :Default: - ``false`` + :code:`false` :Example: .. code-block:: none @@ -785,7 +787,7 @@ Mapping property: ``discard_duplicates`` :Description: Clients sometimes deliver events to sources multiple times, normally within a short period of time. Sources contain a probabilistic filter which can detect this and set a flag on the event. 
If this property is enabled events flagged as duplicates will be discarded without further mapping or processing. :Default: - ``false`` + :code:`false` :Example: .. code-block:: none @@ -871,7 +873,7 @@ HDFS Sink Property: ``file_strategy.working_dir`` :Description: Directory where files are created and kept while being written to. Files being written have a ``.avro.partial`` extension. :Default: - ``/tmp`` + :file:`/tmp` :Example: .. code-block:: none @@ -886,7 +888,7 @@ HDFS Sink Property: ``file_strategy.publish_dir`` :Description: Directory where files are moved to after they are closed. Files when closed have a ``.avro`` extension. :Default: - ``/tmp`` + :file:`/tmp` :Example: .. code-block:: none @@ -914,7 +916,7 @@ HDFS Sink Property: ``file_strategy.roll_every`` HDFS Sink Property: ``file_strategy.sync_file_after_records`` """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - The maximum number of records that should be written to the working file since the last flush before flushing again. Flushing is performed by issuing a ``hsync()`` call to flush HDFS data. + The maximum number of records that should be written to the working file since the last flush before flushing again. Flushing is performed by issuing a :code:`hsync()` call to flush HDFS data. :Default: 1000 :Example: @@ -929,7 +931,7 @@ HDFS Sink Property: ``file_strategy.sync_file_after_records`` HDFS Sink Property: ``file_strategy.sync_file_after_duration`` """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - The maximum time that may elapse after a record is written to the working file before it is flushed. Flushing is performed by issuing a ``hsync()`` call to flush HDFS data. + The maximum time that may elapse after a record is written to the working file before it is flushed. Flushing is performed by issuing a :code:`hsync()` call to flush HDFS data. :Default: 30 seconds :Example: diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 325e4b99..7c3a1a27 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -55,7 +55,7 @@ Now, take your web browser to http://127.0.0.1:8290/ and check that you see a pa Looking at the data =================== -Now, go back to the console where Divolte Collector is running and hit CTRL+C (or kill the process). You should see output similar to this: +Now, go back to the console where Divolte Collector is running and hit :kbd:`Control-c` (or kill the process). You should see output similar to this: .. code-block:: none @@ -63,13 +63,13 @@ Now, go back to the console where Divolte Collector is running and hit CTRL+C (o 2014-12-17 09:27:15.396+01 [Thread-8] INFO [Server]: Stopping thread pools. 2014-12-17 09:27:17.399+01 [Thread-8] INFO [Server]: Closing HDFS filesystem connection. -When Divolte Collector shuts down it will flush and close all open files, so now we can have a look at the data that was generated. By default, with no configuration, Divolte Collector will write ``.avro`` files in ``/tmp`` on the local filesystem. For convenience, Divolte Collector packages a version of the avro-tools that come with Apache Avro, so you can look at the contents of these files as JSON records. Try the following: +When Divolte Collector shuts down it will flush and close all open files, so now we can have a look at the data that was generated. By default, with no configuration, Divolte Collector will write ``.avro`` files in :file:`/tmp` on the local filesystem. 
For convenience, Divolte Collector packages a version of the avro-tools that come with Apache Avro, so you can look at the contents of these files as JSON records. Try the following: .. code-block:: bash % find /tmp/*.avro -name '*divolte-tracking-*.avro' | sort | tail -n1 | xargs ./bin/avro-tools tojson --pretty -This finds a ``.avro`` file in your ``/tmp`` directory and passes it to the ``avro-tools tojson`` command. Depending on how many requests you made, it will display multiple records. The output for a single record should look like this: +This finds a ``.avro`` file in your :file:`/tmp` directory and passes it to the :code:`avro-tools tojson` command. Depending on how many requests you made, it will display multiple records. The output for a single record should look like this: .. code-block:: json @@ -140,14 +140,14 @@ This finds a ``.avro`` file in your ``/tmp`` directory and passes it to the ``av Bring your own schema ===================== -Divolte Collector uses Avro to write data to files. Avro records require you to define a `Avro schema `_ that defines the fields in the records. Divolte Collector comes with a `built in generic schema `_ that is useful for keeping track of the basics of your clickstream data, but in most cases it makes sense to create your own schema with more specific fields that have a meaning within your website's domain. In order to achieve this two things are needed: +Divolte Collector uses Avro to write data to files. Avro records require you to define a `Avro schema `_ that defines the fields in the records. Divolte Collector comes with a `built-in generic schema `_ that is useful for keeping track of the basics of your clickstream data, but in most cases it makes sense to create your own schema with more specific fields that have a meaning within your website's domain. In order to achieve this two things are needed: 1. A custom Avro schema 2. A mapping that defines how to map requests onto the custom schema. Let's create a custom schema. -Create a file called ``MyEventRecord.avsc`` with the following contents (for example in the ``conf/`` directory under the Divolte Collector installation): +Create a file called :file:`MyEventRecord.avsc` with the following contents (for example in the :file:`conf/` directory under the Divolte Collector installation): .. code-block:: json @@ -165,7 +165,7 @@ Create a file called ``MyEventRecord.avsc`` with the following contents (for exa ] } -This is a very minimal custom schema, but it allows us to demonstrate a very important feature in Divolte Collector: mapping. In order to use the custom schema, we need to create a mapping that maps incoming requests onto the schema fields. Create a file called ``mapping.groovy`` with the following contents: +This is a very minimal custom schema, but it allows us to demonstrate a very important feature in Divolte Collector: mapping. In order to use the custom schema, we need to create a mapping that maps incoming requests onto the schema fields. Create a file called :file:`mapping.groovy` with the following contents: .. code-block:: groovy @@ -185,7 +185,7 @@ This is a very minimal custom schema, but it allows us to demonstrate a very imp The mapping is defined using a internal Groovy DSL in Divolte Collector. In this example we map a number of values onto fields in the Avro schema. The values for timestamp, remoteHost and location are mapped directly onto fields in the schema. 
In the remainder of the script, we tell Divolte Collector to take the fragment of the location (the part after the ``#`` in the URL) and try to parse that into a (partial) URI again. From the result URI, we map the path onto a schema field. Subsequently, parse out the values to two query string parameters (``q`` and ``n``) and map those onto separate schema fields after trying to parse an integer out of the ``n`` parameter. The mapping DSL allows for a lot more constructs, including conditional logic, regex matching and more; see the :doc:`mapping_reference` documentation for more information on this. -Finally, we need to configure Divolte Collector to use our custom schema and mapping. Edit the (empty) ``divolte-collector.conf`` file in the ``conf/`` directory of your installation to resemble the following configuration (be sure to use the correct paths for the schema and mapping file that you just created): +Finally, we need to configure Divolte Collector to use our custom schema and mapping. Edit the (empty) :file:`divolte-collector.conf` file in the :file:`conf/` directory of your installation to resemble the following configuration (be sure to use the correct paths for the schema and mapping file that you just created): .. code-block:: none @@ -204,7 +204,7 @@ Finally, we need to configure Divolte Collector to use our custom schema and map Divolte Collector configuration uses the `Typesafe Config `_ library, which uses a configuration dialect called `HOCON `_. -Now, once more, start Divolte Collector as before. Only this time, take your web browser to this address: `http://127.0.0.1:8290/#/fragment/path?q=textual&n=42 `_. You can refresh the page a couple of times and perhaps change the query string parameter values that are in the URL to something else. After you have done one or more requests, stop Divolte Collector again (using CTRL+C) and look at the collected data using this command again: +Now, once more, start Divolte Collector as before. Only this time, take your web browser to this address: `http://127.0.0.1:8290/#/fragment/path?q=textual&n=42 `_. You can refresh the page a couple of times and perhaps change the query string parameter values that are in the URL to something else. After you have done one or more requests, stop Divolte Collector again (using :kbd:`Control-c`) and look at the collected data using this command again: .. code-block:: console @@ -255,7 +255,7 @@ The tag is the line: -The tag performs a number of important tasks. It generates unique identifiers for parties, sessions, pageviews and events. It collects the location, referer, screen and viewport size information from the browser sends it to the Divolte Collector server. +The tag performs a number of important tasks. It generates unique identifiers for parties, sessions, page-views and events. It collects the location, referer, screen and viewport size information from the browser sends it to the Divolte Collector server. In order to instrument a web page of your own, insert the tag as above into the HTML code on each page. Additionally, once the Divolte Collector JavaScript is loaded in the browser it is possible to fire custom events from JavaScript in the page: @@ -290,16 +290,16 @@ In order to use the custom events in your mapping, map values onto fields like t Writing to HDFS =============== -So far, we've been writing our data to the local filesystem in ``/tmp``. Although this works it not the intended use of Divolte Collector. 
The aim is to write the clickstream data to HDFS, such that it is safely and redundantly stored and available for processing using any tool available that knows how to process Avro files (e.g. Apache Hive or Apache Spark). It is trivial to configure Divolte Collector to write to HDFS, assuming you have a working HDFS instance setup. (Setting this up is out of the scope of this getting started guide. There are many great resources to be found on the internet about getting started with and running Hadoop and HDFS.) +So far, we've been writing our data to the local filesystem in :file:`/tmp`. Although this works it not the intended use of Divolte Collector. The aim is to write the clickstream data to HDFS, such that it is safely and redundantly stored and available for processing using any tool available that knows how to process Avro files (e.g. Apache Hive or Apache Spark). It is trivial to configure Divolte Collector to write to HDFS, assuming you have a working HDFS instance setup. (Setting this up is out of the scope of this getting started guide. There are many great resources to be found on the internet about getting started with and running Hadoop and HDFS.) Assuming you have a HDFS instance running somewhere, there are two ways of making Divolte Collector write files to it: 1. Direct configuration; or -2. Setting the ``HADOOP_CONF_DIR`` environment variable to point to a directory containing valid Hadoop configuration files. +2. Setting the :envvar:`HADOOP_CONF_DIR` environment variable to point to a directory containing valid Hadoop configuration files. While the first option works, it is recommended to use the latter as it is easier to maintain when your HDFS parameters change over time. -First, we'll change the configuration to write files to HDFS. Add the following section to ``conf/divolte-collector.conf``: +First, we'll change the configuration to write files to HDFS. Add the following section to :file:`conf/divolte-collector.conf`: .. code-block:: none @@ -348,7 +348,7 @@ First, we'll change the configuration to write files to HDFS. Add the following Note that you need to create these directories prior to starting Divolte Collector. It will not startup if the directories do not exist. -If you have a working HDFS setup and a directory with the appropriate configuration files, Divolte Collector will use them automatically if a ``HADOOP_CONF_DIR`` environment variable is set pointing to that directory. Alternatively, HDFS client properties can be provided in the configuration: +If you have a working HDFS setup and a directory with the appropriate configuration files, Divolte Collector will use them automatically if a :envvar:`HADOOP_CONF_DIR` environment variable is set pointing to that directory. Alternatively, HDFS client properties can be provided in the configuration: .. code-block:: none @@ -371,7 +371,7 @@ With everything in place, start Divolte Collector again, create some events and -rw-r--r-- 1 divolte supergroup 617 2014-08-30 11:46 /divolte/inflight/20141220152512-divolte-tracking-divoltehost-1.avro.partial -rw-r--r-- 1 divolte supergroup 617 2014-08-30 11:46 /divolte/inflight/20141220152513-divolte-tracking-divoltehost-2.avro.partial -After the rolling interval, files should show up in the publish directory with a .avro extension (without the .partial). 
However, if a file was opened in the working directory, but no events were ever written to it (because there was no activity or otherwise), it will not be moved to the publish directory, but will be deleted entirely instead: +After the rolling interval, files should show up in the publish directory with a ``.avro`` extension (without the ``.partial``). However, if a file was opened in the working directory, but no events were ever written to it (because there was no activity or otherwise), it will not be moved to the publish directory, but will be deleted entirely instead: .. code-block:: console From d9497bcc9a1427e77db7f8727b163e03ab5354de Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 4 Aug 2016 22:02:16 +0200 Subject: [PATCH 70/80] Clarify that a specified schema is ignored by a mapping unless the script is specified. --- docs/configuration.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index e0df6a6d..d11d230e 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -740,7 +740,7 @@ Mapping property: ``sinks`` Mapping property: ``schema_file`` """"""""""""""""""""""""""""""""" :Description: - By default a mapping will produce records that conform to a `built-in Avro schema `_. However, a custom schema makes usually makes sense that contains fields specific to the domain and custom events. + By default a mapping will produce records that conform to a `built-in Avro schema `_. However, a custom schema makes usually makes sense that contains fields specific to the domain and custom events. Note that the value for this property is ignored unless ``mapping_script_file`` is also set. :Default: |Built-in schema|_ :Example: @@ -757,7 +757,9 @@ Mapping property: ``schema_file`` Mapping property: ``mapping_script_file`` """"""""""""""""""""""""""""""""""""""""" :Description: - The location of the Groovy script that defines the how events from sources will be mapped to Avro records that are written to sinks. If unset, a default built-in mapping will be used. See the :doc:`mapping_reference` for details on mapping events. + The location of the Groovy script that defines the how events from sources will be mapped to Avro records that are written to sinks. If unset, a default built-in mapping will be used. (In this case any value for the ``schema_file`` property is ignored: the default built-in mapping always produces records conforming to the `built-in schema `.) + + See the :doc:`mapping_reference` for details on mapping events. :Default: *Built-in mapping* :Example: From ecc4af2cc242715c89bcfd28a67a1394197fdabe Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 5 Aug 2016 15:55:47 +0200 Subject: [PATCH 71/80] Update the mapping documentation. Changes include Sphinx markup, and updates to reflect that we now support more than a single mapping. --- docs/mapping_reference.rst | 1280 ++++++++++++++++++++---------------- 1 file changed, 710 insertions(+), 570 deletions(-) diff --git a/docs/mapping_reference.rst b/docs/mapping_reference.rst index 154430e7..ab1b7f06 100644 --- a/docs/mapping_reference.rst +++ b/docs/mapping_reference.rst @@ -2,25 +2,25 @@ Mapping ******* -Mapping in Divolte Collector is the definition that determines how incoming requests are translated into Avro records with a given schema. This definition is composed in a special, built in `Groovy `_ based DSL (domain specific language). 
+Mapping in Divolte Collector is the definition that determines how incoming events are translated into Avro records conforming to a schema. This definition is constructed using a `Groovy\ `_\ -based DSL (Domain-Specific Language). Why mapping? ============ -Most clickstream data collection services or solutions use a canonical data model that is specific to click events and related properties. Things such as location, referer, remote IP address, path, etc. are all properties of a click event that come to mind. While Divolte Collector exposes all of these fields just as well, it is our vision that this is not enough to make it easy to build online and near real-time data driven products within specific domains and environments. For example, when working on a system for product recommendation, the notion of a URL or path for a specific page is completely in the wrong domain; what you would care about in this case is likely a product ID and probably a type of interaction (e.g. product page view, large product photo view, add to basket, etc.). It is usually possible to extract these pieces of information from the clickstream representation, which means custom parsers have to be created to parse this information out of URLs, custom events from JavaScript and other sources. This means that whenever you work with the clickstream data, you have to run these custom parsers initially in order to get meaninful, domain specific information from the data. When building real-time systems, it normally means that this parser has to run in multiple locations: as part of the off line processing jobs and as part of the real-time processing. +Most clickstream data collection services or solutions use a canonical data model that is specific to click events and related properties. Things such as location, referrer, remote IP address, path, etc. are all properties of a click event that come to mind. While Divolte Collector exposes all of these fields just as well, it is our vision that this is not enough to make it easy to build online and near real-time data driven products within specific domains and environments. For example, when working on a system for product recommendation, the notion of a URL or path for a specific page is completely in the wrong domain; what you care about in this case is likely a product ID and probably a type of interaction (e.g. product page view, large product photo view, add to basket, etc.). It is usually possible to extract these pieces of information from the clickstream representation, which means custom parsers have to be created to parse this information out of URLs, custom events from JavaScript and other sources. This means that whenever you work with the clickstream data, you have to run these custom parsers initially in order to get meaninful, domain specific information from the data. When building real-time systems, it normally means that this parser has to run in multiple locations: as part of the off line processing jobs and as part of the real-time processing. -With Divolte Collector, instead of writing parsers and working with the raw clickstream event data in your processing, you define a mapping that allows Divolte Collector to do all the required parsing on the fly as events come in and subsequently produce structured records with a schema to use in further processing. This means that all data that comes in can already have the relevant domain specific fields populated. 
And whenever the need for a new extracted piece of information arises, you can update the mapping to include the new field in the newly produced data. The older data that lacks newly additional fields can co-exist with newer data that does have the additional fields through a process called schema evolution. This is supported by Avro's ability to read data with a different schema from the one that the data was written with. +With Divolte Collector, instead of writing parsers and working with the raw clickstream event data in your processing, you define mappings that allows Divolte Collector to do all the required parsing on the fly as events come in and subsequently produce structured records with a schema to use in further processing. This means that all data that comes in can already have the relevant domain specific fields populated. Whenever the need for a new extracted piece of information arises, you can update the mapping to include the new field in the newly produced data. The older data that lacks newly additional fields can co-exist with newer data that does have the additional fields through a process called schema evolution. This is supported by Avro's ability to read data with a different schema from the one that the data was written with. (This is implemented at read-time using a process called `schema resolution `_.) -In essence, the goal of the mapping is to get rid of log file or URL parsing on collected data after it is published. The event stream from Divolte Collector should have all the domain specific fields to support you use cases directly. +The goal of the mapping is to get rid of log file or URL parsing on collected data after it is published. The event stream from Divolte Collector should have all the domain specific fields to support you use cases directly. Understanding the mapping process --------------------------------- -Before you dive in to creating your own mappings, it is important to understand a little bit about how the mapping is actually performed. **The most notable thing to keep in mind is that the mapping script that you provide, is not evaluated at request time for each request.** Rather, it is evaluated only once on startup and the result of the script is used to perform the actual mapping. This means that your mapping script is evaluated only once during the run-time of the Divolte Collector server. +Before you dive in to creating your own mappings, it is important to understand a little bit about how a mapping is actually performed. **The most notable thing to keep in mind is that a mapping script that you provide is not evaluated at request time for each event.** Instead a mapping is evaluated only once during startup and *declares* how the actual mapping should take place. .. image:: images/mapping-request-run-time.png -Built in default mapping +Built-in default mapping ------------------------ -Divolte Collector comes with a built in default schema and mapping. This will map pretty much all of the basics that you would expect from a clickstream data collector. The Avro schema that is used can be found in the `divolte-schema Github repository `_. The following mappings are present in the default mapping: +Divolte Collector comes with a built-in default schema and mapping. A mapping will use these if the mapping schema or script file are not specified. The default mapping will map pretty much all of the basics that you would expect from a clickstream data collector. 
The Avro schema that is used can be found in the `divolte-schema Github repository `_. The following mappings are present in the default mapping: =============================== ================= Mapped value Avro schema field @@ -53,145 +53,165 @@ Mapped value Avro schema field `User agent OS vendor`_ userAgentOsVendor =============================== ================= -The default schema is not available as a mapping script. Instead, it is hard coded into Divolte Collector. This way, you can setup Divolte Collector to do something useful out-of-the-box without any complex configuration. +The default schema is not available as a mapping script. Instead, it is hard coded into Divolte Collector. This allows Divolte Collector to do something useful out-of-the-box without any complex configuration. Schema evolution and default values ----------------------------------- Schema evolution is the process of changing the schema over time as requirements change. For example when a new feature is added to your website, you add additional fields to the schema that contain specific information about user interactions with this new feature. In this scenario, you would update the schema to have these additional fields, update the mapping and then run Divolte Collector with the new schema and mapping. This means that there will be a difference between data that was written prior to the update and data that is written after the update. Also, it means that after the update, there can still be consumers of the data (from HDFS or Kafka) that still use the old schema. In order to make sure that this isn't a problem, the readers with the old schema need to be able to read data written with the new schema and readers with the new schema should also still work on data written with the old schema. -Luckily, Avro supports both of these cases. When reading newer data with an older schema, the fields that are not present in the old schema are simply ignored by the reader. The other way araound is slightly trickier. When reading older data with a new schema, Avro will fill in the default values for fields that are present in the schema but not in the data. *This is provided that there is a default value.* Basically, this means that it is recommended to always provide a default value for all your fields in the schema. In case of nullable fields, the default value could just be null. +Luckily, Avro supports both of these cases. When reading newer data with an older schema, the fields that are not present in the old schema are simply ignored by the reader. The other way around is slightly trickier. When reading older data with a new schema, Avro will fill in the default values for fields that are present in the schema but not in the data. *This is provided that there is a default value.* Basically, this means that it is recommended to always provide a default value for all your fields in the schema. In case of nullable fields, the default value could just be null. One other reason to always provide a default value is that Avro does not allow to create records with missing values if there are no default values. As a result of this, fields that have no default value always must be populated in the mapping, otherwise an error will occur. This is problematic if the mapping for some reason fails to set a field (e.g. because of a user typing in a non-conforming location in the browser). 
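As a concrete illustration (the record and field names here are arbitrary and not part of the built-in schema), a schema in which every field is nullable with a ``null`` default could look like this:

.. code-block:: json

    {
      "name": "MyEvent",
      "type": "record",
      "fields": [
        { "name": "location",  "type": ["null", "string"], "default": null },
        { "name": "itemCount", "type": ["null", "int"],    "default": null }
      ]
    }

Readers using an older schema that lacks ``itemCount`` simply ignore it, while readers using this schema see ``null`` for records written before the field was added; in both cases the data remains readable.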
+In addition to introducing new fields with defaults, other forms of changes such as renaming and type changes can be permitted under some circumstances. For full details on the changes that are permitted and how the writing and reading schemas are reconciled refer to the `Avro documentation on schema resolution `_. + Mapping DSL =========== -The mapping is a Groovy script that is compiled and run by Divolte Collector on startup. This script is written in the mapping DSL. The result of this script is a mapping that Divolte Collector can use to map incoming requests onto a Avro schema. +Mappings are specified by Groovy scripts that are compiled and run by Divolte Collector on startup. Each mapping script is written in the mapping DSL. The result of running this script is a mapping that Divolte Collector can use to map incoming events from its configured sources onto an Avro schema. Values, fields and mappings --------------------------- -The mapping involves three main concepts: values, fields and mappings. +Mapping involves three main concepts: values, fields and mappings. + +A *value* is something that is extracted from the incoming event (e.g. the location or a HTTP header value) or is derived from another value (e.g. a query parameter from the location URI). Values in the mapping are produced using calls to functions that are built into the mapping DSL. Below is the complete documentation for all values that can be produced. One example of such a function call would be calling :code:`location()` for the location value or :code:`referer()` for the referrer value of the event. -A value is something that is extracted from the incoming request (e.g. the location or a HTTP header value) or is derived from another value (e.g. a query parameter from the location URI). Values in the mapping are produced using method calls to methods that are built into the mapping DSL. Below is the complete documentation for all values that can be produced. One example of such a method call would be calling location() for the location value or referer() for the referer value of the request. +A *field* is a field in the Avro record that will be produced as a result of the mapping process. The type of a field is defined by the Avro schema that is used. Mapping is the process of mapping values extracted from the event onto fields in the Avro record. -A field is a field in the Avro record that will be produced as a result of the mapping process. The type of a field is defined by the Avro schema that is used. Mapping is the process of mapping values extracted from the request onto fields in the Avro record. +A *mapping* is the piece that tells Divolte Collector which values need to be mapped onto which fields. The mapping DSL has a built in construct for this, explained below. -A mapping is the piece that tells Divolte Collector which values need to be mapped onto which fields. The mapping DSL has a built in construct for this, explained below. +Mapping values onto fields (:code:`map`) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The simplest possible mapping is mapping a simple value onto a schema field. The syntax is as follows: -Mapping values onto fields (map) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The simplest possible mapping is mapping a simple value onto a schema field. The syntax is as follows:: +.. 
code-block:: groovy map location() onto 'locationField' -Alternatively, the map methods takes a closure as first argument, which can come in handy when the value is the result of several operations or a more complex construct, such as this example where we take a query parameter form the location and parse it to an int:: +Alternatively, the :code:`map` function takes a closure as first argument, which can come in handy when the value is the result of several operations or a more complex construct, such as this example where we take a query parameter from the location and parse it as an integer: + +.. code-block:: groovy map { def u = parse location() to uri // Parse the URI out of the location parse u.query().value('n') to int32 // Take the n query parameter and try to parse an int out of it } onto 'intField' -In Groovy, the last statement in a closure becomes the return value for the closure. So in the closure above, the value returned by the parse call is the result of the entire closure. This is in turn mapped onto the 'intField' field of the Avro record. +In Groovy the last statement in a closure becomes the return value for the closure. So in the closure above, the value returned by the :code:`parse` call is the result of the entire closure. This is in turn mapped onto the :code:`intField` field of the Avro record. -Apart from mapping values onto fields, it is also possible to map a literal onto a field:: +Apart from mapping values onto fields, it is also possible to map a literal onto a field: + +.. code-block:: groovy map 'string literal' onto 'stringField' map true onto 'booleanField' -This is most often used in combination with `Conditional mapping (when)`_, like in this example:: +This is most often used in combination with `Conditional mapping (when)`_ as in this example: + +.. code-block:: groovy - when referer().isAbsent() apply { // Only apply this mapping when a referer is absent + when referer().isAbsent() apply { // Only apply this mapping when a referer is absent map true onto 'directTraffic' } Value presence and nulls """""""""""""""""""""""" -Not all values are present in each request. For example when using a custom cookie value, there could be incoming requests where the cookie is not sent by the client. In this case, the cookie value is said to absent. Divolte Collector will never actively set a null value. Instead for absent values it does nothing at all; i.e. the mapped field is not set on the Avro record. When values that are absent are used in subsequent constructs, the resulting values will also be absent. In the following example, if the incoming request has no referrer, the field 'intField' will never be set, but no error occurs:: +Not all values are present in each event. For example, when using a custom cookie value there could be incoming events where the cookie is not sent by the client. In this case the cookie value is said to absent. Divolte Collector will never actively set a null value. Instead for absent values it does nothing at all: the mapped field is not set on the Avro record. When values that are absent are used in subsequent expressions the derived values will also be absent. In the following example the :code:`intField` field will never be set because the incoming request has no referrer. This is not an error: + +.. 
code-block:: groovy def u = parse referer() to uri // parse a URI out of the referer def q = u.query() // parse the query string of the URI def i = parse q.value('foo') to int32 // parse a int out of the query parameter 'foo' map i onto 'intField' // map it onto the field 'intField' -Because absent values result in fields not being set, your schema must have default values for all fields that are used for mappings where the value can be absent. In practice, it is recommended to always use default values for all fields in your schema. +Because absent values result in fields not being set your schema must have default values for all fields that are used for mappings where the value can be absent. In practice, it is recommended to always use default values for all fields in your schema. Types ^^^^^ -Values in the mapping are typed and the value type must match the type of the Avro field that they are mapped onto. Divolte Collector checks the type compatibility during startup and will report an error if there is a mismatch. The type for a value can be found in the documentation below. - -Below is a table of all types that can be produced in a mapping and the corresponding Avro schema's that match them: - -+----------------------------+------------------------------------------------------------------------+ -| type | Avro type | -+============================+========================================================================+ -| string | :: | -| | | -| | { "name": "fieldName", "type": ["null","string"], "default": null } | -+----------------------------+------------------------------------------------------------------------+ -| boolean | :: | -| | | -| | { "name": "fieldName", "type": ["null","boolean"], "default": null } | -+----------------------------+------------------------------------------------------------------------+ -| int | :: | -| | | -| | { "name": "fieldName", "type": ["null","int"], "default": null } | -+----------------------------+------------------------------------------------------------------------+ -| long | :: | -| | | -| | { "name": "fieldName", "type": ["null","long"], "default": null } | -+----------------------------+------------------------------------------------------------------------+ -| float | :: | -| | | -| | { "name": "fieldName", "type": ["null","float"], "default": null } | -+----------------------------+------------------------------------------------------------------------+ -| double | :: | -| | | -| | { "name": "fieldName", "type": ["null","double"], "default": null } | -+----------------------------+------------------------------------------------------------------------+ -| map> | :: | -| | | -| | { | -| | "name": "fieldName", | -| | "type": [ | -| | "null", | -| | { | -| | "type": "map", | -| | "values": { | -| | "type": "array", | -| | "items": "string" | -| | } | -| | } | -| | ], | -| | "default": null | -| | } | -+----------------------------+------------------------------------------------------------------------+ -| list | :: | -| | | -| | { | -| | "name": "fieldName", | -| | "type": | -| | [ | -| | "null", | -| | { | -| | "type": "array", | -| | "items": "int" | -| | } | -| | ], | -| | "default": null | -| | } | -+----------------------------+------------------------------------------------------------------------+ -| JSON (JsonNode) | _Must match the structure of the JSON fragment._ | -| | _See :ref:`mapping-json-label`._ | -+----------------------------+------------------------------------------------------------------------+ - 
-Casting / parsing -""""""""""""""""" -Many of the simple values that can be extracted from a request are strings. Possibly, these values are not intended to be strings. Because type information about things like query parameters or path components is lost in a HTTP request, Divolte Collector can only treat these as strings. It is, however, possible to parse string to other primitive or other types in the mapping using this construct:: +Values in a mapping are typed and the value type must match the type of the Avro field that they are mapped onto. Divolte Collector checks for type compatibility during startup and will report an error if there is a mismatch. The type for a value can be found in the documentation below. + +Below is a table of all types that can be produced in a mapping and the corresponding Avro types that match them: + ++----------------------------------+------------------------------------------------------------------------+ +| Type | Avro type | ++==================================+========================================================================+ +| :code:`String` | .. code-block:: json | +| | | +| | { "name": "fieldName", "type": ["null","string"], "default": null } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`Boolean` | .. code-block:: json | +| | | +| | { "name": "fieldName", "type": ["null","boolean"], "default": null } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`int` | .. code-block:: json | +| | | +| | { "name": "fieldName", "type": ["null","int"], "default": null } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`long` | .. code-block:: json | +| | | +| | { "name": "fieldName", "type": ["null","long"], "default": null } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`float` | .. code-block:: json | +| | | +| | { "name": "fieldName", "type": ["null","float"], "default": null } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`double` | .. code-block:: json | +| | | +| | { "name": "fieldName", "type": ["null","double"], "default": null } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`Map>` | .. code-block:: json | +| | | +| | { | +| | "name": "fieldName", | +| | "type": [ | +| | "null", | +| | { | +| | "type": "map", | +| | "values": { | +| | "type": "array", | +| | "items": "string" | +| | } | +| | } | +| | ], | +| | "default": null | +| | } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`List` | .. code-block:: json | +| | | +| | { | +| | "name": "fieldName", | +| | "type": | +| | [ | +| | "null", | +| | { | +| | "type": "array", | +| | "items": "int" | +| | } | +| | ], | +| | "default": null | +| | } | ++----------------------------------+------------------------------------------------------------------------+ +| JSON (:code:`JsonNode`) | Must match the structure of the JSON fragment. | +| | See :ref:`mapping-json-label`. 
| ++----------------------------------+------------------------------------------------------------------------+ + +Casting/parsing +""""""""""""""" +Many of the simple values that can be extracted from an event are strings. Sometimes these values are not intended to be strings. Because type information about things like query parameters or path components is not present in a HTTP request, Divolte Collector can only treat these values as strings. It is, however, possible to parse a string to a primitive or other type in the mapping using this construct: + +.. code-block:: groovy def i = parse stringValue to int32 -In the example above, stringValue is a value of type string and the result value, assigned to i, will be of type int. *Note that this is not casting, but string parsing. When the string value cannot be parsed to an int (because it is not a number), then the resulting value will be absent, but no error occurs.* +In the example above, :code:`stringValue` is a string value and the result value, assigned to :code:`i`, will be of type :code:`int`. + +.. note:: + + This is not casting, but string parsing. If the string value cannot be parsed to an integer (because it is not a number) the resulting value will be absent, but no error occurs. + +A more complete example is this: -A more complete example is this:: +.. code-block:: groovy def u = parse referer() to uri // u is of type URI (which is not mappable) def q = u.query() // q is of type map> @@ -199,75 +219,77 @@ A more complete example is this:: def i = parse s to int32 // i is of type int map i onto 'intField' // map it onto the field 'intField' -Because int, long, boolean, etc. are reserved words in Groovy, the mapping DSL uses aliases for casting. These are all the type that can be used for parsing and the corresponding mapping type: +Because :code:`int`, :code:`long`, :code:`Boolean`, etc. are reserved words in Groovy, the mapping DSL uses aliases for parsing. The following table lists the types that can be used for parsing and the corresponding mapping types: +-------------------+-------------------+ -| parsing alias | type | +| Parsing alias | Type | +===================+===================+ -| int32 | int | +| :code:`int32` | :code:`int` | +-------------------+-------------------+ -| int64 | long | +| :code:`int64` | :code:`long` | +-------------------+-------------------+ -| fp32 | float | +| :code:`fp32` | :code:`float` | +-------------------+-------------------+ -| fp64 | double | +| :code:`fp64` | :code:`double` | +-------------------+-------------------+ -| bool | boolean | +| :code:`bool` | :code:`Boolean` | +-------------------+-------------------+ -| uri | `URI`_ | +| :code:`uri` | :code:`URI` | +-------------------+-------------------+ .. _mapping-json-label: -Mapping JSON (``JsonNode``) to Avro fields -"""""""""""""""""""""""""""""""""""""""""" - -Some expressions, for example, ``eventParameters()`` (and its ``path()`` method), produce a ``JsonNode`` value that represents JSON supplied by a client. Because Avro doesn't have a type built in to handle arbitrary JSON data, a *compatible* Avro type must be chosen to match the expected structure of the JSON from the client. The following table lists the rules for compatibility between JSON values and Avro types. 
- -+---------------+-------------------------------------------------------------------------+ -| Avro type | JSON value | -+===============+=========================================================================+ -| | ``null`` | JSON's ``null`` value | -+---------------+-------------------------------------------------------------------------+ -| | ``boolean`` | A JSON boolean, or a string if it can be parsed as a boolean. | -+---------------+-------------------------------------------------------------------------+ -| | ``int`` | A JSON number, or a string if it can be parsed as a number. | -| | ``long`` | Fractional components are truncated for ``float`` and ``double``. | -+---------------+-------------------------------------------------------------------------+ -| | ``float`` | A JSON number, or a string if it can be parsed as a number. | -| | ``double`` | Note that full floating-point precision may not be preserved. | -+---------------+-------------------------------------------------------------------------+ -| | ``bytes`` | A JSON string, with BASE64 encoded binary data. | -+---------------+-------------------------------------------------------------------------+ -| | ``string`` | A JSON string, number or boolean value. | -+---------------+-------------------------------------------------------------------------+ -| | ``enum`` | A JSON string, so long as the it's identical to one of the | -| | enumeration's symbols. (If not, the value will be treated as null.) | -+---------------+-------------------------------------------------------------------------+ -| | ``record`` | A JSON object, with each property corresponding to a field in the | -| | record. (Extraneous properties are ignored.) The property values and | -| | field types must also be compatible. | -+---------------+-------------------------------------------------------------------------+ -| | ``array`` | A JSON array. Each element of the JSON array must be compatible with | -| | the type declared for the Avro array. | -+---------------+-------------------------------------------------------------------------+ -| | ``map`` | A JSON object, with each property being an entry in the map. Property | -| | names are used for keys, and the values must be compatible with the | -| | Avro type for the map values. | -+---------------+-------------------------------------------------------------------------+ -| | ``union`` | Only trivial unions are supported of ``null`` with another type. The | -| | JSON value must either be null or compatible with the other union type. | -+---------------+-------------------------------------------------------------------------+ -| | ``fixed`` | The same as ``bytes``, as above. Data beyond the declared length will | -| | be truncated. | -+---------------+-------------------------------------------------------------------------+ +Mapping JSON (:code:`JsonNode`) to Avro fields +"""""""""""""""""""""""""""""""""""""""""""""" + +Some expressions, for example, :code:`eventParameters()` (and its :code:`path()` method), produce a :code:`JsonNode` value that represents JSON supplied by a client. Because Avro doesn't have a type for handling arbitrary JSON data, a *compatible* Avro type must be chosen to match the expected structure of the JSON from the client. The following table lists the rules for compatibility between JSON values and Avro types. 
+ ++-------------------+---------------------------------------------------------------------------+ +| Avro type | JSON value | ++===================+===========================================================================+ +| | :code:`null` | JSON's :code:`null` value | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`boolean` | A JSON boolean, or a string if it can be parsed as a boolean. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`int` | A JSON number, or a string if it can be parsed as a number. | +| | :code:`long` | Fractional components are truncated for :code:`float` and :code:`double`. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`float` | A JSON number, or a string if it can be parsed as a number. | +| | :code:`double` | Note that full floating-point precision may not be preserved. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`bytes` | A JSON string, with BASE64 encoded binary data. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`string` | A JSON string, number or boolean value. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`enum` | A JSON string, so long as the it's identical to one of the enumeration's | +| | symbols. (If not, the value will be treated as :code:`null`. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`record` | A JSON object, with each property corresponding to a field in the record. | +| | (Extraneous properties are ignored.) The property values and field types | +| | must also be compatible. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`array` | A JSON array. Each element of the JSON array must be compatible with the | +| | type declared for the Avro array. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`map` | A JSON object, with each property being an entry in the map. Property | +| | names are used for keys, and the values must be compatible with the Avro | +| | type for the map values. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`union` | Only trivial unions are supported of :code:`null` with another type. The | +| | JSON value must either be null or compatible with the other union type. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`fixed` | The same as :code:`bytes`, as above. Data beyond the declared length will | +| | be truncated. | ++-------------------+---------------------------------------------------------------------------+ In addition to these compatibility rules, trivial array wrapping and unwrapping will be performed if necessary: * If the Avro type specifies an array, any JSON value compatible with the type of the array elements will be wrapped as a single-element array. * If the Avro type is not an array, a JSON array containing a single element that is compatible will be unwrapped. 
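As an illustration of the wrapping rule, consider the following hedged sketch. The :code:`addToCart` event, its :code:`tag` parameter and the :code:`tags` field are hypothetical names chosen for this example only; the construct itself (:code:`eventParameters().path(...)`) is described later in this chapter:

.. code-block:: groovy

    // Hypothetical: the client signalled
    //   divolte.signal('addToCart', { tag: 'sale' });
    // and the Avro schema declares a field 'tags' of type
    //   ["null", { "type": "array", "items": "string" }].
    // The JSON-path expression below matches a single JSON string, which is
    // wrapped into a one-element array before being mapped onto the array field.
    map eventParameters().path('$.tag') onto 'tags'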
-For example, a shopping basket could be supplied as the following JSON:: +For example, a shopping basket could be supplied as the following JSON: + +.. code-block:: json { "total_price": 184.91, @@ -279,7 +301,9 @@ For example, a shopping basket could be supplied as the following JSON:: ] } -This could be mapped using the following Avro schema:: +This could be mapped using the following Avro schema: + +.. code-block:: json { "type": [ @@ -317,24 +341,30 @@ The Avro field will remain unchanged if mapping fails at runtime because the JSO Unlike most mappings, schema compatibility for JSON mappings cannot be checked on startup because compatibility depends on the JSON supplied with each individual event. -Conditional mapping (when) -^^^^^^^^^^^^^^^^^^^^^^^^^^ -Not all incoming requests are the same and usually, different types of requests require different values to be extracted and different fields to be set. This can be achieved using conditional mapping. With conditional mapping any boolean value can be used to conditionally apply a part of the mapping script. This can be done using the following syntax:: +Conditional mapping (:code:`when`) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Not all incoming requests are the same and usually, different types of requests require different values to be extracted and different fields to be set. This can be achieved using conditional mapping. With conditional mapping any boolean value can be used to conditionally apply a part of the mapping script. This can be done using the following syntax: + +.. code-block:: groovy when conditionBooleanValue apply { // Conditional mapping go here map 'value' onto 'fieldName' } -A more concrete example of using this construct would be:: +A more concrete example of using this construct would be: + +.. code-block:: groovy when referer().isAbsent() apply { map true onto 'directTraffic' } -Here we check whether the referrer value is absent and if so, map a literal value onto a boolean field. +Here we check whether the referer value is absent and if so, map a literal value onto a boolean field. -As an alternative syntax, it is possible to use a closure that produces the boolean value as well, just like in `Mapping values onto fields (map)`_. In this example we check if a query parameter called clientId is present in the location and on that condition perform a mapping:: +As an alternative syntax, it is possible to use a closure that produces the boolean value as well, just like in `Mapping values onto fields (map)`_. In this example we check if a query parameter called :code:`clientId` is present in the location and on that condition perform a mapping: + +.. code-block:: groovy when { def u = parse location() to uri @@ -345,42 +375,46 @@ As an alternative syntax, it is possible to use a closure that produces the bool Conditions """""""""" -Any boolean value can be used as a condition. In order to be able to create flexible conditional mappings, the mapping DSL provides a number of methods on values to produce booleans that are useful in conditional mappings, such as equality comparisons and boolean logic: - -+------------------------------------------------+----------------------------------------------------------------+ -| Condition | Description | -+================================================+================================================================+ -| value.isPresent() | True if the value is present. 
See: `Value presence and nulls`_ | -+------------------------------------------------+----------------------------------------------------------------+ -| value.isAbsent() | True if the value is absent. See: `Value presence and nulls`_ | -+------------------------------------------------+----------------------------------------------------------------+ -| value.equalTo(otherValue) | True if both values are equal. Values must be of the same type.| -+------------------------------------------------+----------------------------------------------------------------+ -| value.equalTo('literal') | True if the value is equal to the given literal. Types other | -| | than string are supported as well. | -+------------------------------------------------+----------------------------------------------------------------+ -| booleanValue.and(otherBooleanValue) | True if booleanValue AND otherBooleanValue are true. | -+------------------------------------------------+----------------------------------------------------------------+ -| booleanValue.or(otherBooleanValue) | True if booleanValue OR otherBooleanValue or both are true. | -+------------------------------------------------+----------------------------------------------------------------+ -| not booleanValue | True if booleanValue is false. | -+------------------------------------------------+----------------------------------------------------------------+ -| regexMatcherValue.matches() | True if the regex matches the value. See: | -| | `Regular expression matching`_. | -+------------------------------------------------+----------------------------------------------------------------+ - -Sections and short circuit -^^^^^^^^^^^^^^^^^^^^^^^^^^ -Sections are useful for grouping together parts of the mapping that somehow form a logical subset of the entire mapping. This makes it possible to conditionally jump out of a section as well. To define a section, just use the section keyword followed by a closure that contains the section:: +Any boolean value can be used as a condition. In order to be able to create flexible conditional mappings, the mapping DSL provides a number of methods on values that return booleans useful in conditional mappings, such as equality comparisons and boolean logic: + ++-------------------------------------------------+----------------------------------------------------------------+ +| Condition | Description | ++=================================================+================================================================+ +| :samp:`{value}.isPresent()` | True if the value is present. See: `Value presence and nulls`_ | ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`{value}.isAbsent()` | True if the value is absent. See: `Value presence and nulls`_ | ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`{value}.equalTo({otherValue})` | True if both values are equal. Values must be of the same type.| ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`{value}.equalTo({'literal'})` | True if the value is equal to the given literal. Non-string | +| | types are supported as well. | ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`{booleanValue}.and({otherBooleanValue})` | True if both booleans are true. 
| ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`{booleanValue}.or({otherBooleanValue})` | True if either or both of the boolean values are true. | ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`not {booleanValue}` | True if the boolean value is false. | ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`{regexMatcherValue}.matches()` | True if the regular expression matches the value. See: | +| | `Regular expression matching`_. | ++-------------------------------------------------+----------------------------------------------------------------+ + +Sections and short circuiting +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Sections are useful for grouping together parts of the mapping that form a logical subset of the entire mapping. In addition to grouping it is possible to conditionally stop processing a section prematurely. Sections are defined using the :code:`section` keyword followed by a closure that contains the section: + +.. code-block:: groovy section { // Section's mappings go here map 'value' onto 'field' } -exit -"""" -The exit() method will, at any point, break out of the enclosing section or, when no enclosing section can be found, break out of the entire mapping script. This can be used to conditionally break out of a section, for example to create a type of first-match-wins scenario:: +Function: :code:`exit()` +"""""""""""""""""""""""" +The :code:`exit()` function will, at any point, break out of the enclosing section or, when no enclosing section can be found, break out of the entire mapping script. This can be used to conditionally break out of a section. For example to create a type of first-match-wins scenario: + +.. code-block:: groovy section { def u = parse location() to uri @@ -400,63 +434,82 @@ The exit() method will, at any point, break out of the enclosing section or, whe // other mappings here -There is a optional shorthand syntax for conditionally exiting from a section, which leaves out the apply keyword and closure like this:: +There is a optional shorthand syntax for conditionally exiting from a section which leaves out the :code:`apply` keyword and closure: + +.. code-block:: groovy when referer().isAbsent() exit() -stop -"""" -The stop() method will, at any point, stop *all* further processing and break out of the entire mapping script. This is typically applied conditionally. Generally, it is safer to use sections and exit() instead. Use with care. The stop() method can also be used conditionally, just as anything else:: +Function: :code:`stop()` +"""""""""""""""""""""""" +The :code:`stop()` function will, at any point, stop *all* further processing and break out of the entire mapping script. This is typically applied conditionally. Generally, it is safer to use sections and :code:`exit()` instead. Use with care. The :code:`stop()` function can also be used conditionally, just as anything else: + +.. code-block:: groovy when referer().isAbsent() { stop() } -Or, using shorthand syntax:: +Or, using shorthand syntax: + +.. code-block:: groovy when referer().isAbsent stop() A word on groovy ---------------- -Groovy is a dynamic language for the JVM. This means, amongst other things, that you don't have to specify the types of variables:: +Groovy is a dynamic language for the JVM. 
This means, amongst other things, that you don't have to specify the types of variables: + +.. code-block:: groovy def i = 40 println i + 2 -The above snippet will print out 42 as you would expect. Note two things: we never specified that variable i is an int and also, we are not using any parenthese in the println method call. Groovy allows to leave out the parentheses in most method calls. The code above is equal to this snippet:: +The above snippet will print out 42 as you would expect. Note two things: we never specified that variable i is an int and also, we are not using any parentheses in the :code:`println` function call. Groovy allows to leave out the parentheses in most function and method calls. The code above is equivalent to this snippet: + +.. code-block:: groovy def i = 42 println(i + 2) -Which in turn is equals to this:: +This in turn is equivalent to this: + + +.. code-block:: groovy def i = 42 println(i.plus(2)) -When chaining single argument methods, this works out well. However, with nested method calls, this can be more problematic. Let's say we have a method called increment which increments the argument by one; so increment(10) will return 11. For example the following will not compile:: +This works well when chaining single argument methods. However, this can be more problematic with nested method calls. Suppose we have a function called :samp:`increment({x})` which increments the :code:`x` argument by 1, so :code:`increment(10)` will return 11. The following will not compile: + +.. code-block:: groovy println increment 10 -But this will:: +However this will: + +.. code-block:: groovy println(increment(10)) -And this won't:: +Yet this won't: + +.. code-block:: groovy println(increment 10) -In the Divolte Collector mapping DSL, it is sometimes required to chain method calls. For example when using the result of a casting operation in a mapping. We solve this by accepting a closure that produces a value as result:: +In the Divolte Collector mapping DSL, it is sometimes required to chain method calls. For example when using the result of a casting operation in a mapping. We solve this by accepting a closure that produces a value as result: - map { parse cookie('customer_id') to int32 } onto 'customerId' +.. code-block:: groovy -This way, you don't have to add parentheses to all intermediate method calls and we keep the syntax fluent. If you follow these general guidelines, you should be safe: + map { parse cookie('customer_id') to int32 } onto 'customerId' -* When calling methods that produce a value, always use parentheses. For example: location(), referer(), partyId() -* When deriving a condition or other value from a method that produces a value, also use parenthese. Example: +This way you don't have to add parentheses to all intermediate method calls and we keep the syntax fluent. If you follow these general guidelines, you should be safe: - .. +* When calling methods that produce a value, always use parentheses. For example: :code:`location()`, :code:`referer()`, :code:`partyId()` +* When deriving a condition or other value from a method that produces a value, also use parentheses. For example: - :: + .. code-block:: groovy when location().equalTo('http://www.example.com/') apply { ... @@ -466,13 +519,9 @@ This way, you don't have to add parentheses to all intermediate method calls and map parsedUri.query().value('foo') onto 'field' - .. - * When parsing or matching on something, extract it to a variable before using it. This also improves readability: - .. 
- - :: + .. code-block:: groovy def myUri = parse location() to uri when myUri.query().value('foo').isPresent() apply { ... } @@ -480,95 +529,103 @@ This way, you don't have to add parentheses to all intermediate method calls and def myMatcher = match '^/foo/bar/([a-z]+)/' against myUri.path() when myMatcher.matches() apply { ... } - .. - * When casting inline, use the closure syntax for mapping or conditionals: - .. - - :: + .. code-block:: groovy map { parse cookie('example') to int32 } onto 'field' Simple values ^^^^^^^^^^^^^ -Simple values are pieces of information that are directly extracted from the request without any processing. You can map simple values directly onto fields of the correct type or you can use them in further processing, such as regex matching and extraction or URI parsing. +Simple values are pieces of information that are directly extracted from the event without any processing. You can map simple values directly onto fields of the correct type or you can use them in further processing, such as matching againast a regular expression or URI parsing. + +.. _location: -location -"""""""" +Simple value: :code:`location()` +"""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map location() onto 'locationField' :Description: - The location of this request: the full address in the address bar of the user's browser, including the fragment part if this is present (the part after the #). This is different from server side request logs, which will not be able to catch the fragment part. + The location URL for the page-view that triggered the event: the full address in the address bar of the user's browser. This includes the fragment part if this is present (the part after the ``#``), which is different from server side request logs which do not contain the fragment part. :Type: - string + :code:`string` -referer -""""""" +.. _referer: + +Simple value: :code:`referer()` +""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map referer() onto 'refererField' :Description: - The referer of this request. Note that the referer is taken from JavaScript and does not depend on any headers being sent by the browser. The referer will not contain any fragment part that might have been present in the user's address bar. + The referrer URL for the page-view that triggered the event. Unlike :code:`location()`, the referer will not contain any fragment part. :Type: - string + :code:`String` + +.. _firstInSession: -firstInSession -"""""""""""""" +Simple value: :code:`firstInSession()` +"""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map firstInSession() onto 'first' :Description: - A boolean flag that is set to true if a new session ID was generated for this request and false otherwise. A value of true indicates that a new session has started. + A boolean flag that is true if a new session ID was generated for this event and false otherwise. If true a new session has started. :Type: - boolean + :code:`Boolean` + +.. _corrupt: -corrupt -""""""" +Simple value: :code:`corrupt()` +""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map corrupt() onto 'detectedCorruption' :Description: - A boolean flag that is set to true when the request checksum does not match the request contents and false otherwise. Whenever a the JavaScript performs a request, it calculates a hash code of all request properties and adds this hash code at the end of the request. 
On the server side, this hash is calculated again and checked for correctness. Corrupt requests usually occur when intermediate parties try to re-write requests or truncate long URLs (e.g. proxies and anti-virus software can have this habit). + A boolean flag that is true if the source for the event detected corruption of the event data. Event corruption usually occurs when intermediate parties try to re-write HTTP requests or truncate long URLs. Real-world proxies and anti-virus software has been observed doing this. :Type: - boolean + :code:`Boolean` -duplicate -""""""""" +.. _duplicate: + +Simple value: :code:`duplicate()` +""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map duplicate() onto 'detectedDuplicate' :Description: - A boolean flag that is set to true when the request is believed to be duplicated and false otherwise. Duplicate detection in Divolte Collector utilizes a probabilistic data structure that has a low false positive and false negative rate. Nonetheless, these can still occur. Duplicate requests are often performed by certain types of anti-virus software and certain proxies. Additionally, sometimes certain browsers go haywire and send the same request large numbers of times (in the tens of thousands). The duplicate flag server as a line of defense against this phenomenon, which is particularly handy in real-time processing where it is not practical to perform de-duplication of the data based on a full data scan. + A boolean flag that true when the event is believed to be a duplicate of an earlier one. Duplicate detection in Divolte Collector utilizes a probabilistic data structure that has a low false positive and false negative rate. Nonetheless classification mistakes can still occur. Duplicate events often arrive due to certain types of anti-virus software and certain proxies. Additionally, browsers sometimes go haywire and send the same request large numbers of times (in the tens of thousands). Duplicate detection can be used to mitigate the effects when this occurs. This is particularly handy in real-time processing where it is not practical to perform de-duplication of the data based on a full data scan. :Type: - boolean + :code:`Boolean` + +.. _timestamp: -timestamp -""""""""" +Simple value: :code:`timestamp()` +""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map timestamp() onto 'timeField' @@ -576,13 +633,15 @@ timestamp The timestamp of the time the the request was received by the server, in milliseconds since the UNIX epoch. :Type: - long + :code:`long` -clientTimestamp -""""""""""""""" +.. _clientTimestamp: + +Simple value: :code:`clientTimestamp()` +""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map clientTimestamp() onto 'timeField' @@ -590,27 +649,31 @@ clientTimestamp The timestamp that was recorded on the client side immediately prior to sending the request, in milliseconds since the UNIX epoch. :Type: - long + :code:`long` -remoteHost -"""""""""" +.. _remoteHost: + +Simple value: :code:`remoteHost()` +"""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map remoteHost() onto 'ipAddressField' :Description: - The remote IP address of the request. Depending on configuration, Divolte Collector will use any X-Forwarded-For headers set by intermediate proxies or load balancers. + The remote IP address of the request. Depending on configuration, Divolte Collector will use any :mailheader:`X-Forwarded-For` headers set by intermediate proxies or load balancers. 
:Type: - string + :code:`String` -viewportPixelWidth -"""""""""""""""""" +.. _viewportPixelWidth: + +Simple value: :code:`viewportPixelWidth()` +"""""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map viewportPixelWidth() onto 'widthField' @@ -618,13 +681,15 @@ viewportPixelWidth The width of the client's browser viewport in pixels. :Type: - int + :code:`int` + +.. _viewportPixelHeight: -viewportPixelHeight -""""""""""""""""""" +Simple value: :code:`viewportPixelHeight()` +""""""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map viewportPixelHeight() onto 'widthField' @@ -632,13 +697,15 @@ viewportPixelHeight The height of the client's browser viewport in pixels. :Type: - int + :code:`int` -screenPixelWidth -"""""""""""""""" +.. _screenPixelWidth: + +Simple value: :code:`screenPixelWidth()` +"""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map screenPixelWidth() onto 'widthField' @@ -646,13 +713,15 @@ screenPixelWidth The width of the client's screen in pixels. :Type: - int + :code:`int` + +.. _screenPixelHeight: -screenPixelHeight -""""""""""""""""" +Simple value: :code:`screenPixelHeight()` +""""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map screenPixelHeight() onto 'widthField' @@ -660,13 +729,15 @@ screenPixelHeight The height of the client's screen in pixels. :Type: - int + :code:`int` + +.. _devicePixelRatio: -devicePixelRatio -"""""""""""""""" +Simple value: :code:`devicePixelRatio()` +"""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map devicePixelRatio() onto 'ratioField' @@ -674,115 +745,140 @@ devicePixelRatio The ratio of physical pixels to logical pixels on the client's device. Some devices use a scaled resolution, meaning that the resolution and the actual available pixels are different. This is common on retina-type displays, with very high pixel density. :Type: - int + :code:`int` -partyId -""""""" +.. _partyId: + +Simple value: :code:`partyId()` +""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map partyId() onto 'partyField' :Description: - A unique identifier stored with the client in a long lived cookie. The party ID identifies a known device. + A long-lived unique identifier stored by a client that is associated with each event from that source. All events from the same client should have the same party identifier. + + For browser sources this value is stored in a cookie. :Type: - string + :code:`String` + +.. _sessionId: -sessionId -""""""""" +Simple value: :code:`sessionId()` +""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map sessionId() onto 'sessionField' :Description: - A unique identifier stored with the client in a cookie that is set to expire after a fixed amount of time (default: 30 minutes). Each new request resets the session expiry time, which means that a new session will start after the session timeout has passed without any activity. + A short-lived unique identifier stored by a client that is associated with each event from that source within a session of activity. All events from the same client within a session should have the same session identifier. + + For browser sources a session to expire when 30 minutes has elapsed without any events occurring. :Type: - string + :code:`String` -pageViewId -"""""""""" +.. _pageViewId: + +Simple value: :code:`pageViewId()` +"""""""""""""""""""""""""""""""""" :Usage: - :: + .. 
code-block:: groovy map pageViewId() onto 'pageviewField' :Description: - A unique identifier that is generated for each pageview request. + A unique identifier that is generated for each page-view. All events from a client within the same page-view will have the same page-view identifier. + + For browser sources a page-view starts when the user visits a page, and ends when the user navigates to a new page. Note that navigating within single-page web applications or links to anchors within the same page do *not* normally trigger a new page-view. :Type: - string + :code:`String` + +.. _eventId: -eventId -""""""" +Simple value: :code:`eventId()` +""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map eventId() onto 'eventField' :Description: - A unique identifier that is created for each event that is fired by taking the pageViewId and appending a monotonically increasing number to it. + A unique identifier that is associated with each event received from a source. (This identifier is assigned by the client, not by the server.) :Type: - string + :code:`String` -userAgentString -""""""""""""""" +.. _userAgentString: + +Simple value: :code:`userAgentString()` +""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map userAgentString() onto 'uaField' :Description: - The full user agent identification string as reported by the client's browser. See `User agent parsing`_ on how to extract more meaningful information from this string. + The full user agent identification string reported by the client HTTP headers when sending an event. + + See `User agent parsing`_ on how to extract more meaningful information from this string. :Type: - string + :code:`String` + +.. _cookie: -cookie -"""""" +Simple value: :samp:`cookie({name})` +"""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map cookie('cookie_name') onto 'customCookieField' :Description: - The value for a cookie that was sent by the client's browser in the request. + The value of a cookie included in the client HTTP headers when sending an event. :Type: - string + :code:`String` -eventType -""""""""" +.. _eventType: + +Simple value: :code:`eventType()` +""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map eventType() onto 'eventTypeField' :Description: - The type of event that was captured in this request. This defaults to 'pageView', but can be overridden when custom events are fired from JavaScript within a page. + The type of event being processed. + + The tracking tag used by sites integrating with browser sources automatically issue a :code:`pageView` event by default + when a page-view commences. Custom events may set this value to anything they like. :Type: - string + :code:`String` Complex values ^^^^^^^^^^^^^^ -Complex values return objects that you can in turn use to extract derived, simple values from. Complex values are either the result of parsing something (e.g. the user agent string) or matching regular expressions against another value. +Complex values often return intermediate objects that you extract derived, simple values for mapping onto fields. The main exception to this is when working with event-parameters: the :code:`JsonNode` results can be mapped directly to fields, so long as they are of the right 'shape'; see :ref:`mapping-json-label` for more details. -eventParameters -""""""""""""""" +Complex value: :code:`eventParameters()` +"""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. 
code-block:: groovy // on the client in JavaScript: divolte.signal('myEvent', { foo: 'hello', bar: 42 }); @@ -791,40 +887,46 @@ eventParameters map eventParameters() onto 'parametersField' :Description: - A JSON object (``JsonNode``) containing the custom parameters that were submitted with + A JSON object or array (:code:`JsonNode`) containing the custom parameters that were submitted with the event. See :ref:`mapping-json-label` for an example on how to map this to a field. :Type: - JsonNode + :code:`JsonNode` -eventParameters value -""""""""""""""""""""" +Derived simple value: :samp:`eventParameters().value({name})` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + On a site submitting events to a browser source: + + .. code-block:: javascript - // On the client in JavaScript: divolte.signal('myEvent', { foo: 'hello', bar: 42 }); - // In the mapping: + In the mapping: + + .. code-block:: groovy + map eventParameters().value('foo') onto 'fooField' // Or with a cast: map { parse eventParameters().value('bar') to int32 } onto 'barField' :Description: - The value for a parameter that was sent as part of a custom event from JavaScript. Note that this is always a string, regardless of the type used on the client side. In the case that you are certain a parameter has a specific type, you can explicitly cast it as in the example above. + The value for an event parameter that was sent as part of a custom event. Note that this is always a string, regardless of the type used on the client side. If you are certain a parameter has a specific format you can explicitly cast it as in the example above. :Type: - string + :code:`String` -eventParameters path -"""""""""""""""""""" +Derived complex value: :samp:`eventParameters().path({expression})` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + On a site submitting events to a browser source: + + .. code-block:: javascript // On the client in JavaScript: divolte.signal('searchResults', [ @@ -832,60 +934,66 @@ eventParameters path { "sku": "0094638246817", "score": 0.8 } ]); - // In the Avro schema: + In the Avro schema: + + .. code-block:: json + { "name": "searchResults", "type": [ "null", { "type": "array", "items": "string" } ], "default": null } - // In the mapping: + In the mapping: + + .. code-block:: groovy + map eventParameters().path('$[*].sku') onto 'searchResults' :Description: This can be used to extract parts of parameters supplied with the event using a JSON-path expression. (See http://goessner.net/articles/JsonPath/ for a description of JSON-path expressions.) - If the expression does not match anything, the value is not considered to be present. (A ``when`` expression can test for this.) + If the expression does not match anything, the value is not considered to be present. (A :code:`when` expression can test for this.) See :ref:`mapping-json-label` for an example on how to map JSON values to a field. Expressions can return more than one result; these are presented as a JSON array for subsequent mapping. :Type: - JsonNode + :code:`JsonNode` -URI -""" +Complex conversion: :code:`uri` +""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy - def locationUri = parse location() to uri + def locationUri = parse location() to uri :Description: - Attempts to parse a string into a URI. The most obvious values to use for this are the location() and referer() values, but you can equally do the same with custom event parameters or any other string. 
If the parser fails to create a URI from a string, than the value will be absent. Note that the parsed URI itself is not directly mappable onto any Avro field. + Attempts to parse a string as a URI. The most obvious candidates to use for this are the :code:`location()` and :code:`referer()` values, but you can equally do this same with custom event parameters or any other string value. If the parser fails to create a URI from a string, then the value will be absent. Note that the parsed URI itself is not directly mappable onto any Avro field. :Type: - URI + :code:`URI` -URI path -~~~~~~~~ +Derived simple value: :code:`URI.path()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.path() onto 'locationPathField' :Description: - The path component of a URI. Any URL encoded values in the path will be decoded. Keep in mind that if the path contains a encoded / character (%2F), this will also be decoded. Be careful when matching regular expressions against path parameters. + The path component of a URI. Any URL encoded values in the path will be decoded. Keep in mind that if the path contains a encoded :code:`/` character (:code:`%2F`), this will also be decoded. Be careful when matching regular expressions against path parameters. :Type: - string + :code:`String` -URI rawPath -~~~~~~~~~~~ +Derived simple value: :code:`URI.rawPath()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.rawPath() onto 'locationPathField' @@ -894,13 +1002,13 @@ URI rawPath The path component of a URI. This value is not decoded in any way. :Type: - string + :code:`String` -URI scheme -~~~~~~~~~~ +Derived simple value: :code:`URI.scheme()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.scheme() onto 'locationSchemeField' @@ -909,97 +1017,99 @@ URI scheme map locationUri.scheme().equalTo('https') onto 'isSecure' :Description: - The scheme component of a URI. This is the protocol part, such as http or https. + The scheme component of a URI. This is the protocol part, such as :code:`http` or :code:`https`. :Type: - string + :code:`String` -URI host -~~~~~~~~ +Derived simple value: :code:`URI.host()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.host() onto 'locationHostField' :Description: - The host component of a URI. In http://www.example.com/foo/bar, this would be: www.example.com + The host component of a URI. For :code:`http://www.example.com/foo/bar` this would be :code:`www.example.com`. :Type: - string + :code:`String` -URI port -~~~~~~~~ +Derived simple value: :code:`URI.port()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.port() onto 'locationPortField' :Description: - The port component of a URI. In http://www.example.com:8080/foo, this would be: 8080. Note that when no port is specified in the URI (e.g. http://www.example.com/foo), this value will be absent. Divolte Collector makes no assumptions about default ports for protocoles. + The port component of a URI. For :code:`http://www.example.com:8080/foo` this would be :code:`8080`. Note that when no port is specified in the URI (e.g. :code:`http://www.example.com/foo`) this value will be absent. 
Divolte Collector makes no assumptions about default ports for protocols. :Type: - int + :code:`int` -URI decodedQueryString -~~~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`URI.decodedQueryString()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.decodedQueryString() onto 'locationQS' :Description: - The full, URL decoded query string of a URI. In http://www.example.com/foo/bar.html?q=hello+world&foo%2Fbar, this would be: "q=hello world&foo/bar". + The full, URL decoded query string of a URI. For :code:`http://www.example.com/foo/bar.html?q=hello+world&foo%2Fbar`, this would be :code:`q=hello world&foo/bar`. :Type: - string + :code:`String` -URI rawQueryString -~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`URI.rawQueryString()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.rawQueryString() onto 'locationQS' :Description: - The full, query string of a URI without any decoding. In http://www.example.com/foo/bar.html?q=hello+world&foo%2Fbar, this would be: "q=hello+world&foo%2Fbar". + The full, query string of a URI without any decoding. For :code:`http://www.example.com/foo/bar.html?q=hello+world&foo%2Fbar` this would be :code:`q=hello+world&foo%2Fbar`. :Type: - string + :code:`String` -URI decodedFragment -~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`URI.decodedFragment()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.decodedFragment() onto 'locationFragment' :Description: - The full, URL decoded fragment of a URI. In http://www.example.com/foo/#/localpath/?q=hello+world&foo%2Fbar, this would be: "/localpath/?q=hello world&foo/bar". + The full, URL decoded fragment of a URI. For :code:`http://www.example.com/foo/#/localpath/?q=hello+world&foo%2Fbar` this would be :code:`/localpath/?q=hello world&foo/bar`. :Type: - string + :code:`String` -URI rawFragment -~~~~~~~~~~~~~~~ +Derived simple value: :code:`URI.rawFragment()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.rawFragment() onto 'locationFragment' :Description: - The full, fragment of a URI without any decoding. In http://www.example.com/foo/#/localpath/?q=hello+world&foo%2Fbar, this would be: "/localpath/?q=hello+world&foo%2Fbar". In web applications with rich client side functionality written in JavaScript, it is a common pattern that the fragment of the location is written as a URI again, but without a scheme, host and port. Nonetheless, it is entirely possible to parse the raw fragment of a location into a separate URI again and use this for further mapping. As an example, consider the following:: + The full, fragment of a URI without any decoding. For :code:`http://www.example.com/foo/#/localpath/?q=hello+world&foo%2Fbar` this would be :code:`/localpath/?q=hello+world&foo%2Fbar`. In web applications with rich client side functionality written in JavaScript, it is a common pattern that the fragment of the location is written as a URI again, but without a scheme, host and port. Nonetheless, it is entirely possible to parse the raw fragment of a location into a separate URI again and use this for further mapping. As an example, consider the following: + + .. 
code-block:: groovy // If location() = 'http://www.example.com/foo/#/local/path/?q=hello+world' // this would map '/local/path/' onto the field clientSidePath @@ -1008,20 +1118,22 @@ URI rawFragment map localUri.path() onto 'clientSidePath' :Type: - string + :code:`String` -Query strings -""""""""""""" +Derived complex value: :code:`URI.query()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri def locationQuery = locationUri.query() map locationQuery onto 'locationQueryParameters' :Description: - The query string from a URI parsed into a map of value lists. In the resulting map, the keys are the parameter names of the query string and the values are lists of strings. Lists are required, as a query parameter can have multiple values (by being present more than once). In order to map all the query parameters directly onto a Avro field, the field must be typed as a map of string lists, possibly a union with null, to have a sensible default when no query string is possible. In a Avro schema definition, the following field definition can be a target field for the query parameters:: + The query string from a URI parsed into a map of value lists. In the resulting map, the keys are the parameter names of the query string and the values are lists of strings. Lists are required because a query parameter can have multiple values (by being present more than once). In order to map all the query parameters directly onto a Avro field, the field must be typed as a map of string lists, possibly a union with null, to have a sensible default when no query string is possible. In a Avro schema definition, the following field definition can be a target field for the query parameters: + + .. code-block:: json { "name": "uriQuery", @@ -1039,13 +1151,13 @@ Query strings } :Type: - map> + :code:`Map>` -Query string value -~~~~~~~~~~~~~~~~~~ +Derived simple value: :samp:`URI.query().value({name})` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri def locationQuery = locationUri.query() @@ -1055,13 +1167,13 @@ Query string value The first value found for a query parameter. This value is URL decoded. :Type: - string + :code:`String` -Query string valueList -~~~~~~~~~~~~~~~~~~~~~~ +Derived complex value: :samp:`URI.query().valueList({name})` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri def locationQuery = locationUri.query() @@ -1071,18 +1183,22 @@ Query string valueList A list of all values found for a query parameter name. These values are URL decoded. :Type: - list + :code:`List` -Regular expression matching -""""""""""""""""""""""""""" +.. _Regular expression matching: + +Complex value: :samp:`match({regex}).against({stringValue})` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy def matcher = match '/foo/bar/([a-z]+).html$' against location() :Description: - Matches the given regular expression against a value; the entire value must match. The result of this can not be directly mapped onto a Avro field, but can be used to extract capture groups or conditionally perform a mapping if the pattern is a match. Often it is required to perform non-trivial partial extractions against strings that are taken from the requests. One example would be matching the path of the location with a wild card. 
It is not recommended to match patterns against the location() or referer() values directly; instead consider parsing out relevant parts of the URI first using URI parsing. In the following example, the matching is much more robust in the presence of unexpected query parameters or fragments compared to matching against the entire location string:: + Matches a regular expression against a string value; the entire value must match. The result of this can not be directly mapped onto a Avro field, but can be used to extract capture groups or conditionally perform a mapping if the pattern is a match. Often it is required to perform non-trivial partial extractions against strings that are taken from the requests. One example would be matching the path of the location with a wild card. It is not recommended to match patterns against the :code:`location()` or :code:`referer()` values directly; instead parse as an URI first and match against the relevant parts. In the following example, the matching is much more robust in the presence of unexpected query parameters or fragments compared to matching against the entire location string: + + .. code-block:: groovy def locationUri = parse location() to uri def pathMatcher = match '^/foo/bar/([a-z]+).html$' against locationUri.path() @@ -1092,13 +1208,13 @@ Regular expression matching } :Type: - Matcher + :code:`Matcher` -Regex matches -~~~~~~~~~~~~~ +Derived simple value: :code:`Matcher.matches()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def matcher = match '^/foo/bar/([a-z]+).html$' against location() @@ -1111,16 +1227,16 @@ Regex matches map matcher.matches() onto 'isFooBarPage' :Description: - True when the pattern matches the value or false otherwise. In case the target value is absent, this will produce false. + True when the value is present and matches the regular expression or false otherwise. :Type: - boolean + :code:`Boolean` -Regex group -~~~~~~~~~~~ +Derived simple value: :samp:`Matcher.group({positionOrName})` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy // Using group number def matcher = match '/foo/bar/([a-z]+).html$' against location() @@ -1134,18 +1250,20 @@ Regex group The value from a capture group in a regular expression pattern if the pattern matches, absent otherwise. Groups can be identified by their group number, starting from 1 as the first group or using named capture groups. :Type: - string + :code:`String` -HTTP headers -"""""""""""" +Complex value: :samp:`header({name})` +""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map header('header-name') onto 'fieldName' :Description: - The list of all values associated with the given HTTP header from the incoming request. A HTTP header can be present in a request multiple times, yielding multiple values for the same header name; these are returned as a list. The Avro type of the target field for this mapping must be a list of string:: + The list of all values associated with the given HTTP header from the incoming request. A HTTP header can be present in a request multiple times, yielding multiple values for the same header name; these are returned as a list. The Avro type of the target field for this mapping must be a list of string: + + .. 
code-block:: json { "name": "headers", @@ -1160,16 +1278,16 @@ HTTP headers "default": null } - Note that the array field in Avro itself is nullable and has a default value of null, whereas the items in the array are not nullable. The latter is not required, because when te header is present, the elements in the list are guaranteed to be present. + Note that the array field in Avro itself is nullable and has a default value of null, whereas the items in the array are not nullable. The latter is not required, because when the header is present the elements in the list are guaranteed to be non-null. :Type: - list + :code:`List` -HTTP header first -~~~~~~~~~~~~~~~~~ +Derived simple value: :samp:`header({name}).first()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map header('header-name').first() onto 'fieldName' @@ -1177,13 +1295,13 @@ HTTP header first The *first* of all values associated with the given HTTP header from the incoming request. A HTTP header can be present in a request multiple times, yielding multiple values for the same header name. This returns the first value in that list. :Type: - string + :code:`String` -HTTP header last -~~~~~~~~~~~~~~~~ +Derived simple value: :samp:`header({name}).last()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map header('header-name').last() onto 'fieldName' @@ -1191,13 +1309,13 @@ HTTP header last The *last* of all values associated with the given HTTP header from the incoming request. A HTTP header can be present in a request multiple times, yielding multiple values for the same header name. This returns the last value in that list. :Type: - string + :code:`String` -HTTP header commaSeparated -~~~~~~~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :samp:`header({name}).commaSeparated()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map header('header-name').commaSeparated() onto 'fieldName' @@ -1205,27 +1323,31 @@ HTTP header commaSeparated The comma separated string of all values associated with the given HTTP header from the incoming request. A HTTP header can be present in a request multiple times, yielding multiple values for the same header name. This joins that list using a comma as separator. :Type: - string + :code:`String` -User agent parsing -"""""""""""""""""" +.. _User agent parsing: + +Complex value: :code:`userAgent()` +"""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy - def ua = userAgent() + def ua = userAgent() :Description: - Attempts to parse a the result of `userAgentString`_ string into a user agent object. Note that this result is not directly mappable onto any Avro field. Instead, the subfields from this object, described below, can be mapped onto fields. When the parsing of the user agent string fails, either because the user agent is unknown or malformed, or because the user agent was not sent by the browser, this value and all subfields' values are absent. + Attempts to parse a the result of `userAgentString`_ string into a user agent object. Note that this result is not directly mappable onto any Avro field. Instead, the subfields from this object, described below, can be mapped onto fields. When the parsing of the user agent string fails, either because the user agent is unknown or malformed, or because the user agent was not sent by the browser, this value and all subfield values are absent. :Type: - ReadableUserAgent + :code:`ReadableUserAgent` + +.. 
_User agent name: -User agent name -~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().name()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().name() onto 'uaNameField' @@ -1233,13 +1355,15 @@ User agent name The canonical name for the parsed user agent. E.g. 'Chrome' for Google Chrome browsers. :Type: - string + :code:`String` + +.. _User agent family: -User agent family -~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().family()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().family() onto 'uaFamilyField' @@ -1247,27 +1371,31 @@ User agent family The canonical name for the family of the parsed user agent. E.g. 'Mobile Safari' for Apple's mobile browser. :Type: - string + :code:`String` -User agent vendor -~~~~~~~~~~~~~~~~~ +.. _User agent vendor: + +Derived simple value: :code:`userAgent().vendor()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().vendor() onto 'uaVendorField' :Description: - The name of the company or oganisation that produces the user agent software. E.g. 'Google Inc.' for Google Chrome browsers. + The name of the company or organisation that produces the user agent software. E.g. 'Google Inc.' for Google Chrome browsers. :Type: - string + :code:`String` + +.. _User agent type: -User agent type -~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().type()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().type() onto 'uaTypeField' @@ -1275,13 +1403,15 @@ User agent type The type of user agent that was used. E.g. 'Browser' for desktop browsers. :Type: - string + :code:`String` + +.. _User agent version: -User agent version -~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().version()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().version() onto 'uaVersionField' @@ -1289,13 +1419,15 @@ User agent version The version string of the user agent software. E.g. '39.0.2171.71' for Google Chrome 39. :Type: - string + :code:`String` -User agent device category -~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _User agent device category: + +Derived simple value: :code:`userAgent().deviceCategory()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().deviceCategory() onto 'uaDeviceCategoryField' @@ -1303,13 +1435,15 @@ User agent device category The type of device that the user agent runs on. E.g. 'Tablet' for a tablet based browser. :Type: - string + :code:`String` + +.. _User agent OS family: -User agent OS family -~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().osFamily()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().osFamily() onto 'uaOSFamilyField' @@ -1317,13 +1451,17 @@ User agent OS family The operating system family that the user agent runs on. E.g. 'OS X' for a Apple OS X based desktop. :Type: - string + :code:`String` + +.. _User agent OS version: -User agent OS version -~~~~~~~~~~~~~~~~~~~~~ +Derived simple value: + +Derived simple value: :code:`userAgent().osVersion()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().osVersion() onto 'uaOSVersionField' @@ -1331,267 +1469,269 @@ User agent OS version The version string of the operating system that the user agent runs on. E.g. 
'10.10.1' for Max OS X 10.10.1. :Type: - string + :code:`String` + +.. _User agent OS vendor: -User agent OS vendor -~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().osVendor()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().osVendor() onto 'uaOSVendorField' :Description: - The name of the company or oganisation that produces the operating system that the user agent software runs on. E.g. 'Apple Computer, Inc.' for Apple Mac OS X. + The name of the company or organisation that produces the operating system that the user agent software runs on. E.g. 'Apple Computer, Inc.' for Apple Mac OS X. :Type: - string + :code:`String` -ip2geo -"""""" +Complex value: :code:`ip2geo({optionalIP})` +""""""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy - // uses the remoteHost as IP address to lookup - def ua = ip2geo() + // uses the remoteHost as IP address to lookup + def ua = ip2geo() - // If a load balancer sets custom headers for IP addresses, use like this - def ip = header('X-Custom-Header').first() - def myUa = ip2geo(ip) + // If a load balancer sets custom headers for IP addresses, use like this + def ip = header('X-Custom-Header').first() + def myUa = ip2geo(ip) :Description: Attempts to turn a IPv4 address into a geo location by performing a lookup into a configured `MaxMind GeoIP City database `_. This database is not distributed with Divolte Collector, but must be provided separately. See the :doc:`configuration` chapter for more details on this. - Note that this result is not directly mappable onto any Avro field. Instead, the subfields from this object, described below, can be mapped onto fields. When the lookup for a IP address fails or when the argument is not a IPv4 address, this value and all subfields' values are absent. + Note that this result is not directly mappable onto any Avro field. Instead the subfields from this object, described below, can be mapped onto fields. When the lookup for a IP address fails or when the argument is not a IPv4 address, this value and all subfield values are absent. :Type: - CityResponse + :code:`CityResponse` -Geo IP cityId -~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().cityId()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().cityId() onto 'cityIdField' :Description: - The City ID for the geo location as known by http://www.geonames.org/. + The City ID for the geolocation as known by http://www.geonames.org/. :Type: - int + :code:`int` -Geo IP cityName -~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().cityName()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().cityName() onto 'cityNameField' :Description: - The city name for the geo location in English. + The city name for the geolocation in English. :Type: - string + :code:`String` -Geo IP continentCode -~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().continentCode()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().continentCode() onto 'continentCodeField' :Description: - The ISO continent code for the geo location. + The ISO continent code for the geolocation. :Type: - string + :code:`String` -Geo IP continentId -~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().continentId()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. 
code-block:: groovy map ip2geo().continentId() onto 'continentIdField' :Description: - The Continent Id for the geo location as known by http://www.geonames.org/. + The Continent Id for the geolocation as known by http://www.geonames.org/. :Type: - int + :code:`int` -Geo IP continentName -~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().continentName()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().continentName() onto 'continentNameField' :Description: - The continent name for the geo location in English. + The continent name for the geolocation in English. :Type: - string + :code:`String` -Geo IP countryCode -~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().countryCode()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().countryCode() onto 'countryCodeField' :Description: - The ISO country code for the geo location. + The ISO country code for the geolocation. :Type: - string + :code:`String` -Geo IP countryId -~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().countryId()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().countryId() onto 'countryIdField' :Description: - The Country Id for the geo location as known by http://www.geonames.org/. + The Country Id for the geolocation as known by http://www.geonames.org/. :Type: - int + :code:`int` -Geo IP countryName -~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().countryName()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().countryName() onto 'countryNameField' :Description: - The country name for the geo location in English. + The country name for the geolocation in English. :Type: - string + :code:`String` -Geo IP latitude -~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().latitude()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().latitude() onto 'latitudeField' :Description: - The latitude for the geo location in English. + The latitude for the geolocation in English. :Type: - double + :code:`double` -Geo IP longitude -~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().longitude()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().longitude() onto 'longitudeField' :Description: - The longitude for the geo location in English. + The longitude for the geolocation in English. :Type: - double + :code:`double` -Geo IP metroCode -~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().metroCode()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().metroCode() onto 'metroCodeField' :Description: - The ISO metro code for the geo location. + The ISO metro code for the geolocation. :Type: - string + :code:`String` -Geo IP timeZone -~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().timeZone()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().timeZone() onto 'timeZoneField' :Description: - The time zone name for the geo location as found in the `IANA Time Zone Database `_. + The time zone name for the geolocation as found in the `IANA Time Zone Database `_. 
:Type: - string + :code:`String` -Geo IP mostSpecificSubdivisionCode -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().mostSpecificSubdivisionCode()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().mostSpecificSubdivisionCode() onto 'mostSpecificSubdivisionCodeField' :Description: - The ISO code for the most specific subdivision known for the geo location. + The ISO code for the most specific subdivision known for the geolocation. :Type: - string + :code:`String` -Geo IP mostSpecificSubdivisionId -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().mostSpecificSubdivisionId()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().mostSpecificSubdivisionId() onto 'mostSpecificSubdivisionIdField' :Description: - The ID for the most specific subdivision known for the geo location as known by http://www.geonames.org/. + The ID for the most specific subdivision known for the geolocation as known by http://www.geonames.org/. :Type: - int + :code:`int` -Geo IP mostSpecificSubdivisionName -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().mostSpecificSubdivisionName()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().mostSpecificSubdivisionName() onto 'mostSpecificSubdivisionNameField' :Description: - The name for the most specific subdivision known for the geo location in English. + The name for the most specific subdivision known for the geolocation in English. :Type: - string + :code:`String` -Geo IP postalCode -~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().postalCode()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().postalCode() onto 'postalCodeField' :Description: - The postal code for the geo location. + The postal code for the geolocation. :Type: - string + :code:`String` .. Do these even work? @@ -1615,47 +1755,47 @@ Geo IP postalCode ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Geo IP subdivisionCodes -~~~~~~~~~~~~~~~~~~~~~~~ +Derived complex value: :code:`ip2geo().subdivisionCodes()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().subdivisionCodes() onto 'subdivisionCodesField' :Description: - The ISO codes for all subdivisions for the geo location in order from least specific to most specific. + The ISO codes for all subdivisions for the geolocation in order from least specific to most specific. :Type: - list + :code:`List` -Geo IP subdivisionIds -~~~~~~~~~~~~~~~~~~~~~ +Derived complex value: :code:`ip2geo().subdivisionIds()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().subdivisionIds() onto 'subdivisionIdsFields' :Description: - The IDs for all subdivisions for the geo location in order from least specific to most specific as known by http://www.geonames.org/. + The IDs for all subdivisions for the geolocation in order from least specific to most specific as known by http://www.geonames.org/. :Type: - list + :code:`List` -Geo IP subdivisionNames -~~~~~~~~~~~~~~~~~~~~~~~ +Derived complex value: :code:`ip2geo().subdivisionNames()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. 
code-block:: groovy map ip2geo().subdivisionNames() onto 'subdivisionNames' :Description: - The names in English for all subdivisions for the geo location in order from least specific to most specific. + The names in English for all subdivisions for the geolocation in order from least to most specific. :Type: - list + :code:`List` .. These GEO IP fields don't really work currently anyway From 57f695b47dc00d4560ea4aced804dae3f09c34b9 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 5 Aug 2016 16:09:57 +0200 Subject: [PATCH 72/80] Sphinx markup for the example strings for user-agent components. Plus a few typo corrections. --- docs/mapping_reference.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/mapping_reference.rst b/docs/mapping_reference.rst index ab1b7f06..43149032 100644 --- a/docs/mapping_reference.rst +++ b/docs/mapping_reference.rst @@ -1368,7 +1368,7 @@ Derived simple value: :code:`userAgent().family()` map userAgent().family() onto 'uaFamilyField' :Description: - The canonical name for the family of the parsed user agent. E.g. 'Mobile Safari' for Apple's mobile browser. + The canonical name for the family of the parsed user agent. E.g. ``Mobile Safari`` for Apple's mobile browser. :Type: :code:`String` @@ -1384,7 +1384,7 @@ Derived simple value: :code:`userAgent().vendor()` map userAgent().vendor() onto 'uaVendorField' :Description: - The name of the company or organisation that produces the user agent software. E.g. 'Google Inc.' for Google Chrome browsers. + The name of the company or organisation that produces the user agent software. E.g. ``Google Inc.`` for Google Chrome browsers. :Type: :code:`String` @@ -1400,7 +1400,7 @@ Derived simple value: :code:`userAgent().type()` map userAgent().type() onto 'uaTypeField' :Description: - The type of user agent that was used. E.g. 'Browser' for desktop browsers. + The type of user agent that was used. E.g. ``Browser`` for desktop browsers. :Type: :code:`String` @@ -1416,7 +1416,7 @@ Derived simple value: :code:`userAgent().version()` map userAgent().version() onto 'uaVersionField' :Description: - The version string of the user agent software. E.g. '39.0.2171.71' for Google Chrome 39. + The version string of the user agent software. E.g. ``39.0.2171.71`` for Google Chrome 39. :Type: :code:`String` @@ -1432,7 +1432,7 @@ Derived simple value: :code:`userAgent().deviceCategory()` map userAgent().deviceCategory() onto 'uaDeviceCategoryField' :Description: - The type of device that the user agent runs on. E.g. 'Tablet' for a tablet based browser. + The type of device that the user agent runs on. E.g. ``Tablet`` for a tablet based browser. :Type: :code:`String` @@ -1448,7 +1448,7 @@ Derived simple value: :code:`userAgent().osFamily()` map userAgent().osFamily() onto 'uaOSFamilyField' :Description: - The operating system family that the user agent runs on. E.g. 'OS X' for a Apple OS X based desktop. + The operating system family that the user agent runs on. E.g. ``OS X`` for an Apple Mac OS X based desktop. :Type: :code:`String` @@ -1466,7 +1466,7 @@ Derived simple value: :code:`userAgent().osVersion()` map userAgent().osVersion() onto 'uaOSVersionField' :Description: - The version string of the operating system that the user agent runs on. E.g. '10.10.1' for Max OS X 10.10.1. + The version string of the operating system that the user agent runs on. E.g. ``10.10.1`` for Mac OS X 10.10.1. 
:Type: :code:`String` @@ -1482,7 +1482,7 @@ Derived simple value: :code:`userAgent().osVendor()` map userAgent().osVendor() onto 'uaOSVendorField' :Description: - The name of the company or organisation that produces the operating system that the user agent software runs on. E.g. 'Apple Computer, Inc.' for Apple Mac OS X. + The name of the company or organisation that produces the operating system that the user agent software runs on. E.g. ``Apple Computer, Inc.`` for Apple Mac OS X. :Type: :code:`String` From 970f36ebed78d48d202236456cacea44f235bc07 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 5 Aug 2016 16:10:09 +0200 Subject: [PATCH 73/80] Metro codes aren't an ISO thing. --- docs/mapping_reference.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/mapping_reference.rst b/docs/mapping_reference.rst index 43149032..2fec154c 100644 --- a/docs/mapping_reference.rst +++ b/docs/mapping_reference.rst @@ -1657,7 +1657,7 @@ Derived simple value: :code:`ip2geo().metroCode()` map ip2geo().metroCode() onto 'metroCodeField' :Description: - The ISO metro code for the geolocation. + The Metro Code for the geolocation. :Type: :code:`String` From 0339e9f10d082c9330738e7102ef17f0bc027d47 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 5 Aug 2016 16:10:30 +0200 Subject: [PATCH 74/80] Latitude and longitude aren't really English. --- docs/mapping_reference.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/mapping_reference.rst b/docs/mapping_reference.rst index 2fec154c..18455b2e 100644 --- a/docs/mapping_reference.rst +++ b/docs/mapping_reference.rst @@ -1629,7 +1629,7 @@ Derived simple value: :code:`ip2geo().latitude()` map ip2geo().latitude() onto 'latitudeField' :Description: - The latitude for the geolocation in English. + The latitude for the geolocation. :Type: :code:`double` @@ -1643,7 +1643,7 @@ Derived simple value: :code:`ip2geo().longitude()` map ip2geo().longitude() onto 'longitudeField' :Description: - The longitude for the geolocation in English. + The longitude for the geolocation. :Type: :code:`double` From 6e0cbe026798c6de50a7fc252b5d37c08f9900f9 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 5 Aug 2016 16:11:05 +0200 Subject: [PATCH 75/80] GeoNames hyperlink reference. --- docs/mapping_reference.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/mapping_reference.rst b/docs/mapping_reference.rst index 18455b2e..87f8214e 100644 --- a/docs/mapping_reference.rst +++ b/docs/mapping_reference.rst @@ -1517,7 +1517,7 @@ Derived simple value: :code:`ip2geo().cityId()` map ip2geo().cityId() onto 'cityIdField' :Description: - The City ID for the geolocation as known by http://www.geonames.org/. + The `GeoNames`_ City ID for the geolocation. :Type: :code:`int` @@ -1559,7 +1559,7 @@ Derived simple value: :code:`ip2geo().continentId()` map ip2geo().continentId() onto 'continentIdField' :Description: - The Continent Id for the geolocation as known by http://www.geonames.org/. + The `GeoNames`_ Continent Id for the geolocation. :Type: :code:`int` @@ -1601,7 +1601,7 @@ Derived simple value: :code:`ip2geo().countryId()` map ip2geo().countryId() onto 'countryIdField' :Description: - The Country Id for the geolocation as known by http://www.geonames.org/. + The `GeoNames`_ Country Id for the geolocation. 
:Type: :code:`int` @@ -1699,7 +1699,7 @@ Derived simple value: :code:`ip2geo().mostSpecificSubdivisionId()` map ip2geo().mostSpecificSubdivisionId() onto 'mostSpecificSubdivisionIdField' :Description: - The ID for the most specific subdivision known for the geolocation as known by http://www.geonames.org/. + The `GeoNames`_ ID for the most specific subdivision known for the geolocation. :Type: :code:`int` @@ -1778,7 +1778,7 @@ Derived complex value: :code:`ip2geo().subdivisionIds()` map ip2geo().subdivisionIds() onto 'subdivisionIdsFields' :Description: - The IDs for all subdivisions for the geolocation in order from least specific to most specific as known by http://www.geonames.org/. + The `GeoNames`_ IDs for all subdivisions for the geolocation in order from least to most specific. :Type: :code:`List` @@ -1820,3 +1820,5 @@ Derived complex value: :code:`ip2geo().subdivisionNames()` Geo IP satelliteProvider ~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _GeoNames: http://www.geonames.org/ From 13bf817402e8e489ff37840b3526acdf84a54640 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 5 Aug 2016 16:11:19 +0200 Subject: [PATCH 76/80] Minor wording changes. --- docs/mapping_reference.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/mapping_reference.rst b/docs/mapping_reference.rst index 87f8214e..4ec2a78f 100644 --- a/docs/mapping_reference.rst +++ b/docs/mapping_reference.rst @@ -1671,7 +1671,7 @@ Derived simple value: :code:`ip2geo().timeZone()` map ip2geo().timeZone() onto 'timeZoneField' :Description: - The time zone name for the geolocation as found in the `IANA Time Zone Database `_. + The name of the time zone for the geolocation as found in the `IANA Time Zone Database `_. :Type: :code:`String` @@ -1764,7 +1764,7 @@ Derived complex value: :code:`ip2geo().subdivisionCodes()` map ip2geo().subdivisionCodes() onto 'subdivisionCodesField' :Description: - The ISO codes for all subdivisions for the geolocation in order from least specific to most specific. + The ISO codes for all subdivisions for the geolocation in order from least to most specific. :Type: :code:`List` From 4d3475fc567d11a715b27652efdc61299bfcc917 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 5 Aug 2016 16:11:53 +0200 Subject: [PATCH 77/80] Whitespace. --- docs/deployment.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployment.rst b/docs/deployment.rst index f5d8eed6..68343df1 100644 --- a/docs/deployment.rst +++ b/docs/deployment.rst @@ -36,7 +36,7 @@ Example nginx configuration When using `nginx `_ as a reverse proxy and load balancer in front of Divolte Collector, you can use this snippet for configuring nginx:: upstream divolte { - hash $request_uri consistent; + hash $request_uri consistent; server divolte1.internaldomain:8290; server divolte1.internaldomain:8290; From db44b39cbc77d79f1f97f8026837384d0bed3529 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 5 Aug 2016 16:22:09 +0200 Subject: [PATCH 78/80] Refresh the deployment documentation. --- docs/deployment.rst | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/deployment.rst b/docs/deployment.rst index 68343df1..4b9febbb 100644 --- a/docs/deployment.rst +++ b/docs/deployment.rst @@ -3,15 +3,15 @@ Deployment ********** This chapter describes common steps for deploying Divolte Collector in production. 
-Installation / packages
-=======================
+Installation/packages
+=====================
 The distributions provided for Divolte Collector are:

-- A .tar.gz archive distribution containing the binaries and startup scripts.
-- A .zip archive distribution containing the binaries and startup scripts.
-- A RPM that can be installed onto Redhat / CentOS systems. This includes startup and init scripts.
+- A ``.tar.gz`` archive distribution containing the binaries and startup scripts.
+- A ``.zip`` archive distribution containing the binaries and startup scripts.
+- An RPM that can be installed onto Red Hat/CentOS systems. This includes startup and init scripts.

-Currently, there is no .deb distribution. This will be added in a next release.
+Currently there is no Debian packaging.

 Load balancers
 ==============
@@ -19,21 +19,23 @@ In a production scenario, Divolte Collector is typically deployed behind a load

 Divolte Collector is semi-stateless. This means that it is not required that requests form the same client always go to the same instance; the event will be logged in all cases. Divolte Collector does however build up some soft state during operation for detecting duplicate events and caching parsed user agents. This means that there is benefit in stickyness, but it is not a requirement.

-URI / hash based load balancing policy
---------------------------------------
+URI/hash-based load balancing policy
+------------------------------------
 Divolte Collector keeps a short term memory for detecting duplicate requests. In order for this to work, exact duplicate requests need to always go to the same instance. Most load balancers can support this by setting up a routing policy that uses a hash of the requested URI to determine which instance to route the request to. When using duplicate detection, be sure to configure your load balancer to do this.

 Consistent hashing and event de-duplication
 -------------------------------------------
-If possible, load balancers should use a so called consistent hashing scheme when performing URI hash based routing. This ensures that when a instance of Divolte Collector dies, the re-hashing amongst the remaining instances only minimally disrupts the event assignments. The benefit of this is that the duplicate memory kept by Divolte Collector nodes remains effective on the still running nodes.
+If possible, load balancers should use a consistent hashing scheme when performing URI hash-based routing. This should ensure that most traffic continues to be routed to the same instance as before. The benefit of this is that the duplicate memory kept by Divolte Collector nodes remains effective.

 SSL
 ===
-Divolte Collector does not handle SSL in any way. SSL offloading needs to be done by a load balancer or a reverse proxy server. These systems are generally capable of offloading SSL and since there will always be a load balancer in front of Divolte Collector in production setups, it was decided not to add this functionality to the internal HTTP server.
+Divolte Collector does not handle SSL itself. SSL offloading needs to be done by a load balancer or a reverse proxy server. This can normally be handled by the load balancer in front of Divolte Collector in production setups.
Example nginx configuration =========================== -When using `nginx `_ as a reverse proxy and load balancer in front of Divolte Collector, you can use this snippet for configuring nginx:: +When using `nginx `_ as a reverse proxy and load balancer in front of Divolte Collector, you can use this snippet for configuring nginx: + +.. code-block:: nginx upstream divolte { hash $request_uri consistent; From fe5d59b8389d8bbd8f1216986f8b95a3b547b3bc Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 5 Aug 2016 17:48:04 +0200 Subject: [PATCH 79/80] Fix some failing Selenium tests. --- src/test/java/io/divolte/server/SeleniumJavaScriptTest.java | 5 +++-- .../resources/selenium-test-no-default-event-config.conf | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/test/java/io/divolte/server/SeleniumJavaScriptTest.java b/src/test/java/io/divolte/server/SeleniumJavaScriptTest.java index 1e69a9b6..a470edcc 100644 --- a/src/test/java/io/divolte/server/SeleniumJavaScriptTest.java +++ b/src/test/java/io/divolte/server/SeleniumJavaScriptTest.java @@ -28,6 +28,7 @@ import javax.annotation.ParametersAreNonnullByDefault; +import io.divolte.server.config.BrowserSourceConfiguration; import org.junit.Before; import org.junit.Test; import org.openqa.selenium.By; @@ -232,13 +233,13 @@ public void shouldSetAppropriateCookies() throws RuntimeException, InterruptedEx driver.get(urlOf(BASIC)); server.waitForEvent(); - final Optional parsedPartyCookieOption = DivolteIdentifier.tryParse(driver.manage().getCookieNamed(server.config.getString("divolte.tracking.party_cookie")).getValue()); + final Optional parsedPartyCookieOption = DivolteIdentifier.tryParse(driver.manage().getCookieNamed(BrowserSourceConfiguration.DEFAULT_BROWSER_SOURCE_CONFIGURATION.partyCookie).getValue()); assertTrue(parsedPartyCookieOption.isPresent()); assertThat( parsedPartyCookieOption.get(), isA(DivolteIdentifier.class)); - final Optional parsedSessionCookieOption = DivolteIdentifier.tryParse(driver.manage().getCookieNamed(server.config.getString("divolte.tracking.session_cookie")).getValue()); + final Optional parsedSessionCookieOption = DivolteIdentifier.tryParse(driver.manage().getCookieNamed(BrowserSourceConfiguration.DEFAULT_BROWSER_SOURCE_CONFIGURATION.sessionCookie).getValue()); assertTrue(parsedSessionCookieOption.isPresent()); assertThat( parsedSessionCookieOption.get(), diff --git a/src/test/resources/selenium-test-no-default-event-config.conf b/src/test/resources/selenium-test-no-default-event-config.conf index 1492cc1a..6145043f 100644 --- a/src/test/resources/selenium-test-no-default-event-config.conf +++ b/src/test/resources/selenium-test-no-default-event-config.conf @@ -15,4 +15,7 @@ // // Disable automatic default pageView event -divolte.javascript.auto_page_view_event = false +divolte.sources.browser { + type = browser + javascript.auto_page_view_event = false +} From 9005a31f53ec112cf2fe486f53c768dd9098cb5d Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 9 Aug 2016 21:23:46 +0200 Subject: [PATCH 80/80] Call out the implicit default sources/mapping/sinks with a heading. 
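
As background for the new headings, the sketch below shows roughly what an
explicitly spelled-out source/mapping/sink triple looks like in HOCON. It is
illustrative only: the mapping name, the 'sources'/'sinks' lists and the
'hdfs' sink type identifier are assumptions made for the example; the
authoritative defaults are the ones quoted verbatim in configuration.rst.

    // Illustrative sketch only; names and some keys are assumptions,
    // not the shipped defaults.
    divolte {
      sources.browser.type = browser      // browser source, as in the test configs

      mappings.example_mapping {          // hypothetical mapping name
        sources = [browser]               // consume events from the browser source
        sinks = [hdfs]                    // write mapped records to the hdfs sink
      }

      sinks.hdfs.type = hdfs              // assumed sink type identifier
    }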
--- docs/configuration.rst | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index d11d230e..fcc8c0cf 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -214,7 +214,7 @@ This section controls global settings related to the processing of incoming requ Property: ``divolte.global.mapper.threads`` """"""""""""""""""""""""""""""""""""""""""" :Description: - The number of threads that each mapper should use to process events. + The total number of threads that mappers will use to process events. This is a global total; all mappings share the same threads. :Default: 1 :Example: @@ -468,6 +468,9 @@ For example: } } +Implicit default source +^^^^^^^^^^^^^^^^^^^^^^^ + If no sources are specified a single implicit browser source is created that is equivalent to: .. code-block:: none @@ -690,6 +693,9 @@ An example mapping configuration could be: } } +Implicit default mapping +^^^^^^^^^^^^^^^^^^^^^^^^ + If no mappings are specified a single implicit mapping is created that is equivalent to: .. code-block:: none @@ -820,6 +826,9 @@ For example: } } +Implicit default sinks +^^^^^^^^^^^^^^^^^^^^^^ + If no sinks are specified two implicit sinks are created that are equivalent to: .. code-block:: none