diff --git a/build.gradle b/build.gradle index efd471bf..9630f716 100644 --- a/build.gradle +++ b/build.gradle @@ -75,12 +75,18 @@ repositories { } dependencies { + // Define some key versions for components that we use lots of artifacts from. + // Note: we refer to the Avro documentation in our own documentation. + def avroVersion = '1.8.1' + def hadoopVersion = '2.7.2' + def jacksonVersion = '2.8.1' + def slf4jVersion = '1.7.21' + compile group: 'io.divolte', name: 'divolte-schema', version: version compile group: 'io.undertow', name: 'undertow-core', version: '1.4.0.Final' compile group: 'com.typesafe', name: 'config', version: '1.3.0' compile group: 'com.google.guava', name: 'guava', version: '19.0' - // Note: we refer to the Avro documentation in our own documentation. - compile group: 'org.apache.avro', name: 'avro', version: '1.8.1' + compile group: 'org.apache.avro', name: 'avro', version: avroVersion /* * We package the Avro Tools to provide an easy way to view Avro files @@ -91,14 +97,14 @@ dependencies { * run and have a working tojson command. The other commands weren't fully * tested with these deps. */ - compile group: 'org.apache.avro', name: 'avro-tools', version: '1.8.1', classifier: 'nodeps' - compile group: 'org.apache.avro', name: 'trevni-core', version: '1.8.1' - compile group: 'org.apache.avro', name: 'avro-mapred', version: '1.8.1' + compile group: 'org.apache.avro', name: 'avro-tools', version: avroVersion, classifier: 'nodeps' + compile group: 'org.apache.avro', name: 'trevni-core', version: avroVersion + compile group: 'org.apache.avro', name: 'avro-mapred', version: avroVersion - compile (group: 'org.apache.hadoop', name:'hadoop-common', version: '2.7.2') { + compile (group: 'org.apache.hadoop', name:'hadoop-common', version: hadoopVersion) { exclude group: 'jline', module: 'jline' } - compile group: 'org.apache.hadoop', name:'hadoop-hdfs', version: '2.7.2' + compile group: 'org.apache.hadoop', name:'hadoop-hdfs', version: hadoopVersion compile (group: 'net.sf.uadetector', name: 'uadetector-core', version: '0.9.22') { exclude group: 'com.google.code.findbugs', module: 'jsr305' } @@ -116,9 +122,10 @@ dependencies { compile group: 'org.codehaus.groovy', name:'groovy', version: '2.4.7', classifier: 'indy' compile group: 'net.sf.jopt-simple', name:'jopt-simple', version: '5.0.2' compile group: 'com.jayway.jsonpath', name: 'json-path', version: '2.2.0' - compile group: 'com.fasterxml.jackson.core', name:'jackson-databind', version: '2.8.1' - compile group: 'com.fasterxml.jackson.datatype', name:'jackson-datatype-jdk8', version: '2.8.1' - compile group: 'com.fasterxml.jackson.module', name:'jackson-module-parameter-names', version: '2.8.1' + compile group: 'com.fasterxml.jackson.core', name:'jackson-databind', version: jacksonVersion + compile group: 'com.fasterxml.jackson.datatype', name:'jackson-datatype-jdk8', version: jacksonVersion + compile group: 'com.fasterxml.jackson.datatype', name:'jackson-datatype-guava', version: jacksonVersion + compile group: 'com.fasterxml.jackson.module', name:'jackson-module-parameter-names', version: jacksonVersion compile group: 'com.jasonclawson', name: 'jackson-dataformat-hocon', version: '1.1.0' // Used for configuration validation @@ -127,16 +134,13 @@ dependencies { // We use the SLF4J API. At runtime, this is LogBack. // (We also force any dependencies that use Log4J to go via SLF4J.) 
- compile group: 'org.slf4j', name: 'slf4j-api', version: '1.7.21' + compile group: 'org.slf4j', name: 'slf4j-api', version: slf4jVersion runtime group: 'ch.qos.logback', name: 'logback-classic', version: '1.1.7' - runtime group: 'org.slf4j', name: 'log4j-over-slf4j', version: '1.7.21' + runtime group: 'org.slf4j', name: 'log4j-over-slf4j', version: slf4jVersion testCompile group: 'junit', name: 'junit', version: '4.12' testCompile group: 'org.hamcrest', name: 'hamcrest-all', version: '1.3' testCompile group: 'org.mockito', name: 'mockito-all', version: '1.10.19' - testCompile group: 'com.fasterxml.jackson.core', name:'jackson-databind', version: '2.6.3' - testCompile group: 'com.fasterxml.jackson.module', name:'jackson-module-parameter-names', version: '2.6.3' - testCompile group: 'com.fasterxml.jackson.datatype', name:'jackson-datatype-jdk8', version: '2.6.3' testCompile group: 'com.saucelabs', name:'sauce_junit', version: '2.1.21' testCompile group: 'org.seleniumhq.selenium', name:'selenium-java', version: '2.53.1' diff --git a/docs/conf.py b/docs/conf.py index 8b0f4c23..5c9c702b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2014 GoDataDriven B.V. +# Copyright 2016 GoDataDriven B.V. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -53,7 +53,7 @@ # General information about the project. project = u'Divolte' -copyright = u'2015, GoDataDriven' +copyright = u'2016, GoDataDriven' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -103,6 +103,8 @@ # If true, keep warnings as "system message" paragraphs in the built documents. #keep_warnings = False +# If true, Sphinx will warn about all references where the target cannot be found. +#nitpicky = False # -- Options for HTML output ---------------------------------------------- diff --git a/docs/configuration.rst b/docs/configuration.rst index 3567a851..fcc8c0cf 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -5,15 +5,15 @@ This chapter describes the configuration mechanisms and available options for Di Configuration files =================== -The configuration for Divolte Collector consists of three files: +The main configuration for Divolte Collector consists of three files: -- ``divolte-env.sh``: a shell script that is included in the startup script to set environment variables and JVM startup arguments. -- ``divolte-collector.conf``: the main configuration file for Divolte Collector. -- ``logback.xml``: the logging configuration. +- :file:`divolte-env.sh`: a shell script that is included in the startup script to set environment variables and JVM startup arguments. +- :file:`logback.xml`: the logging configuration. +- :file:`divolte-collector.conf`: the main configuration file for Divolte Collector. Configuration directory ----------------------- -Divolte Collector will try to find configuration files at startup in the configuration directory. Typically this is the ``conf/`` directory nested under the Divolte Collector installation. Divolte Collector will try to locate the configuration directory at ``../conf`` relative to the startup script. The configuration directory can be overridden by setting the ``DIVOLTE_CONF_DIR`` environment variable. If set, the value will be used as configuration directory. 
If you have installed Divolte Collector from a RPM, the init script will set this variable to ``/etc/divolte-collector``. +Divolte Collector will try to find configuration files at startup in the configuration directory. Typically this is the :file:`conf/` directory nested under the Divolte Collector installation. Divolte Collector will try to locate the configuration directory at :file:`../conf` relative to the startup script. The configuration directory can be overridden by setting the :envvar:`DIVOLTE_CONF_DIR` environment variable. If set, the value will be used as configuration directory. If you have installed Divolte Collector from a RPM, the init script will set this variable to :file:`/etc/divolte-collector`. divolte-env.sh -------------- @@ -22,7 +22,7 @@ This shell script is run by the startup script prior to starting the Divolte Col HADOOP_CONF_DIR ^^^^^^^^^^^^^^^ :Description: - Directory where Hadoop / HDFS configuration files are to be found. This directory is included in the classpath on startup, which causes the HDFS client to load the configuration files. + Directory where Hadoop/HDFS configuration files are to be found. This directory is included in the classpath on startup, which causes the HDFS client to load the configuration files. :Example: @@ -33,7 +33,7 @@ HADOOP_CONF_DIR JAVA_HOME ^^^^^^^^^ :Description: - The directory where the JRE/JDK is located. Divolte Collector will use ``$JAVA_HOME/bin/java`` as Java executable for startup. If this is not set, Divolte Collector will attempt to find a suitable JDK in a number of common Java installation locations on Linux systems. It is however not recommended to rely on this mechanism for production use. + The directory where the JRE/JDK is located. Divolte Collector will use :command:`$JAVA_HOME/bin/java` as the Java executable during startup. If this is not set, Divolte Collector will attempt to find a suitable JDK in a number of common Java installation locations on Linux systems. It is however not recommended to rely on this mechanism for production use. :Example: @@ -44,7 +44,7 @@ JAVA_HOME DIVOLTE_JAVA_OPTS ^^^^^^^^^^^^^^^^^ :Description: - Additional arguments passed to the Java Virtual Machine on startup. If not set, by default Divolte Collector will start the JVM with ``-XX:+UseG1GC -Djava.awt.headless=true``. It is recommended to use the G1 garbage collector. For light and medium traffic, the defaults tend to work fine. *If this setting is set, Divolte Collector will not add any arguments by itself; this setting overrides the defaults.* + Additional arguments passed to the Java Virtual Machine on startup. If not set, by default Divolte Collector will start the JVM with :code:`-XX:+UseG1GC -Djava.awt.headless=true`. It is recommended to use the G1 garbage collector. For light and medium traffic, the defaults tend to work fine. *If this setting is set, Divolte Collector will not add any arguments by itself; this setting overrides the defaults.* :Example: @@ -52,6 +52,10 @@ DIVOLTE_JAVA_OPTS DIVOLTE_JAVA_OPTS="-XX:+UseG1GC -Djava.awt.headless=true -XX:+HeapDumpOnOutOfMemoryError" +logback.xml +----------- +Divolte Collector uses the `Logback Project `_ as its logging provider. This provider is configured through the :file:`logback.xml` file in the configuration directory. For more information about the settings in this file, review the `Configuration chapter in the Logback Manual `_. + divolte-collector.conf ---------------------- This is the main configuration file for Divolte Collector. 
For configuration, Divolte Collector uses the `Typesafe Config library `_. The dialect of the configuration file is a JSON superset called HOCON (for *Human-Optimized Config Object Notation*). HOCON has a nested structure, like JSON, but is slightly less verbose and doesn't require escaping and quoting of strings in many cases. Here we outline some basic features of HOCON. @@ -62,22 +66,26 @@ Nesting and dot separated namespacing can be used interchangeably: // This: divolte { - server { - host = 127.0.0.1 + global { + server { + host = 127.0.0.1 + } } } // Is the same as this: - divolte.server.host = 127.0.0.1 + divolte.global.server.host = 127.0.0.1 -Environment variable overrides can be used. In this example the ``divolte.server.port`` setting defaults to 8290, unless the ``DIVOLTE_PORT`` environment variable is set: +Environment variable overrides can be used. In this example the ``divolte.global.server.port`` setting defaults to 8290, unless the :envvar:`DIVOLTE_PORT` environment variable is set: .. code-block:: none divolte { - server { - port = 8290 - port = ${?DIVOLTE_PORT} + global { + server { + port = 8290 + port = ${?DIVOLTE_PORT} + } } } @@ -87,73 +95,87 @@ Objects are merged: // This configuration divolte { - server { - host = 0.0.0.0 + global { + server { + host = 0.0.0.0 + } } } - divolte.server { + divolte.global.server { port = 8290 } // Will result in this: - divolte.server.host = 0.0.0.0 - divolte.server.port = 8290 + divolte.global.server.host = 0.0.0.0 + divolte.global.server.port = 8290 For a full overview please refer to the `HOCON features and specification `_. .. warning:: - Be careful when enclosing values in quotes. Quotes are optional, but if present they must be JSON-style double-quotes (``"``). - This can easily lead to confusion: + Be careful when enclosing values in quotes. Quotes are optional, but if present they must be JSON-style double-quotes (``"``). This can easily lead to confusion: .. code-block:: none // This ... - divolte.tracking.cookie_domain = '.example.com' + divolte.sources.browser.cookie_domain = '.example.com' // ... is really equivalent to: - divolte.tracking.cookie_domain = "'.example.com'" + divolte.sources.browser.cookie_domain = "'.example.com'" Configuration reference ======================= -The following sections and settings are available in the ``divolte-collector.conf`` file. Note that in this documentation the path notation for configuration options is used (e.g. ``divolte.server``) but in examples the path and nested notation is used interchangeably. -divolte.server --------------- +The main configuration is read from :file:`divolte-collector.conf`, which consists of several sections: + +- *Global* (``divolte.global``): Global settings that affect the entire service. +- *Sources* (``divolte.sources``): Configured sources for Divolte Collector events. +- *Mappings* (``divolte.mappings``): Configured mappings between sources and sinks. +- *Sinks* (``divolte.sinks``): Configured sinks, where Avro events are written. + +This documentation uses the path notation for configuration options (e.g. ``divolte.global.server``) but in examples the path and nested notations are used interchangeably. + +Global Settings (``divolte.global``) +------------------------------------ + +This section contains settings which are global in nature. All settings have default values. 
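As a rough sketch of how this section is laid out (the values shown are just the documented defaults, not a recommended configuration), the global settings group into the subsections described below:

.. code-block:: none

    divolte.global {
      server {
        port = 8290      // HTTP server settings
      }
      mapper {
        threads = 1      // Mapping of incoming events
      }
      hdfs {
        enabled = true   // Shared by all HDFS sinks
      }
      kafka {
        enabled = false  // Shared by all Kafka sinks
      }
    }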
+ +HTTP Server Settings (``divolte.global.server``) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This section controls the settings for the internal HTTP server of Divolte Collector. -divolte.server.host -^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.server.host`` +"""""""""""""""""""""""""""""""""""""""" :Description: - The address to which the server binds. Set to a specific IP address to selectively listen on that interface. + The address to which the server binds. Set to a specific IP address to selectively listen on that interface, or ``0.0.0.0`` to listen on all interfaces. :Default: - ``0.0.0.0`` + The address of a loopback interface. :Example: .. code-block:: none - divolte.server { + divolte.global.server { host = 0.0.0.0 } -divolte.server.port -^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.server.port`` +"""""""""""""""""""""""""""""""""""""""" :Description: The TCP port on which the server listens. :Default: - ``8290`` + ``8290``, or the content of the :envvar:`DIVOLTE_PORT` environment variable if set. :Example: .. code-block:: none - divolte.server { + divolte.global.server { port = 8290 } -divolte.server.use_x_forwarded_for -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.server.use_x_forwarded_for`` +""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - Whether to use the ``X-Forwarded-For`` HTTP header for determining the source IP of a request, if present. If multiple values are present, the last value is used. + Whether to use the :mailheader:`X-Forwarded-For` HTTP header for determining the source IP of a request, if present. If multiple values are present, the last value is used. Both of these examples would yield a source IP of ``11.34.82.30``: @@ -162,106 +184,77 @@ divolte.server.use_x_forwarded_for | ``X-Forwarded-For: 11.45.82.30`` :Default: - ``false`` + :code:`false` :Example: .. code-block:: none - divolte.server { + divolte.global.server { use_x_forwarded_for = true } -divolte.server.serve_static_resources -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.server.serve_static_resources`` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: When true Divolte Collector serves a static test page at ``/``. :Default: - ``true`` + :code:`true` :Example: .. code-block:: none - divolte.server { + divolte.global.server { serve_static_resources = false } -divolte.tracking ----------------- -This section controls the tracking mechanism for Divolte Collector, covering areas such as the cookies and session timeouts, user agent parsing and ip2geo lookups. - -divolte.tracking.party_cookie -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:Description: - The name of the cookie used for setting a party ID. -:Default: - ``_dvp`` -:Example: - - .. code-block:: none - - divolte.tracking { - party_cookie = _pid - } - -divolte.tracking.party_timeout -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:Description: - The expiry timeout for the party identifier. If no events occur for this duration, the party identifier is discarded. - Any subsequent events will be assigned a new party identifier. -:Default: - 730 days -:Example: - - .. code-block:: none - - divolte.tracking { - party_timeout = 1000 days - } +Global Mapper Settings (``divolte.global.mapper``) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +This section controls global settings related to the processing of incoming requests after they have been received by the server. 
Incoming requests for Divolte Collector are responded to as quickly as possible, with mapping and flushing occurring in the background. -divolte.tracking.session_cookie -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.mapper.threads`` +""""""""""""""""""""""""""""""""""""""""""" :Description: - The name of the cookie used for tracking the session ID. + The total number of threads that mappers will use to process events. This is a global total; all mappings share the same threads. :Default: - ``_dvs`` + 1 :Example: .. code-block:: none - divolte.tracking { - session_cookie = _sid + divolte.global.mapper { + threads = 4 } -divolte.tracking.session_timeout -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.mapper.buffer_size`` +""""""""""""""""""""""""""""""""""""""""""""""" :Description: - The expiry timeout for a session. A session lapses if no events occur for this duration. + The maximum number of incoming events, rounded up to the nearest power of 2, to queue for processing *per mapper thread* before starting to drop incoming events. While this buffer is full new events are dropped and a warning is logged. (Dropped requests are not reported to the client: Divolte Collector always responds to clients immediately once minimal validation has taken place.) :Default: - 30 minutes + 1048576 :Example: .. code-block:: none - divolte.tracking { - session_timeout = 1 hour + divolte.global.mapper { + buffer_size = 10M } -divolte.tracking.cookie_domain -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.mapper.duplicate_memory_size`` +""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - The cookie domain that is assigned to the cookies. When left empty, the cookies will have no domain explicitly associated with them, which effectively sets it to the website domain of the page that loaded the Divolte Collector JavaScript. + Clients will sometimes deliver an event multiple times, normally within a short period of time. Divolte Collector contains a probabilistic filter which can detect this, trading off memory for improved results. This setting configures the size of the filter *per mapper thread*, and is multiplied by 8 to yield the actual memory usage. :Default: - *Empty* + 1000000 :Example: .. code-block:: none - divolte.tracking { - cookie_domain = ".example.com" + divolte.global.mapper { + duplicate_memory_size = 10000000 } -divolte.tracking.ip2geo_database -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.mapper.ip2geo_database`` +""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: This configures the ip2geo database for geographic lookups. An ip2geo database can be obtained from `MaxMind `_. (Free 'lite' versions and commercial versions are available.) @@ -272,16 +265,16 @@ divolte.tracking.ip2geo_database .. code-block:: none - divolte.tracking { + divolte.global.mapper { ip2geo_database = "/etc/divolte/ip2geo/GeoLite2-City.mmdb" } -divolte.tracking.ua_parser --------------------------- +Property: ``divolte.global.mapper.user_agent_parser`` +""""""""""""""""""""""""""""""""""""""""""""""""""""" This section controls the user agent parsing settings. The user agent parsing is based on an `open source parsing library `_ and supports dynamic reloading of the backing database if an internet connection is available. 
-divolte.tracking.ua_parser.type -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.mapper.user_agent_parser.type`` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: This setting controls the updating behavior of the user agent parser. @@ -298,12 +291,12 @@ divolte.tracking.ua_parser.type .. code-block:: none - divolte.tracking.ua_parser { - type = caching_and_updating + divolte.global.mapper.user_agent_parser { + type = caching_and_updating } -divolte.tracking.ua_parser.cache_size -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.mapper.user_agent_parser.cache_size`` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: User agent parsing is a relatively expensive operation that requires many regular expression evaluations. Very often the same user agent will make consecutive requests and many clients will have the exact same user agent as well. It therefore makes sense to cache the parsing results for re-use in subsequent requests. This setting determines how many unique user agent strings will be cached. :Default: @@ -312,557 +305,677 @@ divolte.tracking.ua_parser.cache_size .. code-block:: none - divolte.tracking.ua_parser { + divolte.global.mapper.user_agent_parser { cache_size = 10000 } -divolte.tracking.schema_file -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:Description: - By default, Divolte Collector will use a built-in Avro schema for writing data and a default mapping, which is documented in the Mapping section of the user documentation. If not set, a `default built-in schema `_ will be used. +Global HDFS Settings (``divolte.global.hdfs``) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +This section controls global HDFS settings shared by all HDFS sinks. - Typically, users will configure their own schema, usually with fields specific to their domain and custom events and other mappings. When using a user defined schema, it is also required to provide a mapping script. See :doc:`mapping_reference` for further reference. +Property: ``divolte.global.hdfs.enabled`` +""""""""""""""""""""""""""""""""""""""""" +:Description: + Whether or not HDFS support is enabled or not. If disabled all HDFS sinks are ignored. :Default: - *`Built-in schema `_* + :code:`true` :Example: .. code-block:: none - divolte.tracking { - schema_file = /etc/divolte/MyEventRecord.avsc + divolte.global.hdfs { + enabled = false } -divolte.tracking.schema_mapping -------------------------------- -This section controls the schema mapping to use. Schema mapping is an important feature of Divolte Collector, as it allows users to map incoming requests onto custom Avro schemas in non-trivial ways. See :doc:`mapping_reference` for details about this process and the internal mapping DSL used for defining mappings. - -divolte.tracking.schema_mapping.version -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.hdfs.threads`` +""""""""""""""""""""""""""""""""""""""""" :Description: - Prior versions of Divolte Collector supported an alternative mapping DSL. The current version is 2, and this is the only - value supported if the built-in mapping is not being used. + Number of threads to use per HDFS sink for writing events. Each thread creates its own files on HDFS. :Default: - *Not set (for built-in mapping)* + 2 :Example: .. 
code-block:: none - divolte.tracking.schema_mapping { - version = 2 + divolte.global.hdfs { + threads = 1 } -divolte.tracking.schema_mapping.mapping_script_file -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.hdfs.buffer_size`` +""""""""""""""""""""""""""""""""""""""""""""" :Description: - The location of the Groovy script that defines the how events will be mapped to Avro records. If unset, a default built-in mapping will be used. + The maximum number of mapped events to queue internally *per sink thread* for HDFS before starting to drop them. This value will be rounded up to the nearest power of 2. :Default: - *Built-in mapping* + 1048576 :Example: .. code-block:: none - divolte.tracking.schema_mapping { - mapping_script_file = /etc/divolte/my-mapping.groovy + divolte.global.hdfs.buffer_size { + max_write_queue = 10M } -divolte.javascript ------------------- -This section controls various aspects of the JavaScript tag that will be loaded. - -divolte.javascript.name -^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.hdfs.client`` +"""""""""""""""""""""""""""""""""""""""" :Description: - The path with which the JavaScript is served. This changes the ``divolte.js`` part in the script url: http://example.com/divolte.js. + Properties that will be used to configure the HDFS client used by HDFS sinks. If set, these properties will be used *instead of* the settings from :file:`hdfs-site.xml` in the directory specified by the :envvar:`HADOOP_CONF_DIR`. Although it is possible to configure all settings here instead of in :envvar:`HADOOP_CONF_DIR` this is not recommended. :Default: - ``divolte.js`` + *Not set* :Example: .. code-block:: none - divolte.javascript { - name = tracking.js + divolte.global.hdfs.client { + fs.defaultFS = "file:///var/log/divolte/" } -divolte.javascript.logging -^^^^^^^^^^^^^^^^^^^^^^^^^^ +Global Kafka Settings (``divolte.global.kafka``) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +This section controls global Kafka settings shared by all Kafka sinks. At present Divolte Collector only supports connecting to a single Kafka cluster. + +Property: ``divolte.global.kafka.enabled`` +"""""""""""""""""""""""""""""""""""""""""" :Description: - Enable or disable the logging on the JavaScript console in the browser. + This controls whether flushing to Kafka is enabled or not. If disabled all Kafka sinks are ignored. (This is disabled by default because the producer configuration for Kafka is normally site-specific.) :Default: - ``false`` + :code:`false` :Example: .. code-block:: none - divolte.javascript { - logging = true + divolte.global.kafka { + enabled = true } -divolte.javascript.debug -^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.kafka.threads`` +"""""""""""""""""""""""""""""""""""""""""" :Description: - When enabled, the served JavaScript will be less compact and *slightly* easier to debug. This setting is mainly intended - to help track down problems in either the minification process used to reduce the size of the tracking script, or in the - behaviour of specific browser versions. + Number of threads to use per Kafka sink for flushing events to Kafka. :Default: - ``false`` + 2 :Example: .. 
code-block:: none - divolte.javascript { - debug = true + divolte.global.kafka { + threads = 1 } -divolte.javascript.auto_page_view_event -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.kafka.buffer_size`` +"""""""""""""""""""""""""""""""""""""""""""""" :Description: - When enabled the JavaScript tag automatically generates a ``pageView`` event when loaded, simplifying site integration. - If sites wish to control all events (including the initial ``pageView`` event) this can be disabled. + The maximum number of mapped events to queue internally *per sink thread* for Kafka before starting to drop them. This value will be rounded up to the nearest power of 2. :Default: - ``true`` + 1048576 :Example: .. code-block:: none - divolte.javascript { - auto_page_view_event = false + divolte.global.kafka.buffer_size { + max_write_queue = 10M } - -divolte.incoming_request_processor ----------------------------------- -This section controls settings related to the processing of incoming requests after they have been received by the server. Incoming requests for Divolte Collector are responded to as quickly as possible, with mapping and flushing occurring in the - background. Only minimal validation is performed before issuing a HTTP `200 OK` response that contains a transparent 1x1 pixel GIF image.containing a handled by a pool of HTTP threads, which immediately respond with a HTTP code 200 and send the response payload (a 1x1 pixel transparent GIF image). The background mapping and processing is performed by the incoming request processor and configured in this section. - -divolte.incoming_request_processor.threads -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Property: ``divolte.global.kafka.producer`` +""""""""""""""""""""""""""""""""""""""""""" :Description: - Number of threads to use for processing incoming requests. All requests for a single party are processed on the same thread. + The configuration to use for Kafka producers. All settings are used as-is to configure the Kafka producer; refer to the `Kafka Documentation `_ for further details. :Default: - ``2`` -:Example: .. code-block:: none - divolte.incoming_request_processor { - threads = 1 + { + bootstrap.servers = ["localhost:9092"] + bootstrap.servers = ${?DIVOLTE_KAFKA_BROKER_LIST} + client.id = divolte.collector + client.id = ${?DIVOLTE_KAFKA_CLIENT_ID} + + acks = 1 + retries = 0 + compression.type = lz4 + max.in.flight.requests.per.connection = 1 } -divolte.incoming_request_processor.max_write_queue -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:Description: - The maximum number of incoming requests to queue for processing *per thread* before starting to drop incoming requests. While this queue is full new requests are dropped and a warning is logged. (Dropped requests are not reported to the client: Divolte Collector will always respond with a HTTP 200 status code once minimal validation has taken place.). -:Default: - ``100000`` + Note the use of :envvar:`DIVOLTE_KAFKA_BROKER_LIST` and :envvar:`DIVOLTE_KAFKA_CLIENT_ID` environment variables, if they have been set. + :Example: .. code-block:: none - divolte.incoming_request_processor { - max_write_queue = 1000000 + divolte.global.kafka.producer = { + metadata.broker.list = ["broker1:9092", "broker2:9092", "broker3:9092"] + client.id = divolte.collector + + acks = 0 + retries = 5 + } + +Sources (``divolte.sources``) +----------------------------- + +Sources are endpoints that can receive events. 
Each source has a name used to identify it when configuring a mapper that uses the source. A source cannot have the same name as a sink (and vice versa). Sources are configured in sections using their name as the configuration path. (Due to the `HOCON merging rules `_, it's not possible to configure multiple sources with the same name.) + +Each source has a type configured via a mandatory ``type`` property. At present the only supported type is ``browser``. + +For example: + +.. code-block:: none + + divolte.sources { + // The name of the source is 'my_source' + my_source = { + // This is a browser source. + type = browser + } + } + +Implicit default source +^^^^^^^^^^^^^^^^^^^^^^^ + +If no sources are specified a single implicit browser source is created that is equivalent to: + +.. code-block:: none + + divolte.sources { + // The name of the implicit source is 'browser' + browser = { + type = browser } + } + +If *any* sources are configured this implicit source is not present and all sources must be explicitly specified. + +Browser Sources +^^^^^^^^^^^^^^^ + +A browser source is intended to receive tracking events from a browser. Each browser source serves up a tracking tag (JavaScript). This tag must be integrated into a website for Divolte Collector to receive tracking events. Each page of a website needs to include this: + +.. code-block:: html -divolte.incoming_request_processor.max_enqueue_delay -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +The URL will need to use the domain name where you are hosting Divolte Collector, and ``divolte.js`` needs to match the ``javascript.name`` setting of the browser source. + +By default loading the tag will trigger a ``pageView`` event. The tag also provides an API for issuing custom +events: + +.. code-block:: html + + + +The first argument to the :samp:`divolte.signal({...})` function is the type of event, while the second argument is an arbitrary object containing custom parameters associated with the event. Storing the event and its parameters into the configured Avro schema is controlled via mapping; see the :doc:`mapping_reference` chapter for details. + +Browser sources are able to detect some cases of corruption in the event data. The most common source of this is due to URLs being truncated, but there are also other sources of corruption between the client and the server. Corrupted events are flagged as such but still made available for mapping. (Mappings may choose to discard corrupted events, but by default they are processed normally.) + +Within the namespace for a browser source properties are used to configure it. + +Browser source property: ``prefix`` +""""""""""""""""""""""""""""""""""" :Description: - The maximum time to wait if the queue is full before dropping an event. + The path prefix under which the tracking tag is available. Each browser source must have a unique prefix. A trailing slash (``/``) is automatically appended if not specified. :Default: - 1 second + ``/`` :Example: .. code-block:: none - divolte.incoming_request_processor { - max_enqueue_delay = 20 seconds + divolte.sources.a_source { + type = browser + prefix = /tracking } -divolte.incoming_request_processor.discard_corrupted -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + In this case the tracking tag could be included using: + + .. code-block:: html + + + +Browser source property: ``party_cookie`` +""""""""""""""""""""""""""""""""""""""""" :Description: - Events from the JavaScript tag contain a checksum to detect corrupted events. 
(The most common source of this is - URLs being truncated, but there are also other sources of corruption between the client and the server.) If enabled, - corrupt events will be discarded and not subject to mapping and further processing. If disabled, a best effort will - be made to map and process the event as if it was normal. + The name of the cookie used for setting a party identifier. :Default: - ``false`` + ``_dvp`` :Example: .. code-block:: none - divolte.incoming_request_processor { - discard_corrupted = true + divolte.sources.a_source { + type = browser + party_cookie = _pid } -divolte.incoming_request_processor.duplicate_memory_size -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``party_timeout`` +"""""""""""""""""""""""""""""""""""""""""" :Description: - Browsers and other clients will sometimes deliver an event to the Divolte Collector multiple times, normally - within a short period of time. Divolte Collector contains a probabilistic filter which can detect this, trading - off memory for improved results. This setting configures the size of the filter *per thread*, and is multuplied - by 8 to yield the actual memory usage. + The expiry timeout for the party identifier. If no events occur for this duration, the party identifier is discarded by the browser. Any subsequent events will be cause a new party identifier to be assigned to the browser. :Default: - ``1000000`` + 730 days :Example: .. code-block:: none - divolte.incoming_request_processor { - duplicate_memory_size = 10000000 + divolte.sources.a_source { + type = browser + party_timeout = 1000 days } -divolte.incoming_request_processor.discard_duplicates -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``session_cookie`` +""""""""""""""""""""""""""""""""""""""""""" :Description: - Browsers and other clients will sometimes deliver an event to the Divolte Collector multiple times, normally - within a short period of time. Divolte Collector contains a probabilistic filter which can detect this, and - when this setting is enabled events considered duplicates will be discarded without further mapping or processing. + The name of the cookie used for tracking the session identifier. :Default: - ``false`` + ``_dvs`` :Example: .. code-block:: none - divolte.incoming_request_processor { - discard_duplicates = true + divolte.sources.a_source { + type = browser + session_cookie = _sid } -divolte.kafka_flusher ---------------------- -This section controls settings related to forwarding the event stream to a Apache Kafka topic. Events for Kafka topics -are keyed by their party identifier. - -divolte.kafka_flusher.enabled -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``session_timeout`` +"""""""""""""""""""""""""""""""""""""""""""" :Description: - This controls whether flushing to Kafka is enabled or not. (This is disabled by default because the producer configuration for Kafka is normally site-specific.) + The expiry timeout for a session. A session lapses if no events occur for this duration. :Default: - ``false`` + 30 minutes :Example: .. code-block:: none - divolte.kafka_flusher { - enabled = true + divolte.sources.a_source { + type = browser + session_timeout = 1 hour } -divolte.kafka_flusher.threads -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``cookie_domain`` +"""""""""""""""""""""""""""""""""""""""""" :Description: - Number of threads to use for flushing events to Kafka. + The cookie domain that is assigned to the cookies. 
When left empty, the cookies will have no domain explicitly associated with them, which effectively sets it to the website domain of the page that loaded the tag. :Default: - ``2`` + *Empty* :Example: .. code-block:: none - divolte.kafka_flusher { - threads = 1 + divolte.sources.a_source { + type = browser + cookie_domain = ".example.com" } -divolte.kafka_flusher.max_write_queue -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``javascript.name`` +"""""""""""""""""""""""""""""""""""""""""""" :Description: - The maximum number of mapped events to queue internally *per thread* for Kafka before starting to drop them. + The name of the JavaScript loaded as the tag. This is appended to the value of the ``prefix`` property to form the complete path of the tag in the URL. :Default: - ``200000`` + ``divolte.js`` :Example: .. code-block:: none - divolte.kafka_flusher { - max_write_queue = 1000000 + divolte.sources.a_source { + type = browser + javascript.name = tracking.js } -divolte.kafka_flusher.max_enqueue_delay -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + In this case the tracking tag could be included using: + + .. code-block:: html + + + +Browser source property: ``javascript.logging`` +""""""""""""""""""""""""""""""""""""""""""""""" :Description: - The maximum time to wait before dropping the event if the internal queue for one of the Kafka threads is full. + Enable or disable the logging to the JavaScript console in the browser. :Default: - 1 second + :code:`false` :Example: .. code-block:: none - divolte.kafka_flusher { - max_enqueue_delay = 20 seconds + divolte.sources.a_source { + type = browser + javascript.logging = true } -divolte.kafka_flusher.topic -^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``javascript.debug`` +""""""""""""""""""""""""""""""""""""""""""""" :Description: - The Kafka topic onto which events are published. + When enabled, the served JavaScript will be less compact and *slightly* easier to debug. This setting is mainly intended to help track down problems in either the minification process used to reduce the size of the tracking script, or in the behaviour of specific browser versions. :Default: - ``divolte`` + :code:`false` :Example: .. code-block:: none - divolte.kafka_flusher { - topic = clickevents + divolte.sources.a_source { + type = browser + javascript.debug = true } -divolte.kafka_flusher.producer -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Browser source property: ``javascript.auto_page_view_event`` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - The producer configuration. All settings are used as-is to configure the Kafka producer; refer to the `Kafka Documentation `_ for further details. + When enabled the JavaScript tag automatically generates a ``pageView`` event when loaded, simplifying site integration. If sites wish to control all events (including the initial ``pageView`` event) this can be disabled. :Default: + :code:`true` +:Example: .. 
code-block:: none - producer = { - bootstrap.servers = "localhost:9092" - bootstrap.servers = ${?DIVOLTE_KAFKA_BROKER_LIST} + divolte.sources.a_source { + type = browser + javascript.auto_page_view_event = false + } - client.id = divolte.collector - client.id = ${?DIVOLTE_KAFKA_CLIENT_ID} +Mappings (``divolte.mappings``) +------------------------------- - acks = 0 - retries = 5 - retry.backoff.ms = 200 +Mappings are used to specify event flows between sources and sinks, along with the transformation ("mapping") required to convert events into Avro records that conform to a schema. Schema mapping is an important feature of Divolte Collector as it allows incoming events to be mapped onto custom Avro schemas in non-trivial ways. See :doc:`mapping_reference` for details about this process and the internal mapping DSL used for defining mappings. + +Each configured mapping has a name and produces homogenous records conforming to an Avro schema. It may consume events from multiple sources, and the resulting records may be sent to multiple sinks. Sources and sinks may be shared between multiple mappings. If multiple mappings produce records for the same sink, all mappings must use the same Avro schema. + +An example mapping configuration could be: + +.. code-block:: none + + divolte.mappings { + // The name of the mapping is 'a_mapping' + a_mapping = { + schema_file = /some/dir/MySchema.avsc + mapping_script_file = schema-mapping.groovy + sources = [browser] + sinks = [hdfs,kafka] } + } -:Example: +Implicit default mapping +^^^^^^^^^^^^^^^^^^^^^^^^ - .. code-block:: none +If no mappings are specified a single implicit mapping is created that is equivalent to: - divolte.kafka_flusher.producer = { - metadata.broker.list = ["broker1:9092", "broker2:9092", "broker3:9092"] - client.id = divolte.collector +.. code-block:: none - request.required.acks = 0 - message.send.max.retries = 5 - retry.backoff.ms = 200 + divolte.mappings { + // The name of the implicit mapping is 'default' + default = { + sources = [ /* All configured sources */ ] + sinks = [ /* All configured sinks */ ] } + } + +If *any* mappings are configured this implicit mapping is not present and all mappings must be explicitly specified. + +Mapping properties +^^^^^^^^^^^^^^^^^^ -divolte.hdfs_flusher --------------------- -This section controls settings related to flushing the event stream. +Within the namespace for a mapping properties are used to configure it. At a minimum the ``sources`` and ``sinks`` should be specified; without these a mapping has no work to do. -divolte.hdfs_flusher.enabled -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Mapping property: ``sources`` +""""""""""""""""""""""""""""" :Description: - This controls whether flushing to HDFS is enabled. Note that in absence of further HDFS configuration events will be written to the local filesystem. + A list of the names of the sources that this mapping should consume events from. A source may be shared by multiple mappings; each mapping will process every event from the source. :Default: - ``true`` + *Not specified* :Example: .. code-block:: none - divolte.hdfs_flusher { - enabled = false + divolte.mappings.a_mapping { + sources = [site1, site2] } -divolte.hdfs_flusher.threads -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Mapping property: ``sinks`` +""""""""""""""""""""""""""" :Description: - Number of threads to use for flushing events to HDFS. Each thread creates its own files on HDFS. Depending on the flushing strategy, multiple concurrent files can be kept open per thread. 
+ A list of the names of the sinks that this mapping should write produced Avro records to. Each produced record is written to all sinks. A sink may be shared by multiple mappings; in this case all mappings must produce records conforming to the same Avro schema. :Default: - ``2`` + *Not specified* :Example: .. code-block:: none - divolte.hdfs_flusher { - threads = 1 + divolte.mappings.a_mapping { + sinks = [hdfs, kafka] } -divolte.hdfs_flusher.max_write_queue -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Mapping property: ``schema_file`` +""""""""""""""""""""""""""""""""" :Description: - The maximum number of mapped events to queue internally *per thread* for HDFS before starting to drop them. + By default a mapping will produce records that conform to a `built-in Avro schema `_. However, a custom schema makes usually makes sense that contains fields specific to the domain and custom events. Note that the value for this property is ignored unless ``mapping_script_file`` is also set. :Default: - 100000 + |Built-in schema|_ :Example: .. code-block:: none - divolte.hdfs_flusher { - max_write_queue = 1000000 + divolte.mappings.a_mapping { + schema_file = /etc/divolte/MyEventRecord.avsc } -divolte.hdfs_flusher.max_enqueue_delay -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. |Built-in schema| replace:: *Built-in schema* +.. _Built-in schema: https://github.com/divolte/divolte-schema + +Mapping property: ``mapping_script_file`` +""""""""""""""""""""""""""""""""""""""""" :Description: - The maximum time to wait before dropping the event if the internal queue for one of the HDFS threads is full. + The location of the Groovy script that defines the how events from sources will be mapped to Avro records that are written to sinks. If unset, a default built-in mapping will be used. (In this case any value for the ``schema_file`` property is ignored: the default built-in mapping always produces records conforming to the `built-in schema `.) + + See the :doc:`mapping_reference` for details on mapping events. :Default: - 1 second + *Built-in mapping* :Example: .. code-block:: none - divolte.hdfs_flusher { - max_enqueue_delay = 20 seconds + divolte.mappings.a_mapping { + mapping_script_file = /etc/divolte/my-mapping.groovy } -divolte.hdfs_flusher.hdfs -------------------------- -HDFS specific settings. Although it is possible to configure a HDFS URI here, it is more advisable to configure HDFS settings by specifying a ``HADOOP_CONF_DIR`` environment variable which will be added to the classpath on startup. - -divolte.hdfs_flusher.hdfs.uri -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Mapping property: ``discard_corrupted`` +""""""""""""""""""""""""""""""""""""""" :Description: - The filesystem URI to configure the HDFS client with. When absent, the URI is not set. When using ``HADOOP_CONF_DIR`` this should not be set. + Events contain a flag indicating whether the source detected corruption in the event data. If this property is enabled corrupt events will be discarded and not subject to mapping and further processing. Otherwise a best effort will be made to map and process the event as if it was normal. :Default: - *Not set* + :code:`false` :Example: .. code-block:: none - divolte.hdfs_flusher.hdfs { - uri = "file:///" + divolte.mappings.a_mapping { + discard_corrupted = true } -divolte.hdfs_flusher.hdfs.replication -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Mapping property: ``discard_duplicates`` +"""""""""""""""""""""""""""""""""""""""" :Description: - The HDFS replication factor to use when creating files. 
+ Clients sometimes deliver events to sources multiple times, normally within a short period of time. Sources contain a probabilistic filter which can detect this and set a flag on the event. If this property is enabled events flagged as duplicates will be discarded without further mapping or processing. :Default: - ``1`` + :code:`false` :Example: .. code-block:: none - divolte.hdfs_flusher.hdfs { - replication = 3 + divolte.mappings.a_mapping { + discard_duplicates = true + } + +Sinks (``divolte.sinks``) +------------------------- + +Sinks are used to write Avro records that have been mapped from received events. Each sink has a name used to identify it when configuring a mapper that produces records for the sink. A sink cannot have the same name as a source (and vice versa). Sinks are configured in sections using their name as the configuration path. (Due to the `HOCON merging rules `_, it's not possible to configure multiple sinks with the same name.) + +Each sink has a type configured via a mandatory ``type`` property. The supported types are: + +- ``hdfs`` +- ``kafka`` + +For example: + +.. code-block:: none + + divolte.sinks { + // The name of the sink is 'my_sink' + my_sink = { + // This is a HDFS sink. + type = hdfs + } + } + +Implicit default sinks +^^^^^^^^^^^^^^^^^^^^^^ + +If no sinks are specified two implicit sinks are created that are equivalent to: + +.. code-block:: none + + divolte.sinks { + // The names of the implicit sinks are 'hdfs' and 'kafka'. + hdfs = { + type = hdfs + replication_factor = 1 } + kafka = { + type = kafka + } + } + +If *any* sinks are configured these implicit sinks are not present and all sinks must be explicitly specified. + + -divolte.hdfs.file_strategy --------------------------- -Divolte Collector has two strategies for creating files on HDFS and flushing data. One of these must be configured, but not both. Which strategy to use is set using the `type` property of this configuration; accepted values are either ``SIMPLE_ROLLING_FILE` (default) or ``SESSION_BINNING``. +HDFS Sinks +^^^^^^^^^^ -Simple rolling file strategy -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -By default a simple rolling file strategy is employed. This opens one file per thread and rolls over to a new file after a configurable interval. Files that are being written to have an extension of ``.avro.partial`` and are created in the the directory configured in the ``working_dir`` setting. When a file is closed, it is renamed to have an ``.avro`` extension and moved to the directory configured in the ``publish_dir`` setting. This happens in a single (atomic) filesystem move operation. +A HDFS sink uses a HDFS client to write `Avro files `_ containing records produced by mapping. The schema of the Avro file is the schema of the mapping producing the records. If multiple mappings produce records for a sink they must all use the same schema. -Session binning file strategy -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -A more complex strategy is the session binning strategy. This strategy attempts to place events that belong to the same session in the same file. +The HDFS client used to write files is configured according to the global HDFS settings. Depending on the HDFS client version in use, HDFS sinks can write to various locations: -Events are assigned to files using the following rules: +- Native HDFS in a Hadoop cluster. +- A local filesystem. +- S3 in Amazon Web Services (AWS). (See `here `_ for details.) -- The strategy always has a 'current' open file to which events will be written.
-- When a session starts, its events are assigned to the current file and will be written there for as long as possible. -- When a period of time the length of the configured session timeout has elapsed, a new file is opened and designed 'current'. -- The previously current file remains open for a further period of time equal to twice the session timeout. During this - period events for sessions assigned to that file will be written there. -- If an event arrives assigned to file that has been closed, the session's events will be reassigned to the oldest open - file. +A HDFS sink uses multiple threads to write the records as they are produced. Each thread writes to its own Avro file, flushing regularly. Periodically the Avro files are closed and new ones started. Files are initially created in the configured working directory and have an extension of ``.avro.partial`` while open and being written to. When closed, they are renamed to have an extension of ``.avro`` and moved to the publish directory. This happens in a single (atomic) move operation, so long as the underlying storage supports this. -.. note:: +Records produced from events with the same party identifier are always written to the same Avro file, and in the order they were received by the originating source. (The relative ordering of records produced from events with the same party identifier is undefined if they originated from different sources, although they will still be written to the same Avro file.) - If the Divolte Collector is shutdown or fails, open files are not moved into the published directory. Instead they - remain in the working directory and need to be manually processed. +Within the namespace for a HDFS sink properties are used to configure it. -divolte.hdfs.file_strategy.type -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFS Sink Property: ``replication`` +""""""""""""""""""""""""""""""""""" :Description: - Identify which strategy to use for flushing HDFS files. Type can be either `SIMPLE_ROLLING_FILE` or `SESSION_BINNING` for the respective strategies. + The HDFS replication factor to use when creating files. :Default: - ``SIMPLE_ROLLING_FILE`` + 3 :Example: .. code-block:: none - divolte.hdfs.file_strategy { - type = SESSION_BINNING + divolte.sinks.a_sink { + type = hdfs + replication = 1 } -divolte.hdfs.file_strategy.sync_file_after_records -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFS Sink Property: ``file_strategy.working_dir`` +""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - How often a ``hsync()`` should be issued to flush HDFS data based on the number of records that have been written since the last flush. + Directory where files are created and kept while being written to. Files being written have a ``.avro.partial`` extension. :Default: - ``1000`` + :file:`/tmp` :Example: .. code-block:: none - divolte.hdfs.file_strategy { - sync_file_after_records = 100 + divolte.sinks.a_sink { + type = hdfs + file_strategy.working_dir = /webdata/inflight } -divolte.hdfs.file_strategy.sync_file_after_duration -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFS Sink Property: ``file_strategy.publish_dir`` +""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - How often a ``hsync()`` should be issued to flush HDFS data based on how long it is since the last flush. + Directory where files are moved to after they are closed. Files when closed have a ``.avro`` extension. :Default: - 30 seconds + :file:`/tmp` :Example: .. 
code-block:: none - divolte.hdfs.file_strategy { - sync_file_after_duration = 1 minute + divolte.sinks.a_sink { + type = hdfs + file_strategy.publish_dir = /webdata/published } -divolte.hdfs.file_strategy.working_dir -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFS Sink Property: ``file_strategy.roll_every`` +"""""""""""""""""""""""""""""""""""""""""""""""" :Description: - Directory where files are created and kept while being written to. + Roll over files on HDFS after this amount of time. (If the working file doesn't contain any records it will be discarded.) :Default: - ``/tmp`` + 1 hour :Example: .. code-block:: none - divolte.hdfs.file_strategy { - working_dir = /webdata/inflight + divolte.sinks.a_sink { + type = hdfs + file_strategy.roll_every = 15 minutes } -divolte.hdfs.file_strategy.publish_dir -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFS Sink Property: ``file_strategy.sync_file_after_records`` +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - Directory where files are moved to after they are closed. + The maximum number of records that should be written to the working file since the last flush before flushing again. Flushing is performed by issuing a :code:`hsync()` call to flush HDFS data. :Default: - ``/tmp`` + 1000 :Example: .. code-block:: none - divolte.hdfs.file_strategy { - publish_dir = /webdata/published + divolte.sinks.a_sink { + type = hdfs + file_strategy.sync_file_after_records = 100 } -divolte.hdfs.file_strategy.roll_every *(simple rolling strategy only)* -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +HDFS Sink Property: ``file_strategy.sync_file_after_duration`` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Description: - Roll over files on HDFS after this amount of time. + The maximum time that may elapse after a record is written to the working file before it is flushed. Flushing is performed by issuing a :code:`hsync()` call to flush HDFS data. :Default: - 60 minutes + 30 seconds :Example: .. code-block:: none - divolte.hdfs.file_strategy { - roll_every = 15 minutes + divolte.sinks.a_sink { + type = hdfs + file_strategy.sync_file_after_duration = 10 seconds } -logback.xml ------------ -Divolte Collector uses the `Logback Project `_ as its logging provider. This provider is configured through the ``logback.xml`` file in the configuration directory. For more information about the settings in this file, review the `Configuration chapter in the Logback Manual `_. - -Website integration -=================== -Next to the server side configuration, Divolte Collector needs to be integrated into a website in order to log events. The minimum integration involves adding a single tag to collect pageviews. This can be extended with custom events for tracking specific user interactions. - -The tag -------- -The tag for Divolte Collector to include in each page of a website is this: +Kafka Sinks +^^^^^^^^^^^ -.. code-block:: html +A Kafka sink uses a Kafka producer to write Avro records as individual messages on a Kafka topic. The producer is configured according to the global Kafka settings. - +Records produced from events with the same party identifier are queued on a topic in the same order they were received by the originating source. (The relative ordering across sources is not guaranteed.) The messages are keyed by their party identifier meaning that Kafka will preserve the relative ordering between messages with the same party identifier. 
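As a minimal sketch of how this fits together (the sink name ``clicks_kafka`` is illustrative, and the ``topic`` property is described below), a Kafka sink could be declared alongside the global switch that enables Kafka delivery:

.. code-block:: none

    divolte.global.kafka {
      // Kafka delivery is disabled by default and must be enabled globally.
      enabled = true
    }

    divolte.sinks {
      // Illustrative sink name; mappings refer to it in their 'sinks' list.
      clicks_kafka = {
        type = kafka
        topic = clickevents
      }
    }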
-The URL will need to use the domain name where you are hosting Divolte Collector, and ``divolte.js`` needs to match the ``divolte.javascript.name`` configuration setting. +The body of each Kafka message contains a single Avro record, serialised using Avro's `binary encoding `_. The schema is not included or referenced in the message. Because Avro's binary encoding is not self-describing, a topic consumer must be independently configured to use a *write schema* that corresponds to the schema used by the mapper that produced the record. -Custom events -------------- -The tracking tag provides an API for pages to fire custom events: +Within the namespace for a Kafka sink properties are used to configure it. -.. code-block:: html +Kafka sink property: ``topic`` +"""""""""""""""""""""""""""""" +:Description: + The Kafka topic onto which events are published. +:Default: + ``divolte`` +:Example: - + .. code-block:: none -The first argument to the ``divolte.signal(...)`` function is the event type parameter. The second argument is a arbitrary object with custom event parameters. Storing the event parameter and the custom event parameters into the configured Avro data is achieved through the mapping. See the :doc:`mapping_reference` chapter for details. + divolte.sinks.a_sink { + type = kafka + topic = clickevents + } diff --git a/docs/deployment.rst b/docs/deployment.rst index f5d8eed6..4b9febbb 100644 --- a/docs/deployment.rst +++ b/docs/deployment.rst @@ -3,15 +3,15 @@ Deployment ********** This chapter describes common steps for deploying Divolte Collector in production. -Installation / packages -======================= +Installation/packages +===================== The distributions provided for Divolte Collector are: -- A .tar.gz archive distribution containing the binaries and startup scripts. -- A .zip archive distribution containing the binaries and startup scripts. -- A RPM that can be installed onto Redhat / CentOS systems. This includes startup and init scripts. +- A ``.tar.gz`` archive distribution containing the binaries and startup scripts. +- A ``.zip`` archive distribution containing the binaries and startup scripts. +- A RPM that can be installed onto Red Hat/CentOS systems. This includes startup and init scripts. -Currently, there is no .deb distribution. This will be added in a next release. +Currently there is no Debian packaging. Load balancers ============== @@ -19,24 +19,26 @@ In a production scenario, Divolte Collector is typically deployed behind a load Divolte Collector is semi-stateless. This means that it is not required that requests form the same client always go to the same instance; the event will be logged in all cases. Divolte Collector does however build up some soft state during operation for detecting duplicate events and caching parsed user agents. This means that there is benefit in stickyness, but it is not a requirement. -URI / hash based load balancing policy --------------------------------------- +URI/hash-based load balancing policy +------------------------------------ Divolte Collector keeps a short term memory for detecting duplicate requests. In order for this to work, exact duplicate requests need to always go to the same instance. Most load balancers can support this by setting up a routing policy that uses a hash of the requested URI to determine which instance to route the request to. When using duplicate detection, be sure to configure your load balancer to do this. 
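+If your load balancer is HAProxy rather than nginx (this manual only includes an nginx example, below), a comparable URI-hash policy might look like the following sketch. The backend name and server addresses are assumptions for illustration.
+
+.. code-block:: none
+
+  backend divolte
+      mode http
+      # Route on a hash of the request URI so that exact duplicate requests
+      # are always sent to the same Divolte Collector instance.
+      balance uri
+      hash-type consistent
+      server divolte1 divolte1.internaldomain:8290 check
+      server divolte2 divolte2.internaldomain:8290 check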
Consistent hashing and event de-duplication ------------------------------------------- -If possible, load balancers should use a so called consistent hashing scheme when performing URI hash based routing. This ensures that when a instance of Divolte Collector dies, the re-hashing amongst the remaining instances only minimally disrupts the event assignments. The benefit of this is that the duplicate memory kept by Divolte Collector nodes remains effective on the still running nodes. +If possible, load balancers should use a consistent hashing scheme when performing URI hash-based routing. This should ensure that most traffic continues to be routed to the same instance as before. The benefit of this is that the duplicate memory kept by Divolte Collector nodes remains effective. SSL === -Divolte Collector does not handle SSL in any way. SSL offloading needs to be done by a load balancer or a reverse proxy server. These systems are generally capable of offloading SSL and since there will always be a load balancer in front of Divolte Collector in production setups, it was decided not to add this functionality to the internal HTTP server. +Divolte Collector does not handle SSL itself. SSL offloading needs to be done by a load balancer or a reverse proxy server. This can normally be handled by the load balancer in front of Divolte Collector in production setups. Example nginx configuration =========================== -When using `nginx `_ as a reverse proxy and load balancer in front of Divolte Collector, you can use this snippet for configuring nginx:: +When using `nginx `_ as a reverse proxy and load balancer in front of Divolte Collector, you can use this snippet for configuring nginx: + +.. code-block:: nginx upstream divolte { - hash $request_uri consistent; + hash $request_uri consistent; server divolte1.internaldomain:8290; server divolte1.internaldomain:8290; diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 289c218f..7c3a1a27 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -55,7 +55,7 @@ Now, take your web browser to http://127.0.0.1:8290/ and check that you see a pa Looking at the data =================== -Now, go back to the console where Divolte Collector is running and hit CTRL+C (or kill the process). You should see output similar to this: +Now, go back to the console where Divolte Collector is running and hit :kbd:`Control-c` (or kill the process). You should see output similar to this: .. code-block:: none @@ -63,13 +63,13 @@ Now, go back to the console where Divolte Collector is running and hit CTRL+C (o 2014-12-17 09:27:15.396+01 [Thread-8] INFO [Server]: Stopping thread pools. 2014-12-17 09:27:17.399+01 [Thread-8] INFO [Server]: Closing HDFS filesystem connection. -When Divolte Collector shuts down it will flush and close all open files, so now we can have a look at the data that was generated. By default, with no configuration, Divolte Collector will write ``.avro`` files in ``/tmp`` on the local filesystem. For convenience, Divolte Collector packages a version of the avro-tools that come with Apache Avro, so you can look at the contents of these files as JSON records. Try the following: +When Divolte Collector shuts down it will flush and close all open files, so now we can have a look at the data that was generated. By default, with no configuration, Divolte Collector will write ``.avro`` files in :file:`/tmp` on the local filesystem.
For convenience, Divolte Collector packages a version of the avro-tools that come with Apache Avro, so you can look at the contents of these files as JSON records. Try the following: .. code-block:: bash % find /tmp/*.avro -name '*divolte-tracking-*.avro' | sort | tail -n1 | xargs ./bin/avro-tools tojson --pretty -This finds a ``.avro`` file in your ``/tmp`` directory and passes it to the ``avro-tools tojson`` command. Depending on how many requests you made, it will display multiple records. The output for a single record should look like this: +This finds a ``.avro`` file in your :file:`/tmp` directory and passes it to the :code:`avro-tools tojson` command. Depending on how many requests you made, it will display multiple records. The output for a single record should look like this: .. code-block:: json @@ -140,14 +140,14 @@ This finds a ``.avro`` file in your ``/tmp`` directory and passes it to the ``av Bring your own schema ===================== -Divolte Collector uses Avro to write data to files. Avro records require you to define a `Avro schema `_ that defines the fields in the records. Divolte Collector comes with a `built in generic schema `_ that is useful for keeping track of the basics of your clickstream data, but in most cases it makes sense to create your own schema with more specific fields that have a meaning within your website's domain. In order to achieve this two things are needed: +Divolte Collector uses Avro to write data to files. Avro records require you to define a `Avro schema `_ that defines the fields in the records. Divolte Collector comes with a `built-in generic schema `_ that is useful for keeping track of the basics of your clickstream data, but in most cases it makes sense to create your own schema with more specific fields that have a meaning within your website's domain. In order to achieve this two things are needed: 1. A custom Avro schema 2. A mapping that defines how to map requests onto the custom schema. Let's create a custom schema. -Create a file called ``MyEventRecord.avsc`` with the following contents (for example in the ``conf/`` directory under the Divolte Collector installation): +Create a file called :file:`MyEventRecord.avsc` with the following contents (for example in the :file:`conf/` directory under the Divolte Collector installation): .. code-block:: json @@ -165,7 +165,7 @@ Create a file called ``MyEventRecord.avsc`` with the following contents (for exa ] } -This is a very minimal custom schema, but it allows us to demonstrate a very important feature in Divolte Collector: mapping. In order to use the custom schema, we need to create a mapping that maps incoming requests onto the schema fields. Create a file called ``mapping.groovy`` with the following contents: +This is a very minimal custom schema, but it allows us to demonstrate a very important feature in Divolte Collector: mapping. In order to use the custom schema, we need to create a mapping that maps incoming requests onto the schema fields. Create a file called :file:`mapping.groovy` with the following contents: .. code-block:: groovy @@ -185,16 +185,17 @@ This is a very minimal custom schema, but it allows us to demonstrate a very imp The mapping is defined using a internal Groovy DSL in Divolte Collector. In this example we map a number of values onto fields in the Avro schema. The values for timestamp, remoteHost and location are mapped directly onto fields in the schema. 
In the remainder of the script, we tell Divolte Collector to take the fragment of the location (the part after the ``#`` in the URL) and try to parse that into a (partial) URI again. From the result URI, we map the path onto a schema field. Subsequently, parse out the values to two query string parameters (``q`` and ``n``) and map those onto separate schema fields after trying to parse an integer out of the ``n`` parameter. The mapping DSL allows for a lot more constructs, including conditional logic, regex matching and more; see the :doc:`mapping_reference` documentation for more information on this. -Finally, we need to configure Divolte Collector to use our custom schema and mapping. Edit the (empty) ``divolte-collector.conf`` file in the ``conf/`` directory of your installation to resemble the following configuration (be sure to use the correct paths for the schema and mapping file that you just created): +Finally, we need to configure Divolte Collector to use our custom schema and mapping. Edit the (empty) :file:`divolte-collector.conf` file in the :file:`conf/` directory of your installation to resemble the following configuration (be sure to use the correct paths for the schema and mapping file that you just created): .. code-block:: none divolte { - tracking { - schema_file = "/path/to/divolte-collector/conf/MyEventRecord.avsc" - schema_mapping { - version = 2 + mappings { + my_mapping = { + schema_file = "/path/to/divolte-collector/conf/MyEventRecord.avsc" mapping_script_file = "/path/to/divolte-collector/conf/mapping.groovy" + sources = [browser] + sinks = [hdfs] } } } @@ -203,7 +204,7 @@ Finally, we need to configure Divolte Collector to use our custom schema and map Divolte Collector configuration uses the `Typesafe Config `_ library, which uses a configuration dialect called `HOCON `_. -Now, once more, start Divolte Collector as before. Only this time, take your web browser to this address: `http://127.0.0.1:8290/#/fragment/path?q=textual&n=42 `_. You can refresh the page a couple of times and perhaps change the query string parameter values that are in the URL to something else. After you have done one or more requests, stop Divolte Collector again (using CTRL+C) and look at the collected data using this command again: +Now, once more, start Divolte Collector as before. Only this time, take your web browser to this address: `http://127.0.0.1:8290/#/fragment/path?q=textual&n=42 `_. You can refresh the page a couple of times and perhaps change the query string parameter values that are in the URL to something else. After you have done one or more requests, stop Divolte Collector again (using :kbd:`Control-c`) and look at the collected data using this command again: .. code-block:: console @@ -254,7 +255,7 @@ The tag is the line: -The tag performs a number of important tasks. It generates unique identifiers for parties, sessions, pageviews and events. It collects the location, referer, screen and viewport size information from the browser sends it to the Divolte Collector server. +The tag performs a number of important tasks. It generates unique identifiers for parties, sessions, page-views and events. It collects the location, referer, screen and viewport size information from the browser and sends it to the Divolte Collector server. In order to instrument a web page of your own, insert the tag as above into the HTML code on each page.
Additionally, once the Divolte Collector JavaScript is loaded in the browser it is possible to fire custom events from JavaScript in the page: @@ -289,69 +290,79 @@ In order to use the custom events in your mapping, map values onto fields like t Writing to HDFS =============== -So far, we've been writing our data to the local filesystem in ``/tmp``. Although this works it not the intended use of Divolte Collector. The aim is to write the clickstream data to HDFS, such that it is safely and redundantly stored and available for processing using any tool available that knows how to process Avro files (e.g. Apache Hive or Apache Spark). It is trivial to configure Divolte Collector to write to HDFS, assuming you have a working HDFS instance setup. (Setting this up is out of the scope of this getting started guide. There are many great resources to be found on the internet about getting started with and running Hadoop and HDFS.) +So far, we've been writing our data to the local filesystem in :file:`/tmp`. Although this works, it is not the intended use of Divolte Collector. The aim is to write the clickstream data to HDFS, such that it is safely and redundantly stored and available for processing using any tool available that knows how to process Avro files (e.g. Apache Hive or Apache Spark). It is trivial to configure Divolte Collector to write to HDFS, assuming you have a working HDFS instance set up. (Setting this up is out of the scope of this getting started guide. There are many great resources to be found on the internet about getting started with and running Hadoop and HDFS.) Assuming you have a HDFS instance running somewhere, there are two ways of making Divolte Collector write files to it: 1. Direct configuration; or -2. Setting the ``HADOOP_CONF_DIR`` environment variable to point to a directory containing valid Hadoop configuration files. +2. Setting the :envvar:`HADOOP_CONF_DIR` environment variable to point to a directory containing valid Hadoop configuration files. While the first option works, it is recommended to use the latter as it is easier to maintain when your HDFS parameters change over time. -First, we'll change the configuration to write files to HDFS. Add the following section to ``conf/divolte-collector.conf``: +First, we'll change the configuration to write files to HDFS. Add the following section to :file:`conf/divolte-collector.conf`: .. code-block:: none divolte { - hdfs_flusher { - // Enable the HDFS flushing - enabled = true - - // Use multiple threads to write to HDFS - threads = 2 - - // Use a simple strategy of rolling files after a certain period of time. - // For other strategies, have a look at the configuration documentation. - simple_rolling_file_strategy { - // Create a new file every hour - roll_every = 1 hour - - // Perform a hsync call on the HDFS files after every 1000 record written or - // after every 5 seconds, whichever happens first. - - // Performing a hsync call periodically prevents data loss incase of failure - // scenarios. - sync_file_after_records = 1000 - sync_file_after_duration = 5 seconds - - // Files that are being written will be created in a working directory. - // Once a file is closed, Divolte Collector will move the file to a - // publish directory. The working and publish directories are allowed - // to be the same, but this is not recommended. - working_dir = "/divolte/inflight" - publish_dir = "/divolte/published" + global { + hdfs { + // Enable HDFS sinks. + enabled = true + + // Use multiple threads to write to HDFS.
+ threads = 2 + } + } + + sinks { + // The name of the sink. (It's referred to by the mapping.) + hdfs { + type = hdfs + + // For HDFS sinks we can control how the files are created. + file_strategy { + // Create a new file every hour + roll_every = 1 hour + + // Perform a hsync call on the HDFS files after every 1000 records are written + // or every 5 seconds, whichever happens first. + + // Performing a hsync call periodically can prevent data loss in the case of + // some failure scenarios. + sync_file_after_records = 1000 + sync_file_after_duration = 5 seconds + + // Files that are being written will be created in a working directory. + // Once a file is closed, Divolte Collector will move the file to the + // publish directory. The working and publish directories are allowed + // to be the same, but this is not recommended. + working_dir = "/divolte/inflight" + publish_dir = "/divolte/published" + } + + // Set the replication factor for created files. + replication = 3 } } } -Note that you need to create these directories on HDFS prior to starting Divolte Collector. It will not startup if the directories do not exist. +Note that you need to create these directories prior to starting Divolte Collector. It will not startup if the directories do not exist. -If you have a working HDFS setup and a directory with the appropriate configuration files, Divolte Collector will use them automatically if a ``HADOOP_CONF_DIR`` environment variable is set pointing to that directory. Otherwise, it is possible to tell Divolte Collector directly about your HDFS location from the configuration: +If you have a working HDFS setup and a directory with the appropriate configuration files, Divolte Collector will use them automatically if a :envvar:`HADOOP_CONF_DIR` environment variable is set pointing to that directory. Alternatively, HDFS client properties can be provided in the configuration: .. code-block:: none divolte { - hdfs_flusher { + global { hdfs { - uri = "hdfs://192.168.100.128:8020/" - replication = 1 + client { + fs.defaultFS = "hdfs://192.168.100.128:8020/" + } } } } -Do note that in this scenario it is not possible to set additional HDFS client configuration, as you can do when using the ``HADOOP_CONF_DIR`` environment variable. Also, when your HDFS NameNode is setup redundantly you can configure only one using the Divolte Collector configuration. This is why it is recommended to use a ``HADOOP_CONF_DIR``. - -With everything in place, start Divolte Collector again, create some events and see verify that files are being created on HDFS: +With everything in place, start Divolte Collector again, create some events and verify that files are being created on HDFS: .. code-block:: console @@ -360,7 +371,7 @@ With everything in place, start Divolte Collector again, create some events and -rw-r--r-- 1 divolte supergroup 617 2014-08-30 11:46 /divolte/inflight/20141220152512-divolte-tracking-divoltehost-1.avro.partial -rw-r--r-- 1 divolte supergroup 617 2014-08-30 11:46 /divolte/inflight/20141220152513-divolte-tracking-divoltehost-2.avro.partial -After the rolling interval, files should show up in the publish directory with a .avro extension (without the .partial). 
However, if a file was opened in the working directory, but no events were ever written to it (because there was no activity or otherwise), it will not be moved to the publish directory, but will be deleted entirely instead: +After the rolling interval, files should show up in the publish directory with a ``.avro`` extension (without the ``.partial``). However, if a file was opened in the working directory, but no events were ever written to it (because there was no activity or otherwise), it will not be moved to the publish directory, but will be deleted entirely instead: .. code-block:: console @@ -375,21 +386,30 @@ Configuring Divolte Collector to write data to a Kafka topic is quite similar to .. code-block:: none divolte { - kafka_flusher { - // Enable Kafka flushing - enabled = true - - // This is the name of the topic that data will be produced on - topic = divolte-data - - // The properties under the producer key in this - // configuration are used to create a Properties object - // which is passed to Kafka as is. At the very least, - // configure the broker list here. For more options - // that can be passed to a Kafka producer, see this link: - // http://kafka.apache.org/documentation.html#producerconfigs - producer = { - bootstrap.servers = "10.200.8.55:9092,10.200.8.53:9092,10.200.8.54:9092" + global { + kafka { + // Enable Kafka flushing + enabled = true + + // The properties under the producer key in this + // configuration are used to create a Properties object + // which is passed to Kafka as is. At the very least, + // configure the broker list here. For more options + // that can be passed to a Kafka producer, see this link: + // http://kafka.apache.org/082/documentation.html#newproducerconfigs + producer = { + bootstrap.servers = "10.200.8.55:9092,10.200.8.53:9092,10.200.8.54:9092" + } + } + } + + sinks { + // The name of the sink. (It's referred to by the mapping.) + kafka { + type = kafka + + // This is the name of the topic that data will be produced on + topic = divolte-data } } } @@ -398,6 +418,21 @@ Data in Kafka ------------- Avro files on HDFS are written with the schema in the header. Unfortunately Kafka doesn't really have a clear way of passing along the schema. For the messages on Kafka queues we expect the consumer to know the schema in advance, meaning that *the messages that are passed onto the queue only contain the raw bytes of the serialized Avro record without any metadata*. The key of each message is the party ID that for the event. Divolte Collector provides a small helper library to easily create Kafka consumers in Java using Avro's code generation support. There is an example Kafka consumer with step by step instruction on getting it up and running in our usage examples repository here: `https://github.com/divolte/divolte-examples/tree/master/tcp-kafka-consumer `_. +Event Flows +=========== + +So far we've seen a single source of events being mapped to HDFS, and Kafka if you tried this. However Divolte can be +configured with multiple: + +- *Sources* of events, which is where Divolte events arrive. +- *Sinks* (destinations) where Avro records can be written after they have been produced by mapping Divolte events. +- *Mappings* between sources and sinks, which controls which sources are connected to which sinks, and how the events + are converted to Avro records. + +Events flow from sources to sinks, via an intermediate mapping. 
Allowing multiple sources, sinks and mappings allows Divolte to support multiple sites and domains, each of which may require independent mapping. Note, however, that a sink can only support a single Avro schema: all mappings which refer to it must be configured to produce records conforming to the same Avro schema. + +An event flow imposes a partial ordering on the events it receives: events from a source that have the same party identifier will be written to sinks in the same order that they were received in. (This doesn't apply to events received across different sources: even if they share the same party identifier their relative ordering is not guaranteed.) + What's next? ============ * Once you are collecting data to either HDFS or Kafka, see our `examples `_ to learn how to use your clickstream data in tools like Apache Spark, Apache Hive or Impala or build near real-time consumers for Apache Kafka with your Divolte Collector data. diff --git a/docs/introduction.rst b/docs/introduction.rst index dca4398b..e283f659 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -69,7 +69,12 @@ Features ======== In addition to collecting click events, Divolte Collector provides a number of welcome features: -* Single line JavaScript deployment: +* Single line JavaScript deployment: + + .. code-block:: html + + + * Mapping clickstream data onto a domain specific (Avro) schema; on the fly parsing * Comes with a built in default schema and mapping for basic, zero-config deployment @@ -79,10 +84,10 @@ In addition to collecting click events, Divolte Collector provides a number of w * Corrupt request detection for similar issues as above. * Generates unique identifiers: - * party ID: a long lived cookie that is set on the client - * session ID: a cookie that expires after 30 minutes of inactivity - * pageview ID: a unique identifier for each pageview and subsequent custom events fired from the same page - * event ID: a unique identifier for each event + * Party ID: a long lived cookie that is set on the client + * Session ID: a cookie that expires after 30 minutes of inactivity + * Pageview ID: a unique identifier for each pageview and subsequent custom events fired from the same page + * Event ID: a unique identifier for each event * User agent parsing: the user agent string is parsed on the fly and the resulting fields (e.g. operating system, browser type, device type) can be mapped onto the schema. * On the fly geolocation lookup based on IP address can be done using the `Maxmind databases `_. diff --git a/docs/mapping_reference.rst b/docs/mapping_reference.rst index 154430e7..4ec2a78f 100644 --- a/docs/mapping_reference.rst +++ b/docs/mapping_reference.rst @@ -2,25 +2,25 @@ Mapping ******* -Mapping in Divolte Collector is the definition that determines how incoming requests are translated into Avro records with a given schema. This definition is composed in a special, built in `Groovy `_ based DSL (domain specific language). +Mapping in Divolte Collector is the definition that determines how incoming events are translated into Avro records conforming to a schema. This definition is constructed using a `Groovy\ `_\ -based DSL (Domain-Specific Language). Why mapping? ============ -Most clickstream data collection services or solutions use a canonical data model that is specific to click events and related properties. Things such as location, referer, remote IP address, path, etc. are all properties of a click event that come to mind. 
While Divolte Collector exposes all of these fields just as well, it is our vision that this is not enough to make it easy to build online and near real-time data driven products within specific domains and environments. For example, when working on a system for product recommendation, the notion of a URL or path for a specific page is completely in the wrong domain; what you would care about in this case is likely a product ID and probably a type of interaction (e.g. product page view, large product photo view, add to basket, etc.). It is usually possible to extract these pieces of information from the clickstream representation, which means custom parsers have to be created to parse this information out of URLs, custom events from JavaScript and other sources. This means that whenever you work with the clickstream data, you have to run these custom parsers initially in order to get meaninful, domain specific information from the data. When building real-time systems, it normally means that this parser has to run in multiple locations: as part of the off line processing jobs and as part of the real-time processing. +Most clickstream data collection services or solutions use a canonical data model that is specific to click events and related properties. Things such as location, referrer, remote IP address, path, etc. are all properties of a click event that come to mind. While Divolte Collector exposes all of these fields just as well, it is our vision that this is not enough to make it easy to build online and near real-time data driven products within specific domains and environments. For example, when working on a system for product recommendation, the notion of a URL or path for a specific page is completely in the wrong domain; what you care about in this case is likely a product ID and probably a type of interaction (e.g. product page view, large product photo view, add to basket, etc.). It is usually possible to extract these pieces of information from the clickstream representation, which means custom parsers have to be created to parse this information out of URLs, custom events from JavaScript and other sources. This means that whenever you work with the clickstream data, you have to run these custom parsers initially in order to get meaningful, domain specific information from the data. When building real-time systems, it normally means that this parser has to run in multiple locations: as part of the offline processing jobs and as part of the real-time processing. -With Divolte Collector, instead of writing parsers and working with the raw clickstream event data in your processing, you define a mapping that allows Divolte Collector to do all the required parsing on the fly as events come in and subsequently produce structured records with a schema to use in further processing. This means that all data that comes in can already have the relevant domain specific fields populated. And whenever the need for a new extracted piece of information arises, you can update the mapping to include the new field in the newly produced data. The older data that lacks newly additional fields can co-exist with newer data that does have the additional fields through a process called schema evolution. This is supported by Avro's ability to read data with a different schema from the one that the data was written with.
+With Divolte Collector, instead of writing parsers and working with the raw clickstream event data in your processing, you define mappings that allow Divolte Collector to do all the required parsing on the fly as events come in and subsequently produce structured records with a schema to use in further processing. This means that all data that comes in can already have the relevant domain specific fields populated. Whenever the need for a new extracted piece of information arises, you can update the mapping to include the new field in the newly produced data. The older data that lacks the newly added fields can co-exist with newer data that does have the additional fields through a process called schema evolution. This is supported by Avro's ability to read data with a different schema from the one that the data was written with. (This is implemented at read-time using a process called `schema resolution `_.) -In essence, the goal of the mapping is to get rid of log file or URL parsing on collected data after it is published. The event stream from Divolte Collector should have all the domain specific fields to support you use cases directly. +The goal of the mapping is to get rid of log file or URL parsing on collected data after it is published. The event stream from Divolte Collector should have all the domain specific fields to support your use cases directly. Understanding the mapping process --------------------------------- -Before you dive in to creating your own mappings, it is important to understand a little bit about how the mapping is actually performed. **The most notable thing to keep in mind is that the mapping script that you provide, is not evaluated at request time for each request.** Rather, it is evaluated only once on startup and the result of the script is used to perform the actual mapping. This means that your mapping script is evaluated only once during the run-time of the Divolte Collector server. +Before you dive into creating your own mappings, it is important to understand a little bit about how a mapping is actually performed. **The most notable thing to keep in mind is that a mapping script that you provide is not evaluated at request time for each event.** Instead, a mapping is evaluated only once during startup and *declares* how the actual mapping should take place. .. image:: images/mapping-request-run-time.png -Built in default mapping ------------------------ Divolte Collector comes with a built in default schema and mapping. This will map pretty much all of the basics that you would expect from a clickstream data collector. The Avro schema that is used can be found in the `divolte-schema Github repository `_. The following mappings are present in the default mapping: +Built-in default mapping ------------------------ +Divolte Collector comes with a built-in default schema and mapping. A mapping will use these if the mapping schema or script file is not specified. The default mapping will map pretty much all of the basics that you would expect from a clickstream data collector. The Avro schema that is used can be found in the `divolte-schema Github repository `_. The following mappings are present in the default mapping: =============================== ================= Mapped value Avro schema field
This way, you can setup Divolte Collector to do something useful out-of-the-box without any complex configuration. +The default schema is not available as a mapping script. Instead, it is hard coded into Divolte Collector. This allows Divolte Collector to do something useful out-of-the-box without any complex configuration. Schema evolution and default values ----------------------------------- Schema evolution is the process of changing the schema over time as requirements change. For example when a new feature is added to your website, you add additional fields to the schema that contain specific information about user interactions with this new feature. In this scenario, you would update the schema to have these additional fields, update the mapping and then run Divolte Collector with the new schema and mapping. This means that there will be a difference between data that was written prior to the update and data that is written after the update. Also, it means that after the update, there can still be consumers of the data (from HDFS or Kafka) that still use the old schema. In order to make sure that this isn't a problem, the readers with the old schema need to be able to read data written with the new schema and readers with the new schema should also still work on data written with the old schema. -Luckily, Avro supports both of these cases. When reading newer data with an older schema, the fields that are not present in the old schema are simply ignored by the reader. The other way araound is slightly trickier. When reading older data with a new schema, Avro will fill in the default values for fields that are present in the schema but not in the data. *This is provided that there is a default value.* Basically, this means that it is recommended to always provide a default value for all your fields in the schema. In case of nullable fields, the default value could just be null. +Luckily, Avro supports both of these cases. When reading newer data with an older schema, the fields that are not present in the old schema are simply ignored by the reader. The other way around is slightly trickier. When reading older data with a new schema, Avro will fill in the default values for fields that are present in the schema but not in the data. *This is provided that there is a default value.* Basically, this means that it is recommended to always provide a default value for all your fields in the schema. In case of nullable fields, the default value could just be null. One other reason to always provide a default value is that Avro does not allow to create records with missing values if there are no default values. As a result of this, fields that have no default value always must be populated in the mapping, otherwise an error will occur. This is problematic if the mapping for some reason fails to set a field (e.g. because of a user typing in a non-conforming location in the browser). +In addition to introducing new fields with defaults, other forms of changes such as renaming and type changes can be permitted under some circumstances. For full details on the changes that are permitted and how the writing and reading schemas are reconciled refer to the `Avro documentation on schema resolution `_. + Mapping DSL =========== -The mapping is a Groovy script that is compiled and run by Divolte Collector on startup. This script is written in the mapping DSL. The result of this script is a mapping that Divolte Collector can use to map incoming requests onto a Avro schema. 
+Mappings are specified by Groovy scripts that are compiled and run by Divolte Collector on startup. Each mapping script is written in the mapping DSL. The result of running this script is a mapping that Divolte Collector can use to map incoming events from its configured sources onto an Avro schema. Values, fields and mappings --------------------------- -The mapping involves three main concepts: values, fields and mappings. +Mapping involves three main concepts: values, fields and mappings. + +A *value* is something that is extracted from the incoming event (e.g. the location or a HTTP header value) or is derived from another value (e.g. a query parameter from the location URI). Values in the mapping are produced using calls to functions that are built into the mapping DSL. Below is the complete documentation for all values that can be produced. One example of such a function call would be calling :code:`location()` for the location value or :code:`referer()` for the referrer value of the event. -A value is something that is extracted from the incoming request (e.g. the location or a HTTP header value) or is derived from another value (e.g. a query parameter from the location URI). Values in the mapping are produced using method calls to methods that are built into the mapping DSL. Below is the complete documentation for all values that can be produced. One example of such a method call would be calling location() for the location value or referer() for the referer value of the request. +A *field* is a field in the Avro record that will be produced as a result of the mapping process. The type of a field is defined by the Avro schema that is used. Mapping is the process of mapping values extracted from the event onto fields in the Avro record. -A field is a field in the Avro record that will be produced as a result of the mapping process. The type of a field is defined by the Avro schema that is used. Mapping is the process of mapping values extracted from the request onto fields in the Avro record. +A *mapping* is the piece that tells Divolte Collector which values need to be mapped onto which fields. The mapping DSL has a built in construct for this, explained below. -A mapping is the piece that tells Divolte Collector which values need to be mapped onto which fields. The mapping DSL has a built in construct for this, explained below. +Mapping values onto fields (:code:`map`) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The simplest possible mapping is mapping a simple value onto a schema field. The syntax is as follows: -Mapping values onto fields (map) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The simplest possible mapping is mapping a simple value onto a schema field. The syntax is as follows:: +.. code-block:: groovy map location() onto 'locationField' -Alternatively, the map methods takes a closure as first argument, which can come in handy when the value is the result of several operations or a more complex construct, such as this example where we take a query parameter form the location and parse it to an int:: +Alternatively, the :code:`map` function takes a closure as first argument, which can come in handy when the value is the result of several operations or a more complex construct, such as this example where we take a query parameter from the location and parse it as an integer: + +.. 
code-block:: groovy map { def u = parse location() to uri // Parse the URI out of the location parse u.query().value('n') to int32 // Take the n query parameter and try to parse an int out of it } onto 'intField' -In Groovy, the last statement in a closure becomes the return value for the closure. So in the closure above, the value returned by the parse call is the result of the entire closure. This is in turn mapped onto the 'intField' field of the Avro record. +In Groovy, the last statement in a closure becomes the return value for the closure. So in the closure above, the value returned by the :code:`parse` call is the result of the entire closure. This is in turn mapped onto the :code:`intField` field of the Avro record. -Apart from mapping values onto fields, it is also possible to map a literal onto a field:: +Apart from mapping values onto fields, it is also possible to map a literal onto a field: + +.. code-block:: groovy map 'string literal' onto 'stringField' map true onto 'booleanField' -This is most often used in combination with `Conditional mapping (when)`_, like in this example:: +This is most often used in combination with `Conditional mapping (when)`_ as in this example: + +.. code-block:: groovy - when referer().isAbsent() apply { // Only apply this mapping when a referer is absent + when referer().isAbsent() apply { // Only apply this mapping when a referer is absent map true onto 'directTraffic' } Value presence and nulls """""""""""""""""""""""" -Not all values are present in each request. For example when using a custom cookie value, there could be incoming requests where the cookie is not sent by the client. In this case, the cookie value is said to absent. Divolte Collector will never actively set a null value. Instead for absent values it does nothing at all; i.e. the mapped field is not set on the Avro record. When values that are absent are used in subsequent constructs, the resulting values will also be absent. In the following example, if the incoming request has no referrer, the field 'intField' will never be set, but no error occurs:: +Not all values are present in each event. For example, when using a custom cookie value there could be incoming events where the cookie is not sent by the client. In this case the cookie value is said to be absent. Divolte Collector will never actively set a null value. Instead, for absent values it does nothing at all: the mapped field is not set on the Avro record. When values that are absent are used in subsequent expressions, the derived values will also be absent. In the following example the :code:`intField` field will never be set because the incoming request has no referrer. This is not an error: + +.. code-block:: groovy def u = parse referer() to uri // parse a URI out of the referer def q = u.query() // parse the query string of the URI def i = parse q.value('foo') to int32 // parse a int out of the query parameter 'foo' map i onto 'intField' // map it onto the field 'intField' -Because absent values result in fields not being set, your schema must have default values for all fields that are used for mappings where the value can be absent. In practice, it is recommended to always use default values for all fields in your schema. +Because absent values result in fields not being set, your schema must have default values for all fields that are used for mappings where the value can be absent. In practice, it is recommended to always use default values for all fields in your schema.
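+If you prefer to make the presence of a value explicit rather than relying on absent values propagating, the same kind of mapping can be wrapped in a conditional. This is only a sketch: the ``utm_source`` query parameter and the ``utmSource`` field are hypothetical, and the field would need to exist in your schema as a nullable string with a default.
+
+.. code-block:: groovy
+
+  when referer().isPresent() apply {                    // Only apply when a referer was sent.
+    def u = parse referer() to uri                      // Parse the referer into a URI.
+    map u.query().value('utm_source') onto 'utmSource'  // An absent query parameter simply leaves the field unset.
+  }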
Types ^^^^^ -Values in the mapping are typed and the value type must match the type of the Avro field that they are mapped onto. Divolte Collector checks the type compatibility during startup and will report an error if there is a mismatch. The type for a value can be found in the documentation below. - -Below is a table of all types that can be produced in a mapping and the corresponding Avro schema's that match them: - -+----------------------------+------------------------------------------------------------------------+ -| type | Avro type | -+============================+========================================================================+ -| string | :: | -| | | -| | { "name": "fieldName", "type": ["null","string"], "default": null } | -+----------------------------+------------------------------------------------------------------------+ -| boolean | :: | -| | | -| | { "name": "fieldName", "type": ["null","boolean"], "default": null } | -+----------------------------+------------------------------------------------------------------------+ -| int | :: | -| | | -| | { "name": "fieldName", "type": ["null","int"], "default": null } | -+----------------------------+------------------------------------------------------------------------+ -| long | :: | -| | | -| | { "name": "fieldName", "type": ["null","long"], "default": null } | -+----------------------------+------------------------------------------------------------------------+ -| float | :: | -| | | -| | { "name": "fieldName", "type": ["null","float"], "default": null } | -+----------------------------+------------------------------------------------------------------------+ -| double | :: | -| | | -| | { "name": "fieldName", "type": ["null","double"], "default": null } | -+----------------------------+------------------------------------------------------------------------+ -| map> | :: | -| | | -| | { | -| | "name": "fieldName", | -| | "type": [ | -| | "null", | -| | { | -| | "type": "map", | -| | "values": { | -| | "type": "array", | -| | "items": "string" | -| | } | -| | } | -| | ], | -| | "default": null | -| | } | -+----------------------------+------------------------------------------------------------------------+ -| list | :: | -| | | -| | { | -| | "name": "fieldName", | -| | "type": | -| | [ | -| | "null", | -| | { | -| | "type": "array", | -| | "items": "int" | -| | } | -| | ], | -| | "default": null | -| | } | -+----------------------------+------------------------------------------------------------------------+ -| JSON (JsonNode) | _Must match the structure of the JSON fragment._ | -| | _See :ref:`mapping-json-label`._ | -+----------------------------+------------------------------------------------------------------------+ - -Casting / parsing -""""""""""""""""" -Many of the simple values that can be extracted from a request are strings. Possibly, these values are not intended to be strings. Because type information about things like query parameters or path components is lost in a HTTP request, Divolte Collector can only treat these as strings. It is, however, possible to parse string to other primitive or other types in the mapping using this construct:: +Values in a mapping are typed and the value type must match the type of the Avro field that they are mapped onto. Divolte Collector checks for type compatibility during startup and will report an error if there is a mismatch. The type for a value can be found in the documentation below. 
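+As a short illustration of this pairing, the sketch below produces an :code:`int` value, so the field it is mapped onto must be declared in the schema with the matching Avro ``int`` type (see the table below). The ``page`` query parameter and the ``pageNumber`` field are hypothetical examples.
+
+.. code-block:: groovy
+
+  def u = parse location() to uri                                    // The location as a URI.
+  map { parse u.query().value('page') to int32 } onto 'pageNumber'   // int value, so 'pageNumber' must be an Avro int field.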
+ +Below is a table of all types that can be produced in a mapping and the corresponding Avro types that match them: + ++----------------------------------+------------------------------------------------------------------------+ +| Type | Avro type | ++==================================+========================================================================+ +| :code:`String` | .. code-block:: json | +| | | +| | { "name": "fieldName", "type": ["null","string"], "default": null } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`Boolean` | .. code-block:: json | +| | | +| | { "name": "fieldName", "type": ["null","boolean"], "default": null } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`int` | .. code-block:: json | +| | | +| | { "name": "fieldName", "type": ["null","int"], "default": null } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`long` | .. code-block:: json | +| | | +| | { "name": "fieldName", "type": ["null","long"], "default": null } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`float` | .. code-block:: json | +| | | +| | { "name": "fieldName", "type": ["null","float"], "default": null } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`double` | .. code-block:: json | +| | | +| | { "name": "fieldName", "type": ["null","double"], "default": null } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`Map>` | .. code-block:: json | +| | | +| | { | +| | "name": "fieldName", | +| | "type": [ | +| | "null", | +| | { | +| | "type": "map", | +| | "values": { | +| | "type": "array", | +| | "items": "string" | +| | } | +| | } | +| | ], | +| | "default": null | +| | } | ++----------------------------------+------------------------------------------------------------------------+ +| :code:`List` | .. code-block:: json | +| | | +| | { | +| | "name": "fieldName", | +| | "type": | +| | [ | +| | "null", | +| | { | +| | "type": "array", | +| | "items": "int" | +| | } | +| | ], | +| | "default": null | +| | } | ++----------------------------------+------------------------------------------------------------------------+ +| JSON (:code:`JsonNode`) | Must match the structure of the JSON fragment. | +| | See :ref:`mapping-json-label`. | ++----------------------------------+------------------------------------------------------------------------+ + +Casting/parsing +""""""""""""""" +Many of the simple values that can be extracted from an event are strings. Sometimes these values are not intended to be strings. Because type information about things like query parameters or path components is not present in a HTTP request, Divolte Collector can only treat these values as strings. It is, however, possible to parse a string to a primitive or other type in the mapping using this construct: + +.. code-block:: groovy def i = parse stringValue to int32 -In the example above, stringValue is a value of type string and the result value, assigned to i, will be of type int. *Note that this is not casting, but string parsing. 
When the string value cannot be parsed to an int (because it is not a number), then the resulting value will be absent, but no error occurs.* +In the example above, :code:`stringValue` is a string value and the result value, assigned to :code:`i`, will be of type :code:`int`. + +.. note:: + + This is not casting, but string parsing. If the string value cannot be parsed to an integer (because it is not a number) the resulting value will be absent, but no error occurs. + +A more complete example is this: -A more complete example is this:: +.. code-block:: groovy def u = parse referer() to uri // u is of type URI (which is not mappable) def q = u.query() // q is of type map> @@ -199,75 +219,77 @@ A more complete example is this:: def i = parse s to int32 // i is of type int map i onto 'intField' // map it onto the field 'intField' -Because int, long, boolean, etc. are reserved words in Groovy, the mapping DSL uses aliases for casting. These are all the type that can be used for parsing and the corresponding mapping type: +Because :code:`int`, :code:`long`, :code:`Boolean`, etc. are reserved words in Groovy, the mapping DSL uses aliases for parsing. The following table lists the types that can be used for parsing and the corresponding mapping types: +-------------------+-------------------+ -| parsing alias | type | +| Parsing alias | Type | +===================+===================+ -| int32 | int | +| :code:`int32` | :code:`int` | +-------------------+-------------------+ -| int64 | long | +| :code:`int64` | :code:`long` | +-------------------+-------------------+ -| fp32 | float | +| :code:`fp32` | :code:`float` | +-------------------+-------------------+ -| fp64 | double | +| :code:`fp64` | :code:`double` | +-------------------+-------------------+ -| bool | boolean | +| :code:`bool` | :code:`Boolean` | +-------------------+-------------------+ -| uri | `URI`_ | +| :code:`uri` | :code:`URI` | +-------------------+-------------------+ .. _mapping-json-label: -Mapping JSON (``JsonNode``) to Avro fields -"""""""""""""""""""""""""""""""""""""""""" - -Some expressions, for example, ``eventParameters()`` (and its ``path()`` method), produce a ``JsonNode`` value that represents JSON supplied by a client. Because Avro doesn't have a type built in to handle arbitrary JSON data, a *compatible* Avro type must be chosen to match the expected structure of the JSON from the client. The following table lists the rules for compatibility between JSON values and Avro types. - -+---------------+-------------------------------------------------------------------------+ -| Avro type | JSON value | -+===============+=========================================================================+ -| | ``null`` | JSON's ``null`` value | -+---------------+-------------------------------------------------------------------------+ -| | ``boolean`` | A JSON boolean, or a string if it can be parsed as a boolean. | -+---------------+-------------------------------------------------------------------------+ -| | ``int`` | A JSON number, or a string if it can be parsed as a number. | -| | ``long`` | Fractional components are truncated for ``float`` and ``double``. | -+---------------+-------------------------------------------------------------------------+ -| | ``float`` | A JSON number, or a string if it can be parsed as a number. | -| | ``double`` | Note that full floating-point precision may not be preserved. 
| -+---------------+-------------------------------------------------------------------------+ -| | ``bytes`` | A JSON string, with BASE64 encoded binary data. | -+---------------+-------------------------------------------------------------------------+ -| | ``string`` | A JSON string, number or boolean value. | -+---------------+-------------------------------------------------------------------------+ -| | ``enum`` | A JSON string, so long as the it's identical to one of the | -| | enumeration's symbols. (If not, the value will be treated as null.) | -+---------------+-------------------------------------------------------------------------+ -| | ``record`` | A JSON object, with each property corresponding to a field in the | -| | record. (Extraneous properties are ignored.) The property values and | -| | field types must also be compatible. | -+---------------+-------------------------------------------------------------------------+ -| | ``array`` | A JSON array. Each element of the JSON array must be compatible with | -| | the type declared for the Avro array. | -+---------------+-------------------------------------------------------------------------+ -| | ``map`` | A JSON object, with each property being an entry in the map. Property | -| | names are used for keys, and the values must be compatible with the | -| | Avro type for the map values. | -+---------------+-------------------------------------------------------------------------+ -| | ``union`` | Only trivial unions are supported of ``null`` with another type. The | -| | JSON value must either be null or compatible with the other union type. | -+---------------+-------------------------------------------------------------------------+ -| | ``fixed`` | The same as ``bytes``, as above. Data beyond the declared length will | -| | be truncated. | -+---------------+-------------------------------------------------------------------------+ +Mapping JSON (:code:`JsonNode`) to Avro fields +"""""""""""""""""""""""""""""""""""""""""""""" + +Some expressions, for example, :code:`eventParameters()` (and its :code:`path()` method), produce a :code:`JsonNode` value that represents JSON supplied by a client. Because Avro doesn't have a type for handling arbitrary JSON data, a *compatible* Avro type must be chosen to match the expected structure of the JSON from the client. The following table lists the rules for compatibility between JSON values and Avro types. + ++-------------------+---------------------------------------------------------------------------+ +| Avro type | JSON value | ++===================+===========================================================================+ +| | :code:`null` | JSON's :code:`null` value | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`boolean` | A JSON boolean, or a string if it can be parsed as a boolean. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`int` | A JSON number, or a string if it can be parsed as a number. | +| | :code:`long` | Fractional components are truncated for :code:`float` and :code:`double`. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`float` | A JSON number, or a string if it can be parsed as a number. | +| | :code:`double` | Note that full floating-point precision may not be preserved. 
| ++-------------------+---------------------------------------------------------------------------+ +| | :code:`bytes` | A JSON string, with BASE64 encoded binary data. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`string` | A JSON string, number or boolean value. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`enum` | A JSON string, so long as the it's identical to one of the enumeration's | +| | symbols. (If not, the value will be treated as :code:`null`. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`record` | A JSON object, with each property corresponding to a field in the record. | +| | (Extraneous properties are ignored.) The property values and field types | +| | must also be compatible. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`array` | A JSON array. Each element of the JSON array must be compatible with the | +| | type declared for the Avro array. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`map` | A JSON object, with each property being an entry in the map. Property | +| | names are used for keys, and the values must be compatible with the Avro | +| | type for the map values. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`union` | Only trivial unions are supported of :code:`null` with another type. The | +| | JSON value must either be null or compatible with the other union type. | ++-------------------+---------------------------------------------------------------------------+ +| | :code:`fixed` | The same as :code:`bytes`, as above. Data beyond the declared length will | +| | be truncated. | ++-------------------+---------------------------------------------------------------------------+ In addition to these compatibility rules, trivial array wrapping and unwrapping will be performed if necessary: * If the Avro type specifies an array, any JSON value compatible with the type of the array elements will be wrapped as a single-element array. * If the Avro type is not an array, a JSON array containing a single element that is compatible will be unwrapped. -For example, a shopping basket could be supplied as the following JSON:: +For example, a shopping basket could be supplied as the following JSON: + +.. code-block:: json { "total_price": 184.91, @@ -279,7 +301,9 @@ For example, a shopping basket could be supplied as the following JSON:: ] } -This could be mapped using the following Avro schema:: +This could be mapped using the following Avro schema: + +.. code-block:: json { "type": [ @@ -317,24 +341,30 @@ The Avro field will remain unchanged if mapping fails at runtime because the JSO Unlike most mappings, schema compatibility for JSON mappings cannot be checked on startup because compatibility depends on the JSON supplied with each individual event. -Conditional mapping (when) -^^^^^^^^^^^^^^^^^^^^^^^^^^ -Not all incoming requests are the same and usually, different types of requests require different values to be extracted and different fields to be set. This can be achieved using conditional mapping. With conditional mapping any boolean value can be used to conditionally apply a part of the mapping script. 
This can be done using the following syntax:: +Conditional mapping (:code:`when`) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Not all incoming requests are the same and usually, different types of requests require different values to be extracted and different fields to be set. This can be achieved using conditional mapping. With conditional mapping any boolean value can be used to conditionally apply a part of the mapping script. This can be done using the following syntax: + +.. code-block:: groovy when conditionBooleanValue apply { // Conditional mapping go here map 'value' onto 'fieldName' } -A more concrete example of using this construct would be:: +A more concrete example of using this construct would be: + +.. code-block:: groovy when referer().isAbsent() apply { map true onto 'directTraffic' } -Here we check whether the referrer value is absent and if so, map a literal value onto a boolean field. +Here we check whether the referer value is absent and if so, map a literal value onto a boolean field. -As an alternative syntax, it is possible to use a closure that produces the boolean value as well, just like in `Mapping values onto fields (map)`_. In this example we check if a query parameter called clientId is present in the location and on that condition perform a mapping:: +As an alternative syntax, it is possible to use a closure that produces the boolean value as well, just like in `Mapping values onto fields (map)`_. In this example we check if a query parameter called :code:`clientId` is present in the location and on that condition perform a mapping: + +.. code-block:: groovy when { def u = parse location() to uri @@ -345,42 +375,46 @@ As an alternative syntax, it is possible to use a closure that produces the bool Conditions """""""""" -Any boolean value can be used as a condition. In order to be able to create flexible conditional mappings, the mapping DSL provides a number of methods on values to produce booleans that are useful in conditional mappings, such as equality comparisons and boolean logic: - -+------------------------------------------------+----------------------------------------------------------------+ -| Condition | Description | -+================================================+================================================================+ -| value.isPresent() | True if the value is present. See: `Value presence and nulls`_ | -+------------------------------------------------+----------------------------------------------------------------+ -| value.isAbsent() | True if the value is absent. See: `Value presence and nulls`_ | -+------------------------------------------------+----------------------------------------------------------------+ -| value.equalTo(otherValue) | True if both values are equal. Values must be of the same type.| -+------------------------------------------------+----------------------------------------------------------------+ -| value.equalTo('literal') | True if the value is equal to the given literal. Types other | -| | than string are supported as well. | -+------------------------------------------------+----------------------------------------------------------------+ -| booleanValue.and(otherBooleanValue) | True if booleanValue AND otherBooleanValue are true. | -+------------------------------------------------+----------------------------------------------------------------+ -| booleanValue.or(otherBooleanValue) | True if booleanValue OR otherBooleanValue or both are true. 
| -+------------------------------------------------+----------------------------------------------------------------+ -| not booleanValue | True if booleanValue is false. | -+------------------------------------------------+----------------------------------------------------------------+ -| regexMatcherValue.matches() | True if the regex matches the value. See: | -| | `Regular expression matching`_. | -+------------------------------------------------+----------------------------------------------------------------+ - -Sections and short circuit -^^^^^^^^^^^^^^^^^^^^^^^^^^ -Sections are useful for grouping together parts of the mapping that somehow form a logical subset of the entire mapping. This makes it possible to conditionally jump out of a section as well. To define a section, just use the section keyword followed by a closure that contains the section:: +Any boolean value can be used as a condition. In order to be able to create flexible conditional mappings, the mapping DSL provides a number of methods on values that return booleans useful in conditional mappings, such as equality comparisons and boolean logic: + ++-------------------------------------------------+----------------------------------------------------------------+ +| Condition | Description | ++=================================================+================================================================+ +| :samp:`{value}.isPresent()` | True if the value is present. See: `Value presence and nulls`_ | ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`{value}.isAbsent()` | True if the value is absent. See: `Value presence and nulls`_ | ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`{value}.equalTo({otherValue})` | True if both values are equal. Values must be of the same type.| ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`{value}.equalTo({'literal'})` | True if the value is equal to the given literal. Non-string | +| | types are supported as well. | ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`{booleanValue}.and({otherBooleanValue})` | True if both booleans are true. | ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`{booleanValue}.or({otherBooleanValue})` | True if either or both of the boolean values are true. | ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`not {booleanValue}` | True if the boolean value is false. | ++-------------------------------------------------+----------------------------------------------------------------+ +| :samp:`{regexMatcherValue}.matches()` | True if the regular expression matches the value. See: | +| | `Regular expression matching`_. | ++-------------------------------------------------+----------------------------------------------------------------+ + +Sections and short circuiting +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Sections are useful for grouping together parts of the mapping that form a logical subset of the entire mapping. In addition to grouping it is possible to conditionally stop processing a section prematurely. 
Sections are defined using the :code:`section` keyword followed by a closure that contains the section: + +.. code-block:: groovy section { // Section's mappings go here map 'value' onto 'field' } -exit -"""" -The exit() method will, at any point, break out of the enclosing section or, when no enclosing section can be found, break out of the entire mapping script. This can be used to conditionally break out of a section, for example to create a type of first-match-wins scenario:: +Function: :code:`exit()` +"""""""""""""""""""""""" +The :code:`exit()` function will, at any point, break out of the enclosing section or, when no enclosing section can be found, break out of the entire mapping script. This can be used to conditionally break out of a section. For example to create a type of first-match-wins scenario: + +.. code-block:: groovy section { def u = parse location() to uri @@ -400,63 +434,82 @@ The exit() method will, at any point, break out of the enclosing section or, whe // other mappings here -There is a optional shorthand syntax for conditionally exiting from a section, which leaves out the apply keyword and closure like this:: +There is a optional shorthand syntax for conditionally exiting from a section which leaves out the :code:`apply` keyword and closure: + +.. code-block:: groovy when referer().isAbsent() exit() -stop -"""" -The stop() method will, at any point, stop *all* further processing and break out of the entire mapping script. This is typically applied conditionally. Generally, it is safer to use sections and exit() instead. Use with care. The stop() method can also be used conditionally, just as anything else:: +Function: :code:`stop()` +"""""""""""""""""""""""" +The :code:`stop()` function will, at any point, stop *all* further processing and break out of the entire mapping script. This is typically applied conditionally. Generally, it is safer to use sections and :code:`exit()` instead. Use with care. The :code:`stop()` function can also be used conditionally, just as anything else: + +.. code-block:: groovy when referer().isAbsent() { stop() } -Or, using shorthand syntax:: +Or, using shorthand syntax: + +.. code-block:: groovy when referer().isAbsent stop() A word on groovy ---------------- -Groovy is a dynamic language for the JVM. This means, amongst other things, that you don't have to specify the types of variables:: +Groovy is a dynamic language for the JVM. This means, amongst other things, that you don't have to specify the types of variables: + +.. code-block:: groovy def i = 40 println i + 2 -The above snippet will print out 42 as you would expect. Note two things: we never specified that variable i is an int and also, we are not using any parenthese in the println method call. Groovy allows to leave out the parentheses in most method calls. The code above is equal to this snippet:: +The above snippet will print out 42 as you would expect. Note two things: we never specified that variable i is an int and also, we are not using any parentheses in the :code:`println` function call. Groovy allows to leave out the parentheses in most function and method calls. The code above is equivalent to this snippet: + +.. code-block:: groovy def i = 42 println(i + 2) -Which in turn is equals to this:: +This in turn is equivalent to this: + + +.. code-block:: groovy def i = 42 println(i.plus(2)) -When chaining single argument methods, this works out well. However, with nested method calls, this can be more problematic. 
Let's say we have a method called increment which increments the argument by one; so increment(10) will return 11. For example the following will not compile:: +This works well when chaining single argument methods. However, this can be more problematic with nested method calls. Suppose we have a function called :samp:`increment({x})` which increments the :code:`x` argument by 1, so :code:`increment(10)` will return 11. The following will not compile: + +.. code-block:: groovy println increment 10 -But this will:: +However this will: + +.. code-block:: groovy println(increment(10)) -And this won't:: +Yet this won't: + +.. code-block:: groovy println(increment 10) -In the Divolte Collector mapping DSL, it is sometimes required to chain method calls. For example when using the result of a casting operation in a mapping. We solve this by accepting a closure that produces a value as result:: +In the Divolte Collector mapping DSL, it is sometimes required to chain method calls. For example when using the result of a casting operation in a mapping. We solve this by accepting a closure that produces a value as result: - map { parse cookie('customer_id') to int32 } onto 'customerId' +.. code-block:: groovy -This way, you don't have to add parentheses to all intermediate method calls and we keep the syntax fluent. If you follow these general guidelines, you should be safe: + map { parse cookie('customer_id') to int32 } onto 'customerId' -* When calling methods that produce a value, always use parentheses. For example: location(), referer(), partyId() -* When deriving a condition or other value from a method that produces a value, also use parenthese. Example: +This way you don't have to add parentheses to all intermediate method calls and we keep the syntax fluent. If you follow these general guidelines, you should be safe: - .. +* When calling methods that produce a value, always use parentheses. For example: :code:`location()`, :code:`referer()`, :code:`partyId()` +* When deriving a condition or other value from a method that produces a value, also use parentheses. For example: - :: + .. code-block:: groovy when location().equalTo('http://www.example.com/') apply { ... @@ -466,13 +519,9 @@ This way, you don't have to add parentheses to all intermediate method calls and map parsedUri.query().value('foo') onto 'field' - .. - * When parsing or matching on something, extract it to a variable before using it. This also improves readability: - .. - - :: + .. code-block:: groovy def myUri = parse location() to uri when myUri.query().value('foo').isPresent() apply { ... } @@ -480,95 +529,103 @@ This way, you don't have to add parentheses to all intermediate method calls and def myMatcher = match '^/foo/bar/([a-z]+)/' against myUri.path() when myMatcher.matches() apply { ... } - .. - * When casting inline, use the closure syntax for mapping or conditionals: - .. - - :: + .. code-block:: groovy map { parse cookie('example') to int32 } onto 'field' Simple values ^^^^^^^^^^^^^ -Simple values are pieces of information that are directly extracted from the request without any processing. You can map simple values directly onto fields of the correct type or you can use them in further processing, such as regex matching and extraction or URI parsing. +Simple values are pieces of information that are directly extracted from the event without any processing. 
You can map simple values directly onto fields of the correct type or you can use them in further processing, such as matching against a regular expression or URI parsing. + +.. _location: -location -"""""""" +Simple value: :code:`location()` +"""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map location() onto 'locationField' :Description: - The location of this request: the full address in the address bar of the user's browser, including the fragment part if this is present (the part after the #). This is different from server side request logs, which will not be able to catch the fragment part. + The location URL for the page-view that triggered the event: the full address in the address bar of the user's browser. This includes the fragment part if this is present (the part after the ``#``), which is different from server side request logs which do not contain the fragment part. :Type: - string + :code:`string` -referer -""""""" +.. _referer: + +Simple value: :code:`referer()` +""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map referer() onto 'refererField' :Description: - The referer of this request. Note that the referer is taken from JavaScript and does not depend on any headers being sent by the browser. The referer will not contain any fragment part that might have been present in the user's address bar. + The referrer URL for the page-view that triggered the event. Unlike :code:`location()`, the referer will not contain any fragment part. :Type: - string + :code:`String` + +.. _firstInSession: -firstInSession -"""""""""""""" +Simple value: :code:`firstInSession()` +"""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map firstInSession() onto 'first' :Description: - A boolean flag that is set to true if a new session ID was generated for this request and false otherwise. A value of true indicates that a new session has started. + A boolean flag that is true if a new session ID was generated for this event and false otherwise. If true a new session has started. :Type: - boolean + :code:`Boolean` + +.. _corrupt: -corrupt -""""""" +Simple value: :code:`corrupt()` +""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map corrupt() onto 'detectedCorruption' :Description: - A boolean flag that is set to true when the request checksum does not match the request contents and false otherwise. Whenever a the JavaScript performs a request, it calculates a hash code of all request properties and adds this hash code at the end of the request. On the server side, this hash is calculated again and checked for correctness. Corrupt requests usually occur when intermediate parties try to re-write requests or truncate long URLs (e.g. proxies and anti-virus software can have this habit). + A boolean flag that is true if the source for the event detected corruption of the event data. Event corruption usually occurs when intermediate parties try to re-write HTTP requests or truncate long URLs. Real-world proxies and anti-virus software have been observed doing this. :Type: - boolean + :code:`Boolean` -duplicate -""""""""" +.. _duplicate: + +Simple value: :code:`duplicate()` +""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map duplicate() onto 'detectedDuplicate' :Description: - A boolean flag that is set to true when the request is believed to be duplicated and false otherwise.
Duplicate detection in Divolte Collector utilizes a probabilistic data structure that has a low false positive and false negative rate. Nonetheless, these can still occur. Duplicate requests are often performed by certain types of anti-virus software and certain proxies. Additionally, sometimes certain browsers go haywire and send the same request large numbers of times (in the tens of thousands). The duplicate flag server as a line of defense against this phenomenon, which is particularly handy in real-time processing where it is not practical to perform de-duplication of the data based on a full data scan. + A boolean flag that is true when the event is believed to be a duplicate of an earlier one. Duplicate detection in Divolte Collector utilizes a probabilistic data structure that has a low false positive and false negative rate. Nonetheless classification mistakes can still occur. Duplicate events often arrive due to certain types of anti-virus software and certain proxies. Additionally, browsers sometimes go haywire and send the same request large numbers of times (in the tens of thousands). Duplicate detection can be used to mitigate the effects when this occurs. This is particularly handy in real-time processing where it is not practical to perform de-duplication of the data based on a full data scan. :Type: - boolean + :code:`Boolean` + +.. _timestamp: -timestamp -""""""""" +Simple value: :code:`timestamp()` +""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map timestamp() onto 'timeField' @@ -576,13 +633,15 @@ timestamp The timestamp of the time the the request was received by the server, in milliseconds since the UNIX epoch. :Type: - long + :code:`long` -clientTimestamp -""""""""""""""" +.. _clientTimestamp: + +Simple value: :code:`clientTimestamp()` +""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map clientTimestamp() onto 'timeField' @@ -590,27 +649,31 @@ clientTimestamp The timestamp that was recorded on the client side immediately prior to sending the request, in milliseconds since the UNIX epoch. :Type: - long + :code:`long` -remoteHost -"""""""""" +.. _remoteHost: + +Simple value: :code:`remoteHost()` +"""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map remoteHost() onto 'ipAddressField' :Description: - The remote IP address of the request. Depending on configuration, Divolte Collector will use any X-Forwarded-For headers set by intermediate proxies or load balancers. + The remote IP address of the request. Depending on configuration, Divolte Collector will use any :mailheader:`X-Forwarded-For` headers set by intermediate proxies or load balancers. :Type: - string + :code:`String` -viewportPixelWidth -"""""""""""""""""" +.. _viewportPixelWidth: + +Simple value: :code:`viewportPixelWidth()` +"""""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map viewportPixelWidth() onto 'widthField' @@ -618,13 +681,15 @@ viewportPixelWidth The width of the client's browser viewport in pixels. :Type: - int + :code:`int` + +.. _viewportPixelHeight: -viewportPixelHeight -""""""""""""""""""" +Simple value: :code:`viewportPixelHeight()` +""""""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map viewportPixelHeight() onto 'widthField' @@ -632,13 +697,15 @@ viewportPixelHeight The height of the client's browser viewport in pixels. :Type: - int + :code:`int` -screenPixelWidth -"""""""""""""""" +.. 
_screenPixelWidth: + +Simple value: :code:`screenPixelWidth()` +"""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map screenPixelWidth() onto 'widthField' @@ -646,13 +713,15 @@ screenPixelWidth The width of the client's screen in pixels. :Type: - int + :code:`int` + +.. _screenPixelHeight: -screenPixelHeight -""""""""""""""""" +Simple value: :code:`screenPixelHeight()` +""""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map screenPixelHeight() onto 'widthField' @@ -660,13 +729,15 @@ screenPixelHeight The height of the client's screen in pixels. :Type: - int + :code:`int` + +.. _devicePixelRatio: -devicePixelRatio -"""""""""""""""" +Simple value: :code:`devicePixelRatio()` +"""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map devicePixelRatio() onto 'ratioField' @@ -674,115 +745,140 @@ devicePixelRatio The ratio of physical pixels to logical pixels on the client's device. Some devices use a scaled resolution, meaning that the resolution and the actual available pixels are different. This is common on retina-type displays, with very high pixel density. :Type: - int + :code:`int` -partyId -""""""" +.. _partyId: + +Simple value: :code:`partyId()` +""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map partyId() onto 'partyField' :Description: - A unique identifier stored with the client in a long lived cookie. The party ID identifies a known device. + A long-lived unique identifier stored by a client that is associated with each event from that source. All events from the same client should have the same party identifier. + + For browser sources this value is stored in a cookie. :Type: - string + :code:`String` + +.. _sessionId: -sessionId -""""""""" +Simple value: :code:`sessionId()` +""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map sessionId() onto 'sessionField' :Description: - A unique identifier stored with the client in a cookie that is set to expire after a fixed amount of time (default: 30 minutes). Each new request resets the session expiry time, which means that a new session will start after the session timeout has passed without any activity. + A short-lived unique identifier stored by a client that is associated with each event from that source within a session of activity. All events from the same client within a session should have the same session identifier. + + For browser sources a session expires when 30 minutes have elapsed without any events occurring. :Type: - string + :code:`String` -pageViewId -"""""""""" +.. _pageViewId: + +Simple value: :code:`pageViewId()` +"""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map pageViewId() onto 'pageviewField' :Description: - A unique identifier that is generated for each pageview request. + A unique identifier that is generated for each page-view. All events from a client within the same page-view will have the same page-view identifier. + + For browser sources a page-view starts when the user visits a page, and ends when the user navigates to a new page. Note that navigating within single-page web applications or links to anchors within the same page do *not* normally trigger a new page-view. :Type: - string + :code:`String` + +.. _eventId: -eventId -""""""" +Simple value: :code:`eventId()` +""""""""""""""""""""""""""""""" :Usage: - :: + .. 
code-block:: groovy map eventId() onto 'eventField' :Description: - A unique identifier that is created for each event that is fired by taking the pageViewId and appending a monotonically increasing number to it. + A unique identifier that is associated with each event received from a source. (This identifier is assigned by the client, not by the server.) :Type: - string + :code:`String` -userAgentString -""""""""""""""" +.. _userAgentString: + +Simple value: :code:`userAgentString()` +""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map userAgentString() onto 'uaField' :Description: - The full user agent identification string as reported by the client's browser. See `User agent parsing`_ on how to extract more meaningful information from this string. + The full user agent identification string reported by the client HTTP headers when sending an event. + + See `User agent parsing`_ on how to extract more meaningful information from this string. :Type: - string + :code:`String` + +.. _cookie: -cookie -"""""" +Simple value: :samp:`cookie({name})` +"""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map cookie('cookie_name') onto 'customCookieField' :Description: - The value for a cookie that was sent by the client's browser in the request. + The value of a cookie included in the client HTTP headers when sending an event. :Type: - string + :code:`String` -eventType -""""""""" +.. _eventType: + +Simple value: :code:`eventType()` +""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map eventType() onto 'eventTypeField' :Description: - The type of event that was captured in this request. This defaults to 'pageView', but can be overridden when custom events are fired from JavaScript within a page. + The type of event being processed. + + The tracking tag used by sites integrating with browser sources automatically issues a :code:`pageView` event by default + when a page-view commences. Custom events may set this value to anything they like. :Type: - string + :code:`String` Complex values ^^^^^^^^^^^^^^ -Complex values return objects that you can in turn use to extract derived, simple values from. Complex values are either the result of parsing something (e.g. the user agent string) or matching regular expressions against another value. +Complex values often return intermediate objects from which you extract derived, simple values for mapping onto fields. The main exception to this is when working with event-parameters: the :code:`JsonNode` results can be mapped directly to fields, so long as they are of the right 'shape'; see :ref:`mapping-json-label` for more details. -eventParameters -""""""""""""""" +Complex value: :code:`eventParameters()` +"""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy // on the client in JavaScript: divolte.signal('myEvent', { foo: 'hello', bar: 42 }); @@ -791,40 +887,46 @@ eventParameters map eventParameters() onto 'parametersField' :Description: - A JSON object (``JsonNode``) containing the custom parameters that were submitted with + A JSON object or array (:code:`JsonNode`) containing the custom parameters that were submitted with the event. See :ref:`mapping-json-label` for an example on how to map this to a field.
:Type: - JsonNode + :code:`JsonNode` -eventParameters value -""""""""""""""""""""" +Derived simple value: :samp:`eventParameters().value({name})` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + On a site submitting events to a browser source: + + .. code-block:: javascript - // On the client in JavaScript: divolte.signal('myEvent', { foo: 'hello', bar: 42 }); - // In the mapping: + In the mapping: + + .. code-block:: groovy + map eventParameters().value('foo') onto 'fooField' // Or with a cast: map { parse eventParameters().value('bar') to int32 } onto 'barField' :Description: - The value for a parameter that was sent as part of a custom event from JavaScript. Note that this is always a string, regardless of the type used on the client side. In the case that you are certain a parameter has a specific type, you can explicitly cast it as in the example above. + The value for an event parameter that was sent as part of a custom event. Note that this is always a string, regardless of the type used on the client side. If you are certain a parameter has a specific format you can explicitly cast it as in the example above. :Type: - string + :code:`String` -eventParameters path -"""""""""""""""""""" +Derived complex value: :samp:`eventParameters().path({expression})` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + On a site submitting events to a browser source: + + .. code-block:: javascript // On the client in JavaScript: divolte.signal('searchResults', [ @@ -832,60 +934,66 @@ eventParameters path { "sku": "0094638246817", "score": 0.8 } ]); - // In the Avro schema: + In the Avro schema: + + .. code-block:: json + { "name": "searchResults", "type": [ "null", { "type": "array", "items": "string" } ], "default": null } - // In the mapping: + In the mapping: + + .. code-block:: groovy + map eventParameters().path('$[*].sku') onto 'searchResults' :Description: This can be used to extract parts of parameters supplied with the event using a JSON-path expression. (See http://goessner.net/articles/JsonPath/ for a description of JSON-path expressions.) - If the expression does not match anything, the value is not considered to be present. (A ``when`` expression can test for this.) + If the expression does not match anything, the value is not considered to be present. (A :code:`when` expression can test for this.) See :ref:`mapping-json-label` for an example on how to map JSON values to a field. Expressions can return more than one result; these are presented as a JSON array for subsequent mapping. :Type: - JsonNode + :code:`JsonNode` -URI -""" +Complex conversion: :code:`uri` +""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy - def locationUri = parse location() to uri + def locationUri = parse location() to uri :Description: - Attempts to parse a string into a URI. The most obvious values to use for this are the location() and referer() values, but you can equally do the same with custom event parameters or any other string. If the parser fails to create a URI from a string, than the value will be absent. Note that the parsed URI itself is not directly mappable onto any Avro field. + Attempts to parse a string as a URI. The most obvious candidates to use for this are the :code:`location()` and :code:`referer()` values, but you can equally do the same with custom event parameters or any other string value. If the parser fails to create a URI from a string, then the value will be absent.
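+
+    For instance, a minimal sketch that parses the referer and maps its host onto a field (the field name :code:`refererHost` is only an assumed example from your own schema):
+
+    .. code-block:: groovy
+
+       def refererUri = parse referer() to uri
+       // The value is absent when there is no referer or it cannot be parsed.
+       map refererUri.host() onto 'refererHost'
+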
Note that the parsed URI itself is not directly mappable onto any Avro field. :Type: - URI + :code:`URI` -URI path -~~~~~~~~ +Derived simple value: :code:`URI.path()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.path() onto 'locationPathField' :Description: - The path component of a URI. Any URL encoded values in the path will be decoded. Keep in mind that if the path contains a encoded / character (%2F), this will also be decoded. Be careful when matching regular expressions against path parameters. + The path component of a URI. Any URL encoded values in the path will be decoded. Keep in mind that if the path contains a encoded :code:`/` character (:code:`%2F`), this will also be decoded. Be careful when matching regular expressions against path parameters. :Type: - string + :code:`String` -URI rawPath -~~~~~~~~~~~ +Derived simple value: :code:`URI.rawPath()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.rawPath() onto 'locationPathField' @@ -894,13 +1002,13 @@ URI rawPath The path component of a URI. This value is not decoded in any way. :Type: - string + :code:`String` -URI scheme -~~~~~~~~~~ +Derived simple value: :code:`URI.scheme()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.scheme() onto 'locationSchemeField' @@ -909,97 +1017,99 @@ URI scheme map locationUri.scheme().equalTo('https') onto 'isSecure' :Description: - The scheme component of a URI. This is the protocol part, such as http or https. + The scheme component of a URI. This is the protocol part, such as :code:`http` or :code:`https`. :Type: - string + :code:`String` -URI host -~~~~~~~~ +Derived simple value: :code:`URI.host()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.host() onto 'locationHostField' :Description: - The host component of a URI. In http://www.example.com/foo/bar, this would be: www.example.com + The host component of a URI. For :code:`http://www.example.com/foo/bar` this would be :code:`www.example.com`. :Type: - string + :code:`String` -URI port -~~~~~~~~ +Derived simple value: :code:`URI.port()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.port() onto 'locationPortField' :Description: - The port component of a URI. In http://www.example.com:8080/foo, this would be: 8080. Note that when no port is specified in the URI (e.g. http://www.example.com/foo), this value will be absent. Divolte Collector makes no assumptions about default ports for protocoles. + The port component of a URI. For :code:`http://www.example.com:8080/foo` this would be :code:`8080`. Note that when no port is specified in the URI (e.g. :code:`http://www.example.com/foo`) this value will be absent. Divolte Collector makes no assumptions about default ports for protocols. :Type: - int + :code:`int` -URI decodedQueryString -~~~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`URI.decodedQueryString()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.decodedQueryString() onto 'locationQS' :Description: - The full, URL decoded query string of a URI. 
In http://www.example.com/foo/bar.html?q=hello+world&foo%2Fbar, this would be: "q=hello world&foo/bar". + The full, URL decoded query string of a URI. For :code:`http://www.example.com/foo/bar.html?q=hello+world&foo%2Fbar`, this would be :code:`q=hello world&foo/bar`. :Type: - string + :code:`String` -URI rawQueryString -~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`URI.rawQueryString()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.rawQueryString() onto 'locationQS' :Description: - The full, query string of a URI without any decoding. In http://www.example.com/foo/bar.html?q=hello+world&foo%2Fbar, this would be: "q=hello+world&foo%2Fbar". + The full, query string of a URI without any decoding. For :code:`http://www.example.com/foo/bar.html?q=hello+world&foo%2Fbar` this would be :code:`q=hello+world&foo%2Fbar`. :Type: - string + :code:`String` -URI decodedFragment -~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`URI.decodedFragment()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.decodedFragment() onto 'locationFragment' :Description: - The full, URL decoded fragment of a URI. In http://www.example.com/foo/#/localpath/?q=hello+world&foo%2Fbar, this would be: "/localpath/?q=hello world&foo/bar". + The full, URL decoded fragment of a URI. For :code:`http://www.example.com/foo/#/localpath/?q=hello+world&foo%2Fbar` this would be :code:`/localpath/?q=hello world&foo/bar`. :Type: - string + :code:`String` -URI rawFragment -~~~~~~~~~~~~~~~ +Derived simple value: :code:`URI.rawFragment()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri map locationUri.rawFragment() onto 'locationFragment' :Description: - The full, fragment of a URI without any decoding. In http://www.example.com/foo/#/localpath/?q=hello+world&foo%2Fbar, this would be: "/localpath/?q=hello+world&foo%2Fbar". In web applications with rich client side functionality written in JavaScript, it is a common pattern that the fragment of the location is written as a URI again, but without a scheme, host and port. Nonetheless, it is entirely possible to parse the raw fragment of a location into a separate URI again and use this for further mapping. As an example, consider the following:: + The full, fragment of a URI without any decoding. For :code:`http://www.example.com/foo/#/localpath/?q=hello+world&foo%2Fbar` this would be :code:`/localpath/?q=hello+world&foo%2Fbar`. In web applications with rich client side functionality written in JavaScript, it is a common pattern that the fragment of the location is written as a URI again, but without a scheme, host and port. Nonetheless, it is entirely possible to parse the raw fragment of a location into a separate URI again and use this for further mapping. As an example, consider the following: + + .. code-block:: groovy // If location() = 'http://www.example.com/foo/#/local/path/?q=hello+world' // this would map '/local/path/' onto the field clientSidePath @@ -1008,20 +1118,22 @@ URI rawFragment map localUri.path() onto 'clientSidePath' :Type: - string + :code:`String` -Query strings -""""""""""""" +Derived complex value: :code:`URI.query()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. 
code-block:: groovy def locationUri = parse location() to uri def locationQuery = locationUri.query() map locationQuery onto 'locationQueryParameters' :Description: - The query string from a URI parsed into a map of value lists. In the resulting map, the keys are the parameter names of the query string and the values are lists of strings. Lists are required, as a query parameter can have multiple values (by being present more than once). In order to map all the query parameters directly onto a Avro field, the field must be typed as a map of string lists, possibly a union with null, to have a sensible default when no query string is possible. In a Avro schema definition, the following field definition can be a target field for the query parameters:: + The query string from a URI parsed into a map of value lists. In the resulting map, the keys are the parameter names of the query string and the values are lists of strings. Lists are required because a query parameter can have multiple values (by being present more than once). In order to map all the query parameters directly onto a Avro field, the field must be typed as a map of string lists, possibly a union with null, to have a sensible default when no query string is possible. In a Avro schema definition, the following field definition can be a target field for the query parameters: + + .. code-block:: json { "name": "uriQuery", @@ -1039,13 +1151,13 @@ Query strings } :Type: - map> + :code:`Map>` -Query string value -~~~~~~~~~~~~~~~~~~ +Derived simple value: :samp:`URI.query().value({name})` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri def locationQuery = locationUri.query() @@ -1055,13 +1167,13 @@ Query string value The first value found for a query parameter. This value is URL decoded. :Type: - string + :code:`String` -Query string valueList -~~~~~~~~~~~~~~~~~~~~~~ +Derived complex value: :samp:`URI.query().valueList({name})` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def locationUri = parse location() to uri def locationQuery = locationUri.query() @@ -1071,18 +1183,22 @@ Query string valueList A list of all values found for a query parameter name. These values are URL decoded. :Type: - list + :code:`List` -Regular expression matching -""""""""""""""""""""""""""" +.. _Regular expression matching: + +Complex value: :samp:`match({regex}).against({stringValue})` +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy def matcher = match '/foo/bar/([a-z]+).html$' against location() :Description: - Matches the given regular expression against a value; the entire value must match. The result of this can not be directly mapped onto a Avro field, but can be used to extract capture groups or conditionally perform a mapping if the pattern is a match. Often it is required to perform non-trivial partial extractions against strings that are taken from the requests. One example would be matching the path of the location with a wild card. It is not recommended to match patterns against the location() or referer() values directly; instead consider parsing out relevant parts of the URI first using URI parsing. 
In the following example, the matching is much more robust in the presence of unexpected query parameters or fragments compared to matching against the entire location string:: + Matches a regular expression against a string value; the entire value must match. The result of this can not be directly mapped onto a Avro field, but can be used to extract capture groups or conditionally perform a mapping if the pattern is a match. Often it is required to perform non-trivial partial extractions against strings that are taken from the requests. One example would be matching the path of the location with a wild card. It is not recommended to match patterns against the :code:`location()` or :code:`referer()` values directly; instead parse as an URI first and match against the relevant parts. In the following example, the matching is much more robust in the presence of unexpected query parameters or fragments compared to matching against the entire location string: + + .. code-block:: groovy def locationUri = parse location() to uri def pathMatcher = match '^/foo/bar/([a-z]+).html$' against locationUri.path() @@ -1092,13 +1208,13 @@ Regular expression matching } :Type: - Matcher + :code:`Matcher` -Regex matches -~~~~~~~~~~~~~ +Derived simple value: :code:`Matcher.matches()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy def matcher = match '^/foo/bar/([a-z]+).html$' against location() @@ -1111,16 +1227,16 @@ Regex matches map matcher.matches() onto 'isFooBarPage' :Description: - True when the pattern matches the value or false otherwise. In case the target value is absent, this will produce false. + True when the value is present and matches the regular expression or false otherwise. :Type: - boolean + :code:`Boolean` -Regex group -~~~~~~~~~~~ +Derived simple value: :samp:`Matcher.group({positionOrName})` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy // Using group number def matcher = match '/foo/bar/([a-z]+).html$' against location() @@ -1134,18 +1250,20 @@ Regex group The value from a capture group in a regular expression pattern if the pattern matches, absent otherwise. Groups can be identified by their group number, starting from 1 as the first group or using named capture groups. :Type: - string + :code:`String` -HTTP headers -"""""""""""" +Complex value: :samp:`header({name})` +""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy map header('header-name') onto 'fieldName' :Description: - The list of all values associated with the given HTTP header from the incoming request. A HTTP header can be present in a request multiple times, yielding multiple values for the same header name; these are returned as a list. The Avro type of the target field for this mapping must be a list of string:: + The list of all values associated with the given HTTP header from the incoming request. A HTTP header can be present in a request multiple times, yielding multiple values for the same header name; these are returned as a list. The Avro type of the target field for this mapping must be a list of string: + + .. code-block:: json { "name": "headers", @@ -1160,16 +1278,16 @@ HTTP headers "default": null } - Note that the array field in Avro itself is nullable and has a default value of null, whereas the items in the array are not nullable. The latter is not required, because when te header is present, the elements in the list are guaranteed to be present. 
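+
+    As a minimal sketch, a multi-valued header could be mapped onto such a field like this (the header name :code:`Accept-Language` and the field name :code:`headers` are only illustrative):
+
+    .. code-block:: groovy
+
+       // Each occurrence of the header becomes an element of the 'headers' array field.
+       map header('Accept-Language') onto 'headers'
+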
+ Note that the array field in Avro itself is nullable and has a default value of null, whereas the items in the array are not nullable. The latter is not required, because when the header is present the elements in the list are guaranteed to be non-null. :Type: - list + :code:`List` -HTTP header first -~~~~~~~~~~~~~~~~~ +Derived simple value: :samp:`header({name}).first()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map header('header-name').first() onto 'fieldName' @@ -1177,13 +1295,13 @@ HTTP header first The *first* of all values associated with the given HTTP header from the incoming request. A HTTP header can be present in a request multiple times, yielding multiple values for the same header name. This returns the first value in that list. :Type: - string + :code:`String` -HTTP header last -~~~~~~~~~~~~~~~~ +Derived simple value: :samp:`header({name}).last()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map header('header-name').last() onto 'fieldName' @@ -1191,13 +1309,13 @@ HTTP header last The *last* of all values associated with the given HTTP header from the incoming request. A HTTP header can be present in a request multiple times, yielding multiple values for the same header name. This returns the last value in that list. :Type: - string + :code:`String` -HTTP header commaSeparated -~~~~~~~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :samp:`header({name}).commaSeparated()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map header('header-name').commaSeparated() onto 'fieldName' @@ -1205,27 +1323,31 @@ HTTP header commaSeparated The comma separated string of all values associated with the given HTTP header from the incoming request. A HTTP header can be present in a request multiple times, yielding multiple values for the same header name. This joins that list using a comma as separator. :Type: - string + :code:`String` -User agent parsing -"""""""""""""""""" +.. _User agent parsing: + +Complex value: :code:`userAgent()` +"""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy - def ua = userAgent() + def ua = userAgent() :Description: - Attempts to parse a the result of `userAgentString`_ string into a user agent object. Note that this result is not directly mappable onto any Avro field. Instead, the subfields from this object, described below, can be mapped onto fields. When the parsing of the user agent string fails, either because the user agent is unknown or malformed, or because the user agent was not sent by the browser, this value and all subfields' values are absent. + Attempts to parse the result of `userAgentString`_ into a user agent object. Note that this result is not directly mappable onto any Avro field. Instead, the subfields from this object, described below, can be mapped onto fields. When the parsing of the user agent string fails, either because the user agent is unknown or malformed, or because the user agent was not sent by the browser, this value and all subfield values are absent. :Type: - ReadableUserAgent + :code:`ReadableUserAgent` + +.. _User agent name: -User agent name -~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().name()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().name() onto 'uaNameField' @@ -1233,365 +1355,383 @@ User agent name The canonical name for the parsed user agent. E.g. 
'Chrome' for Google Chrome browsers. :Type: - string + :code:`String` + +.. _User agent family: -User agent family -~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().family()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().family() onto 'uaFamilyField' :Description: - The canonical name for the family of the parsed user agent. E.g. 'Mobile Safari' for Apple's mobile browser. + The canonical name for the family of the parsed user agent. E.g. ``Mobile Safari`` for Apple's mobile browser. :Type: - string + :code:`String` -User agent vendor -~~~~~~~~~~~~~~~~~ +.. _User agent vendor: + +Derived simple value: :code:`userAgent().vendor()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().vendor() onto 'uaVendorField' :Description: - The name of the company or oganisation that produces the user agent software. E.g. 'Google Inc.' for Google Chrome browsers. + The name of the company or organisation that produces the user agent software. E.g. ``Google Inc.`` for Google Chrome browsers. :Type: - string + :code:`String` + +.. _User agent type: -User agent type -~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().type()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().type() onto 'uaTypeField' :Description: - The type of user agent that was used. E.g. 'Browser' for desktop browsers. + The type of user agent that was used. E.g. ``Browser`` for desktop browsers. :Type: - string + :code:`String` + +.. _User agent version: -User agent version -~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().version()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().version() onto 'uaVersionField' :Description: - The version string of the user agent software. E.g. '39.0.2171.71' for Google Chrome 39. + The version string of the user agent software. E.g. ``39.0.2171.71`` for Google Chrome 39. :Type: - string + :code:`String` -User agent device category -~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _User agent device category: + +Derived simple value: :code:`userAgent().deviceCategory()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().deviceCategory() onto 'uaDeviceCategoryField' :Description: - The type of device that the user agent runs on. E.g. 'Tablet' for a tablet based browser. + The type of device that the user agent runs on. E.g. ``Tablet`` for a tablet based browser. :Type: - string + :code:`String` + +.. _User agent OS family: -User agent OS family -~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().osFamily()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().osFamily() onto 'uaOSFamilyField' :Description: - The operating system family that the user agent runs on. E.g. 'OS X' for a Apple OS X based desktop. + The operating system family that the user agent runs on. E.g. ``OS X`` for an Apple Mac OS X based desktop. :Type: - string + :code:`String` + +.. _User agent OS version: -User agent OS version -~~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().osVersion()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().osVersion() onto 'uaOSVersionField' :Description: - The version string of the operating system that the user agent runs on. E.g. 
'10.10.1' for Max OS X 10.10.1. + The version string of the operating system that the user agent runs on. E.g. ``10.10.1`` for Mac OS X 10.10.1. :Type: - string + :code:`String` + +.. _User agent OS vendor: -User agent OS vendor -~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`userAgent().osVendor()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map userAgent().osVendor() onto 'uaOSVendorField' :Description: - The name of the company or oganisation that produces the operating system that the user agent software runs on. E.g. 'Apple Computer, Inc.' for Apple Mac OS X. + The name of the company or organisation that produces the operating system that the user agent software runs on. E.g. ``Apple Computer, Inc.`` for Apple Mac OS X. :Type: - string + :code:`String` -ip2geo -"""""" +Complex value: :code:`ip2geo({optionalIP})` +""""""""""""""""""""""""""""""""""""""""""" :Usage: - :: + .. code-block:: groovy - // uses the remoteHost as IP address to lookup - def ua = ip2geo() + // uses the remoteHost as IP address to lookup + def ua = ip2geo() - // If a load balancer sets custom headers for IP addresses, use like this - def ip = header('X-Custom-Header').first() - def myUa = ip2geo(ip) + // If a load balancer sets custom headers for IP addresses, use like this + def ip = header('X-Custom-Header').first() + def myUa = ip2geo(ip) :Description: Attempts to turn a IPv4 address into a geo location by performing a lookup into a configured `MaxMind GeoIP City database `_. This database is not distributed with Divolte Collector, but must be provided separately. See the :doc:`configuration` chapter for more details on this. - Note that this result is not directly mappable onto any Avro field. Instead, the subfields from this object, described below, can be mapped onto fields. When the lookup for a IP address fails or when the argument is not a IPv4 address, this value and all subfields' values are absent. + Note that this result is not directly mappable onto any Avro field. Instead the subfields from this object, described below, can be mapped onto fields. When the lookup for a IP address fails or when the argument is not a IPv4 address, this value and all subfield values are absent. :Type: - CityResponse + :code:`CityResponse` -Geo IP cityId -~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().cityId()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().cityId() onto 'cityIdField' :Description: - The City ID for the geo location as known by http://www.geonames.org/. + The `GeoNames`_ City ID for the geolocation. :Type: - int + :code:`int` -Geo IP cityName -~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().cityName()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().cityName() onto 'cityNameField' :Description: - The city name for the geo location in English. + The city name for the geolocation in English. :Type: - string + :code:`String` -Geo IP continentCode -~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().continentCode()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().continentCode() onto 'continentCodeField' :Description: - The ISO continent code for the geo location. + The ISO continent code for the geolocation. 
:Type: - string + :code:`String` -Geo IP continentId -~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().continentId()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().continentId() onto 'continentIdField' :Description: - The Continent Id for the geo location as known by http://www.geonames.org/. + The `GeoNames`_ Continent Id for the geolocation. :Type: - int + :code:`int` -Geo IP continentName -~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().continentName()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().continentName() onto 'continentNameField' :Description: - The continent name for the geo location in English. + The continent name for the geolocation in English. :Type: - string + :code:`String` -Geo IP countryCode -~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().countryCode()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().countryCode() onto 'countryCodeField' :Description: - The ISO country code for the geo location. + The ISO country code for the geolocation. :Type: - string + :code:`String` -Geo IP countryId -~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().countryId()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().countryId() onto 'countryIdField' :Description: - The Country Id for the geo location as known by http://www.geonames.org/. + The `GeoNames`_ Country Id for the geolocation. :Type: - int + :code:`int` -Geo IP countryName -~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().countryName()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().countryName() onto 'countryNameField' :Description: - The country name for the geo location in English. + The country name for the geolocation in English. :Type: - string + :code:`String` -Geo IP latitude -~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().latitude()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().latitude() onto 'latitudeField' :Description: - The latitude for the geo location in English. + The latitude for the geolocation. :Type: - double + :code:`double` -Geo IP longitude -~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().longitude()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().longitude() onto 'longitudeField' :Description: - The longitude for the geo location in English. + The longitude for the geolocation. :Type: - double + :code:`double` -Geo IP metroCode -~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().metroCode()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().metroCode() onto 'metroCodeField' :Description: - The ISO metro code for the geo location. + The Metro Code for the geolocation. :Type: - string + :code:`String` -Geo IP timeZone -~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().timeZone()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().timeZone() onto 'timeZoneField' :Description: - The time zone name for the geo location as found in the `IANA Time Zone Database `_. + The name of the time zone for the geolocation as found in the `IANA Time Zone Database `_. 
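A typical mapping pulls several of these derived geo values from a single :code:`ip2geo()` result, analogous to the :code:`userAgent()` examples above. A minimal sketch; the target field names below are placeholders and must exist as nullable fields in the Avro schema, because geo lookups can be absent:

.. code-block:: groovy

   def geo = ip2geo()
   map geo.cityName()    onto 'cityNameField'
   map geo.countryCode() onto 'countryCodeField'
   map geo.latitude()    onto 'latitudeField'
   map geo.longitude()   onto 'longitudeField'
   map geo.timeZone()    onto 'timeZoneField'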
:Type: - string + :code:`String` -Geo IP mostSpecificSubdivisionCode -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().mostSpecificSubdivisionCode()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().mostSpecificSubdivisionCode() onto 'mostSpecificSubdivisionCodeField' :Description: - The ISO code for the most specific subdivision known for the geo location. + The ISO code for the most specific subdivision known for the geolocation. :Type: - string + :code:`String` -Geo IP mostSpecificSubdivisionId -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().mostSpecificSubdivisionId()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().mostSpecificSubdivisionId() onto 'mostSpecificSubdivisionIdField' :Description: - The ID for the most specific subdivision known for the geo location as known by http://www.geonames.org/. + The `GeoNames`_ ID for the most specific subdivision known for the geolocation. :Type: - int + :code:`int` -Geo IP mostSpecificSubdivisionName -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().mostSpecificSubdivisionName()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().mostSpecificSubdivisionName() onto 'mostSpecificSubdivisionNameField' :Description: - The name for the most specific subdivision known for the geo location in English. + The name for the most specific subdivision known for the geolocation in English. :Type: - string + :code:`String` -Geo IP postalCode -~~~~~~~~~~~~~~~~~ +Derived simple value: :code:`ip2geo().postalCode()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().postalCode() onto 'postalCodeField' :Description: - The postal code for the geo location. + The postal code for the geolocation. :Type: - string + :code:`String` .. Do these even work? @@ -1615,47 +1755,47 @@ Geo IP postalCode ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Geo IP subdivisionCodes -~~~~~~~~~~~~~~~~~~~~~~~ +Derived complex value: :code:`ip2geo().subdivisionCodes()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().subdivisionCodes() onto 'subdivisionCodesField' :Description: - The ISO codes for all subdivisions for the geo location in order from least specific to most specific. + The ISO codes for all subdivisions for the geolocation in order from least to most specific. :Type: - list + :code:`List` -Geo IP subdivisionIds -~~~~~~~~~~~~~~~~~~~~~ +Derived complex value: :code:`ip2geo().subdivisionIds()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. code-block:: groovy map ip2geo().subdivisionIds() onto 'subdivisionIdsFields' :Description: - The IDs for all subdivisions for the geo location in order from least specific to most specific as known by http://www.geonames.org/. + The `GeoNames`_ IDs for all subdivisions for the geolocation in order from least to most specific. :Type: - list + :code:`List` -Geo IP subdivisionNames -~~~~~~~~~~~~~~~~~~~~~~~ +Derived complex value: :code:`ip2geo().subdivisionNames()` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :Usage: - :: + .. 
code-block:: groovy map ip2geo().subdivisionNames() onto 'subdivisionNames' :Description: - The names in English for all subdivisions for the geo location in order from least specific to most specific. + The names in English for all subdivisions for the geolocation in order from least to most specific. :Type: - list + :code:`List` .. These GEO IP fields don't really work currently anyway @@ -1680,3 +1820,5 @@ Geo IP subdivisionNames Geo IP satelliteProvider ~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _GeoNames: http://www.geonames.org/ diff --git a/examples/divolte-collector.conf b/examples/divolte-collector.conf index 9ee0cb8d..4f44ee58 100644 --- a/examples/divolte-collector.conf +++ b/examples/divolte-collector.conf @@ -1,5 +1,5 @@ // -// Copyright 2014 GoDataDriven B.V. +// Copyright 2015 GoDataDriven B.V. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,17 +15,26 @@ // divolte { - tracking { - include "schema-mapping.conf" - schema_file = /some/dir/MySchema.avsc - } + sources { + my_source = { + type = browser + } + + my_sink = { + type = hdfs + file_strategy { + sync_file_after_records = 1000 + sync_file_after_duration = 30 seconds + working_dir = /tmp + publish_dir = /tmp + } + } - hdfs_flusher { - session_binning_file_strategy { - sync_file_after_records = 1000 - sync_file_after_duration = 30 seconds - working_dir = /tmp - publish_dir = /tmp + my_mapping = { + schema_file = /some/dir/MySchema.avsc + mapping_script_file = schema-mapping.groovy + sources = [my_source] + sinks = [my_sink] } } } diff --git a/examples/schema-mapping.conf b/examples/schema-mapping.conf deleted file mode 100644 index 0a24c810..00000000 --- a/examples/schema-mapping.conf +++ /dev/null @@ -1,129 +0,0 @@ -// -// Copyright 2014 GoDataDriven B.V. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -// to be included under config key: 'divolte.tracking' -schema_mapping { - version = 1 - - regexes { - // Matches the home page - // e.g. http://www.example.com/ - // e.g. http://www.example.com/index.html - home = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/(?:index.html)?$" - - // Match different levels of taxonomy pages (up to three levels deep) - // URL layout is: http://www.example.com/shop/section/category/ - // e.g. http://www.example.com/fashion/jeans/regular/ - taxonomy = "^(?:http|https):\\/\\/[a-z0-9\\.\\-:]+\\/(?:(?[a-z0-9\\-]+)\\/)(?:(?
[a-z0-9\\-]+)\\/)?(?:(?[a-z0-9\\-]+)\\/)?$" - shop = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/[a-z0-9\\-]+\\/$" - section = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/[a-z0-9\\-]+\\/[a-z0-9\\-]+\\/$" - category = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/[a-z0-9\\-]+\\/[a-z0-9\\-]+\\/[a-z0-9\\-]+\\/$" - - // http://www.example.com/products/311381 - product_detail = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/products\\/(?[0-9]{6})$" - - // http://www.example.com/basket - basket = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/basket$" - - // http://www.example.com/search?q=search+phrase - search = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/search\\?q=(?.*)$" - - // http://www.example.com/checkout - checkout = "^http:\\/\\/[a-z0-9\\.\\-:]+\\/checkout$" - } - - fields { - // Simple field mappings - // For fields that are potentially not set, - // make sure that the Avro record field is nullable - firstInSession = firstInSession - timestamp = timestamp - remoteHost = remoteHost - referer = referer - location = location - viewportPixelWidth = viewportPixelWidth - viewportPixelHeight = viewportPixelHeight - screenPixelWidth = screenPixelWidth - screenPixelHeight = screenPixelHeight - devicePixelRatio = devicePixelRatio - partyId = partyId - sessionId = sessionId - pageViewId = pageViewId - - userAgentString = userAgent - userAgentName = userAgentName - userAgentFamily = userAgentFamily - userAgentVendor = userAgentVendor - userAgentType = userAgentType - userAgentVersion = userAgentVersion - userAgentDevicesection = userAgentDevicesection - userAgentOsFamily = userAgentOsFamily - userAgentOsVersion = userAgentOsVersion - userAgentOsVendor = userAgentOsVendor - - // pageType field will be set to the name of the first - // regex in the list that matches the location, or is - // not set if no regex matches (must be nullable in this - // case) - pageType { - type = regex_name - regexes = [home, category, section, shop, product_detail, basket, search, checkout] - field = location - } - - // productId will be set to the named capture group 'product' from - // the regex named product_detail or will not be set if the regex - // does not match the location (must be nullable in this case) - productId { - type = regex_group - regex = product_detail - field = location - group = product - } - - // Similar to productId - shop { - type = regex_group - regex = taxonomy - field = location - group = shop - } - - // Similar to productId - section { - type = regex_group - regex = taxonomy - field = location - group = section - } - - // Similar to productId - category { - type = regex_group - regex = taxonomy - field = location - group = category - } - - // In case of search, capture the search phrase - searchPhrase { - type = regex_group - regex = search - field = location - group = phrase - } - } -} diff --git a/examples/schema-mapping.groovy b/examples/schema-mapping.groovy new file mode 100644 index 00000000..f6c9bcef --- /dev/null +++ b/examples/schema-mapping.groovy @@ -0,0 +1,130 @@ +/* + * Copyright 2015 GoDataDriven B.V. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +mapping { + // Simple field mappings. + // For fields that are potentially not set, + // make sure that the Avro record field is nullable + map firstInSession() onto 'firstInSession' + map timestamp() onto 'ts' + map remoteHost() onto 'remoteHost' + + map referer() onto 'referer' + map location() onto 'location' + map viewportPixelWidth() onto 'viewportPixelWidth' + map viewportPixelHeight() onto 'viewportPixelHeight' + map screenPixelWidth() onto 'screenPixelWidth' + map screenPixelHeight() onto 'screenPixelHeight' + map devicePixelRatio() onto 'devicePixelRatio' + map partyId() onto 'partyId' + map sessionId() onto 'sessionId' + map pageViewId() onto 'pageViewId' + + map userAgentString() onto 'userAgent' + def ua = userAgent() + map ua.name() onto 'userAgentName' + map ua.family() onto 'userAgentFamily' + map ua.vendor() onto 'userAgentVendor' + map ua.type() onto 'userAgentType' + map ua.version() onto 'userAgentVersion' + map ua.deviceCategory() onto 'userAgentDeviceCategory' + map ua.osFamily() onto 'userAgentOsFamily' + map ua.osVersion() onto 'userAgentOsVersion' + map ua.osVendor() onto 'userAgentOsVendor' + + section { + // Pagetype detection + + // Extract the location path; we don't care about the domain. + def locationUri = parse location() to uri + def locationPath = locationUri.path() + + // Matches the home page + // e.g. / + // e.g. /index.html + def homepageMatcher = match /^\/(?:index\.html)?$/ against locationPath + when homepageMatcher.matches apply { + map 'home' onto 'pageType' + exit() + } + + // Viewing product details + // e.g. /products/311381 + def productDetailMatcher = match /^\/product\/([0-9]{6})$/ against locationPath + when productDetailMatcher.matches apply { + map 'product_detail' onto 'pageType' + map productDetailMatcher.group(1) onto 'productId' + exit() + } + + // Search results. + // e.g. /search?q=search+phrase + when locationPath.equalTo('/search') apply { + map 'searchResults' onto 'pageType' + map locationUri.query().value('q') onto 'searchPhrase' + exit() + } + + // Viewing basket + // e.g. /basket + when locationPath.equalTo('/basket') apply { + map 'basket' onto 'pageType' + exit() + } + + // Checkout funnel + // e.g. /checkout + when locationPath.equalTo('/checkout') apply { + map 'checkout' onto 'pageType' + exit() + } + + // Match different levels of taxonomy pages (up to three levels deep) + // URL layout is: http://www.example.com/shop/section/category/ + // e.g. http://www.example.com/fashion/jeans/regular/ + // (These are last due to ambiguity with the special URLs above.) + + // Category + // e.g. /fashion/jeans/regular/ + def categoryMatcher = match /^\/([a-z0-9\-]+)\/([a-z0-9\-]+)\/([a-z0-9\-]+)\/$/ against locationPath + when categoryMatcher.matches() apply { + map 'category' onto 'pageType' + map categoryMatcher.group(1) onto 'shop' + map categoryMatcher.group(2) onto 'section' + map categoryMatcher.group(3) onto 'category' + exit() + } + + // Section + // e.g. /fashion/jeans/ + def sectionMatcher = match /^\/([a-z0-9\-]+)\/([a-z0-9\-]+)\/$/ against locationPath + when sectionMatcher.matches() apply { + map 'section' onto 'pageType' + map sectionMatcher.group(1) onto 'shop' + map sectionMatcher.group(2) onto 'section' + exit() + } + + // Stop + // e.g. 
/fashion/jeans/ + def shopMatcher = match /^\/([a-z0-9\-]+)\/$/ against locationPath + when shopMatcher.matches() apply { + map 'section' onto 'pageType' + map shopMatcher.group(1) onto 'shop' + exit() + } + } +} diff --git a/rpm/SOURCES/divolte-collector.conf b/rpm/SOURCES/divolte-collector.conf index dc94c98b..7ffd6b93 100644 --- a/rpm/SOURCES/divolte-collector.conf +++ b/rpm/SOURCES/divolte-collector.conf @@ -1,49 +1,44 @@ # This is the configuration for the Divolte collector. divolte { - server { - # The address of the interface on which to bind and listen. - # (Alternatively, you can set the DIVOLTE_HOST environment variable.) - # - # Default: localhost - # - # Uncomment to listen on all interfaces. - #host=0.0.0.0 - - # The TCP port on which to listen. - # (Alternatively, you can set the DIVOLTE_PORT environment variable.) - # - # Default: 8290 - # - #port=8290 - } - - # Custom URL mappings are possible if you wish to extract parts of the URL - # into the click-stream events. - # - #tracking { - # include "schema-mapping.conf" - #} - - # By default, we flush to local HDFS. - hdfs_flusher { - - hdfs { - # The URI of the HDFS where events should be stored. - # (Alternatively, you can set the DIVOLTE_HDFS_URI environment variable.) + global { + server { + # The address of the interface on which to bind and listen. + # (Alternatively, you can set the DIVOLTE_HOST environment variable.) # - # Default: "file:///" (local filesystem) + # Default: localhost # - #uri = "file:///" + # Uncomment to listen on all interfaces. + #host=0.0.0.0 - # The replication factor that should be used for events stored to HDFS. - # (Alternatively, you can set the DIVOLTE_HDFS_REPLICATION environment - # variable.) - # For production this would normally be 3. + # The TCP port on which to listen. + # (Alternatively, you can set the DIVOLTE_PORT environment variable.) # - # Default: 1 + # Default: 8290 # - #replication = 3 + #port=8290 } } + + # Custom sources, mappings and sinks. + # If anything is configured, these all needed to be configured. The default + # configuration is: + # - A single browser-based source (/divolte.js) + # - A default mapping that produces events that conform to the default schema. + # - A single HDFS-based sink that writes to /tmp. + # (If you have not configured HDFS, this will be the local filesystem.) + # + # Refer to the Divolte documentation for more information. + # + # sources { + # + # } + # + # mappings { + # + # } + # + # sinks { + # + # } } diff --git a/src/main/java/io/divolte/server/BaseEventHandler.java b/src/main/java/io/divolte/server/BaseEventHandler.java deleted file mode 100644 index 835c1449..00000000 --- a/src/main/java/io/divolte/server/BaseEventHandler.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright 2014 GoDataDriven B.V. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.divolte.server; - -import com.google.common.base.Strings; -import com.google.common.io.Resources; -import io.undertow.server.HttpHandler; -import io.undertow.server.HttpServerExchange; -import io.undertow.util.ETag; -import io.undertow.util.ETagUtils; -import io.undertow.util.Headers; -import io.undertow.util.StatusCodes; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.io.IOException; -import java.net.InetSocketAddress; -import java.nio.ByteBuffer; -import java.util.Deque; -import java.util.Objects; -import java.util.Optional; - -@ParametersAreNonnullByDefault -public abstract class BaseEventHandler implements HttpHandler { - private static final Logger logger = LoggerFactory.getLogger(BaseEventHandler.class); - - private final static ETag SENTINEL_ETAG = new ETag(false, "6b3edc43-20ec-4078-bc47-e965dd76b88a"); - private final static String SENTINEL_ETAG_VALUE = SENTINEL_ETAG.toString(); - - private final ByteBuffer transparentImage; - protected final IncomingRequestProcessingPool processingPool; - - public BaseEventHandler(final IncomingRequestProcessingPool processingPool) { - this.processingPool = Objects.requireNonNull(processingPool); - - try { - this.transparentImage = ByteBuffer.wrap( - Resources.toByteArray(Resources.getResource("transparent1x1.gif")) - ).asReadOnlyBuffer(); - } catch (final IOException e) { - // Should throw something more specific than this. - throw new RuntimeException("Could not load transparent image resource.", e); - } - } - - @Override - public void handleRequest(final HttpServerExchange exchange) { - /* - * The source address can be fetched on-demand from the peer connection, which may - * no longer be available after the response has been sent. So we materialize it here - * to ensure it's available further down the chain. - */ - final InetSocketAddress sourceAddress = exchange.getSourceAddress(); - exchange.setSourceAddress(sourceAddress); - - /* - * Set up the headers that we always send as a response, irrespective of what type it - * will be. Note that the client is responsible for ensuring that ensures that each request - * is unique. - * The cache-related headers are intended to prevent spurious reloads for an event. - * (Being a GET request, agents are free to re-issue the request at will. We don't want this.) - * As a last resort, we try to detect duplicates via the ETag header. - */ - exchange.getResponseHeaders() - .put(Headers.CONTENT_TYPE, "image/gif") - .put(Headers.ETAG, SENTINEL_ETAG_VALUE) - .put(Headers.CACHE_CONTROL, "private, no-cache, proxy-revalidate") - .put(Headers.PRAGMA, "no-cache") - .put(Headers.EXPIRES, "Fri, 14 Apr 1995 11:30:00 GMT"); - - // If an ETag is present, this is a duplicate event. - if (ETagUtils.handleIfNoneMatch(exchange, SENTINEL_ETAG, true)) { - /* - * Subclasses are responsible to logging events. - * We just ensure the pixel is always returned, no matter what. - */ - try { - logEvent(exchange); - } finally { - // Default status code what we want: 200 OK. 
- exchange.getResponseSender().send(transparentImage.slice()); - } - } else { - if (logger.isDebugEnabled()) { - logger.debug("Ignoring duplicate event from {}: {}", sourceAddress, getFullUrl(exchange)); - } - exchange.setStatusCode(StatusCodes.NOT_MODIFIED); - exchange.endExchange(); - } - } - - private static String getFullUrl(HttpServerExchange exchange) { - final String queryString = exchange.getQueryString(); - final String requestUrl = exchange.getRequestURL(); - return Strings.isNullOrEmpty(queryString) - ? requestUrl - : requestUrl + '?' + queryString; - } - - static Optional queryParamFromExchange(final HttpServerExchange exchange, final String param) { - return Optional.ofNullable(exchange.getQueryParameters().get(param)).map(Deque::getFirst); - } - - /** - * Log this event. - * - * The subclass is responsible for extracting all information from the request and - * handing it off. The client is still waiting at this point; the subclass should hand - * further processing of as expediently as possible. When it returns (or throws an - * exception) the pixel response will be sent. (The subclass must never complete the - * request.) - * @param exchange the HTTP exchange from which event data can be extracted. - */ - protected abstract void logEvent(final HttpServerExchange exchange); - - protected static class IncompleteRequestException extends Exception { - private static final long serialVersionUID = 1L; - } -} diff --git a/src/main/java/io/divolte/server/BrowserSource.java b/src/main/java/io/divolte/server/BrowserSource.java new file mode 100644 index 00000000..a266a680 --- /dev/null +++ b/src/main/java/io/divolte/server/BrowserSource.java @@ -0,0 +1,86 @@ +/* + * Copyright 2015 GoDataDriven B.V. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.divolte.server; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Objects; + +import javax.annotation.ParametersAreNonnullByDefault; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import io.divolte.server.config.BrowserSourceConfiguration; +import io.divolte.server.config.ValidatedConfiguration; +import io.divolte.server.js.TrackingJavaScriptResource; +import io.undertow.server.HttpHandler; +import io.undertow.server.handlers.PathHandler; +import io.undertow.util.Methods; + +@ParametersAreNonnullByDefault +public class BrowserSource { + private static final Logger logger = LoggerFactory.getLogger(BrowserSource.class); + + private final String sourceName; + private final String pathPrefix; + private final String javascriptName; + private final HttpHandler javascriptHandler; + private final HttpHandler eventHandler; + + + public BrowserSource(final ValidatedConfiguration vc, + final String sourceName, + final IncomingRequestProcessingPool processingPool) { + this(sourceName, + vc.configuration().getSourceConfiguration(sourceName, BrowserSourceConfiguration.class).prefix, + loadTrackingJavaScript(vc, sourceName), + processingPool, + vc.configuration().sourceIndex(sourceName)); + } + + private BrowserSource(final String sourceName, + final String pathPrefix, + final TrackingJavaScriptResource trackingJavascript, + final IncomingRequestProcessingPool processingPool, + final int sourceIndex) { + this.sourceName = Objects.requireNonNull(sourceName); + this.pathPrefix = Objects.requireNonNull(pathPrefix); + javascriptName = trackingJavascript.getScriptName(); + javascriptHandler = new AllowedMethodsHandler(new JavaScriptHandler(trackingJavascript), Methods.GET); + final ClientSideCookieEventHandler clientSideCookieEventHandler = new ClientSideCookieEventHandler(processingPool, sourceIndex); + eventHandler = new AllowedMethodsHandler(clientSideCookieEventHandler, Methods.GET); + } + + public PathHandler attachToPathHandler(PathHandler pathHandler) { + final String javascriptPath = pathPrefix + javascriptName; + pathHandler = pathHandler.addExactPath(javascriptPath, javascriptHandler); + logger.info("Registered source[{}] script location: {}", sourceName, javascriptPath); + final String eventPath = pathPrefix + "csc-event"; + pathHandler = pathHandler.addExactPath(eventPath, eventHandler); + logger.info("Registered source[{}] event handler: {}", sourceName, eventPath); + return pathHandler; + } + + private static TrackingJavaScriptResource loadTrackingJavaScript(final ValidatedConfiguration vc, final String sourceName) { + try { + return TrackingJavaScriptResource.create(vc, sourceName); + } catch (final IOException e) { + throw new UncheckedIOException("Could not precompile tracking JavaScript for source: " + sourceName, e); + } + } +} diff --git a/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java b/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java index cb08c67f..2010b17e 100644 --- a/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java +++ b/src/main/java/io/divolte/server/ClientSideCookieEventHandler.java @@ -16,32 +16,42 @@ package io.divolte.server; -import java.io.IOException; -import java.net.InetSocketAddress; -import java.nio.charset.StandardCharsets; -import java.util.Deque; -import java.util.Map; -import java.util.Optional; -import java.util.SortedMap; -import java.util.TreeMap; - -import javax.annotation.Nullable; -import javax.annotation.ParametersAreNonnullByDefault; - 
-import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; +import com.google.common.base.Strings; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; - +import com.google.common.io.Resources; import io.divolte.server.mincode.MincodeFactory; +import io.divolte.server.processing.Item; +import io.undertow.server.HttpHandler; import io.undertow.server.HttpServerExchange; +import io.undertow.util.ETag; +import io.undertow.util.ETagUtils; +import io.undertow.util.Headers; +import io.undertow.util.StatusCodes; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.net.InetSocketAddress; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.*; @ParametersAreNonnullByDefault -public final class ClientSideCookieEventHandler extends BaseEventHandler { +public final class ClientSideCookieEventHandler implements HttpHandler { private static final Logger logger = LoggerFactory.getLogger(ClientSideCookieEventHandler.class); + + private final static ETag SENTINEL_ETAG = new ETag(false, "6b3edc43-20ec-4078-bc47-e965dd76b88a"); + private final static String SENTINEL_ETAG_VALUE = SENTINEL_ETAG.toString(); + + private final ByteBuffer transparentImage; + private final IncomingRequestProcessingPool processingPool; + private final int sourceIndex; + private static final String TRUE_STRING = "t"; private static final String PARTY_ID_QUERY_PARAM = "p"; @@ -66,12 +76,21 @@ public final class ClientSideCookieEventHandler extends BaseEventHandler { static final String EVENT_SOURCE_NAME = "browser"; - public ClientSideCookieEventHandler(final IncomingRequestProcessingPool pool) { - super(pool); + public ClientSideCookieEventHandler(final IncomingRequestProcessingPool processingPool, final int sourceIndex) { + this.sourceIndex = sourceIndex; + this.processingPool = Objects.requireNonNull(processingPool); + + try { + this.transparentImage = ByteBuffer.wrap( + Resources.toByteArray(Resources.getResource("transparent1x1.gif")) + ).asReadOnlyBuffer(); + } catch (final IOException e) { + // Should throw something more specific than this. + throw new RuntimeException("Could not load transparent image resource.", e); + } } - @Override - protected void logEvent(final HttpServerExchange exchange) { + private void logEvent(final HttpServerExchange exchange) { try { handleRequestIfComplete(exchange); } catch (final IncompleteRequestException ire) { @@ -80,6 +99,64 @@ protected void logEvent(final HttpServerExchange exchange) { } } + @Override + public void handleRequest(final HttpServerExchange exchange) { + /* + * The source address can be fetched on-demand from the peer connection, which may + * no longer be available after the response has been sent. So we materialize it here + * to ensure it's available further down the chain. + */ + final InetSocketAddress sourceAddress = exchange.getSourceAddress(); + exchange.setSourceAddress(sourceAddress); + + /* + * Set up the headers that we always send as a response, irrespective of what type it + * will be. Note that the client is responsible for ensuring that ensures that each request + * is unique. + * The cache-related headers are intended to prevent spurious reloads for an event. + * (Being a GET request, agents are free to re-issue the request at will. 
We don't want this.) + * As a last resort, we try to detect duplicates via the ETag header. + */ + exchange.getResponseHeaders() + .put(Headers.CONTENT_TYPE, "image/gif") + .put(Headers.ETAG, SENTINEL_ETAG_VALUE) + .put(Headers.CACHE_CONTROL, "private, no-cache, proxy-revalidate") + .put(Headers.PRAGMA, "no-cache") + .put(Headers.EXPIRES, "Fri, 14 Apr 1995 11:30:00 GMT"); + + // If an ETag is present, this is a duplicate event. + if (ETagUtils.handleIfNoneMatch(exchange, SENTINEL_ETAG, true)) { + try { + logEvent(exchange); + } finally { + // Default status code what we want: 200 OK. + exchange.getResponseSender().send(transparentImage.slice()); + } + } else { + if (logger.isDebugEnabled()) { + logger.debug("Ignoring duplicate event from {}: {}", sourceAddress, getFullUrl(exchange)); + } + exchange.setStatusCode(StatusCodes.NOT_MODIFIED); + exchange.endExchange(); + } + } + + private static String getFullUrl(final HttpServerExchange exchange) { + final String queryString = exchange.getQueryString(); + final String requestUrl = exchange.getRequestURL(); + return Strings.isNullOrEmpty(queryString) + ? requestUrl + : requestUrl + '?' + queryString; + } + + static Optional queryParamFromExchange(final HttpServerExchange exchange, final String param) { + return Optional.ofNullable(exchange.getQueryParameters().get(param)).map(Deque::getFirst); + } + + public static class IncompleteRequestException extends Exception { + private static final long serialVersionUID = 1L; + } + private void handleRequestIfComplete(final HttpServerExchange exchange) throws IncompleteRequestException { final boolean corrupt = !isRequestChecksumCorrect(exchange); final DivolteIdentifier partyId = queryParamFromExchange(exchange, PARTY_ID_QUERY_PARAM).flatMap(DivolteIdentifier::tryParse).orElseThrow(IncompleteRequestException::new); @@ -96,7 +173,7 @@ private void handleRequestIfComplete(final HttpServerExchange exchange) throws I isNewPartyId, isFirstInSession, exchange); logger.debug("Enqueuing event (client generated cookies): {}/{}/{}/{}", partyId, sessionId, pageViewId, eventId); - processingPool.enqueueIncomingExchangeForProcessing(partyId, event); + processingPool.enqueue(Item.of(sourceIndex, partyId.value, event)); } static DivolteEvent buildBrowserEventData(final boolean corruptEvent, diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java index dabd7135..f30d9333 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessingPool.java @@ -16,90 +16,55 @@ package io.divolte.server; -import java.io.File; import java.io.IOException; import java.nio.file.Paths; import java.util.Optional; -import javax.annotation.Nullable; import javax.annotation.ParametersAreNonnullByDefault; -import org.apache.avro.Schema; -import org.apache.avro.Schema.Parser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import io.divolte.record.DefaultEventRecord; +import com.google.common.collect.ImmutableMap; + import io.divolte.server.config.ValidatedConfiguration; -import io.divolte.server.hdfs.HdfsFlushingPool; import io.divolte.server.ip2geo.ExternalDatabaseLookupService; import io.divolte.server.ip2geo.LookupService; -import io.divolte.server.kafka.KafkaFlushingPool; import io.divolte.server.processing.ProcessingPool; @ParametersAreNonnullByDefault final class IncomingRequestProcessingPool extends ProcessingPool { private final static Logger logger = 
LoggerFactory.getLogger(IncomingRequestProcessingPool.class); - private final Optional kafkaPool; - private final Optional hdfsPool; - - public IncomingRequestProcessingPool(final ValidatedConfiguration vc, final IncomingRequestListener listener) { + public IncomingRequestProcessingPool(final ValidatedConfiguration vc, + final SchemaRegistry schemaRegistry, + final ImmutableMap> sinksByName, + final IncomingRequestListener listener) { this ( - vc.configuration().incomingRequestProcessor.threads, - vc.configuration().incomingRequestProcessor.maxWriteQueue, - vc.configuration().incomingRequestProcessor.maxEnqueueDelay.toMillis(), vc, - schemaFromConfig(vc), - vc.configuration().kafkaFlusher.enabled ? new KafkaFlushingPool(vc) : null, - vc.configuration().hdfsFlusher.enabled ? new HdfsFlushingPool(vc, schemaFromConfig(vc)) : null, + schemaRegistry, + sinksByName, lookupServiceFromConfig(vc), listener ); } public IncomingRequestProcessingPool( - final int numThreads, - final int maxQueueSize, - final long maxEnqueueDelay, final ValidatedConfiguration vc, - final Schema schema, - @Nullable final KafkaFlushingPool kafkaFlushingPool, - @Nullable final HdfsFlushingPool hdfsFlushingPool, - @Nullable final LookupService geoipLookupService, + final SchemaRegistry schemaRegistry, + final ImmutableMap> sinksByName, + final Optional geoipLookupService, final IncomingRequestListener listener) { super( - numThreads, - maxQueueSize, - maxEnqueueDelay, + vc.configuration().global.mapper.threads, + vc.configuration().global.mapper.bufferSize, "Incoming Request Processor", - () -> new IncomingRequestProcessor(vc, kafkaFlushingPool, hdfsFlushingPool, geoipLookupService, schema, listener)); - - this.kafkaPool = Optional.ofNullable(kafkaFlushingPool); - this.hdfsPool = Optional.ofNullable(hdfsFlushingPool); - } - - private static Schema schemaFromConfig(final ValidatedConfiguration vc) { - return vc.configuration().tracking.schemaFile - .map((schemaFileName) -> { - final Parser parser = new Schema.Parser(); - logger.info("Using Avro schema from configuration: {}", schemaFileName); - try { - return parser.parse(new File(schemaFileName)); - } catch(final IOException ioe) { - logger.error("Failed to load Avro schema file."); - throw new RuntimeException("Failed to load Avro schema file.", ioe); - } - }) - .orElseGet(() -> { - logger.info("Using built in default Avro schema."); - return DefaultEventRecord.getClassSchema(); - }); + () -> new IncomingRequestProcessor(vc, sinksByName, geoipLookupService, schemaRegistry, listener)); } - @Nullable - private static LookupService lookupServiceFromConfig(final ValidatedConfiguration vc) { - return vc.configuration().tracking.ip2geoDatabase + private static Optional lookupServiceFromConfig(final ValidatedConfiguration vc) { + // XXX: This service should be a singleton, instead of per-pool. 
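        // Note: the GeoIP database path is now configured under the global mapper
        // settings (previously under the 'tracking' section). When no path is
        // configured, the Optional stays empty and no lookup service is created.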
+ return vc.configuration().global.mapper.ip2geoDatabase .map((path) -> { try { return new ExternalDatabaseLookupService(Paths.get(path)); @@ -107,19 +72,6 @@ private static LookupService lookupServiceFromConfig(final ValidatedConfiguratio logger.error("Failed to configure GeoIP database: " + path, e); throw new RuntimeException("Failed to configure GeoIP lookup service.", e); } - }) - .orElse(null); - } - - public void enqueueIncomingExchangeForProcessing(final DivolteIdentifier partyId, final DivolteEvent event) { - enqueue(partyId.value, event); - } - - @Override - public void stop() { - super.stop(); - - kafkaPool.ifPresent(KafkaFlushingPool::stop); - hdfsPool.ifPresent(HdfsFlushingPool::stop); + }); } } diff --git a/src/main/java/io/divolte/server/IncomingRequestProcessor.java b/src/main/java/io/divolte/server/IncomingRequestProcessor.java index 0ccf77e9..e75c5017 100644 --- a/src/main/java/io/divolte/server/IncomingRequestProcessor.java +++ b/src/main/java/io/divolte/server/IncomingRequestProcessor.java @@ -18,159 +18,142 @@ import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; -import java.util.Objects; +import java.util.ArrayList; +import java.util.Map; import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.IntStream; -import javax.annotation.Nullable; import javax.annotation.ParametersAreNonnullByDefault; -import org.apache.avro.Schema; -import org.apache.avro.generic.GenericRecord; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; -import io.divolte.record.DefaultEventRecord; import io.divolte.server.config.ValidatedConfiguration; -import io.divolte.server.hdfs.HdfsFlusher; -import io.divolte.server.hdfs.HdfsFlushingPool; import io.divolte.server.ip2geo.LookupService; -import io.divolte.server.kafka.KafkaFlusher; -import io.divolte.server.kafka.KafkaFlushingPool; +import io.divolte.server.processing.Item; import io.divolte.server.processing.ItemProcessor; import io.divolte.server.processing.ProcessingPool; -import io.divolte.server.recordmapping.DslRecordMapper; -import io.divolte.server.recordmapping.DslRecordMapping; -import io.divolte.server.recordmapping.RecordMapper; -import io.divolte.server.recordmapping.UserAgentParserAndCache; import io.undertow.util.AttachmentKey; @ParametersAreNonnullByDefault public final class IncomingRequestProcessor implements ItemProcessor { - private static final Logger logger = LoggerFactory.getLogger(IncomingRequestProcessor.class); - public static final AttachmentKey DUPLICATE_EVENT_KEY = AttachmentKey.create(Boolean.class); - @Nullable - private final ProcessingPool kafkaFlushingPool; - @Nullable - private final ProcessingPool hdfsFlushingPool; - - private final IncomingRequestListener listener; - - private final RecordMapper mapper; - - private final boolean keepCorrupted; - private final ShortTermDuplicateMemory memory; - private final boolean keepDuplicates; + + // Given a source index, which mappings do we need to apply. + private final ImmutableList> mappingsBySourceIndex; + // Given a mapping index, which sinks do we need to send it to. 
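    // (Both tables are positional lists rather than maps, so routing lookups are
    // plain array-index operations; see the constructor comments below.)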
+ private final ImmutableList>> sinksByMappingIndex; public IncomingRequestProcessor(final ValidatedConfiguration vc, - @Nullable final KafkaFlushingPool kafkaFlushingPool, - @Nullable final HdfsFlushingPool hdfsFlushingPool, - @Nullable final LookupService geoipLookupService, - final Schema schema, + final ImmutableMap> sinksByName, + final Optional geoipLookupService, + final SchemaRegistry schemaRegistry, final IncomingRequestListener listener) { - this.kafkaFlushingPool = kafkaFlushingPool; - this.hdfsFlushingPool = hdfsFlushingPool; - this.listener = Objects.requireNonNull(listener); - - keepCorrupted = !vc.configuration().incomingRequestProcessor.discardCorrupted; - - memory = new ShortTermDuplicateMemory(vc.configuration().incomingRequestProcessor.duplicateMemorySize); - keepDuplicates = !vc.configuration().incomingRequestProcessor.discardDuplicates; - - mapper = vc.configuration().tracking.schemaMapping - .map((smc) -> { - final int version = smc.version; - switch(version) { - case 1: - logger.error("Version 1 configuration version had been deprecated and is no longer supported."); - throw new RuntimeException("Unsupported schema mapping config version: " + version); - case 2: - logger.info("Using script based schema mapping."); - return new DslRecordMapper( - vc, - Objects.requireNonNull(schema), - Optional.ofNullable(geoipLookupService)); - default: - throw new RuntimeException("Unsupported schema mapping config version: " + version); - } - }) - .orElseGet(() -> { - logger.info("Using built in default schema mapping."); - return new DslRecordMapper(DefaultEventRecord.getClassSchema(), defaultRecordMapping(vc)); - }); - } - - private DslRecordMapping defaultRecordMapping(final ValidatedConfiguration vc) { - final DslRecordMapping result = new DslRecordMapping(DefaultEventRecord.getClassSchema(), new UserAgentParserAndCache(vc), Optional.empty()); - result.map("detectedCorruption", result.corrupt()); - result.map("detectedDuplicate", result.duplicate()); - result.map("firstInSession", result.firstInSession()); - result.map("timestamp", result.timestamp()); - result.map("clientTimestamp", result.clientTimestamp()); - result.map("remoteHost", result.remoteHost()); - result.map("referer", result.referer()); - result.map("location", result.location()); - result.map("viewportPixelWidth", result.viewportPixelWidth()); - result.map("viewportPixelHeight", result.viewportPixelHeight()); - result.map("screenPixelWidth", result.screenPixelWidth()); - result.map("screenPixelHeight", result.screenPixelHeight()); - result.map("partyId", result.partyId()); - result.map("sessionId", result.sessionId()); - result.map("pageViewId", result.pageViewId()); - result.map("eventType", result.eventType()); - result.map("userAgentString", result.userAgentString()); - final DslRecordMapping.UserAgentValueProducer userAgent = result.userAgent(); - result.map("userAgentName", userAgent.name()); - result.map("userAgentFamily", userAgent.family()); - result.map("userAgentVendor", userAgent.vendor()); - result.map("userAgentType", userAgent.type()); - result.map("userAgentVersion", userAgent.version()); - result.map("userAgentDeviceCategory", userAgent.deviceCategory()); - result.map("userAgentOsFamily", userAgent.osFamily()); - result.map("userAgentOsVersion", userAgent.osVersion()); - result.map("userAgentOsVendor", userAgent.osVendor()); - return result; + memory = new ShortTermDuplicateMemory(vc.configuration().global.mapper.duplicateMemorySize); + + /* + * Create all Mapping instances based on their config. 
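     * One Mapping instance is created per entry in the mappings configuration;
     * each instance obtains its schema from the SchemaRegistry and builds its
     * own record mapper.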
+ */ + final Map mappingsByName = vc.configuration() + .mappings + .entrySet() + .stream() + .collect(Collectors.toMap(Map.Entry::getKey, + kv -> new Mapping(vc, + kv.getKey(), + geoipLookupService, + schemaRegistry, + listener))); + + /* + * Create a mapping from source index to a list of Mapping's that apply + * to events generated from that source index. Finally, we use a + * ImmutableList> as result, not a + * Map> because that way the backing + * data structure is effectively a two-dimensional array and no hashing + * is required for retrieval (list indexes are ints already). + */ + final ArrayList> sourceMappingResult = // temporary mutable container for the result + IntStream.range(0, vc.configuration().sources.size()) + .>mapToObj(ignored -> ImmutableList.of()) // initialized with empty lists per default + .collect(Collectors.toCollection(ArrayList::new)); + + vc.configuration() + .mappings + .entrySet() + .stream() // stream of entries (mapping_name, mapping_configuration) + .flatMap(kv -> kv.getValue() + .sources + .stream() + .map(s -> Maps.immutableEntry(vc.configuration().sourceIndex(s), + kv.getKey()))) // Results in stream of (source_index, mapping_name) + .collect(Collectors.groupingBy(Map.Entry::getKey, + Collectors.mapping(e -> mappingsByName.get(e.getValue()), + MoreCollectors.toImmutableList()) + )) // Results in a Map> where the key is the source index + .forEach(sourceMappingResult::set); // Populate the temporary result in ArrayList> + + mappingsBySourceIndex = ImmutableList.copyOf(sourceMappingResult); // Make immutable copy + + /* + * Create a mapping from mapping index to a list of sinks (ProcessingPools) + * that apply for events that came from the given mapping. Similar as above, + * we transform the result into a list of lists, instead of a map in order + * to make sure the underlying lookups are array index lookups instead of + * hash map lookups. + * + * Note that we need to know the sinks for a mapping here, instead of on the + * sink thread side, since we have one pool per sink at this moment. Later + * we'll likely move to one pool per sink type (i.e. Kafka, HDFS) and leave + * it to that pool to multiplex events to different sinks destinations (HDFS + * files or Kafka topics), which should move this code elsewhere. + */ + final ArrayList>> mappingMappingResult = // temporary mutable container for the result + IntStream.range(0, vc.configuration().mappings.size()) + .>>mapToObj(ignored -> ImmutableList.of()) // initialized with empty lists per default + .collect(Collectors.toCollection(ArrayList::new)); + + /* + * Without the intermediate variable (collected), The Eclipse compiler's type + * inference doesn't know how to handle this. Don't know about Oracle Java compiler. + */ + final Map>> collected = vc.configuration() + .mappings + .entrySet() + .stream() + .flatMap(kv->kv.getValue() + .sinks + .stream() + .map(s -> Maps.immutableEntry(vc.configuration().mappingIndex(kv.getKey()), s))) + .filter(e -> sinksByName.containsKey(e.getValue())) + .collect(Collectors.groupingBy(Map.Entry::getKey, + Collectors.mapping(e -> sinksByName.get(e.getValue()), + MoreCollectors.toImmutableList()))); + collected.forEach(mappingMappingResult::set); + + sinksByMappingIndex = ImmutableList.copyOf(mappingMappingResult); } @Override - public ProcessingDirective process(final DivolteEvent event) { - if (!event.corruptEvent || keepCorrupted) { - /* - * Note: we cannot use the actual query string here, - * as the incoming request processor is agnostic of - * that sort of thing. 
The request may have come from - * an endpoint that doesn't require a query string, - * but rather generates these IDs on the server side. - */ - final boolean duplicate = memory.isProbableDuplicate(event.partyCookie.value, event.sessionCookie.value, event.eventId); - event.exchange.putAttachment(DUPLICATE_EVENT_KEY, duplicate); - - if (!duplicate || keepDuplicates) { - final GenericRecord avroRecord = mapper.newRecordFromExchange(event); - final AvroRecordBuffer avroBuffer = AvroRecordBuffer.fromRecord( - event.partyCookie, - event.sessionCookie, - event.requestStartTime, - event.clientUtcOffset, - avroRecord); - listener.incomingRequest(event, avroBuffer, avroRecord); - doProcess(avroBuffer); - } - } - + public ProcessingDirective process(final Item item) { + final DivolteEvent event = item.payload; + + final boolean duplicate = memory.isProbableDuplicate(event.partyCookie.value, event.sessionCookie.value, event.eventId); + event.exchange.putAttachment(DUPLICATE_EVENT_KEY, duplicate); + + mappingsBySourceIndex.get(item.sourceId) + .stream() // For each mapping that applies to this source + .map(mapping -> mapping.map(item, duplicate)) + .filter(Optional::isPresent) // Filter discarded for duplication or corruption + .map(Optional::get) + .forEach(bufferItem -> sinksByMappingIndex.get(bufferItem.sourceId) + .stream() // For each sink that applies to this mapping + .forEach(sink -> sink.enqueue(bufferItem))); return CONTINUE; } - - private void doProcess(final AvroRecordBuffer avroBuffer) { - - if (null != kafkaFlushingPool) { - kafkaFlushingPool.enqueue(avroBuffer.getPartyId().value, avroBuffer); - } - if (null != hdfsFlushingPool) { - hdfsFlushingPool.enqueue(avroBuffer.getPartyId().value, avroBuffer); - } - } } diff --git a/src/main/java/io/divolte/server/Mapping.java b/src/main/java/io/divolte/server/Mapping.java new file mode 100644 index 00000000..97ee87af --- /dev/null +++ b/src/main/java/io/divolte/server/Mapping.java @@ -0,0 +1,112 @@ +package io.divolte.server; + +import java.util.Optional; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import io.divolte.record.DefaultEventRecord; +import io.divolte.server.config.MappingConfiguration; +import io.divolte.server.config.ValidatedConfiguration; +import io.divolte.server.ip2geo.LookupService; +import io.divolte.server.processing.Item; +import io.divolte.server.recordmapping.DslRecordMapper; +import io.divolte.server.recordmapping.DslRecordMapping; +import io.divolte.server.recordmapping.RecordMapper; +import io.divolte.server.recordmapping.UserAgentParserAndCache; + +public class Mapping { + private static final Logger logger = LoggerFactory.getLogger(Mapping.class); + + private final RecordMapper mapper; + private final boolean keepCorrupted; + private final boolean keepDuplicates; + private final int mappingIndex; + + private final IncomingRequestListener listener; + + public Mapping( + final ValidatedConfiguration vc, + final String mappingName, + final Optional geoipLookupService, + final SchemaRegistry schemaRegistry, + final IncomingRequestListener listener) { + this.listener = listener; + + final MappingConfiguration mappingConfiguration = vc.configuration().mappings.get(mappingName); + final Schema schema = schemaRegistry.getSchemaByMappingName(mappingName); + + this.mappingIndex = vc.configuration().mappingIndex(mappingName); + this.keepCorrupted = !mappingConfiguration.discardCorrupted; + this.keepDuplicates = 
!mappingConfiguration.discardDuplicates; + + this.mapper = mappingConfiguration.mappingScriptFile + .map((mappingScriptFile) -> { + logger.info("Using script based schema mapping."); + return new DslRecordMapper(vc, mappingScriptFile, schema, geoipLookupService); + }).orElseGet(() -> { + logger.info("Using built in default schema mapping."); + return new DslRecordMapper(DefaultEventRecord.getClassSchema(), defaultRecordMapping(vc)); + }); + } + + private DslRecordMapping defaultRecordMapping(final ValidatedConfiguration vc) { + final DslRecordMapping result = new DslRecordMapping(DefaultEventRecord.getClassSchema(), new UserAgentParserAndCache(vc), Optional.empty()); + result.map("detectedCorruption", result.corrupt()); + result.map("detectedDuplicate", result.duplicate()); + result.map("firstInSession", result.firstInSession()); + result.map("timestamp", result.timestamp()); + result.map("clientTimestamp", result.clientTimestamp()); + result.map("remoteHost", result.remoteHost()); + result.map("referer", result.referer()); + result.map("location", result.location()); + result.map("viewportPixelWidth", result.viewportPixelWidth()); + result.map("viewportPixelHeight", result.viewportPixelHeight()); + result.map("screenPixelWidth", result.screenPixelWidth()); + result.map("screenPixelHeight", result.screenPixelHeight()); + result.map("partyId", result.partyId()); + result.map("sessionId", result.sessionId()); + result.map("pageViewId", result.pageViewId()); + result.map("eventType", result.eventType()); + result.map("userAgentString", result.userAgentString()); + final DslRecordMapping.UserAgentValueProducer userAgent = result.userAgent(); + result.map("userAgentName", userAgent.name()); + result.map("userAgentFamily", userAgent.family()); + result.map("userAgentVendor", userAgent.vendor()); + result.map("userAgentType", userAgent.type()); + result.map("userAgentVersion", userAgent.version()); + result.map("userAgentDeviceCategory", userAgent.deviceCategory()); + result.map("userAgentOsFamily", userAgent.osFamily()); + result.map("userAgentOsVersion", userAgent.osVersion()); + result.map("userAgentOsVendor", userAgent.osVendor()); + return result; + } + + public Optional> map(final Item item, final boolean duplicate) { + final DivolteEvent event = item.payload; + if ( + (keepDuplicates || !duplicate) && + (keepCorrupted || !event.corruptEvent)) { + final GenericRecord avroRecord = mapper.newRecordFromExchange(event); + final AvroRecordBuffer avroBuffer = AvroRecordBuffer.fromRecord( + event.partyCookie, + event.sessionCookie, + event.requestStartTime, + event.clientUtcOffset, + avroRecord); + + /* + * We should really think of a way to get rid of this and test the + * mapping process in isolation of the server. + * In the many-to-many setup, this call is potentially amplified. 
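     * (When several mappings consume the same source, the listener observes the
     * same incoming event once per applicable mapping.)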
+ */ + listener.incomingRequest(event, avroBuffer, avroRecord); + + return Optional.of(Item.withCopiedAffinity(mappingIndex, item, avroBuffer)); + } else { + return Optional.empty(); + } + } +} diff --git a/src/main/java/io/divolte/server/MappingTestServer.java b/src/main/java/io/divolte/server/MappingTestServer.java index ca755793..7bc96d9f 100644 --- a/src/main/java/io/divolte/server/MappingTestServer.java +++ b/src/main/java/io/divolte/server/MappingTestServer.java @@ -94,7 +94,7 @@ private Schema loadSchema(final String schemaFilename) throws IOException { @Nullable private static LookupService lookupServiceFromConfig(final ValidatedConfiguration vc) { - return vc.configuration().tracking.ip2geoDatabase + return vc.configuration().global.mapper.ip2geoDatabase .map((path) -> { try { return new ExternalDatabaseLookupService(Paths.get(path)); diff --git a/src/main/java/io/divolte/server/MoreCollectors.java b/src/main/java/io/divolte/server/MoreCollectors.java new file mode 100644 index 00000000..c3a962dc --- /dev/null +++ b/src/main/java/io/divolte/server/MoreCollectors.java @@ -0,0 +1,43 @@ +package io.divolte.server; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMultimap; +import com.google.common.collect.ImmutableSet; + +import java.util.Map; +import java.util.stream.Collector; + +public final class MoreCollectors { + private MoreCollectors() { + // Prevent external instantiation. + } + + public static Collector, ImmutableList> toImmutableList() { + return Collector.of(ImmutableList.Builder::new, + ImmutableList.Builder::add, + (l, r) -> l.addAll(r.build()), + ImmutableList.Builder::build); + } + + public static Collector, ImmutableSet> toImmutableSet() { + return Collector.of(ImmutableSet.Builder::new, + ImmutableSet.Builder::add, + (l, r) -> l.addAll(r.build()), + ImmutableSet.Builder::build); + } + + public static Collector, ImmutableMap.Builder, ImmutableMap> toImmutableMap() { + return Collector.of(ImmutableMap.Builder::new, + ImmutableMap.Builder::put, + (l, r) -> l.putAll(r.build()), + ImmutableMap.Builder::build); + } + + public static Collector, ImmutableMultimap.Builder, ImmutableMultimap> toImmutableMultimap() { + return Collector.of(ImmutableMultimap.Builder::new, + ImmutableMultimap.Builder::put, + (l, r) -> l.putAll(r.build()), + ImmutableMultimap.Builder::build); + } +} diff --git a/src/main/java/io/divolte/server/SchemaRegistry.java b/src/main/java/io/divolte/server/SchemaRegistry.java new file mode 100644 index 00000000..79685221 --- /dev/null +++ b/src/main/java/io/divolte/server/SchemaRegistry.java @@ -0,0 +1,93 @@ +package io.divolte.server; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import io.divolte.record.DefaultEventRecord; +import io.divolte.server.config.MappingConfiguration; +import io.divolte.server.config.ValidatedConfiguration; +import org.apache.avro.Schema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.File; +import java.io.IOException; +import java.util.Optional; + +@ParametersAreNonnullByDefault +public class SchemaRegistry { + private static final Logger logger = LoggerFactory.getLogger(SchemaRegistry.class); + + private final ImmutableMap schemasByMappingName; + private final ImmutableMap schemasBySinkName; + + public SchemaRegistry(final ValidatedConfiguration vc) { + final 
ImmutableMap<String, MappingConfiguration> mappings = vc.configuration().mappings; + + // Build a mapping of the schema location for each mapping. + final ImmutableMap<String, Optional<String>> schemaLocationsByMapping = + ImmutableMap.copyOf(Maps.transformValues(mappings, config -> config.schemaFile)); + + // Load the actual schemas. Once. + logger.debug("Loading schemas for mappings: {}", schemaLocationsByMapping.keySet()); + final ImmutableMap<Optional<String>,Schema> schemasByLocation = + schemaLocationsByMapping.values() + .stream() + .distinct() + .map(schemaLocation -> + Maps.immutableEntry(schemaLocation, loadSchema(schemaLocation))) + .collect(MoreCollectors.toImmutableMap()); + + // Store the schema for each mapping. + schemasByMappingName = + ImmutableMap.copyOf(Maps.transformValues(schemaLocationsByMapping, schemasByLocation::get)); + logger.info("Loaded schemas used for mappings: {}", schemasByMappingName.keySet()); + + // Also calculate an inverse mapping by sink name. + // (Validation will ensure that multiple mappings for each sink have the same value.) + schemasBySinkName = + mappings.values() + .stream() + .flatMap(config -> config.sinks + .stream() + .map(sink -> + Maps.immutableEntry(sink, + schemasByLocation.get(config.schemaFile)))) + .distinct() + .collect(MoreCollectors.toImmutableMap()); + logger.info("Inferred schemas used for sinks: {}", schemasBySinkName.keySet()); + } + + public Schema getSchemaByMappingName(final String mappingName) { + final Schema schema = schemasByMappingName.get(mappingName); + Preconditions.checkArgument(null != schema, "Illegal mapping name: %s", mappingName); + return schema; + } + + public Schema getSchemaBySinkName(final String sinkName) { + final Schema schema = schemasBySinkName.get(sinkName); + // This means that the sink either doesn't exist, or isn't associated with a mapping. + // (Without a mapping, we can't infer the schema.)
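The registry above resolves an Avro schema either by mapping name or by sink name, falling back to the built-in DefaultEventRecord schema when a mapping declares no schema file. A minimal usage sketch, assuming the default mapping and sink names ("default", "hdfs") from DivolteConfiguration; the wrapper class is illustrative, not part of this change:

import org.apache.avro.Schema;
import io.divolte.server.SchemaRegistry;
import io.divolte.server.config.ValidatedConfiguration;

final class SchemaRegistrySketch {
    static void resolve(final ValidatedConfiguration vc) {
        final SchemaRegistry registry = new SchemaRegistry(vc);
        // Every mapping has a schema: either its configured schema file or the built-in default.
        final Schema mappingSchema = registry.getSchemaByMappingName("default");
        // A sink's schema is inferred from the mapping(s) that feed it; an unknown or
        // unreferenced sink name fails fast with an IllegalArgumentException.
        final Schema sinkSchema = registry.getSchemaBySinkName("hdfs");
        System.out.println(mappingSchema.getFullName() + " -> " + sinkSchema.getFullName());
    }
}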
+ Preconditions.checkArgument(null != schema, "Illegal sink name: %s", sinkName); + return schema; + } + + private static Schema loadSchema(final Optional schemaLocation) { + return schemaLocation + .map(filename -> { + final Schema.Parser parser = new Schema.Parser(); + logger.info("Loading Avro schema from path: {}", filename); + try { + return parser.parse(new File(filename)); + } catch(final IOException ioe) { + logger.error("Failed to load Avro schema file."); + throw new RuntimeException("Failed to load Avro schema file.", ioe); + } + }) + .orElseGet(() -> { + logger.info("Using builtin default Avro schema."); + return DefaultEventRecord.getClassSchema(); + }); + } +} diff --git a/src/main/java/io/divolte/server/Server.java b/src/main/java/io/divolte/server/Server.java index 39f748bb..51d0f0f9 100644 --- a/src/main/java/io/divolte/server/Server.java +++ b/src/main/java/io/divolte/server/Server.java @@ -16,8 +16,26 @@ package io.divolte.server; +import java.io.IOException; +import java.time.Duration; +import java.util.Map; +import java.util.Optional; + +import javax.annotation.ParametersAreNonnullByDefault; + +import org.apache.hadoop.fs.FileSystem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Maps; +import com.typesafe.config.ConfigFactory; + +import io.divolte.server.config.HdfsSinkConfiguration; +import io.divolte.server.config.KafkaSinkConfiguration; import io.divolte.server.config.ValidatedConfiguration; -import io.divolte.server.js.TrackingJavaScriptResource; +import io.divolte.server.processing.ProcessingPool; import io.undertow.Undertow; import io.undertow.server.HttpHandler; import io.undertow.server.handlers.CanonicalPathHandler; @@ -30,19 +48,6 @@ import io.undertow.server.handlers.resource.ResourceHandler; import io.undertow.server.handlers.resource.ResourceManager; import io.undertow.util.Headers; -import io.undertow.util.Methods; - -import java.io.IOException; -import java.time.Duration; -import java.util.Optional; - -import javax.annotation.ParametersAreNonnullByDefault; - -import org.apache.hadoop.fs.FileSystem; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.typesafe.config.ConfigFactory; @ParametersAreNonnullByDefault public final class Server implements Runnable { @@ -51,7 +56,8 @@ public final class Server implements Runnable { private final Undertow undertow; private final GracefulShutdownHandler shutdownHandler; - private final IncomingRequestProcessingPool processingPool; + private final ImmutableMap> sinks; + private final IncomingRequestProcessingPool incomingRequestProcessingPool; private final Optional host; private final int port; @@ -61,29 +67,68 @@ public Server(final ValidatedConfiguration vc) { } Server(final ValidatedConfiguration vc, final IncomingRequestListener listener) { - host = vc.configuration().server.host; - port = vc.configuration().server.port; - - processingPool = new IncomingRequestProcessingPool(vc, listener); - final ClientSideCookieEventHandler clientSideCookieEventHandler = - new ClientSideCookieEventHandler(processingPool); - final TrackingJavaScriptResource trackingJavaScript = loadTrackingJavaScript(vc); - final HttpHandler javascriptHandler = new AllowedMethodsHandler(new JavaScriptHandler(trackingJavaScript), Methods.GET); - - final PathHandler handler = new PathHandler(); - handler.addExactPath("/csc-event", - new 
AllowedMethodsHandler(clientSideCookieEventHandler, Methods.GET)); - handler.addExactPath('/' + trackingJavaScript.getScriptName(), javascriptHandler); - handler.addExactPath("/ping", PingHandler::handlePingRequest); - if (vc.configuration().server.serveStaticResources) { + host = vc.configuration().global.server.host; + port = vc.configuration().global.server.port; + + // First thing we need to do is load all the schemas: the sinks need these, but they come from the + // mappings. + final SchemaRegistry schemaRegistry = new SchemaRegistry(vc); + + // Build a set of referenced sinks. These are the only ones we need to instantiate. + final ImmutableSet referencedSinkNames = + vc.configuration().mappings.values() + .stream() + .flatMap(mc -> mc.sinks.stream()) + .collect(MoreCollectors.toImmutableSet()); + + // Instantiate the active sinks: + // - As a practical matter, unreferenced sinks have no associated schema, which means they + // can't be initialized. + // - This is also where we check whether HDFS and Kafka are globally enabled/disabled. + logger.debug("Initializing active sinks..."); + sinks = vc.configuration().sinks.entrySet() + .stream() + .filter(sink -> referencedSinkNames.contains(sink.getKey())) + .filter(sink -> vc.configuration().global.hdfs.enabled || !(sink.getValue() instanceof HdfsSinkConfiguration)) + .filter(sink -> vc.configuration().global.kafka.enabled || !(sink.getValue() instanceof KafkaSinkConfiguration)) + .>>map(sink -> + Maps.immutableEntry(sink.getKey(), + sink.getValue() + .getFactory() + .create(vc, sink.getKey(), schemaRegistry))) + .collect(MoreCollectors.toImmutableMap()); + logger.info("Initialized sinks: {}", sinks.keySet()); + + logger.debug("Initializing mappings..."); + incomingRequestProcessingPool = new IncomingRequestProcessingPool(vc, schemaRegistry, sinks, listener); + + logger.debug("Initializing sources..."); + // Now instantiate all the sources. We do this in parallel because instantiation can be quite slow. + final ImmutableMap sources = + vc.configuration().sources.keySet() + .parallelStream() + .map(name -> + Maps.immutableEntry(name, new BrowserSource(vc, name, incomingRequestProcessingPool))) + .collect(MoreCollectors.toImmutableMap()); + logger.debug("Attaching sources: {}", sources.keySet()); + // Once all created we can attach them to the server. This has to be done sequentially. + PathHandler pathHandler = new PathHandler(); + for (final BrowserSource browserSource : sources.values()) { + pathHandler = browserSource.attachToPathHandler(pathHandler); + } + logger.info("Initialized sources: {}", sources.keySet()); + + pathHandler.addExactPath("/ping", PingHandler::handlePingRequest); + if (vc.configuration().global.server.serveStaticResources) { // Catch-all handler; must be last if present. - handler.addPrefixPath("/", createStaticResourceHandler()); + // XXX: Our static resources assume the default 'browser' endpoint. + pathHandler.addPrefixPath("/", createStaticResourceHandler()); } final SetHeaderHandler headerHandler = - new SetHeaderHandler(handler, Headers.SERVER_STRING, "divolte"); + new SetHeaderHandler(pathHandler, Headers.SERVER_STRING, "divolte"); final HttpHandler canonicalPathHandler = new CanonicalPathHandler(headerHandler); final GracefulShutdownHandler rootHandler = new GracefulShutdownHandler( - vc.configuration().server.useXForwardedFor ? + vc.configuration().global.server.useXForwardedFor ? 
new ProxyAdjacentPeerAddressHandler(canonicalPathHandler) : canonicalPathHandler ); @@ -94,17 +139,9 @@ public Server(final ValidatedConfiguration vc) { .build(); } - private TrackingJavaScriptResource loadTrackingJavaScript(final ValidatedConfiguration vc) { - try { - return new TrackingJavaScriptResource(vc); - } catch (final IOException e) { - throw new RuntimeException("Could not precompile tracking JavaScript.", e); - } - } - - private HttpHandler createStaticResourceHandler() { + private static HttpHandler createStaticResourceHandler() { final ResourceManager staticResources = - new ClassPathResourceManager(getClass().getClassLoader(), "static"); + new ClassPathResourceManager(Server.class.getClassLoader(), "static"); // Cache tuning is copied from Undertow unit tests. final ResourceManager cachedResources = new CachingResourceManager(100, 65536, @@ -135,7 +172,9 @@ public void shutdown() { } logger.info("Stopping thread pools."); - processingPool.stop(); + // Stop the mappings before the sinks to ensure work in progress doesn't get stranded. + incomingRequestProcessingPool.stop(); + sinks.values().forEach(ProcessingPool::stop); logger.info("Closing HDFS filesystem connection."); try { @@ -148,8 +187,8 @@ public void shutdown() { public static void main(final String[] args) { final ValidatedConfiguration vc = new ValidatedConfiguration(ConfigFactory::load); if (!vc.isValid()) { - System.err.println("There are configuration errors. Details:"); - vc.errors().forEach(System.err::println); + vc.errors().forEach(logger::error); + logger.error("There are configuration errors. Exiting server."); System.exit(1); } diff --git a/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java b/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java new file mode 100644 index 00000000..c6fcfe4b --- /dev/null +++ b/src/main/java/io/divolte/server/config/BrowserSourceConfiguration.java @@ -0,0 +1,76 @@ +package io.divolte.server.config; + +import java.time.Duration; +import java.util.Objects; +import java.util.Optional; + +import javax.annotation.Nonnull; +import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.ParametersAreNullableByDefault; +import javax.validation.Valid; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.MoreObjects; + +@ParametersAreNonnullByDefault +public class BrowserSourceConfiguration extends SourceConfiguration { + private static final String DEFAULT_PREFIX = "/"; + private static final String DEFAULT_PARTY_COOKIE = "_dvp"; + private static final String DEFAULT_PARTY_TIMEOUT = "730 days"; + private static final String DEFAULT_SESSION_COOKIE = "_dvs"; + private static final String DEFAULT_SESSION_TIMEOUT = "30 minutes"; + + public static final BrowserSourceConfiguration DEFAULT_BROWSER_SOURCE_CONFIGURATION = new BrowserSourceConfiguration( + DEFAULT_PREFIX, + Optional.empty(), + DEFAULT_PARTY_COOKIE, + DurationDeserializer.parseDuration(DEFAULT_PARTY_TIMEOUT), + DEFAULT_SESSION_COOKIE, + DurationDeserializer.parseDuration(DEFAULT_SESSION_TIMEOUT), + JavascriptConfiguration.DEFAULT_JAVASCRIPT_CONFIGURATION); + + public final String prefix; + + public final Optional cookieDomain; + public final String partyCookie; + public final Duration partyTimeout; + public final String sessionCookie; + public final Duration sessionTimeout; + + @Valid + public final JavascriptConfiguration javascript; + + @JsonCreator + @ParametersAreNullableByDefault 
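The browser-source constructor that follows, like the other new configuration constructors in this change, applies its defaults by hand because Jackson's @JsonProperty(defaultValue=...) metadata is not used for actual defaulting (hence the recurring TODO about a custom deserializer). A condensed sketch of that pattern; the class and field here are illustrative only:

import java.time.Duration;
import java.util.Optional;
import io.divolte.server.config.DurationDeserializer;

final class DefaultingPatternSketch {
    private static final String DEFAULT_PARTY_TIMEOUT = "730 days";

    final Duration partyTimeout;

    // Jackson passes null for an omitted property, so the declared default is substituted manually.
    DefaultingPatternSketch(final Duration partyTimeout) {
        this.partyTimeout = Optional.ofNullable(partyTimeout)
                .orElseGet(() -> DurationDeserializer.parseDuration(DEFAULT_PARTY_TIMEOUT));
    }
}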
+ BrowserSourceConfiguration(@JsonProperty(defaultValue=DEFAULT_PREFIX) final String prefix, + @Nonnull final Optional cookieDomain, + @JsonProperty(defaultValue=DEFAULT_PARTY_COOKIE) final String partyCookie, + @JsonProperty(defaultValue=DEFAULT_PARTY_TIMEOUT) final Duration partyTimeout, + @JsonProperty(defaultValue=DEFAULT_SESSION_COOKIE) final String sessionCookie, + @JsonProperty(defaultValue=DEFAULT_SESSION_TIMEOUT) final Duration sessionTimeout, + final JavascriptConfiguration javascript) { + super(); + // TODO: register a custom deserializer with Jackson that uses the defaultValue property from the annotation to fix this + final String rawPrefix = Optional.ofNullable(prefix).map((p) -> p.endsWith("/") ? p : p + '/').orElse(DEFAULT_PREFIX); + this.prefix = rawPrefix.endsWith("/") ? rawPrefix : rawPrefix + '/'; + this.cookieDomain = Objects.requireNonNull(cookieDomain); + this.partyCookie = Optional.ofNullable(partyCookie).orElse(DEFAULT_PARTY_COOKIE); + this.partyTimeout = Optional.ofNullable(partyTimeout).orElseGet(() -> DurationDeserializer.parseDuration(DEFAULT_PARTY_TIMEOUT)); + this.sessionCookie = Optional.ofNullable(sessionCookie).orElse(DEFAULT_SESSION_COOKIE); + this.sessionTimeout = Optional.ofNullable(sessionTimeout).orElseGet(() -> DurationDeserializer.parseDuration(DEFAULT_SESSION_TIMEOUT)); + this.javascript = Optional.ofNullable(javascript).orElse(JavascriptConfiguration.DEFAULT_JAVASCRIPT_CONFIGURATION); + } + + @Override + protected MoreObjects.ToStringHelper toStringHelper() { + return super.toStringHelper() + .add("prefix", prefix) + .add("cookieDomain", cookieDomain) + .add("partyCookie", partyCookie) + .add("partyTimeout", partyTimeout) + .add("sessionCookie", sessionCookie) + .add("sessionTimeout", sessionTimeout) + .add("javascript", javascript); + } +} diff --git a/src/main/java/io/divolte/server/config/DivolteConfiguration.java b/src/main/java/io/divolte/server/config/DivolteConfiguration.java index 7a9b2421..bc41d3e8 100644 --- a/src/main/java/io/divolte/server/config/DivolteConfiguration.java +++ b/src/main/java/io/divolte/server/config/DivolteConfiguration.java @@ -1,37 +1,213 @@ package io.divolte.server.config; -import com.fasterxml.jackson.annotation.JsonCreator; +import java.util.HashSet; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; import javax.annotation.ParametersAreNonnullByDefault; import javax.validation.Valid; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + +import io.divolte.server.config.constraint.MappingSourceSinkReferencesMustExist; +import io.divolte.server.config.constraint.OneSchemaPerSink; +import io.divolte.server.config.constraint.SourceAndSinkNamesCannotCollide; + @ParametersAreNonnullByDefault +@MappingSourceSinkReferencesMustExist +@SourceAndSinkNamesCannotCollide +@OneSchemaPerSink public final class DivolteConfiguration { - @Valid public final ServerConfiguration server; - @Valid public final TrackingConfiguration tracking; - @Valid public final JavascriptConfiguration javascript; - @Valid public final IncomingRequestProcessorConfiguration incomingRequestProcessor; - @Valid public final 
KafkaFlusherConfiguration kafkaFlusher; - @Valid public final HdfsFlusherConfiguration hdfsFlusher; + @Valid public final GlobalConfiguration global; + + // Mappings, sources and sinks are all keyed by their name. + @Valid public final ImmutableMap mappings; + @Valid public final ImmutableMap sources; + @Valid public final ImmutableMap sinks; @JsonCreator - private DivolteConfiguration( - final ServerConfiguration server, - final TrackingConfiguration tracking, - final JavascriptConfiguration javascript, - final IncomingRequestProcessorConfiguration incomingRequestProcessor, - final KafkaFlusherConfiguration kafkaFlusher, - final HdfsFlusherConfiguration hdfsFlusher) { - this.server = server; - this.tracking = tracking; - this.javascript = javascript; - this.incomingRequestProcessor = incomingRequestProcessor; - this.kafkaFlusher = kafkaFlusher; - this.hdfsFlusher = hdfsFlusher; + DivolteConfiguration(final GlobalConfiguration global, + final Optional> sources, + final Optional> sinks, + final Optional> mappings) { + this.global = Objects.requireNonNull(global); + this.sources = sources.orElseGet(DivolteConfiguration::defaultSourceConfigurations); + this.sinks = sinks.orElseGet(DivolteConfiguration::defaultSinkConfigurations); + this.mappings = mappings.orElseGet(() -> defaultMappingConfigurations(this.sources.keySet(), this.sinks.keySet())); + } + + /* + * This performs a linear search over the map. Only use in startup code; + * avoid in inner loops. + */ + private static int position(final T key, final ImmutableMap map) { + final ImmutableList keyList = map.keySet().asList(); + return keyList.indexOf(key); + } + + /** + * This performs a linear search over the map. Only use in startup code; + * avoid in inner loops. + */ + public int sourceIndex(final String name) { + return position(name, sources); + } + + /** + * This performs a linear search over the map. Only use in startup code; + * avoid in inner loops. + */ + public int sinkIndex(final String name) { + return position(name, sinks); + } + + /** + * This performs a linear search over the map. Only use in startup code; + * avoid in inner loops. + */ + public int mappingIndex(final String name) { + return position(name, mappings); + } + + /** + * Retrieve the configuration for the source with the given name, casting it to an expected type. + * + * It is an error to request a source that doesn't exist or is of the wrong type: the caller is + * responsible for knowing the name is valid and the type of source. + * + * @param sourceName the name of the source whose configuration should be retrieved. + * @param sourceClass the class of the source configuration to retrieve. + * @param the type of the source configuration to retrieve. + * @return the configuration for the given source. + * @throws IllegalArgumentException + * if no configuration exists for the given source or its type is different + * to that expected. + */ + public T getSourceConfiguration(final String sourceName, final Class sourceClass) { + final SourceConfiguration sourceConfiguration = sources.get(sourceName); + Preconditions.checkArgument(null != sourceConfiguration, "No source configuration with name: %s", sourceName); + Preconditions.checkArgument(sourceClass.isInstance(sourceConfiguration), + "Source configuration '%s' is not a %s sink", sourceName, sourceClass.getSimpleName()); + return sourceClass.cast(sourceConfiguration); + } + + /** + * Retrieve the configuration for the mapping with the given name. 
+ * + * It is an error to request a mapping that doesn't exist: the caller is responsible for knowing + * the name is valid. + * + * @param mappingName the name of the mapping whose configuration should be retrieved. + * @return the configuration for the given mapping. + * @throws IllegalArgumentException + * if no configuration exists for the given mapping. + */ + public MappingConfiguration getMappingConfiguration(final String mappingName) { + final MappingConfiguration mappingConfiguration = mappings.get(mappingName); + Preconditions.checkArgument(null != mappingConfiguration, "No mapping configuration with name: %s", mappingName); + return mappingConfiguration; + } + + /** + * Retrieve the configuration for the sink with the given name, casting it to an expected type. + * + * It is an error to request a sink that doesn't exist or is of the wrong type: the caller is + * responsible for knowing the name is valid and the type of sink. + * + * @param sinkName the name of the sink whose configuration should be retrieved. + * @param sinkClass the class of the sink configuration to retrieve. + * @param the type of the sink configuration to retrieve. + * @return the configuration for the given sink. + * @throws IllegalArgumentException + * if no configuration exists for the given sink or its type is different + * to that expected. + */ + public T getSinkConfiguration(final String sinkName, final Class sinkClass) { + final SinkConfiguration sinkConfiguration = sinks.get(sinkName); + Preconditions.checkArgument(null != sinkConfiguration, "No sink configuration with name: %s", sinkName); + Preconditions.checkArgument(sinkClass.isInstance(sinkConfiguration), + "Sink configuration '%s' is not a %s sink", sinkName, sinkClass.getSimpleName()); + return sinkClass.cast(sinkConfiguration); + } + + // Defaults; these will eventually disappear + private static ImmutableMap defaultSourceConfigurations() { + return ImmutableMap.of("browser", BrowserSourceConfiguration.DEFAULT_BROWSER_SOURCE_CONFIGURATION); + } + + private static ImmutableMap defaultSinkConfigurations() { + return ImmutableMap.of("hdfs", new HdfsSinkConfiguration((short) 1, FileStrategyConfiguration.DEFAULT_FILE_STRATEGY_CONFIGURATION), + "kafka", new KafkaSinkConfiguration(null)); + } + + private static ImmutableMap defaultMappingConfigurations(final ImmutableSet sourceNames, + final ImmutableSet sinkNames) { + return ImmutableMap.of("default", new MappingConfiguration(Optional.empty(), + Optional.empty(), + sourceNames, + sinkNames, + false, + false)); } @Override public String toString() { - return "DivolteConfiguration [server=" + server + ", tracking=" + tracking + ", javascript=" + javascript + ", incomingRequestProcessor=" + incomingRequestProcessor + ", kafkaFlusher=" + kafkaFlusher + ", hdfsFlusher=" + hdfsFlusher + "]"; + return MoreObjects.toStringHelper(this) + .add("global", global) + .add("sources", sources) + .add("sinks", sinks) + .add("mappings", mappings) + .toString(); + } + + /* + * Validation support methods here. + * + * As bean validation uses expression language for rendering error messages, + * substitutions need to be available for some of these. EL doesn't allow for + * access to attributes, just getters/setters and methods. Hence, here are a + * number of methods that are used to render validation messages. These result + * of these methods can also be used for actual validation. 
+ */ + public Set missingSourcesSinks() { + final Set defined = new HashSet<>(); + defined.addAll(sources.keySet()); + defined.addAll(sinks.keySet()); + + final Set used = mappings + .values() + .stream() + .flatMap(mc -> Stream.concat( + mc.sources.stream(), + mc.sinks.stream())) + .collect(Collectors.toSet()); + + return Sets.difference(used, defined); + } + + public Set collidingSourceAndSinkNames() { + return Sets.intersection(sources.keySet(), sinks.keySet()); + } + + public Set sinksWithMultipleSchemas() { + final Map countsBySink = + mappings.values() + .stream() + .flatMap(config -> config.sinks.stream() + .map(sink -> Maps.immutableEntry(sink, config.schemaFile))) + .distinct() + .collect(Collectors.groupingBy(Map.Entry::getKey, Collectors.counting())); + return Maps.filterValues(countsBySink, count -> count > 1L).keySet(); } } diff --git a/src/main/java/io/divolte/server/config/DurationDeserializer.java b/src/main/java/io/divolte/server/config/DurationDeserializer.java index f0263066..6c2efabb 100644 --- a/src/main/java/io/divolte/server/config/DurationDeserializer.java +++ b/src/main/java/io/divolte/server/config/DurationDeserializer.java @@ -6,14 +6,14 @@ import java.time.Duration; import java.util.concurrent.TimeUnit; +import javax.annotation.ParametersAreNonnullByDefault; + import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.deser.std.StdScalarDeserializer; import com.typesafe.config.impl.ConfigImplUtil; -import javax.annotation.ParametersAreNonnullByDefault; - @ParametersAreNonnullByDefault public class DurationDeserializer extends StdScalarDeserializer { private static final long serialVersionUID = 1L; @@ -28,11 +28,21 @@ public Duration deserialize(final JsonParser p, if (VALUE_STRING != p.getCurrentToken()) { throw ctx.mappingException("Expected string value for Duration mapping."); } - return Duration.ofNanos(parseDuration(p.getText(), ctx)); + long result; + try { + result = parse(p.getText()); + } catch(final DurationFormatException e) { + throw new JsonMappingException(p, e.getMessage(), e); + } + return Duration.ofNanos(result); + } + + public static Duration parseDuration(final String input) { + return Duration.ofNanos(parse(input)); } // Inspired by Typesafe Config parseDuration(...) - private static long parseDuration(final String input, final DeserializationContext context) throws JsonMappingException { + private static long parse(final String input) { final String s = ConfigImplUtil.unicodeTrim(input); final String originalUnitString = getUnits(s); String unitString = originalUnitString; @@ -41,7 +51,8 @@ private static long parseDuration(final String input, final DeserializationConte // this would be caught later anyway, but the error message // is more helpful if we check it here. if (numberString.isEmpty()) { - throw context.mappingException(String.format("No number in duration value '%s'", input)); + final String msg = String.format("No number in duration value '%s'", input); + throw new DurationFormatException(msg); } // All units longer than 2 characters are accepted in singular or plural form. 
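The duration parser is now exposed as a public helper so the configuration defaults elsewhere in this change can reuse it outside of Jackson deserialization. A small sketch of the formats it accepts, using values taken from those defaults; the wrapper class and main method are illustrative only:

import java.time.Duration;
import io.divolte.server.config.DurationDeserializer;

final class DurationParsingSketch {
    public static void main(final String[] args) {
        // Long unit names are used by the defaults; short forms (ns, us, ms, s, m, h, d) are also accepted.
        final Duration sessionTimeout = DurationDeserializer.parseDuration("30 minutes");
        final Duration rollEvery = DurationDeserializer.parseDuration("1 hour");
        final Duration syncAfter = DurationDeserializer.parseDuration("30 seconds");
        System.out.println(sessionTimeout + ", " + rollEvery + ", " + syncAfter);
        // Malformed input now surfaces as an unchecked DurationFormatException rather than a
        // Jackson-specific mapping exception.
    }
}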
@@ -86,7 +97,8 @@ private static long parseDuration(final String input, final DeserializationConte units = TimeUnit.MINUTES; break; default: - throw context.mappingException(String.format("Could not parse time unit '%s' (try ns, us, ms, s, m, h, d)", originalUnitString)); + final String msg = String.format("Could not parse time unit '%s' (try ns, us, ms, s, m, h, d)", originalUnitString); + throw new DurationFormatException(msg); } try { @@ -96,14 +108,15 @@ private static long parseDuration(final String input, final DeserializationConte ? units.toNanos(Long.parseLong(numberString)) : (long) (Double.parseDouble(numberString) * units.toNanos(1)); } catch (final NumberFormatException e) { - throw context.mappingException(String.format("Could not parse duration number '%s'", numberString)); + final String msg = String.format("Could not parse duration number '%s'", numberString); + throw new DurationFormatException(msg); } } private static String getUnits(final String s) { int i = s.length() - 1; while (i >= 0) { - char c = s.charAt(i); + final char c = s.charAt(i); if (!Character.isLetter(c)) { break; } diff --git a/src/main/java/io/divolte/server/config/DurationFormatException.java b/src/main/java/io/divolte/server/config/DurationFormatException.java new file mode 100644 index 00000000..2c8a6770 --- /dev/null +++ b/src/main/java/io/divolte/server/config/DurationFormatException.java @@ -0,0 +1,9 @@ +package io.divolte.server.config; + +public class DurationFormatException extends RuntimeException { + private static final long serialVersionUID = 8475209646046838380L; + + public DurationFormatException(final String message) { + super(message); + } +} diff --git a/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java b/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java index b2accace..5733725d 100644 --- a/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java +++ b/src/main/java/io/divolte/server/config/FileStrategyConfiguration.java @@ -1,55 +1,59 @@ package io.divolte.server.config; -import com.fasterxml.jackson.annotation.JsonSubTypes; -import com.fasterxml.jackson.annotation.JsonSubTypes.Type; -import com.fasterxml.jackson.annotation.JsonTypeInfo; -import com.google.common.base.Preconditions; +import java.time.Duration; +import java.util.Optional; import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; -import java.util.Objects; +import javax.annotation.ParametersAreNullableByDefault; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.MoreObjects; -@JsonTypeInfo(use=JsonTypeInfo.Id.NAME, include=JsonTypeInfo.As.PROPERTY, property = "type") -@JsonSubTypes({ - @Type(value=SimpleRollingFileStrategyConfiguration.class, name = "SIMPLE_ROLLING_FILE"), - @Type(value=SessionBinningFileStrategyConfiguration.class, name = "SESSION_BINNING") -}) @ParametersAreNonnullByDefault -public abstract class FileStrategyConfiguration { - public final FileStrategyConfiguration.Types type; +public class FileStrategyConfiguration { + private static final String DEFAULT_SYNC_FILE_AFTER_RECORDS = "1000"; + private static final String DEFAULT_SYNC_FILE_AFTER_DURATION = "30 seconds"; + private static final String DEFAULT_WORKING_DIR = "/tmp"; + private static final String DEFAULT_PUBLISH_DIR = "/tmp"; + private static final String DEFAULT_ROLL_EVERY = "1 hour"; + + static final FileStrategyConfiguration DEFAULT_FILE_STRATEGY_CONFIGURATION = + new 
FileStrategyConfiguration( + DurationDeserializer.parseDuration(DEFAULT_ROLL_EVERY), + Integer.parseInt(DEFAULT_SYNC_FILE_AFTER_RECORDS), + DurationDeserializer.parseDuration(DEFAULT_SYNC_FILE_AFTER_DURATION), + DEFAULT_WORKING_DIR, + DEFAULT_PUBLISH_DIR); + public final int syncFileAfterRecords; public final Duration syncFileAfterDuration; public final String workingDir; public final String publishDir; - - protected FileStrategyConfiguration ( - final FileStrategyConfiguration.Types type, - final int syncFileAfterRecords, - final Duration syncFileAfterDuration, - final String workingDir, - final String publishDir) { - this.type = Objects.requireNonNull(type); - this.syncFileAfterRecords = Objects.requireNonNull(syncFileAfterRecords); - this.syncFileAfterDuration = Objects.requireNonNull(syncFileAfterDuration); - this.workingDir = Objects.requireNonNull(workingDir); - this.publishDir = Objects.requireNonNull(publishDir); - } - - @ParametersAreNonnullByDefault - public enum Types { - SIMPLE_ROLLING_FILE(SimpleRollingFileStrategyConfiguration.class), - SESSION_BINNING(SessionBinningFileStrategyConfiguration.class); - - public final Class clazz; - - Types(final Class clazz) { - this.clazz = Objects.requireNonNull(clazz); - } + public final Duration rollEvery; + + @JsonCreator + @ParametersAreNullableByDefault + FileStrategyConfiguration(@JsonProperty(defaultValue=DEFAULT_ROLL_EVERY) final Duration rollEvery, + @JsonProperty(defaultValue=DEFAULT_SYNC_FILE_AFTER_RECORDS) final Integer syncFileAfterRecords, + @JsonProperty(defaultValue=DEFAULT_SYNC_FILE_AFTER_DURATION) final Duration syncFileAfterDuration, + @JsonProperty(defaultValue=DEFAULT_WORKING_DIR) final String workingDir, + @JsonProperty(defaultValue=DEFAULT_PUBLISH_DIR) final String publishDir) { + // TODO: register a custom deserializer with Jackson that uses the defaultValue property from the annotation to fix this + this.rollEvery = Optional.ofNullable(rollEvery).orElseGet(() -> DurationDeserializer.parseDuration(DEFAULT_ROLL_EVERY)); + this.syncFileAfterRecords = Optional.ofNullable(syncFileAfterRecords).orElseGet(() -> Integer.valueOf(DEFAULT_SYNC_FILE_AFTER_RECORDS)); + this.syncFileAfterDuration = Optional.ofNullable(syncFileAfterDuration).orElseGet(() -> DurationDeserializer.parseDuration(DEFAULT_SYNC_FILE_AFTER_DURATION)); + this.workingDir = Optional.ofNullable(workingDir).orElse(DEFAULT_WORKING_DIR); + this.publishDir = Optional.ofNullable(publishDir).orElse(DEFAULT_PUBLISH_DIR); } - public T as(Class target) { - Preconditions.checkState(type.clazz.equals(target), - "Attempt to cast FileStrategyConfiguration to wrong type."); - return target.cast(this); + @Override + public final String toString() { + return MoreObjects.toStringHelper(this) + .add("rollEvery", rollEvery) + .add("syncFileAfterRecords", syncFileAfterRecords) + .add("syncFileAfterDuration", syncFileAfterDuration) + .add("workingDir", workingDir) + .add("publishDir", publishDir).toString(); } } diff --git a/src/main/java/io/divolte/server/config/GlobalConfiguration.java b/src/main/java/io/divolte/server/config/GlobalConfiguration.java new file mode 100644 index 00000000..7819f677 --- /dev/null +++ b/src/main/java/io/divolte/server/config/GlobalConfiguration.java @@ -0,0 +1,37 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; + +import javax.annotation.ParametersAreNonnullByDefault; +import javax.validation.Valid; +import java.util.Objects; + +@ParametersAreNonnullByDefault 
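With the file-strategy options now attached to each HDFS sink instead of a global flusher section, a sink can retrieve its own settings through the typed accessor on DivolteConfiguration. A small sketch of that lookup, assuming the default sink name "hdfs"; the wrapper class and printed message are illustrative only:

import io.divolte.server.config.FileStrategyConfiguration;
import io.divolte.server.config.HdfsSinkConfiguration;
import io.divolte.server.config.ValidatedConfiguration;

final class SinkConfigLookupSketch {
    static void inspect(final ValidatedConfiguration vc) {
        // Typed lookup: throws IllegalArgumentException for an unknown name or a non-HDFS sink.
        final HdfsSinkConfiguration hdfsSink =
                vc.configuration().getSinkConfiguration("hdfs", HdfsSinkConfiguration.class);
        final FileStrategyConfiguration strategy = hdfsSink.fileStrategy;
        System.out.printf("Rolling every %s, syncing after %d records, working dir %s%n",
                          strategy.rollEvery, strategy.syncFileAfterRecords, strategy.workingDir);
    }
}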
+public class GlobalConfiguration { + @Valid public final ServerConfiguration server; + @Valid public final MapperConfiguration mapper; + @Valid public final HdfsConfiguration hdfs; + @Valid public final KafkaConfiguration kafka; + + @JsonCreator + GlobalConfiguration(final ServerConfiguration server, + final MapperConfiguration mapper, + final HdfsConfiguration hdfs, + final KafkaConfiguration kafka) { + this.server = Objects.requireNonNull(server); + this.mapper = Objects.requireNonNull(mapper); + this.hdfs = Objects.requireNonNull(hdfs); + this.kafka = Objects.requireNonNull(kafka); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("server", server) + .add("mapper", mapper) + .add("hdfs", hdfs) + .add("kafka", kafka) + .toString(); + } +} diff --git a/src/main/java/io/divolte/server/config/HdfsConfiguration.java b/src/main/java/io/divolte/server/config/HdfsConfiguration.java index 666410f7..41cd9c4b 100644 --- a/src/main/java/io/divolte/server/config/HdfsConfiguration.java +++ b/src/main/java/io/divolte/server/config/HdfsConfiguration.java @@ -1,24 +1,27 @@ package io.divolte.server.config; -import com.fasterxml.jackson.annotation.JsonCreator; +import java.util.Optional; +import java.util.Properties; import javax.annotation.ParametersAreNonnullByDefault; -import java.util.Objects; -import java.util.Optional; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; @ParametersAreNonnullByDefault -public final class HdfsConfiguration { - public final Optional uri; - public final short replication; +public final class HdfsConfiguration extends SinkTypeConfiguration { + + public final Optional client; @JsonCreator - private HdfsConfiguration(final Optional uri, final short replication) { - this.uri = Objects.requireNonNull(uri); - this.replication = replication; + HdfsConfiguration(final boolean enabled, final int bufferSize, final int threads, final Optional client) { + super(bufferSize, threads, enabled); + this.client = client.map(ImmutableProperties::fromSource); } @Override - public String toString() { - return "HdfsConfiguration [uri=" + uri + ", replication=" + replication + "]"; + protected MoreObjects.ToStringHelper toStringHelper() { + return super.toStringHelper() + .add("client", client); } } diff --git a/src/main/java/io/divolte/server/config/HdfsFlusherConfiguration.java b/src/main/java/io/divolte/server/config/HdfsFlusherConfiguration.java deleted file mode 100644 index 5ae91756..00000000 --- a/src/main/java/io/divolte/server/config/HdfsFlusherConfiguration.java +++ /dev/null @@ -1,38 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; -import java.util.Objects; - -@ParametersAreNonnullByDefault -public final class HdfsFlusherConfiguration { - public final boolean enabled; - public final int threads; - public final int maxWriteQueue; - public final Duration maxEnqueueDelay; - public final HdfsConfiguration hdfs; - public final FileStrategyConfiguration fileStrategy; - - @JsonCreator - private HdfsFlusherConfiguration( - final boolean enabled, - final int threads, - final int maxWriteQueue, - final Duration maxEnqueueDelay, - final HdfsConfiguration hdfs, - final FileStrategyConfiguration fileStrategy) { - this.enabled = enabled; - this.threads = threads; - this.maxWriteQueue = maxWriteQueue; - this.maxEnqueueDelay = Objects.requireNonNull(maxEnqueueDelay); - 
this.hdfs = Objects.requireNonNull(hdfs); - this.fileStrategy = Objects.requireNonNull(fileStrategy); - } - - @Override - public String toString() { - return "HdfsFlusherConfiguration [enabled=" + enabled + ", threads=" + threads + ", maxWriteQueue=" + maxWriteQueue + ", maxEnqueueDelay=" + maxEnqueueDelay + ", hdfs=" + hdfs + ", fileStrategy=" + fileStrategy + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java b/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java new file mode 100644 index 00000000..bcc1e8cf --- /dev/null +++ b/src/main/java/io/divolte/server/config/HdfsSinkConfiguration.java @@ -0,0 +1,39 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.MoreObjects; +import io.divolte.server.hdfs.HdfsFlushingPool; + +import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.ParametersAreNullableByDefault; +import java.util.Optional; + +@ParametersAreNonnullByDefault +public class HdfsSinkConfiguration extends SinkConfiguration { + private static final String DEFAULT_REPLICATION = "3"; + + public final short replication; + public final FileStrategyConfiguration fileStrategy; + + @JsonCreator + @ParametersAreNullableByDefault + HdfsSinkConfiguration(@JsonProperty(defaultValue=DEFAULT_REPLICATION) final Short replication, + final FileStrategyConfiguration fileStrategy) { + // TODO: register a custom deserializer with Jackson that uses the defaultValue property from the annotation to fix this + this.replication = Optional.ofNullable(replication).orElseGet(() -> Short.valueOf(DEFAULT_REPLICATION)); + this.fileStrategy = Optional.ofNullable(fileStrategy).orElse(FileStrategyConfiguration.DEFAULT_FILE_STRATEGY_CONFIGURATION); + } + + @Override + protected MoreObjects.ToStringHelper toStringHelper() { + return super.toStringHelper() + .add("replication", replication) + .add("fileStrategy", fileStrategy); + } + + @Override + public SinkFactory getFactory() { + return HdfsFlushingPool::new; + } +} diff --git a/src/main/java/io/divolte/server/config/ImmutableProperties.java b/src/main/java/io/divolte/server/config/ImmutableProperties.java new file mode 100644 index 00000000..26c8a3bc --- /dev/null +++ b/src/main/java/io/divolte/server/config/ImmutableProperties.java @@ -0,0 +1,61 @@ +package io.divolte.server.config; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.Map; +import java.util.Properties; + +public class ImmutableProperties extends Properties { + private static final long serialVersionUID = 1333087762733134653L; + + public static ImmutableProperties fromSource(final Properties source) { + final ImmutableProperties result = new ImmutableProperties(); + source.forEach(result::set); + return result; + } + + private void set(final Object key, final Object value) { + super.put(key, value); + } + + @Override + public synchronized void load(final InputStream inStream) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized void load(final Reader reader) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized void loadFromXML(final InputStream in) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized Object setProperty(final String key, final String value) { + throw new 
UnsupportedOperationException(); + } + + @Override + public synchronized Object put(final Object key, final Object value) { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized void putAll(final Map t) { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized Object remove(final Object key) { + throw new UnsupportedOperationException(); + } + + @Override + public synchronized void clear() { + throw new UnsupportedOperationException(); + } +} diff --git a/src/main/java/io/divolte/server/config/IncomingRequestProcessorConfiguration.java b/src/main/java/io/divolte/server/config/IncomingRequestProcessorConfiguration.java deleted file mode 100644 index 2866fae4..00000000 --- a/src/main/java/io/divolte/server/config/IncomingRequestProcessorConfiguration.java +++ /dev/null @@ -1,38 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; -import java.util.Objects; - -@ParametersAreNonnullByDefault -public final class IncomingRequestProcessorConfiguration { - public final int threads; - public final int maxWriteQueue; - public final Duration maxEnqueueDelay; - public final boolean discardCorrupted; - public final int duplicateMemorySize; - public final boolean discardDuplicates; - - @JsonCreator - private IncomingRequestProcessorConfiguration( - final int threads, - final int maxWriteQueue, - final Duration maxEnqueueDelay, - final boolean discardCorrupted, - final int duplicateMemorySize, - final boolean discardDuplicates) { - this.threads = threads; - this.maxWriteQueue = maxWriteQueue; - this.maxEnqueueDelay = Objects.requireNonNull(maxEnqueueDelay); - this.discardCorrupted = discardCorrupted; - this.duplicateMemorySize = duplicateMemorySize; - this.discardDuplicates = discardDuplicates; - } - - @Override - public String toString() { - return "IncomingRequestProcessorConfiguration [threads=" + threads + ", maxWriteQueue=" + maxWriteQueue + ", maxEnqueueDelay=" + maxEnqueueDelay + ", discardCorrupted=" + discardCorrupted + ", duplicateMemorySize=" + duplicateMemorySize + ", discardDuplicates=" + discardDuplicates + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/JavascriptConfiguration.java b/src/main/java/io/divolte/server/config/JavascriptConfiguration.java index f8f7648c..ecdf89f1 100644 --- a/src/main/java/io/divolte/server/config/JavascriptConfiguration.java +++ b/src/main/java/io/divolte/server/config/JavascriptConfiguration.java @@ -1,15 +1,31 @@ package io.divolte.server.config; -import com.fasterxml.jackson.annotation.JsonCreator; -import org.hibernate.validator.constraints.NotEmpty; - import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.ParametersAreNullableByDefault; import javax.validation.constraints.NotNull; import javax.validation.constraints.Pattern; -import java.util.Objects; + +import org.hibernate.validator.constraints.NotEmpty; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.MoreObjects; + +import java.util.Optional; @ParametersAreNonnullByDefault public final class JavascriptConfiguration { + private static final String DEFAULT_NAME = "divolte.js"; + private static final String DEFAULT_LOGGING = "false"; + private static final String DEFAULT_DEBUG = "false"; + private static final String DEFAULT_AUTO_PAGE_VIEW_EVENT = "true"; + + static final 
JavascriptConfiguration DEFAULT_JAVASCRIPT_CONFIGURATION = + new JavascriptConfiguration(DEFAULT_NAME, + Boolean.parseBoolean(DEFAULT_LOGGING), + Boolean.parseBoolean(DEFAULT_DEBUG), + Boolean.parseBoolean(DEFAULT_AUTO_PAGE_VIEW_EVENT)); + @NotNull @NotEmpty @Pattern(regexp="^[A-Za-z0-9_-]+\\.js$") public final String name; @@ -18,19 +34,25 @@ public final class JavascriptConfiguration { public final boolean autoPageViewEvent; @JsonCreator - private JavascriptConfiguration( - final String name, - final boolean logging, - final boolean debug, - final boolean autoPageViewEvent) { - this.name = Objects.requireNonNull(name); - this.logging = logging; - this.debug = debug; - this.autoPageViewEvent = autoPageViewEvent; + @ParametersAreNullableByDefault + JavascriptConfiguration(@JsonProperty(defaultValue=DEFAULT_NAME) final String name, + @JsonProperty(defaultValue=DEFAULT_LOGGING) final Boolean logging, + @JsonProperty(defaultValue=DEFAULT_DEBUG) final Boolean debug, + @JsonProperty(defaultValue=DEFAULT_AUTO_PAGE_VIEW_EVENT) final Boolean autoPageViewEvent) { + // TODO: register a custom deserializer with Jackson that uses the defaultValue property from the annotation to fix this + this.name = Optional.ofNullable(name).orElse(DEFAULT_NAME); + this.logging = Optional.ofNullable(logging).orElseGet(() -> Boolean.valueOf(DEFAULT_LOGGING)); + this.debug = Optional.ofNullable(debug).orElseGet(() -> Boolean.valueOf(DEFAULT_DEBUG)); + this.autoPageViewEvent = Optional.ofNullable(autoPageViewEvent).orElseGet(() -> Boolean.valueOf(DEFAULT_AUTO_PAGE_VIEW_EVENT)); } @Override public String toString() { - return "JavascriptConfiguration [name=" + name + ", logging=" + logging + ", debug=" + debug + ", autoPageViewEvent=" + autoPageViewEvent + "]"; + return MoreObjects.toStringHelper(this) + .add("name", name) + .add("logging", logging) + .add("debug", debug) + .add("autoPageViewEvent", autoPageViewEvent) + .toString(); } } diff --git a/src/main/java/io/divolte/server/config/KafkaConfiguration.java b/src/main/java/io/divolte/server/config/KafkaConfiguration.java new file mode 100644 index 00000000..9f3302c9 --- /dev/null +++ b/src/main/java/io/divolte/server/config/KafkaConfiguration.java @@ -0,0 +1,26 @@ +package io.divolte.server.config; + +import java.util.Properties; + +import javax.annotation.ParametersAreNonnullByDefault; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; + +@ParametersAreNonnullByDefault +public class KafkaConfiguration extends SinkTypeConfiguration { + + public final Properties producer; + + @JsonCreator + KafkaConfiguration(final int bufferSize, final int threads, final boolean enabled, final Properties producer) { + super(bufferSize, threads, enabled); + this.producer = ImmutableProperties.fromSource(producer); + } + + @Override + protected MoreObjects.ToStringHelper toStringHelper() { + return super.toStringHelper() + .add("producer", producer); + } +} diff --git a/src/main/java/io/divolte/server/config/KafkaFlusherConfiguration.java b/src/main/java/io/divolte/server/config/KafkaFlusherConfiguration.java deleted file mode 100644 index e476a630..00000000 --- a/src/main/java/io/divolte/server/config/KafkaFlusherConfiguration.java +++ /dev/null @@ -1,39 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; -import java.util.Objects; -import java.util.Properties; - -@ParametersAreNonnullByDefault -public 
final class KafkaFlusherConfiguration { - public final boolean enabled; - public final int threads; - public final int maxWriteQueue; - public final Duration maxEnqueueDelay; - public final String topic; - public final Properties producer; - - @JsonCreator - private KafkaFlusherConfiguration( - final boolean enabled, - final int threads, - final int maxWriteQueue, - final Duration maxEnqueueDelay, - final String topic, - final Properties producer) { - this.enabled = enabled; - this.threads = threads; - this.maxWriteQueue = maxWriteQueue; - this.maxEnqueueDelay = Objects.requireNonNull(maxEnqueueDelay); - this.topic = Objects.requireNonNull(topic); - this.producer = Objects.requireNonNull(producer); - } - - @Override - public String toString() { - return "KafkaFlusherConfiguration [enabled=" + enabled + ", threads=" + threads + ", maxWriteQueue=" + maxWriteQueue + ", maxEnqueueDelay=" + maxEnqueueDelay + ", topic=" + topic + ", producer=" + producer + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java b/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java new file mode 100644 index 00000000..04f45c8d --- /dev/null +++ b/src/main/java/io/divolte/server/config/KafkaSinkConfiguration.java @@ -0,0 +1,34 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.MoreObjects; +import io.divolte.server.kafka.KafkaFlushingPool; + +import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.ParametersAreNullableByDefault; +import java.util.Optional; + +@ParametersAreNonnullByDefault +public class KafkaSinkConfiguration extends SinkConfiguration { + private static final String DEFAULT_TOPIC = "divolte"; + + public final String topic; + + @JsonCreator + @ParametersAreNullableByDefault + KafkaSinkConfiguration(@JsonProperty(defaultValue=DEFAULT_TOPIC) final String topic) { + // TODO: register a custom deserializer with Jackson that uses the defaultValue property from the annotation to fix this + this.topic = Optional.ofNullable(topic).orElse(DEFAULT_TOPIC); + } + + @Override + protected MoreObjects.ToStringHelper toStringHelper() { + return super.toStringHelper().add("topic", topic); + } + + @Override + public SinkFactory getFactory() { + return KafkaFlushingPool::new; + } +} diff --git a/src/main/java/io/divolte/server/config/MapperConfiguration.java b/src/main/java/io/divolte/server/config/MapperConfiguration.java new file mode 100644 index 00000000..db8f9305 --- /dev/null +++ b/src/main/java/io/divolte/server/config/MapperConfiguration.java @@ -0,0 +1,41 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.google.common.base.MoreObjects; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.util.Objects; +import java.util.Optional; + +@ParametersAreNonnullByDefault +public class MapperConfiguration { + public final int bufferSize; + public final int threads; + public final int duplicateMemorySize; + public final UserAgentParserConfiguration userAgentParser; + public final Optional ip2geoDatabase; + + @JsonCreator + MapperConfiguration(final int bufferSize, + final int threads, + final int duplicateMemorySize, + final UserAgentParserConfiguration userAgentParser, + final Optional ip2geoDatabase) { + this.bufferSize = bufferSize; + this.threads = threads; + this.duplicateMemorySize = duplicateMemorySize; + this.userAgentParser = 
Objects.requireNonNull(userAgentParser); + this.ip2geoDatabase = Objects.requireNonNull(ip2geoDatabase); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("bufferSize", bufferSize) + .add("threads", threads) + .add("duplicateMemorySize", duplicateMemorySize) + .add("userAgentParser", userAgentParser) + .add("ip2geoDatabase", ip2geoDatabase) + .toString(); + } +} diff --git a/src/main/java/io/divolte/server/config/MappingConfiguration.java b/src/main/java/io/divolte/server/config/MappingConfiguration.java new file mode 100644 index 00000000..e92c09af --- /dev/null +++ b/src/main/java/io/divolte/server/config/MappingConfiguration.java @@ -0,0 +1,60 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.MoreObjects; +import com.google.common.collect.ImmutableSet; + +import javax.annotation.Nullable; +import javax.annotation.ParametersAreNonnullByDefault; +import java.util.Objects; +import java.util.Optional; + +@ParametersAreNonnullByDefault +public class MappingConfiguration { + private static final String DEFAULT_DISCARD_CORRUPTED = "false"; + private static final String DEFAULT_DISCARD_DUPLICATES = "false"; + + public final Optional schemaFile; + public final Optional mappingScriptFile; + + public final ImmutableSet sources; + public final ImmutableSet sinks; + + public final boolean discardCorrupted; + public final boolean discardDuplicates; + + @JsonCreator + MappingConfiguration(final Optional schemaFile, + final Optional mappingScriptFile, + @JsonProperty(required = true) + final ImmutableSet sources, + @JsonProperty(required = true) + final ImmutableSet sinks, + @JsonProperty(defaultValue=DEFAULT_DISCARD_CORRUPTED) + @Nullable + final Boolean discardCorrupted, + @JsonProperty(defaultValue=DEFAULT_DISCARD_DUPLICATES) + @Nullable + final Boolean discardDuplicates) { + this.schemaFile = Objects.requireNonNull(schemaFile); + this.mappingScriptFile = Objects.requireNonNull(mappingScriptFile); + this.sources = Objects.requireNonNull(sources); + this.sinks = Objects.requireNonNull(sinks); + // TODO: register a custom deserializer with Jackson that uses the defaultValue property from the annotation to fix this + this.discardCorrupted = Optional.ofNullable(discardCorrupted).orElseGet(() -> Boolean.valueOf(DEFAULT_DISCARD_CORRUPTED)); + this.discardDuplicates = Optional.ofNullable(discardDuplicates).orElseGet(() -> Boolean.valueOf(DEFAULT_DISCARD_DUPLICATES)); + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("schemaFile", schemaFile) + .add("mappingScriptFile", mappingScriptFile) + .add("sources", sources) + .add("sinks", sinks) + .add("discardCorrupted", discardCorrupted) + .add("discardDuplicates", discardDuplicates) + .toString(); + } +} diff --git a/src/main/java/io/divolte/server/config/PropertiesDeserializer.java b/src/main/java/io/divolte/server/config/PropertiesDeserializer.java index 15aba723..7ed89e16 100644 --- a/src/main/java/io/divolte/server/config/PropertiesDeserializer.java +++ b/src/main/java/io/divolte/server/config/PropertiesDeserializer.java @@ -9,7 +9,6 @@ import java.util.Properties; import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonDeserializer; @@ -23,7 
+22,7 @@ public class PropertiesDeserializer extends JsonDeserializer { private final static Joiner COMMA_JOINER = Joiner.on(','); @Override - public Properties deserialize(JsonParser p, DeserializationContext ctx) throws IOException, JsonProcessingException { + public Properties deserialize(JsonParser p, DeserializationContext ctx) throws IOException { if (START_OBJECT == p.getCurrentToken()) { final Properties properties = new Properties(); final Deque stack = new ArrayDeque<>(); diff --git a/src/main/java/io/divolte/server/config/SchemaMappingConfiguration.java b/src/main/java/io/divolte/server/config/SchemaMappingConfiguration.java deleted file mode 100644 index 1b4f2fc9..00000000 --- a/src/main/java/io/divolte/server/config/SchemaMappingConfiguration.java +++ /dev/null @@ -1,23 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.util.Objects; - -@ParametersAreNonnullByDefault -public final class SchemaMappingConfiguration { - public final int version; - public final String mappingScriptFile; - - @JsonCreator - private SchemaMappingConfiguration(final int version, final String mappingScriptFile) { - this.version = version; - this.mappingScriptFile = Objects.requireNonNull(mappingScriptFile); - } - - @Override - public String toString() { - return "SchemaMappingConfiguration [version=" + version + ", mappingScriptFile=" + mappingScriptFile + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/ServerConfiguration.java b/src/main/java/io/divolte/server/config/ServerConfiguration.java index ad6c00fc..d1b29ca3 100644 --- a/src/main/java/io/divolte/server/config/ServerConfiguration.java +++ b/src/main/java/io/divolte/server/config/ServerConfiguration.java @@ -2,6 +2,7 @@ import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.base.MoreObjects; import javax.annotation.ParametersAreNonnullByDefault; import java.util.Objects; @@ -16,18 +17,23 @@ public final class ServerConfiguration { public final boolean serveStaticResources; @JsonCreator - private ServerConfiguration(final Optional host, - final int port, - @JsonProperty("use_x_forwarded_for") final boolean useXForwardedFor, - final boolean serveStaticResources) { + ServerConfiguration(final Optional host, + final int port, + @JsonProperty("use_x_forwarded_for") final boolean useXForwardedFor, + final boolean serveStaticResources) { this.host = Objects.requireNonNull(host); this.port = port; this.useXForwardedFor = useXForwardedFor; - this.serveStaticResources = Objects.requireNonNull(serveStaticResources, "Cannot be null."); + this.serveStaticResources = serveStaticResources; } @Override public String toString() { - return "ServerConfiguration [host=" + host + ", port=" + port + ", useXForwardedFor=" + useXForwardedFor + ", serveStaticResources=" + serveStaticResources + "]"; + return MoreObjects.toStringHelper(this) + .add("host", host) + .add("port", port) + .add("useXForwardedFor", useXForwardedFor) + .add("serverStaticResources", serveStaticResources) + .toString(); } } diff --git a/src/main/java/io/divolte/server/config/SessionBinningFileStrategyConfiguration.java b/src/main/java/io/divolte/server/config/SessionBinningFileStrategyConfiguration.java deleted file mode 100644 index 87e2ae8e..00000000 --- a/src/main/java/io/divolte/server/config/SessionBinningFileStrategyConfiguration.java +++ /dev/null @@ -1,37 +0,0 @@ -package 
io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; - -@ParametersAreNonnullByDefault -public final class SessionBinningFileStrategyConfiguration extends FileStrategyConfiguration { - @JsonCreator - private SessionBinningFileStrategyConfiguration( - final int syncFileAfterRecords, - final Duration syncFileAfterDuration, - final String workingDir, - final String publishDir, - /* - * Nasty hack here! We need to have a roll_every property on this object - * in order to support the default configuration without breaking when - * overriding to the session binning strategy vs. the file binning one. - * - * This will be fixed when we either drop support for session binning - * or we'll move to a new config setup with separation in sources, mappings - * and sinks, where there is no default setup anymore. - * - * This makes it valid configuration to declare roll_every on a configuration - * for session binning flushing, although it has no effect. - */ - @SuppressWarnings("unused") - final Duration rollEvery) { - super(Types.SESSION_BINNING, syncFileAfterRecords, syncFileAfterDuration, workingDir, publishDir); - } - - @Override - public String toString() { - return "SessionBinningFileStrategyConfiguration [type=" + type + ", syncFileAfterRecords=" + syncFileAfterRecords + ", syncFileAfterDuration=" + syncFileAfterDuration + ", workingDir=" + workingDir + ", publishDir=" + publishDir + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/SimpleRollingFileStrategyConfiguration.java b/src/main/java/io/divolte/server/config/SimpleRollingFileStrategyConfiguration.java deleted file mode 100644 index a942eac4..00000000 --- a/src/main/java/io/divolte/server/config/SimpleRollingFileStrategyConfiguration.java +++ /dev/null @@ -1,28 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; -import java.util.Objects; - -@ParametersAreNonnullByDefault -public final class SimpleRollingFileStrategyConfiguration extends FileStrategyConfiguration { - public final Duration rollEvery; - - @JsonCreator - private SimpleRollingFileStrategyConfiguration( - final Duration rollEvery, - final int syncFileAfterRecords, - final Duration syncFileAfterDuration, - final String workingDir, - final String publishDir) { - super(Types.SIMPLE_ROLLING_FILE, syncFileAfterRecords, syncFileAfterDuration, workingDir, publishDir); - this.rollEvery = Objects.requireNonNull(rollEvery); - } - - @Override - public String toString() { - return "SimpleRollingFileStrategyConfiguration [rollEvery=" + rollEvery + ", type=" + type + ", syncFileAfterRecords=" + syncFileAfterRecords + ", syncFileAfterDuration=" + syncFileAfterDuration + ", workingDir=" + workingDir + ", publishDir=" + publishDir + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/SinkConfiguration.java b/src/main/java/io/divolte/server/config/SinkConfiguration.java new file mode 100644 index 00000000..4019bb9d --- /dev/null +++ b/src/main/java/io/divolte/server/config/SinkConfiguration.java @@ -0,0 +1,40 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import com.google.common.base.MoreObjects; +import io.divolte.server.AvroRecordBuffer; +import 
io.divolte.server.SchemaRegistry; +import io.divolte.server.processing.ProcessingPool; + +import javax.annotation.OverridingMethodsMustInvokeSuper; +import javax.annotation.ParametersAreNonnullByDefault; + +@JsonTypeInfo(use=JsonTypeInfo.Id.NAME, include=JsonTypeInfo.As.PROPERTY, property = "type") +@JsonSubTypes({ + @JsonSubTypes.Type(value=HdfsSinkConfiguration.class, name = "hdfs"), + @JsonSubTypes.Type(value=KafkaSinkConfiguration.class, name = "kafka"), +}) +@ParametersAreNonnullByDefault +public abstract class SinkConfiguration { + @OverridingMethodsMustInvokeSuper + protected MoreObjects.ToStringHelper toStringHelper() { + return MoreObjects.toStringHelper(this); + } + + @Override + public final String toString() { + return toStringHelper().toString(); + } + + @JsonIgnore + public abstract SinkFactory getFactory(); + + @FunctionalInterface + public interface SinkFactory { + ProcessingPool create(ValidatedConfiguration configuration, + String sinkName, + SchemaRegistry schemaRegistry); + } +} diff --git a/src/main/java/io/divolte/server/config/SinkTypeConfiguration.java b/src/main/java/io/divolte/server/config/SinkTypeConfiguration.java new file mode 100644 index 00000000..b8d52fe0 --- /dev/null +++ b/src/main/java/io/divolte/server/config/SinkTypeConfiguration.java @@ -0,0 +1,33 @@ +package io.divolte.server.config; + +import com.google.common.base.MoreObjects; + +import javax.annotation.OverridingMethodsMustInvokeSuper; +import javax.annotation.ParametersAreNonnullByDefault; + +@ParametersAreNonnullByDefault +public abstract class SinkTypeConfiguration { + + public final boolean enabled; + public final int bufferSize; + public final int threads; + + protected SinkTypeConfiguration(final int bufferSize, final int threads, final boolean enabled) { + this.bufferSize = bufferSize; + this.threads = threads; + this.enabled = enabled; + } + + @OverridingMethodsMustInvokeSuper + protected MoreObjects.ToStringHelper toStringHelper() { + return MoreObjects.toStringHelper(this) + .add("enabled", enabled) + .add("bufferSize", bufferSize) + .add("threads", threads); + } + + @Override + public final String toString() { + return toStringHelper().toString(); + } +} diff --git a/src/main/java/io/divolte/server/config/SourceConfiguration.java b/src/main/java/io/divolte/server/config/SourceConfiguration.java new file mode 100644 index 00000000..a6894200 --- /dev/null +++ b/src/main/java/io/divolte/server/config/SourceConfiguration.java @@ -0,0 +1,25 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonSubTypes; +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import com.google.common.base.MoreObjects; + +import javax.annotation.OverridingMethodsMustInvokeSuper; +import javax.annotation.ParametersAreNonnullByDefault; + +@JsonTypeInfo(use=JsonTypeInfo.Id.NAME, include=JsonTypeInfo.As.PROPERTY, property = "type") +@JsonSubTypes({ + @JsonSubTypes.Type(value=BrowserSourceConfiguration.class, name = "browser"), +}) +@ParametersAreNonnullByDefault +public abstract class SourceConfiguration { + @OverridingMethodsMustInvokeSuper + protected MoreObjects.ToStringHelper toStringHelper() { + return MoreObjects.toStringHelper(this); + } + + @Override + public final String toString() { + return toStringHelper().toString(); + } +} diff --git a/src/main/java/io/divolte/server/config/TrackingConfiguration.java b/src/main/java/io/divolte/server/config/TrackingConfiguration.java deleted file mode 100644 index b390fe22..00000000 --- 
a/src/main/java/io/divolte/server/config/TrackingConfiguration.java +++ /dev/null @@ -1,48 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.time.Duration; -import java.util.Objects; -import java.util.Optional; - -@ParametersAreNonnullByDefault -public final class TrackingConfiguration { - public final String partyCookie; - public final Duration partyTimeout; - public final String sessionCookie; - public final Duration sessionTimeout; - public final Optional cookieDomain; - public final UaParserConfiguration uaParser; - public final Optional ip2geoDatabase; - public final Optional schemaFile; - public final Optional schemaMapping; - - @JsonCreator - private TrackingConfiguration( - final String partyCookie, - final Duration partyTimeout, - final String sessionCookie, - final Duration sessionTimeout, - final Optional cookieDomain, - final UaParserConfiguration uaParser, - final Optional ip2geoDatabase, - final Optional schemaFile, - final Optional schemaMapping) { - this.partyCookie = Objects.requireNonNull(partyCookie); - this.partyTimeout = Objects.requireNonNull(partyTimeout); - this.sessionCookie = Objects.requireNonNull(sessionCookie); - this.sessionTimeout = Objects.requireNonNull(sessionTimeout); - this.cookieDomain = Objects.requireNonNull(cookieDomain); - this.uaParser = Objects.requireNonNull(uaParser); - this.ip2geoDatabase = Objects.requireNonNull(ip2geoDatabase); - this.schemaFile = Objects.requireNonNull(schemaFile); - this.schemaMapping = Objects.requireNonNull(schemaMapping); - } - - @Override - public String toString() { - return "TrackingConfiguration [partyCookie=" + partyCookie + ", partyTimeout=" + partyTimeout + ", sessionCookie=" + sessionCookie + ", sessionTimeout=" + sessionTimeout + ", cookieDomain=" + cookieDomain + ", uaParser=" + uaParser + ", ip2geoDatabase=" + ip2geoDatabase + ", schemaFile=" + schemaFile + ", schemaMapping=" + schemaMapping + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/UaParserConfiguration.java b/src/main/java/io/divolte/server/config/UaParserConfiguration.java deleted file mode 100644 index ddf6af3b..00000000 --- a/src/main/java/io/divolte/server/config/UaParserConfiguration.java +++ /dev/null @@ -1,23 +0,0 @@ -package io.divolte.server.config; - -import com.fasterxml.jackson.annotation.JsonCreator; - -import javax.annotation.ParametersAreNonnullByDefault; -import java.util.Objects; - -@ParametersAreNonnullByDefault -public final class UaParserConfiguration { - public final String type; - public final int cacheSize; - - @JsonCreator - private UaParserConfiguration(final String type, final int cacheSize) { - this.type = Objects.requireNonNull(type); - this.cacheSize = cacheSize; - } - - @Override - public String toString() { - return "UaParserConfiguration [type=" + type + ", cacheSize=" + cacheSize + "]"; - } -} diff --git a/src/main/java/io/divolte/server/config/UserAgentParserConfiguration.java b/src/main/java/io/divolte/server/config/UserAgentParserConfiguration.java new file mode 100644 index 00000000..ba2e05a1 --- /dev/null +++ b/src/main/java/io/divolte/server/config/UserAgentParserConfiguration.java @@ -0,0 +1,37 @@ +package io.divolte.server.config; + +import com.fasterxml.jackson.annotation.JsonCreator; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.util.Locale; +import java.util.Objects; + +@ParametersAreNonnullByDefault +public final class UserAgentParserConfiguration 
{ + public final ParserType type; + public final int cacheSize; + + @JsonCreator + UserAgentParserConfiguration(final ParserType type, final int cacheSize) { + this.type = Objects.requireNonNull(type); + this.cacheSize = cacheSize; + } + + @Override + public String toString() { + return "UserAgentParserConfiguration [type=" + type + ", cacheSize=" + cacheSize + "]"; + } + + @ParametersAreNonnullByDefault + public enum ParserType { + NON_UPDATING, + ONLINE_UPDATING, + CACHING_AND_UPDATING; + + // Ensure that enumeration names are case-insensitive when parsing JSON. + @JsonCreator + static ParserType fromJson(final String value) { + return ParserType.valueOf(value.toUpperCase(Locale.ROOT)); + } + } +} diff --git a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java index 7d2d8deb..dcf9d769 100644 --- a/src/main/java/io/divolte/server/config/ValidatedConfiguration.java +++ b/src/main/java/io/divolte/server/config/ValidatedConfiguration.java @@ -16,33 +16,41 @@ package io.divolte.server.config; +import java.io.IOException; +import java.time.Duration; +import java.util.List; +import java.util.Optional; +import java.util.Properties; +import java.util.Set; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +import javax.annotation.ParametersAreNonnullByDefault; +import javax.validation.ConstraintViolation; +import javax.validation.Validation; +import javax.validation.Validator; + +import org.hibernate.validator.HibernateValidator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.core.JsonLocation; import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.JsonMappingException; +import com.fasterxml.jackson.databind.JsonMappingException.Reference; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.PropertyNamingStrategy; +import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException; import com.fasterxml.jackson.databind.module.SimpleModule; +import com.fasterxml.jackson.datatype.guava.GuavaModule; import com.fasterxml.jackson.datatype.jdk8.Jdk8Module; import com.fasterxml.jackson.module.paramnames.ParameterNamesModule; +import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; import com.jasonclawson.jackson.dataformat.hocon.HoconTreeTraversingParser; import com.typesafe.config.Config; import com.typesafe.config.ConfigException; -import org.hibernate.validator.HibernateValidator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; -import javax.annotation.ParametersAreNonnullByDefault; -import javax.validation.ConstraintViolation; -import javax.validation.Validation; -import javax.validation.Validator; -import java.io.IOException; -import java.time.Duration; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; -import java.util.Set; -import java.util.function.Supplier; /** * Container for a validated configuration loaded from a {@code Config} @@ -59,9 +67,10 @@ public final class ValidatedConfiguration { private final static Logger logger = LoggerFactory.getLogger(ValidatedConfiguration.class); - private final List configurationErrors; - @Nullable - private final DivolteConfiguration divolteConfiguration; + private final static Joiner DOT_JOINER = Joiner.on('.'); + + private final ImmutableList configurationErrors; 
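A minimal sketch of the @JsonCreator technique used above to make the ParserType values case-insensitive when mapped from the configuration; the wrapper class and main method here are illustrative only and are not part of the patch:

    import java.util.Locale;

    import com.fasterxml.jackson.annotation.JsonCreator;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public final class CaseInsensitiveEnumSketch {
        enum ParserType {
            NON_UPDATING, ONLINE_UPDATING, CACHING_AND_UPDATING;

            // Without a factory like this, Jackson only accepts the exact constant names.
            @JsonCreator
            static ParserType fromJson(final String value) {
                return valueOf(value.toUpperCase(Locale.ROOT));
            }
        }

        public static void main(final String[] args) throws Exception {
            final ObjectMapper mapper = new ObjectMapper();
            // Both spellings resolve to the same constant.
            System.out.println(mapper.readValue("\"non_updating\"", ParserType.class));
            System.out.println(mapper.readValue("\"NON_UPDATING\"", ParserType.class));
        }
    }
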
+ private final Optional divolteConfiguration; /** * Creates an instance of a validated configuration. The underlying @@ -73,7 +82,7 @@ public final class ValidatedConfiguration { * Supplier of the underlying {@code Config} instance. */ public ValidatedConfiguration(final Supplier configLoader) { - final List configurationErrors = new ArrayList<>(); + final ImmutableList.Builder configurationErrors = ImmutableList.builder(); DivolteConfiguration divolteConfiguration; try { @@ -83,22 +92,57 @@ public ValidatedConfiguration(final Supplier configLoader) { * errors to the resulting list of error messages. */ final Config config = configLoader.get(); - divolteConfiguration = mapped(config.getConfig("divolte")); - validate(configurationErrors, divolteConfiguration); + divolteConfiguration = mapped(config.getConfig("divolte").resolve()); + configurationErrors.addAll(validate(divolteConfiguration)); } catch(final ConfigException e) { logger.debug("Configuration error caught during validation.", e); configurationErrors.add(e.getMessage()); divolteConfiguration = null; + } catch (final UnrecognizedPropertyException e) { + // Add a special case for unknown property as we add the list of available properties to the message. + logger.debug("Configuration error. Exception while mapping.", e); + final String message = messageForUnrecognizedPropertyException(e); + configurationErrors.add(message); + divolteConfiguration = null; + } catch (final JsonMappingException e) { + logger.debug("Configuration error. Exception while mapping.", e); + final String message = messageForMappingException(e); + configurationErrors.add(message); + divolteConfiguration = null; } catch (final IOException e) { logger.error("Error while reading configuration!", e); - throw new RuntimeException(e); + throw new RuntimeException("Error while reading configuration.", e); } - this.configurationErrors = ImmutableList.copyOf(configurationErrors); - this.divolteConfiguration = divolteConfiguration; + this.configurationErrors = configurationErrors.build(); + this.divolteConfiguration = Optional.ofNullable(divolteConfiguration); + } + + private String messageForMappingException(final JsonMappingException e) { + final String pathToError = e.getPath().stream() + .map(Reference::getFieldName) + .collect(Collectors.joining(".")); + return String.format( + "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'", + e.getOriginalMessage(), + Optional.ofNullable(e.getLocation()).map(JsonLocation::getSourceRef).orElse(""), + "".equals(pathToError) ? 
"" : pathToError); + } + + private static String messageForUnrecognizedPropertyException(final UnrecognizedPropertyException e) { + return String.format( + "%s.%n\tLocation: %s.%n\tConfiguration path to error: '%s'%n\tAvailable properties: %s.", + e.getOriginalMessage(), + e.getLocation().getSourceRef(), + e.getPath().stream() + .map(Reference::getFieldName) + .collect(Collectors.joining(".")), + e.getKnownPropertyIds().stream() + .map(Object::toString).map(s -> "'" + s + "'") + .collect(Collectors.joining(", "))); } - private void validate(final List configurationErrors, final DivolteConfiguration divolteConfiguration) { + private List validate(final DivolteConfiguration divolteConfiguration) { final Validator validator = Validation .byProvider(HibernateValidator.class) .configure() @@ -107,9 +151,15 @@ private void validate(final List configurationErrors, final DivolteConfi final Set> validationErrors = validator.validate(divolteConfiguration); - validationErrors.forEach((e) -> configurationErrors.add( - String.format("Property 'divolte.%s' %s. Found: '%s'.", e.getPropertyPath(), e.getMessage(), e.getInvalidValue()) - )); + return validationErrors + .stream() + .map( + (e) -> String.format( + "Property '%s' %s. Found: '%s'.", + DOT_JOINER.join("divolte", e.getPropertyPath()), + e.getMessage(), + e.getInvalidValue())) + .collect(Collectors.toList()); } private static DivolteConfiguration mapped(final Config input) throws IOException { @@ -123,12 +173,13 @@ private static DivolteConfiguration mapped(final Config input) throws IOExceptio mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true); // Deserialization for Duration - final SimpleModule module= new SimpleModule("Configuration Deserializers"); + final SimpleModule module = new SimpleModule("Configuration Deserializers"); module.addDeserializer(Duration.class, new DurationDeserializer()); module.addDeserializer(Properties.class, new PropertiesDeserializer()); mapper.registerModules( new Jdk8Module(), // JDK8 types (Optional, etc.) + new GuavaModule(), // Guava types (immutable collections) new ParameterNamesModule(), // Support JDK8 parameter name discovery module // Register custom deserializers module ); @@ -146,9 +197,9 @@ private static DivolteConfiguration mapped(final Config input) throws IOExceptio * When validation errors exist. 
*/ public DivolteConfiguration configuration() { - Preconditions.checkState(null != divolteConfiguration && configurationErrors.isEmpty(), + Preconditions.checkState(configurationErrors.isEmpty(), "Attempt to access invalid configuration."); - return divolteConfiguration; + return divolteConfiguration.orElseThrow(() -> new IllegalStateException("Configuration not available.")); } /** diff --git a/src/main/java/io/divolte/server/config/constraint/MappingSourceSinkReferencesMustExist.java b/src/main/java/io/divolte/server/config/constraint/MappingSourceSinkReferencesMustExist.java new file mode 100644 index 00000000..4a96cb0c --- /dev/null +++ b/src/main/java/io/divolte/server/config/constraint/MappingSourceSinkReferencesMustExist.java @@ -0,0 +1,36 @@ +package io.divolte.server.config.constraint; + +import static java.lang.annotation.ElementType.*; +import static java.lang.annotation.RetentionPolicy.*; + +import java.lang.annotation.Documented; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import javax.validation.Constraint; +import javax.validation.ConstraintValidator; +import javax.validation.ConstraintValidatorContext; +import javax.validation.Payload; + +import io.divolte.server.config.DivolteConfiguration; + +@Target({ TYPE }) +@Retention(RUNTIME) +@Constraint(validatedBy = MappingSourceSinkReferencesMustExist.Validator.class) +@Documented +public @interface MappingSourceSinkReferencesMustExist { + String message() default "The following sources and/or sinks were used in a mapping but never defined: ${validatedValue.missingSourcesSinks()}."; + Class[] groups() default {}; + Class[] payload() default {}; + + public static final class Validator implements ConstraintValidator{ + @Override + public void initialize(final MappingSourceSinkReferencesMustExist constraintAnnotation) { + } + + @Override + public boolean isValid(final DivolteConfiguration value, final ConstraintValidatorContext context) { + return value.missingSourcesSinks().isEmpty(); + } + } +} diff --git a/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java b/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java new file mode 100644 index 00000000..0b0fe333 --- /dev/null +++ b/src/main/java/io/divolte/server/config/constraint/OneSchemaPerSink.java @@ -0,0 +1,37 @@ +package io.divolte.server.config.constraint; + +import static java.lang.annotation.ElementType.*; +import static java.lang.annotation.RetentionPolicy.*; + +import java.lang.annotation.Documented; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import javax.validation.Constraint; +import javax.validation.ConstraintValidator; +import javax.validation.ConstraintValidatorContext; +import javax.validation.Payload; + +import io.divolte.server.config.DivolteConfiguration; + +@Target({ TYPE }) +@Retention(RUNTIME) +@Constraint(validatedBy=OneSchemaPerSink.Validator.class) +@Documented +public @interface OneSchemaPerSink { + String message() default "Any sink can only use one schema. The following sinks have multiple mappings with different schema's linked to them: ${validatedValue.sinksWithMultipleSchemas()}."; + Class[] groups() default {}; + Class[] payload() default {}; + + public static class Validator implements ConstraintValidator { + @Override + public void initialize(final OneSchemaPerSink constraintAnnotation) { + // Nothing needed here. 
+ } + + @Override + public boolean isValid(final DivolteConfiguration value, final ConstraintValidatorContext context) { + return value.sinksWithMultipleSchemas().isEmpty(); + } + } +} diff --git a/src/main/java/io/divolte/server/config/constraint/SourceAndSinkNamesCannotCollide.java b/src/main/java/io/divolte/server/config/constraint/SourceAndSinkNamesCannotCollide.java new file mode 100644 index 00000000..d084619a --- /dev/null +++ b/src/main/java/io/divolte/server/config/constraint/SourceAndSinkNamesCannotCollide.java @@ -0,0 +1,36 @@ +package io.divolte.server.config.constraint; + +import static java.lang.annotation.ElementType.*; +import static java.lang.annotation.RetentionPolicy.*; + +import java.lang.annotation.Documented; +import java.lang.annotation.Retention; +import java.lang.annotation.Target; + +import javax.validation.Constraint; +import javax.validation.ConstraintValidator; +import javax.validation.ConstraintValidatorContext; +import javax.validation.Payload; + +import io.divolte.server.config.DivolteConfiguration; + +@Target({ TYPE }) +@Retention(RUNTIME) +@Constraint(validatedBy=SourceAndSinkNamesCannotCollide.Validator.class) +@Documented +public @interface SourceAndSinkNamesCannotCollide { + String message() default "Source and sink names cannot collide (must be globally unique). The following names were both used as source and as sink: ${validatedValue.collidingSourceAndSinkNames()}."; + Class[] groups() default {}; + Class[] payload() default {}; + + public static class Validator implements ConstraintValidator { + @Override + public void initialize(final SourceAndSinkNamesCannotCollide constraintAnnotation) { + } + + @Override + public boolean isValid(final DivolteConfiguration value, final ConstraintValidatorContext context) { + return value.collidingSourceAndSinkNames().isEmpty(); + } + } +} diff --git a/src/main/java/io/divolte/server/hdfs/FileCreateAndSyncStrategy.java b/src/main/java/io/divolte/server/hdfs/FileCreateAndSyncStrategy.java index bcaf81b0..ff2e7813 100644 --- a/src/main/java/io/divolte/server/hdfs/FileCreateAndSyncStrategy.java +++ b/src/main/java/io/divolte/server/hdfs/FileCreateAndSyncStrategy.java @@ -17,11 +17,8 @@ package io.divolte.server.hdfs; import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.config.FileStrategyConfiguration.Types; -import io.divolte.server.config.ValidatedConfiguration; -import org.apache.avro.Schema; -import org.apache.hadoop.fs.FileSystem; +import javax.annotation.ParametersAreNonnullByDefault; /* * Used by the HdfsFlusher to actually flush events to HDFS. Different implementation @@ -31,23 +28,13 @@ * heartbeat() when no events are available. When either append(...) or heartbeat return FAILURE, * clients MUST NOT call append(...) any more, until a call to heartbeat() has returned SUCCESS. 
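The comment above defines the calling contract for FileCreateAndSyncStrategy: setup() first, append(...) only while the previous result was SUCCESS, and heartbeat() both when idle and as the recovery probe after a FAILURE. A minimal sketch of a caller honouring that contract, assuming it lives in the io.divolte.server.hdfs package since the interface is package-private; the driver class itself is illustrative and not part of the patch:

    package io.divolte.server.hdfs;

    import io.divolte.server.AvroRecordBuffer;
    import io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult;

    final class FileStrategyDriverSketch {
        private final FileCreateAndSyncStrategy strategy;
        private HdfsOperationResult lastResult;

        FileStrategyDriverSketch(final FileCreateAndSyncStrategy strategy) {
            this.strategy = strategy;
            this.lastResult = strategy.setup(); // setup() is called exactly once, before anything else
        }

        void onRecord(final AvroRecordBuffer record) {
            if (lastResult == HdfsOperationResult.SUCCESS) {
                // append(...) is only allowed while the previous call reported SUCCESS.
                lastResult = strategy.append(record);
            }
            // After a FAILURE we must wait for a heartbeat() that reports SUCCESS again.
        }

        void onIdle() {
            // No records available: heartbeat() keeps the strategy alive and doubles
            // as the recovery path after a FAILURE.
            lastResult = strategy.heartbeat();
        }
    }
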
*/ +@ParametersAreNonnullByDefault interface FileCreateAndSyncStrategy { HdfsOperationResult setup(); HdfsOperationResult heartbeat(); HdfsOperationResult append(final AvroRecordBuffer record); void cleanup(); - static FileCreateAndSyncStrategy create(final ValidatedConfiguration vc, final FileSystem fs, final short hdfsReplication, final Schema schema) { - if (vc.configuration().hdfsFlusher.fileStrategy.type == Types.SESSION_BINNING) { - return new SessionBinningFileStrategy(vc, fs, hdfsReplication, schema); - } else if (vc.configuration().hdfsFlusher.fileStrategy.type == Types.SIMPLE_ROLLING_FILE) { - return new SimpleRollingFileStrategy(vc, fs, hdfsReplication, schema); - } else { - // Should not occur with a validate configuration. - throw new RuntimeException("No valid HDFS file flushing strategy was configured."); - } - } - enum HdfsOperationResult { SUCCESS, FAILURE diff --git a/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java b/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java index 8e0f6f13..f530459b 100644 --- a/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java +++ b/src/main/java/io/divolte/server/hdfs/HdfsFlusher.java @@ -18,25 +18,26 @@ import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.*; import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; -import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.config.ValidatedConfiguration; -import io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult; -import io.divolte.server.processing.ItemProcessor; import java.io.IOException; -import java.net.URI; -import java.net.URISyntaxException; import java.util.Objects; import javax.annotation.ParametersAreNonnullByDefault; import javax.annotation.concurrent.NotThreadSafe; +import io.divolte.server.config.HdfsSinkConfiguration; import org.apache.avro.Schema; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import io.divolte.server.AvroRecordBuffer; +import io.divolte.server.config.ValidatedConfiguration; +import io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult; +import io.divolte.server.processing.Item; +import io.divolte.server.processing.ItemProcessor; + @ParametersAreNonnullByDefault @NotThreadSafe public final class HdfsFlusher implements ItemProcessor { @@ -45,46 +46,43 @@ public final class HdfsFlusher implements ItemProcessor { private final FileCreateAndSyncStrategy fileStrategy; private HdfsOperationResult lastHdfsResult; - public HdfsFlusher(final ValidatedConfiguration vc, final Schema schema) { + public HdfsFlusher(final ValidatedConfiguration vc, final String name, final Schema schema) { Objects.requireNonNull(vc); - final FileSystem hadoopFs; - final Configuration hdfsConfiguration = new Configuration(); - final short hdfsReplication = vc.configuration().hdfsFlusher.hdfs.replication; - + final Configuration hdfsConfiguration = vc.configuration().global.hdfs.client + .map(clientProperties -> { + final Configuration configuration = new Configuration(false); + for (final String propertyName : clientProperties.stringPropertyNames()) { + configuration.set(propertyName, clientProperties.getProperty(propertyName)); + } + return configuration; + }) + .orElse(new Configuration()); /* * The HDFS client creates a JVM shutdown hook, which interferes with our own server shutdown hook. * This config option disabled the built in shutdown hook. 
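The flusher above now builds its Hadoop client Configuration from the optional global HDFS client properties rather than from a configured URI. A minimal sketch of that properties-to-Configuration step; the fs.defaultFS value is illustrative and not taken from the patch:

    import java.util.Properties;

    import org.apache.hadoop.conf.Configuration;

    public final class HdfsClientConfigSketch {
        static Configuration toConfiguration(final Properties clientProperties) {
            // 'false' skips loading the default resources; only the explicit properties apply.
            final Configuration configuration = new Configuration(false);
            for (final String name : clientProperties.stringPropertyNames()) {
                configuration.set(name, clientProperties.getProperty(name));
            }
            return configuration;
        }

        public static void main(final String[] args) {
            final Properties clientProperties = new Properties();
            clientProperties.setProperty("fs.defaultFS", "hdfs://namenode:8020"); // illustrative value
            System.out.println(toConfiguration(clientProperties).get("fs.defaultFS"));
        }
    }
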
We call FileSystem.closeAll() ourselves * in the server shutdown hook instead. */ hdfsConfiguration.setBoolean("fs.automatic.close", false); + + final FileSystem hadoopFs; try { - hadoopFs = vc.configuration().hdfsFlusher.hdfs.uri.map(uri -> { - try { - return FileSystem.get(new URI(uri), hdfsConfiguration); - } catch (IOException | URISyntaxException e) { - /* - * It is possible to create a FileSystem instance when HDFS is not available (e.g. NameNode down). - * This exception only occurs when there is a configuration error in the URI (e.g. wrong scheme). - * So we fail to start up in this case. Below we create the actual HDFS connection, by opening - * files. If that fails, we do startup and initiate the regular retry cycle. - */ - logger.error("Could not initialize HDFS filesystem.", e); - throw new RuntimeException("Could not initialize HDFS filesystem", e); - } - }).orElse(FileSystem.get(hdfsConfiguration)); - } catch (IOException ioe) { + hadoopFs = FileSystem.get(hdfsConfiguration); + } catch (final IOException e) { /* * It is possible to create a FileSystem instance when HDFS is not available (e.g. NameNode down). * This exception only occurs when there is a configuration error in the URI (e.g. wrong scheme). * So we fail to start up in this case. Below we create the actual HDFS connection, by opening * files. If that fails, we do startup and initiate the regular retry cycle. */ - logger.error("Could not initialize HDFS filesystem.", ioe); - throw new RuntimeException("Could not initialize HDFS filesystem", ioe); + logger.error("Could not initialize HDFS filesystem.", e); + throw new RuntimeException("Could not initialize HDFS filesystem", e); } + final short hdfsReplication = + vc.configuration() + .getSinkConfiguration(Objects.requireNonNull(name), HdfsSinkConfiguration.class).replication; - fileStrategy = FileCreateAndSyncStrategy.create(vc, hadoopFs, hdfsReplication, Objects.requireNonNull(schema)); + fileStrategy = new SimpleRollingFileStrategy(vc, name, hadoopFs, hdfsReplication, Objects.requireNonNull(schema)); lastHdfsResult = fileStrategy.setup(); } @@ -94,7 +92,8 @@ public void cleanup() { } @Override - public ProcessingDirective process(AvroRecordBuffer record) { + public ProcessingDirective process(final Item item) { + final AvroRecordBuffer record = item.payload; if (lastHdfsResult == SUCCESS) { return (lastHdfsResult = fileStrategy.append(record)) == SUCCESS ? 
CONTINUE : PAUSE; } else { diff --git a/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java b/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java index bac80ccf..47c4c733 100644 --- a/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java +++ b/src/main/java/io/divolte/server/hdfs/HdfsFlushingPool.java @@ -17,6 +17,7 @@ package io.divolte.server.hdfs; import io.divolte.server.AvroRecordBuffer; +import io.divolte.server.SchemaRegistry; import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.processing.ProcessingPool; @@ -28,26 +29,24 @@ @ParametersAreNonnullByDefault public final class HdfsFlushingPool extends ProcessingPool{ - public HdfsFlushingPool(final ValidatedConfiguration vc, final Schema schema) { - this( - Objects.requireNonNull(vc), - Objects.requireNonNull(schema), - vc.configuration().hdfsFlusher.threads, - vc.configuration().hdfsFlusher.maxWriteQueue, - vc.configuration().hdfsFlusher.maxEnqueueDelay.toMillis() - ); + public HdfsFlushingPool(final ValidatedConfiguration vc, + final String name, + final SchemaRegistry schemaRegistry) { + this(vc, + name, + schemaRegistry.getSchemaBySinkName(name), + vc.configuration().global.hdfs.threads, + vc.configuration().global.hdfs.bufferSize); } - public HdfsFlushingPool(final ValidatedConfiguration vc, final Schema schema, final int numThreads, final int maxQueueSize, final long maxEnqueueDelay) { - super( - numThreads, - maxQueueSize, - maxEnqueueDelay, - "Hdfs Flusher", - () -> new HdfsFlusher(vc, schema)); - } - - public void enqueueRecordsForFlushing(final AvroRecordBuffer record) { - enqueue(record.getPartyId().value, record); + public HdfsFlushingPool(final ValidatedConfiguration vc, + final String name, + final Schema schema, + final int numThreads, + final int maxQueueSize) { + super(numThreads, + maxQueueSize, + String.format("Hdfs Flusher [%s]", Objects.requireNonNull(name)), + () -> new HdfsFlusher(Objects.requireNonNull(vc), name, Objects.requireNonNull(schema))); } } diff --git a/src/main/java/io/divolte/server/hdfs/SessionBinningFileStrategy.java b/src/main/java/io/divolte/server/hdfs/SessionBinningFileStrategy.java deleted file mode 100644 index 3ae00a30..00000000 --- a/src/main/java/io/divolte/server/hdfs/SessionBinningFileStrategy.java +++ /dev/null @@ -1,435 +0,0 @@ -/* - * Copyright 2014 GoDataDriven B.V. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.divolte.server.hdfs; - -import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.*; -import static java.util.Calendar.*; -import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.config.SessionBinningFileStrategyConfiguration; -import io.divolte.server.config.ValidatedConfiguration; - -import java.io.IOException; -import java.net.InetAddress; -import java.net.UnknownHostException; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.GregorianCalendar; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Optional; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; - -import javax.annotation.concurrent.NotThreadSafe; - -import org.apache.avro.Schema; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.collect.Maps; - -/* - * The general idea of this file strategy is to provide a best effort to put events that belong to the same session in the same file. - * - * The session binning file strategy assigns event to files as such: - * - each timestamp is assigned to a round, defined as timestamp_in_millis / session_timeout_in_millis - * - we open a file for a round as time passes - * - all events for a session are stored in the file with the round marked by the session start time - * - a file for a round is kept open for at least three times the session duration *in absence of failures* - * - during this entire process, we use the event timestamp for events that come off the queue as a logical clock signal - * - only in the case of an empty queue, we use the actual system time as clock signal (receiving heartbeats in a state of normal operation means an empty queue) - * - when a file for a round is closed, but events that should be in that file still arrive, they are stored in the oldest open file - * - this happens for exceptionally long sessions - * - * The above mechanics allow for the following guarantee: if a file is properly opened, used for flushing and closed without intermediate failures, - * all sessions that start within that file and last less than the session timeout duration, will be fully contained in that file. - * - * In case of failure, we close all open files. This means that files that were closed as a result of such a failure *DO NOT* provide above guarantee. 
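For reference, the round arithmetic the removed session-binning strategy describes above is integer division of the event timestamp by the session timeout, with each round's file kept open for three session durations. A small illustrative calculation, assuming a 30-minute timeout and an arbitrary timestamp; neither value comes from the patch:

    public final class SessionRoundSketch {
        public static void main(final String[] args) {
            final long sessionTimeoutMillis = 30L * 60 * 1000;         // assumed 30-minute session timeout
            final long eventTimeMillis = 1_470_000_000_000L;           // an arbitrary event timestamp
            final long round = eventTimeMillis / sessionTimeoutMillis; // round (and file) the session maps to
            // Files for rounds older than (current round - 2) are eligible for closing,
            // i.e. each round's file lives for three session durations.
            final long oldestOpenRound = round - 2;
            System.out.println("round=" + round + ", oldestOpenRound=" + oldestOpenRound);
        }
    }
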
- */ -@NotThreadSafe -public class SessionBinningFileStrategy implements FileCreateAndSyncStrategy { - private final static Logger logger = LoggerFactory.getLogger(SessionBinningFileStrategy.class); - - private final static long HDFS_RECONNECT_DELAY_MILLIS = 15000; - private final static long FILE_TIME_TO_LIVE_IN_SESSION_DURATIONS = 3; - - private final static AtomicInteger INSTANCE_COUNTER = new AtomicInteger(); - private final int instanceNumber; - private final String hostString; - - - private final FileSystem hdfs; - private final short hdfsReplication; - - private final Schema schema; - - private final long sessionTimeoutMillis; - - private final Map openFiles; - private final String hdfsWorkingDir; - private final String hdfsPublishDir; - private final long syncEveryMillis; - private final int syncEveryRecords; - - private boolean isHdfsAlive; - private long lastFixAttempt; - private long timeSignal; - - private long lastSyncTime; - private int recordsSinceLastSync; - - - public SessionBinningFileStrategy(final ValidatedConfiguration vc, final FileSystem hdfs, final short hdfsReplication, final Schema schema) { - sessionTimeoutMillis = vc.configuration().tracking.sessionTimeout.toMillis(); - - hostString = findLocalHostName(); - instanceNumber = INSTANCE_COUNTER.incrementAndGet(); - final SessionBinningFileStrategyConfiguration fileStrategyConfiguration = vc.configuration().hdfsFlusher.fileStrategy.as(SessionBinningFileStrategyConfiguration.class); - hdfsWorkingDir = fileStrategyConfiguration.workingDir; - hdfsPublishDir = fileStrategyConfiguration.publishDir; - - syncEveryMillis = fileStrategyConfiguration.syncFileAfterDuration.toMillis(); - syncEveryRecords = fileStrategyConfiguration.syncFileAfterRecords; - - this.hdfs = hdfs; - this.hdfsReplication = hdfsReplication; - - this.schema = schema; - - openFiles = Maps.newHashMapWithExpectedSize(10); - - throwsIoException(() -> { - if (!hdfs.isDirectory(new Path(hdfsWorkingDir))) { - throw new IOException("Working directory for in-flight AVRO records does not exist: " + hdfsWorkingDir); - } - if (!hdfs.isDirectory(new Path(hdfsPublishDir))) { - throw new IOException("Working directory for publishing AVRO records does not exist: " + hdfsPublishDir); - } - }).ifPresent((e) -> { throw new RuntimeException("Configuration error", e); }); - } - - private static String findLocalHostName() { - try { - return InetAddress.getLocalHost().getHostName(); - } catch (final UnknownHostException e) { - return "localhost"; - } - } - - @Override - public HdfsOperationResult setup() { - /* - * On setup, we assume everything to work, as we cannot open - * any files before receiving any events. This is because the - * events are used as a clock signal. 
- */ - isHdfsAlive = true; - lastFixAttempt = 0; - - lastSyncTime = 0; - recordsSinceLastSync = 0; - - return SUCCESS; - } - - @Override - public HdfsOperationResult heartbeat() { - if (isHdfsAlive) { - // queue is empty, so logical time == current system time - timeSignal = System.currentTimeMillis(); - return throwsIoException(this::possiblySyncAndOrClose) - .map((ioe) -> { - logger.warn("Failed to sync HDFS file.", ioe); - hdfsDied(); - return FAILURE; - }) - .orElse(SUCCESS); - } else { - // queue may or may not be empty, just attempt a reconnect - return possiblyFixHdfsConnection(); - } - } - - @Override - public HdfsOperationResult append(final AvroRecordBuffer record) { - if (!isHdfsAlive) { - throw new IllegalStateException("Append attempt while HDFS connection is not alive."); - } - - timeSignal = record.getEventTime(); - return writeRecord(record); - } - - private HdfsOperationResult writeRecord(final AvroRecordBuffer record) { - return throwsIoException(() -> { - final RoundHdfsFile file = fileForSessionStartTime(record.getSessionId().timestamp - record.getCookieUtcOffset()); - file.writer.appendEncoded(record.getByteBuffer()); - file.recordsSinceLastSync += 1; - recordsSinceLastSync += 1; - possiblySyncAndOrClose(); - }) - .map((ioe) -> { - logger.warn("Error while flushing event to HDFS.", ioe); - hdfsDied(); - return FAILURE; - }) - .orElse(SUCCESS); - } - - @Override - public void cleanup() { - openFiles.values().forEach((file) -> throwsIoException(() -> file.close(false)) - .ifPresent((ioe) -> logger.warn("Failed to properly close HDFS file: " + file.path, ioe))); - openFiles.clear(); - } - - private void possiblySyncAndOrClose() { - try { - final long time = System.currentTimeMillis(); - - if ( - recordsSinceLastSync >= syncEveryRecords || - time - lastSyncTime >= syncEveryMillis && recordsSinceLastSync > 0) { - - openFiles - .values() - .stream() - .filter((f) -> f.recordsSinceLastSync > 0) // only sync files that have pending records - .forEach((file) -> { - try { - logger.debug("Syncing file: {}", file.path); - file.writer.sync(); // Forces the Avro file to write a block - file.stream.hsync(); // Forces a (HDFS) sync on the underlying stream - file.recordsSinceLastSync = 0; - } catch (final IOException e) { - throw new WrappedIOException(e); - } - }); - - recordsSinceLastSync = 0; - lastSyncTime = time; - } else if (recordsSinceLastSync == 0) { - lastSyncTime = time; - } - } finally { - possiblyCloseAndCleanup(); - } - } - - private void possiblyCloseAndCleanup() { - final long oldestAllowedRound = timeSignal / sessionTimeoutMillis - (FILE_TIME_TO_LIVE_IN_SESSION_DURATIONS - 1); - - final List> entriesToBeClosed = openFiles - .entrySet() - .stream() - .filter((e) -> e.getValue().round < oldestAllowedRound) - .collect(Collectors.toList()); - - entriesToBeClosed - .stream() - .map(Entry::getValue) - .distinct() - .forEach((file) -> { - logger.debug("Closing HDFS file: {}", file.path); - throwsIoException(() -> file.close(true)) - .ifPresent((ioe) -> logger.warn("Failed to cleanly close HDFS file: " + file.path, ioe)); - }); - - entriesToBeClosed - .forEach((e) -> openFiles.remove(e.getKey())); - } - - private HdfsOperationResult possiblyFixHdfsConnection() { - if (isHdfsAlive) { - throw new IllegalStateException("HDFS connection repair attempt while not broken."); - } - - final long time = System.currentTimeMillis(); - if (time - lastFixAttempt > HDFS_RECONNECT_DELAY_MILLIS) { - return throwsIoException(() -> openFiles.put(timeSignal / sessionTimeoutMillis, new 
RoundHdfsFile(timeSignal))) - .map((ioe) -> { - logger.warn("Could not reconnect to HDFS after failure."); - lastFixAttempt = time; - return FAILURE; - }) - .orElseGet(() -> { - logger.info("Recovered HDFS connection."); - isHdfsAlive = true; - lastFixAttempt = 0; - return SUCCESS; - }); - } else { - return FAILURE; - } - } - - private void hdfsDied() { - /* - * On HDFS connection / access failure, we abandon everything and periodically try to reconnect, - * by re-creating a file for the round that caused the failure. Other files will be re-created - * as records for specific files arrive. - */ - isHdfsAlive = false; - openFiles.values().forEach((file) -> throwsIoException(() -> file.close(false))); - openFiles.clear(); - - logger.warn("HDFS failure. Closing all files and going into connect retry cycle."); - } - - private RoundHdfsFile fileForSessionStartTime(final long sessionStartTime) { - final long requestedRound = sessionStartTime / sessionTimeoutMillis; - // return the first open file for which the round >= the requested round - // or create a new file if no such file is present - return openFiles.computeIfAbsent(requestedRound, (ignored) -> openFiles - .values() - .stream() - .sorted((left, right) -> Long.compare(left.round, right.round)) - .filter((f) -> f.round >= requestedRound) - .findFirst() - .orElseGet(() -> - // if the requested round is greater than the current round + 1, - // we return the file for the current round, as probably this is - // a result of a very skewed client side clock, or a fake request - requestedRound > timeSignal / sessionTimeoutMillis + 1 - ? fileForSessionStartTime(timeSignal) - : new RoundHdfsFile(sessionStartTime) - )); - } - - private final class RoundHdfsFile { - private static final String INFLIGHT_EXTENSION = ".partial"; - private static final int MAX_AVRO_SYNC_INTERVAL = 1 << 30; - private final DateFormat format = new SimpleDateFormat("HH.mm.ss.SSS"); - - final Path path; - final long round; - final FSDataOutputStream stream; - final DataFileWriter writer; - - int recordsSinceLastSync; - - RoundHdfsFile(final long time) { - final long requestedRound = time / sessionTimeoutMillis; - final long oldestAllowedRound = timeSignal / sessionTimeoutMillis - (FILE_TIME_TO_LIVE_IN_SESSION_DURATIONS - 1); - this.round = Math.max(requestedRound, oldestAllowedRound); - - this.path = new Path(hdfsWorkingDir, - String.format("%s-divolte-tracking-%s-%s-%d.avro" + INFLIGHT_EXTENSION, - hostString, // add host name, differentiates when deploying multiple collector instances - roundString(round * sessionTimeoutMillis), // composed of the round start date + round number within the day - format.format(new Date()), // additionally, we add a timestamp, because after failures, a file for a round can be created multiple times - instanceNumber)); // add instance number, so different threads cannot try to create the exact same file - - try { - stream = hdfs.create(path, hdfsReplication); - writer = new DataFileWriter(new GenericDatumWriter<>(schema)).create(schema, stream); - writer.setSyncInterval(MAX_AVRO_SYNC_INTERVAL); // since we manually sync at chosen intervals - writer.setFlushOnEveryBlock(true); - - // Sync the file on open to make sure the - // connection actually works, because - // HDFS allows file creation even with no - // datanodes available - stream.hsync(); - recordsSinceLastSync = 0; - - logger.debug("Created new HDFS file: {}", path); - } catch (final IOException e) { - logger.warn("Failed HDFS file creation: {}", path); - // we may have created the 
file, but failed to sync, so we attempt a delete - // this happens when the NN responds successfully, but there are no DNs available - throwsIoException(() -> hdfs.delete(path, false)); - throw new WrappedIOException(e); - } - } - - private String roundString(final long roundStartTime) { - /* - * The round string in the filename is constructed from the current date - * in the form YYYYmmdd-RR. Where RR is the 0-padded number of session length - * intervals since midnight on the current day. This uses the system timezone. - * Note that if the system is in a timezone that supports DST, the number of - * session length intervals per day is not equal for all days. - */ - final GregorianCalendar gc = new GregorianCalendar(); - gc.setTimeInMillis(roundStartTime); - gc.set(HOUR_OF_DAY, 0); - gc.set(MINUTE, 0); - gc.set(SECOND, 0); - gc.set(MILLISECOND, 0); - - return String.format("%d%02d%02d-%02d", - gc.get(YEAR), - gc.get(MONTH) + 1, - gc.get(DAY_OF_MONTH), - (roundStartTime - gc.getTimeInMillis()) / sessionTimeoutMillis); - } - - private Path getPublishDestination() { - final String pathName = path.getName(); - return new Path(hdfsPublishDir, pathName.substring(0, pathName.length() - INFLIGHT_EXTENSION.length())); - } - - public void close(final boolean publish) { - try { - writer.close(); - if (publish) { - final Path publishDestination = getPublishDestination(); - logger.debug("Moving HDFS file: {} -> {}", path, publishDestination); - if (!hdfs.rename(path, publishDestination)) { - throw new IOException("Could not rename HDFS file: " + path + " -> " + publishDestination); - } - } - } catch (final IOException e) { - throw new WrappedIOException(e); - } - } - } - - @SuppressWarnings("serial") - private static final class WrappedIOException extends RuntimeException { - final IOException wrappedIOException; - - private WrappedIOException(final IOException ioe) { - this.wrappedIOException = ioe; - } - } - - @FunctionalInterface - private interface IOExceptionThrower { - void run() throws IOException; - } - - private static Optional throwsIoException(final IOExceptionThrower r) { - try { - r.run(); - return Optional.empty(); - } catch (final IOException ioe) { - return Optional.of(ioe); - } catch (final WrappedIOException wioe) { - return Optional.of(wioe.wrappedIOException); - } - } -} diff --git a/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java b/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java index 257dd12f..01ae78cb 100644 --- a/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java +++ b/src/main/java/io/divolte/server/hdfs/SimpleRollingFileStrategy.java @@ -17,9 +17,6 @@ package io.divolte.server.hdfs; import static io.divolte.server.hdfs.FileCreateAndSyncStrategy.HdfsOperationResult.*; -import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.config.SimpleRollingFileStrategyConfiguration; -import io.divolte.server.config.ValidatedConfiguration; import java.io.IOException; import java.net.InetAddress; @@ -34,6 +31,7 @@ import javax.annotation.ParametersAreNonnullByDefault; import javax.annotation.concurrent.NotThreadSafe; +import io.divolte.server.config.HdfsSinkConfiguration; import org.apache.avro.Schema; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericDatumWriter; @@ -44,6 +42,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import io.divolte.server.AvroRecordBuffer; +import io.divolte.server.config.FileStrategyConfiguration; +import 
io.divolte.server.config.ValidatedConfiguration; + @NotThreadSafe @ParametersAreNonnullByDefault public class SimpleRollingFileStrategy implements FileCreateAndSyncStrategy { @@ -73,11 +75,16 @@ public class SimpleRollingFileStrategy implements FileCreateAndSyncStrategy { private boolean isHdfsAlive; private long lastFixAttempt; - public SimpleRollingFileStrategy(final ValidatedConfiguration vc, final FileSystem fs, final short hdfsReplication, final Schema schema) { + public SimpleRollingFileStrategy(final ValidatedConfiguration vc, + final String name, + final FileSystem fs, + final short hdfsReplication, + final Schema schema) { Objects.requireNonNull(vc); this.schema = Objects.requireNonNull(schema); - final SimpleRollingFileStrategyConfiguration fileStrategyConfiguration = vc.configuration().hdfsFlusher.fileStrategy.as(SimpleRollingFileStrategyConfiguration.class); + final FileStrategyConfiguration fileStrategyConfiguration = + vc.configuration().getSinkConfiguration(name, HdfsSinkConfiguration.class).fileStrategy; syncEveryMillis = fileStrategyConfiguration.syncFileAfterDuration.toMillis(); syncEveryRecords = fileStrategyConfiguration.syncFileAfterRecords; newFileEveryMillis = fileStrategyConfiguration.rollEvery.toMillis(); @@ -108,7 +115,7 @@ private Path newFilePath() { private static String findLocalHostName() { try { return InetAddress.getLocalHost().getHostName(); - } catch (UnknownHostException e) { + } catch (final UnknownHostException e) { return "localhost"; } } @@ -202,7 +209,7 @@ private void possiblyRollFile(final long time) throws IOException { final Path newFilePath = newFilePath(); try { currentFile = openNewFile(newFilePath); - } catch (IOException e) { + } catch (final IOException e) { throwsIoException(() -> hdfs.delete(newFilePath, false)); throw e; } @@ -249,7 +256,7 @@ private final class HadoopFile implements AutoCloseable { long totalRecords; @SuppressWarnings("resource") - public HadoopFile(Path path) throws IOException { + public HadoopFile(final Path path) throws IOException { this.path = path; this.stream = hdfs.create(path, hdfsReplication); @@ -274,6 +281,7 @@ private Path getPublishDestination() { return new Path(hdfsPublishDir, pathName.substring(0, pathName.length() - INFLIGHT_EXTENSION.length())); } + @Override public void close() throws IOException { totalRecords += recordsSinceLastSync; writer.close(); diff --git a/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java b/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java index e047efba..33a5b211 100644 --- a/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java +++ b/src/main/java/io/divolte/server/js/TrackingJavaScriptResource.java @@ -16,17 +16,15 @@ package io.divolte.server.js; +import com.google.common.collect.ImmutableMap; +import io.divolte.server.config.BrowserSourceConfiguration; import io.divolte.server.config.ValidatedConfiguration; - -import java.io.IOException; -import java.time.temporal.ChronoUnit; - -import javax.annotation.ParametersAreNonnullByDefault; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.collect.ImmutableMap; +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.time.temporal.ChronoUnit; @ParametersAreNonnullByDefault public class TrackingJavaScriptResource extends JavaScriptResource { @@ -34,25 +32,26 @@ public class TrackingJavaScriptResource extends JavaScriptResource { private static final String SCRIPT_CONSTANT_NAME = "SCRIPT_NAME"; - public 
TrackingJavaScriptResource(final ValidatedConfiguration vc) throws IOException { - super("divolte.js", createScriptConstants(vc), vc.configuration().javascript.debug); + public TrackingJavaScriptResource(final String resourceName, + final ImmutableMap scriptConstants, + final boolean debugMode) throws IOException { + super(resourceName, scriptConstants, debugMode); } public String getScriptName() { return (String)getScriptConstants().get(SCRIPT_CONSTANT_NAME); } - private static ImmutableMap createScriptConstants(final ValidatedConfiguration vc) { + private static ImmutableMap createScriptConstants(final BrowserSourceConfiguration browserSourceConfiguration) { final ImmutableMap.Builder builder = ImmutableMap.builder(); - builder.put("PARTY_COOKIE_NAME", vc.configuration().tracking.partyCookie); - builder.put("PARTY_ID_TIMEOUT_SECONDS", trimLongToMaxInt(vc.configuration().tracking.partyTimeout.get(ChronoUnit.SECONDS))); - builder.put("SESSION_COOKIE_NAME", vc.configuration().tracking.sessionCookie); - builder.put("SESSION_ID_TIMEOUT_SECONDS", trimLongToMaxInt(vc.configuration().tracking.sessionTimeout.get(ChronoUnit.SECONDS))); - vc.configuration().tracking.cookieDomain - .ifPresent((v) -> builder.put("COOKIE_DOMAIN", v)); - builder.put("LOGGING", vc.configuration().javascript.logging); - builder.put(SCRIPT_CONSTANT_NAME, vc.configuration().javascript.name); - builder.put("AUTO_PAGE_VIEW_EVENT", vc.configuration().javascript.autoPageViewEvent); + builder.put("PARTY_COOKIE_NAME", browserSourceConfiguration.partyCookie); + builder.put("PARTY_ID_TIMEOUT_SECONDS", trimLongToMaxInt(browserSourceConfiguration.partyTimeout.get(ChronoUnit.SECONDS))); + builder.put("SESSION_COOKIE_NAME", browserSourceConfiguration.sessionCookie); + builder.put("SESSION_ID_TIMEOUT_SECONDS", trimLongToMaxInt(browserSourceConfiguration.sessionTimeout.get(ChronoUnit.SECONDS))); + browserSourceConfiguration.cookieDomain.ifPresent((v) -> builder.put("COOKIE_DOMAIN", v)); + builder.put("LOGGING", browserSourceConfiguration.javascript.logging); + builder.put(SCRIPT_CONSTANT_NAME, browserSourceConfiguration.javascript.name); + builder.put("AUTO_PAGE_VIEW_EVENT", browserSourceConfiguration.javascript.autoPageViewEvent); return builder.build(); } @@ -67,4 +66,13 @@ private static int trimLongToMaxInt(long duration) { } return result; } + + public static TrackingJavaScriptResource create(final ValidatedConfiguration vc, + final String sourceName) throws IOException { + final BrowserSourceConfiguration browserSourceConfiguration = + vc.configuration().getSourceConfiguration(sourceName, BrowserSourceConfiguration.class); + return new TrackingJavaScriptResource(browserSourceConfiguration.javascript.name, + createScriptConstants(browserSourceConfiguration), + browserSourceConfiguration.javascript.debug); + } } diff --git a/src/main/java/io/divolte/server/kafka/KafkaFlusher.java b/src/main/java/io/divolte/server/kafka/KafkaFlusher.java index 8b551566..fe1543ab 100644 --- a/src/main/java/io/divolte/server/kafka/KafkaFlusher.java +++ b/src/main/java/io/divolte/server/kafka/KafkaFlusher.java @@ -16,19 +16,9 @@ package io.divolte.server.kafka; -import com.google.common.collect.ImmutableList; -import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.DivolteIdentifier; -import io.divolte.server.processing.ItemProcessor; -import org.apache.kafka.clients.producer.Producer; -import org.apache.kafka.clients.producer.ProducerRecord; -import org.apache.kafka.clients.producer.RecordMetadata; -import 
org.apache.kafka.common.errors.RetriableException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; -import javax.annotation.ParametersAreNonnullByDefault; -import javax.annotation.concurrent.NotThreadSafe; +import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.Queue; @@ -36,8 +26,22 @@ import java.util.concurrent.Future; import java.util.stream.Collectors; -import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.CONTINUE; -import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.PAUSE; +import javax.annotation.ParametersAreNonnullByDefault; +import javax.annotation.concurrent.NotThreadSafe; + +import org.apache.kafka.clients.producer.Producer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.clients.producer.RecordMetadata; +import org.apache.kafka.common.errors.RetriableException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.collect.ImmutableList; + +import io.divolte.server.AvroRecordBuffer; +import io.divolte.server.DivolteIdentifier; +import io.divolte.server.processing.Item; +import io.divolte.server.processing.ItemProcessor; @ParametersAreNonnullByDefault @NotThreadSafe @@ -55,18 +59,19 @@ public KafkaFlusher(final String topic, final Producer buildRecord(AvroRecordBuffer record) { + private ProducerRecord buildRecord(final AvroRecordBuffer record) { return new ProducerRecord<>(topic, record.getPartyId(), record); } @Override - public ProcessingDirective process(final AvroRecordBuffer record) { + public ProcessingDirective process(final Item item) { + final AvroRecordBuffer record = item.payload; logger.debug("Processing individual record: {}", record); return flush(ImmutableList.of(buildRecord(record))); } @Override - public ProcessingDirective process(final Queue batch) { + public ProcessingDirective process(final Queue> batch) { final int batchSize = batch.size(); final ProcessingDirective result; switch (batchSize) { @@ -81,8 +86,9 @@ public ProcessingDirective process(final Queue batch) { logger.debug("Processing batch of {} records.", batchSize); final List> kafkaMessages = batch.stream() + .map(i -> i.payload) .map(this::buildRecord) - .collect(Collectors.toList()); + .collect(Collectors.toCollection(() -> new ArrayList<>(batchSize))); // Clear the messages now; on failure they'll be retried as part of our // pending operation. batch.clear(); @@ -118,10 +124,11 @@ private ProcessingDirective flush(final List> sendBatch(final List> batch) throws InterruptedException { // First start sending the messages. // (This will serialize them, determine the partition and then assign them to a per-partition buffer.) + final int batchSize = batch.size(); final List> sendResults = batch.stream() .map(producer::send) - .collect(Collectors.toList()); + .collect(Collectors.toCollection(() -> new ArrayList<>(batchSize))); // The producer will send the messages in the background. As of 0.8.x we can't // flush, but have to wait for that to occur based on the producer configuration. // (By default it will immediately flush, but users can override this.) @@ -132,7 +139,6 @@ private ImmutableList> sendBa // - A fatal error occurred. // (In addition, we can be interrupted due to shutdown.) 
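A note on the hunk above: the flusher first hands the whole batch to the Kafka producer and only then walks the returned futures, keeping just the retriable failures so they can be re-sent on a later flush. The following is a minimal sketch of that classification step, assuming the standard new-producer API referenced in this diff (send returning a Future of RecordMetadata); the class and method names (SendBatchSketch, sendAndCollectRetriable) are illustrative and not part of this change.

import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import org.apache.kafka.common.errors.RetriableException;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

// Hedged sketch, not the project's exact code: send a batch, then keep only
// the records whose send failed with a retriable error.
final class SendBatchSketch {
    static <K, V> List<ProducerRecord<K, V>> sendAndCollectRetriable(
            final Producer<K, V> producer,
            final List<ProducerRecord<K, V>> batch) throws InterruptedException {
        // Hand everything to the producer first; actual sending happens in the background.
        final List<Future<RecordMetadata>> results = new ArrayList<>(batch.size());
        for (final ProducerRecord<K, V> record : batch) {
            results.add(producer.send(record));
        }
        // Then inspect each result: successes are dropped, retriable failures are kept.
        final List<ProducerRecord<K, V>> remaining = new ArrayList<>();
        for (int i = 0; i < batch.size(); ++i) {
            try {
                results.get(i).get();
            } catch (final ExecutionException e) {
                if (e.getCause() instanceof RetriableException) {
                    remaining.add(batch.get(i)); // try again on the next flush
                }
                // Non-retriable causes are simply dropped here; the real flusher logs them.
            }
        }
        return remaining;
    }
}

Collecting the survivors into a list, rather than retrying inline, is what lets the surrounding hunk hold them as a pending operation and re-offer them on the next flush.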
final ImmutableList.Builder> remaining = ImmutableList.builder(); - final int batchSize = batch.size(); for (int i = 0; i < batchSize; ++i) { final Future result = sendResults.get(i); try { diff --git a/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java b/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java index 9eaf52a1..0d02e10b 100644 --- a/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java +++ b/src/main/java/io/divolte/server/kafka/KafkaFlushingPool.java @@ -18,6 +18,8 @@ import io.divolte.server.AvroRecordBuffer; import io.divolte.server.DivolteIdentifier; +import io.divolte.server.SchemaRegistry; +import io.divolte.server.config.KafkaSinkConfiguration; import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.processing.ProcessingPool; import org.apache.kafka.clients.producer.KafkaProducer; @@ -31,24 +33,27 @@ public class KafkaFlushingPool extends ProcessingPool producer; - public KafkaFlushingPool(final ValidatedConfiguration vc) { + public KafkaFlushingPool(final ValidatedConfiguration vc, final String name, final SchemaRegistry ignored) { this( - vc.configuration().kafkaFlusher.threads, - vc.configuration().kafkaFlusher.maxWriteQueue, - vc.configuration().kafkaFlusher.maxEnqueueDelay.toMillis(), - vc.configuration().kafkaFlusher.topic, - new KafkaProducer<>(vc.configuration().kafkaFlusher.producer, + name, + vc.configuration().global.kafka.threads, + vc.configuration().global.kafka.bufferSize, + vc.configuration().getSinkConfiguration(name, KafkaSinkConfiguration.class).topic, + new KafkaProducer<>(vc.configuration().global.kafka.producer, new DivolteIdentifierSerializer(), new AvroRecordBufferSerializer()) ); } - public KafkaFlushingPool(final int numThreads, + public KafkaFlushingPool(final String name, + final int numThreads, final int maxWriteQueue, - final long maxEnqueueDelay, final String topic, final Producer producer ) { - super(numThreads, maxWriteQueue, maxEnqueueDelay, "Kafka Flusher", () -> new KafkaFlusher(topic, producer)); + super(numThreads, + maxWriteQueue, + String.format("Kafka Flusher [%s]", Objects.requireNonNull(name)), + () -> new KafkaFlusher(topic, producer)); this.producer = Objects.requireNonNull(producer); } diff --git a/src/main/java/io/divolte/server/processing/Item.java b/src/main/java/io/divolte/server/processing/Item.java new file mode 100644 index 00000000..1124d48e --- /dev/null +++ b/src/main/java/io/divolte/server/processing/Item.java @@ -0,0 +1,36 @@ +package io.divolte.server.processing; + +import java.nio.charset.StandardCharsets; +import java.util.Objects; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; + +import javax.annotation.ParametersAreNonnullByDefault; + +@ParametersAreNonnullByDefault +public final class Item { + public final int sourceId; + public final int affinityHash; + public final E payload; + + private static final HashFunction HASHER = Hashing.murmur3_32(42); + + private Item(final int sourceId, final int affinityHash, final E payload) { + this.sourceId = sourceId; + this.affinityHash = affinityHash; + this.payload = Objects.requireNonNull(payload); + } + + public static Item of(final int sourceId, final String key, final E payload) { + return new Item<>( + sourceId, + // making sure the hash result is non-negative by masking with max int + HASHER.hashString(key, StandardCharsets.UTF_8).asInt() & Integer.MAX_VALUE, + payload); + } + + public static Item withCopiedAffinity(final int sourceId, final Item affinitySource, final E 
payload) { + return new Item<>(sourceId, affinitySource.affinityHash, payload); + } +} diff --git a/src/main/java/io/divolte/server/processing/ItemProcessor.java b/src/main/java/io/divolte/server/processing/ItemProcessor.java index 17cc1dde..97f3c314 100644 --- a/src/main/java/io/divolte/server/processing/ItemProcessor.java +++ b/src/main/java/io/divolte/server/processing/ItemProcessor.java @@ -16,14 +16,14 @@ package io.divolte.server.processing; -import java.util.Queue; - import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; +import java.util.Queue; + public interface ItemProcessor { - ProcessingDirective process(E e); + ProcessingDirective process(Item e); - default ProcessingDirective process(final Queue batch) { + default ProcessingDirective process(final Queue> batch) { ProcessingDirective directive; do { // Note: processing should not throw an unchecked diff --git a/src/main/java/io/divolte/server/processing/ProcessingPool.java b/src/main/java/io/divolte/server/processing/ProcessingPool.java index 6d423417..7032b42f 100644 --- a/src/main/java/io/divolte/server/processing/ProcessingPool.java +++ b/src/main/java/io/divolte/server/processing/ProcessingPool.java @@ -17,7 +17,6 @@ package io.divolte.server.processing; import static io.divolte.server.processing.ItemProcessor.ProcessingDirective.*; -import io.divolte.server.processing.ItemProcessor.ProcessingDirective; import java.util.ArrayDeque; import java.util.ArrayList; @@ -42,6 +41,8 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder; +import io.divolte.server.processing.ItemProcessor.ProcessingDirective; + @ParametersAreNonnullByDefault public class ProcessingPool, E> { private static final Logger logger = LoggerFactory.getLogger(ProcessingPool.class); @@ -49,8 +50,7 @@ public class ProcessingPool, E> { private static final int MAX_BATCH_SIZE = 128; private final ExecutorService executorService; - private final List> queues; - private final long maxEnqueueDelay; + private final List>> queues; private volatile boolean running; @@ -60,7 +60,6 @@ public class ProcessingPool, E> { public ProcessingPool( final int numThreads, final int maxQueueSize, - final long maxEnqueueDelay, final String threadBaseName, final Supplier processorSupplier) { @@ -73,9 +72,7 @@ public ProcessingPool( final ThreadFactory factory = createThreadFactory(threadGroup, threadBaseName + " - %d"); executorService = Executors.newFixedThreadPool(numThreads, factory); - this.maxEnqueueDelay = maxEnqueueDelay; - - this.queues = Stream.> + this.queues = Stream.>> generate(() -> new ArrayBlockingQueue<>(maxQueueSize)) .limit(numThreads) .collect(Collectors.toCollection(() -> new ArrayList<>(numThreads))); @@ -87,14 +84,10 @@ public ProcessingPool( } - public void enqueue(String key, E e) { - // We mask the hash-code to ensure we always get a positive bucket index. - if (!offerQuietly( - queues.get((key.hashCode() & Integer.MAX_VALUE) % queues.size()), - e, - maxEnqueueDelay, - TimeUnit.MILLISECONDS)) { - logger.warn("Failed to enqueue item for {} ms. Dropping event.", maxEnqueueDelay); + public void enqueue(final Item item) { + final BlockingQueue> queue = queues.get(item.affinityHash % queues.size()); + if (!queue.offer(item)) { + logger.warn("Failed to enqueue item. 
Dropping event."); } } @@ -103,12 +96,12 @@ public void stop() { running = false; executorService.shutdown(); executorService.awaitTermination(1, TimeUnit.HOURS); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Thread.currentThread().interrupt(); } } - private void scheduleQueueReader(final ExecutorService es, final BlockingQueue queue, final ItemProcessor processor) { + private void scheduleQueueReader(final ExecutorService es, final BlockingQueue> queue, final ItemProcessor processor) { CompletableFuture.runAsync(microBatchingQueueDrainerWithHeartBeat(queue, processor), es).whenComplete((voidValue, error) -> { processor.cleanup(); @@ -122,12 +115,12 @@ private void scheduleQueueReader(final ExecutorService es, final BlockingQueue queue, + final BlockingQueue> queue, final ItemProcessor processor) { return () -> { // The default item processor implementation removes items one-by-one as they // are processed. Using a Queue ensures that this is efficient. - final Queue batch = new ArrayDeque<>(MAX_BATCH_SIZE); + final Queue> batch = new ArrayDeque<>(MAX_BATCH_SIZE); while (!queue.isEmpty() || running) { ProcessingDirective directive; @@ -158,7 +151,7 @@ private Runnable microBatchingQueueDrainerWithHeartBeat( private static void sleepOneSecond() { try { Thread.sleep(1000); - } catch(InterruptedException e) { + } catch(final InterruptedException e) { Thread.currentThread().interrupt(); } } @@ -166,21 +159,12 @@ private static void sleepOneSecond() { private static E pollQuietly(final BlockingQueue queue, final long timeout, final TimeUnit unit) { try { return queue.poll(timeout, unit); - } catch (InterruptedException e) { + } catch (final InterruptedException e) { Thread.currentThread().interrupt(); return null; } } - private static boolean offerQuietly(final BlockingQueue queue, final E item, final long timeout, final TimeUnit unit) { - try { - return queue.offer(item, timeout, unit); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - return false; - } - } - private static ThreadFactory createThreadFactory(final ThreadGroup group, final String nameFormat) { return new ThreadFactoryBuilder() .setNameFormat(nameFormat) diff --git a/src/main/java/io/divolte/server/recordmapping/DslRecordMapper.java b/src/main/java/io/divolte/server/recordmapping/DslRecordMapper.java index 2bc417bc..a04d6811 100644 --- a/src/main/java/io/divolte/server/recordmapping/DslRecordMapper.java +++ b/src/main/java/io/divolte/server/recordmapping/DslRecordMapper.java @@ -55,10 +55,6 @@ public class DslRecordMapper implements RecordMapper { private final Schema schema; private final List actions; - public DslRecordMapper(final ValidatedConfiguration vc, final Schema schema, final Optional geoipService) { - this(vc, vc.configuration().tracking.schemaMapping.get().mappingScriptFile, schema, geoipService); - } - public DslRecordMapper(final ValidatedConfiguration vc, final String groovyFile, final Schema schema, final Optional geoipService) { this.schema = Objects.requireNonNull(schema); diff --git a/src/main/java/io/divolte/server/recordmapping/UserAgentParserAndCache.java b/src/main/java/io/divolte/server/recordmapping/UserAgentParserAndCache.java index eb97da26..0826b810 100644 --- a/src/main/java/io/divolte/server/recordmapping/UserAgentParserAndCache.java +++ b/src/main/java/io/divolte/server/recordmapping/UserAgentParserAndCache.java @@ -1,5 +1,6 @@ package io.divolte.server.recordmapping; +import io.divolte.server.config.UserAgentParserConfiguration; import 
io.divolte.server.config.ValidatedConfiguration; import java.util.Optional; @@ -23,8 +24,8 @@ public final class UserAgentParserAndCache { private final LoadingCache cache; public UserAgentParserAndCache(final ValidatedConfiguration vc) { - final UserAgentStringParser parser = parserBasedOnTypeConfig(vc.configuration().tracking.uaParser.type); - this.cache = sizeBoundCacheFromLoadingFunction(parser::parse, vc.configuration().tracking.uaParser.cacheSize); + final UserAgentStringParser parser = parserBasedOnTypeConfig(vc.configuration().global.mapper.userAgentParser.type); + this.cache = sizeBoundCacheFromLoadingFunction(parser::parse, vc.configuration().global.mapper.userAgentParser.cacheSize); logger.info("User agent parser data version: {}", parser.getDataVersion()); } @@ -37,15 +38,15 @@ public Optional tryParse(final String userAgentString) { } } - private static UserAgentStringParser parserBasedOnTypeConfig(String type) { + private static UserAgentStringParser parserBasedOnTypeConfig(UserAgentParserConfiguration.ParserType type) { switch (type) { - case "caching_and_updating": + case CACHING_AND_UPDATING: logger.info("Using caching and updating user agent parser."); return UADetectorServiceFactory.getCachingAndUpdatingParser(); - case "online_updating": + case ONLINE_UPDATING: logger.info("Using online updating user agent parser."); return UADetectorServiceFactory.getOnlineUpdatingParser(); - case "non_updating": + case NON_UPDATING: logger.info("Using non-updating (resource module based) user agent parser."); return UADetectorServiceFactory.getResourceModuleParser(); default: diff --git a/src/main/resources/reference.conf b/src/main/resources/reference.conf index 093515a6..7e40a31f 100644 --- a/src/main/resources/reference.conf +++ b/src/main/resources/reference.conf @@ -16,361 +16,133 @@ // This is the default configuration. divolte { - server { - // The host to which the server binds. - // Set to a specific IP address to selectively listen on that interface. - // If not present, a loopback-only address will be bound. - //host = 0.0.0.0 - // The bind host can be overridden using the DIVOLTE_HOST environment variable. - //host = ${?DIVOLTE_HOST} - - // The port on which the sever listens. - port = 8290 - // Server port can be overridden using the DIVOLTE_PORT environment variable. - port = ${?DIVOLTE_PORT} - - // Whether to use the X-Forwarded-For header HTTP header - // for determining the source IP of a request if present. - // When a X-Forwared-For header is present, the rightmost - // IP address of the value is used as source IP when - // when multiple IP addresses are separated by a comma. - // When the header is present more than once, the last - // value will be used. - // E.g. - // "X-Forwarded-For: 10.200.13.28, 11.45.82.30" ==> 11.45.82.30 - // - // "X-Forwarded-For: 10.200.13.28" - // "X-Forwarded-For: 11.45.82.30" ==> 11.45.82.30 - use_x_forwarded_for = false - - // When true Divolte Collector serves a static test page at /. - serve_static_resources = true - } - - // The tracking section controls the settings related to the tracking - // JavaScript. This script is compiled using the closure compiler - // (https://developers.google.com/closure/compiler/) on startup of the - // server. During compilation the values from the settings are substituted - // in the JavaScript and thus appear as hard-coded on the client side. 
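For context on the comment block above: the constants substituted into the tracking script are exactly the map built per browser source by createScriptConstants in the TrackingJavaScriptResource change earlier in this diff. Below is a rough illustration of that map populated with the default values quoted in this section, assuming seconds-resolution timeouts as done by trimLongToMaxInt; the helper class name is illustrative, and the optional COOKIE_DOMAIN entry is omitted because the default configuration leaves the cookie domain unset.

import com.google.common.collect.ImmutableMap;

// Hedged illustration of the constants that end up hard-coded into divolte.js.
final class ScriptConstantsSketch {
    static ImmutableMap<String, Object> defaults() {
        return ImmutableMap.<String, Object>builder()
                .put("PARTY_COOKIE_NAME", "_dvp")
                .put("PARTY_ID_TIMEOUT_SECONDS", 730 * 24 * 3600) // 730 days
                .put("SESSION_COOKIE_NAME", "_dvs")
                .put("SESSION_ID_TIMEOUT_SECONDS", 30 * 60)       // 30 minutes
                .put("LOGGING", false)
                .put("SCRIPT_NAME", "divolte.js")
                .put("AUTO_PAGE_VIEW_EVENT", true)
                .build();
    }
}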
- tracking { - // The name of the cookie used for setting a party ID - party_cookie = _dvp - // The expiry time for the party ID cookie - party_timeout = 730 days - - // The name of the cookie used tracking the session ID - session_cookie = _dvs - - // The expiry time for a session - session_timeout = 30 minutes - - // The cookie domain that is assigned to the cookies. - // When left empty, the cookie will have no domain - // explicitly associated with it, which effectively - // sets it to the website domain of the page that - // contains the Divolte Collector JavaScript. - // cookie_domain = '' - - - // This section controls the user agent parsing settings. The user agent - // parsing is based on this library (https://github.com/before/uadetector), - // which allows for dynamic reloading of the backing database if a internet - // connection is available. The parser type controls this behavior. - // Possible values are: - // - non_updating: Uses a local database, bundled - // with Divolte Collector. - // - online_updating: Uses a online database only, never falls back - // to the local database. - // - caching_and_updating: Uses a cached version of the online database - // and periodically checks for new version at the - // remote location. Updates are downloaded - // automatically and cached locally. - - ua_parser { - // The parser type. - type = non_updating - - // User agent parsing is a relatively expensive operation that requires - // many regular expression evaluations. Very often the same user agent - // will make consecutive requests and many clients will have the exact - // same user agent as well. It therefore makes sense to cache the - // parsing results in memory and do a lookup before trying a parse. - // This setting determines how many unique user agent strings will be - // cached. - cache_size = 1000 + global { + server { + // The host to which the server binds. + // Set to a specific IP address to selectively listen on that interface. + // If not present, a loopback-only address will be bound. + //host = 0.0.0.0 + // The bind host can be overridden using the DIVOLTE_HOST environment variable. + //host = ${?DIVOLTE_HOST} + + // The port on which the sever listens. + port = 8290 + // Server port can be overridden using the DIVOLTE_PORT environment variable. + port = ${?DIVOLTE_PORT} + + // Whether to use the X-Forwarded-For header HTTP header + // for determining the source IP of a request if present. + // When a X-Forwared-For header is present, the rightmost + // IP address of the value is used as source IP when + // when multiple IP addresses are separated by a comma. + // When the header is present more than once, the last + // value will be used. + // E.g. + // "X-Forwarded-For: 10.200.13.28, 11.45.82.30" ==> 11.45.82.30 + // + // "X-Forwarded-For: 10.200.13.28" + // "X-Forwarded-For: 11.45.82.30" ==> 11.45.82.30 + use_x_forwarded_for = false + + // When true Divolte Collector serves a static test page at /. + serve_static_resources = true } - // This configures the ip2geo database for geo lookups. A ip2geo database - // can be obtained from MaxMind (https://www.maxmind.com/en/geoip2-databases). - // Both a free version and a more accurate paid version are available. - // - // By default, no ip2geo database is configured. When this setting is - // absent, no attempt will be made to lookup geo-coordinates for IP - // addresses. If configured, Divolte Collector will keep a filesystem - // watch on the database file. 
If the file is changed on the filesystem - // the database will be reloaded at runtime without requireing a restart. - // ip2geo_database = /path/to/dabase/file.db - - // By default, Divolte Collector will use a built-in Avro schema for - // writing data and a default mapping, which is documented in the - // Mapping section of the user documentation. The default schema - // can be found here: https://github.com/divolte/divolte-schema - // - // Typically, users will configure their own schema, usually with - // fields specific to their domain and custom events and other - // mappings. When using a user defined schema, it is also - // required to provide a mapping script. See the user documentation - // for further reference. - - // schema_file = /Users/friso/code/divolte-examples/avro-schema/src/main/resources/JavadocEventRecord.avsc - // schema_mapping { - // The version of the mapping dialect to use. The current latest - // version is 2. Version 1 has been deprecated and removed from - // Divolte Collector since release 0.2 - // version = 2 - - // The groovy script file to use as mapping definition. - // mapping_script_file = "/Users/friso/code/divolte-examples/avro-schema/mapping.groovy" - // } - } - - // The javascript section controls settings related to the way - // the JavaScript file is compiled. - javascript { - // Name of the script file. This changes the divolte.js part in - // the script url: http://www.domain.tld/divolte.js - name = divolte.js - - // Enable or disable the logging on the JavaScript console in - // the browser - logging = false - - // When true, the served JavaScript will be compiled, but not - // minified, improving readability when debugging in the browser. - debug = false - - // When false, divolte.js will not automatically send a pageView - // event after being loaded. This way clients can send a initial - // event themselves and have full control over the event type and - // the custom parameters that are sent with the initial event. - auto_page_view_event = true - } - - // This section controls settings related to the processing of incoming - // requests after they have been responded to by the server. Incoming - // requests in Divolte Collector are initially handled by a pool of - // HTTP threads, which immediately respond with a HTTP code 200 and send - // the response payload (a 1x1 pixel transparent GIF image). After - // responding, the request data is passed onto the incoming request - // processing thread pool. This is the incoming request processor. - incoming_request_processor { - // Number of threads to use for processing incoming requests - threads = 2 - - // The maximum queue of incoming requests to keep - // before starting to drop incoming requests. Note - // that when this queue is full, requests are dropped - // and a warning is logged. No errors are reported to - // the client side. Divolte Collector will always respond - // with a HTTP 200 status code and the image payload. - // Note that the queue size is per thread. - max_write_queue = 100000 - - // The maximum delay to block before an incoming request - // is dropped in case of a full queue. - max_enqueue_delay = 1 second - - // The incoming request handler attempts to parse out all - // relevant information from the request as passed by the - // JavaScript. If the incoming request appears corrupt, - // for example because of a truncated URL or incorrect - // data in the fields, the request is flagged as corrupt. 
- // The detection of corrupt requests is enforced by appending - // a hash of all fields to the request from the JavaScript. - // This hash is validated on the server side. - // If this setting is true, events that are flagged as corrupt - // will be dropped from the stream, instead of processed further. - // It is common not to drop the corrupt events, but instead - // include them for later analysis. - discard_corrupted = false - - // Browsers and other clients (e.g. anti-virus software, proxies) - // will sometimes send the exact same request twice. Divolte - // Collector attempts to flag these duplicate events, by using - // a internal probabilistic data structure with a finite memory - // size. The memory consists internally of an array of 64 bit - // integers. This the memory required in bytes is the memory size - // times 8 (8 megabytes for 1 million entries). - // Note that the memory size is per thread. - duplicate_memory_size = 1000000 - - // If this setting is true, events that are flagged as duplicate - // will be dropped from the stream, instead of processed further. - // It is common not to drop the duplicate events, but instead - // include them for later analysis. - discard_duplicates = false - } - - // This section controls settings related to flushing the event stream - // to a Apache Kafka topic. - kafka_flusher { - // If true, flushing to Kafka is enabled. - enabled = false - - // Number of threads to use for flushing events to Kafka - threads = 2 - - // The maximum queue of incoming requests to keep - // before starting to drop incoming requests. Note - // that when this queue is full, requests are dropped - // and a warning is logged. No errors are reported to - // the client side. Divolte Collector will always respond - // with a HTTP 200 status code and the image payload. - // Note that the queue size is per thread. - max_write_queue = 200000 - - // The maximum delay to block before an incoming request - // is dropped in case of a full queue. - max_enqueue_delay = 1 second - - // The Kafka topic onto which events are published. - topic = "divolte" - // The topic can be overridden by setting the - // DIVOLTE_KAFKA_TOPIC environment variable. - topic = ${?DIVOLTE_KAFKA_TOPIC} - - // All settings in here are used as-is to configure - // the Kafka producer. - // See: http://kafka.apache.org/documentation.html#producerconfigs - producer = { - bootstrap.servers = ["localhost:9092"] - bootstrap.servers = ${?DIVOLTE_KAFKA_BROKER_LIST} - client.id = divolte.collector - client.id = ${?DIVOLTE_KAFKA_CLIENT_ID} - - acks = 1 - retries = 0 - compression.type = lz4 - max.in.flight.requests.per.connection = 1 + mapper { + // Size of the buffer used by each mapper to hold the incoming + // events that need to be mapped. This is rounded up to the + // nearest power of two. + buffer_size = 1048576 + + // The number of threads each configured mapper should use to + // process the events. + threads = 1 + + // The amount of memory that each mapper thread should use for + // detecting duplicate events. + duplicate_memory_size = 1000000 + + // This section controls the user agent parsing settings. The user agent + // parsing is based on this library (https://github.com/before/uadetector), + // which allows for dynamic reloading of the backing database if a internet + // connection is available. + user_agent_parser { + // The parser type. Possible values are: + // - non_updating: Uses a local database, bundled + // with Divolte Collector. 
+ // - online_updating: Uses a online database only, never falls back + // to the local database. + // - caching_and_updating: Uses a cached version of the online database + // and periodically checks for new version at the + // remote location. Updates are downloaded + // automatically and cached locally. + type = non_updating + + // User agent parsing is a relatively expensive operation that requires + // many regular expression evaluations. Very often the same user agent + // will make consecutive requests and many clients will have the exact + // same user agent as well. It therefore makes sense to cache the + // parsing results in memory and do a lookup before trying a parse. + // This setting determines how many unique user agent strings will be + // cached. + cache_size = 1000 + } } - } - - // This section controls settings related to flushing the event stream - // to HDFS. - hdfs_flusher { - // If true, flushing to HDFS is enabled. - enabled = true - - // Number of threads to use for flushing events to HDFS. - // Each thread creates its own files on HDFS. Depending - // on the flushing strategy, multiple concurrent files - // could be kept open per thread. - threads = 2 - - // The maximum queue of incoming requests to keep - // before starting to drop incoming requests. Note - // that when this queue is full, requests are dropped - // and a warning is logged. No errors are reported to - // the client side. Divolte Collector will always respond - // with a HTTP 200 status code and the image payload. - // Note that the queue size is per thread. - max_write_queue = 100000 - // The maximum delay to block before an incoming request - // is dropped in case of a full queue. - max_enqueue_delay = 1 second - - - // HDFS specific settings. Although it's possible to configure - // a HDFS URI here, it is more advisable to configure HDFS - // settings by specifying a HADOOP_CONF_DIR environment variable - // which will be added to the classpath on startup and as such - // configure the HDFS client automatically. hdfs { - // default nonexistant: Use HADOOP_CONF_DIR on the classpath. - // If not present empty config results in local filesystem being used. - // uri = "file:///" - // uri = ${?DIVOLTE_HDFS_URI} - - // The HDFS replication factor to use when creating - // files. - replication = 1 - - // The replication factor can be overridden by setting the - // DIVOLTE_HDFS_REPLICATION environment variable. - replication = ${?DIVOLTE_HDFS_REPLICATION} + // If true, flushing to HDFS is enabled. + enabled = true + + // Number of threads to use for flushing events to HDFS. + // Each thread creates its own files on HDFS. Depending + // on the flushing strategy, multiple concurrent files + // could be kept open per thread. + threads = 2 + + // The maximum queue of mapped events to buffer before + // starting to drop new ones. Note that when this buffer is full, + // events are dropped and a warning is logged. No errors are reported + // to the source of the events. A single buffer is shared between all + // threads, and its size will be rounded up to the nearest power of 2. + buffer_size = 1048576 + + // Arbitrary HDFS client properties. + // If absent, hdfs-site.xml from the classpath will be used. + //client {} } - // Divolte Collector has two strategies for creating files - // on HDFS and flushing data. By default, a simple rolling - // file strategy is employed. This opens one file per thread - // and rolls on to a new file after a configurable interval. 
- // Files that are being written to, have a extension of - // .avro.partial and are written the the directory configured - // in the working_dir setting. When a file is closed, it - // will be renamed to have a .avro extension and is moved to - // the directory configured in the publish_dir settins. This - // happens in a single (atomic) filesystem move operation. - file_strategy { - // File strategy type - type = SIMPLE_ROLLING_FILE - - // Roll over files on HDFS after this amount of time. - roll_every = 60 minutes - - // Issue a hsync against files each time this number of - // records has been flushed to it. - sync_file_after_records = 1000 - - // If no records are being flushed, issue a hsync when - // this amount of time passes, regardless of how much - // data was written. - sync_file_after_duration = 30 seconds - - // Directory where files are created and kept while being - // written to. - working_dir = /tmp - - // Directory where files are moved to, after they are closed. - publish_dir = /tmp + kafka { + // If true, flushing to Kafka is enabled. + enabled = false + + // Number of threads to use for flushing events to Kafka + threads = 2 + + // The maximum queue of mapped events to buffer before + // starting to drop new ones. Note that when this buffer is full, + // events are dropped and a warning is logged. No errors are reported + // to the source of the events. A single buffer is shared between all + // threads, and its size will be rounded up to the nearest power of 2. + buffer_size = 1048576 + + // All settings in here are used as-is to configure + // the Kafka producer. + // See: http://kafka.apache.org/082/documentation.html#newproducerconfigs + producer = { + bootstrap.servers = ["localhost:9092"] + bootstrap.servers = ${?DIVOLTE_KAFKA_BROKER_LIST} + client.id = divolte.collector + client.id = ${?DIVOLTE_KAFKA_CLIENT_ID} + + acks = 1 + retries = 0 + compression.type = lz4 + max.in.flight.requests.per.connection = 1 + } } - - // Next to the rolling file strategy, there is a more complex - // strategy called session binning file strategy. The general - // idea of this strategy is to provide a best effort to put - // events that belong to the same session in the same file. - // - // This strategy assigns event to files as such: - // - Each event is assigned to a round based on timestamp, - // defined as timestamp_in_millis / session_timeout_in_millis. - // - A file is opened for each round as time passes. - // - All events for a session are stored in the file with the - // round marked by the session start time. - // - A file for a round is kept open for at least three times the - // session duration *in absence of failures*. - // - During this entire process, the event timestamp is used for - // events that come off the queue as a logical clock signal. - // - Only in the case of an empty queue, the actual system - // time is used as clock signal. - // - When a file for a round is closed, but events that should be - // in that file still arrive, they are stored in the oldest open - // file. - // - This happens for exceptionally long sessions - // - // This strategy attempts to write events that belong to the same - // session to the same file. Do note that in case of failures, - // this guarantee not longer holds. For this reason, in failure - // scenario's or at shutdown, this strategy DOES NOT move files - // to the publish directory. Users have to setup a separate process - // to periodically move these files out of the way. 
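To make the publish step described above concrete: an in-flight file such as 20160101-xyz.avro.partial in the working directory is renamed, in a single atomic move, to 20160101-xyz.avro in the publish directory once it is closed. The sketch below shows that rename under two assumptions: the in-flight suffix is ".partial" (matching the ".avro.partial" naming above), and plain java.nio stands in for the HDFS client calls the collector itself uses; the class name is hypothetical.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

// Hedged sketch of the working-dir -> publish-dir move described above.
final class PublishSketch {
    private static final String INFLIGHT_EXTENSION = ".partial"; // assumed suffix

    static Path publish(final Path inflightFile, final Path publishDir) throws IOException {
        final String name = inflightFile.getFileName().toString();
        // Strip the ".partial" suffix: "events.avro.partial" -> "events.avro"
        final String published = name.substring(0, name.length() - INFLIGHT_EXTENSION.length());
        // One atomic filesystem move, so readers never see a half-written published file.
        return Files.move(inflightFile, publishDir.resolve(published),
                          StandardCopyOption.ATOMIC_MOVE);
    }

    public static void main(final String[] args) throws IOException {
        final Path working = Files.createTempDirectory("working");
        final Path publish = Files.createTempDirectory("publish");
        final Path inflight = Files.createFile(working.resolve("events.avro.partial"));
        System.out.println("Published: " + publish(inflight, publish));
    }
}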
- -// file_strategy { -// type = SESSION_BINNING -// sync_file_after_records = 1000 -// sync_file_after_duration = 30 seconds -// working_dir = /tmp -// publish_dir = /tmp -// } - } + + // Sources, sinks and mappings are provided only if the user hasn't + // specified anything. Due to the merging rules for configuration, + // defaults are not present here: this is handled in code. } diff --git a/src/test/java/io/divolte/server/DslRecordMapperTest.java b/src/test/java/io/divolte/server/DslRecordMapperTest.java index 74821531..61c09860 100644 --- a/src/test/java/io/divolte/server/DslRecordMapperTest.java +++ b/src/test/java/io/divolte/server/DslRecordMapperTest.java @@ -37,6 +37,9 @@ import javax.annotation.ParametersAreNonnullByDefault; +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import io.divolte.server.config.ValidatedConfiguration; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; @@ -52,12 +55,9 @@ import com.google.common.collect.ImmutableMap; import com.google.common.io.Resources; import com.maxmind.geoip2.model.CityResponse; -import com.typesafe.config.Config; -import com.typesafe.config.ConfigFactory; import io.divolte.server.ServerTestUtils.EventPayload; import io.divolte.server.ServerTestUtils.TestServer; -import io.divolte.server.config.ValidatedConfiguration; import io.divolte.server.ip2geo.LookupService; import io.divolte.server.ip2geo.LookupService.ClosedServiceException; import io.divolte.server.recordmapping.DslRecordMapper; @@ -361,12 +361,12 @@ public void shouldMapAllGeoIpFields() throws IOException, InterruptedException, copyResourceToFile("geo-mapping.groovy", geoMappingFile); final ImmutableMap mappingConfig = ImmutableMap.of( - "divolte.tracking.schema_mapping.mapping_script_file", geoMappingFile.getAbsolutePath(), - "divolte.tracking.schema_file", avroFile.getAbsolutePath() - ); + "divolte.mappings.test.mapping_script_file", geoMappingFile.getAbsolutePath(), + "divolte.mappings.test.schema_file", avroFile.getAbsolutePath() + ); final Config geoConfig = ConfigFactory.parseMap(mappingConfig) - .withFallback(ConfigFactory.parseResources("dsl-mapping-test.conf")) + .withFallback(ConfigFactory.parseResources("base-test-server.conf")) .withFallback(ConfigFactory.parseResources("reference-test.conf")); final ValidatedConfiguration vc = new ValidatedConfiguration(() -> geoConfig); @@ -378,6 +378,7 @@ public void shouldMapAllGeoIpFields() throws IOException, InterruptedException, final DslRecordMapper mapper = new DslRecordMapper( vc, + geoMappingFile.getAbsolutePath(), new Schema.Parser().parse(Resources.toString(Resources.getResource("TestRecord.avsc"), StandardCharsets.UTF_8)), Optional.of(mockLookupService)); @@ -545,12 +546,11 @@ private void setupServer(final String mapping) throws IOException { copyResourceToFile("TestRecord.avsc", avroFile); final ImmutableMap mappingConfig = ImmutableMap.of( - "divolte.tracking.schema_mapping.mapping_script_file", mappingFile.getAbsolutePath(), - "divolte.tracking.schema_file", avroFile.getAbsolutePath() + "divolte.mappings.test.mapping_script_file", mappingFile.getAbsolutePath(), + "divolte.mappings.test.schema_file", avroFile.getAbsolutePath() ); - server = new TestServer("dsl-mapping-test.conf", mappingConfig); - server.server.run(); + server = new TestServer("base-test-server.conf", mappingConfig); } private static void copyResourceToFile(final String resourceName, final File file) throws IOException { diff --git 
a/src/test/java/io/divolte/server/ProxyAdjacentPeerAddressHandlerTest.java b/src/test/java/io/divolte/server/ProxyAdjacentPeerAddressHandlerTest.java index 68e196ab..f4d9d461 100644 --- a/src/test/java/io/divolte/server/ProxyAdjacentPeerAddressHandlerTest.java +++ b/src/test/java/io/divolte/server/ProxyAdjacentPeerAddressHandlerTest.java @@ -105,7 +105,6 @@ public void shouldAllowMultipleXffHeaders() throws IOException, InterruptedExcep @Before public void setUp() { server = new TestServer("x-forwarded-for-test.conf"); - server.server.run(); } @After diff --git a/src/test/java/io/divolte/server/RequestChecksumTest.java b/src/test/java/io/divolte/server/RequestChecksumTest.java index c9ad7608..1158037c 100644 --- a/src/test/java/io/divolte/server/RequestChecksumTest.java +++ b/src/test/java/io/divolte/server/RequestChecksumTest.java @@ -16,23 +16,22 @@ package io.divolte.server; -import static org.junit.Assert.*; - -import java.io.IOException; -import java.net.HttpURLConnection; -import java.net.URL; - -import javax.annotation.Nullable; -import javax.annotation.ParametersAreNonnullByDefault; - +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableMap; +import io.divolte.server.ServerTestUtils.EventPayload; +import io.divolte.server.ServerTestUtils.TestServer; import org.junit.After; import org.junit.Before; import org.junit.Test; -import com.google.common.base.Preconditions; +import javax.annotation.Nullable; +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.util.Map; -import io.divolte.server.ServerTestUtils.EventPayload; -import io.divolte.server.ServerTestUtils.TestServer; +import static org.junit.Assert.*; @ParametersAreNonnullByDefault public class RequestChecksumTest { @@ -93,10 +92,12 @@ public class RequestChecksumTest { + "t=sentinelEvent&" + "x=-y99lem"; - private String serverConfigurationResourceName; + private boolean discardCorruptEvents; @Nullable private TestServer server; + @Nullable + private ImmutableMap serverProperties; @Test public void shouldFlagCorrectChecksumAsNotCorrupted() throws IOException, InterruptedException { @@ -144,7 +145,7 @@ public void shouldChecksumCorrectlyWithNonAsciiParameters() throws IOException, @Test public void shouldDiscardCorruptedEventsIfConfigured() throws InterruptedException, IOException { - serverConfigurationResourceName = "checksum-discard-corrupt-test.conf"; + discardCorruptEvents = true; request(URL_QUERY_CHECKSUM_BAD); request(URL_QUERY_SENTINEL); Preconditions.checkState(null != server); @@ -155,22 +156,23 @@ public void shouldDiscardCorruptedEventsIfConfigured() throws InterruptedExcepti } private void request(final String queryString) throws IOException { - setServerConf(serverConfigurationResourceName); + setServerConf(ImmutableMap.of("divolte.mappings.test.discard_corrupted", discardCorruptEvents)); Preconditions.checkState(null != server); final URL url = new URL(String.format(URL_STRING, server.port) + queryString); final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); assertEquals(200, conn.getResponseCode()); } - private void setServerConf(final String configurationResourceName) { - if (null == server || !configurationResourceName.equals(server.config.origin().resource())) { - setServer(new TestServer(configurationResourceName)); + private void setServerConf(final Map configurationProperties) { + if (null == server || 
!configurationProperties.equals(serverProperties)) { + serverProperties = ImmutableMap.copyOf(configurationProperties); + setServer(new TestServer("base-test-server.conf", serverProperties)); } } @Before public void setUp() { - serverConfigurationResourceName = "checksum-test.conf"; + discardCorruptEvents = false; } @After @@ -185,9 +187,6 @@ private void setServer(@Nullable final TestServer newServer) { oldServer.server.shutdown(); } this.server = newServer; - if (null != newServer) { - newServer.server.run(); - } } } } diff --git a/src/test/java/io/divolte/server/SeleniumJavaScriptTest.java b/src/test/java/io/divolte/server/SeleniumJavaScriptTest.java index 0d392e68..a470edcc 100644 --- a/src/test/java/io/divolte/server/SeleniumJavaScriptTest.java +++ b/src/test/java/io/divolte/server/SeleniumJavaScriptTest.java @@ -28,6 +28,7 @@ import javax.annotation.ParametersAreNonnullByDefault; +import io.divolte.server.config.BrowserSourceConfiguration; import org.junit.Before; import org.junit.Test; import org.openqa.selenium.By; @@ -232,13 +233,13 @@ public void shouldSetAppropriateCookies() throws RuntimeException, InterruptedEx driver.get(urlOf(BASIC)); server.waitForEvent(); - final Optional parsedPartyCookieOption = DivolteIdentifier.tryParse(driver.manage().getCookieNamed(server.config.getString("divolte.tracking.party_cookie")).getValue()); + final Optional parsedPartyCookieOption = DivolteIdentifier.tryParse(driver.manage().getCookieNamed(BrowserSourceConfiguration.DEFAULT_BROWSER_SOURCE_CONFIGURATION.partyCookie).getValue()); assertTrue(parsedPartyCookieOption.isPresent()); assertThat( parsedPartyCookieOption.get(), isA(DivolteIdentifier.class)); - final Optional parsedSessionCookieOption = DivolteIdentifier.tryParse(driver.manage().getCookieNamed(server.config.getString("divolte.tracking.session_cookie")).getValue()); + final Optional parsedSessionCookieOption = DivolteIdentifier.tryParse(driver.manage().getCookieNamed(BrowserSourceConfiguration.DEFAULT_BROWSER_SOURCE_CONFIGURATION.sessionCookie).getValue()); assertTrue(parsedSessionCookieOption.isPresent()); assertThat( parsedSessionCookieOption.get(), @@ -258,6 +259,6 @@ public void shouldPickupProvidedPageViewIdFromHash() throws RuntimeException, In @Before public void setup() throws Exception { - doSetUp("selenium-test-config.conf"); + doSetUp(); } } diff --git a/src/test/java/io/divolte/server/SeleniumTestBase.java b/src/test/java/io/divolte/server/SeleniumTestBase.java index b7dbfa4e..489a7023 100644 --- a/src/test/java/io/divolte/server/SeleniumTestBase.java +++ b/src/test/java/io/divolte/server/SeleniumTestBase.java @@ -127,6 +127,14 @@ protected String urlOf(final TEST_PAGES page) { } protected void doSetUp(final String configFileName) throws Exception { + doSetUp(Optional.of(configFileName)); + } + + protected void doSetUp() throws Exception { + doSetUp(Optional.empty()); + } + + protected void doSetUp(final Optional configFileName) throws Exception { final String driverName = System.getenv().getOrDefault(DRIVER_ENV_VAR, PHANTOMJS_DRIVER); switch (driverName) { @@ -145,8 +153,7 @@ protected void doSetUp(final String configFileName) throws Exception { break; } - server = new TestServer(configFileName); - server.server.run(); + server = configFileName.map(TestServer::new).orElseGet(TestServer::new); } private void setupBrowserStack() throws MalformedURLException { diff --git a/src/test/java/io/divolte/server/ServerPingTest.java b/src/test/java/io/divolte/server/ServerPingTest.java new file mode 100644 index 00000000..7b88c6e9 
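The test changes above replace the per-test .conf resources with small property maps layered over a shared base configuration. A sketch of that layering with Typesafe Config follows, using the divolte.mappings.test.discard_corrupted key and the base-test-server.conf / reference-test.conf resource names that appear in this diff; the wrapper class is illustrative and assumes those resources are available on the test classpath.

import com.google.common.collect.ImmutableMap;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

// Hedged sketch of the per-test configuration override pattern used in these tests.
final class TestConfigSketch {
    static Config testConfig(final boolean discardCorrupted) {
        // A few overriding properties, backed by the shared test resources.
        return ConfigFactory.parseMap(ImmutableMap.of(
                        "divolte.mappings.test.discard_corrupted", discardCorrupted))
                .withFallback(ConfigFactory.parseResources("base-test-server.conf"))
                .withFallback(ConfigFactory.parseResources("reference-test.conf"));
    }

    public static void main(final String[] args) {
        System.out.println(testConfig(true).getBoolean("divolte.mappings.test.discard_corrupted"));
    }
}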
--- /dev/null +++ b/src/test/java/io/divolte/server/ServerPingTest.java @@ -0,0 +1,64 @@ +/* + * Copyright 2014 GoDataDriven B.V. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.divolte.server; + +import com.google.common.io.ByteStreams; +import io.divolte.server.ServerTestUtils.TestServer; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.charset.StandardCharsets; +import java.util.Optional; + +import static org.junit.Assert.assertEquals; + +@ParametersAreNonnullByDefault +public class ServerPingTest { + + private Optional testServer = Optional.empty(); + + @Before + public void setup() { + testServer = Optional.of(new TestServer()); + } + + @Test + public void shouldRespondToPingWithPong() throws IOException { + final URL url = new URL(String.format("http://localhost:%d/ping", testServer.get().port)); + final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + try { + conn.setRequestMethod("GET"); + assertEquals(200, conn.getResponseCode()); + assertEquals("text/plain; charset=utf-8", conn.getContentType()); + final String body = new String(ByteStreams.toByteArray(conn.getInputStream()), StandardCharsets.UTF_8); + assertEquals("pong", body); + } finally { + conn.disconnect(); + } + } + + @After + public void tearDown() { + testServer.ifPresent(testServer -> testServer.server.shutdown()); + testServer = Optional.empty(); + } +} diff --git a/src/test/java/io/divolte/server/ServerSideCookieEventHandlerTest.java b/src/test/java/io/divolte/server/ServerSideCookieEventHandlerTest.java index af579dd8..ab7e294b 100644 --- a/src/test/java/io/divolte/server/ServerSideCookieEventHandlerTest.java +++ b/src/test/java/io/divolte/server/ServerSideCookieEventHandlerTest.java @@ -48,8 +48,7 @@ public void shouldRegisterServerSideCookieEvent() throws IOException, RuntimeExc @Before public void setUp() { - server = new TestServer("server-side-cookies-test.conf"); - server.server.run(); + server = new TestServer(); } @After diff --git a/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java b/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java new file mode 100644 index 00000000..1a30cf7f --- /dev/null +++ b/src/test/java/io/divolte/server/ServerSinkSourceConfigurationTest.java @@ -0,0 +1,409 @@ +/* + * Copyright 2015 GoDataDriven B.V. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.divolte.server; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import io.divolte.server.ServerTestUtils.TestServer; +import org.apache.avro.file.DataFileReader; +import org.apache.avro.file.FileReader; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.junit.After; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.ParametersAreNonnullByDefault; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; +import java.util.function.Supplier; +import java.util.stream.Stream; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +@ParametersAreNonnullByDefault +public class ServerSinkSourceConfigurationTest { + + private static final String BROWSER_EVENT_URL_TEMPLATE = + "http://localhost:%d%s/csc-event?" + + "p=0%%3Ai1t84hgy%%3A5AF359Zjq5kUy98u4wQjlIZzWGhN~GlG&" + + "s=0%%3Ai1t84hgy%%3A95CbiPCYln_1e0a6rFvuRkDkeNnc6KC8&" + + "v=0%%3A1fF6GFGjDOQiEx_OxnTm_tl4BH91eGLF&" + + "e=0%%3A1fF6GFGjDOQiEx_OxnTm_tl4BH91eGLF0&" + + "c=i1t8q2b6&" + + "n=f&" + + "f=f&" + + "l=http%%3A%%2F%%2Flocalhost%%3A8290%%2F&" + + "i=1ak&" + + "j=sj&" + + "k=2&" + + "w=uq&" + + "h=qd&" + + "t=pageView&" + + "x=si9804"; + + private final Set tempDirectories = new HashSet<>(); + private Optional testServer = Optional.empty(); + + private void startServer(final String configResource, + final ImmutableMap extraProperties) { + startServer(() -> new TestServer(configResource, extraProperties)); + } + + private void startServer(final String configResource) { + startServer(() -> new TestServer(configResource)); + } + + private void startServer() { + startServer(TestServer::new); + } + + private void startServer(final Supplier supplier) { + stopServer(); + testServer = Optional.of(supplier.get()); + } + + public void stopServer() { + testServer.ifPresent(testServer -> testServer.server.shutdown()); + testServer = Optional.empty(); + } + + public Path createTempDirectory() throws IOException { + final Path newTempDirectory = Files.createTempDirectory("divolte-test"); + tempDirectories.add(newTempDirectory); + return newTempDirectory; + } + + public void cleanupTempDirectories() { + tempDirectories.forEach(ServerSinkSourceConfigurationTest::deleteRecursively); + tempDirectories.clear(); + } + + private void request() throws IOException { + request(""); + } + + private void request(final String sourcePrefix) throws IOException { + request(sourcePrefix, 200); + } + + private void request(final String sourcePrefix, final int expectedResponseCode) throws IOException { + final URL url = new URL(String.format(BROWSER_EVENT_URL_TEMPLATE, testServer.get().port, sourcePrefix)); + final HttpURLConnection conn = (HttpURLConnection) url.openConnection(); + assertEquals(expectedResponseCode, conn.getResponseCode()); + } + + @ParametersAreNonnullByDefault + private static class AvroFileLocator { + private static final Logger logger = LoggerFactory.getLogger(AvroFileLocator.class); + + private final Path directory; + private final ImmutableSet existingFiles; + + private AvroFileLocator(final Path 
directory) throws IOException { + this.directory = Objects.requireNonNull(directory); + existingFiles = Files.list(directory) + .filter(AvroFileLocator::isAvroFile) + .collect(MoreCollectors.toImmutableSet()); + } + + private static boolean isAvroFile(final Path p) { + return p.toString().endsWith(".avro"); + } + + private static Stream listRecords(final Path avroFile) { + final GenericDatumReader datumReader = new GenericDatumReader<>(); + logger.debug("Reading records from new Avro file: {}", avroFile); + try (final FileReader fileReader = DataFileReader.openReader(avroFile.toFile(), datumReader)) { + final ImmutableList records = ImmutableList.copyOf(fileReader.iterator()); + logger.info("Read {} record(s) from new Avro file: {}", records.size(), avroFile); + return records.stream(); + } catch (final IOException e) { + throw new UncheckedIOException("Error reading records from file: " + avroFile, e); + } + } + + public Stream listNewRecords() throws IOException { + return Files.list(directory) + .filter(candidate -> isAvroFile(candidate) && !existingFiles.contains(candidate)) + .flatMap(AvroFileLocator::listRecords); + } + } + + @Test + public void shouldRegisterDefaultBrowserSource() throws IOException, InterruptedException { + // Test the default browser source that should be present by default. + startServer(); + request(); + testServer.get().waitForEvent(); + } + + @Test + public void shouldRegisterExplicitSourceOnly() throws IOException, InterruptedException { + // Test that if an explicit source is supplied, the builtin defaults are not present. + startServer("browser-source-explicit.conf"); + request("/a-prefix"); + testServer.get().waitForEvent(); + request("", 404); + } + + @Test + public void shouldSupportLongSourcePaths() throws IOException, InterruptedException { + // Test that the browser sources work with different types of path. + startServer("browser-source-long-prefix.conf"); + request("/a/multi/component/prefix"); + testServer.get().waitForEvent(); + } + + @Test + public void shouldSupportMultipleBrowserSources() throws IOException, InterruptedException { + // Test that multiple browser sources are supported. + startServer("browser-source-multiple.conf"); + request("/path1"); + request("/path2"); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + } + + @Test + public void shouldSupportUnusedSource() throws IOException { + // Test that an unused source is still reachable. + startServer("browser-source-unused.conf"); + request("/unused"); + } + + @Test + public void shouldSupportDefaultSourceMappingSink() throws IOException, InterruptedException { + // Test that with an out-of-the-box default configuration the default source, mapping and sink are present. + startServer(TestServer::createTestServerWithDefaultNonTestConfiguration); + final AvroFileLocator avroFileLocator = new AvroFileLocator(Paths.get("/tmp")); + request(); + testServer.get().waitForEvent(); + // Stopping the server flushes the HDFS files. + stopServer(); + // Now we can check the number of events that turned up in new files in /tmp. + assertEquals("Wrong number of new events logged to /tmp", + 1, avroFileLocator.listNewRecords().count()); + } + + @Test + public void shouldOnlyRegisterExplicitSourceMappingSink() throws IOException, InterruptedException { + // Test that if an explicit source-mapping-sink is supplied, the builtin defaults are not present. 
+ final AvroFileLocator defaultAvroFileLocator = new AvroFileLocator(Paths.get("/tmp")); + final Path avroDirectory = createTempDirectory(); + startServer("mapping-configuration-explicit.conf", ImmutableMap.of( + "divolte.sinks.test-hdfs-sink.file_strategy.working_dir", avroDirectory.toString(), + "divolte.sinks.test-hdfs-sink.file_strategy.publish_dir", avroDirectory.toString() + )); + final AvroFileLocator explicitAvroFileLocator = new AvroFileLocator(avroDirectory); + request(); + testServer.get().waitForEvent(); + // Stopping the server flushes any HDFS files. + stopServer(); + // Now we can check: + // - The default location (/tmp) shouldn't have anything new. + // - Our explicit location should have a single record. + assertFalse("Default location (/tmp) shouldn't have any new logged events.", + defaultAvroFileLocator.listNewRecords().findFirst().isPresent()); + assertEquals("Wrong number of new events logged", + 1, explicitAvroFileLocator.listNewRecords().count()); + } + + @Test + public void shouldSupportMultipleSinks() throws IOException, InterruptedException { + // Test that multiple hdfs sinks are supported for a single mapping. + final AvroFileLocator defaultAvroFileLocator = new AvroFileLocator(Paths.get("/tmp")); + final Path avroDirectory1 = createTempDirectory(); + final Path avroDirectory2 = createTempDirectory(); + startServer("hdfs-sink-multiple.conf", ImmutableMap.of( + "divolte.sinks.test-hdfs-sink-1.file_strategy.working_dir", avroDirectory1.toString(), + "divolte.sinks.test-hdfs-sink-1.file_strategy.publish_dir", avroDirectory1.toString(), + "divolte.sinks.test-hdfs-sink-2.file_strategy.working_dir", avroDirectory2.toString(), + "divolte.sinks.test-hdfs-sink-2.file_strategy.publish_dir", avroDirectory2.toString() + )); + final AvroFileLocator explicitAvroFileLocator1 = new AvroFileLocator(avroDirectory1); + final AvroFileLocator explicitAvroFileLocator2 = new AvroFileLocator(avroDirectory2); + request(); + testServer.get().waitForEvent(); + // Stopping the server flushes any HDFS files. + stopServer(); + // Now we can check: + // - The default location (/tmp) shouldn't have anything new. + // - Our locations should both have a single record. + assertFalse("Default location (/tmp) shouldn't have any new logged events.", + defaultAvroFileLocator.listNewRecords().findFirst().isPresent()); + assertEquals("Wrong number of new events logged in first location", + 1, explicitAvroFileLocator1.listNewRecords().count()); + assertEquals("Wrong number of new events logged in second location", + 1, explicitAvroFileLocator2.listNewRecords().count()); + } + + @Test + public void shouldSupportMultipleMappings() throws IOException, InterruptedException { + // Test that multiple independent mappings are supported. 
+ final Path avroDirectory1 = createTempDirectory(); + final Path avroDirectory2 = createTempDirectory(); + startServer("mapping-configuration-independent.conf", ImmutableMap.of( + "divolte.sinks.sink-1.file_strategy.working_dir", avroDirectory1.toString(), + "divolte.sinks.sink-1.file_strategy.publish_dir", avroDirectory1.toString(), + "divolte.sinks.sink-2.file_strategy.working_dir", avroDirectory2.toString(), + "divolte.sinks.sink-2.file_strategy.publish_dir", avroDirectory2.toString() + )); + final AvroFileLocator explicitAvroFileLocator1 = new AvroFileLocator(avroDirectory1); + final AvroFileLocator explicitAvroFileLocator2 = new AvroFileLocator(avroDirectory2); + request("/source-1"); + request("/source-2"); + request("/source-2"); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + // Stopping the server flushes any HDFS files. + stopServer(); + // Now we can check: + // - One source should have a single event. + // - The other should have two events. + assertEquals("Wrong number of new events logged in first location", + 1, explicitAvroFileLocator1.listNewRecords().count()); + assertEquals("Wrong number of new events logged in second location", + 2, explicitAvroFileLocator2.listNewRecords().count()); + } + + @Test + public void shouldSupportMultipleMappingsPerSource() throws IOException, InterruptedException { + // Test that a single source can send events to multiple mappings. + final Path avroDirectory1 = createTempDirectory(); + final Path avroDirectory2 = createTempDirectory(); + startServer("mapping-configuration-shared-source.conf", ImmutableMap.of( + "divolte.sinks.sink-1.file_strategy.working_dir", avroDirectory1.toString(), + "divolte.sinks.sink-1.file_strategy.publish_dir", avroDirectory1.toString(), + "divolte.sinks.sink-2.file_strategy.working_dir", avroDirectory2.toString(), + "divolte.sinks.sink-2.file_strategy.publish_dir", avroDirectory2.toString() + )); + final AvroFileLocator explicitAvroFileLocator1 = new AvroFileLocator(avroDirectory1); + final AvroFileLocator explicitAvroFileLocator2 = new AvroFileLocator(avroDirectory2); + request(); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + // Stopping the server flushes any HDFS files. + stopServer(); + // Now we can check: + // - Both sinks should have a single event. + assertEquals("Wrong number of new events logged in first location", + 1, explicitAvroFileLocator1.listNewRecords().count()); + assertEquals("Wrong number of new events logged in second location", + 1, explicitAvroFileLocator2.listNewRecords().count()); + } + + @Test + public void shouldSupportMultipleMappingsPerSink() throws IOException, InterruptedException { + // Test that multiple mappings can send events to the same sink. + final Path avroDirectory = createTempDirectory(); + startServer("mapping-configuration-shared-sink.conf", ImmutableMap.of( + "divolte.sinks.only-sink.file_strategy.working_dir", avroDirectory.toString(), + "divolte.sinks.only-sink.file_strategy.publish_dir", avroDirectory.toString() + )); + final AvroFileLocator explicitAvroFileLocator = new AvroFileLocator(avroDirectory); + request("/source-1"); + request("/source-2"); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + // Stopping the server flushes any HDFS files. + stopServer(); + // Now we can check: + // - The single location should have received both events.
+ assertEquals("Wrong number of new events logged", + 2, explicitAvroFileLocator.listNewRecords().count()); + } + + @Test + public void shouldSupportComplexSourceMappingSinkConfigurations() throws IOException, InterruptedException { + // Test that a complex source-mapping-sink configuration is possible. + // (This includes combinations of shared and non-shared sources and sinks.) + // Test that a single source can send events to multiple mappings. + final Path avroDirectory1 = createTempDirectory(); + final Path avroDirectory2 = createTempDirectory(); + final Path avroDirectory3 = createTempDirectory(); + final Path avroDirectory4 = createTempDirectory(); + startServer("mapping-configuration-interdependent.conf", new ImmutableMap.Builder() + .put("divolte.sinks.sink-1.file_strategy.working_dir", avroDirectory1.toString()) + .put("divolte.sinks.sink-1.file_strategy.publish_dir", avroDirectory1.toString()) + .put("divolte.sinks.sink-2.file_strategy.working_dir", avroDirectory2.toString()) + .put("divolte.sinks.sink-2.file_strategy.publish_dir", avroDirectory2.toString()) + .put("divolte.sinks.sink-3.file_strategy.working_dir", avroDirectory3.toString()) + .put("divolte.sinks.sink-3.file_strategy.publish_dir", avroDirectory3.toString()) + .put("divolte.sinks.sink-4.file_strategy.working_dir", avroDirectory4.toString()) + .put("divolte.sinks.sink-4.file_strategy.publish_dir", avroDirectory4.toString()) + .build() + ); + final AvroFileLocator explicitAvroFileLocator1 = new AvroFileLocator(avroDirectory1); + final AvroFileLocator explicitAvroFileLocator2 = new AvroFileLocator(avroDirectory2); + final AvroFileLocator explicitAvroFileLocator3 = new AvroFileLocator(avroDirectory3); + final AvroFileLocator explicitAvroFileLocator4 = new AvroFileLocator(avroDirectory4); + request("/source-1"); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + request("/source-2"); + testServer.get().waitForEvent(); + testServer.get().waitForEvent(); + request("/source-3"); + testServer.get().waitForEvent(); + request("/source-4"); + testServer.get().waitForEvent(); + // Stopping the server flushes any HDFS files. + stopServer(); + // Now we can check: + // - Each sink should have a specific number of events in it. 
+ assertEquals("Wrong number of new events logged in first location", + 2, explicitAvroFileLocator1.listNewRecords().count()); + assertEquals("Wrong number of new events logged in second location", + 2, explicitAvroFileLocator2.listNewRecords().count()); + assertEquals("Wrong number of new events logged in third location", + 5, explicitAvroFileLocator3.listNewRecords().count()); + assertEquals("Wrong number of new events logged in fourth location", + 2, explicitAvroFileLocator4.listNewRecords().count()); + } + + @After + public void tearDown() throws IOException { + stopServer(); + cleanupTempDirectories(); + } + + private static void deleteRecursively(final Path p) { + try (final Stream files = Files.walk(p).sorted(Comparator.reverseOrder())) { + files.forEachOrdered(path -> { + try { + Files.delete(path); + } catch (final IOException e) { + throw new UncheckedIOException("Error deleting file: " + path, e); + } + }); + } catch (final IOException e) { + throw new UncheckedIOException("Error recursively deleting directory: " + p, e); + } + } +} diff --git a/src/test/java/io/divolte/server/ServerTestUtils.java b/src/test/java/io/divolte/server/ServerTestUtils.java index 944316e5..c476c803 100644 --- a/src/test/java/io/divolte/server/ServerTestUtils.java +++ b/src/test/java/io/divolte/server/ServerTestUtils.java @@ -16,6 +16,14 @@ package io.divolte.server; +import com.google.common.base.Preconditions; +import com.typesafe.config.Config; +import com.typesafe.config.ConfigFactory; +import com.typesafe.config.ConfigValueFactory; +import io.divolte.server.config.ValidatedConfiguration; +import org.apache.avro.generic.GenericRecord; + +import javax.annotation.ParametersAreNonnullByDefault; import java.io.IOException; import java.net.ServerSocket; import java.util.Map; @@ -25,16 +33,6 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.TimeUnit; -import javax.annotation.ParametersAreNonnullByDefault; - -import org.apache.avro.generic.GenericRecord; - -import com.typesafe.config.Config; -import com.typesafe.config.ConfigFactory; -import com.typesafe.config.ConfigValueFactory; - -import io.divolte.server.config.ValidatedConfiguration; - public final class ServerTestUtils { /* * Theoretically, this is prone to race conditions, @@ -72,30 +70,37 @@ public static final class TestServer { final Server server; final BlockingQueue events; + public TestServer() { + this(findFreePort(), ConfigFactory.parseResources("reference-test.conf")); + } + public TestServer(final String configResource) { - this( - findFreePort(), - ConfigFactory.parseResources(configResource) - .withFallback(ConfigFactory.parseResources("reference-test.conf")) - ); + this(findFreePort(), + ConfigFactory.parseResources(configResource) + .withFallback(ConfigFactory.parseResources("reference-test.conf"))); } public TestServer(final String configResource, final Map extraConfig) { - this( - findFreePort(), - ConfigFactory.parseMap(extraConfig) - .withFallback(ConfigFactory.parseResources(configResource)) - .withFallback(ConfigFactory.parseResources("reference-test.conf")) - ); + this(findFreePort(), + ConfigFactory.parseMap(extraConfig, "Test-specific overrides") + .withFallback(ConfigFactory.parseResources(configResource)) + .withFallback(ConfigFactory.parseResources("reference-test.conf"))); } private TestServer(final int port, final Config config) { this.port = port; - this.config = config.withValue("divolte.server.port", ConfigValueFactory.fromAnyRef(port)); + this.config = 
config.withValue("divolte.global.server.port", ConfigValueFactory.fromAnyRef(port)); events = new ArrayBlockingQueue<>(100); final ValidatedConfiguration vc = new ValidatedConfiguration(() -> this.config); + Preconditions.checkArgument(vc.isValid(), + "Invalid test server configuration: %s", vc.errors()); server = new Server(vc, (event, buffer, record) -> events.add(new EventPayload(event, buffer, record))); + server.run(); + } + + static TestServer createTestServerWithDefaultNonTestConfiguration() { + return new TestServer(findFreePort(), ConfigFactory.defaultReference()); } public EventPayload waitForEvent() throws InterruptedException { diff --git a/src/test/java/io/divolte/server/ShortTermDuplicateMemoryTest.java b/src/test/java/io/divolte/server/ShortTermDuplicateMemoryTest.java index db141d5d..bd5cdfb2 100644 --- a/src/test/java/io/divolte/server/ShortTermDuplicateMemoryTest.java +++ b/src/test/java/io/divolte/server/ShortTermDuplicateMemoryTest.java @@ -138,7 +138,6 @@ private void request(final int which) throws IOException { @Before public void setUp() { server = new TestServer("duplicates-test.conf"); - server.server.run(); } @After diff --git a/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java b/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java index c031f875..b65d9065 100644 --- a/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java +++ b/src/test/java/io/divolte/server/config/ValidatedConfigurationTest.java @@ -2,6 +2,7 @@ import static org.junit.Assert.*; +import com.google.common.collect.ImmutableMap; import org.junit.Test; import com.typesafe.config.Config; @@ -19,14 +20,17 @@ public void shouldNotThrowExceptionsOnInvalidConfiguration() { @Test public void shouldValidateJavaScriptName() { - final Config config = - ConfigFactory.parseString( - "divolte.javascript.name = 404.exe\n") - .withFallback(ConfigFactory.parseResources("reference-test.conf")); + final String propertyName = "divolte.sources.browser.javascript.name"; + final String invalidValue = "404.exe"; + final Config config = ConfigFactory.parseMap(ImmutableMap.of(propertyName, invalidValue)) + .withFallback(ConfigFactory.parseResources("base-test-server.conf")) + .withFallback(ConfigFactory.parseResources("reference-test.conf")); final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); assertFalse(vc.errors().isEmpty()); - assertEquals("Property 'divolte.javascript.name' must match \"^[A-Za-z0-9_-]+\\.js$\". Found: '404.exe'.", vc.errors().get(0)); + final String reportedPropertyName = propertyName.replace(".sources.browser.", ".sources[browser]."); + assertEquals("Property '" + reportedPropertyName + "' must match \"^[A-Za-z0-9_-]+\\.js$\". Found: '" + invalidValue + "'.", + vc.errors().get(0)); } @Test(expected = IllegalStateException.class) @@ -52,4 +56,46 @@ public void shouldMapReferenceConfig() { final ValidatedConfiguration vc = new ValidatedConfiguration(ConfigFactory::load); assertTrue(vc.errors().isEmpty()); } + + @Test + public void shouldReportMissingSourcesAndSinks() { + final ValidatedConfiguration vc = new ValidatedConfiguration(() -> ConfigFactory.parseResources("missing-sources-sinks.conf")); + + assertFalse(vc.isValid()); + assertEquals(1, vc.errors().size()); + assertTrue( + vc.errors() + .get(0) + .startsWith("Property 'divolte.' 
The following sources and/or sinks were used in a mapping but never defined: [missing-sink, missing-source]..")); + } + + @Test + public void sourceAndSinkNamesCannotCollide() { + final ValidatedConfiguration vc = new ValidatedConfiguration(() -> ConfigFactory.parseResources("source-sink-collisions.conf")); + + assertFalse(vc.isValid()); + assertEquals(1, vc.errors().size()); + assertTrue( + vc.errors() + .get(0) + .startsWith("Property 'divolte.' Source and sink names cannot collide (must be globally unique). The following names were both used as source and as sink: [foo, bar]..")); + } + + @Test + public void sharedSinksAllowedWithSameSchema() { + final ValidatedConfiguration vc = new ValidatedConfiguration(() -> ConfigFactory.parseResources("multiple-mappings-same-schema-shared-sink.conf")); + assertTrue(vc.isValid()); + } + + @Test + public void sharedSinksCannotHaveDifferentSchemas() { + final ValidatedConfiguration vc = new ValidatedConfiguration(() -> ConfigFactory.parseResources("multiple-mappings-different-schema-shared-sink.conf")); + + assertFalse(vc.isValid()); + assertEquals(1, vc.errors().size()); + assertTrue( + vc.errors() + .get(0) + .startsWith("Property 'divolte.' Any sink can only use one schema. The following sinks have multiple mappings with different schema's linked to them: [kafka]..")); + } } diff --git a/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java b/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java index 546235d2..820f4de9 100644 --- a/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java +++ b/src/test/java/io/divolte/server/hdfs/HdfsFlusherTest.java @@ -17,9 +17,6 @@ package io.divolte.server.hdfs; import static org.junit.Assert.*; -import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.DivolteIdentifier; -import io.divolte.server.config.ValidatedConfiguration; import java.io.File; import java.io.IOException; @@ -31,6 +28,8 @@ import java.util.stream.LongStream; import java.util.stream.StreamSupport; +import javax.annotation.ParametersAreNonnullByDefault; + import org.apache.avro.Schema; import org.apache.avro.file.DataFileReader; import org.apache.avro.generic.GenericData.Record; @@ -44,89 +43,74 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.collect.ImmutableMap; import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; +import io.divolte.server.AvroRecordBuffer; +import io.divolte.server.DivolteIdentifier; +import io.divolte.server.config.ValidatedConfiguration; +import io.divolte.server.processing.Item; + +@ParametersAreNonnullByDefault public class HdfsFlusherTest { private static final Logger logger = LoggerFactory.getLogger(HdfsFlusherTest.class); @SuppressWarnings("PMD.AvoidUsingHardCodedIP") private static final String ARBITRARY_IP = "8.8.8.8"; + private Schema schema; private Path tempInflightDir; private Path tempPublishDir; + private List records; + private HdfsFlusher flusher; + @Before - public void setupTempDir() throws IOException { + public void setup() throws IOException { + schema = schemaFromClassPath("/MinimalRecord.avsc"); tempInflightDir = Files.createTempDirectory("hdfs-flusher-test-inflight"); tempPublishDir = Files.createTempDirectory("hdfs-flusher-test-publish"); } @After - public void cleanupTempDir() throws IOException { + public void teardown() throws IOException { + schema = null; + Files.walk(tempInflightDir) .filter((p) -> !p.equals(tempInflightDir)) .forEach(this::deleteQuietly); deleteQuietly(tempInflightDir); + tempInflightDir = 
null; + Files.walk(tempPublishDir) .filter((p) -> !p.equals(tempPublishDir)) .forEach(this::deleteQuietly); deleteQuietly(tempPublishDir); + tempPublishDir = null; + + flusher = null; + records = null; + flusher = null; } @Test public void shouldCreateAndPopulateFileWithSimpleStrategy() throws IOException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SIMPLE_ROLLING_FILE\n" - + "divolte.hdfs_flusher.file_strategy.roll_every = 1 day\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - final List records = LongStream.range(0, 10) - .mapToObj((time) -> new GenericRecordBuilder(schema) - .set("ts", time) - .set("remoteHost", ARBITRARY_IP) - .build()) - .collect(Collectors.toList()); - - records.forEach((record) -> flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.generate(), System.currentTimeMillis(), 0, record))); + setupFlusher("1 day", 10); + processRecords(); flusher.cleanup(); Files.walk(tempPublishDir) - .filter((p) -> p.toString().endsWith(".avro")) - .findFirst() - .ifPresent((p) -> verifyAvroFile(records, schema, p)); + .filter((p) -> p.toString().endsWith(".avro")) + .findFirst() + .ifPresent((p) -> verifyAvroFile(records, schema, p)); } @Test public void shouldWriteInProgressFilesWithNonAvroExtension() throws IOException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SIMPLE_ROLLING_FILE\n" - + "divolte.hdfs_flusher.file_strategy.roll_every = 1 day\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - final List records = LongStream.range(0, 10) - .mapToObj((time) -> new GenericRecordBuilder(schema) - .set("ts", time) - .set("remoteHost", ARBITRARY_IP) - .build()) - .collect(Collectors.toList()); - - records.forEach((record) -> flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.generate(), System.currentTimeMillis(), 0, record))); + setupFlusher("1 day", 10); + processRecords(); assertTrue(Files.walk(tempInflightDir) .filter((p) -> p.toString().endsWith(".avro.partial")) @@ -136,92 +120,86 @@ public void shouldWriteInProgressFilesWithNonAvroExtension() throws IOException @Test public void shouldRollFilesWithSimpleStrategy() throws IOException, InterruptedException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SIMPLE_ROLLING_FILE\n" - + "divolte.hdfs_flusher.file_strategy.roll_every = 1 second\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + 
tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final List records = LongStream.range(0, 5) - .mapToObj((time) -> new GenericRecordBuilder(schema) - .set("ts", time) - .set("remoteHost", ARBITRARY_IP) - .build()) - .collect(Collectors.toList()); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - records.forEach((record) -> flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.generate(), System.currentTimeMillis(), 0, record))); + setupFlusher("1 second", 5); + processRecords(); for (int c = 0; c < 2; c++) { Thread.sleep(500); flusher.heartbeat(); } - records.forEach((record) -> flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.generate(), System.currentTimeMillis(), 0, record))); + processRecords(); flusher.cleanup(); final MutableInt count = new MutableInt(0); Files.walk(tempPublishDir) - .filter((p) -> p.toString().endsWith(".avro")) - .forEach((p) -> { - verifyAvroFile(records, schema, p); - count.increment(); - }); + .filter((p) -> p.toString().endsWith(".avro")) + .forEach((p) -> { + verifyAvroFile(records, schema, p); + count.increment(); + }); assertEquals(2, count.intValue()); } @Test public void shouldNotCreateEmptyFiles() throws IOException, InterruptedException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SIMPLE_ROLLING_FILE\n" - + "divolte.hdfs_flusher.file_strategy.roll_every = 100 millisecond\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final List records = LongStream.range(0, 5) - .mapToObj((time) -> new GenericRecordBuilder(schema) - .set("ts", time) - .set("remoteHost", ARBITRARY_IP) - .build()) - .collect(Collectors.toList()); + setupFlusher("100 millisecond", 5); - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - records.forEach((record) -> flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.generate(), System.currentTimeMillis(), 0, record))); + processRecords(); for (int c = 0; c < 4; c++) { Thread.sleep(500); flusher.heartbeat(); } - records.forEach((record) -> flusher.process(AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.generate(), System.currentTimeMillis(), 0, record))); + processRecords(); flusher.cleanup(); final MutableInt count = new MutableInt(0); Files.walk(tempPublishDir) - .filter((p) -> p.toString().endsWith(".avro")) - .forEach((p) -> { - verifyAvroFile(records, schema, p); - count.increment(); - }); - + .filter((p) -> p.toString().endsWith(".avro")) + .forEach((p) -> { + verifyAvroFile(records, schema, p); + count.increment(); + }); assertEquals(2, count.intValue()); } + private void setupFlusher(final String rollEvery, final int recordCount) throws IOException { + final Config config = ConfigFactory + .parseMap(ImmutableMap.of( + "divolte.sinks.hdfs.file_strategy.roll_every", rollEvery, + "divolte.sinks.hdfs.file_strategy.working_dir", tempInflightDir.toString(), + "divolte.sinks.hdfs.file_strategy.publish_dir", 
tempPublishDir.toString())) + .withFallback(ConfigFactory.parseResources("hdfs-flusher-test.conf")) + .withFallback(ConfigFactory.parseResources("reference-test.conf")); + final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); + + records = LongStream.range(0, recordCount) + .mapToObj((time) -> + new GenericRecordBuilder(schema) + .set("ts", time) + .set("remoteHost", ARBITRARY_IP) + .build()) + .collect(Collectors.toList()); + + flusher = new HdfsFlusher(vc, "hdfs", schema); + } + + private void processRecords() { + records.stream().map( + (record) -> AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), + DivolteIdentifier.generate(), + System.currentTimeMillis(), + 0, + record)) + .forEach((arb) -> flusher.process(Item.of(0, arb.getPartyId().value, arb))); + } - private void deleteQuietly(Path p) { + private void deleteQuietly(final Path p) { try { Files.delete(p); } catch (final Exception e) { @@ -229,7 +207,7 @@ private void deleteQuietly(Path p) { } } - private void verifyAvroFile(List expected, Schema schema, Path avroFile) { + private void verifyAvroFile(final List expected, final Schema schema, final Path avroFile) { final List result = StreamSupport .stream(readAvroFile(schema, avroFile.toFile()).spliterator(), false) @@ -237,11 +215,11 @@ private void verifyAvroFile(List expected, Schema schema, Path avroFile) assertEquals(expected, result); } - private DataFileReader readAvroFile(Schema schema, File file) { + private DataFileReader readAvroFile(final Schema schema, final File file) { final DatumReader dr = new GenericDatumReader<>(schema); try { return new DataFileReader<>(file, dr); - } catch (IOException e) { + } catch (final IOException e) { throw new RuntimeException(e); } } diff --git a/src/test/java/io/divolte/server/hdfs/SessionBinningFileStrategyTest.java b/src/test/java/io/divolte/server/hdfs/SessionBinningFileStrategyTest.java deleted file mode 100644 index ab40730c..00000000 --- a/src/test/java/io/divolte/server/hdfs/SessionBinningFileStrategyTest.java +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Copyright 2014 GoDataDriven B.V. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.divolte.server.hdfs; - -import static org.junit.Assert.*; - -import com.google.common.collect.ImmutableList; -import io.divolte.server.AvroRecordBuffer; -import io.divolte.server.DivolteIdentifier; -import io.divolte.server.config.ValidatedConfiguration; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.LongStream; -import java.util.stream.StreamSupport; - -import org.apache.avro.Schema; -import org.apache.avro.file.DataFileReader; -import org.apache.avro.generic.GenericData.Record; -import org.apache.avro.generic.GenericDatumReader; -import org.apache.avro.generic.GenericRecordBuilder; -import org.apache.avro.io.DatumReader; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.typesafe.config.Config; -import com.typesafe.config.ConfigFactory; - -public class SessionBinningFileStrategyTest { - private static final Logger logger = LoggerFactory.getLogger(SessionBinningFileStrategyTest.class); - - @SuppressWarnings("PMD.AvoidUsingHardCodedIP") - private static final String ARBITRARY_IP = "8.8.8.8"; - - private Path tempInflightDir; - private Path tempPublishDir; - - @Before - public void setupTempDir() throws IOException { - tempInflightDir = Files.createTempDirectory("hdfs-flusher-test-inflight"); - tempPublishDir = Files.createTempDirectory("hdfs-flusher-test-publish"); - } - - @After - public void cleanupTempDir() throws IOException { - Files.walk(tempInflightDir) - .filter((p) -> !p.equals(tempInflightDir)) - .forEach(this::deleteQuietly); - deleteQuietly(tempInflightDir); - Files.walk(tempPublishDir) - .filter((p) -> !p.equals(tempPublishDir)) - .forEach(this::deleteQuietly); - deleteQuietly(tempPublishDir); - } - - @Test - public void shouldCreateFilePerRound() throws IOException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SESSION_BINNING\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-binning-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - final List records = LongStream.range(0, 5) - .mapToObj((time) -> new GenericRecordBuilder(schema) - .set("ts", time * 1000 + 100) - .set("remoteHost", ARBITRARY_IP) - .build()) - .collect(Collectors.toList()); - - records.forEach( - (record) -> flusher.process( - AvroRecordBuffer.fromRecord( - DivolteIdentifier.generate((Long) record.get("ts")), - DivolteIdentifier.generate((Long) record.get("ts")), - (Long) record.get("ts"), - 0, - record))); - - flusher.cleanup(); - - final List inflightFiles = Files.walk(tempInflightDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro.partial")) - .collect(Collectors.toList()); - final List publishedFiles = Files.walk(tempPublishDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> 
p.toString().endsWith(".avro")) - .collect(Collectors.toList()); - - /* - * We created 5 events, each in a different round. On each sync event, we evaluate - * which open files can be closed because their 3-session span has elapsed. So: - * a) On the 4th event, the 1st span is completed. - * b) On the 5th event, the 2nd span is completed. - * c) The last 3 spans remain in-flight. - */ - assertEquals(3, inflightFiles.size()); - assertEquals(2 ,publishedFiles.size()); - verifyAvroFile(ImmutableList.of(records.get(0)), schema, publishedFiles.get(0)); - verifyAvroFile(ImmutableList.of(records.get(1)), schema, publishedFiles.get(1)); - verifyAvroFile(ImmutableList.of(records.get(2)), schema, inflightFiles.get(0)); - verifyAvroFile(ImmutableList.of(records.get(3)), schema, inflightFiles.get(1)); - verifyAvroFile(ImmutableList.of(records.get(4)), schema, inflightFiles.get(2)); - } - - @Test - public void eventsShouldStickWithSessionStartTimeRound() throws IOException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SESSION_BINNING\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-binning-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - final List records = LongStream.range(0, 2) - .mapToObj((time) -> new GenericRecordBuilder(schema) - .set("ts", time * 1000 + 100) - .set("remoteHost", ARBITRARY_IP) - .build()) - .collect(Collectors.toList()); - - records.forEach( - (record) -> flusher.process( - AvroRecordBuffer.fromRecord( - DivolteIdentifier.generate((Long) record.get("ts")), - DivolteIdentifier.generate((Long) record.get("ts")), - (Long) record.get("ts"), - 0, - record))); - - records.forEach( - (record) -> flusher.process( - AvroRecordBuffer.fromRecord( - DivolteIdentifier.generate((Long) record.get("ts")), - DivolteIdentifier.generate((Long) record.get("ts")), - (Long) record.get("ts"), - 0, - record))); - - flusher.cleanup(); - - final List avroFiles = Files.walk(tempInflightDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro.partial")) - .collect(Collectors.toList()); - - assertEquals(2, avroFiles.size()); - verifyAvroFile(Arrays.asList(records.get(0), records.get(0)), schema, avroFiles.get(0)); - verifyAvroFile(Arrays.asList(records.get(1), records.get(1)), schema, avroFiles.get(1)); - } - - @Test - public void eventsShouldMoveToNextRoundFileIfSessionStartTimeRoundFileIsNoLongerOpen() throws IOException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SESSION_BINNING\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-binning-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - final List records = Arrays.asList( - new 
GenericRecordBuilder(schema).set("ts", 100L).set("session", DivolteIdentifier.generate(100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 1100L).set("session", DivolteIdentifier.generate(1100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 2100L).set("session", DivolteIdentifier.generate(2100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 3100L).set("session", DivolteIdentifier.generate(3100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 3150L).set("session", DivolteIdentifier.generate(100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 3160L).set("session", DivolteIdentifier.generate(1100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 3170L).set("session", DivolteIdentifier.generate(2100).value).set("remoteHost", ARBITRARY_IP).build(), - new GenericRecordBuilder(schema).set("ts", 3180L).set("session", DivolteIdentifier.generate(3100).value).set("remoteHost", ARBITRARY_IP).build() - ); - - final List buffers = records - .stream() - .map((r) -> AvroRecordBuffer.fromRecord(DivolteIdentifier.generate(), DivolteIdentifier.tryParse((String) r.get("session")).get(), (Long) r.get("ts"), 0, r)) - .collect(Collectors.toList()); - - buffers.forEach(flusher::process); - flusher.cleanup(); - - final List inflightFiles = Files.walk(tempInflightDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro.partial")) - .collect(Collectors.toList()); - final List publishedFiles = Files.walk(tempPublishDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro")) - .collect(Collectors.toList()); - - assertEquals(1, publishedFiles.size()); - assertEquals(3, inflightFiles.size()); - - verifyAvroFile(Arrays.asList(records.get(0)), schema, publishedFiles.get(0)); - verifyAvroFile(Arrays.asList(records.get(1), records.get(4), records.get(5)), schema, inflightFiles.get(0)); - verifyAvroFile(Arrays.asList(records.get(2), records.get(6)), schema, inflightFiles.get(1)); - verifyAvroFile(Arrays.asList(records.get(3), records.get(7)), schema, inflightFiles.get(2)); - } - - @Test - public void shouldNotPublishInflightFilesOnCleanup() throws IOException { - final Schema schema = schemaFromClassPath("/MinimalRecord.avsc"); - final Config config = - ConfigFactory.parseString( - "divolte.hdfs_flusher.file_strategy.type = SESSION_BINNING\n" - + "divolte.hdfs_flusher.file_strategy.working_dir = \"" + tempInflightDir.toString() + "\"\n" - + "divolte.hdfs_flusher.file_strategy.publish_dir = \"" + tempPublishDir.toString() + '"') - .withFallback(ConfigFactory.parseResources("hdfs-flusher-binning-test.conf")); - final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - - final HdfsFlusher flusher = new HdfsFlusher(vc, schema); - - final Record record = new GenericRecordBuilder(schema) - .set("ts", 100L) - .set("remoteHost", ARBITRARY_IP) - .build(); - - flusher.process(AvroRecordBuffer.fromRecord( - DivolteIdentifier.generate((Long) record.get("ts")), - DivolteIdentifier.generate((Long) record.get("ts")), - (Long) record.get("ts"), - 0, - record)); - flusher.cleanup(); - - final List inflightFiles = Files.walk(tempInflightDir) - .sorted((l, r) -> 
l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro.partial")) - .collect(Collectors.toList()); - final List publishedFiles = Files.walk(tempPublishDir) - .sorted((l, r) -> l.toString().compareTo(r.toString())) // files sort lexicographically in time order - .filter((p) -> p.toString().endsWith(".avro")) - .collect(Collectors.toList()); - - assertEquals(1, inflightFiles.size()); - assertEquals(0 ,publishedFiles.size()); - } - - private void deleteQuietly(Path p) { - try { - Files.delete(p); - } catch (final Exception e) { - logger.info("Ignoring failure while deleting file: " + p, e); - } - } - - private void verifyAvroFile(List expected, Schema schema, Path avroFile) { - final List result = StreamSupport - .stream(readAvroFile(schema, avroFile.toFile()).spliterator(), false) - .collect(Collectors.toList()); - - assertEquals(expected, result); - } - - private DataFileReader readAvroFile(Schema schema, File file) { - final DatumReader dr = new GenericDatumReader<>(schema); - try { - return new DataFileReader<>(file, dr); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private Schema schemaFromClassPath(final String resource) throws IOException { - try (final InputStream resourceStream = this.getClass().getResourceAsStream(resource)) { - return new Schema.Parser().parse(resourceStream); - } - } -} diff --git a/src/test/java/io/divolte/server/js/TrackingJavaScriptResourceTest.java b/src/test/java/io/divolte/server/js/TrackingJavaScriptResourceTest.java index 72b16875..07a291c8 100644 --- a/src/test/java/io/divolte/server/js/TrackingJavaScriptResourceTest.java +++ b/src/test/java/io/divolte/server/js/TrackingJavaScriptResourceTest.java @@ -43,7 +43,7 @@ public class TrackingJavaScriptResourceTest { public void setup() throws IOException { // Essential test to ensure at build-time that our JavaScript can be compiled. final ValidatedConfiguration vc = new ValidatedConfiguration(() -> config); - trackingJavaScript = new TrackingJavaScriptResource(vc); + trackingJavaScript = TrackingJavaScriptResource.create(vc, "browser"); } @After diff --git a/src/test/resources/hdfs-flusher-binning-test.conf b/src/test/resources/base-test-server.conf similarity index 59% rename from src/test/resources/hdfs-flusher-binning-test.conf rename to src/test/resources/base-test-server.conf index 17da524a..a95655d1 100644 --- a/src/test/resources/hdfs-flusher-binning-test.conf +++ b/src/test/resources/base-test-server.conf @@ -14,29 +14,24 @@ // limitations under the License. // -include classpath("reference-test.conf") - +// Specify a basic source/sink/mapping configuration that tests can use. divolte { - tracking { - session_timeout = 1 second + global { + // By default the flushers are disabled. Instead events are placed on + // a special queue for the tests to collect. 
+ hdfs.enabled = false + kafka.enabled = false } - hdfs_flusher { - enabled = true - - max_write_queue = 10 - max_enqueue_delay = 1 second - threads = 1 + sources.browser.type = browser - file_strategy { - type = SESSION_BINNING - sync_file_after_records = 1 - sync_file_after_duration = 1 hour - } + sinks { + hdfs.type = hdfs + kafka.type = kafka + } - hdfs { - uri = "file:///" - replication = 1 - } + mappings.test = { + sources = [browser] + sinks = [hdfs, kafka] } } diff --git a/src/test/resources/selenium-test-config.conf b/src/test/resources/browser-source-explicit.conf similarity index 61% rename from src/test/resources/selenium-test-config.conf rename to src/test/resources/browser-source-explicit.conf index e252edc2..7559f866 100644 --- a/src/test/resources/selenium-test-config.conf +++ b/src/test/resources/browser-source-explicit.conf @@ -1,5 +1,5 @@ // -// Copyright 2014 GoDataDriven B.V. +// Copyright 2015 GoDataDriven B.V. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,4 +14,17 @@ // limitations under the License. // -// Nothing needed here; the default test server configuration suffices. +// Specify a single explicit browser source. +divolte { + sources.test-browser-source { + type = browser + // Specify a prefix, so we can differentiate this from the default one. + prefix = /a-prefix + } + + mappings.test = { + sources = [test-browser-source] + // Need at least one sink. + sinks = [hdfs] + } +} diff --git a/src/test/resources/checksum-test.conf b/src/test/resources/browser-source-long-prefix.conf similarity index 63% rename from src/test/resources/checksum-test.conf rename to src/test/resources/browser-source-long-prefix.conf index 639dc399..4527cab1 100644 --- a/src/test/resources/checksum-test.conf +++ b/src/test/resources/browser-source-long-prefix.conf @@ -1,5 +1,5 @@ // -// Copyright 2014 GoDataDriven B.V. +// Copyright 2015 GoDataDriven B.V. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,5 +15,15 @@ // divolte { - incoming_request_processor.discard_corrupted = false + sources.test-browser-source { + type = browser + // Specify a multi-component prefix, to check that / doesn't need to be encoded. + prefix = /a/multi/component/prefix + } + + mappings.test = { + sources = [test-browser-source] + // Need at least one sink. + sinks = [hdfs] + } } diff --git a/src/test/resources/dsl-mapping-test.conf b/src/test/resources/browser-source-multiple.conf similarity index 61% rename from src/test/resources/dsl-mapping-test.conf rename to src/test/resources/browser-source-multiple.conf index ac8ba8af..54307237 100644 --- a/src/test/resources/dsl-mapping-test.conf +++ b/src/test/resources/browser-source-multiple.conf @@ -1,5 +1,5 @@ // -// Copyright 2014 GoDataDriven B.V. +// Copyright 2015 GoDataDriven B.V. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,6 +14,21 @@ // limitations under the License. // -divolte.tracking.schema_mapping { - version = 2 +divolte { + sources { + test-browser-source-1 { + type = browser + prefix = /path1 + } + test-browser-source-2 { + type = browser + prefix = /path2 + } + } + + mappings.test = { + sources = [test-browser-source-1, test-browser-source-2] + // Need at least one sink. 
+ sinks = [hdfs] + } } diff --git a/src/test/resources/checksum-discard-corrupt-test.conf b/src/test/resources/browser-source-unused.conf similarity index 74% rename from src/test/resources/checksum-discard-corrupt-test.conf rename to src/test/resources/browser-source-unused.conf index 440cc3dd..a52731bc 100644 --- a/src/test/resources/checksum-discard-corrupt-test.conf +++ b/src/test/resources/browser-source-unused.conf @@ -1,5 +1,5 @@ // -// Copyright 2014 GoDataDriven B.V. +// Copyright 2015 GoDataDriven B.V. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,6 +14,12 @@ // limitations under the License. // +// Specify a single browser source that isn't used by any mappings. divolte { - incoming_request_processor.discard_corrupted = true + sources.unused-source { + type = browser + prefix = /unused + } + mappings {} + sinks {} } diff --git a/src/test/resources/duplicates-test.conf b/src/test/resources/duplicates-test.conf index 3a4de335..d6b586c9 100644 --- a/src/test/resources/duplicates-test.conf +++ b/src/test/resources/duplicates-test.conf @@ -16,5 +16,5 @@ divolte { // Configure 2 slots, so we can control things easily. - incoming_request_processor.duplicate_memory_size = 2 + global.mapper.duplicate_memory_size = 2 } diff --git a/src/test/resources/hdfs-flusher-test.conf b/src/test/resources/hdfs-flusher-test.conf index 42aaaafa..c6829adb 100644 --- a/src/test/resources/hdfs-flusher-test.conf +++ b/src/test/resources/hdfs-flusher-test.conf @@ -14,25 +14,24 @@ // limitations under the License. // -include classpath("reference-test.conf") +include classpath("base-test-server.conf") divolte { - hdfs_flusher { - enabled = true - - max_write_queue = 10 - max_enqueue_delay = 1 second - threads = 1 - - file_strategy { - type = SIMPLE_ROLLING_FILE - sync_file_after_records = 1 - sync_file_after_duration = 1 seconds - } - + global { + // By default the flushers are disabled. Instead events are placed on + // a special queue for the tests to collect. hdfs { - uri = "file:///" - replication = 1 + enabled = true + buffer_size = 16 + threads = 1 + } + } + sinks { + hdfs = { + file_strategy { + sync_file_after_records = 1 + sync_file_after_duration = 1 seconds + } } } } diff --git a/src/test/resources/hdfs-sink-multiple.conf b/src/test/resources/hdfs-sink-multiple.conf new file mode 100644 index 00000000..e49e04ad --- /dev/null +++ b/src/test/resources/hdfs-sink-multiple.conf @@ -0,0 +1,41 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify multiple HDFS sinks for a mapping. +divolte { + global.hdfs.enabled = true + sources.test-browser-source.type = browser + + sinks.test-hdfs-sink-1 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + sinks.test-hdfs-sink-2 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. 
+ } + } + + mappings.test = { + sources = [test-browser-source] + sinks = [test-hdfs-sink-1, test-hdfs-sink-2] + } +} diff --git a/src/test/resources/mapping-configuration-explicit.conf b/src/test/resources/mapping-configuration-explicit.conf new file mode 100644 index 00000000..c9ab85ad --- /dev/null +++ b/src/test/resources/mapping-configuration-explicit.conf @@ -0,0 +1,34 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify an explicit source-mapping-sink. +divolte { + global.hdfs.enabled = true + sources.test-browser-source.type = browser + + sinks.test-hdfs-sink { + type = hdfs + file_strategy = { + // working_directory: supplied by test. + // publish_directory: supplied by test. + } + } + + mappings.test = { + sources = [test-browser-source] + sinks = [test-hdfs-sink] + } +} diff --git a/src/test/resources/mapping-configuration-independent.conf b/src/test/resources/mapping-configuration-independent.conf new file mode 100644 index 00000000..d4afcf6f --- /dev/null +++ b/src/test/resources/mapping-configuration-independent.conf @@ -0,0 +1,59 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify multiple independent source-mapping-sink chains. +divolte { + global.hdfs.enabled = true + + sources { + source-1 { + type = browser + prefix = /source-1 + } + source-2 { + type = browser + prefix = /source-2 + } + } + + mappings { + mapping-1 = { + sources = [source-1] + sinks = [sink-1] + } + mapping-2 = { + sources = [source-2] + sinks = [sink-2] + } + } + + sinks { + sink-1 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + sink-2 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + } +} diff --git a/src/test/resources/mapping-configuration-interdependent.conf b/src/test/resources/mapping-configuration-interdependent.conf new file mode 100644 index 00000000..e43db9ea --- /dev/null +++ b/src/test/resources/mapping-configuration-interdependent.conf @@ -0,0 +1,90 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify a reasonably complex set of interdependent mappings with multiple +// sources, mapping and sinks with some shared and some not. +divolte { + global.hdfs.enabled = true + + sources { + source-1 { + type = browser + prefix = /source-1 + } + source-2 { + type = browser + prefix = /source-2 + } + source-3 { + type = browser + prefix = /source-3 + } + source-4 { + type = browser + prefix = /source-4 + } + } + + mappings { + mapping-1 = { + sources = [source-1, source-2] + sinks = [sink-1] + } + mapping-2 = { + sources = [source-1, source-2] + sinks = [sink-2, sink-3] + } + mapping-3 = { + sources = [source-3] + sinks = [sink-3] + } + mapping-4 = { + sources = [source-1, source-4] + sinks = [sink-3, sink-4] + } + } + + sinks { + sink-1 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + sink-2 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + sink-3 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + sink-4 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + } +} diff --git a/src/test/resources/mapping-configuration-shared-sink.conf b/src/test/resources/mapping-configuration-shared-sink.conf new file mode 100644 index 00000000..1c65bcee --- /dev/null +++ b/src/test/resources/mapping-configuration-shared-sink.conf @@ -0,0 +1,52 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify multiple mappings (with their own sources) that end up on the same sink. +divolte { + global.hdfs.enabled = true + + sources { + source-1 { + type = browser + prefix = /source-1 + } + source-2 { + type = browser + prefix = /source-2 + } + } + + mappings { + mapping-1 = { + sources = [source-1] + sinks = [only-sink] + } + mapping-2 = { + sources = [source-2] + sinks = [only-sink] + } + } + + sinks { + only-sink { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + } +} diff --git a/src/test/resources/mapping-configuration-shared-source.conf b/src/test/resources/mapping-configuration-shared-source.conf new file mode 100644 index 00000000..5a581bc2 --- /dev/null +++ b/src/test/resources/mapping-configuration-shared-source.conf @@ -0,0 +1,49 @@ +// +// Copyright 2015 GoDataDriven B.V. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Specify a single source with multiple mappings (and sinks). +divolte { + global.hdfs.enabled = true + sources.only-source.type = browser + + mappings { + mapping-1 = { + sources = [only-source] + sinks = [sink-1] + } + mapping-2 = { + sources = [only-source] + sinks = [sink-2] + } + } + + sinks { + sink-1 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + sink-2 { + type = hdfs + file_strategy = { + // working_dir: supplied by test. + // publish_dir: supplied by test. + } + } + } +} diff --git a/src/test/resources/missing-sources-sinks.conf b/src/test/resources/missing-sources-sinks.conf new file mode 100644 index 00000000..4e23c623 --- /dev/null +++ b/src/test/resources/missing-sources-sinks.conf @@ -0,0 +1,57 @@ +// +// Copyright 2014 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +include classpath("reference.conf") + +divolte { + global { + server.host = 127.0.0.1 + + mapper { + // For tests we generally want single-threaded processing with a small + // buffer. + buffer_size = 16 + threads = 1 + } + + // By default the flushers are disabled. Instead events are placed on + // a special queue for the tests to collect. + hdfs.enabled = false + kafka.enabled = false + } + + // Explicitly specify the default sinks and sources, so that tests can merge properties in. + sources { + browser = { + type = browser + } + } + sinks { + hdfs = { + type = hdfs + } + kafka = { + type = kafka + } + } + + mappings { + test = { + sources = [browser,missing-source] + sinks = [hdfs,kafka,missing-sink] + } + } +} diff --git a/src/test/resources/multiple-mappings-different-schema-shared-sink.conf b/src/test/resources/multiple-mappings-different-schema-shared-sink.conf new file mode 100644 index 00000000..881357bd --- /dev/null +++ b/src/test/resources/multiple-mappings-different-schema-shared-sink.conf @@ -0,0 +1,45 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +include classpath("reference.conf") + +// A shared sink between two mappings, but not allowed because the mappings have different schemas. +divolte { + + // Need a source for the mappings. + sources.browser.type = browser + + // Two sinks, the latter shared. + sinks { + hdfs.type = hdfs + kafka.type = kafka + } + + // Our mappings: The kafka sink is shared, which is not allowed here because the mappings use different schemas. + mappings { + foo = { + sources = [browser] + sinks = [kafka] + schema_file = foo.avsc + } + + bar = { + sources = [browser] + sinks = [hdfs,kafka] + schema_file = bar.avsc + } + } +} diff --git a/src/test/resources/multiple-mappings-same-schema-shared-sink.conf b/src/test/resources/multiple-mappings-same-schema-shared-sink.conf new file mode 100644 index 00000000..f19329f7 --- /dev/null +++ b/src/test/resources/multiple-mappings-same-schema-shared-sink.conf @@ -0,0 +1,45 @@ +// +// Copyright 2015 GoDataDriven B.V. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +include classpath("reference.conf") + +// A shared sink between two mappings, allowed because the mappings have the same schema. +divolte { + + // Need a source for the mappings. + sources.browser.type = browser + + // Two sinks, the latter shared. + sinks { + hdfs.type = hdfs + kafka.type = kafka + } + + // Our mappings: The kafka sink is shared, but that's okay. + mappings { + foo = { + sources = [browser] + sinks = [kafka] + schema_file = foobar.avsc + } + + bar = { + sources = [browser] + sinks = [hdfs,kafka] + schema_file = foobar.avsc + } + } +} diff --git a/src/test/resources/reference-test.conf b/src/test/resources/reference-test.conf index 00bcc532..db5b77e4 100644 --- a/src/test/resources/reference-test.conf +++ b/src/test/resources/reference-test.conf @@ -17,20 +17,16 @@ include classpath("reference.conf") divolte { - // The test server should only listen on loopback by default. - // The port number is a free ephemeral port determined at runtime. - server.host = 127.0.0.1 + global { + server.host = 127.0.0.1 - // For tests we generally want single-threaded processing with a small - // buffer. - incoming_request_processor { - threads = 1 - max_write_queue = 10 - max_enqueue_delay = 1 second - } + mapper { + // For tests we generally want single-threaded processing with a small + // buffer. + buffer_size = 16 + threads = 1 + } - // By default the flushers are disabled. Instead events are placed on - // a special queue for the tests to collect.
-  kafka_flusher.enabled = false
-  hdfs_flusher.enabled = false
+    hdfs.enabled = false
+  }
 }
diff --git a/src/test/resources/selenium-test-no-default-event-config.conf b/src/test/resources/selenium-test-no-default-event-config.conf
index 1492cc1a..6145043f 100644
--- a/src/test/resources/selenium-test-no-default-event-config.conf
+++ b/src/test/resources/selenium-test-no-default-event-config.conf
@@ -15,4 +15,7 @@
 //
 
 // Disable automatic default pageView event
-divolte.javascript.auto_page_view_event = false
+divolte.sources.browser {
+  type = browser
+  javascript.auto_page_view_event = false
+}
diff --git a/src/test/resources/server-side-cookies-test.conf b/src/test/resources/server-side-cookies-test.conf
deleted file mode 100644
index e252edc2..00000000
--- a/src/test/resources/server-side-cookies-test.conf
+++ /dev/null
@@ -1,17 +0,0 @@
-//
-// Copyright 2014 GoDataDriven B.V.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-// Nothing needed here; the default test server configuration suffices.
diff --git a/src/test/resources/source-sink-collisions.conf b/src/test/resources/source-sink-collisions.conf
new file mode 100644
index 00000000..7d93a4d6
--- /dev/null
+++ b/src/test/resources/source-sink-collisions.conf
@@ -0,0 +1,72 @@
+//
+// Copyright 2014 GoDataDriven B.V.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+include classpath("reference.conf")
+
+divolte {
+  global {
+    server.host = 127.0.0.1
+
+    mapper {
+      // For tests we generally want single-threaded processing with a small
+      // buffer.
+      buffer_size = 16
+      threads = 1
+    }
+
+    // By default the flushers are disabled. Instead events are placed on
+    // a special queue for the tests to collect.
+    hdfs.enabled = false
+    kafka.enabled = false
+  }
+
+  // Explicitly specify the default sinks and sources, so that tests can merge properties in.
+  sources {
+    browser {
+      type = browser
+    }
+
+    foo {
+      type = browser
+    }
+
+    bar = {
+      type = browser
+    }
+  }
+
+  sinks {
+    hdfs = {
+      type = hdfs
+    }
+    kafka = {
+      type = kafka
+    }
+    foo = {
+      type = hdfs
+    }
+    bar = {
+      type = hdfs
+    }
+  }
+
+  mappings {
+    test = {
+      sources = [browser]
+      sinks = [hdfs,kafka]
+    }
+  }
+}
diff --git a/src/test/resources/x-forwarded-for-test.conf b/src/test/resources/x-forwarded-for-test.conf
index 1485d6fb..5481b62a 100644
--- a/src/test/resources/x-forwarded-for-test.conf
+++ b/src/test/resources/x-forwarded-for-test.conf
@@ -16,5 +16,5 @@
 
 divolte {
   // This is what we're testing.
-  server.use_x_forwarded_for = true
+  global.server.use_x_forwarded_for = true
 }
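
Note on the configuration layout these test resources exercise: server, mapper and flusher settings now live under a global section, while sources, sinks and mappings are named sections that get wired together by name, and a mapping may list several sources and sinks. Below is a minimal sketch of such a configuration; the names web, warehouse and click-stream, and the schema path, are illustrative only and do not come from this patch.

    // Minimal sketch of the new-style layout (illustrative names, not taken from the patch).
    divolte {
      global {
        server.host = 127.0.0.1           // settings that previously sat at the top level
        hdfs.enabled = true               // replaces the old hdfs_flusher.enabled toggle
      }

      sources {
        web = {
          type = browser
          javascript.auto_page_view_event = true
        }
      }

      sinks {
        warehouse = {
          type = hdfs
        }
      }

      mappings {
        click-stream = {
          sources = [web]
          sinks = [warehouse]
          schema_file = MyEventRecord.avsc  // hypothetical schema file
        }
      }
    }

As the multiple-mappings-*-shared-sink.conf resources above check, two mappings may only share a sink when they use the same schema.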