From 57ce7881f80b6d691bc97c3398e03dfa441255b6 Mon Sep 17 00:00:00 2001 From: Jose Castro Date: Fri, 8 May 2026 10:19:13 -0600 Subject: [PATCH 1/5] feat(Content Analytics) #35525 : Add docker-compose examples for Experiments and new CA infrastructure. --- .../analytics/README.md | 423 +++---- .../analytics/conf/clickhouse-01/macros.xml | 14 + .../analytics/conf/clickhouse-02/macros.xml | 14 + .../analytics/conf/keeper/keeper_config.xml | 111 ++ .../analytics/conf/toxiproxy/toxiproxy.json | 9 + .../analytics/conf/users.xml | 14 + .../analytics/conf/zookeeper.xml | 53 + .../analytics/docker-compose.yml | 347 +----- .../analytics/init/01-init.sql | 14 + .../analytics/init/10-global.sql | 105 ++ .../analytics/init/20-event-data.sql | 122 ++ .../analytics/init/30-conversion-data.sql | 167 +++ .../init/40-session-engagement-data.sql | 1085 +++++++++++++++++ .../analytics/init/50-users.sql | 11 + .../{.env => init/99-test-users.sql} | 0 .../experiments/README.md | 289 +++++ .../experiments/docker-compose.yml | 300 +++++ .../{analytics => experiments}/get-token.sh | 8 +- .../dev/clickhouse/clickhouse-issue-15638.xml | 0 .../setup/config/dev/cube/cube.js | 0 .../dev/cube/schema/ContentAttribution.js | 0 .../config/dev/cube/schema/Conversion.js | 0 .../config/dev/cube/schema/EngagementDaily.js | 0 .../config/dev/cube/schema/EventSummary.js | 0 .../setup/config/dev/cube/schema/Events.js | 0 .../setup/config/dev/cube/schema/Request.js | 0 .../dev/cube/schema/SessionsByBrowserDaily.js | 0 .../dev/cube/schema/SessionsByDeviceDaily.js | 0 .../cube/schema/SessionsByLanguageDaily.js | 0 .../dev/jitsu/server/config/eventnative.yaml | 0 .../config/dev/keycloak/keycloak-keystore.jks | Bin .../setup/config/dev/keycloak/test-realm.json | 0 .../setup/db/clickhouse/init-scripts/init.sql | 0 .../setup/db/mssql/entrypoint.sh | 0 .../setup/db/mssql/init-scripts/init.sql | 0 .../db/postgres/init-scripts/init-config.sh | 0 .../setup/db/postgres/init-scripts/init.sql | 0 .../start-experiments.sh} | 50 +- 38 files changed, 2597 insertions(+), 539 deletions(-) create mode 100644 docker/docker-compose-examples/analytics/conf/clickhouse-01/macros.xml create mode 100644 docker/docker-compose-examples/analytics/conf/clickhouse-02/macros.xml create mode 100644 docker/docker-compose-examples/analytics/conf/keeper/keeper_config.xml create mode 100644 docker/docker-compose-examples/analytics/conf/toxiproxy/toxiproxy.json create mode 100644 docker/docker-compose-examples/analytics/conf/users.xml create mode 100644 docker/docker-compose-examples/analytics/conf/zookeeper.xml create mode 100644 docker/docker-compose-examples/analytics/init/01-init.sql create mode 100644 docker/docker-compose-examples/analytics/init/10-global.sql create mode 100644 docker/docker-compose-examples/analytics/init/20-event-data.sql create mode 100644 docker/docker-compose-examples/analytics/init/30-conversion-data.sql create mode 100644 docker/docker-compose-examples/analytics/init/40-session-engagement-data.sql create mode 100644 docker/docker-compose-examples/analytics/init/50-users.sql rename docker/docker-compose-examples/analytics/{.env => init/99-test-users.sql} (100%) create mode 100644 docker/docker-compose-examples/experiments/README.md create mode 100644 docker/docker-compose-examples/experiments/docker-compose.yml rename docker/docker-compose-examples/{analytics => experiments}/get-token.sh (97%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/clickhouse/clickhouse-issue-15638.xml (100%) rename 
docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/cube/cube.js (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/cube/schema/ContentAttribution.js (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/cube/schema/Conversion.js (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/cube/schema/EngagementDaily.js (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/cube/schema/EventSummary.js (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/cube/schema/Events.js (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/cube/schema/Request.js (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/cube/schema/SessionsByBrowserDaily.js (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/cube/schema/SessionsByDeviceDaily.js (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/cube/schema/SessionsByLanguageDaily.js (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/jitsu/server/config/eventnative.yaml (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/keycloak/keycloak-keystore.jks (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/config/dev/keycloak/test-realm.json (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/db/clickhouse/init-scripts/init.sql (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/db/mssql/entrypoint.sh (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/db/mssql/init-scripts/init.sql (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/db/postgres/init-scripts/init-config.sh (100%) rename docker/docker-compose-examples/{analytics => experiments}/setup/db/postgres/init-scripts/init.sql (100%) rename docker/docker-compose-examples/{analytics/start-analytics.sh => experiments/start-experiments.sh} (58%) diff --git a/docker/docker-compose-examples/analytics/README.md b/docker/docker-compose-examples/analytics/README.md index b327fbd0b5b7..b2ca78fe5175 100644 --- a/docker/docker-compose-examples/analytics/README.md +++ b/docker/docker-compose-examples/analytics/README.md @@ -1,294 +1,257 @@ -# dotCMS Analytics Complete Stack +# Docker Setup -This docker-compose setup provides a complete dotCMS instance pre-configured with the full analytics stack including CubeJS, ClickHouse, Jitsu, and Keycloak. +This directory contains the Docker Compose configuration and supporting files for running the +**dotCMS Content Analytics Event Manager** and its ClickHouse cluster locally. 
-## Architecture Overview - -``` -┌─────────────┐ ┌──────────────────┐ ┌─────────────────┐ -│ dotCMS │────│ Analytics Stack │────│ Data Layer │ -│ │ │ │ │ │ -│ - dotCMS │ │ - Keycloak (IDP) │ │ - ClickHouse │ -│ - OpenSearch│ │ - Jitsu (Events) │ │ - PostgreSQL │ -│ - Database │ │ - Cube (Read) │ │ - Redis │ -└─────────────┘ │ - Configurator │ └─────────────────┘ - └──────────────────┘ -``` +--- -## Services and Ports +## Table of Contents -### Core dotCMS Services -- **dotCMS**: http://localhost:8082 (HTTPS: 8443) -- **dotCMS Database**: PostgreSQL (internal only) -- **OpenSearch**: http://localhost:9200 (internal + external) -- **Glowroot**: http://localhost:4000 (monitoring) +1. [Architecture Overview](#architecture-overview) +2. [Services](#services) + - [clickhouse-keeper](#clickhouse-keeper) + - [clickhouse-01 / clickhouse-02](#clickhouse-01--clickhouse-02) + - [ca-event-manager](#ca-event-manager) +3. [Directory Layout](#directory-layout) +4. [Running the Stack](#running-the-stack) + - [ClickHouse only (recommended for development)](#clickhouse-only-recommended-for-development) + - [Full stack](#full-stack) +5. [Configuration Files](#configuration-files) +6. [Database Initialization](#database-initialization) +7. [Default Credentials](#default-credentials) +8. [Ports at a Glance](#ports-at-a-glance) +9. [Scaling Keeper to Production](#scaling-keeper-to-production) -### Analytics Services -- **Keycloak (IDP)**: http://localhost:61111 -- **dotCMS Analytics Configurator**: http://localhost:8088 -- **Jitsu (Event Collection)**: http://localhost:8081 -- **CubeJS (Analytics Read)**: http://localhost:4001 -- **ClickHouse (Data Warehouse)**: http://localhost:8124 -- **Analytics Database**: PostgreSQL (internal only) +--- -## Pre-configured Analytics Settings - -The dotCMS instance is pre-configured with the following analytics settings via environment variables: +## Architecture Overview -### Internal URLs (Container-to-Container) -```bash -ANALYTICS_IDP_URL="http://keycloak:8080/realms/dotcms/protocol/openid-connect/token" -ANALYTICS_APP_CONFIG_URL="http://dotcms-analytics:8080/c/customer1/cluster1/keys" -ANALYTICS_APP_WRITE_URL="http://jitsu:8001/api/v1/event" -ANALYTICS_APP_READ_URL="http://cube:4000" ``` - -### External URLs (Host Access) -For browser/external access, these map to: -```bash -ANALYTICS_IDP_URL="http://localhost:61111/realms/dotcms/protocol/openid-connect/token" -ANALYTICS_APP_CONFIG_URL="https://localhost:8088/c/customer1/cluster1/keys" -ANALYTICS_APP_WRITE_URL="https://localhost:8081/api/v1/event" -ANALYTICS_APP_READ_URL="https://localhost:4001" +┌──────────────────────────────────────────────────────────────┐ +│ Docker network │ +│ │ +│ ┌──────────────────┐ ┌──────────────────────────┐ │ +│ │ clickhouse-keeper│◄──────►│ clickhouse-01 │ │ +│ │ (Raft / coord) │ │ (data node, replica 1) │ │ +│ └──────────────────┘ │ HTTP :8123 TCP :9000 │ │ +│ ▲ └──────────────────────────┘ │ +│ │ ▲ │ +│ │ │ replication │ +│ │ ┌──────────────────────────┐ │ +│ └──────────────────►│ clickhouse-02 │ │ +│ │ (data node, replica 2) │ │ +│ │ HTTP :8124 TCP :9001 │ │ +│ └──────────────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────┐ │ +│ │ ca-event-manager │ │ +│ │ Spring Boot app HTTP :8080 │ │ +│ │ connects to clickhouse-01:8123 │ │ +│ └──────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────────────┘ ``` -### Client Configuration (customer1:cluster1) -```bash -Analytics Client ID: "analytics-customer-customer1" -Analytics Client 
Secret: "testsecret" -Analytics Key: [Auto-generated by configurator] -``` +The two ClickHouse data nodes form a **single-shard, two-replica** cluster. `clickhouse-keeper` +provides Raft-based coordination (replication queues, DDL distribution, leader election). DDL +executed on either node is automatically propagated to the other because the `analytics` database +uses the `Replicated` engine. -## Getting Started +--- -### Quick Start Options +## Services -Choose your startup method based on your needs: +### clickhouse-keeper -#### Option 1: Using the Startup Script (Recommended) -```bash -# Analytics services only (faster startup, less resources) -./start-analytics.sh --analytics-only +| Property | Value | +|---|---| +| Image | `clickhouse/clickhouse-keeper:25.8` | +| Role | Raft coordination — replication queues, distributed DDL, merge leader election | +| Host port | `9181` (ZooKeeper-compatible client port) | +| Internal Raft port | `9234` (peer-to-peer, not exposed to host) | -# Full stack with dotCMS (complete development environment) -./start-analytics.sh +ClickHouse Keeper is a lightweight, built-in replacement for Apache ZooKeeper. It runs as a +**single-node Raft group** in this local setup — always the leader, no quorum required. +See [Scaling Keeper to Production](#scaling-keeper-to-production) for HA options. -# Force recreate containers (required for environment variable changes) -./start-analytics.sh --force-recreate -./start-analytics.sh --analytics-only --force-recreate +Both data nodes declare `depends_on: clickhouse-keeper (service_healthy)`, so starting either +data node automatically starts Keeper first. -# Show help and service details -./start-analytics.sh --help -``` +--- -#### Option 2: Using Docker Compose Directly -```bash -# Analytics services only -docker-compose up -d +### clickhouse-01 / clickhouse-02 -# Full stack with dotCMS -docker-compose --profile full up -d +| Property | clickhouse-01 | clickhouse-02 | +|---|---|---| +| Image | `clickhouse/clickhouse-server:25.8` | `clickhouse/clickhouse-server:25.8` | +| Role | Data node, **replica 1** | Data node, **replica 2** | +| HTTP API (host) | `localhost:8123` | `localhost:8124` | +| Native TCP (host) | `localhost:9000` | `localhost:9001` | +| Shard / Replica macro | `shard1` / `replica1` | `shard1` / `replica2` | -# Force recreate containers (for environment variable changes) -docker-compose up -d --force-recreate -docker-compose --profile full up -d --force-recreate +Both nodes share the same configuration (users, Keeper address, init SQL) except for their +`macros.xml`, which assigns the unique `{replica}` value used by `ReplicatedMergeTree` engine +paths. All tables use `Replicated*MergeTree` without explicit ZooKeeper paths — the `Replicated` +database engine manages paths automatically. -# Stop everything (including dotCMS services) -docker-compose --profile full down -``` +The application connects to **clickhouse-01 only**. clickhouse-02 exists to verify replication +correctness in integration tests. 
-### Startup Modes +--- -**Analytics Only Mode** (`--analytics-only`): -- Faster startup and lower resource usage -- Includes: Keycloak, Analytics API, Jitsu, Cube, ClickHouse, Redis, PostgreSQL -- Best for: Analytics development, testing API integrations +### ca-event-manager -**Full Stack Mode** (Default): -- Complete development environment -- Includes: All analytics services + dotCMS + OpenSearch + dotCMS Database -- Best for: End-to-end testing, content + analytics workflows +| Property | Value | +|---|---| +| Image | `ghcr.io/dotcms/dot-ca-event-manager:latest` | +| Role | Spring Boot analytics API | +| Host port | `8080` | +| ClickHouse target | `clickhouse-01:8123` | -### Wait for Services +The service is only included in the **full stack** (`docker-compose.yml`). For day-to-day +development you typically run the app with `mvn spring-boot:run` on the host and start only the +ClickHouse containers. -```bash -# Check service health -docker-compose ps +--- -# Monitor startup logs -docker-compose logs -f keycloak dotcms-analytics +## Directory Layout -# For full stack, monitor dotCMS startup -docker-compose logs -f dotcms +``` +analytics/ +├── docker-compose.yml # Main compose file (full stack) +│ +├── conf/ +│ ├── keeper/ +│ │ └── keeper_config.xml # Keeper Raft config (single-node) +│ ├── clickhouse-01/ +│ │ └── macros.xml # {shard=shard1, replica=replica1} +│ ├── clickhouse-02/ +│ │ └── macros.xml # {shard=shard1, replica=replica2} +│ ├── users.xml # ClickHouse admin user definition +│ └── zookeeper.xml # Keeper endpoint for data nodes +│ +└── init/ # SQL files run by clickhouse-01 on first start + ├── 01-init.sql # CREATE DATABASE analytics (Replicated engine) + ├── 10-global.sql # Raw events table + data-skipping indexes + ├── 20-event-data.sql # Content analytics tables + materialized views + ├── 30-conversion-data.sql # Conversion attribution tables + MVs + ├── 40-session-engagement-data.sql # Session engagement pipeline tables + MVs + └── 50-users.sql # Default customer user (cust-001) ``` -### Access Your Services - -**Analytics Services (Always Available):** -- **Keycloak Admin**: http://localhost:61111 (admin:keycloak) -- **Analytics API**: http://localhost:8088 -- **Cube Analytics**: http://localhost:4001 -- **Jitsu Events**: http://localhost:8081 -- **ClickHouse**: http://localhost:8124 - -**dotCMS Services (Full Stack Only):** -- **dotCMS**: http://localhost:8082 (admin@dotcms.com:admin) -- **Glowroot**: http://localhost:4000 - -### Verify Analytics Configuration (Full Stack) +> **Note:** `init/` is mounted on **clickhouse-01 only** (as `/docker-entrypoint-initdb.d`). DDL +> is replicated automatically to clickhouse-02 via the `Replicated` database engine — do not +> mount `init/` on both nodes or scripts will run twice. -1. Access dotCMS at http://localhost:8082 -2. Navigate to: Apps → dotExperiments-config -3. Analytics should be pre-configured with the URLs above -4. 
Test connection to verify all services are communicating 
+--- 
 
-## Network Architecture 
+## Running the Stack 
 
-### Networks 
-- **dotcms-net**: Isolated network for dotCMS core services (dotCMS, database, OpenSearch) 
-- **analytics-net**: Isolated network for analytics services (Keycloak, Jitsu, CubeJS, ClickHouse) 
-- **Bridge**: dotCMS connects to both networks to communicate with analytics services 
+### ClickHouse only (recommended for development) 
 
-### Security 
-- Internal service communication uses container names (e.g., `keycloak:8080`) 
-- External access uses host ports (e.g., `localhost:61111`) 
-- Databases are isolated and only accessible within their respective networks 
-- Analytics uses JWT-based authentication with Keycloak 
+Starts Keeper and both data replicas. Run the application separately with `mvn spring-boot:run`. 
 
-## Environment Variables 
+```bash 
+cd docker/docker-compose-examples/analytics 
+docker compose up -d clickhouse-01 clickhouse-02 
+``` 
 
-Key environment variables that can be customized: 
+Wait for both nodes to be healthy before starting the app: 
 
 ```bash 
-# Ports 
-KEYCLOAK_HOST_PORT=61111 
-DOTCMS_ANALYTICS_HOST_PORT=8088 
-JITSU_HOST_PORT=8081 
-CUBE_HOST_PORT=4001 
-CH_HOST_PORT=8124 
- 
-# Database 
-POSTGRESQL_DB=postgres 
-POSTGRESQL_USER=postgres 
-POSTGRESQL_PASS=postgres 
- 
-# ClickHouse 
-CH_DB=clickhouse_test_db 
-CH_USER=clickhouse_test_user 
-CH_PWD=clickhouse_password 
- 
-# Keycloak 
-KEYCLOAK_ADMIN=admin 
-KEYCLOAK_ADMIN_PASSWORD=keycloak 
- 
-# dotCMS Experiment Features 
-DOT_ENABLE_EXPERIMENTS_AUTO_JS_INJECTION=true 
+docker compose ps 
 ``` 
 
-### ⚠️ Important: Environment Variable Changes 
+### Full stack 
 
-**Environment variables are set when containers are first created and are NOT automatically updated when you restart services.** 
+```bash 
+cd docker/docker-compose-examples/analytics 
+docker compose up 
+``` 
 
-To apply changes to environment variables: 
+Pulls `ghcr.io/dotcms/dot-ca-event-manager:latest` from GHCR, starts ClickHouse, and runs the 
+app on port `8080`. 
 
-1. **Stop and recreate containers:** 
-   ```bash 
-   docker-compose down 
-   docker-compose up -d --force-recreate 
-   ``` 
+--- 
 
-2. **Or use the startup script with force recreate:** 
-   ```bash 
-   ./start-analytics.sh --force-recreate 
-   ./start-analytics.sh --analytics-only --force-recreate 
-   ``` 
+## Configuration Files 
 
-3. **For individual services:** 
-   ```bash 
-   docker-compose up -d --force-recreate [service-name] 
-   ``` 
 
-**Why this happens:** Docker containers bake environment variables into the container at creation time. Simply restarting (`docker-compose restart`) keeps the existing container with old environment variables. You must recreate the container to pick up new environment variables from the docker-compose.yml file. 
+| File | Purpose | +|---|---| +| `conf/keeper/keeper_config.xml` | Keeper Raft config: port 9181, single-node group, log/snapshot paths | +| `conf/zookeeper.xml` | Tells each data node where to find Keeper (`clickhouse-keeper:9181`) | +| `conf/clickhouse-01/macros.xml` | Node macros: `{shard}=shard1`, `{replica}=replica1` | +| `conf/clickhouse-02/macros.xml` | Node macros: `{shard}=shard1`, `{replica}=replica2` | +| `conf/users.xml` | Defines the `admin` user (password: `admin`, full access management) | -## Key Features +--- -### ✅ Complete Analytics Integration -- **Pre-configured dotCMS** with analytics URLs and client credentials -- **Defense-in-depth security** with multi-layer filtering -- **ClickHouse optimization** for customer partition elimination -- **JWT-based authentication** via Keycloak +## Database Initialization -### ✅ Development Ready -- **Hot-reload** CubeJS schema changes via volume mounts -- **Debug logging** enabled for troubleshooting -- **Health checks** for all critical services -- **Glowroot monitoring** for performance analysis +The `init/` scripts run in filename order on clickhouse-01's first start. Because the `analytics` +database uses the `Replicated` engine, all DDL is automatically propagated to clickhouse-02 — +**do not mount `init/` on both nodes**. -### ✅ Production Patterns -- **Separate databases** for dotCMS and analytics -- **Network isolation** between service layers -- **Persistent volumes** for data retention -- **Environment-based configuration** +| Script | What it creates | +|---|---| +| `01-init.sql` | `analytics` database (`Replicated` engine), admin row policy | +| `10-global.sql` | `analytics.events` — raw event ingestion table (`ReplicatedMergeTree`) | +| `20-event-data.sql` | `content_events_counter` + `pageviews_by_device_browser_daily` + their materialized views | +| `30-conversion-data.sql` | `conversion_time`, `content_presents_in_conversion` + refreshable MV | +| `40-session-engagement-data.sql` | Full session engagement pipeline: `session_states` → `session_facts` → `session_facts_latest` → roll-up tables (`engagement_daily`, `sessions_by_device_daily`, `sessions_by_browser_daily`, `sessions_by_language_daily`) | +| `50-users.sql` | Creates `cust-001` with a row policy scoped to `customer_id='cust-001'` | -## Troubleshooting +> `CLICKHOUSE_DB` is intentionally **not set** in `docker-compose.yml`. Setting it would cause +> Docker's entrypoint to pre-create the database as a plain (non-replicated) engine before the +> init scripts run, making the `CREATE DATABASE … ENGINE = Replicated(…)` in `01-init.sql` a +> no-op. -### Common Issues +--- -1. **Services not starting:** - ```bash - # Check logs - docker-compose logs [service-name] - - # Restart specific service - docker-compose restart [service-name] - ``` +## Default Credentials -2. **Analytics connection issues:** - - Verify all services are running: `docker-compose ps` - - Check network connectivity: `docker-compose exec dotcms ping keycloak` - - Verify URLs in dotCMS analytics configuration +| User | Password | Scope | +|---|---|---| +| `admin` | `admin` | Full access, all databases | +| `cust-001` | `abc` | `analytics` database, rows where `customer_id = 'cust-001'` | -3. **Permission issues:** - ```bash - # Fix volume permissions - sudo chown -R 1000:1000 ./setup/ - ``` +These are **local development defaults only**. All passwords must be rotated in any +non-development environment. -4. 
**Port conflicts:** 
-   - Modify port mappings in docker-compose.yml 
-   - Update corresponding environment variables 
+--- 
 
-### Useful Commands 
+## Ports at a Glance 
 
-```bash 
-# View service logs 
-docker-compose logs -f dotcms 
-docker-compose logs -f cube 
+| Host port | Container | Protocol | Notes | 
+|---|---|---|---| 
+| `8123` | clickhouse-01 | HTTP | Primary ClickHouse HTTP API | 
+| `9000` | clickhouse-01 | TCP | ClickHouse native protocol | 
+| `8124` | clickhouse-02 | HTTP | Replica HTTP API (tests only) | 
+| `9001` | clickhouse-02 | TCP | Replica native protocol (tests only) | 
+| `9181` | clickhouse-keeper | TCP | ZooKeeper-compatible Keeper client port | 
+| `8080` | ca-event-manager | HTTP | Analytics REST API | 
 
-# Access service shells 
-docker-compose exec dotcms bash 
-docker-compose exec analytics-postgres psql -U postgres 
+--- 
 
-# Restart specific services 
-docker-compose restart dotcms keycloak 
+## Scaling Keeper to Production 
 
-# Clean restart 
-docker-compose down -v 
-docker-compose up -d 
-``` 
+The current single-node Keeper provides **no high availability**. If the Keeper container goes 
+down, the data nodes can still serve reads but cannot commit new inserts or run replicated DDL 
+until the connection is restored. 
 
-## Next Steps 
+For a fault-tolerant cluster, run an **odd number of Keeper nodes** (minimum 3): 
 
-1. **Configure A/B Testing**: Set up experiments in dotCMS 
-2. **Create Dashboards**: Build analytics dashboards using CubeJS 
-3. **Monitor Performance**: Use Glowroot for application monitoring 
-4. **Scale Services**: Add replicas for high availability 
-5. **Production Hardening**: Implement proper secrets management and SSL certificates 
+| Keeper nodes | Can lose | Quorum | 
+|---|---|---| 
+| 1 | 0 | 1 of 1 — no HA | 
+| 3 | 1 | 2 of 3 | 
+| 5 | 2 | 3 of 5 | 
 
-## Security Considerations 
+Steps to expand: 
+1. Add `clickhouse-keeper-2` and `clickhouse-keeper-3` containers, each with its own 
+   `keeper_config.xml` that has a unique `<server_id>` and all three servers listed in 
+   `<raft_configuration>`. 
+2. Update `conf/zookeeper.xml` on every data node to list all three Keeper endpoints. 
+3. Restart the cluster. 
 
-- Change default passwords in production 
-- Use proper SSL certificates for external access 
-- Implement proper firewall rules 
-- Regular security updates for all services 
-- Monitor access logs and authentication attempts 
\ No newline at end of file 
+See the comments inside `conf/keeper/keeper_config.xml` and `conf/zookeeper.xml` for full 
+configuration examples. 
\ No newline at end of file 
diff --git a/docker/docker-compose-examples/analytics/conf/clickhouse-01/macros.xml b/docker/docker-compose-examples/analytics/conf/clickhouse-01/macros.xml 
new file mode 100644 
index 000000000000..0c4d93db955e 
--- /dev/null 
+++ b/docker/docker-compose-examples/analytics/conf/clickhouse-01/macros.xml 
@@ -0,0 +1,14 @@ 
+<!-- 
+    Node-specific macros for clickhouse-01 (replica 1 of shard 1). 
+    The {shard} and {replica} placeholders used by Replicated*MergeTree 
+    engine paths are substituted with these values. 
+--> 
+<clickhouse> 
+    <macros> 
+        <shard>shard1</shard> 
+        <replica>replica1</replica> 
+    </macros> 
+</clickhouse> 
diff --git a/docker/docker-compose-examples/analytics/conf/clickhouse-02/macros.xml b/docker/docker-compose-examples/analytics/conf/clickhouse-02/macros.xml 
new file mode 100644 
index 000000000000..f3db050bc343 
--- /dev/null 
+++ b/docker/docker-compose-examples/analytics/conf/clickhouse-02/macros.xml 
@@ -0,0 +1,14 @@ 
+<!-- 
+    Node-specific macros for clickhouse-02 (replica 2 of shard 1). 
+--> 
+<clickhouse> 
+    <macros> 
+        <shard>shard1</shard> 
+        <replica>replica2</replica> 
+    </macros> 
+</clickhouse> 
diff --git a/docker/docker-compose-examples/analytics/conf/keeper/keeper_config.xml b/docker/docker-compose-examples/analytics/conf/keeper/keeper_config.xml 
new file mode 100644 
index 000000000000..f4621d05bed3 
--- /dev/null 
+++ b/docker/docker-compose-examples/analytics/conf/keeper/keeper_config.xml 
@@ -0,0 +1,111 @@ 
+<!-- 
+    ClickHouse Keeper configuration: single-node Raft group for local development. 
+    This node is always the leader; no quorum is required. 
+ 
+    For a fault-tolerant (3- or 5-node) ensemble, give each node a unique <server_id> 
+    and list every node in <raft_configuration>, e.g.: 
+ 
+        <raft_configuration> 
+            <server><id>1</id><hostname>clickhouse-keeper-1</hostname><port>9234</port></server> 
+            <server><id>2</id><hostname>clickhouse-keeper-2</hostname><port>9234</port></server> 
+            <server><id>3</id><hostname>clickhouse-keeper-3</hostname><port>9234</port></server> 
+        </raft_configuration> 
+--> 
+<clickhouse> 
+    <logger> 
+        <level>information</level> 
+        <console>1</console> 
+    </logger> 
+ 
+    <listen_host>0.0.0.0</listen_host> 
+ 
+    <keeper_server> 
+        <!-- ZooKeeper-compatible client port used by the data nodes --> 
+        <tcp_port>9181</tcp_port> 
+ 
+        <!-- Unique ID of this node within the Raft group --> 
+        <server_id>1</server_id> 
+ 
+        <log_storage_path>/var/lib/clickhouse-keeper/log</log_storage_path> 
+        <snapshot_storage_path>/var/lib/clickhouse-keeper/snapshots</snapshot_storage_path> 
+ 
+        <coordination_settings> 
+            <operation_timeout_ms>10000</operation_timeout_ms> 
+            <session_timeout_ms>30000</session_timeout_ms> 
+            <raft_logs_level>information</raft_logs_level> 
+        </coordination_settings> 
+ 
+        <raft_configuration> 
+            <server> 
+                <id>1</id> 
+                <hostname>clickhouse-keeper</hostname> 
+                <port>9234</port> 
+            </server> 
+        </raft_configuration> 
+    </keeper_server> 
+</clickhouse> 
diff --git a/docker/docker-compose-examples/analytics/conf/toxiproxy/toxiproxy.json b/docker/docker-compose-examples/analytics/conf/toxiproxy/toxiproxy.json 
new file mode 100644 
index 000000000000..4a38107404c6 
--- /dev/null 
+++ b/docker/docker-compose-examples/analytics/conf/toxiproxy/toxiproxy.json 
@@ -0,0 +1,9 @@ 
+[ 
+  { 
+    "_comment": "Pre-configures the clickhouse proxy at container startup via the -config flag. Without this file, Toxiproxy starts with no proxies and the proxy would need to be created manually via the REST API (POST /proxies) before any traffic can flow. For the integration test (HungConnectionIT) this is required: the Spring datasource URL points at the proxy port (18123) before the context boots, so the proxy must exist the moment the container is healthy. For the manual smoke-test stack (docker-compose.toxiproxy.yml) this file is not used — the proxy is created explicitly with a curl call after docker compose up.", 
+    "name": "clickhouse", 
+    "listen": "0.0.0.0:18123", 
+    "upstream": "clickhouse-01:8123", 
+    "enabled": true 
+  } 
+] 
diff --git a/docker/docker-compose-examples/analytics/conf/users.xml b/docker/docker-compose-examples/analytics/conf/users.xml 
new file mode 100644 
index 000000000000..7588f9f301d4 
--- /dev/null 
+++ b/docker/docker-compose-examples/analytics/conf/users.xml 
@@ -0,0 +1,14 @@ 
+<clickhouse> 
+    <users> 
+        <admin> 
+            <password>admin</password> 
+            <networks> 
+                <ip>::/0</ip> 
+            </networks> 
+            <profile>default</profile> 
+            <quota>default</quota> 
+            <access_management>1</access_management> 
+            <named_collection_control>1</named_collection_control> 
+        </admin> 
+    </users> 
+</clickhouse> 
\ No newline at end of file 
diff --git a/docker/docker-compose-examples/analytics/conf/zookeeper.xml b/docker/docker-compose-examples/analytics/conf/zookeeper.xml 
new file mode 100644 
index 000000000000..b7688c905a1a 
--- /dev/null 
+++ b/docker/docker-compose-examples/analytics/conf/zookeeper.xml 
@@ -0,0 +1,53 @@ 
+<!-- 
+    Points each ClickHouse data node at the Keeper service for coordination. 
+    With a multi-node Keeper ensemble, list every endpoint here as an 
+    additional <node> entry so the data nodes can fail over between them. 
+--> 
+<clickhouse> 
+    <zookeeper> 
+        <node> 
+            <host>clickhouse-keeper</host> 
+            <port>9181</port> 
+        </node> 
+    </zookeeper> 
+</clickhouse> 
diff --git a/docker/docker-compose-examples/analytics/docker-compose.yml b/docker/docker-compose-examples/analytics/docker-compose.yml 
index 81d9642976b8..50714a0a2ac5 100644 
--- a/docker/docker-compose-examples/analytics/docker-compose.yml 
+++ b/docker/docker-compose-examples/analytics/docker-compose.yml 
@@ -1,300 +1,87 @@ 
-services: 
+name: dotcms-analytics 
 
-  # Analytics Database (separate from dotCMS DB) 
-  analytics-postgres: 
-    container_name: analytics-postgres 
-    image: pgvector/pgvector:pg18 
-    restart: unless-stopped 
-    environment: 
-      POSTGRES_DB: ${POSTGRESQL_DB:-postgres} 
-      POSTGRES_USER: ${POSTGRESQL_USER:-postgres} 
-      POSTGRES_PASSWORD: ${POSTGRESQL_PASS:-postgres} 
+volumes: 
+  clickhouse-01-data: 
+  clickhouse-02-data: 
+  clickhouse-keeper-data: 
+ 
+services: 
+  clickhouse-keeper: 
+    image: clickhouse/clickhouse-keeper:25.8 
+    container_name: clickhouse-keeper 
     ports: 
-      - ${POSTGRESQL_HOST_PORT:-54321}:5432 
-    networks: 
-      - analytics-net 
+      - "9181:9181" 
     volumes: 
-      - analytics-db-data:/var/lib/postgresql 
+      - ./conf/keeper/keeper_config.xml:/etc/clickhouse-keeper/keeper_config.xml 
+      - clickhouse-keeper-data:/var/lib/clickhouse-keeper 
     healthcheck: 
-      test: ["CMD-SHELL", "pg_isready -U ${POSTGRESQL_USER:-postgres} -d ${POSTGRESQL_DB:-postgres} -h localhost -p 5432"] 
-      interval: 10s 
-      timeout: 5s 
-      retries: 5 
+      test: [ "CMD-SHELL", "clickhouse-keeper-client -h 127.0.0.1 -p 9181 -q 'ruok' 2>/dev/null | grep -q imok" ] 
+      interval: 5s 
+      timeout: 3s 
+      retries: 30 
 
-  # dotCMS Database 
-  dotcms-db: 
-    profiles: ["full"] 
-    container_name: dotcms-db 
-    image: pgvector/pgvector:pg18 
-    command: postgres -c 'max_connections=400' -c 'shared_buffers=128MB' 
-    environment: 
-      POSTGRES_USER: 'dotcmsdbuser' 
-      POSTGRES_PASSWORD: 'password' 
-      POSTGRES_DB: 'dotcms' 
+  clickhouse-01: 
+    image: clickhouse/clickhouse-server:25.8 
+    container_name: clickhouse-01 
+    ports: 
+      - "8123:8123" 
+      - "9000:9000" 
     volumes: 
-      - dotcms-db-data:/var/lib/postgresql 
-    networks: 
-      - dotcms-net 
-    healthcheck: 
-      test: ["CMD-SHELL", "pg_isready -U dotcmsdbuser -d dotcms -h localhost -p 5432"] 
-      interval: 10s 
-      timeout: 5s 
-      retries: 5 
-    restart: unless-stopped 
- 
-  # OpenSearch for dotCMS 
-  opensearch: 
-    profiles: ["full"] 
-    container_name: opensearch 
-    image: opensearchproject/opensearch:1 
-    environment: 
-      cluster.name: "elastic-cluster" 
-      discovery.type: "single-node" 
-      bootstrap.memory_lock: "true" 
-      OPENSEARCH_JAVA_OPTS: "-Xmx1G" 
+      - ./conf/users.xml:/etc/clickhouse-server/users.d/users.xml 
+      - ./conf/zookeeper.xml:/etc/clickhouse-server/config.d/zookeeper.xml 
+      - 
./conf/clickhouse-01/macros.xml:/etc/clickhouse-server/config.d/macros.xml + - ./init:/docker-entrypoint-initdb.d + - clickhouse-01-data:/var/lib/clickhouse ulimits: - memlock: - soft: -1 - hard: -1 nofile: - soft: 65536 - hard: 65536 - ports: - - "9200:9200" - - "9600:9600" - volumes: - - opensearch-data:/usr/share/opensearch/data - networks: - - dotcms-net - deploy: - resources: - limits: - cpus: "1.0" - memory: 2G - - keycloak: - container_name: keycloak - depends_on: - - analytics-postgres - environment: - DB_VENDOR: postgres - DB_ADDR: analytics-postgres - KEYCLOAK_ADMIN: ${KEYCLOAK_ADMIN:-admin} - KEYCLOAK_ADMIN_PASSWORD: ${KEYCLOAK_ADMIN_PASSWORD:-keycloak} - DB_DATABASE: ${POSTGRESQL_DB:-postgres} - DB_USER: ${POSTGRESQL_USER:-postgres} - DB_PASSWORD: ${POSTGRESQL_PASS:-postgres} - KC_HOSTNAME_STRICT_HTTPS: false - KC_HOSTNAME_STRICT: false - KC_HTTP_ENABLED: true - image: quay.io/keycloak/keycloak:${KEYCLOAK_VERSION:-18.0.2} - volumes: - - ./setup/config/dev/keycloak/test-realm.json:/opt/keycloak/data/import/example-realm.json - entrypoint: ["/opt/keycloak/bin/kc.sh", "start-dev", "--import-realm", "--hostname-strict-https=false", "--http-enabled=true"] - ports: - - "${KEYCLOAK_HOST_PORT:-61111}:8080" - networks: - - analytics-net - restart: always - - # dotCMS with Analytics Pre-configured - dotcms: - profiles: ["full"] - container_name: dotcms - image: dotcms/dotcms-test:1.0.0-SNAPSHOT - environment: - # dotCMS Core Configuration - CMS_JAVA_OPTS: '-Xmx1g' - LANG: 'C.UTF-8' - TZ: 'UTC' - DB_BASE_URL: "jdbc:postgresql://dotcms-db/dotcms" - DB_USERNAME: 'dotcmsdbuser' - DB_PASSWORD: 'password' - DOT_ES_AUTH_BASIC_PASSWORD: 'admin' - DOT_ES_ENDPOINTS: 'https://opensearch:9200' - DOT_INITIAL_ADMIN_PASSWORD: 'admin' - DOT_DOTCMS_CLUSTER_ID: 'dotcms-analytics-cluster' - GLOWROOT_ENABLED: 'true' - GLOWROOT_WEB_UI_ENABLED: 'true' - DOT_ALLOW_ACCESS_TO_PRIVATE_SUBNETS: 'true' - - # Analytics Configuration (Environment Variables Override) - # Internal URLs (container-to-container communication) - DOT_FEATURE_FLAG_EXPERIMENTS: "true" - DOT_ENABLE_EXPERIMENTS_AUTO_JS_INJECTION: "true" - DOT_FEATURE_FLAG_CONTENT_ANALYTICS_AUTO_INJECT: "true" - DOT_FEATURE_FLAG_CONTENT_ANALYTICS: "true" - DOT_ANALYTICS_IDP_URL: "http://keycloak:8080/realms/dotcms/protocol/openid-connect/token" - DOT_ANALYTICS_APP_CONFIG_URL: "http://dotcms-analytics:8080/c/customer1/cluster1/keys" - DOT_ANALYTICS_APP_WRITE_URL: "http://jitsu:8001/api/v1/event" - DOT_ANALYTICS_APP_READ_URL: "http://cube:4000" - - # Analytics Client Configuration (customer1:cluster1) - DOT_ANALYTICS_APP_CLIENT_ID: "analytics-customer-customer1" - DOT_ANALYTICS_APP_CLIENT_SECRET: "testsecret" - + soft: 262144 + hard: 262144 depends_on: - dotcms-db: + clickhouse-keeper: condition: service_healthy - opensearch: - condition: service_started - keycloak: - condition: service_started - volumes: - - cms-shared:/data/shared - networks: - - dotcms-net - - analytics-net # Bridge to analytics network - ports: - - "8082:8082" # HTTP - - "8443:8443" # HTTPS - - "4000:4000" # Glowroot web UI - - dotcms-analytics: - container_name: dotcms-analytics - image: ghcr.io/dotcms/internal-infrastructure/configurator:latest - environment: - #- JITSU_USE_CONFIGURATOR='true' - #- JITSU_JITSU_CONFIGURATOR=http://host.docker.internal:7007/ - - JITSU_CLUSTER_ADMIN_TOKEN=myadmin - - JITSU_JITSU_SERVER=http://jitsu:8001/ - ## For local development ONLY. This forces events to be immediately persisted - ## to ClickHouse. 
For PROD instances, this must be set to 'batch' - - JITSU_DESTINATIONS_CLICKHOUSE_MODE=stream - - QUARKUS_OIDC_AUTH_SERVER_URL=${AUTH_SERVER_URL:-http://keycloak:8080/realms/dotcms} - - QUARKUS_DATASOURCE_DB_KIND=postgresql - - QUARKUS_DATASOURCE_REACTIVE_URL=postgresql://analytics-postgres:5432/${POSTGRESQL_DB:-postgres} - - QUARKUS_DATASOURCE_USERNAME=${POSTGRESQL_USER:-postgres} - - QUARKUS_DATASOURCE_PASSWORD=${POSTGRESQL_PASS:-postgres} - - QUARKUS_HIBERNATE_ORM_DATABASE_GENERATION=drop-and-create - - QUARKUS_HIBERNATE_ORM_DATABASE_GENERATION_CREATE_SCHEMAS=true - - QUARKUS_SWAGGER_UI_ALWAYS_INCLUDE=true - - EXCLUDED_QUERY_PARAMS=${ANALYTICS_EXCLUDED_QUERY_PARAMS:-variantName,redirect} - ## Enable this for extended logging and troubleshooting - #- QUARKUS_LOG_LEVEL=DEBUG - - QUARKUS_PROFILE=prod - - QUARKUS_KUBERNETES_CONFIG_ENABLED=false - ## If not using the prod profid, the jwks url and issuer need to be set directly. - #- MP_JWT_VERIFY_PUBLICKEY_LOCATION=http://keycloak:8080/realms/dotcms/protocol/openid-connect/certs - #- MP_JWT_VERIFY_ISSUER=http://keycloak:8080/realms/dotcms - ## Or, for local testing only, you can set issuer to 'NONE' to not validate - #- MP_JWT_VERIFY_ISSUER=NONE - - ISSUER_URI=${ISSUER_URI:-NONE} - - JWKS_URI=${JWKS_URL:-http://keycloak:8080/realms/dotcms/protocol/openid-connect/certs} - - CLICKHOUSE_URL=http://${CH_USER:-clickhouse_test_user}:${CH_PWD:-clickhouse_password}@ch_server:8123 - ports: - - "${DOTCMS_ANALYTICS_HOST_PORT:-8088}:8080" - networks: - - analytics-net - depends_on: - - keycloak - - analytics-postgres - - jitsu - - jitsu: - container_name: jitsu - image: jitsucom/server:latest - environment: - - CLUSTER_ADMIN_TOKEN=myadmin - - REDIS_URL=redis://jitsu_redis:6379 - - JITSU_CONFIGURATOR_URL=${JITSU_CONFIGURATOR_URL:-http://dotcms-analytics:8080} - - SERVER_PORT=8001 - - TERM=xterm-256color - - TLS_SKIP_VERIFY=true - depends_on: - - redis - - ch_server - volumes: - - ./setup/config/dev/jitsu/server/config:/home/eventnative/data/config - restart: always - networks: - - analytics-net - ports: - - "${JITSU_HOST_PORT:-8081}:8001" - - redis: - container_name: jitsu_redis - image: redis:6.2.6-bullseye - volumes: - - redis-data:/data - networks: - - analytics-net - restart: always healthcheck: - test: ["CMD-SHELL", "redis-cli -h localhost -p 6379 PING"] - interval: 1s - timeout: 30s - - cube: - container_name: cube - image: cubejs/cube:v1.6.11 - ports: - - ${CUBE_HOST_PORT:-4001}:4000 - environment: - - CUBEJS_DEV_MODE=true - - CUBEJS_DB_TYPE=clickhouse - - CUBEJS_DB_HOST=${CH_SERVER:-ch_server} - - CUBEJS_DB_NAME=${CH_DB:-clickhouse_test_db} - - CUBEJS_DB_USER=${CH_USER:-clickhouse_test_user} - - CUBEJS_DB_PASS=${CH_PWD:-clickhouse_password} - # Use internal network for JWT validation - - CUBEJS_JWK_URL=${JWKS_URL:-http://keycloak:8080/realms/dotcms/protocol/openid-connect/certs} - - CUBEJS_JWT_AUDIENCE=api-dotcms-analytics-audience - #- CUBEJS_JWT_ISSUER=${AUTH_SERVER_URL:-http://keycloak:8080/realms/dotcms} - - CUBEJS_JWT_ALGS=RS256 - - CUBEJS_JWT_CLAIMS_NAMESPACE=https://dotcms.com/analytics - - CUBEJS_LOG_LEVEL=trace - - CUBEJS_REFRESH_WORKER=true - - CUBEJS_SCHEDULED_REFRESH_TIME=true - - CUBEJS_OVERRIDE_CUSTOMER=customer1 - - CUBEJS_OVERRIDE_CLUSTER=cluster1 - volumes: - - cube_metastore:/cube/conf/.cubestore - - ./setup/config/dev/cube/schema:/cube/conf/schema - - ./setup/config/dev/cube/cube.js:/cube/conf/cube.js - networks: - - analytics-net - depends_on: - - ch_server - - keycloak + test: [ "CMD-SHELL", "wget -qO- 
http://localhost:8123/ping | grep -q Ok" ] 
+      interval: 5s 
+      timeout: 3s 
+      retries: 30 
 
-  ch_server: 
-    container_name: ch_server 
+  clickhouse-02: 
     image: clickhouse/clickhouse-server:25.8 
+    container_name: clickhouse-02 
     ports: 
-      - "${CH_HOST_PORT:-8124}:8123" 
+      - "8124:8123" 
+      - "9001:9000" 
+    volumes: 
+      - ./conf/users.xml:/etc/clickhouse-server/users.d/users.xml 
+      - ./conf/zookeeper.xml:/etc/clickhouse-server/config.d/zookeeper.xml 
+      - ./conf/clickhouse-02/macros.xml:/etc/clickhouse-server/config.d/macros.xml 
+      - clickhouse-02-data:/var/lib/clickhouse 
     ulimits: 
       nofile: 
         soft: 262144 
        hard: 262144 
+    depends_on: 
+      clickhouse-keeper: 
+        condition: service_healthy 
     healthcheck: 
-      test: wget --no-verbose --tries=1 --spider http://localhost:8123 || exit 1 
-    environment: 
-      - CLICKHOUSE_DB=${CH_DB:-clickhouse_test_db} 
-      - CLICKHOUSE_USER=${CH_USER:-clickhouse_test_user} 
-      - CLICKHOUSE_PASSWORD=${CH_PWD:-clickhouse_password} 
-    volumes: 
-      - ch_data:/var/lib/clickhouse 
-      - ./setup/db/clickhouse/init-scripts:/docker-entrypoint-initdb.d 
-    networks: 
-      - analytics-net 
+      test: [ "CMD-SHELL", "wget -qO- http://localhost:8123/ping | grep -q Ok" ] 
+      interval: 5s 
+      timeout: 3s 
+      retries: 30 
 
-networks: 
-  dotcms-net: 
-    driver: bridge 
-  analytics-net: 
-    driver: bridge 
- 
-volumes: 
-  # dotCMS volumes 
-  cms-shared: 
-  dotcms-db-data: 
-  opensearch-data: 
- 
-  # Analytics volumes 
-  analytics-db-data: 
-  ch_data: 
-  redis-data: 
-  redis_ur_data: 
-  cube_metastore: 
-  workspace: 
+  ca-event-manager: 
+    image: ghcr.io/dotcms/dot-ca-event-manager:latest 
+    container_name: ca-event-manager 
+    environment: 
+      - SPRING_PROFILES_ACTIVE=dev 
+      - ANALYTICS_ASYNC_INSERT_TIMEOUT_MS=200 
+      - ANALYTICS_ASYNC_INSERT_MAX_DATA_SIZE=104857600 
+      - CLICKHOUSE_URL=jdbc:clickhouse://clickhouse-01:8123/analytics?async_insert=1&wait_for_async_insert=1&async_insert_busy_timeout_ms=${ANALYTICS_ASYNC_INSERT_TIMEOUT_MS:-200}&async_insert_max_data_size=${ANALYTICS_ASYNC_INSERT_MAX_DATA_SIZE:-104857600} 
+      - ANALYTICS_CH_PROBE_USR=${ANALYTICS_CH_PROBE_USR:-admin} 
+      - ANALYTICS_CH_PROBE_PWD=${ANALYTICS_CH_PROBE_PWD:-admin} 
+    ports: 
+      - "8080:8080" 
+    depends_on: 
+      clickhouse-01: 
+        condition: service_healthy 
diff --git a/docker/docker-compose-examples/analytics/init/01-init.sql b/docker/docker-compose-examples/analytics/init/01-init.sql 
new file mode 100644 
index 000000000000..6207f1c41bcb 
--- /dev/null 
+++ b/docker/docker-compose-examples/analytics/init/01-init.sql 
@@ -0,0 +1,14 @@ 
+-- ===================================================================== 
+-- Database creation and initialization 
+-- ===================================================================== 
+-- Replicated database: replicates DDL (CREATE/ALTER/DROP) across all nodes via Keeper, 
+-- and coordinates refreshable MV execution so only one replica runs each refresh cycle. 
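+-- 
+-- Sanity check (illustrative only, not part of the schema): after startup the engine 
+-- should report as 'Replicated' on BOTH nodes. If it reports 'Atomic', the entrypoint 
+-- pre-created the database (e.g. because CLICKHOUSE_DB was set) and the CREATE DATABASE 
+-- below became a no-op: 
+-- 
+--   SELECT name, engine FROM system.databases WHERE name = 'analytics'; 
+-- 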
+CREATE DATABASE IF NOT EXISTS analytics + ENGINE = Replicated('/clickhouse/databases/analytics', '{shard}', '{replica}'); +USE analytics; + +CREATE ROW POLICY IF NOT EXISTS rp_admin_user +ON analytics.* +FOR SELECT + USING customer_id = 'customer1' + AND environment = 'cluster1'; \ No newline at end of file diff --git a/docker/docker-compose-examples/analytics/init/10-global.sql b/docker/docker-compose-examples/analytics/init/10-global.sql new file mode 100644 index 000000000000..91c552125ad8 --- /dev/null +++ b/docker/docker-compose-examples/analytics/init/10-global.sql @@ -0,0 +1,105 @@ +-- ===================================================================== +-- This is the raw event ingestion table. +-- ===================================================================== +CREATE TABLE IF NOT EXISTS analytics.events +( + -- ###################################################### + -- General Event Properties + -- ###################################################### + timestamp DateTime64(3, 'UTC') CODEC(DoubleDelta, ZSTD(3)), + event_time DateTime64(3, 'UTC') CODEC(DoubleDelta, ZSTD(3)), + event_type LowCardinality(String), + environment LowCardinality(String), + customer_id LowCardinality(String), + + + -- ###################################################### + -- URL Properties + -- ###################################################### + url String, + page_title String, + site_id String, + doc_host String, + doc_path String, + doc_search String, + doc_encoding LowCardinality(String), + doc_hash Nullable(String), + doc_protocol LowCardinality(String), + referer String CODEC(ZSTD(3)), + + + -- ###################################################### + -- Browser Properties + -- ###################################################### + user_agent String, + -- Raw parsed UA fields (set by Java at ingest time via uap-java) + parsed_ua_device_family LowCardinality(String), + parsed_ua_os_family LowCardinality(String), + parsed_ua_ua_family LowCardinality(String), + -- Derived bucketed categories (set by Java at event ingestion time via in-memory lookup) + device_category LowCardinality(String) DEFAULT '', + browser_family LowCardinality(String) DEFAULT '', + screen_resolution String, + viewport_size String, + viewport_height String, + viewport_width String, + browser_language LowCardinality(String), + locale_id LowCardinality(String) DEFAULT '', + user_id String CODEC(ZSTD(3)), + session_id String CODEC(ZSTD(3)), + + + -- ###################################################### + -- Analytics Tool Properties + -- ###################################################### + utm_campaign LowCardinality(String), + utm_medium LowCardinality(String), + utm_source LowCardinality(String), + utm_term Nullable(String), + utm_content Nullable(String), + + + -- ###################################################### + -- Used in content_impression events + -- ###################################################### + content_identifier Nullable(String) CODEC(ZSTD(3)), + content_inode Nullable(String) CODEC(ZSTD(3)), + content_title Nullable(String), + content_content_type Nullable(String), + position_viewport_offset_pct Nullable(Int16), + position_dom_index Nullable(Int8), + + + -- ###################################################### + -- Used in content_click events + -- ###################################################### + dom_element_text Nullable(String), + dom_element_type Nullable(String), + dom_element_id Nullable(String), + dom_element_class Nullable(String), + dom_element_attributes 
Nullable(String), + + + -- ###################################################### + -- Used in conversion events + -- ###################################################### + conversion_name String, + + + -- ###################################################### + -- Data skipping indexes + -- ###################################################### + INDEX idx_event_time event_time TYPE minmax GRANULARITY 1, + INDEX idx_environment environment TYPE bloom_filter GRANULARITY 64, + INDEX idx_customer_id customer_id TYPE bloom_filter GRANULARITY 64, + INDEX idx_event_type event_type TYPE set(100) GRANULARITY 1, + INDEX idx_conversion conversion_name TYPE set(100) GRANULARITY 1, + INDEX idx_user_id user_id TYPE bloom_filter GRANULARITY 64, + INDEX idx_content_identifier content_identifier TYPE bloom_filter GRANULARITY 64, + INDEX idx_device_category device_category TYPE set(50) GRANULARITY 1, + INDEX idx_browser_family browser_family TYPE set(50) GRANULARITY 1 + +) Engine = ReplicatedMergeTree() + PARTITION BY customer_id + ORDER BY (timestamp, customer_id) + SETTINGS index_granularity = 8192; diff --git a/docker/docker-compose-examples/analytics/init/20-event-data.sql b/docker/docker-compose-examples/analytics/init/20-event-data.sql new file mode 100644 index 000000000000..47ca9ff8742d --- /dev/null +++ b/docker/docker-compose-examples/analytics/init/20-event-data.sql @@ -0,0 +1,122 @@ +-- ===================================================================== +-- Stores daily aggregated counts of events per: +-- +-- day +-- environment +-- customer_id +-- event_type +-- user_id +-- identifier (URL or content_id) +-- title + +-- Why SummingMergeTree? + +-- Because the MV inserts pre-aggregated rows, and daily_total is summed on merge. + +--This allows: +--fast incremental updates +--easy "daily counts" reporting +--low storage overhead +-- ===================================================================== + +CREATE TABLE IF NOT EXISTS analytics.content_events_counter +( + day Date, + + environment LowCardinality(String), + customer_id LowCardinality(String), + + site_id String, + + event_type LowCardinality(String), + user_id String CODEC(ZSTD(3)), + + identifier String CODEC(ZSTD(3)), + title String, + + daily_total UInt64 +) + ENGINE = ReplicatedSummingMergeTree(daily_total) +PARTITION BY (customer_id, environment, toYYYYMM(day)) +ORDER BY (customer_id, environment, user_id, day, identifier, title, event_type); + + +-- ===================================================================== +-- Transforms raw events into daily activity counters. 
+-- For every event inserted into events, it computes: +-- +-- day → start-of-day from event_time +-- identifier → URL for pageview, content_identifier otherwise +-- title → page_title or content_title +-- +-- Then groups by: +-- customer_id, environment, user_id, day, identifier, title, event_type +-- +-- And inserts: +-- +-- count(*) AS daily_total +-- ===================================================================== + +CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.content_events_counter_mv TO analytics.content_events_counter AS +SELECT customer_id, + environment, + event_type, + user_id, + site_id, + toStartOfDay(event_time) as day, + (CASE + WHEN event_type = 'pageview' THEN doc_path + WHEN event_type = 'conversion' THEN conversion_name + ELSE content_identifier + END) as identifier, + (CASE + WHEN event_type = 'pageview' THEN page_title + WHEN event_type = 'conversion' THEN conversion_name + ELSE content_title + END) as title, + count(*) as daily_total +FROM analytics.events +GROUP BY customer_id, environment, user_id, day, identifier, title, event_type, site_id; + + +-- ===================================================================== +-- Stores daily pre-aggregated pageview counts grouped by device category and browser family. +-- +-- Why SummingMergeTree? +-- The same reasoning as content_events_counter: the MV inserts pre-aggregated rows, +-- and pageview_count is summed on merge. Allows fast, scalable reads for the +-- "Pageviews by Device & Browser" dashboard metric. +-- +-- Java sets device_category and browser_family at ingestion time (UA parsing). +-- The MV normalizes empty strings (pre-enrichment historical events) to +-- 'unknown' / 'unknown' so the table never stores blanks. +-- +-- ===================================================================== +CREATE TABLE IF NOT EXISTS analytics.pageviews_by_device_browser_daily +( + day Date, + customer_id LowCardinality(String), + environment LowCardinality(String), + site_id String, + device_category LowCardinality(String), + browser_family LowCardinality(String), + pageview_count UInt64 +) + ENGINE = ReplicatedSummingMergeTree(pageview_count) + PARTITION BY (customer_id, environment, toYYYYMM(day)) + ORDER BY (customer_id, environment, site_id, day, device_category, browser_family); + + +CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.pageviews_by_device_browser_daily_mv + TO analytics.pageviews_by_device_browser_daily AS +SELECT + customer_id, + environment, + site_id, + toStartOfDay(event_time) AS day, + if(device_category = '', 'unknown', device_category) AS device_category, + if(browser_family = '', 'unknown', browser_family) AS browser_family, + count(*) AS pageview_count +FROM analytics.events +WHERE event_type = 'pageview' +GROUP BY customer_id, environment, site_id, day, device_category, browser_family; diff --git a/docker/docker-compose-examples/analytics/init/30-conversion-data.sql b/docker/docker-compose-examples/analytics/init/30-conversion-data.sql new file mode 100644 index 000000000000..183397b7994f --- /dev/null +++ b/docker/docker-compose-examples/analytics/init/30-conversion-data.sql @@ -0,0 +1,167 @@ +-- ===================================================================== +-- Stores the latest known conversion timestamp per user, but in aggregate function format. +-- Two aggregated fields: +-- +-- conversion_last_time → last conversion event time +-- +-- timestamp_last_time → last processed timestamp inside content_presents_in_conversion +-- +-- Why AggregatingMergeTree? 
+-- +-- Because conversion_time_mv inserts aggregate states (maxState) and later merges them. +-- This table provides a "boundary" so that future incremental batches don't reprocess old records. +-- ===================================================================== +CREATE TABLE IF NOT EXISTS analytics.conversion_time +( + environment LowCardinality(String), + customer_id LowCardinality(String), + + site_id String, + user_id String CODEC(ZSTD(3)), + + conversion_last_time AggregateFunction( max, DateTime64(3, 'UTC')), + timestamp_last_time AggregateFunction( max, DateTime64(3, 'UTC')) +) + ENGINE = ReplicatedAggregatingMergeTree() +PARTITION BY (customer_id, environment) +ORDER BY (customer_id, environment, user_id); + + + +-- ===================================================================== +-- Tracks which content a user interacted with prior to a conversion and after the user's previous conversion +-- ===================================================================== +CREATE TABLE IF NOT EXISTS analytics.content_presents_in_conversion +( + day Date, + last_timestamp DateTime64(3, 'UTC'), + last_conversion_time DateTime64(3, 'UTC'), + + environment LowCardinality(String), + customer_id LowCardinality(String), + + site_id String, + + event_type LowCardinality(String), + user_id String CODEC(ZSTD(3)), + + identifier String CODEC(ZSTD(3)), + title String, + + conversion_name String, + conversion_count UInt32, + events_count UInt32 +) + ENGINE = ReplicatedSummingMergeTree() +PARTITION BY (customer_id, environment, toYYYYMM(day)) +ORDER BY (customer_id, environment, user_id, event_type, conversion_name, identifier, title, day); + + + +-- ===================================================================== +-- It does: +-- +-- Identifies new conversions since last refresh +-- Locates content seen by the user right before each conversion +-- Inserts attribution rows into content_presents_in_conversion +-- +-- How it works (step-by-step) +-- A) Define conversion CTE +-- For each conversion event: +-- Joins against conversion_time to get the previous batch's last timestamps +-- Uses lag() to find previous conversion in current batch +-- +-- Calculates: +-- previous_conversion_timestamp = max(previous_timestamp_current_batch, last_timestamp_previous_batch) +-- +-- Filters conversions that are: +-- +-- new (timestamp > last_timestamp_previous_batch) +-- recent (timestamp <= now()) +-- +-- This ensures incremental processing, no duplicates. +-- +-- B) Join events leading to conversion +-- +-- Matches events where: +-- +-- e.event_time < conversion.conversion_time +-- e.event_time > conversion.conversion_last_time +-- event_type <> 'conversion' + +-- Meaning: +-- +-- Only consider events between the previous conversion timestamp and this conversion timestamp. +-- +-- C) Group and insert +-- +-- Inserts rows summarizing content presence before the conversion. +-- ===================================================================== +CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.content_presents_in_conversion_mv +-- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! 
For DEV, use at least REFRESH EVERY 15 MINUTE +REFRESH EVERY 30 SECOND + APPEND TO analytics.content_presents_in_conversion AS +WITH conversion AS ( + SELECT user_id, + event_time AS conversion_time, + timestamp, + maxMerge(conversion_time.timestamp_last_time) as last_timestamp_previous_batch, + maxMerge(conversion_time.conversion_last_time) as conversion_last_time, + e.conversion_name, + lag(timestamp, 1) OVER ( + PARTITION BY user_id + ORDER BY timestamp + ) AS previous_timestamp_current_batch, + lag(event_time, 1) OVER ( + PARTITION BY user_id + ORDER BY event_time + ) AS previous_event_time_current_batch, + (CASE WHEN previous_event_time_current_batch > conversion_last_time THEN previous_event_time_current_batch ELSE conversion_last_time END) as previous_conversion_time + FROM analytics.events as e + LEFT JOIN analytics.conversion_time on e.customer_id = conversion_time.customer_id AND e.environment = conversion_time.environment AND + e.user_id = conversion_time.user_id AND e.site_id = conversion_time.site_id + WHERE event_type = 'conversion' + group by user_id,event_time, timestamp, conversion_name + HAVING (timestamp >= last_timestamp_previous_batch AND timestamp <= now()) +) +SELECT + toStartOfDay(conversion.conversion_time) as day, + customer_id, + environment, + (CASE WHEN event_type = 'pageview' THEN doc_path ELSE content_identifier END) as identifier, + (CASE WHEN event_type = 'pageview' THEN page_title ELSE content_title END) as title, + event_type, + user_id, + site_id, + conversion.conversion_name as conversion_name, + count(*) AS events_count, + count(DISTINCT conversion_time) AS conversion_count, + max(conversion.timestamp) as last_timestamp, + max(conversion.conversion_time) as last_conversion_time +FROM analytics.events e + INNER JOIN conversion ON e.user_id = conversion.user_id AND + e.event_time < conversion.conversion_time AND + e.event_time > conversion.previous_conversion_time AND + event_type <> 'conversion' +GROUP BY customer_id, environment, identifier, title, event_type, user_id, conversion.conversion_name, day, site_id; + + + +-- ===================================================================== +-- Updates the conversion_time table using the output of content_presents_in_conversion. 
Every time new attribution rows are emitted +-- +-- This ensures: +-- +-- Next execution of the refreshable MV knows where the last batch ended +-- +-- Prevents reprocessing or double counting +-- ===================================================================== +CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.conversion_time_mv TO analytics.conversion_time AS +SELECT customer_id, + environment, + user_id, + site_id, + maxState(last_timestamp) as timestamp_last_time, + maxState(last_conversion_time) as conversion_last_time +FROM analytics.content_presents_in_conversion +GROUP BY customer_id, environment, user_id, site_id; \ No newline at end of file diff --git a/docker/docker-compose-examples/analytics/init/40-session-engagement-data.sql b/docker/docker-compose-examples/analytics/init/40-session-engagement-data.sql new file mode 100644 index 000000000000..55d6031b24fd --- /dev/null +++ b/docker/docker-compose-examples/analytics/init/40-session-engagement-data.sql @@ -0,0 +1,1085 @@ +/* ===================================================================================================== + dotCMS Content Analytics + Session Engagement Pipeline + ===================================================================================================== + + OVERVIEW + ----------------------------------------------------------------------------------------------------- + + This script defines the complete session-engagement pipeline used to compute GA4-style engagement + metrics for dotCMS Content Analytics while keeping the architecture scalable, explicit, and easy to + reason about. + + This version assumes: + + 1) session_id is a REAL browser session identifier + - sessions are short-lived + - sessions rotate normally + - session_id is NOT a long-lived user identity + + 2) late-arriving events are still possible + - network retries + - collector delays + - buffering + - eventual ingestion into ClickHouse + + 3) ALL historical session data must be kept + - TTL must be defined when ready for production + - no dropping older sessions + - the late-event window is only for recomputation + + 4) downstream consumers will use RAW SQL + - no semantic modeling layer on top of ClickHouse + - the service layer / API can directly query the roll-up tables + + ----------------------------------------------------------------------------------------------------- + HIGH-LEVEL PIPELINE + ----------------------------------------------------------------------------------------------------- + + events (raw immutable event stream) + ↓ real-time MV + session_states (incremental mergeable session states) + ↓ refreshable MV APPEND + session_facts (full historical session table, versioned) + ↓ refreshable MV + session_facts_latest (latest effective row per session) + ↓ refreshable MVs + engagement_daily + sessions_by_*_daily + ↓ + raw SQL queries / API / Angular dashboard + + ----------------------------------------------------------------------------------------------------- + WHY THIS SHAPE? + ----------------------------------------------------------------------------------------------------- + + We want to solve two competing needs: + + A) Keep ALL session history forever + B) Still reprocess recent sessions to absorb late-arriving events + + A naive design would overwrite `session_facts` with only the recent sliding window, but that would make + older sessions disappear from the table. 
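+ 
+   (A sketch of that naive approach, for contrast; hypothetical and NOT part of this 
+   script: 
+ 
+       -- recompute only a recent window and replace the table contents 
+       TRUNCATE TABLE analytics.session_facts; 
+       INSERT INTO analytics.session_facts 
+       SELECT ...                                  -- session columns, elided here 
+       FROM analytics.session_states 
+       WHERE session_start >= now() - INTERVAL 7 DAY; 
+ 
+   After the first refresh, every session older than the window would be gone.) 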
+ + Instead, this design works like this: + + - `session_states` continuously accumulates mergeable event states + - `session_facts_rmv` recalculates ONLY recent sessions and APPENDS a newer version of the row into + `session_facts` + - `session_facts` therefore becomes a versioned historical store + - `session_facts_latest_rmv` deduplicates `session_facts` into exactly one latest row per session key + - all dashboard roll-ups read from `session_facts_latest`, so they do not need the `FINAL` keyword in + the `SELECT` queries used to read data from it. + + ----------------------------------------------------------------------------------------------------- + IMPORTANT CONCEPTS + ----------------------------------------------------------------------------------------------------- + + 1) "Sliding window" DOES NOT mean data retention + The sliding window only determines which sessions are recalculated for late-event correction. + + 2) `session_facts` keeps full history + Old sessions remain stored forever unless you later add a TTL or retention job. + + 3) `session_facts_latest` is the "current truth" layer + It contains the latest effective version of each session and is the recommended source for roll-ups + and direct SQL queries that need one row per session. + + 4) This script is optimized for correctness and clarity first + It is already production-friendly in shape, but you can later tune refresh frequencies, partitions, + and roll-up scopes after observing real data volume and ingestion lag. + +===================================================================================================== */ + + +/* ===================================================================================================== + PIPELINE DIAGRAM + ===================================================================================================== + + ┌───────────────────────────────────────────────┐ + │ Browser / Site │ + │ │ + │ pageview | content_click | conversion | ... 
│ + │ │ + └──────────────────────┬────────────────────────┘ + │ + ▼ + ┌───────────────────────────────────────────────┐ + │ events (MergeTree) │ + │ │ + │ - one row per event │ + │ - raw immutable ingestion stream │ + │ - device_category / browser_family set by │ + │ Java at ingest time (UA parsing) │ + │ │ + └──────────────────────┬────────────────────────┘ + │ (incremental MV) + ▼ + ┌───────────────────────────────────────────────┐ + │ session_states (AggregatingMT) │ + │ │ + │ - mergeable per-session states │ + │ - late events naturally merge in │ + │ │ + └──────────────────────┬────────────────────────┘ + │ (refreshable MV APPEND) + ▼ + ┌───────────────────────────────────────────────┐ + │ session_facts (ReplacingMT) │ + │ │ + │ - full historical session store │ + │ - newer versions appended for recent rows │ + │ │ + └──────────────────────┬────────────────────────┘ + │ (refreshable MV) + ▼ + ┌───────────────────────────────────────────────┐ + │ session_facts_latest (ReplacingMT) │ + │ │ + │ - exactly one latest row per session │ + │ - source of truth for roll-ups │ + │ │ + └────────────┬────────────────────────┬─────────┘ + │ │ + ▼ ▼ + ┌─────────────────────────┐ ┌────────────────────────────┐ + │ engagement_daily │ │ sessions_by_*_daily │ + │ (daily KPI roll-up) │ │ (device / browser / lang) │ + └────────────┬────────────┘ └──────────────┬─────────────┘ + │ │ + ▼ ▼ + ┌────────────────────────────────────────────────────────┐ + │ Raw SQL queries / service layer │ + │ │ + │ - KPI cards │ + │ - trend charts │ + │ - distribution widgets │ + │ - arbitrary date ranges │ + │ │ + └───────────────────────────┬────────────────────────────┘ + │ + ▼ + ┌───────────────────────────────────────────────┐ + │ Angular Dashboard │ + └───────────────────────────────────────────────┘ + +===================================================================================================== */ + + +/* ===================================================================================================== + 1) SESSION STATES + ===================================================================================================== + + OBJECT TYPE + ----------------------------------------------------------------------------------------------------- + Table + + OBJECT NAME + ----------------------------------------------------------------------------------------------------- + analytics.session_states + + ENGINE + ----------------------------------------------------------------------------------------------------- + ReplicatedAggregatingMergeTree + + PURPOSE + ----------------------------------------------------------------------------------------------------- + This table stores mergeable per-session aggregate states derived from the raw events stream. + + Instead of repeatedly scanning analytics.events and running large GROUP BY queries every time we want + session-level metrics, we continuously maintain aggregate states per session. + + This is the scalable "intermediate session layer." 
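+
+    EXAMPLE (illustrative only — not part of the pipeline)
+    -----------------------------------------------------------------------------------------------------
+    Because the columns hold partial aggregate states rather than plain scalars, ad-hoc reads must
+    finalize them with the matching -Merge combinators, e.g.:
+
+        SELECT session_id,
+               minMerge(min_ts_state)         AS session_start,
+               maxMerge(max_ts_state)         AS session_end,
+               countMerge(total_events_state) AS total_events
+        FROM analytics.session_states
+        GROUP BY customer_id, environment, site_id, session_id;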
+
+    WHY IT EXISTS
+    -----------------------------------------------------------------------------------------------------
+    - keeps ingestion incremental and efficient
+    - absorbs late-arriving events automatically
+    - avoids rebuilding sessions from scratch from raw events over and over
+    - allows downstream session finalization to work on a much smaller table than analytics.events
+
+    GRAIN
+    -----------------------------------------------------------------------------------------------------
+    One logical session is identified by:
+
+        (customer_id, environment, site_id, session_id)
+
+    WHY AGGREGATE FUNCTION COLUMNS?
+    -----------------------------------------------------------------------------------------------------
+    Because AggregatingMergeTree expects mergeable states:
+        - minState(...) / minMerge(...)
+        - maxState(...) / maxMerge(...)
+        - countState() / countMerge(...)
+        - argMaxState(...) / argMaxMerge(...)
+
+    This allows partial rows written from many insert batches to merge correctly into one logical session.
+
+===================================================================================================== */
+CREATE TABLE IF NOT EXISTS analytics.session_states
+(
+    /* Tenant scope: required to isolate customers and environments cleanly */
+    customer_id LowCardinality(String),  -- dotCMS customer / tenant identifier
+    environment LowCardinality(String),  -- deployment environment (prod/stage/etc.)
+    site_id String,
+    /* Session boundary */
+    session_id String,  -- unique session identifier. All events with the same session_id belong together
+
+    /* Session time window (mergeable aggregate states) */
+    min_ts_state AggregateFunction(min, DateTime64(3, 'UTC')),  -- earliest event timestamp seen in session
+    max_ts_state AggregateFunction(max, DateTime64(3, 'UTC')),  -- latest event timestamp seen in session
+
+    /* Event counters (mergeable) */
+    total_events_state AggregateFunction(count),        -- total events in session
+    pageviews_state AggregateFunction(countIf, UInt8),  -- total number of pageview events in the session
+    conversions_state AggregateFunction(countIf, UInt8),  -- total number of conversion events in the session
+
+    /* Dimension "last known value" states (mergeable) */
+    -- last-seen device category label for the session (set by Java at ingestion time)
+    -- stored as state so that late events can update the final value deterministically.
+    device_category_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')),
+    -- last-seen browser family bucket (Chrome/Safari/Firefox/Edge/Other)
+    browser_family_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')),
+    -- last-seen dotCMS language ISO code, defaulting to '' ('undefined') if unknown
+    locale_id_state AggregateFunction(argMax, String, DateTime64(3, 'UTC'))
+)
+    /* Why this engine is mandatory:
+       -> You are storing aggregate states
+       -> You rely on merge correctness
+       -> Without replication, different replicas would compute different session states */
+    ENGINE = ReplicatedAggregatingMergeTree()
+    /* Partitioning note:
+       We partition by a hash of (customer_id, environment) to spread writes and merges.
+       This avoids a single giant partition for big tenants and keeps merges parallelizable. */
+    PARTITION BY sipHash64(customer_id, environment) % 64
+    /* Note for the sort key:
+       ORDER BY includes tenant + session to keep session states physically clustered for merges/finalization.
+       This also ensures stable grouping keys for session_facts refresh queries.
*/ + ORDER BY ( + customer_id, + environment, + site_id, + session_id); + + +/* ===================================================================================================== + 4) REAL-TIME MV: events → session_states + ===================================================================================================== + + OBJECT TYPE + ----------------------------------------------------------------------------------------------------- + Materialized View (incremental, insert-triggered) + + OBJECT NAME + ----------------------------------------------------------------------------------------------------- + analytics.session_states_mv + + SOURCE + ----------------------------------------------------------------------------------------------------- + analytics.events + + TARGET + ----------------------------------------------------------------------------------------------------- + analytics.session_states + + PURPOSE + ----------------------------------------------------------------------------------------------------- + Runs on every insert into analytics.events and converts the newly inserted batch into mergeable + session aggregate states. + + WHY THIS MV IS IMPORTANT + ----------------------------------------------------------------------------------------------------- + This is the object that keeps the whole pipeline scalable. + + Without it, every sessionization/finalization step would need to repeatedly scan raw events and group + them again. With this MV: + - inserts stay cheap + - aggregation work is incremental + - late events naturally merge into existing sessions + + DIMENSION STRATEGY + ----------------------------------------------------------------------------------------------------- + device_category and browser_family are set by Java at ingestion time and read directly from analytics.events. 
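+
+    EXAMPLE (illustrative only — how the argMax states finalize; the value paired with the greatest
+    event_time wins once states are merged downstream):
+
+        SELECT argMaxMerge(s)  -- returns 'Mobile'
+        FROM (
+            SELECT argMaxState('Desktop', toDateTime64('2026-01-01 10:00:00', 3)) AS s
+            UNION ALL
+            SELECT argMaxState('Mobile', toDateTime64('2026-01-01 10:05:00', 3)) AS s
+        );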
+ +===================================================================================================== */ +CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.session_states_mv + TO analytics.session_states +AS +WITH + /* Normalize empty locale values so they can be ignored cleanly in argMaxStateIf */ + nullIf(locale_id, '') AS normalized_locale_id +SELECT + e.customer_id, + e.environment, + e.site_id, + e.session_id, + + /* Time boundaries for the session */ + minState(e.event_time) AS min_ts_state, + maxState(e.event_time) AS max_ts_state, + + /* Mergeable counters */ + countState() AS total_events_state, + countIfState(e.event_type = 'pageview') AS pageviews_state, + countIfState(e.event_type = 'conversion') AS conversions_state, + + /* Mergeable latest dimension states — values already set by Java at ingestion time */ + argMaxState(e.device_category, e.event_time) AS device_category_state, + argMaxState(e.browser_family, e.event_time) AS browser_family_state, + + /* Locale is tracked only from pageview events and only when present */ + argMaxStateIf( + normalized_locale_id, + e.event_time, + e.event_type = 'pageview' AND normalized_locale_id IS NOT NULL + ) AS locale_id_state +FROM analytics.events AS e +WHERE e.session_id != '' + AND e.customer_id != '' + AND e.environment != '' + AND e.site_id != '' + /* Defensive guard to avoid broken/null-ish timestamps participating in session logic */ + AND e.event_time > toDateTime64(0, 3, 'UTC') +GROUP BY ( + e.customer_id, + e.environment, + e.site_id, + e.session_id); + + +/* ===================================================================================================== + 5) SESSION FACTS (FULL HISTORICAL VERSIONED TABLE) + ===================================================================================================== + + OBJECT TYPE + ----------------------------------------------------------------------------------------------------- + Table + + OBJECT NAME + ----------------------------------------------------------------------------------------------------- + analytics.session_facts + + ENGINE + ----------------------------------------------------------------------------------------------------- + ReplicatedReplacingMergeTree(updated_at) + + PURPOSE + ----------------------------------------------------------------------------------------------------- + Stores the full historical session table. + + This is NOT just a hot window table. + This table is meant to retain all sessions across all history. + + WHY ReplacingMergeTree(updated_at)? + ----------------------------------------------------------------------------------------------------- + Because recent sessions may be recalculated when late events arrive. + + Example: + - session originally finalized at 10:05 + - delayed event arrives at 10:20 + - next RMV refresh recalculates that session and appends a newer row + + ReplacingMergeTree allows the newer version to win logically by updated_at. + + IMPORTANT + ----------------------------------------------------------------------------------------------------- + Because session_facts_rmv uses APPEND TO, multiple physical versions of the same session may coexist + temporarily in this table until background merges occur. + + That is exactly why we introduce analytics.session_facts_latest later: + it provides a deduplicated "latest truth" layer for roll-ups and direct querying. 
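+
+    EXAMPLE (illustrative only — inspecting the version history of one session while background
+    merges are still pending; the session_id value is hypothetical):
+
+        SELECT session_id, updated_at, total_events, engaged
+        FROM analytics.session_facts
+        WHERE session_id = 'abc-123'
+        ORDER BY updated_at DESC;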
+ + RECOMMENDED USAGE + ----------------------------------------------------------------------------------------------------- + - Keep this table as your durable historical versioned store + - Do NOT use it directly for roll-ups if you need exactly one row per session + - Use analytics.session_facts_latest for that + +===================================================================================================== */ +CREATE TABLE IF NOT EXISTS analytics.session_facts +( + /* Tenant scope */ + customer_id LowCardinality(String), + environment LowCardinality(String), + site_id String, + /* Session identity */ + session_id String, + + /* Finalized session times */ + session_start DateTime64(3, 'UTC'), -- earliest event timestamp + session_end DateTime64(3, 'UTC'), -- latest event timestamp + duration_seconds UInt32, -- session_end - session_start (seconds) + + /* Finalized counters */ + total_events UInt32, -- total events in session + pageviews UInt32, -- total pageview events + conversions UInt32, -- total conversion events + + /* Engagement flag (GA4-style) */ + engaged UInt8, -- 1 if engaged, else 0 + + /* Finalized dimensions */ + device_category LowCardinality(String), -- Desktop/Mobile/Tablet/Other + browser_family LowCardinality(String), -- Chrome/Safari/Firefox/Edge/Other + locale_id LowCardinality(String), -- dotCMS language Locale ID ('' means undefined) + + /* Version column. Newer recalculations must have a greater timestamp. */ + updated_at DateTime64(3, 'UTC') +) + ENGINE = ReplicatedReplacingMergeTree(updated_at) + /* Partition by month of session_start. Keeps partitions time-bounded and supports TTL strategies + later if desired. */ + PARTITION BY toYYYYMM(toDate(session_start)) + /* Sort key includes session identity for deterministic replacement. */ + ORDER BY ( + customer_id, + environment, + site_id, + session_id); + + +/* ===================================================================================================== + 6) REFRESHABLE MV: session_states → session_facts + ===================================================================================================== + + OBJECT TYPE + ----------------------------------------------------------------------------------------------------- + Refreshable Materialized View (RMV) + + OBJECT NAME + ----------------------------------------------------------------------------------------------------- + analytics.session_facts_rmv + + SOURCE + ----------------------------------------------------------------------------------------------------- + analytics.session_states + + TARGET + ----------------------------------------------------------------------------------------------------- + analytics.session_facts + + WRITE MODE + ----------------------------------------------------------------------------------------------------- + APPEND TO + + PURPOSE + ----------------------------------------------------------------------------------------------------- + Re-finalizes only RECENT sessions and appends a new version into analytics.session_facts. + + This is the core late-event correction mechanism. + + SLIDING WINDOW + ----------------------------------------------------------------------------------------------------- + start_cutoff = now() - 1 day + + This means: + - only sessions whose latest activity is recent are recalculated + - older sessions remain in analytics.session_facts untouched + - this is NOT a retention window + + WHY 1 DAY? 
+ ----------------------------------------------------------------------------------------------------- + Good conservative default for local testing: + - covers same-day late arrivals comfortably + - easy to reason about + - can later be reduced to 12h or 6h if actual ingestion lag is small + + ENGAGEMENT LOGIC + ----------------------------------------------------------------------------------------------------- + A session is engaged if ANY of these is true: + - duration > 10 seconds + - pageviews >= 2 + - conversions >= 1 + + DIMENSION FINALIZATION + ----------------------------------------------------------------------------------------------------- + device_category and browser_family are read directly from session_states — values were set by Java + at ingestion time and propagated via session_states_mv. + +===================================================================================================== */ +CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.session_facts_rmv +-- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! For DEV, use at least REFRESH EVERY 15 MINUTE +REFRESH EVERY 30 SECOND APPEND TO analytics.session_facts +AS +WITH + /* Sliding recomputation window for late-event correction */ + (now64(3, 'UTC') - INTERVAL 1 DAY) AS start_cutoff +SELECT + customer_id, + environment, + site_id, + session_id, + + session_start, + session_end, + duration_seconds, + + total_events, + pageviews, + conversions, + + engaged, + + device_category, + browser_family, + + locale_id, + + /* Version timestamp for ReplacingMergeTree */ + now64(3, 'UTC') AS updated_at +FROM + ( + /* Aggregate session_states into finalized scalar columns */ + SELECT + customer_id, + environment, + site_id, + session_id, + + /* Finalized time boundaries */ + minMerge(min_ts_state) AS session_start, + maxMerge(max_ts_state) AS session_end, + + /* Derived duration */ + toUInt32(greatest(0, dateDiff('second', session_start, session_end))) AS duration_seconds, + + /* Finalized counters */ + toUInt32(countMerge(total_events_state)) AS total_events, + toUInt32(countIfMerge(pageviews_state)) AS pageviews, + toUInt32(countIfMerge(conversions_state)) AS conversions, + + /* Business rules that determine whether a session is flagged as 'engaged' or not */ + toUInt8( + -- 1. Sessions that last more than 10 seconds + (dateDiff('second', session_start, session_end) > 10) + -- 2. Or, sessions that trigger at least 2 events of type 'pageview' + OR (countIfMerge(pageviews_state) >= 2) + -- 3. 
Or, sessions that trigger at least 1 event of type 'conversion' + OR (countIfMerge(conversions_state) >= 1) + ) AS engaged, + + /* Dimension values set by Java at ingestion time, propagated via session_states_mv */ + argMaxMerge(device_category_state) AS device_category, + argMaxMerge(browser_family_state) AS browser_family, + + /* Locale defaults to empty string when unknown */ + coalesce(argMaxMerge(locale_id_state), '') AS locale_id + FROM analytics.session_states + GROUP BY ( + customer_id, + environment, + site_id, + session_id) + /* Only recent sessions are recalculated */ + HAVING session_end >= start_cutoff + ) finalized_sessions; + + +/* ===================================================================================================== + 7) SESSION FACTS LATEST (DEDUPLICATED INTERMEDIATE TABLE) + ===================================================================================================== + + OBJECT TYPE + ----------------------------------------------------------------------------------------------------- + Table + + OBJECT NAME + ----------------------------------------------------------------------------------------------------- + analytics.session_facts_latest + + ENGINE + ----------------------------------------------------------------------------------------------------- + ReplicatedReplacingMergeTree(updated_at) + + PURPOSE + ----------------------------------------------------------------------------------------------------- + Stores exactly one latest effective row per session key. + + WHY THIS TABLE EXISTS + ----------------------------------------------------------------------------------------------------- + analytics.session_facts is a versioned historical store. Because it receives APPENDed updates for + recent sessions, the same logical session may temporarily exist in multiple versions. + + We could read analytics.session_facts FINAL everywhere, but FINAL is heavier and we do not want every + downstream roll-up to pay that cost repeatedly. 
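+
+    EXAMPLE (illustrative only — the per-query alternative this table avoids; FINAL forces
+    merge-on-read on every execution):
+
+        SELECT count() FROM analytics.session_facts FINAL;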
+ + So instead: + - we deduplicate once into session_facts_latest + - roll-ups and direct SQL queries can use this table + - downstream SQL stays simpler and more efficient + + RECOMMENDED USAGE + ----------------------------------------------------------------------------------------------------- + This is the preferred source when you want: + - one row per session + - latest metrics only + - session-level raw SQL queries + - roll-up generation + +===================================================================================================== */ +CREATE TABLE IF NOT EXISTS analytics.session_facts_latest +( + customer_id LowCardinality(String), + environment LowCardinality(String), + site_id String, + session_id String, + + session_start DateTime64(3, 'UTC'), + session_end DateTime64(3, 'UTC'), + duration_seconds UInt32, + + total_events UInt32, + pageviews UInt32, + conversions UInt32, + + engaged UInt8, + + device_category LowCardinality(String), + browser_family LowCardinality(String), + locale_id LowCardinality(String), + + updated_at DateTime64(3, 'UTC') +) + ENGINE = ReplicatedReplacingMergeTree(updated_at) + PARTITION BY toYYYYMM(toDate(session_start)) + ORDER BY ( + customer_id, + environment, + site_id, + session_id + ); + + +/* ===================================================================================================== + 8) REFRESHABLE MV: session_facts → session_facts_latest + ===================================================================================================== + + OBJECT TYPE + ----------------------------------------------------------------------------------------------------- + Refreshable Materialized View (RMV) + + OBJECT NAME + ----------------------------------------------------------------------------------------------------- + analytics.session_facts_latest_rmv + + SOURCE + ----------------------------------------------------------------------------------------------------- + analytics.session_facts + + TARGET + ----------------------------------------------------------------------------------------------------- + analytics.session_facts_latest + + PURPOSE + ----------------------------------------------------------------------------------------------------- + Centralizes deduplication of the historical versioned session table into one latest row per session key. + + WHY THIS IS BETTER THAN USING FINAL EVERYWHERE + ----------------------------------------------------------------------------------------------------- + Instead of every downstream roll-up doing its own deduplication or reading session_facts FINAL, this + RMV does the work once and stores the result in a clean intermediate table. + + DEDUPLICATION RULE + ----------------------------------------------------------------------------------------------------- + For each session key: + - take the value associated with the greatest updated_at + - that is done via argMax(..., updated_at) + - store max(updated_at) as the effective row version + +===================================================================================================== */ +CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.session_facts_latest_rmv +-- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! 
For DEV, use at least REFRESH EVERY 15 MINUTE +REFRESH EVERY 30 SECOND DEPENDS ON analytics.session_facts_rmv +TO analytics.session_facts_latest +AS +SELECT + sf.customer_id, + sf.environment, + sf.site_id, + sf.session_id, + + argMax(sf.session_start, sf.updated_at) AS session_start, + argMax(sf.session_end, sf.updated_at) AS session_end, + argMax(sf.duration_seconds, sf.updated_at) AS duration_seconds, + + argMax(sf.total_events, sf.updated_at) AS total_events, + argMax(sf.pageviews, sf.updated_at) AS pageviews, + argMax(sf.conversions, sf.updated_at) AS conversions, + + argMax(sf.engaged, sf.updated_at) AS engaged, + argMax(sf.device_category, sf.updated_at) AS device_category, + argMax(sf.browser_family, sf.updated_at) AS browser_family, + argMax(sf.locale_id, sf.updated_at) AS locale_id, + + max(sf.updated_at) AS updated_at +FROM analytics.session_facts AS sf +GROUP BY ( + sf.customer_id, + sf.environment, + sf.site_id, + sf.session_id); + + +/* ===================================================================================================== + 9) DAILY KPI ROLL-UP + ===================================================================================================== + + OBJECT TYPE + ----------------------------------------------------------------------------------------------------- + Table + + OBJECT NAME + ----------------------------------------------------------------------------------------------------- + analytics.engagement_daily + + PURPOSE + ----------------------------------------------------------------------------------------------------- + Stores dashboard-ready daily KPI numerators and denominators. + + WHY STORE DAILY SUMS INSTEAD OF DAILY RATES? + ----------------------------------------------------------------------------------------------------- + Because arbitrary date ranges must be computed correctly as: + + sum(numerator) / sum(denominator) + + not as: + average(daily_rate) + + This table therefore stores the raw daily ingredients needed to compute: + - engagement rate + - conversion rate + - average interactions + - average session duration + + GRAIN + ----------------------------------------------------------------------------------------------------- + (customer_id, environment, site_id, day) + +===================================================================================================== */ +CREATE TABLE IF NOT EXISTS analytics.engagement_daily +( + customer_id LowCardinality(String), + environment LowCardinality(String), + site_id String, + day Date, + + total_sessions UInt64, -- count of all sessions + engaged_sessions UInt64, -- count of engaged sessions + engaged_conversion_sessions UInt64, -- engaged sessions that include >=1 conversion + + total_events_all UInt64, -- sum(total_events) across all sessions + total_duration_all UInt64, -- sum(duration_seconds) across all sessions + + total_events_engaged UInt64, -- sum(total_events) across engaged sessions only + total_duration_engaged UInt64, -- sum(duration_seconds) across engaged sessions only + + updated_at DateTime64(3, 'UTC') +) + ENGINE = ReplicatedReplacingMergeTree(updated_at) + PARTITION BY toYYYYMM(day) + ORDER BY (customer_id, environment, site_id, day); + + +/* ===================================================================================================== + 10) REFRESHABLE MV: session_facts_latest → engagement_daily + ===================================================================================================== + + OBJECT TYPE + 
----------------------------------------------------------------------------------------------------- + Refreshable Materialized View (RMV) + + OBJECT NAME + ----------------------------------------------------------------------------------------------------- + analytics.engagement_daily_rmv + + SOURCE + ----------------------------------------------------------------------------------------------------- + analytics.session_facts_latest + + TARGET + ----------------------------------------------------------------------------------------------------- + analytics.engagement_daily + + PURPOSE + ----------------------------------------------------------------------------------------------------- + Rebuilds the daily KPI roll-up table from the latest one-row-per-session layer. + + WHY THIS SOURCE? + ----------------------------------------------------------------------------------------------------- + Because session_facts_latest already contains one latest row per session, this roll-up can aggregate + without FINAL and without inline dedup logic. + +===================================================================================================== */ +CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.engagement_daily_rmv +-- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! For DEV, use at least REFRESH EVERY 15 MINUTE +REFRESH EVERY 30 SECOND DEPENDS ON analytics.session_facts_latest_rmv + TO analytics.engagement_daily +AS +SELECT + customer_id, + environment, + site_id, + toDate(session_start, 'UTC') AS day, + + count() AS total_sessions, + countIf(engaged = 1) AS engaged_sessions, + countIf(engaged = 1 AND conversions >= 1) AS engaged_conversion_sessions, + + sum(total_events) AS total_events_all, + sum(duration_seconds) AS total_duration_all, + + sumIf(total_events, engaged = 1) AS total_events_engaged, + sumIf(duration_seconds, engaged = 1) AS total_duration_engaged, + + now64(3, 'UTC') AS updated_at +FROM analytics.session_facts_latest +GROUP BY ( + customer_id, + environment, + site_id, + day); + + +/* ===================================================================================================== + 11) DEVICE BREAKDOWN ROLL-UP + ===================================================================================================== + + OBJECT TYPE + ----------------------------------------------------------------------------------------------------- + Table + + OBJECT NAME + ----------------------------------------------------------------------------------------------------- + analytics.sessions_by_device_daily + + PURPOSE + ----------------------------------------------------------------------------------------------------- + Daily distribution table by device category. 
+ + Typical dashboard uses: + - total sessions by device + - engaged sessions by device + - average engaged duration by device + +===================================================================================================== */ +CREATE TABLE IF NOT EXISTS analytics.sessions_by_device_daily +( + customer_id LowCardinality(String), + environment LowCardinality(String), + site_id String, + day Date, + + device_category LowCardinality(String), -- Desktop/Mobile/Tablet/Other + + total_sessions UInt64, -- ALL sessions for this device_category + engaged_sessions UInt64, -- Engaged sessions for this device_category + total_duration_engaged_seconds UInt64, -- Sum(duration_seconds) for engaged sessions only + + updated_at DateTime64(3, 'UTC') +) + ENGINE = ReplicatedReplacingMergeTree(updated_at) + PARTITION BY toYYYYMM(day) + ORDER BY (customer_id, environment, site_id, day, device_category); + + +/* ===================================================================================================== + 12) REFRESHABLE MV: session_facts_latest → sessions_by_device_daily + ===================================================================================================== */ +CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.sessions_by_device_daily_rmv +-- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! For DEV, use at least REFRESH EVERY 15 MINUTE +REFRESH EVERY 30 SECOND DEPENDS ON analytics.session_facts_latest_rmv + TO analytics.sessions_by_device_daily +AS +SELECT + customer_id, + environment, + site_id, + toDate(session_start, 'UTC') AS day, + device_category, + + count() AS total_sessions, + countIf(engaged = 1) AS engaged_sessions, + sumIf(duration_seconds, engaged = 1) AS total_duration_engaged_seconds, + + now64(3, 'UTC') AS updated_at +FROM analytics.session_facts_latest +GROUP BY ( + customer_id, + environment, + site_id, + day, + device_category); + + +/* ===================================================================================================== + 13) BROWSER BREAKDOWN ROLL-UP + ===================================================================================================== + + OBJECT TYPE + ----------------------------------------------------------------------------------------------------- + Table + + OBJECT NAME + ----------------------------------------------------------------------------------------------------- + analytics.sessions_by_browser_daily + + PURPOSE + ----------------------------------------------------------------------------------------------------- + Daily distribution table by browser family. 
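+
+    EXAMPLE (illustrative only — top browsers over an arbitrary range; the dates are hypothetical):
+
+        SELECT browser_family,
+               sum(total_sessions)   AS sessions,
+               sum(engaged_sessions) AS engaged
+        FROM analytics.sessions_by_browser_daily
+        WHERE day BETWEEN '2026-05-01' AND '2026-05-07'
+        GROUP BY browser_family
+        ORDER BY sessions DESC
+        LIMIT 5;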
+ +===================================================================================================== */ +CREATE TABLE IF NOT EXISTS analytics.sessions_by_browser_daily +( + customer_id LowCardinality(String), + environment LowCardinality(String), + site_id String, + day Date, + + browser_family LowCardinality(String), -- Chrome/Safari/Firefox/Edge/Other + + total_sessions UInt64, + engaged_sessions UInt64, + total_duration_engaged_seconds UInt64, + + updated_at DateTime64(3, 'UTC') +) + ENGINE = ReplicatedReplacingMergeTree(updated_at) + PARTITION BY toYYYYMM(day) + ORDER BY (customer_id, environment, site_id, day, browser_family); + + +/* ===================================================================================================== + 14) REFRESHABLE MV: session_facts_latest → sessions_by_browser_daily + ===================================================================================================== */ +CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.sessions_by_browser_daily_rmv +-- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! For DEV, use at least REFRESH EVERY 15 MINUTE +REFRESH EVERY 30 SECOND DEPENDS ON analytics.session_facts_latest_rmv + TO analytics.sessions_by_browser_daily +AS +SELECT + customer_id, + environment, + site_id, + toDate(session_start, 'UTC') AS day, + browser_family, + + count() AS total_sessions, + countIf(engaged = 1) AS engaged_sessions, + sumIf(duration_seconds, engaged = 1) AS total_duration_engaged_seconds, + + now64(3, 'UTC') AS updated_at +FROM analytics.session_facts_latest +GROUP BY ( + customer_id, + environment, + site_id, + day, + browser_family); + + +/* ===================================================================================================== + 15) LANGUAGE BREAKDOWN ROLL-UP + ===================================================================================================== + + OBJECT TYPE + ----------------------------------------------------------------------------------------------------- + Table + + OBJECT NAME + ----------------------------------------------------------------------------------------------------- + analytics.sessions_by_language_daily + + PURPOSE + ----------------------------------------------------------------------------------------------------- + Daily distribution table by locale_id. + + NOTE + ----------------------------------------------------------------------------------------------------- + locale_id remains the raw dotCMS locale/language identifier. + If you later want user-friendly names, that translation can happen in SQL joins or in the service layer. 
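+
+    EXAMPLE (illustrative only — grouping with the '' (undefined) locale made explicit; a join to a
+    names table could replace the if()):
+
+        SELECT if(locale_id = '', 'undefined', locale_id) AS language,
+               sum(total_sessions)                        AS sessions
+        FROM analytics.sessions_by_language_daily
+        GROUP BY language
+        ORDER BY sessions DESC;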
+ +===================================================================================================== */ +CREATE TABLE IF NOT EXISTS analytics.sessions_by_language_daily +( + customer_id LowCardinality(String), + environment LowCardinality(String), + site_id String, + day Date, + + locale_id LowCardinality(String), -- dotCMS language Locale ID ('' means undefined) + + total_sessions UInt64, + engaged_sessions UInt64, + total_duration_engaged_seconds UInt64, + + updated_at DateTime64(3, 'UTC') +) + ENGINE = ReplicatedReplacingMergeTree(updated_at) + PARTITION BY toYYYYMM(day) + ORDER BY (customer_id, environment, site_id, day, locale_id); + + +/* ===================================================================================================== + 16) REFRESHABLE MV: session_facts_latest → sessions_by_language_daily + ===================================================================================================== */ +CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.sessions_by_language_daily_rmv +-- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! For DEV, use at least REFRESH EVERY 15 MINUTE +REFRESH EVERY 30 SECOND DEPENDS ON analytics.session_facts_latest_rmv + TO analytics.sessions_by_language_daily +AS +SELECT + customer_id, + environment, + site_id, + toDate(session_start, 'UTC') AS day, + locale_id, + + count() AS total_sessions, + countIf(engaged = 1) AS engaged_sessions, + sumIf(duration_seconds, engaged = 1) AS total_duration_engaged_seconds, + + now64(3, 'UTC') AS updated_at +FROM analytics.session_facts_latest +GROUP BY ( + customer_id, + environment, + site_id, + day, + locale_id); + + +/* ===================================================================================================== + QUERYING GUIDANCE + ===================================================================================================== + + RECOMMENDED TABLES FOR RAW SQL + ----------------------------------------------------------------------------------------------------- + + 1) Query analytics.session_facts_latest when you need: + - one row per session + - latest session metrics only + - session-level exploration/debugging + - session KPI calculations on the fly + + 2) Query analytics.engagement_daily when you need: + - KPI cards + - trends over time + - engagement/conversion/avg-interaction metrics over arbitrary date ranges + + 3) Query analytics.sessions_by_device_daily / browser / language when you need: + - dashboard distribution widgets + - grouped daily breakdowns + - top-N device/browser/language reports + + 4) Query analytics.session_facts only when you specifically need: + - historical row versions + - debugging of late-event recalculations + - low-level understanding of how session versions changed over time + + EXAMPLE MENTAL MODEL + ----------------------------------------------------------------------------------------------------- + + analytics.session_facts = durable version history + analytics.session_facts_latest = current one-row-per-session truth + analytics.engagement_daily = daily KPI roll-up + analytics.sessions_by_*_daily = daily grouped dashboard roll-ups + +===================================================================================================== */ \ No newline at end of file diff --git a/docker/docker-compose-examples/analytics/init/50-users.sql b/docker/docker-compose-examples/analytics/init/50-users.sql new file mode 100644 index 000000000000..97c4deb5f2a4 --- /dev/null +++ b/docker/docker-compose-examples/analytics/init/50-users.sql @@ -0,0 
+1,11 @@ +-- 1. Create the user +CREATE USER 'cust-001' IDENTIFIED BY 'abc' DEFAULT DATABASE analytics; +-- 2. Grant necessary privileges +GRANT SELECT ON analytics.* TO 'cust-001'; +-- 3. Create the row policy to filter by customer_id +CREATE ROW POLICY 'cust-001-policy' ON analytics.* USING customer_id='cust-001' TO 'cust-001'; + +--- 4. Allow from any host +ALTER USER 'cust-001' HOST ANY; +--- 5 - Grant Write permissions +GRANT INSERT ON analytics.events TO `cust-001`; \ No newline at end of file diff --git a/docker/docker-compose-examples/analytics/.env b/docker/docker-compose-examples/analytics/init/99-test-users.sql similarity index 100% rename from docker/docker-compose-examples/analytics/.env rename to docker/docker-compose-examples/analytics/init/99-test-users.sql diff --git a/docker/docker-compose-examples/experiments/README.md b/docker/docker-compose-examples/experiments/README.md new file mode 100644 index 000000000000..207a93412468 --- /dev/null +++ b/docker/docker-compose-examples/experiments/README.md @@ -0,0 +1,289 @@ +# dotCMS Experiments Complete Stack + +This docker-compose setup provides a complete dotCMS instance pre-configured with the full analytics stack including CubeJS, ClickHouse, Jitsu, and Keycloak. + +## Architecture Overview + +``` +┌─────────────┐ ┌──────────────────┐ ┌─────────────────┐ +│ dotCMS │────│ Analytics Stack │────│ Data Layer │ +│ │ │ │ │ │ +│ - dotCMS │ │ - Keycloak (IDP) │ │ - ClickHouse │ +│ - OpenSearch│ │ - Jitsu (Events) │ │ - PostgreSQL │ +│ - Database │ │ - Cube (Read) │ │ - Redis │ +└─────────────┘ │ - Configurator │ └─────────────────┘ + └──────────────────┘ +``` + +## Services and Ports + +### Core dotCMS Services +- **dotCMS**: http://localhost:8082 (HTTPS: 8443) +- **dotCMS Database**: PostgreSQL (internal only) +- **OpenSearch**: http://localhost:9200 (internal + external) +- **Glowroot**: http://localhost:4000 (monitoring) + +### Experiments Services +- **Keycloak (IDP)**: http://localhost:61111 +- **dotCMS Analytics Configurator**: http://localhost:8088 +- **Jitsu (Event Collection)**: http://localhost:8081 +- **CubeJS (Analytics Read)**: http://localhost:4001 +- **ClickHouse (Data Warehouse)**: http://localhost:8124 +- **Analytics Database**: PostgreSQL (internal only) + +## Pre-configured Experiments Settings + +The dotCMS instance is pre-configured with the following analytics settings via environment variables: + +### dotCMS Experiments URLs + +dotCMS reaches analytics services via `host.docker.internal`, routing through the host machine to the exposed container ports: + +```bash +DOT_ANALYTICS_IDP_URL="http://host.docker.internal:61111/realms/dotcms/protocol/openid-connect/token" +DOT_ANALYTICS_APP_CONFIG_URL="http://host.docker.internal:8088/c/customer1/cluster1/keys" +DOT_ANALYTICS_APP_WRITE_URL="http://host.docker.internal:8081/api/v1/s2s/event" +DOT_ANALYTICS_APP_READ_URL="http://host.docker.internal:4001" +``` + +### Client Configuration (customer1:cluster1) +```bash +DOT_ANALYTICS_APP_CLIENT_ID="analytics-customer-customer1" +DOT_ANALYTICS_APP_CLIENT_SECRET="testsecret" +Analytics Key: [Auto-generated by configurator] +``` + +## Getting Started + +### Quick Start Options + +Choose your startup method based on your needs: + +#### Option 1: Using the Startup Script (Recommended) +```bash +# Experiments services only (faster startup, less resources) +./start-experiments.sh --analytics-only + +# Full stack with dotCMS (complete development environment) +./start-experiments.sh + +# Force recreate containers (required for 
environment variable changes) +./start-experiments.sh --force-recreate +./start-experiments.sh --analytics-only --force-recreate + +# Show help and service details +./start-experiments.sh --help +``` + +#### Option 2: Using Docker Compose Directly +```bash +# Experiments services only +docker-compose up -d + +# Full stack with dotCMS +docker-compose --profile full up -d + +# Force recreate containers (for environment variable changes) +docker-compose up -d --force-recreate +docker-compose --profile full up -d --force-recreate + +# Stop everything (including dotCMS services) +docker-compose --profile full down +``` + +### Startup Modes + +**Experiments Only Mode** (`--experiments-only`): +- Faster startup and lower resource usage +- Includes: Keycloak, Analytics API, Jitsu, Cube, ClickHouse, Redis, PostgreSQL +- Best for: Analytics development, testing API integrations + +**Full Stack Mode** (Default): +- Complete development environment +- Includes: All analytics services + dotCMS + OpenSearch + dotCMS Database +- Best for: End-to-end testing, content + analytics workflows + +### Wait for Services + +```bash +# Check service health +docker-compose ps + +# Monitor startup logs +docker-compose logs -f keycloak dotcms-analytics + +# For full stack, monitor dotCMS startup +docker-compose logs -f dotcms +``` + +### Access Your Services + +**Experiments Services (Always Available):** +- **Keycloak Admin**: http://localhost:61111 (admin:keycloak) +- **Analytics API**: http://localhost:8088 +- **Cube Analytics**: http://localhost:4001 +- **Jitsu Events**: http://localhost:8081 +- **ClickHouse**: http://localhost:8124 + +**dotCMS Services (Full Stack Only):** +- **dotCMS**: http://localhost:8082 (admin@dotcms.com:admin) +- **Glowroot**: http://localhost:4000 + +### Verify Experiments Configuration (Full Stack) + +1. Access dotCMS at http://localhost:8082 +2. Navigate to: Apps → dotExperiments-config +3. Experiments should be pre-configured with the URLs above +4. Test connection to verify all services are communicating + +## Network Architecture + +### Networks +- **dotcms-net**: Isolated network for dotCMS core services (dotCMS, database, OpenSearch) +- **analytics-net**: Isolated network for analytics services (Keycloak, Jitsu, CubeJS, ClickHouse) +- **Bridge**: dotCMS connects to both networks to communicate with analytics services + +### Security +- Internal service communication uses container names (e.g., `keycloak:8080`) +- External access uses host ports (e.g., `localhost:61111`) +- Databases are isolated and only accessible within their respective networks +- Analytics uses JWT-based authentication with Keycloak + +## Environment Variables + +Key environment variables that can be customized: + +```bash +# Ports +KEYCLOAK_HOST_PORT=61111 +DOTCMS_ANALYTICS_HOST_PORT=8088 +JITSU_HOST_PORT=8081 +CUBE_HOST_PORT=4001 +CH_HOST_PORT=8124 +POSTGRESQL_HOST_PORT=54321 + +# Database +POSTGRESQL_DB=postgres +POSTGRESQL_USER=postgres +POSTGRESQL_PASS=postgres + +# ClickHouse +CH_DB=clickhouse_test_db +CH_USER=clickhouse_test_user +CH_PWD=clickhouse_password + +# Keycloak +KEYCLOAK_ADMIN=admin +KEYCLOAK_ADMIN_PASSWORD=keycloak + +# dotCMS Experiment Features +DOT_ENABLE_EXPERIMENTS_AUTO_JS_INJECTION=true +``` + +### ⚠️ Important: Environment Variable Changes + +**Environment variables are set when containers are first created and are NOT automatically updated when you restart services.** + +To apply changes to environment variables: + +1. 
**Stop and recreate containers:** + ```bash + docker-compose down + docker-compose up -d --force-recreate + ``` + +2. **Or use the startup script with force recreate:** + ```bash + ./start-experiments.sh --force-recreate + ./start-experiments.sh --experiments-only --force-recreate + ``` + +3. **For individual services:** + ```bash + docker-compose up -d --force-recreate [service-name] + ``` + +**Why this happens:** Docker containers bake environment variables into the container at creation time. Simply restarting (`docker-compose restart`) keeps the existing container with old environment variables. You must recreate the container to pick up new environment variables from the docker-compose.yml file. + +## Key Features + +### ✅ Complete Experiments Integration +- **Pre-configured dotCMS** with analytics URLs and client credentials +- **Defense-in-depth security** with multi-layer filtering +- **ClickHouse optimization** for customer partition elimination +- **JWT-based authentication** via Keycloak + +### ✅ Development Ready +- **Hot-reload** CubeJS schema changes via volume mounts +- **Debug logging** enabled for troubleshooting +- **Health checks** for all critical services +- **Glowroot monitoring** for performance analysis + +### ✅ Production Patterns +- **Separate databases** for dotCMS and analytics +- **Network isolation** between service layers +- **Persistent volumes** for data retention +- **Environment-based configuration** + +## Troubleshooting + +### Common Issues + +1. **Services not starting:** + ```bash + # Check logs + docker-compose logs [service-name] + + # Restart specific service + docker-compose restart [service-name] + ``` + +2. **Analytics connection issues:** + - Verify all services are running: `docker-compose ps` + - Check network connectivity: `docker-compose exec dotcms ping keycloak` + - Verify URLs in dotCMS analytics configuration + +3. **Permission issues:** + ```bash + # Fix volume permissions + sudo chown -R 1000:1000 ./setup/ + ``` + +4. **Port conflicts:** + - Modify port mappings in docker-compose.yml + - Update corresponding environment variables + +### Useful Commands + +```bash +# View service logs +docker-compose logs -f dotcms +docker-compose logs -f cube + +# Access service shells +docker-compose exec dotcms bash +docker-compose exec analytics-postgres psql -U postgres + +# Restart specific services +docker-compose restart dotcms keycloak + +# Clean restart +docker-compose down -v +docker-compose up -d +``` + +## Next Steps + +1. **Configure A/B Testing**: Set up experiments in dotCMS +2. **Create Dashboards**: Build analytics dashboards using CubeJS +3. **Monitor Performance**: Use Glowroot for application monitoring +4. **Scale Services**: Add replicas for high availability +5. 
**Production Hardening**: Implement proper secrets management and SSL certificates + +## Security Considerations + +- Change default passwords in production +- Use proper SSL certificates for external access +- Implement proper firewall rules +- Regular security updates for all services +- Monitor access logs and authentication attempts \ No newline at end of file diff --git a/docker/docker-compose-examples/experiments/docker-compose.yml b/docker/docker-compose-examples/experiments/docker-compose.yml new file mode 100644 index 000000000000..e1ebba54cfda --- /dev/null +++ b/docker/docker-compose-examples/experiments/docker-compose.yml @@ -0,0 +1,300 @@ +services: + + # Analytics Database (separate from dotCMS DB) + analytics-postgres: + container_name: analytics-postgres + image: pgvector/pgvector:pg18 + restart: unless-stopped + environment: + POSTGRES_DB: ${POSTGRESQL_DB:-postgres} + POSTGRES_USER: ${POSTGRESQL_USER:-postgres} + POSTGRES_PASSWORD: ${POSTGRESQL_PASS:-postgres} + ports: + - ${POSTGRESQL_HOST_PORT:-54321}:5432 + networks: + - analytics-net + volumes: + - analytics-db-data:/var/lib/postgresql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${POSTGRESQL_USER:-postgres} -d ${POSTGRESQL_DB:-postgres} -h localhost -p 5432"] + interval: 10s + timeout: 5s + retries: 5 + + # dotCMS Database + dotcms-db: + profiles: ["full"] + container_name: dotcms-db + image: pgvector/pgvector:pg18 + command: postgres -c 'max_connections=400' -c 'shared_buffers=128MB' + environment: + POSTGRES_USER: 'dotcmsdbuser' + POSTGRES_PASSWORD: 'password' + POSTGRES_DB: 'dotcms' + volumes: + - dotcms-db-data:/var/lib/postgresql + networks: + - dotcms-net + healthcheck: + test: ["CMD-SHELL", "pg_isready -U dotcmsdbuser -d dotcms -h localhost -p 5432"] + interval: 10s + timeout: 5s + retries: 5 + restart: unless-stopped + + # OpenSearch for dotCMS + opensearch: + profiles: ["full"] + container_name: opensearch + image: opensearchproject/opensearch:1 + environment: + cluster.name: "elastic-cluster" + discovery.type: "single-node" + bootstrap.memory_lock: "true" + OPENSEARCH_JAVA_OPTS: "-Xmx1G" + ulimits: + memlock: + soft: -1 + hard: -1 + nofile: + soft: 65536 + hard: 65536 + ports: + - "9200:9200" + - "9600:9600" + volumes: + - opensearch-data:/usr/share/opensearch/data + networks: + - dotcms-net + deploy: + resources: + limits: + cpus: "1.0" + memory: 2G + + keycloak: + container_name: keycloak + depends_on: + - analytics-postgres + environment: + DB_VENDOR: postgres + DB_ADDR: analytics-postgres + KEYCLOAK_ADMIN: ${KEYCLOAK_ADMIN:-admin} + KEYCLOAK_ADMIN_PASSWORD: ${KEYCLOAK_ADMIN_PASSWORD:-keycloak} + DB_DATABASE: ${POSTGRESQL_DB:-postgres} + DB_USER: ${POSTGRESQL_USER:-postgres} + DB_PASSWORD: ${POSTGRESQL_PASS:-postgres} + KC_HOSTNAME_STRICT_HTTPS: false + KC_HOSTNAME_STRICT: false + KC_HTTP_ENABLED: true + image: quay.io/keycloak/keycloak:${KEYCLOAK_VERSION:-18.0.2} + volumes: + - ./setup/config/dev/keycloak/test-realm.json:/opt/keycloak/data/import/example-realm.json + entrypoint: ["/opt/keycloak/bin/kc.sh", "start-dev", "--import-realm", "--hostname-strict-https=false", "--http-enabled=true"] + ports: + - "${KEYCLOAK_HOST_PORT:-61111}:8080" + networks: + - analytics-net + restart: always + + # dotCMS with Experiments Pre-configured + dotcms: + profiles: ["full"] + container_name: dotcms + image: dotcms/dotcms-test:1.0.0-SNAPSHOT + environment: + # dotCMS Core Configuration + CMS_JAVA_OPTS: '-Xmx1g' + LANG: 'C.UTF-8' + TZ: 'UTC' + DB_BASE_URL: "jdbc:postgresql://dotcms-db/dotcms" + DB_USERNAME: 
'dotcmsdbuser'
+      DB_PASSWORD: 'password'
+      DOT_ES_AUTH_BASIC_PASSWORD: 'admin'
+      DOT_ES_ENDPOINTS: 'https://opensearch:9200'
+      DOT_INITIAL_ADMIN_PASSWORD: 'admin'
+      DOT_DOTCMS_CLUSTER_ID: 'dotcms-analytics-cluster'
+      GLOWROOT_ENABLED: 'true'
+      GLOWROOT_WEB_UI_ENABLED: 'true'
+      DOT_ALLOW_ACCESS_TO_PRIVATE_SUBNETS: 'true'
+
+      # Experiments Configuration (Environment Variables Override)
+      # Internal URLs (container-to-container communication)
+      DOT_FEATURE_FLAG_EXPERIMENTS: "true"
+      DOT_ENABLE_EXPERIMENTS_AUTO_JS_INJECTION: "true"
+      DOT_FEATURE_FLAG_CONTENT_ANALYTICS_AUTO_INJECT: "true"
+      DOT_FEATURE_FLAG_CONTENT_ANALYTICS: "true"
+
+      # Experiments Client Configuration (customer1:cluster1)
+      DOT_ANALYTICS_IDP_URL: "http://host.docker.internal:61111/realms/dotcms/protocol/openid-connect/token"
+      DOT_ANALYTICS_APP_CLIENT_ID: "analytics-customer-customer1"
+      DOT_ANALYTICS_APP_CLIENT_SECRET: "testsecret"
+      DOT_ANALYTICS_APP_CONFIG_URL: "http://host.docker.internal:8088/c/customer1/cluster1/keys"
+      DOT_ANALYTICS_APP_WRITE_URL: "http://host.docker.internal:8081/api/v1/s2s/event"
+      DOT_ANALYTICS_APP_READ_URL: "http://host.docker.internal:4001"
+
+    depends_on:
+      dotcms-db:
+        condition: service_healthy
+      opensearch:
+        condition: service_started
+      keycloak:
+        condition: service_started
+    volumes:
+      - cms-shared:/data/shared
+    networks:
+      - dotcms-net
+      - analytics-net # Bridge to analytics network
+    ports:
+      - "8082:8082" # HTTP
+      - "8443:8443" # HTTPS
+      - "4000:4000" # Glowroot web UI
+
+  dotcms-analytics:
+    container_name: dotcms-analytics
+    image: ghcr.io/dotcms/internal-infrastructure/configurator:latest
+    environment:
+      #- JITSU_USE_CONFIGURATOR='true'
+      #- JITSU_JITSU_CONFIGURATOR=http://host.docker.internal:7007/
+      - JITSU_CLUSTER_ADMIN_TOKEN=myadmin
+      - JITSU_JITSU_SERVER=http://jitsu:8001/
+      ## For local development ONLY. This forces events to be immediately persisted
+      ## to ClickHouse. For PROD instances, this must be set to 'batch'
+      - JITSU_DESTINATIONS_CLICKHOUSE_MODE=stream
+      - QUARKUS_OIDC_AUTH_SERVER_URL=${AUTH_SERVER_URL:-http://keycloak:8080/realms/dotcms}
+      - QUARKUS_DATASOURCE_DB_KIND=postgresql
+      - QUARKUS_DATASOURCE_REACTIVE_URL=postgresql://analytics-postgres:5432/${POSTGRESQL_DB:-postgres}
+      - QUARKUS_DATASOURCE_USERNAME=${POSTGRESQL_USER:-postgres}
+      - QUARKUS_DATASOURCE_PASSWORD=${POSTGRESQL_PASS:-postgres}
+      - QUARKUS_HIBERNATE_ORM_DATABASE_GENERATION=drop-and-create
+      - QUARKUS_HIBERNATE_ORM_DATABASE_GENERATION_CREATE_SCHEMAS=true
+      - QUARKUS_SWAGGER_UI_ALWAYS_INCLUDE=true
+      - EXCLUDED_QUERY_PARAMS=${ANALYTICS_EXCLUDED_QUERY_PARAMS:-variantName,redirect}
+      ## Enable this for extended logging and troubleshooting
+      #- QUARKUS_LOG_LEVEL=DEBUG
+      - QUARKUS_PROFILE=prod
+      - QUARKUS_KUBERNETES_CONFIG_ENABLED=false
+      ## If not using the prod profile, the JWKS URL and issuer need to be set directly.
+ #- MP_JWT_VERIFY_PUBLICKEY_LOCATION=http://keycloak:8080/realms/dotcms/protocol/openid-connect/certs + #- MP_JWT_VERIFY_ISSUER=http://keycloak:8080/realms/dotcms + ## Or, for local testing only, you can set issuer to 'NONE' to not validate + #- MP_JWT_VERIFY_ISSUER=NONE + - ISSUER_URI=${ISSUER_URI:-NONE} + - JWKS_URI=${JWKS_URL:-http://keycloak:8080/realms/dotcms/protocol/openid-connect/certs} + - CLICKHOUSE_URL=http://${CH_USER:-clickhouse_test_user}:${CH_PWD:-clickhouse_password}@ch_server:8123 + ports: + - "${DOTCMS_ANALYTICS_HOST_PORT:-8088}:8080" + networks: + - analytics-net + depends_on: + - keycloak + - analytics-postgres + - jitsu + + jitsu: + container_name: jitsu + image: jitsucom/server:latest + environment: + - CLUSTER_ADMIN_TOKEN=myadmin + - REDIS_URL=redis://jitsu_redis:6379 + - JITSU_CONFIGURATOR_URL=${JITSU_CONFIGURATOR_URL:-http://dotcms-analytics:8080} + - SERVER_PORT=8001 + - TERM=xterm-256color + - TLS_SKIP_VERIFY=true + depends_on: + - redis + - ch_server + volumes: + - ./setup/config/dev/jitsu/server/config:/home/eventnative/data/config + restart: always + networks: + - analytics-net + ports: + - "${JITSU_HOST_PORT:-8081}:8001" + + redis: + container_name: jitsu_redis + image: redis:6.2.6-bullseye + volumes: + - redis-data:/data + networks: + - analytics-net + restart: always + healthcheck: + test: ["CMD-SHELL", "redis-cli -h localhost -p 6379 PING"] + interval: 1s + timeout: 30s + + cube: + container_name: cube + image: cubejs/cube:v1.6.11 + ports: + - ${CUBE_HOST_PORT:-4001}:4000 + environment: + - CUBEJS_DEV_MODE=true + - CUBEJS_DB_TYPE=clickhouse + - CUBEJS_DB_HOST=${CH_SERVER:-ch_server} + - CUBEJS_DB_NAME=${CH_DB:-clickhouse_test_db} + - CUBEJS_DB_USER=${CH_USER:-clickhouse_test_user} + - CUBEJS_DB_PASS=${CH_PWD:-clickhouse_password} + # Use internal network for JWT validation + - CUBEJS_JWK_URL=${JWKS_URL:-http://keycloak:8080/realms/dotcms/protocol/openid-connect/certs} + - CUBEJS_JWT_AUDIENCE=api-dotcms-analytics-audience + #- CUBEJS_JWT_ISSUER=${AUTH_SERVER_URL:-http://keycloak:8080/realms/dotcms} + - CUBEJS_JWT_ALGS=RS256 + - CUBEJS_JWT_CLAIMS_NAMESPACE=https://dotcms.com/analytics + - CUBEJS_LOG_LEVEL=trace + - CUBEJS_REFRESH_WORKER=true + - CUBEJS_SCHEDULED_REFRESH_TIME=true + - CUBEJS_OVERRIDE_CUSTOMER=customer1 + - CUBEJS_OVERRIDE_CLUSTER=cluster1 + volumes: + - cube_metastore:/cube/conf/.cubestore + - ./setup/config/dev/cube/schema:/cube/conf/schema + - ./setup/config/dev/cube/cube.js:/cube/conf/cube.js + networks: + - analytics-net + depends_on: + - ch_server + - keycloak + + ch_server: + container_name: ch_server + image: clickhouse/clickhouse-server:25.8 + ports: + - "${CH_HOST_PORT:-8124}:8123" + ulimits: + nofile: + soft: 262144 + hard: 262144 + healthcheck: + test: wget --no-verbose --tries=1 --spider http://localhost:8123 || exit 1 + environment: + - CLICKHOUSE_DB=${CH_DB:-clickhouse_test_db} + - CLICKHOUSE_USER=${CH_USER:-clickhouse_test_user} + - CLICKHOUSE_PASSWORD=${CH_PWD:-clickhouse_password} + volumes: + - ch_data:/var/lib/clickhouse + - ./setup/db/clickhouse/init-scripts:/docker-entrypoint-initdb.d + networks: + - analytics-net + +networks: + dotcms-net: + driver: bridge + analytics-net: + driver: bridge + +volumes: + # dotCMS volumes + cms-shared: + dotcms-db-data: + opensearch-data: + + # Analytics volumes + analytics-db-data: + ch_data: + redis-data: + redis_ur_data: + cube_metastore: + workspace: diff --git a/docker/docker-compose-examples/analytics/get-token.sh b/docker/docker-compose-examples/experiments/get-token.sh similarity 
index 97% rename from docker/docker-compose-examples/analytics/get-token.sh rename to docker/docker-compose-examples/experiments/get-token.sh index ea0bf41c4f23..5b0f21ed3157 100755 --- a/docker/docker-compose-examples/analytics/get-token.sh +++ b/docker/docker-compose-examples/experiments/get-token.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Keycloak JWT Token Generator for Analytics +# Keycloak JWT Token Generator for Experiments # # Usage: # ./get-token.sh # Use defaults @@ -27,7 +27,7 @@ OUTPUT_JSON=false DECODE_TOKEN=false show_help() { - echo "Keycloak JWT Token Generator for dotCMS Analytics" + echo "Keycloak JWT Token Generator for dotCMS Experiments" echo "" echo "Usage:" echo " ./get-token.sh Generate token with defaults" @@ -49,7 +49,7 @@ show_help() { echo " # Get token and copy to clipboard" echo " ./get-token.sh | pbcopy" echo "" - echo " # Use with Analytics API" + echo " # Use with Experiments API" echo " curl -H \"Authorization: Bearer \$(./get-token.sh)\" \\" echo " \"http://localhost:8088/c/customer1/cluster1/keys\"" echo "" @@ -195,7 +195,7 @@ else echo "" >&2 echo "💡 Usage examples:" >&2 - echo " # Analytics API:" >&2 + echo " # Experiments API:" >&2 echo " curl -H \"Authorization: Bearer TOKEN\" http://localhost:8088/c/customer1/cluster1/keys" >&2 echo "" >&2 echo " # CubeJS Security Context:" >&2 diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/clickhouse/clickhouse-issue-15638.xml b/docker/docker-compose-examples/experiments/setup/config/dev/clickhouse/clickhouse-issue-15638.xml similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/clickhouse/clickhouse-issue-15638.xml rename to docker/docker-compose-examples/experiments/setup/config/dev/clickhouse/clickhouse-issue-15638.xml diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/cube.js b/docker/docker-compose-examples/experiments/setup/config/dev/cube/cube.js similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/cube/cube.js rename to docker/docker-compose-examples/experiments/setup/config/dev/cube/cube.js diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/ContentAttribution.js b/docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/ContentAttribution.js similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/ContentAttribution.js rename to docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/ContentAttribution.js diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/Conversion.js b/docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/Conversion.js similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/Conversion.js rename to docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/Conversion.js diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EngagementDaily.js b/docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/EngagementDaily.js similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EngagementDaily.js rename to docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/EngagementDaily.js diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EventSummary.js 
b/docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/EventSummary.js similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/EventSummary.js rename to docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/EventSummary.js diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/Events.js b/docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/Events.js similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/Events.js rename to docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/Events.js diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/Request.js b/docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/Request.js similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/Request.js rename to docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/Request.js diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByBrowserDaily.js b/docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/SessionsByBrowserDaily.js similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByBrowserDaily.js rename to docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/SessionsByBrowserDaily.js diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByDeviceDaily.js b/docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/SessionsByDeviceDaily.js similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByDeviceDaily.js rename to docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/SessionsByDeviceDaily.js diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByLanguageDaily.js b/docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/SessionsByLanguageDaily.js similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByLanguageDaily.js rename to docker/docker-compose-examples/experiments/setup/config/dev/cube/schema/SessionsByLanguageDaily.js diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/jitsu/server/config/eventnative.yaml b/docker/docker-compose-examples/experiments/setup/config/dev/jitsu/server/config/eventnative.yaml similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/jitsu/server/config/eventnative.yaml rename to docker/docker-compose-examples/experiments/setup/config/dev/jitsu/server/config/eventnative.yaml diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/keycloak/keycloak-keystore.jks b/docker/docker-compose-examples/experiments/setup/config/dev/keycloak/keycloak-keystore.jks similarity index 100% rename from docker/docker-compose-examples/analytics/setup/config/dev/keycloak/keycloak-keystore.jks rename to docker/docker-compose-examples/experiments/setup/config/dev/keycloak/keycloak-keystore.jks diff --git a/docker/docker-compose-examples/analytics/setup/config/dev/keycloak/test-realm.json b/docker/docker-compose-examples/experiments/setup/config/dev/keycloak/test-realm.json similarity index 100% rename from 
docker/docker-compose-examples/analytics/setup/config/dev/keycloak/test-realm.json rename to docker/docker-compose-examples/experiments/setup/config/dev/keycloak/test-realm.json diff --git a/docker/docker-compose-examples/analytics/setup/db/clickhouse/init-scripts/init.sql b/docker/docker-compose-examples/experiments/setup/db/clickhouse/init-scripts/init.sql similarity index 100% rename from docker/docker-compose-examples/analytics/setup/db/clickhouse/init-scripts/init.sql rename to docker/docker-compose-examples/experiments/setup/db/clickhouse/init-scripts/init.sql diff --git a/docker/docker-compose-examples/analytics/setup/db/mssql/entrypoint.sh b/docker/docker-compose-examples/experiments/setup/db/mssql/entrypoint.sh similarity index 100% rename from docker/docker-compose-examples/analytics/setup/db/mssql/entrypoint.sh rename to docker/docker-compose-examples/experiments/setup/db/mssql/entrypoint.sh diff --git a/docker/docker-compose-examples/analytics/setup/db/mssql/init-scripts/init.sql b/docker/docker-compose-examples/experiments/setup/db/mssql/init-scripts/init.sql similarity index 100% rename from docker/docker-compose-examples/analytics/setup/db/mssql/init-scripts/init.sql rename to docker/docker-compose-examples/experiments/setup/db/mssql/init-scripts/init.sql diff --git a/docker/docker-compose-examples/analytics/setup/db/postgres/init-scripts/init-config.sh b/docker/docker-compose-examples/experiments/setup/db/postgres/init-scripts/init-config.sh similarity index 100% rename from docker/docker-compose-examples/analytics/setup/db/postgres/init-scripts/init-config.sh rename to docker/docker-compose-examples/experiments/setup/db/postgres/init-scripts/init-config.sh diff --git a/docker/docker-compose-examples/analytics/setup/db/postgres/init-scripts/init.sql b/docker/docker-compose-examples/experiments/setup/db/postgres/init-scripts/init.sql similarity index 100% rename from docker/docker-compose-examples/analytics/setup/db/postgres/init-scripts/init.sql rename to docker/docker-compose-examples/experiments/setup/db/postgres/init-scripts/init.sql diff --git a/docker/docker-compose-examples/analytics/start-analytics.sh b/docker/docker-compose-examples/experiments/start-experiments.sh similarity index 58% rename from docker/docker-compose-examples/analytics/start-analytics.sh rename to docker/docker-compose-examples/experiments/start-experiments.sh index 04fb47f2388a..f3a2cebf200b 100755 --- a/docker/docker-compose-examples/analytics/start-analytics.sh +++ b/docker/docker-compose-examples/experiments/start-experiments.sh @@ -1,31 +1,31 @@ #!/bin/bash -# Analytics Docker Compose Startup Script +# Experiments Docker Compose Startup Script # # Usage: -# ./start-analytics.sh # Full stack (default) -# ./start-analytics.sh --analytics-only # Analytics only -# ./start-analytics.sh --force-recreate # Full stack, recreate containers -# ./start-analytics.sh --help # Show this help +# ./start-experiments.sh # Full stack (default) +# ./start-experiments.sh --experiments-only # Experiments only +# ./start-experiments.sh --force-recreate # Full stack, recreate containers +# ./start-experiments.sh --help # Show this help set -e show_help() { - echo "Analytics Docker Compose Startup Script" + echo "Experiments Docker Compose Startup Script" echo "" echo "Usage:" - echo " ./start-analytics.sh Start full stack (default)" - echo " ./start-analytics.sh --analytics-only Start analytics services only" - echo " ./start-analytics.sh --force-recreate Start full stack, recreate containers" - echo " 
./start-analytics.sh --analytics-only --force-recreate Analytics only, recreate containers" - echo " ./start-analytics.sh --help Show this help message" + echo " ./start-experiments.sh Start full stack (default)" + echo " ./start-experiments.sh --experiments-only Start experiments services only" + echo " ./start-experiments.sh --force-recreate Start full stack, recreate containers" + echo " ./start-experiments.sh --experiments-only --force-recreate Experiments only, recreate containers" + echo " ./start-experiments.sh --help Show this help message" echo "" echo "Options:" echo " --force-recreate Force recreate containers (required for environment variable changes)" echo "" - echo "Analytics Services:" - echo " - PostgreSQL (analytics data)" + echo "Experiments Services:" + echo " - PostgreSQL (experiments data)" echo " - Keycloak (authentication)" - echo " - dotCMS Analytics API" + echo " - dotCMS Experiments API" echo " - Jitsu (event collection)" echo " - Redis (cache)" echo " - Cube (analytics queries)" @@ -41,7 +41,7 @@ show_help() { } # Parse arguments -ANALYTICS_ONLY=false +EXPERIMENTS_ONLY=false FORCE_RECREATE=false for arg in "$@"; do @@ -50,8 +50,8 @@ for arg in "$@"; do show_help exit 0 ;; - --analytics-only) - ANALYTICS_ONLY=true + --experiments-only) + EXPERIMENTS_ONLY=true ;; --force-recreate) FORCE_RECREATE=true @@ -66,7 +66,7 @@ done # Build docker-compose command COMPOSE_CMD="docker-compose" -if [[ "$ANALYTICS_ONLY" == "false" ]]; then +if [[ "$EXPERIMENTS_ONLY" == "false" ]]; then COMPOSE_CMD="$COMPOSE_CMD --profile full" fi @@ -76,19 +76,19 @@ if [[ "$FORCE_RECREATE" == "true" ]]; then fi # Execute the appropriate command -if [[ "$ANALYTICS_ONLY" == "true" ]]; then +if [[ "$EXPERIMENTS_ONLY" == "true" ]]; then if [[ "$FORCE_RECREATE" == "true" ]]; then - echo "📊 Starting analytics stack only (force recreating containers)..." + echo "📊 Starting Experiments stack only (force recreating containers)..." echo "⚠️ This will recreate containers to pick up environment variable changes." else - echo "📊 Starting analytics stack only..." + echo "📊 Starting Experiments stack only..." fi else if [[ "$FORCE_RECREATE" == "true" ]]; then - echo "🚀 Starting full analytics + dotCMS stack (force recreating containers)..." + echo "🚀 Starting full Experiments + dotCMS stack (force recreating containers)..." echo "⚠️ This will recreate containers to pick up environment variable changes." else - echo "🚀 Starting full analytics + dotCMS stack..." + echo "🚀 Starting full Experiments + dotCMS stack..." 
fi fi @@ -101,12 +101,12 @@ docker-compose ps --format "table {{.Name}}\t{{.Status}}\t{{.Ports}}" echo "" echo "🌐 Access URLs:" echo " - Keycloak Admin: http://localhost:61111 (admin:keycloak)" -echo " - Analytics API: http://localhost:8088" +echo " - Experiments API: http://localhost:8088" echo " - Cube Analytics: http://localhost:4001" echo " - Jitsu Events: http://localhost:8081" echo " - ClickHouse: http://localhost:8124" -if [[ "$1" != "--analytics-only" ]]; then +if [[ "$EXPERIMENTS_ONLY" == "false" ]]; then echo " - dotCMS: http://localhost:8082 (admin:admin)" echo " - Glowroot: http://localhost:4000" fi \ No newline at end of file From 9c2538401fb04268a57c7380f5b9611e968eb226 Mon Sep 17 00:00:00 2001 From: Jose Castro Date: Fri, 8 May 2026 10:41:53 -0600 Subject: [PATCH 2/5] Implementing code review feedback from Claude Code --- .../analytics/conf/toxiproxy/toxiproxy.json | 9 --------- .../analytics/init/99-test-users.sql | 0 .../experiments/docker-compose.yml | 1 - 3 files changed, 10 deletions(-) delete mode 100644 docker/docker-compose-examples/analytics/conf/toxiproxy/toxiproxy.json delete mode 100644 docker/docker-compose-examples/analytics/init/99-test-users.sql diff --git a/docker/docker-compose-examples/analytics/conf/toxiproxy/toxiproxy.json b/docker/docker-compose-examples/analytics/conf/toxiproxy/toxiproxy.json deleted file mode 100644 index 4a38107404c6..000000000000 --- a/docker/docker-compose-examples/analytics/conf/toxiproxy/toxiproxy.json +++ /dev/null @@ -1,9 +0,0 @@ -[ - { - "_comment": "Pre-configures the clickhouse proxy at container startup via the -config flag. Without this file, Toxiproxy starts with no proxies and the proxy would need to be created manually via the REST API (POST /proxies) before any traffic can flow. For the integration test (HungConnectionIT) this is required: the Spring datasource URL points at the proxy port (18123) before the context boots, so the proxy must exist the moment the container is healthy.
For the manual smoke-test stack (docker-compose.toxiproxy.yml) this file is not used — the proxy is created explicitly with a curl call after docker compose up.", - "name": "clickhouse", - "listen": "0.0.0.0:18123", - "upstream": "clickhouse-01:8123", - "enabled": true - } -] diff --git a/docker/docker-compose-examples/analytics/init/99-test-users.sql b/docker/docker-compose-examples/analytics/init/99-test-users.sql deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/docker/docker-compose-examples/experiments/docker-compose.yml b/docker/docker-compose-examples/experiments/docker-compose.yml index e1ebba54cfda..7558a65592ff 100644 --- a/docker/docker-compose-examples/experiments/docker-compose.yml +++ b/docker/docker-compose-examples/experiments/docker-compose.yml @@ -295,6 +295,5 @@ volumes: analytics-db-data: ch_data: redis-data: - redis_ur_data: cube_metastore: workspace: From 5551767624898e82f455b9cb42ae244f098da22d Mon Sep 17 00:00:00 2001 From: Jose Castro Date: Fri, 8 May 2026 10:50:15 -0600 Subject: [PATCH 3/5] Implementing code review feedback from Claude Code --- docker/docker-compose-examples/experiments/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/docker-compose-examples/experiments/README.md b/docker/docker-compose-examples/experiments/README.md index 207a93412468..46997fac3471 100644 --- a/docker/docker-compose-examples/experiments/README.md +++ b/docker/docker-compose-examples/experiments/README.md @@ -62,14 +62,14 @@ Choose your startup method based on your needs: #### Option 1: Using the Startup Script (Recommended) ```bash # Experiments services only (faster startup, less resources) -./start-experiments.sh --analytics-only +./start-experiments.sh --experiments-only # Full stack with dotCMS (complete development environment) ./start-experiments.sh # Force recreate containers (required for environment variable changes) ./start-experiments.sh --force-recreate -./start-experiments.sh --analytics-only --force-recreate +./start-experiments.sh --experiments-only --force-recreate # Show help and service details ./start-experiments.sh --help From f7d516717702dccb34529c6b425bb42287885252 Mon Sep 17 00:00:00 2001 From: Jose Castro Date: Fri, 8 May 2026 11:08:30 -0600 Subject: [PATCH 4/5] Implementing code review feedback from Claude Code --- docker/docker-compose-examples/analytics/README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docker/docker-compose-examples/analytics/README.md b/docker/docker-compose-examples/analytics/README.md index b2ca78fe5175..18dc30ce5cfb 100644 --- a/docker/docker-compose-examples/analytics/README.md +++ b/docker/docker-compose-examples/analytics/README.md @@ -138,9 +138,9 @@ analytics/ └── 50-users.sql # Default customer user (cust-001) ``` -> **Note:** `init/` is mounted on **clickhouse-01 only** (as `/docker-entrypoint-initdb.d`). DDL -> is replicated automatically to clickhouse-02 via the `Replicated` database engine — do not -> mount `init/` on both nodes or scripts will run twice. +> **Note:** `init/` is mounted on **both data nodes** (as `/docker-entrypoint-initdb.d`). All +> `CREATE` statements use `IF NOT EXISTS`, so when clickhouse-02 runs the same scripts it finds +> objects already replicated and skips them gracefully. --- @@ -187,9 +187,8 @@ app on port `8080`. ## Database Initialization -The `init/` scripts run in filename order on clickhouse-01's first start. 
Because the `analytics` -database uses the `Replicated` engine, all DDL is automatically propagated to clickhouse-02 — -**do not mount `init/` on both nodes**. +The `init/` scripts run in filename order on first start. Both nodes mount `init/` — all +`CREATE` statements use `IF NOT EXISTS`, so scripts are safe to run on both replicas. | Script | What it creates | |---|---| From 99536ceb1a63eb0e9cd15cecd1d1f0fb951af99d Mon Sep 17 00:00:00 2001 From: Jose Castro Date: Mon, 11 May 2026 09:17:09 -0600 Subject: [PATCH 5/5] Removing example from core project, and pointing to the existing one in the CA Event Manager repo --- .../analytics/README.md | 261 +--- .../analytics/conf/clickhouse-01/macros.xml | 14 - .../analytics/conf/clickhouse-02/macros.xml | 14 - .../analytics/conf/keeper/keeper_config.xml | 111 -- .../analytics/conf/users.xml | 14 - .../analytics/conf/zookeeper.xml | 53 - .../analytics/docker-compose.yml | 87 -- .../analytics/init/01-init.sql | 14 - .../analytics/init/10-global.sql | 105 -- .../analytics/init/20-event-data.sql | 122 -- .../analytics/init/30-conversion-data.sql | 167 --- .../init/40-session-engagement-data.sql | 1085 ----------------- .../analytics/init/50-users.sql | 11 - 13 files changed, 8 insertions(+), 2050 deletions(-) delete mode 100644 docker/docker-compose-examples/analytics/conf/clickhouse-01/macros.xml delete mode 100644 docker/docker-compose-examples/analytics/conf/clickhouse-02/macros.xml delete mode 100644 docker/docker-compose-examples/analytics/conf/keeper/keeper_config.xml delete mode 100644 docker/docker-compose-examples/analytics/conf/users.xml delete mode 100644 docker/docker-compose-examples/analytics/conf/zookeeper.xml delete mode 100644 docker/docker-compose-examples/analytics/docker-compose.yml delete mode 100644 docker/docker-compose-examples/analytics/init/01-init.sql delete mode 100644 docker/docker-compose-examples/analytics/init/10-global.sql delete mode 100644 docker/docker-compose-examples/analytics/init/20-event-data.sql delete mode 100644 docker/docker-compose-examples/analytics/init/30-conversion-data.sql delete mode 100644 docker/docker-compose-examples/analytics/init/40-session-engagement-data.sql delete mode 100644 docker/docker-compose-examples/analytics/init/50-users.sql diff --git a/docker/docker-compose-examples/analytics/README.md b/docker/docker-compose-examples/analytics/README.md index 18dc30ce5cfb..8e8f3921b268 100644 --- a/docker/docker-compose-examples/analytics/README.md +++ b/docker/docker-compose-examples/analytics/README.md @@ -1,256 +1,11 @@ -# Docker Setup +# Content Analytics Infrastructure -This directory contains the Docker Compose configuration and supporting files for running the -**dotCMS Content Analytics Event Manager** and its ClickHouse cluster locally. +The Docker Compose setup for the dotCMS Content Analytics infrastructure +(ClickHouse cluster + `ca-event-manager`) lives in its own repository to avoid +duplicating configuration files across repos: ---- +**https://github.com/dotCMS/dot-ca-event-manager** -## Table of Contents - -1. [Architecture Overview](#architecture-overview) -2. [Services](#services) - - [clickhouse-keeper](#clickhouse-keeper) - - [clickhouse-01 / clickhouse-02](#clickhouse-01--clickhouse-02) - - [ca-event-manager](#ca-event-manager) -3. [Directory Layout](#directory-layout) -4. [Running the Stack](#running-the-stack) - - [ClickHouse only (recommended for development)](#clickhouse-only-recommended-for-development) - - [Full stack](#full-stack) -5.
[Configuration Files](#configuration-files) -6. [Database Initialization](#database-initialization) -7. [Default Credentials](#default-credentials) -8. [Ports at a Glance](#ports-at-a-glance) -9. [Scaling Keeper to Production](#scaling-keeper-to-production) - ---- - -## Architecture Overview - -``` -┌──────────────────────────────────────────────────────────────┐ -│ Docker network │ -│ │ -│ ┌──────────────────┐ ┌──────────────────────────┐ │ -│ │ clickhouse-keeper│◄──────►│ clickhouse-01 │ │ -│ │ (Raft / coord) │ │ (data node, replica 1) │ │ -│ └──────────────────┘ │ HTTP :8123 TCP :9000 │ │ -│ ▲ └──────────────────────────┘ │ -│ │ ▲ │ -│ │ │ replication │ -│ │ ┌──────────────────────────┐ │ -│ └──────────────────►│ clickhouse-02 │ │ -│ │ (data node, replica 2) │ │ -│ │ HTTP :8124 TCP :9001 │ │ -│ └──────────────────────────┘ │ -│ │ -│ ┌──────────────────────────────────────┐ │ -│ │ ca-event-manager │ │ -│ │ Spring Boot app HTTP :8080 │ │ -│ │ connects to clickhouse-01:8123 │ │ -│ └──────────────────────────────────────┘ │ -└──────────────────────────────────────────────────────────────┘ -``` - -The two ClickHouse data nodes form a **single-shard, two-replica** cluster. `clickhouse-keeper` -provides Raft-based coordination (replication queues, DDL distribution, leader election). DDL -executed on either node is automatically propagated to the other because the `analytics` database -uses the `Replicated` engine. - ---- - -## Services - -### clickhouse-keeper - -| Property | Value | -|---|---| -| Image | `clickhouse/clickhouse-keeper:25.8` | -| Role | Raft coordination — replication queues, distributed DDL, merge leader election | -| Host port | `9181` (ZooKeeper-compatible client port) | -| Internal Raft port | `9234` (peer-to-peer, not exposed to host) | - -ClickHouse Keeper is a lightweight, built-in replacement for Apache ZooKeeper. It runs as a -**single-node Raft group** in this local setup — always the leader, no quorum required. -See [Scaling Keeper to Production](#scaling-keeper-to-production) for HA options. - -Both data nodes declare `depends_on: clickhouse-keeper (service_healthy)`, so starting either -data node automatically starts Keeper first. - ---- - -### clickhouse-01 / clickhouse-02 - -| Property | clickhouse-01 | clickhouse-02 | -|---|---|---| -| Image | `clickhouse/clickhouse-server:25.8` | `clickhouse/clickhouse-server:25.8` | -| Role | Data node, **replica 1** | Data node, **replica 2** | -| HTTP API (host) | `localhost:8123` | `localhost:8124` | -| Native TCP (host) | `localhost:9000` | `localhost:9001` | -| Shard / Replica macro | `shard1` / `replica1` | `shard1` / `replica2` | - -Both nodes share the same configuration (users, Keeper address, init SQL) except for their -`macros.xml`, which assigns the unique `{replica}` value used by `ReplicatedMergeTree` engine -paths. All tables use `Replicated*MergeTree` without explicit ZooKeeper paths — the `Replicated` -database engine manages paths automatically. - -The application connects to **clickhouse-01 only**. clickhouse-02 exists to verify replication -correctness in integration tests. - ---- - -### ca-event-manager - -| Property | Value | -|---|---| -| Image | `ghcr.io/dotcms/dot-ca-event-manager:latest` | -| Role | Spring Boot analytics API | -| Host port | `8080` | -| ClickHouse target | `clickhouse-01:8123` | - -The service is only included in the **full stack** (`docker-compose.yml`). 
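As a quick sanity check of the replication behavior described above, the same query can be issued against both replicas. This is a sketch only; it assumes the default host ports and the `admin`/`admin` user listed later in this README:

```bash
# Row counts on both replicas should converge once replication catches up.
echo "SELECT count() FROM analytics.events" | \
  curl -s --user admin:admin --data-binary @- http://localhost:8123/
echo "SELECT count() FROM analytics.events" | \
  curl -s --user admin:admin --data-binary @- http://localhost:8124/
```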
For day-to-day -development you typically run the app with `mvn spring-boot:run` on the host and start only the -ClickHouse containers. - ---- - -## Directory Layout - -``` -analytics/ -├── docker-compose.yml # Main compose file (full stack) -│ -├── conf/ -│ ├── keeper/ -│ │ └── keeper_config.xml # Keeper Raft config (single-node) -│ ├── clickhouse-01/ -│ │ └── macros.xml # {shard=shard1, replica=replica1} -│ ├── clickhouse-02/ -│ │ └── macros.xml # {shard=shard1, replica=replica2} -│ ├── users.xml # ClickHouse admin user definition -│ └── zookeeper.xml # Keeper endpoint for data nodes -│ -└── init/ # SQL files run by clickhouse-01 on first start - ├── 01-init.sql # CREATE DATABASE analytics (Replicated engine) - ├── 10-global.sql # Raw events table + data-skipping indexes - ├── 20-event-data.sql # Content analytics tables + materialized views - ├── 30-conversion-data.sql # Conversion attribution tables + MVs - ├── 40-session-engagement-data.sql # Session engagement pipeline tables + MVs - └── 50-users.sql # Default customer user (cust-001) -``` - -> **Note:** `init/` is mounted on **both data nodes** (as `/docker-entrypoint-initdb.d`). All -> `CREATE` statements use `IF NOT EXISTS`, so when clickhouse-02 runs the same scripts it finds -> objects already replicated and skips them gracefully. - ---- - -## Running the Stack - -### ClickHouse only (recommended for development) - -Starts Keeper and both data replicas. Run the application separately with `mvn spring-boot:run`. - -```bash -cd docker -docker compose up -d clickhouse-01 clickhouse-02 -``` - -Wait for both nodes to be healthy before starting the app: - -```bash -docker compose ps -``` - -### Full stack - -```bash -cd docker -docker compose up -``` - -Pulls `ghcr.io/dotcms/dot-ca-event-manager:latest` from GHCR, starts ClickHouse, and runs the -app on port `8080`. - ---- - -## Configuration Files - -| File | Purpose | -|---|---| -| `conf/keeper/keeper_config.xml` | Keeper Raft config: port 9181, single-node group, log/snapshot paths | -| `conf/zookeeper.xml` | Tells each data node where to find Keeper (`clickhouse-keeper:9181`) | -| `conf/clickhouse-01/macros.xml` | Node macros: `{shard}=shard1`, `{replica}=replica1` | -| `conf/clickhouse-02/macros.xml` | Node macros: `{shard}=shard1`, `{replica}=replica2` | -| `conf/users.xml` | Defines the `admin` user (password: `admin`, full access management) | - ---- - -## Database Initialization - -The `init/` scripts run in filename order on first start. Both nodes mount `init/` — all -`CREATE` statements use `IF NOT EXISTS`, so scripts are safe to run on both replicas. - -| Script | What it creates | -|---|---| -| `01-init.sql` | `analytics` database (`Replicated` engine), admin row policy | -| `10-global.sql` | `analytics.events` — raw event ingestion table (`ReplicatedMergeTree`) | -| `20-event-data.sql` | `content_events_counter` + `pageviews_by_device_browser_daily` + their materialized views | -| `30-conversion-data.sql` | `conversion_time`, `content_presents_in_conversion` + refreshable MV | -| `40-session-engagement-data.sql` | Full session engagement pipeline: `session_states` → `session_facts` → `session_facts_latest` → roll-up tables (`engagement_daily`, `sessions_by_device_daily`, `sessions_by_browser_daily`, `sessions_by_language_daily`) | -| `50-users.sql` | Creates `cust-001` with a row policy scoped to `customer_id='cust-001'` | - -> `CLICKHOUSE_DB` is intentionally **not set** in `docker-compose.yml`. 
Setting it would cause -> Docker's entrypoint to pre-create the database as a plain (non-replicated) engine before the -> init scripts run, making the `CREATE DATABASE … ENGINE = Replicated(…)` in `01-init.sql` a -> no-op. - ---- - -## Default Credentials - -| User | Password | Scope | -|---|---|---| -| `admin` | `admin` | Full access, all databases | -| `cust-001` | `abc` | `analytics` database, rows where `customer_id = 'cust-001'` | - -These are **local development defaults only**. All passwords must be rotated in any -non-development environment. - ---- - -## Ports at a Glance - -| Host port | Container | Protocol | Notes | -|---|---|---|---| -| `8123` | clickhouse-01 | HTTP | Primary ClickHouse HTTP API | -| `9000` | clickhouse-01 | TCP | ClickHouse native protocol | -| `8124` | clickhouse-02 | HTTP | Replica HTTP API (tests only) | -| `9001` | clickhouse-02 | TCP | Replica native protocol (tests only) | -| `9181` | clickhouse-keeper | TCP | ZooKeeper-compatible Keeper client port | -| `8080` | ca-event-manager | HTTP | Analytics REST API | - ---- - -## Scaling Keeper to Production - -The current single-node Keeper provides **no high availability**. If the Keeper container goes -down, the data nodes can still serve reads but cannot commit new inserts or run replicated DDL -until the connection is restored. - -For a fault-tolerant cluster, run an **odd number of Keeper nodes** (minimum 3): - -| Keeper nodes | Can lose | Quorum | -|---|---|---| -| 1 | 0 | 1 of 1 — no HA | -| 3 | 1 | 2 of 3 | -| 5 | 2 | 3 of 5 | - -Steps to expand: -1. Add `clickhouse-keeper-2` and `clickhouse-keeper-3` containers, each with its own - `keeper_config.xml` that has a unique `<server_id>` and all three servers listed in - `<raft_configuration>`. -2. Update `conf/zookeeper.xml` on every data node to list all three Keeper endpoints. -3. Restart the cluster. - -See the comments inside `conf/keeper/keeper_config.xml` and `conf/zookeeper.xml` for full -configuration examples. +Refer to the `docker/` directory in that repository for the full setup, +including the ClickHouse keeper, replica nodes, initialization scripts, and +the event manager service.
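For example, a minimal way to bring that stack up locally (a sketch; it assumes the compose file sits directly under `docker/`, so treat that repository's README as authoritative):

```bash
git clone https://github.com/dotCMS/dot-ca-event-manager.git
cd dot-ca-event-manager/docker
docker compose up -d
```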
\ No newline at end of file diff --git a/docker/docker-compose-examples/analytics/conf/clickhouse-01/macros.xml b/docker/docker-compose-examples/analytics/conf/clickhouse-01/macros.xml deleted file mode 100644 index 0c4d93db955e..000000000000 --- a/docker/docker-compose-examples/analytics/conf/clickhouse-01/macros.xml +++ /dev/null @@ -1,14 +0,0 @@ -<clickhouse> - <macros> - <shard>shard1</shard> - <replica>replica1</replica> - </macros> -</clickhouse> diff --git a/docker/docker-compose-examples/analytics/conf/clickhouse-02/macros.xml b/docker/docker-compose-examples/analytics/conf/clickhouse-02/macros.xml deleted file mode 100644 index f3db050bc343..000000000000 --- a/docker/docker-compose-examples/analytics/conf/clickhouse-02/macros.xml +++ /dev/null @@ -1,14 +0,0 @@ -<clickhouse> - <macros> - <shard>shard1</shard> - <replica>replica2</replica> - </macros> -</clickhouse> diff --git a/docker/docker-compose-examples/analytics/conf/keeper/keeper_config.xml b/docker/docker-compose-examples/analytics/conf/keeper/keeper_config.xml deleted file mode 100644 index f4621d05bed3..000000000000 --- a/docker/docker-compose-examples/analytics/conf/keeper/keeper_config.xml +++ /dev/null @@ -1,111 +0,0 @@ -<clickhouse> - <logger> - <level>information</level> - <console>1</console> - </logger> - <listen_host>0.0.0.0</listen_host> - <keeper_server> - <tcp_port>9181</tcp_port> - <server_id>1</server_id> - <log_storage_path>/var/lib/clickhouse-keeper/log</log_storage_path> - <snapshot_storage_path>/var/lib/clickhouse-keeper/snapshots</snapshot_storage_path> - <coordination_settings> - <operation_timeout_ms>10000</operation_timeout_ms> - <session_timeout_ms>30000</session_timeout_ms> - <raft_logs_level>information</raft_logs_level> - </coordination_settings> - <raft_configuration> - <server> - <id>1</id> - <hostname>clickhouse-keeper</hostname> - <port>9234</port> - </server> - </raft_configuration> - </keeper_server> -</clickhouse> diff --git a/docker/docker-compose-examples/analytics/conf/users.xml b/docker/docker-compose-examples/analytics/conf/users.xml deleted file mode 100644 index 7588f9f301d4..000000000000 --- a/docker/docker-compose-examples/analytics/conf/users.xml +++ /dev/null @@ -1,14 +0,0 @@ -<clickhouse> - <users> - <admin> - <password>admin</password> - <networks> - <ip>::/0</ip> - </networks> - <profile>default</profile> - <quota>default</quota> - <access_management>1</access_management> - <named_collection_control>1</named_collection_control> - </admin> - </users> -</clickhouse> \ No newline at end of file diff --git a/docker/docker-compose-examples/analytics/conf/zookeeper.xml b/docker/docker-compose-examples/analytics/conf/zookeeper.xml deleted file mode 100644 index b7688c905a1a..000000000000 --- a/docker/docker-compose-examples/analytics/conf/zookeeper.xml +++ /dev/null @@ -1,53 +0,0 @@ -<clickhouse> - <zookeeper> - <node> - <host>clickhouse-keeper</host> - <port>9181</port> - </node> - </zookeeper> -</clickhouse> diff --git a/docker/docker-compose-examples/analytics/docker-compose.yml b/docker/docker-compose-examples/analytics/docker-compose.yml deleted file mode 100644 index 50714a0a2ac6..000000000000 --- a/docker/docker-compose-examples/analytics/docker-compose.yml +++ /dev/null @@ -1,87 +0,0 @@ -name: dotcms-analytics - -volumes: - clickhouse-01-data: - clickhouse-02-data: - clickhouse-keeper-data: - -services: - clickhouse-keeper: - image: clickhouse/clickhouse-keeper:25.8 - container_name: clickhouse-keeper - ports: - - "9181:9181" - volumes: - - ./conf/keeper/keeper_config.xml:/etc/clickhouse-keeper/keeper_config.xml - - clickhouse-keeper-data:/var/lib/clickhouse-keeper - healthcheck: - test: [ "CMD-SHELL", "clickhouse-keeper-client -h 127.0.0.1 -p 9181 -q 'ruok' 2>/dev/null | grep -q imok" ] - interval: 5s - timeout: 3s - retries: 30 - - clickhouse-01: - image: clickhouse/clickhouse-server:25.8 - container_name: clickhouse-01 - ports: - - "8123:8123" - - "9000:9000" - volumes: - - ./conf/users.xml:/etc/clickhouse-server/users.d/users.xml - - ./conf/zookeeper.xml:/etc/clickhouse-server/config.d/zookeeper.xml - - ./conf/clickhouse-01/macros.xml:/etc/clickhouse-server/config.d/macros.xml - - ./init:/docker-entrypoint-initdb.d - - clickhouse-01-data:/var/lib/clickhouse - ulimits: - nofile: - soft: 262144 - hard: 262144 - depends_on: - clickhouse-keeper: - condition: service_healthy - healthcheck: - test: [ "CMD-SHELL", "wget -qO- http://localhost:8123/ping | grep -q Ok" ] - interval: 5s - timeout: 3s - retries: 30 - - clickhouse-02: - image:
clickhouse/clickhouse-server:25.8 - container_name: clickhouse-02 - ports: - - "8124:8123" - - "9001:9000" - volumes: - - ./conf/users.xml:/etc/clickhouse-server/users.d/users.xml - - ./conf/zookeeper.xml:/etc/clickhouse-server/config.d/zookeeper.xml - - ./conf/clickhouse-02/macros.xml:/etc/clickhouse-server/config.d/macros.xml - - ./init:/docker-entrypoint-initdb.d - - clickhouse-02-data:/var/lib/clickhouse - ulimits: - nofile: - soft: 262144 - hard: 262144 - depends_on: - clickhouse-keeper: - condition: service_healthy - healthcheck: - test: [ "CMD-SHELL", "wget -qO- http://localhost:8123/ping | grep -q Ok" ] - interval: 5s - timeout: 3s - retries: 30 - - ca-event-manager: - image: ghcr.io/dotcms/dot-ca-event-manager:latest - container_name: ca-event-manager - environment: - - SPRING_PROFILES_ACTIVE=dev - - ANALYTICS_ASYNC_INSERT_TIMEOUT_MS=200 - - ANALYTICS_ASYNC_INSERT_MAX_DATA_SIZE=104857600 - - CLICKHOUSE_URL=jdbc:clickhouse://clickhouse-01:8123/analytics?async_insert=1&wait_for_async_insert=1&async_insert_busy_timeout_ms=${ANALYTICS_ASYNC_INSERT_TIMEOUT_MS:-200}&async_insert_max_data_size=${ANALYTICS_ASYNC_INSERT_MAX_DATA_SIZE:-104857600} - - ANALYTICS_CH_PROBE_USR=${ANALYTICS_CH_PROBE_USR:-admin} - - ANALYTICS_CH_PROBE_PWD=${ANALYTICS_CH_PROBE_PWD:-admin} - ports: - - "8082:8080" - depends_on: - clickhouse-01: - condition: service_healthy diff --git a/docker/docker-compose-examples/analytics/init/01-init.sql b/docker/docker-compose-examples/analytics/init/01-init.sql deleted file mode 100644 index 6207f1c41bcb..000000000000 --- a/docker/docker-compose-examples/analytics/init/01-init.sql +++ /dev/null @@ -1,14 +0,0 @@ --- ===================================================================== --- Database creation and initialization --- ===================================================================== --- Replicated database: replicates DDL (CREATE/ALTER/DROP) across all nodes via Keeper, --- and coordinates refreshable MV execution so only one replica runs each refresh cycle. -CREATE DATABASE IF NOT EXISTS analytics - ENGINE = Replicated('/clickhouse/databases/analytics', '{shard}', '{replica}'); -USE analytics; - -CREATE ROW POLICY IF NOT EXISTS rp_admin_user -ON analytics.* -FOR SELECT - USING customer_id = 'customer1' - AND environment = 'cluster1'; \ No newline at end of file diff --git a/docker/docker-compose-examples/analytics/init/10-global.sql b/docker/docker-compose-examples/analytics/init/10-global.sql deleted file mode 100644 index 91c552125ad8..000000000000 --- a/docker/docker-compose-examples/analytics/init/10-global.sql +++ /dev/null @@ -1,105 +0,0 @@ --- ===================================================================== --- This is the raw event ingestion table. 
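-- For orientation: combined with the rp_admin_user row policy from 01-init.sql, a plain
--
--   SELECT count() FROM analytics.events;
--
-- already returns tenant-scoped rows (customer_id = 'customer1' AND environment = 'cluster1')
-- for the users that policy is attached to; callers never add the tenant WHERE clause themselves.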
--- ===================================================================== -CREATE TABLE IF NOT EXISTS analytics.events -( - -- ###################################################### - -- General Event Properties - -- ###################################################### - timestamp DateTime64(3, 'UTC') CODEC(DoubleDelta, ZSTD(3)), - event_time DateTime64(3, 'UTC') CODEC(DoubleDelta, ZSTD(3)), - event_type LowCardinality(String), - environment LowCardinality(String), - customer_id LowCardinality(String), - - - -- ###################################################### - -- URL Properties - -- ###################################################### - url String, - page_title String, - site_id String, - doc_host String, - doc_path String, - doc_search String, - doc_encoding LowCardinality(String), - doc_hash Nullable(String), - doc_protocol LowCardinality(String), - referer String CODEC(ZSTD(3)), - - - -- ###################################################### - -- Browser Properties - -- ###################################################### - user_agent String, - -- Raw parsed UA fields (set by Java at ingest time via uap-java) - parsed_ua_device_family LowCardinality(String), - parsed_ua_os_family LowCardinality(String), - parsed_ua_ua_family LowCardinality(String), - -- Derived bucketed categories (set by Java at event ingestion time via in-memory lookup) - device_category LowCardinality(String) DEFAULT '', - browser_family LowCardinality(String) DEFAULT '', - screen_resolution String, - viewport_size String, - viewport_height String, - viewport_width String, - browser_language LowCardinality(String), - locale_id LowCardinality(String) DEFAULT '', - user_id String CODEC(ZSTD(3)), - session_id String CODEC(ZSTD(3)), - - - -- ###################################################### - -- Analytics Tool Properties - -- ###################################################### - utm_campaign LowCardinality(String), - utm_medium LowCardinality(String), - utm_source LowCardinality(String), - utm_term Nullable(String), - utm_content Nullable(String), - - - -- ###################################################### - -- Used in content_impression events - -- ###################################################### - content_identifier Nullable(String) CODEC(ZSTD(3)), - content_inode Nullable(String) CODEC(ZSTD(3)), - content_title Nullable(String), - content_content_type Nullable(String), - position_viewport_offset_pct Nullable(Int16), - position_dom_index Nullable(Int8), - - - -- ###################################################### - -- Used in content_click events - -- ###################################################### - dom_element_text Nullable(String), - dom_element_type Nullable(String), - dom_element_id Nullable(String), - dom_element_class Nullable(String), - dom_element_attributes Nullable(String), - - - -- ###################################################### - -- Used in conversion events - -- ###################################################### - conversion_name String, - - - -- ###################################################### - -- Data skipping indexes - -- ###################################################### - INDEX idx_event_time event_time TYPE minmax GRANULARITY 1, - INDEX idx_environment environment TYPE bloom_filter GRANULARITY 64, - INDEX idx_customer_id customer_id TYPE bloom_filter GRANULARITY 64, - INDEX idx_event_type event_type TYPE set(100) GRANULARITY 1, - INDEX idx_conversion conversion_name TYPE set(100) GRANULARITY 1, - INDEX idx_user_id user_id TYPE 
bloom_filter GRANULARITY 64, - INDEX idx_content_identifier content_identifier TYPE bloom_filter GRANULARITY 64, - INDEX idx_device_category device_category TYPE set(50) GRANULARITY 1, - INDEX idx_browser_family browser_family TYPE set(50) GRANULARITY 1 - -) Engine = ReplicatedMergeTree() - PARTITION BY customer_id - ORDER BY (timestamp, customer_id) - SETTINGS index_granularity = 8192; diff --git a/docker/docker-compose-examples/analytics/init/20-event-data.sql b/docker/docker-compose-examples/analytics/init/20-event-data.sql deleted file mode 100644 index 47ca9ff8742d..000000000000 --- a/docker/docker-compose-examples/analytics/init/20-event-data.sql +++ /dev/null @@ -1,122 +0,0 @@ --- ===================================================================== --- Stores daily aggregated counts of events per: --- --- day --- environment --- customer_id --- event_type --- user_id --- identifier (URL or content_id) --- title - --- Why SummingMergeTree? - --- Because the MV inserts pre-aggregated rows, and daily_total is summed on merge. - ---This allows: ---fast incremental updates ---easy "daily counts" reporting ---low storage overhead --- ===================================================================== - -CREATE TABLE IF NOT EXISTS analytics.content_events_counter -( - day Date, - - environment LowCardinality(String), - customer_id LowCardinality(String), - - site_id String, - - event_type LowCardinality(String), - user_id String CODEC(ZSTD(3)), - - identifier String CODEC(ZSTD(3)), - title String, - - daily_total UInt64 -) - ENGINE = ReplicatedSummingMergeTree(daily_total) -PARTITION BY (customer_id, environment, toYYYYMM(day)) -ORDER BY (customer_id, environment, user_id, day, identifier, title, event_type); - - --- ===================================================================== --- Transforms raw events into daily activity counters. --- For every event inserted into events, it computes: --- --- day → start-of-day from event_time --- identifier → URL for pageview, content_identifier otherwise --- title → page_title or content_title --- --- Then groups by: --- customer_id, environment, user_id, day, identifier, title, event_type --- --- And inserts: --- --- count(*) AS daily_total --- ===================================================================== - -CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.content_events_counter_mv TO analytics.content_events_counter AS -SELECT customer_id, - environment, - event_type, - user_id, - site_id, - toStartOfDay(event_time) as day, - (CASE - WHEN event_type = 'pageview' THEN doc_path - WHEN event_type = 'conversion' THEN conversion_name - ELSE content_identifier - END) as identifier, - (CASE - WHEN event_type = 'pageview' THEN page_title - WHEN event_type = 'conversion' THEN conversion_name - ELSE content_title - END) as title, - count(*) as daily_total -FROM analytics.events -GROUP BY customer_id, environment, user_id, day, identifier, title, event_type, site_id; - - --- ===================================================================== --- Stores daily pre-aggregated pageview counts grouped by device category and browser family. --- --- Why SummingMergeTree? --- The same reasoning as content_events_counter: the MV inserts pre-aggregated rows, --- and pageview_count is summed on merge. Allows fast, scalable reads for the --- "Pageviews by Device & Browser" dashboard metric. --- --- Java sets device_category and browser_family at ingestion time (UA parsing). 
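-- For reference, reads against this roll-up must still aggregate, because SummingMergeTree
-- only collapses rows at merge time. A typical widget query (a sketch, using the default
-- tenant values from 01-init.sql):
--
--   SELECT device_category, browser_family, sum(pageview_count) AS pageviews
--   FROM analytics.pageviews_by_device_browser_daily
--   WHERE customer_id = 'customer1' AND environment = 'cluster1'
--     AND day >= today() - 30
--   GROUP BY device_category, browser_family;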
--- The MV normalizes empty strings (pre-enrichment historical events) to --- 'unknown' / 'unknown' so the table never stores blanks. --- --- ===================================================================== -CREATE TABLE IF NOT EXISTS analytics.pageviews_by_device_browser_daily -( - day Date, - customer_id LowCardinality(String), - environment LowCardinality(String), - site_id String, - device_category LowCardinality(String), - browser_family LowCardinality(String), - pageview_count UInt64 -) - ENGINE = ReplicatedSummingMergeTree(pageview_count) - PARTITION BY (customer_id, environment, toYYYYMM(day)) - ORDER BY (customer_id, environment, site_id, day, device_category, browser_family); - - -CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.pageviews_by_device_browser_daily_mv - TO analytics.pageviews_by_device_browser_daily AS -SELECT - customer_id, - environment, - site_id, - toStartOfDay(event_time) AS day, - if(device_category = '', 'unknown', device_category) AS device_category, - if(browser_family = '', 'unknown', browser_family) AS browser_family, - count(*) AS pageview_count -FROM analytics.events -WHERE event_type = 'pageview' -GROUP BY customer_id, environment, site_id, day, device_category, browser_family; diff --git a/docker/docker-compose-examples/analytics/init/30-conversion-data.sql b/docker/docker-compose-examples/analytics/init/30-conversion-data.sql deleted file mode 100644 index 183397b7994f..000000000000 --- a/docker/docker-compose-examples/analytics/init/30-conversion-data.sql +++ /dev/null @@ -1,167 +0,0 @@ --- ===================================================================== --- Stores the latest known conversion timestamp per user, but in aggregate function format. --- Two aggregated fields: --- --- conversion_last_time → last conversion event time --- --- timestamp_last_time → last processed timestamp inside content_presents_in_conversion --- --- Why AggregatingMergeTree? --- --- Because conversion_time_mv inserts aggregate states (maxState) and later merges them. --- This table provides a "boundary" so that future incremental batches don't reprocess old records. 
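-- Because both columns hold aggregate states, reading the boundary back requires the
-- matching -Merge combinators, e.g. (sketch):
--
--   SELECT customer_id, environment, site_id, user_id,
--          maxMerge(conversion_last_time) AS last_conversion_time,
--          maxMerge(timestamp_last_time)  AS last_processed_timestamp
--   FROM analytics.conversion_time
--   GROUP BY customer_id, environment, site_id, user_id;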
--- ===================================================================== -CREATE TABLE IF NOT EXISTS analytics.conversion_time -( - environment LowCardinality(String), - customer_id LowCardinality(String), - - site_id String, - user_id String CODEC(ZSTD(3)), - - conversion_last_time AggregateFunction( max, DateTime64(3, 'UTC')), - timestamp_last_time AggregateFunction( max, DateTime64(3, 'UTC')) -) - ENGINE = ReplicatedAggregatingMergeTree() -PARTITION BY (customer_id, environment) -ORDER BY (customer_id, environment, user_id); - - - --- ===================================================================== --- Tracks which content a user interacted with prior to a conversion and after the user's previous conversion --- ===================================================================== -CREATE TABLE IF NOT EXISTS analytics.content_presents_in_conversion -( - day Date, - last_timestamp DateTime64(3, 'UTC'), - last_conversion_time DateTime64(3, 'UTC'), - - environment LowCardinality(String), - customer_id LowCardinality(String), - - site_id String, - - event_type LowCardinality(String), - user_id String CODEC(ZSTD(3)), - - identifier String CODEC(ZSTD(3)), - title String, - - conversion_name String, - conversion_count UInt32, - events_count UInt32 -) - ENGINE = ReplicatedSummingMergeTree() -PARTITION BY (customer_id, environment, toYYYYMM(day)) -ORDER BY (customer_id, environment, user_id, event_type, conversion_name, identifier, title, day); - - - --- ===================================================================== --- It does: --- --- Identifies new conversions since last refresh --- Locates content seen by the user right before each conversion --- Inserts attribution rows into content_presents_in_conversion --- --- How it works (step-by-step) --- A) Define conversion CTE --- For each conversion event: --- Joins against conversion_time to get the previous batch's last timestamps --- Uses lag() to find previous conversion in current batch --- --- Calculates: --- previous_conversion_timestamp = max(previous_timestamp_current_batch, last_timestamp_previous_batch) --- --- Filters conversions that are: --- --- new (timestamp > last_timestamp_previous_batch) --- recent (timestamp <= now()) --- --- This ensures incremental processing, no duplicates. --- --- B) Join events leading to conversion --- --- Matches events where: --- --- e.event_time < conversion.conversion_time --- e.event_time > conversion.conversion_last_time --- event_type <> 'conversion' - --- Meaning: --- --- Only consider events between the previous conversion timestamp and this conversion timestamp. --- --- C) Group and insert --- --- Inserts rows summarizing content presence before the conversion. --- ===================================================================== -CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.content_presents_in_conversion_mv --- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! 
For DEV, use at least REFRESH EVERY 15 MINUTE -REFRESH EVERY 30 SECOND - APPEND TO analytics.content_presents_in_conversion AS -WITH conversion AS ( - SELECT user_id, - event_time AS conversion_time, - timestamp, - maxMerge(conversion_time.timestamp_last_time) as last_timestamp_previous_batch, - maxMerge(conversion_time.conversion_last_time) as conversion_last_time, - e.conversion_name, - lag(timestamp, 1) OVER ( - PARTITION BY user_id - ORDER BY timestamp - ) AS previous_timestamp_current_batch, - lag(event_time, 1) OVER ( - PARTITION BY user_id - ORDER BY event_time - ) AS previous_event_time_current_batch, - (CASE WHEN previous_event_time_current_batch > conversion_last_time THEN previous_event_time_current_batch ELSE conversion_last_time END) as previous_conversion_time - FROM analytics.events as e - LEFT JOIN analytics.conversion_time on e.customer_id = conversion_time.customer_id AND e.environment = conversion_time.environment AND - e.user_id = conversion_time.user_id AND e.site_id = conversion_time.site_id - WHERE event_type = 'conversion' - group by user_id,event_time, timestamp, conversion_name - HAVING (timestamp >= last_timestamp_previous_batch AND timestamp <= now()) -) -SELECT - toStartOfDay(conversion.conversion_time) as day, - customer_id, - environment, - (CASE WHEN event_type = 'pageview' THEN doc_path ELSE content_identifier END) as identifier, - (CASE WHEN event_type = 'pageview' THEN page_title ELSE content_title END) as title, - event_type, - user_id, - site_id, - conversion.conversion_name as conversion_name, - count(*) AS events_count, - count(DISTINCT conversion_time) AS conversion_count, - max(conversion.timestamp) as last_timestamp, - max(conversion.conversion_time) as last_conversion_time -FROM analytics.events e - INNER JOIN conversion ON e.user_id = conversion.user_id AND - e.event_time < conversion.conversion_time AND - e.event_time > conversion.previous_conversion_time AND - event_type <> 'conversion' -GROUP BY customer_id, environment, identifier, title, event_type, user_id, conversion.conversion_name, day, site_id; - - - --- ===================================================================== --- Updates the conversion_time table using the output of content_presents_in_conversion. 
Every time new attribution rows are emitted --- --- This ensures: --- --- Next execution of the refreshable MV knows where the last batch ended --- --- Prevents reprocessing or double counting --- ===================================================================== -CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.conversion_time_mv TO analytics.conversion_time AS -SELECT customer_id, - environment, - user_id, - site_id, - maxState(last_timestamp) as timestamp_last_time, - maxState(last_conversion_time) as conversion_last_time -FROM analytics.content_presents_in_conversion -GROUP BY customer_id, environment, user_id, site_id; \ No newline at end of file diff --git a/docker/docker-compose-examples/analytics/init/40-session-engagement-data.sql b/docker/docker-compose-examples/analytics/init/40-session-engagement-data.sql deleted file mode 100644 index 55d6031b24fd..000000000000 --- a/docker/docker-compose-examples/analytics/init/40-session-engagement-data.sql +++ /dev/null @@ -1,1085 +0,0 @@ -/* ===================================================================================================== - dotCMS Content Analytics - Session Engagement Pipeline - ===================================================================================================== - - OVERVIEW - ----------------------------------------------------------------------------------------------------- - - This script defines the complete session-engagement pipeline used to compute GA4-style engagement - metrics for dotCMS Content Analytics while keeping the architecture scalable, explicit, and easy to - reason about. - - This version assumes: - - 1) session_id is a REAL browser session identifier - - sessions are short-lived - - sessions rotate normally - - session_id is NOT a long-lived user identity - - 2) late-arriving events are still possible - - network retries - - collector delays - - buffering - - eventual ingestion into ClickHouse - - 3) ALL historical session data must be kept - - TTL must be defined when ready for production - - no dropping older sessions - - the late-event window is only for recomputation - - 4) downstream consumers will use RAW SQL - - no semantic modeling layer on top of ClickHouse - - the service layer / API can directly query the roll-up tables - - ----------------------------------------------------------------------------------------------------- - HIGH-LEVEL PIPELINE - ----------------------------------------------------------------------------------------------------- - - events (raw immutable event stream) - ↓ real-time MV - session_states (incremental mergeable session states) - ↓ refreshable MV APPEND - session_facts (full historical session table, versioned) - ↓ refreshable MV - session_facts_latest (latest effective row per session) - ↓ refreshable MVs - engagement_daily + sessions_by_*_daily - ↓ - raw SQL queries / API / Angular dashboard - - ----------------------------------------------------------------------------------------------------- - WHY THIS SHAPE? - ----------------------------------------------------------------------------------------------------- - - We want to solve two competing needs: - - A) Keep ALL session history forever - B) Still reprocess recent sessions to absorb late-arriving events - - A naive design would overwrite `session_facts` with only the recent sliding window, but that would make - older sessions disappear from the table. 
- - Instead, this design works like this: - - - `session_states` continuously accumulates mergeable event states - - `session_facts_rmv` recalculates ONLY recent sessions and APPENDS a newer version of the row into - `session_facts` - - `session_facts` therefore becomes a versioned historical store - - `session_facts_latest_rmv` deduplicates `session_facts` into exactly one latest row per session key - - all dashboard roll-ups read from `session_facts_latest`, so they do not need the `FINAL` keyword in - the `SELECT` queries used to read data from it. - - ----------------------------------------------------------------------------------------------------- - IMPORTANT CONCEPTS - ----------------------------------------------------------------------------------------------------- - - 1) "Sliding window" DOES NOT mean data retention - The sliding window only determines which sessions are recalculated for late-event correction. - - 2) `session_facts` keeps full history - Old sessions remain stored forever unless you later add a TTL or retention job. - - 3) `session_facts_latest` is the "current truth" layer - It contains the latest effective version of each session and is the recommended source for roll-ups - and direct SQL queries that need one row per session. - - 4) This script is optimized for correctness and clarity first - It is already production-friendly in shape, but you can later tune refresh frequencies, partitions, - and roll-up scopes after observing real data volume and ingestion lag. - -===================================================================================================== */ - - -/* ===================================================================================================== - PIPELINE DIAGRAM - ===================================================================================================== - - ┌───────────────────────────────────────────────┐ - │ Browser / Site │ - │ │ - │ pageview | content_click | conversion | ... 
│ - │ │ - └──────────────────────┬────────────────────────┘ - │ - ▼ - ┌───────────────────────────────────────────────┐ - │ events (MergeTree) │ - │ │ - │ - one row per event │ - │ - raw immutable ingestion stream │ - │ - device_category / browser_family set by │ - │ Java at ingest time (UA parsing) │ - │ │ - └──────────────────────┬────────────────────────┘ - │ (incremental MV) - ▼ - ┌───────────────────────────────────────────────┐ - │ session_states (AggregatingMT) │ - │ │ - │ - mergeable per-session states │ - │ - late events naturally merge in │ - │ │ - └──────────────────────┬────────────────────────┘ - │ (refreshable MV APPEND) - ▼ - ┌───────────────────────────────────────────────┐ - │ session_facts (ReplacingMT) │ - │ │ - │ - full historical session store │ - │ - newer versions appended for recent rows │ - │ │ - └──────────────────────┬────────────────────────┘ - │ (refreshable MV) - ▼ - ┌───────────────────────────────────────────────┐ - │ session_facts_latest (ReplacingMT) │ - │ │ - │ - exactly one latest row per session │ - │ - source of truth for roll-ups │ - │ │ - └────────────┬────────────────────────┬─────────┘ - │ │ - ▼ ▼ - ┌─────────────────────────┐ ┌────────────────────────────┐ - │ engagement_daily │ │ sessions_by_*_daily │ - │ (daily KPI roll-up) │ │ (device / browser / lang) │ - └────────────┬────────────┘ └──────────────┬─────────────┘ - │ │ - ▼ ▼ - ┌────────────────────────────────────────────────────────┐ - │ Raw SQL queries / service layer │ - │ │ - │ - KPI cards │ - │ - trend charts │ - │ - distribution widgets │ - │ - arbitrary date ranges │ - │ │ - └───────────────────────────┬────────────────────────────┘ - │ - ▼ - ┌───────────────────────────────────────────────┐ - │ Angular Dashboard │ - └───────────────────────────────────────────────┘ - -===================================================================================================== */ - - -/* ===================================================================================================== - 1) SESSION STATES - ===================================================================================================== - - OBJECT TYPE - ----------------------------------------------------------------------------------------------------- - Table - - OBJECT NAME - ----------------------------------------------------------------------------------------------------- - analytics.session_states - - ENGINE - ----------------------------------------------------------------------------------------------------- - ReplicatedAggregatingMergeTree - - PURPOSE - ----------------------------------------------------------------------------------------------------- - This table stores mergeable per-session aggregate states derived from the raw events stream. - - Instead of repeatedly scanning analytics.events and running large GROUP BY queries every time we want - session-level metrics, we continuously maintain aggregate states per session. - - This is the scalable "intermediate session layer." 
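-
-   ILLUSTRATIVE CONTRAST (not part of the pipeline)
-   -----------------------------------------------------------------------------------------------------
-   Without this table, session metrics would have to rescan the raw stream on every read, with
-   something like:
-
-       SELECT customer_id, environment, site_id, session_id,
-              min(event_time) AS session_start,
-              max(event_time) AS session_end,
-              count()         AS total_events
-       FROM analytics.events
-       GROUP BY customer_id, environment, site_id, session_id;
-
-   The aggregate states below make that work incremental instead of a full scan per query.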
-
-   WHY IT EXISTS
-   -----------------------------------------------------------------------------------------------------
-   - keeps ingestion incremental and efficient
-   - absorbs late-arriving events automatically
-   - avoids rebuilding sessions from scratch from raw events over and over
-   - allows downstream session finalization to work on a much smaller table than analytics.events
-
-   GRAIN
-   -----------------------------------------------------------------------------------------------------
-   One logical session is identified by:
-
-       (customer_id, environment, site_id, session_id)
-
-   WHY AGGREGATE FUNCTION COLUMNS?
-   -----------------------------------------------------------------------------------------------------
-   Because AggregatingMergeTree expects mergeable states:
-   - minState(...) / minMerge(...)
-   - maxState(...) / maxMerge(...)
-   - countState() / countMerge(...)
-   - argMaxState(...) / argMaxMerge(...)
-
-   This allows partial rows written from many insert batches to merge correctly into one logical session.
-
-===================================================================================================== */
-CREATE TABLE IF NOT EXISTS analytics.session_states
-(
-    /* Tenant scope: required to isolate customers and environments cleanly */
-    customer_id LowCardinality(String), -- dotCMS customer / tenant identifier
-    environment LowCardinality(String), -- deployment environment (prod/stage/etc.)
-    site_id     String,
-    /* Session boundary */
-    session_id  String, -- unique session identifier. All events with the same session_id belong together
-
-    /* Session time window (mergeable aggregate states) */
-    min_ts_state AggregateFunction(min, DateTime64(3, 'UTC')), -- earliest event timestamp seen in session
-    max_ts_state AggregateFunction(max, DateTime64(3, 'UTC')), -- latest event timestamp seen in session
-
-    /* Event counters (mergeable) */
-    total_events_state AggregateFunction(count), -- total events in session
-    pageviews_state    AggregateFunction(countIf, UInt8), -- total number of pageview events in the session
-    conversions_state  AggregateFunction(countIf, UInt8), -- total number of conversion events in the session
-
-    /* Dimension "last known value" states (mergeable) */
-    -- last-seen device category label for the session (set by Java at ingestion time)
-    -- stored as state so that late events can update the final value deterministically.
-    device_category_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')),
-    -- last-seen browser family bucket (Chrome/Safari/Firefox/Edge/Other)
-    browser_family_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')),
-    -- last-seen dotCMS language ISO code. Declared Nullable(String) because session_states_mv feeds it
-    -- through nullIf()/argMaxStateIf(); the finalization step coalesces NULL to '' ('undefined').
-    locale_id_state AggregateFunction(argMax, Nullable(String), DateTime64(3, 'UTC'))
-)
-    /* Why this engine is mandatory:
-       -> You are storing aggregate states
-       -> You rely on merge correctness
-       -> Without replication, different replicas would compute different session states */
-    ENGINE = ReplicatedAggregatingMergeTree()
-    /* Partitioning note:
-       We partition by a hash of (customer_id, environment) to spread writes and merges.
-       This avoids a single giant partition for big tenants and keeps merges parallelizable. */
-    PARTITION BY sipHash64(customer_id, environment) % 64
-    /* Note for the sort key:
-       ORDER BY includes tenant + session to keep session states physically clustered for merges/finalization.
-       This also ensures stable grouping keys for session_facts refresh queries.
*/ - ORDER BY ( - customer_id, - environment, - site_id, - session_id); - - -/* ===================================================================================================== - 4) REAL-TIME MV: events → session_states - ===================================================================================================== - - OBJECT TYPE - ----------------------------------------------------------------------------------------------------- - Materialized View (incremental, insert-triggered) - - OBJECT NAME - ----------------------------------------------------------------------------------------------------- - analytics.session_states_mv - - SOURCE - ----------------------------------------------------------------------------------------------------- - analytics.events - - TARGET - ----------------------------------------------------------------------------------------------------- - analytics.session_states - - PURPOSE - ----------------------------------------------------------------------------------------------------- - Runs on every insert into analytics.events and converts the newly inserted batch into mergeable - session aggregate states. - - WHY THIS MV IS IMPORTANT - ----------------------------------------------------------------------------------------------------- - This is the object that keeps the whole pipeline scalable. - - Without it, every sessionization/finalization step would need to repeatedly scan raw events and group - them again. With this MV: - - inserts stay cheap - - aggregation work is incremental - - late events naturally merge into existing sessions - - DIMENSION STRATEGY - ----------------------------------------------------------------------------------------------------- - device_category and browser_family are set by Java at ingestion time and read directly from analytics.events. 
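-
-   INSPECTING STATES (illustrative)
-   -----------------------------------------------------------------------------------------------------
-   The states this MV writes are finalized with the corresponding -Merge combinators. For example,
-   you can inspect one session at any point in time (the session_id value below is hypothetical):
-
-       SELECT session_id,
-              minMerge(min_ts_state)             AS session_start,
-              maxMerge(max_ts_state)             AS session_end,
-              countMerge(total_events_state)     AS total_events,
-              argMaxMerge(device_category_state) AS device_category
-       FROM analytics.session_states
-       WHERE session_id = 'abc-123'
-       GROUP BY customer_id, environment, site_id, session_id;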
- -===================================================================================================== */ -CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.session_states_mv - TO analytics.session_states -AS -WITH - /* Normalize empty locale values so they can be ignored cleanly in argMaxStateIf */ - nullIf(locale_id, '') AS normalized_locale_id -SELECT - e.customer_id, - e.environment, - e.site_id, - e.session_id, - - /* Time boundaries for the session */ - minState(e.event_time) AS min_ts_state, - maxState(e.event_time) AS max_ts_state, - - /* Mergeable counters */ - countState() AS total_events_state, - countIfState(e.event_type = 'pageview') AS pageviews_state, - countIfState(e.event_type = 'conversion') AS conversions_state, - - /* Mergeable latest dimension states — values already set by Java at ingestion time */ - argMaxState(e.device_category, e.event_time) AS device_category_state, - argMaxState(e.browser_family, e.event_time) AS browser_family_state, - - /* Locale is tracked only from pageview events and only when present */ - argMaxStateIf( - normalized_locale_id, - e.event_time, - e.event_type = 'pageview' AND normalized_locale_id IS NOT NULL - ) AS locale_id_state -FROM analytics.events AS e -WHERE e.session_id != '' - AND e.customer_id != '' - AND e.environment != '' - AND e.site_id != '' - /* Defensive guard to avoid broken/null-ish timestamps participating in session logic */ - AND e.event_time > toDateTime64(0, 3, 'UTC') -GROUP BY ( - e.customer_id, - e.environment, - e.site_id, - e.session_id); - - -/* ===================================================================================================== - 5) SESSION FACTS (FULL HISTORICAL VERSIONED TABLE) - ===================================================================================================== - - OBJECT TYPE - ----------------------------------------------------------------------------------------------------- - Table - - OBJECT NAME - ----------------------------------------------------------------------------------------------------- - analytics.session_facts - - ENGINE - ----------------------------------------------------------------------------------------------------- - ReplicatedReplacingMergeTree(updated_at) - - PURPOSE - ----------------------------------------------------------------------------------------------------- - Stores the full historical session table. - - This is NOT just a hot window table. - This table is meant to retain all sessions across all history. - - WHY ReplacingMergeTree(updated_at)? - ----------------------------------------------------------------------------------------------------- - Because recent sessions may be recalculated when late events arrive. - - Example: - - session originally finalized at 10:05 - - delayed event arrives at 10:20 - - next RMV refresh recalculates that session and appends a newer row - - ReplacingMergeTree allows the newer version to win logically by updated_at. - - IMPORTANT - ----------------------------------------------------------------------------------------------------- - Because session_facts_rmv uses APPEND TO, multiple physical versions of the same session may coexist - temporarily in this table until background merges occur. - - That is exactly why we introduce analytics.session_facts_latest later: - it provides a deduplicated "latest truth" layer for roll-ups and direct querying. 
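-
-   ILLUSTRATIVE CHECK
-   -----------------------------------------------------------------------------------------------------
-   While versions coexist, a query like the following (session_id value hypothetical) can return
-   more than one row for the same logical session:
-
-       SELECT session_id, updated_at, pageviews, conversions
-       FROM analytics.session_facts
-       WHERE session_id = 'abc-123'
-       ORDER BY updated_at DESC;
-
-   Reading with FINAL (or from analytics.session_facts_latest) collapses them to the newest version.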
- - RECOMMENDED USAGE - ----------------------------------------------------------------------------------------------------- - - Keep this table as your durable historical versioned store - - Do NOT use it directly for roll-ups if you need exactly one row per session - - Use analytics.session_facts_latest for that - -===================================================================================================== */ -CREATE TABLE IF NOT EXISTS analytics.session_facts -( - /* Tenant scope */ - customer_id LowCardinality(String), - environment LowCardinality(String), - site_id String, - /* Session identity */ - session_id String, - - /* Finalized session times */ - session_start DateTime64(3, 'UTC'), -- earliest event timestamp - session_end DateTime64(3, 'UTC'), -- latest event timestamp - duration_seconds UInt32, -- session_end - session_start (seconds) - - /* Finalized counters */ - total_events UInt32, -- total events in session - pageviews UInt32, -- total pageview events - conversions UInt32, -- total conversion events - - /* Engagement flag (GA4-style) */ - engaged UInt8, -- 1 if engaged, else 0 - - /* Finalized dimensions */ - device_category LowCardinality(String), -- Desktop/Mobile/Tablet/Other - browser_family LowCardinality(String), -- Chrome/Safari/Firefox/Edge/Other - locale_id LowCardinality(String), -- dotCMS language Locale ID ('' means undefined) - - /* Version column. Newer recalculations must have a greater timestamp. */ - updated_at DateTime64(3, 'UTC') -) - ENGINE = ReplicatedReplacingMergeTree(updated_at) - /* Partition by month of session_start. Keeps partitions time-bounded and supports TTL strategies - later if desired. */ - PARTITION BY toYYYYMM(toDate(session_start)) - /* Sort key includes session identity for deterministic replacement. */ - ORDER BY ( - customer_id, - environment, - site_id, - session_id); - - -/* ===================================================================================================== - 6) REFRESHABLE MV: session_states → session_facts - ===================================================================================================== - - OBJECT TYPE - ----------------------------------------------------------------------------------------------------- - Refreshable Materialized View (RMV) - - OBJECT NAME - ----------------------------------------------------------------------------------------------------- - analytics.session_facts_rmv - - SOURCE - ----------------------------------------------------------------------------------------------------- - analytics.session_states - - TARGET - ----------------------------------------------------------------------------------------------------- - analytics.session_facts - - WRITE MODE - ----------------------------------------------------------------------------------------------------- - APPEND TO - - PURPOSE - ----------------------------------------------------------------------------------------------------- - Re-finalizes only RECENT sessions and appends a new version into analytics.session_facts. - - This is the core late-event correction mechanism. - - SLIDING WINDOW - ----------------------------------------------------------------------------------------------------- - start_cutoff = now() - 1 day - - This means: - - only sessions whose latest activity is recent are recalculated - - older sessions remain in analytics.session_facts untouched - - this is NOT a retention window - - WHY 1 DAY? 
- ----------------------------------------------------------------------------------------------------- - Good conservative default for local testing: - - covers same-day late arrivals comfortably - - easy to reason about - - can later be reduced to 12h or 6h if actual ingestion lag is small - - ENGAGEMENT LOGIC - ----------------------------------------------------------------------------------------------------- - A session is engaged if ANY of these is true: - - duration > 10 seconds - - pageviews >= 2 - - conversions >= 1 - - DIMENSION FINALIZATION - ----------------------------------------------------------------------------------------------------- - device_category and browser_family are read directly from session_states — values were set by Java - at ingestion time and propagated via session_states_mv. - -===================================================================================================== */ -CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.session_facts_rmv --- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! For DEV, use at least REFRESH EVERY 15 MINUTE -REFRESH EVERY 30 SECOND APPEND TO analytics.session_facts -AS -WITH - /* Sliding recomputation window for late-event correction */ - (now64(3, 'UTC') - INTERVAL 1 DAY) AS start_cutoff -SELECT - customer_id, - environment, - site_id, - session_id, - - session_start, - session_end, - duration_seconds, - - total_events, - pageviews, - conversions, - - engaged, - - device_category, - browser_family, - - locale_id, - - /* Version timestamp for ReplacingMergeTree */ - now64(3, 'UTC') AS updated_at -FROM - ( - /* Aggregate session_states into finalized scalar columns */ - SELECT - customer_id, - environment, - site_id, - session_id, - - /* Finalized time boundaries */ - minMerge(min_ts_state) AS session_start, - maxMerge(max_ts_state) AS session_end, - - /* Derived duration */ - toUInt32(greatest(0, dateDiff('second', session_start, session_end))) AS duration_seconds, - - /* Finalized counters */ - toUInt32(countMerge(total_events_state)) AS total_events, - toUInt32(countIfMerge(pageviews_state)) AS pageviews, - toUInt32(countIfMerge(conversions_state)) AS conversions, - - /* Business rules that determine whether a session is flagged as 'engaged' or not */ - toUInt8( - -- 1. Sessions that last more than 10 seconds - (dateDiff('second', session_start, session_end) > 10) - -- 2. Or, sessions that trigger at least 2 events of type 'pageview' - OR (countIfMerge(pageviews_state) >= 2) - -- 3. 
Or, sessions that trigger at least 1 event of type 'conversion' - OR (countIfMerge(conversions_state) >= 1) - ) AS engaged, - - /* Dimension values set by Java at ingestion time, propagated via session_states_mv */ - argMaxMerge(device_category_state) AS device_category, - argMaxMerge(browser_family_state) AS browser_family, - - /* Locale defaults to empty string when unknown */ - coalesce(argMaxMerge(locale_id_state), '') AS locale_id - FROM analytics.session_states - GROUP BY ( - customer_id, - environment, - site_id, - session_id) - /* Only recent sessions are recalculated */ - HAVING session_end >= start_cutoff - ) finalized_sessions; - - -/* ===================================================================================================== - 7) SESSION FACTS LATEST (DEDUPLICATED INTERMEDIATE TABLE) - ===================================================================================================== - - OBJECT TYPE - ----------------------------------------------------------------------------------------------------- - Table - - OBJECT NAME - ----------------------------------------------------------------------------------------------------- - analytics.session_facts_latest - - ENGINE - ----------------------------------------------------------------------------------------------------- - ReplicatedReplacingMergeTree(updated_at) - - PURPOSE - ----------------------------------------------------------------------------------------------------- - Stores exactly one latest effective row per session key. - - WHY THIS TABLE EXISTS - ----------------------------------------------------------------------------------------------------- - analytics.session_facts is a versioned historical store. Because it receives APPENDed updates for - recent sessions, the same logical session may temporarily exist in multiple versions. - - We could read analytics.session_facts FINAL everywhere, but FINAL is heavier and we do not want every - downstream roll-up to pay that cost repeatedly. 
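-
-   (For reference, the read-time alternative this design avoids repeating everywhere would look
-   like the sketch below; the site_id value is hypothetical:
-
-       SELECT session_id, pageviews, conversions
-       FROM analytics.session_facts FINAL
-       WHERE site_id = 'demo-site';
-
-   FINAL deduplicates at query time on every read, which is the cost we choose to pay once instead.)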
- - So instead: - - we deduplicate once into session_facts_latest - - roll-ups and direct SQL queries can use this table - - downstream SQL stays simpler and more efficient - - RECOMMENDED USAGE - ----------------------------------------------------------------------------------------------------- - This is the preferred source when you want: - - one row per session - - latest metrics only - - session-level raw SQL queries - - roll-up generation - -===================================================================================================== */ -CREATE TABLE IF NOT EXISTS analytics.session_facts_latest -( - customer_id LowCardinality(String), - environment LowCardinality(String), - site_id String, - session_id String, - - session_start DateTime64(3, 'UTC'), - session_end DateTime64(3, 'UTC'), - duration_seconds UInt32, - - total_events UInt32, - pageviews UInt32, - conversions UInt32, - - engaged UInt8, - - device_category LowCardinality(String), - browser_family LowCardinality(String), - locale_id LowCardinality(String), - - updated_at DateTime64(3, 'UTC') -) - ENGINE = ReplicatedReplacingMergeTree(updated_at) - PARTITION BY toYYYYMM(toDate(session_start)) - ORDER BY ( - customer_id, - environment, - site_id, - session_id - ); - - -/* ===================================================================================================== - 8) REFRESHABLE MV: session_facts → session_facts_latest - ===================================================================================================== - - OBJECT TYPE - ----------------------------------------------------------------------------------------------------- - Refreshable Materialized View (RMV) - - OBJECT NAME - ----------------------------------------------------------------------------------------------------- - analytics.session_facts_latest_rmv - - SOURCE - ----------------------------------------------------------------------------------------------------- - analytics.session_facts - - TARGET - ----------------------------------------------------------------------------------------------------- - analytics.session_facts_latest - - PURPOSE - ----------------------------------------------------------------------------------------------------- - Centralizes deduplication of the historical versioned session table into one latest row per session key. - - WHY THIS IS BETTER THAN USING FINAL EVERYWHERE - ----------------------------------------------------------------------------------------------------- - Instead of every downstream roll-up doing its own deduplication or reading session_facts FINAL, this - RMV does the work once and stores the result in a clean intermediate table. - - DEDUPLICATION RULE - ----------------------------------------------------------------------------------------------------- - For each session key: - - take the value associated with the greatest updated_at - - that is done via argMax(..., updated_at) - - store max(updated_at) as the effective row version - -===================================================================================================== */ -CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.session_facts_latest_rmv --- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! 
For DEV, use at least REFRESH EVERY 15 MINUTE -REFRESH EVERY 30 SECOND DEPENDS ON analytics.session_facts_rmv -TO analytics.session_facts_latest -AS -SELECT - sf.customer_id, - sf.environment, - sf.site_id, - sf.session_id, - - argMax(sf.session_start, sf.updated_at) AS session_start, - argMax(sf.session_end, sf.updated_at) AS session_end, - argMax(sf.duration_seconds, sf.updated_at) AS duration_seconds, - - argMax(sf.total_events, sf.updated_at) AS total_events, - argMax(sf.pageviews, sf.updated_at) AS pageviews, - argMax(sf.conversions, sf.updated_at) AS conversions, - - argMax(sf.engaged, sf.updated_at) AS engaged, - argMax(sf.device_category, sf.updated_at) AS device_category, - argMax(sf.browser_family, sf.updated_at) AS browser_family, - argMax(sf.locale_id, sf.updated_at) AS locale_id, - - max(sf.updated_at) AS updated_at -FROM analytics.session_facts AS sf -GROUP BY ( - sf.customer_id, - sf.environment, - sf.site_id, - sf.session_id); - - -/* ===================================================================================================== - 9) DAILY KPI ROLL-UP - ===================================================================================================== - - OBJECT TYPE - ----------------------------------------------------------------------------------------------------- - Table - - OBJECT NAME - ----------------------------------------------------------------------------------------------------- - analytics.engagement_daily - - PURPOSE - ----------------------------------------------------------------------------------------------------- - Stores dashboard-ready daily KPI numerators and denominators. - - WHY STORE DAILY SUMS INSTEAD OF DAILY RATES? - ----------------------------------------------------------------------------------------------------- - Because arbitrary date ranges must be computed correctly as: - - sum(numerator) / sum(denominator) - - not as: - average(daily_rate) - - This table therefore stores the raw daily ingredients needed to compute: - - engagement rate - - conversion rate - - average interactions - - average session duration - - GRAIN - ----------------------------------------------------------------------------------------------------- - (customer_id, environment, site_id, day) - -===================================================================================================== */ -CREATE TABLE IF NOT EXISTS analytics.engagement_daily -( - customer_id LowCardinality(String), - environment LowCardinality(String), - site_id String, - day Date, - - total_sessions UInt64, -- count of all sessions - engaged_sessions UInt64, -- count of engaged sessions - engaged_conversion_sessions UInt64, -- engaged sessions that include >=1 conversion - - total_events_all UInt64, -- sum(total_events) across all sessions - total_duration_all UInt64, -- sum(duration_seconds) across all sessions - - total_events_engaged UInt64, -- sum(total_events) across engaged sessions only - total_duration_engaged UInt64, -- sum(duration_seconds) across engaged sessions only - - updated_at DateTime64(3, 'UTC') -) - ENGINE = ReplicatedReplacingMergeTree(updated_at) - PARTITION BY toYYYYMM(day) - ORDER BY (customer_id, environment, site_id, day); - - -/* ===================================================================================================== - 10) REFRESHABLE MV: session_facts_latest → engagement_daily - ===================================================================================================== - - OBJECT TYPE - 
----------------------------------------------------------------------------------------------------- - Refreshable Materialized View (RMV) - - OBJECT NAME - ----------------------------------------------------------------------------------------------------- - analytics.engagement_daily_rmv - - SOURCE - ----------------------------------------------------------------------------------------------------- - analytics.session_facts_latest - - TARGET - ----------------------------------------------------------------------------------------------------- - analytics.engagement_daily - - PURPOSE - ----------------------------------------------------------------------------------------------------- - Rebuilds the daily KPI roll-up table from the latest one-row-per-session layer. - - WHY THIS SOURCE? - ----------------------------------------------------------------------------------------------------- - Because session_facts_latest already contains one latest row per session, this roll-up can aggregate - without FINAL and without inline dedup logic. - -===================================================================================================== */ -CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.engagement_daily_rmv --- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! For DEV, use at least REFRESH EVERY 15 MINUTE -REFRESH EVERY 30 SECOND DEPENDS ON analytics.session_facts_latest_rmv - TO analytics.engagement_daily -AS -SELECT - customer_id, - environment, - site_id, - toDate(session_start, 'UTC') AS day, - - count() AS total_sessions, - countIf(engaged = 1) AS engaged_sessions, - countIf(engaged = 1 AND conversions >= 1) AS engaged_conversion_sessions, - - sum(total_events) AS total_events_all, - sum(duration_seconds) AS total_duration_all, - - sumIf(total_events, engaged = 1) AS total_events_engaged, - sumIf(duration_seconds, engaged = 1) AS total_duration_engaged, - - now64(3, 'UTC') AS updated_at -FROM analytics.session_facts_latest -GROUP BY ( - customer_id, - environment, - site_id, - day); - - -/* ===================================================================================================== - 11) DEVICE BREAKDOWN ROLL-UP - ===================================================================================================== - - OBJECT TYPE - ----------------------------------------------------------------------------------------------------- - Table - - OBJECT NAME - ----------------------------------------------------------------------------------------------------- - analytics.sessions_by_device_daily - - PURPOSE - ----------------------------------------------------------------------------------------------------- - Daily distribution table by device category. 
- - Typical dashboard uses: - - total sessions by device - - engaged sessions by device - - average engaged duration by device - -===================================================================================================== */ -CREATE TABLE IF NOT EXISTS analytics.sessions_by_device_daily -( - customer_id LowCardinality(String), - environment LowCardinality(String), - site_id String, - day Date, - - device_category LowCardinality(String), -- Desktop/Mobile/Tablet/Other - - total_sessions UInt64, -- ALL sessions for this device_category - engaged_sessions UInt64, -- Engaged sessions for this device_category - total_duration_engaged_seconds UInt64, -- Sum(duration_seconds) for engaged sessions only - - updated_at DateTime64(3, 'UTC') -) - ENGINE = ReplicatedReplacingMergeTree(updated_at) - PARTITION BY toYYYYMM(day) - ORDER BY (customer_id, environment, site_id, day, device_category); - - -/* ===================================================================================================== - 12) REFRESHABLE MV: session_facts_latest → sessions_by_device_daily - ===================================================================================================== */ -CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.sessions_by_device_daily_rmv --- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! For DEV, use at least REFRESH EVERY 15 MINUTE -REFRESH EVERY 30 SECOND DEPENDS ON analytics.session_facts_latest_rmv - TO analytics.sessions_by_device_daily -AS -SELECT - customer_id, - environment, - site_id, - toDate(session_start, 'UTC') AS day, - device_category, - - count() AS total_sessions, - countIf(engaged = 1) AS engaged_sessions, - sumIf(duration_seconds, engaged = 1) AS total_duration_engaged_seconds, - - now64(3, 'UTC') AS updated_at -FROM analytics.session_facts_latest -GROUP BY ( - customer_id, - environment, - site_id, - day, - device_category); - - -/* ===================================================================================================== - 13) BROWSER BREAKDOWN ROLL-UP - ===================================================================================================== - - OBJECT TYPE - ----------------------------------------------------------------------------------------------------- - Table - - OBJECT NAME - ----------------------------------------------------------------------------------------------------- - analytics.sessions_by_browser_daily - - PURPOSE - ----------------------------------------------------------------------------------------------------- - Daily distribution table by browser family. 
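-
-   EXAMPLE QUERY (illustrative; the date range is hypothetical)
-   -----------------------------------------------------------------------------------------------------
-   Rates over arbitrary ranges follow the same sum(numerator) / sum(denominator) rule described for
-   engagement_daily:
-
-       SELECT browser_family,
-              sum(total_sessions)                         AS sessions,
-              sum(engaged_sessions) / sum(total_sessions) AS engagement_rate
-       FROM analytics.sessions_by_browser_daily
-       WHERE day BETWEEN '2026-01-01' AND '2026-01-31'
-       GROUP BY browser_family
-       ORDER BY sessions DESC;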
- -===================================================================================================== */ -CREATE TABLE IF NOT EXISTS analytics.sessions_by_browser_daily -( - customer_id LowCardinality(String), - environment LowCardinality(String), - site_id String, - day Date, - - browser_family LowCardinality(String), -- Chrome/Safari/Firefox/Edge/Other - - total_sessions UInt64, - engaged_sessions UInt64, - total_duration_engaged_seconds UInt64, - - updated_at DateTime64(3, 'UTC') -) - ENGINE = ReplicatedReplacingMergeTree(updated_at) - PARTITION BY toYYYYMM(day) - ORDER BY (customer_id, environment, site_id, day, browser_family); - - -/* ===================================================================================================== - 14) REFRESHABLE MV: session_facts_latest → sessions_by_browser_daily - ===================================================================================================== */ -CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.sessions_by_browser_daily_rmv --- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! For DEV, use at least REFRESH EVERY 15 MINUTE -REFRESH EVERY 30 SECOND DEPENDS ON analytics.session_facts_latest_rmv - TO analytics.sessions_by_browser_daily -AS -SELECT - customer_id, - environment, - site_id, - toDate(session_start, 'UTC') AS day, - browser_family, - - count() AS total_sessions, - countIf(engaged = 1) AS engaged_sessions, - sumIf(duration_seconds, engaged = 1) AS total_duration_engaged_seconds, - - now64(3, 'UTC') AS updated_at -FROM analytics.session_facts_latest -GROUP BY ( - customer_id, - environment, - site_id, - day, - browser_family); - - -/* ===================================================================================================== - 15) LANGUAGE BREAKDOWN ROLL-UP - ===================================================================================================== - - OBJECT TYPE - ----------------------------------------------------------------------------------------------------- - Table - - OBJECT NAME - ----------------------------------------------------------------------------------------------------- - analytics.sessions_by_language_daily - - PURPOSE - ----------------------------------------------------------------------------------------------------- - Daily distribution table by locale_id. - - NOTE - ----------------------------------------------------------------------------------------------------- - locale_id remains the raw dotCMS locale/language identifier. - If you later want user-friendly names, that translation can happen in SQL joins or in the service layer. 
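-
-   For example, a query-time mapping could use transform(); the id → name pairs below are
-   hypothetical and not shipped anywhere in this stack:
-
-       SELECT transform(locale_id,
-                        ['en-us', 'es-es', ''],
-                        ['English (US)', 'Spanish (Spain)', 'Undefined'],
-                        locale_id)      AS language,
-              sum(total_sessions)       AS sessions
-       FROM analytics.sessions_by_language_daily
-       GROUP BY language;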
- -===================================================================================================== */ -CREATE TABLE IF NOT EXISTS analytics.sessions_by_language_daily -( - customer_id LowCardinality(String), - environment LowCardinality(String), - site_id String, - day Date, - - locale_id LowCardinality(String), -- dotCMS language Locale ID ('' means undefined) - - total_sessions UInt64, - engaged_sessions UInt64, - total_duration_engaged_seconds UInt64, - - updated_at DateTime64(3, 'UTC') -) - ENGINE = ReplicatedReplacingMergeTree(updated_at) - PARTITION BY toYYYYMM(day) - ORDER BY (customer_id, environment, site_id, day, locale_id); - - -/* ===================================================================================================== - 16) REFRESHABLE MV: session_facts_latest → sessions_by_language_daily - ===================================================================================================== */ -CREATE MATERIALIZED VIEW IF NOT EXISTS analytics.sessions_by_language_daily_rmv --- Refreshing every 30 seconds FOR LOCAL DEVELOPMENT ONLY! For DEV, use at least REFRESH EVERY 15 MINUTE -REFRESH EVERY 30 SECOND DEPENDS ON analytics.session_facts_latest_rmv - TO analytics.sessions_by_language_daily -AS -SELECT - customer_id, - environment, - site_id, - toDate(session_start, 'UTC') AS day, - locale_id, - - count() AS total_sessions, - countIf(engaged = 1) AS engaged_sessions, - sumIf(duration_seconds, engaged = 1) AS total_duration_engaged_seconds, - - now64(3, 'UTC') AS updated_at -FROM analytics.session_facts_latest -GROUP BY ( - customer_id, - environment, - site_id, - day, - locale_id); - - -/* ===================================================================================================== - QUERYING GUIDANCE - ===================================================================================================== - - RECOMMENDED TABLES FOR RAW SQL - ----------------------------------------------------------------------------------------------------- - - 1) Query analytics.session_facts_latest when you need: - - one row per session - - latest session metrics only - - session-level exploration/debugging - - session KPI calculations on the fly - - 2) Query analytics.engagement_daily when you need: - - KPI cards - - trends over time - - engagement/conversion/avg-interaction metrics over arbitrary date ranges - - 3) Query analytics.sessions_by_device_daily / browser / language when you need: - - dashboard distribution widgets - - grouped daily breakdowns - - top-N device/browser/language reports - - 4) Query analytics.session_facts only when you specifically need: - - historical row versions - - debugging of late-event recalculations - - low-level understanding of how session versions changed over time - - EXAMPLE MENTAL MODEL - ----------------------------------------------------------------------------------------------------- - - analytics.session_facts = durable version history - analytics.session_facts_latest = current one-row-per-session truth - analytics.engagement_daily = daily KPI roll-up - analytics.sessions_by_*_daily = daily grouped dashboard roll-ups - -===================================================================================================== */ \ No newline at end of file diff --git a/docker/docker-compose-examples/analytics/init/50-users.sql b/docker/docker-compose-examples/analytics/init/50-users.sql deleted file mode 100644 index 97c4deb5f2a4..000000000000 --- a/docker/docker-compose-examples/analytics/init/50-users.sql +++ /dev/null @@ 
-1,11 +0,0 @@
--- 1. Create the user
-CREATE USER 'cust-001' IDENTIFIED BY 'abc' DEFAULT DATABASE analytics;
--- 2. Grant necessary privileges
-GRANT SELECT ON analytics.* TO 'cust-001';
--- 3. Create the row policy to filter by customer_id
-CREATE ROW POLICY 'cust-001-policy' ON analytics.* USING customer_id = 'cust-001' TO 'cust-001';
-
--- 4. Allow connections from any host
-ALTER USER 'cust-001' HOST ANY;
--- 5. Grant write permissions
-GRANT INSERT ON analytics.events TO 'cust-001';
\ No newline at end of file
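-
--- 6. (Illustrative addendum) With the policy in place, a connection authenticated as
---    'cust-001' only ever sees its own rows; for example:
---    SELECT count() FROM analytics.events;              -- counts only customer_id = 'cust-001' rows
---    SELECT DISTINCT customer_id FROM analytics.events; -- returns at most 'cust-001'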