-
Notifications
You must be signed in to change notification settings - Fork 79
/
application.yaml
162 lines (145 loc) · 8.42 KB
/
application.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#
# Copyright 2020-2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
fhirdata:
  # The base URL of the source FHIR server. If `dbConfig` is not set, resources
  # are fetched from this URL through the FHIR Search API.
  # Equivalent to the pipeline `fhirServerUrl` parameter.
  fhirServerUrl: "http://172.17.0.1:8091/fhir"
  #fhirServerUrl: "http://localhost:8099/openmrs/ws/fhir2/R4"
  #fhirServerUrl: "http://localhost:9002/fhir"
  # The following user-name/password should be set if the FHIR server supports
  # Basic Auth.
  #fhirServerUserName: "admin"
  #fhirServerPassword: "Admin123"
  # The following client credentials should be set if the FHIR server accepts
  # OAuth access tokens. Note the client credentials, e.g., the secret, are
  # sensitive, and it is probably a better practice to set these through
  # command-line arguments.
  #fhirServerOAuthTokenEndpoint: "http://localhost:9080/auth/realms/test/protocol/openid-connect/token"
  #fhirServerOAuthClientId: "THE_CLIENT_ID"
  #fhirServerOAuthClientSecret: "THE_CLIENT_SECRET"
  # The path to the file containing JDBC settings for connecting to a HAPI FHIR
  # server database. If this is set, resources are fetched directly from the
  # database and `fhirServerUrl` is ignored.
  # Equivalent to the pipeline `fhirDatabaseConfigPath` parameter.
  # dbConfig: "config/hapi-postgres-config.json"
  # The path to output Parquet files to. The last portion of the
  # path is used as a prefix for naming the directory that contains
  # per-resource directories, and a timestamp will be added;
  # for example, "config/controller_DWH_ORIG" will create something like
  # ./config/controller_DWH_ORIG_TIMESTAMP_2023-01-27T23-55-39.295824Z
  # Similar to the pipeline `outputParquetPath` parameter.
  #
  # For GCS buckets, dwhRootPrefix must be of the format
  # "gs://<bucket>/<baseDirPath>/<prefix>". <baseDirPath> is optional
  # for GCS buckets and may contain 0 or more directory names.
  #
  # For *nix file systems, dwhRootPrefix must be of the format
  # "/<baseDirPath>/<prefix>" and can be absolute or relative. <baseDirPath>
  # is required for *nix, and must contain 1 or more directory names.
  # Sample 1: "/fhir-test-analytics/dwh/controller_DWH_ORIG"
  # Sample 2: "dwh/controller_DWH_ORIG"
  #
  # For Windows file systems, dwhRootPrefix must be of the format
  # '<baseDirPath>\<prefix>' and can be absolute or relative. If absolute,
  # <baseDirPath> is optional; if relative, <baseDirPath> is mandatory and must
  # contain 1 or more directory names. Use single quotes or no quotes for the
  # value so that the backslash character is treated as a regular character and
  # not as an escape character.
  # Sample 1: 'C:\controller_DWH_ORIG'
  # Sample 2: 'C:\fhir-test-analytics\dwh\controller_DWH_ORIG'
  # Sample 3: dwh\controller_DWH_ORIG
  #
  # Note for developers: You can make a symlink of `[repo_root]/docker/dwh` here
  # such that the Thrift Server of `compose-controller-spark-sql-single.yaml`
  # config can easily be used in dev env. too. You may need to set the ACL of
  # that directory too, such that files created by the pipelines are readable by
  # the Thrift Server, e.g., `setfacl -d -m o::rx dwh/`.
  dwhRootPrefix: "dwh/controller_DEV_DWH"
  # The schedule for automatic incremental pipeline runs.
  # Uses the Spring CronExpression format, i.e.,
  # "second minute hour day-of-the-month month day-of-the-week", so:
  # "0 0 * * * *" means top of every hour;
  # "*/40 * * * * *" means every 40 seconds.
  # Scheduling very frequent runs is resource intensive.
  incrementalSchedule: "0 0 * * * *"
  # The schedule for automatic DWH snapshot purging. There is no benefit
  # to scheduling the purge job more frequently than incremental runs.
  # Uses the Spring CronExpression format.
  purgeSchedule: "0 30 * * * *"
  # The number of DWH snapshots to retain when the purge job runs.
  # This must be > 0 or the purge job will not run. If a pipeline run fails
  # for any reason, any partial output must be manually removed.
  numOfDwhSnapshotsToRetain: 2
  # The comma-separated list of FHIR resources to fetch/monitor.
  # Equivalent to the pipeline `resourceList` parameter.
  # Note there is no Questionnaire in our test FHIR server, but it is okay; see
  # https://github.com/google/fhir-data-pipes/issues/785.
  resourceList: "Patient,Encounter,Observation,Questionnaire,Condition,Practitioner,Location,Organization"
  # The parallelism to be used for a pipeline job. In the case of FlinkRunner,
  # if the value is set to -1, then in local execution mode the number of
  # threads the job uses equals the number of cores in the machine, whereas in
  # remote (cluster) mode only 1 thread is used. If set to a positive value,
  # then in both modes the pipeline uses this many threads combined across all
  # the workers.
  numThreads: -1
  # In the Flink local execution mode (which is the default currently),
  # generate the Flink configuration file `flink-conf.yaml` automatically,
  # based on the parallelism set via the `numThreads` parameter and the cores
  # available on the machine. The generated file sets the parameters to
  # optimised values necessary to run the pipelines without failures. Disable
  # this parameter to pass the configuration file manually, by pointing the
  # FLINK_CONF_DIR environment variable at the directory where flink-conf.yaml
  # is placed.
  #
  # Note that for the Flink non-local execution mode, this parameter has to be
  # disabled, and the configuration file — which has more fine-grained control
  # parameters — has to be passed manually.
  autoGenerateFlinkConfiguration: true
  # Whether resource tables should be automatically created on a
  # Hive/Spark server. Primarily meant for single-machine deployment.
  createHiveResourceTables: false
  # Path to a file with the settings used to create tables.
  # Required if createHiveResourceTables is `true`.
  thriftserverHiveConfig: "config/thriftserver-hive-config.json"
  # Path to a directory containing view definitions for each resource type.
  # If not set or set to an empty string, automatic view creation is disabled.
  # Otherwise, for each resource type, its view definition SQL queries are read
  # and applied from corresponding files, i.e., any file that starts with the
  # resource name and ends in `.sql`, e.g., `DiagnosticReport_flat.sql`.
  # Only applies when createHiveResourceTables is `true`.
  #
  # Note for developers: If you symlink `[repo_root]/docker/config/views` here
  # you can use those predefined views in your dev. env. too.
  hiveResourceViewsDir: "config/views"
  # This is the size of the Parquet Row Group (a logical horizontal
  # partitioning into rows) that the pipelines use when creating row groups in
  # Parquet files. A large value means more data for one column can fit into
  # one big column chunk, which speeds up reading of that column's data. On the
  # downside, more memory is needed to hold the data before writing to files.
  rowGroupSizeForParquetFiles: 33554432 # 32mb
  # The location from which ViewDefinition resources are read and applied to
  # the corresponding input FHIR resources. Any file in this directory that
  # ends in `.json` is assumed to be a single ViewDefinition. To output these
  # views to a relational database, the next sinkDbConfigPath should also be
  # set.
  # NOTE(review): this intentionally shares a directory with
  # hiveResourceViewsDir above — `.sql` files serve Hive views, `.json` files
  # serve ViewDefinitions; confirm before changing either path.
  viewDefinitionsDir: "config/views"
  # The configuration file for the sink database. If `viewDefinitionsDir` is
  # set then the generated views are materialized and written to this DB. If
  # not, then the raw FHIR JSON resources are written to this DB. Note enabling
  # this feature can have a noticeable impact on pipelines performance. The
  # default empty string disables this feature.
  sinkDbConfigPath: "config/hapi-postgres-config_local_views.json"
# Enable Spring Boot actuator endpoints; use "*" to expose all endpoints, or a
# comma-separated list to expose selected ones.
management:
  endpoints:
    web:
      exposure:
        include: health,info,metrics,prometheus,pipeline-metrics