feat(ingest): Starburst Trino usage (#3558)
treff7es committed Nov 18, 2021
1 parent 658fa81 commit a36fefa
Showing 8 changed files with 475 additions and 4 deletions.
1 change: 1 addition & 0 deletions metadata-ingestion/README.md
@@ -65,6 +65,7 @@ Sources:
| [sqlalchemy](./source_docs/sqlalchemy.md) | `pip install 'acryl-datahub[sqlalchemy]'` | Generic SQLAlchemy source |
| [superset](./source_docs/superset.md) | `pip install 'acryl-datahub[superset]'` | Superset source |
| [trino](./source_docs/trino.md) | `pip install 'acryl-datahub[trino]'` | Trino source |
| [starburst-trino-usage](./source_docs/trino.md) | `pip install 'acryl-datahub[starburst-trino-usage]'` | Starburst Trino usage statistics source |

Sinks

12 changes: 11 additions & 1 deletion metadata-ingestion/setup.py
@@ -135,6 +135,14 @@ def get_long_description():
        # PR is from same author as that of sqlalchemy-trino library below.
        "sqlalchemy-trino"
    },
    "starburst-trino-usage": sql_common
    | {
        # SQLAlchemy support is coming up in trino python client
        # subject to PR merging - https://github.com/trinodb/trino-python-client/pull/81.
        # PR is from same author as that of sqlalchemy-trino library below.
        "sqlalchemy-trino"
    },

}

all_exclude_plugins: Set[str] = {
@@ -209,7 +217,7 @@ def get_long_description():
# The trino plugin only works on Python 3.7 or newer.
# The trino plugin can be supported on Python 3.6 with minimal changes to opensource sqlalchemy-trino sourcecode.
base_dev_requirements = base_dev_requirements.union(
    {dependency for plugin in ["lookml", "trino"] for dependency in plugins[plugin]}
    {dependency for plugin in ["lookml", "trino", "starburst-trino-usage"] for dependency in plugins[plugin]}
)

dev_requirements = {
@@ -281,6 +289,8 @@ def get_long_description():
"superset = datahub.ingestion.source.superset:SupersetSource",
"openapi = datahub.ingestion.source.openapi:OpenApiSource",
"trino = datahub.ingestion.source.sql.trino:TrinoSource",
"starburst-trino-usage = datahub.ingestion.source.usage.starburst_trino_usage:TrinoUsageSource",

],
"datahub.ingestion.sink.plugins": [
"file = datahub.ingestion.sink.file:FileSink",
55 changes: 55 additions & 0 deletions metadata-ingestion/source_docs/trino.md
@@ -69,6 +69,61 @@ As a SQL-based service, the Trino integration is also supported by our SQL profiler.

Coming soon!

## Trino Usage Stats

For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md).

### Starburst Trino Usage Stats
If you are using Starburst Trino, you can collect usage stats in the following way.
#### Prerequisites
1. Set up the Starburst event logger, which saves audit logs into a Postgres database, and register that database as a catalog in Trino (a quick connectivity check is sketched after this list). More details on the setup can be found here:
   - https://docs.starburst.io/354-e/security/event-logger.html#security-event-logger--page-root
   - https://docs.starburst.io/354-e/security/event-logger.html#analyzing-the-event-log

2. Install the starburst-trino-usage plugin: run `pip install 'acryl-datahub[starburst-trino-usage]'`.
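
Before wiring up the ingestion job, it can help to confirm that the audit catalog is reachable from Python. The following is only a sketch (not part of this plugin), assuming the `sqlalchemy-trino` dialect that the plugin itself depends on; the host, port, username, and catalog/schema names are placeholders matching the sample recipe below.

```python
from sqlalchemy import create_engine

# Placeholder coordinates -- replace with your own Trino host and credentials.
engine = create_engine("trino://trino_username@yourtrinohost:8080/audit")

# The event logger writes completed queries into <audit_catalog>.<audit_schema>.completed_queries.
rows = engine.execute(
    "SELECT count(*) FROM audit.audit_schema.completed_queries"
).fetchall()
print(rows)
```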

#### Usage stats ingestion job
Here is a sample recipe to ingest usage data:
```yml
source:
  type: starburst-trino-usage
  config:
    # Coordinates
    host_port: yourtrinohost:port
    # The name of the catalog to get usage from
    database: hive
    # Credentials
    username: trino_username
    password: trino_password
    email_domain: test.com
    audit_catalog: audit
    audit_schema: audit_schema

sink:
  type: "datahub-rest"
  config:
    server: "http://localhost:8080"
```
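
The same recipe can also be run from Python instead of the CLI. This is only a sketch, assuming the standard `datahub.ingestion.run.pipeline.Pipeline` helper from the metadata ingestion framework; the dict mirrors the YAML recipe above and all values are placeholders.

```python
from datahub.ingestion.run.pipeline import Pipeline

# Same content as the YAML recipe above, expressed as a Python dict.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "starburst-trino-usage",
            "config": {
                "host_port": "yourtrinohost:port",
                "database": "hive",
                "username": "trino_username",
                "password": "trino_password",
                "email_domain": "test.com",
                "audit_catalog": "audit",
                "audit_schema": "audit_schema",
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```
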
### Config details

Note that a `.` is used to denote nested fields in the YAML recipe.

By default, we extract usage stats for the last full day; we recommend running this source once a day.

| Field             | Required | Default                                                         | Description                                                      |
| ----------------- | -------- | --------------------------------------------------------------- | ---------------------------------------------------------------- |
| `database`        | yes      |                                                                  | The name of the catalog to extract usage for.                    |
| `audit_catalog`   | yes      |                                                                  | The catalog in which the audit table can be found.               |
| `audit_schema`    | yes      |                                                                  | The schema in which the audit table can be found.                |
| `email_domain`    | yes      |                                                                  | The email domain appended to usernames that lack one.            |
| `env`             |          | `"PROD"`                                                         | Environment to use in namespace when constructing URNs.          |
| `bucket_duration` |          | `"DAY"`                                                          | Duration to bucket usage events by. Can be `"DAY"` or `"HOUR"`.  |
| `start_time`      |          | Last full day in UTC (or hour, depending on `bucket_duration`)  | Earliest date of usage logs to consider.                         |
| `end_time`        |          | Last full day in UTC (or hour, depending on `bucket_duration`)  | Latest date of usage logs to consider.                           |
| `top_n_queries`   |          | `10`                                                             | Number of top queries to save to each table.                     |
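
For intuition on the `start_time`/`end_time` defaults above, a rough illustration of a "last full day in UTC" window is sketched below; this is not the source's exact code, just the arithmetic the defaults describe.

```python
from datetime import datetime, timedelta, timezone

# Rough illustration of the default "last full day in UTC" window (bucket_duration = "DAY").
today_utc = datetime.now(timezone.utc).replace(hour=0, minute=0, second=0, microsecond=0)
start_time = today_utc - timedelta(days=1)  # beginning of the last full day
end_time = today_utc                        # exclusive upper bound, matching the SQL filter
print(start_time, end_time)
```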

## Questions

If you've got any questions on configuring this source, feel free to ping us on [our Slack](https://slack.datahubproject.io/)!
14 changes: 11 additions & 3 deletions metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -418,9 +418,17 @@ def loop_tables(
                self.report.report_dropped(dataset_name)
                continue

            columns = inspector.get_columns(table, schema)
            if len(columns) == 0:
                self.report.report_warning(dataset_name, "missing column information")
            try:
                columns = inspector.get_columns(table, schema)
                if len(columns) == 0:
                    self.report.report_warning(
                        dataset_name, "missing column information"
                    )
            except Exception as e:
                self.report.report_warning(
                    dataset_name,
                    f"unable to get column information due to an error -> {e}",
                )

            try:
                # SQLALchemy stubs are incomplete and missing this method.
253 changes: 253 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
@@ -0,0 +1,253 @@
import collections
import dataclasses
import json
import logging
import sys
from datetime import datetime
from email.utils import parseaddr
from typing import Dict, Iterable, List

from dateutil import parser
from pydantic import Field
from pydantic.main import BaseModel
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine

if sys.version_info >= (3, 7): # noqa: C901
    import datahub.emitter.mce_builder as builder
    from datahub.configuration.time_window_config import get_time_bucket
    from datahub.ingestion.api.source import Source, SourceReport
    from datahub.ingestion.api.workunit import MetadataWorkUnit
    from datahub.ingestion.source.sql.trino import TrinoConfig
    from datahub.ingestion.source.usage.usage_common import (
        BaseUsageConfig,
        GenericAggregatedDataset,
    )

    logger = logging.getLogger(__name__)

    trino_datetime_format = "%Y-%m-%d %H:%M:%S.%f %Z"

    # Querying Starburst completed queries table
    # https://docs.starburst.io/latest/security/event-logger.html#completed-queries
    trino_usage_sql_comment = """
    SELECT DISTINCT usr,
                    query,
                    "catalog",
                    "schema",
                    query_type,
                    accessed_metadata,
                    create_time,
                    end_time
    FROM {audit_catalog}.{audit_schema}.completed_queries
    WHERE 1 = 1
    AND query_type = 'SELECT'
    AND create_time >= timestamp '{start_time}'
    AND end_time < timestamp '{end_time}'
    AND query_state = 'FINISHED'
    ORDER BY end_time desc
    """.strip()

    TrinoTableRef = str
    AggregatedDataset = GenericAggregatedDataset[TrinoTableRef]

    class TrinoConnectorInfo(BaseModel):
        partitionIds: List[str]
        truncated: bool

    class TrinoAccessedMetadata(BaseModel):
        catalog_name: str = Field(None, alias="catalogName")
        schema_name: str = Field(None, alias="schema")  # type: ignore
        table: str = None  # type: ignore
        columns: List[str]
        connector_info: TrinoConnectorInfo = Field(None, alias="connectorInfo")

    class TrinoJoinedAccessEvent(BaseModel):
        usr: str = None  # type:ignore
        query: str = None  # type: ignore
        catalog: str = None  # type: ignore
        schema_name: str = Field(None, alias="schema")
        query_type: str = None  # type:ignore
        table: str = None  # type:ignore
        accessed_metadata: List[TrinoAccessedMetadata]
        starttime: datetime = Field(None, alias="create_time")
        endtime: datetime = Field(None, alias="end_time")

    class TrinoUsageConfig(TrinoConfig, BaseUsageConfig):
        env: str = builder.DEFAULT_ENV
        email_domain: str
        audit_catalog: str
        audit_schema: str
        options: dict = {}

        def get_sql_alchemy_url(self):
            return super().get_sql_alchemy_url()

    @dataclasses.dataclass
    class TrinoUsageSource(Source):
        config: TrinoUsageConfig
        report: SourceReport = dataclasses.field(default_factory=SourceReport)

        @classmethod
        def create(cls, config_dict, ctx):
            config = TrinoUsageConfig.parse_obj(config_dict)
            return cls(ctx, config)

        def get_workunits(self) -> Iterable[MetadataWorkUnit]:
            access_events = self._get_trino_history()
            # If the query results is empty, we don't want to proceed
            if not access_events:
                return []

            joined_access_event = self._get_joined_access_event(access_events)
            aggregated_info = self._aggregate_access_events(joined_access_event)

            for time_bucket in aggregated_info.values():
                for aggregate in time_bucket.values():
                    wu = self._make_usage_stat(aggregate)
                    self.report.report_workunit(wu)
                    yield wu

        def _make_usage_query(self) -> str:
            return trino_usage_sql_comment.format(
                audit_catalog=self.config.audit_catalog,
                audit_schema=self.config.audit_schema,
                start_time=self.config.start_time.strftime(trino_datetime_format),
                end_time=self.config.end_time.strftime(trino_datetime_format),
            )

        def _make_sql_engine(self) -> Engine:
            url = self.config.get_sql_alchemy_url()
            logger.debug(f"sql_alchemy_url = {url}")
            engine = create_engine(url, **self.config.options)
            return engine

        def _get_trino_history(self):
            query = self._make_usage_query()
            engine = self._make_sql_engine()
            results = engine.execute(query)
            events = []
            for row in results:
                # minor type conversion
                if hasattr(row, "_asdict"):
                    event_dict = row._asdict()
                else:
                    event_dict = dict(row)

                # stripping extra spaces caused by above _asdict() conversion
                for k, v in event_dict.items():
                    if isinstance(v, str):
                        event_dict[k] = v.strip()

                if event_dict.get("starttime", None):
                    event_dict["starttime"] = event_dict.get("starttime").__str__()
                if event_dict.get("endtime", None):
                    event_dict["endtime"] = event_dict.get("endtime").__str__()

                logger.debug(f"event_dict: {event_dict}")
                events.append(event_dict)

            if events:
                return events

            # SQL results can be empty. If results is empty, the SQL connection closes.
            # Then, we don't want to proceed ingestion.
            logging.info("SQL Result is empty")
            return None

        def _convert_str_to_datetime(self, v):
            if isinstance(v, str):
                isodate = parser.parse(v)  # compatible with Python 3.6+
                return isodate

        def _get_joined_access_event(self, events):
            joined_access_events = []
            for event_dict in events:
                event_dict["create_time"] = self._convert_str_to_datetime(
                    event_dict.get("create_time")
                )

                event_dict["end_time"] = self._convert_str_to_datetime(
                    event_dict.get("end_time")
                )

                if not event_dict["accessed_metadata"]:
                    logging.info("Field accessed_metadata is empty. Skipping ....")
                    continue

                event_dict["accessed_metadata"] = json.loads(
                    event_dict["accessed_metadata"]
                )

                if not event_dict.get("usr"):
                    logging.info("The username parameter is missing. Skipping ....")
                    continue

                joined_access_events.append(TrinoJoinedAccessEvent(**event_dict))
            return joined_access_events

        def _aggregate_access_events(
            self, events: List[TrinoJoinedAccessEvent]
        ) -> Dict[datetime, Dict[TrinoTableRef, AggregatedDataset]]:
            datasets: Dict[
                datetime, Dict[TrinoTableRef, AggregatedDataset]
            ] = collections.defaultdict(dict)

            for event in events:
                floored_ts = get_time_bucket(
                    event.starttime, self.config.bucket_duration
                )
                for metadata in event.accessed_metadata:

                    # Skipping queries starting with $system@
                    if metadata.catalog_name.startswith("$system@"):
                        logging.debug(
                            f"Skipping system query for {metadata.catalog_name}..."
                        )
                        continue

                    # Filtering down queries to the selected catalog
                    if metadata.catalog_name != self.config.database:
                        continue

                    resource = f"{metadata.catalog_name}.{metadata.schema_name}.{metadata.table}"

                    agg_bucket = datasets[floored_ts].setdefault(
                        resource,
                        AggregatedDataset(
                            bucket_start_time=floored_ts, resource=resource
                        ),
                    )

                    # add @unknown.com to username
                    # current limitation in user stats UI, we need to provide email to show users
                    if "@" in parseaddr(event.usr)[1]:
                        username = event.usr
                    else:
                        username = f"{event.usr if event.usr else 'unknown'}@{self.config.email_domain}"

                    agg_bucket.add_read_entry(
                        username,
                        event.query,
                        metadata.columns,
                    )
            return datasets

        def _make_usage_stat(self, agg: AggregatedDataset) -> MetadataWorkUnit:
            return agg.make_usage_workunit(
                self.config.bucket_duration,
                lambda resource: builder.make_dataset_urn(
                    "trino", resource.lower(), self.config.env
                ),
                self.config.top_n_queries,
            )

        def get_report(self) -> SourceReport:
            return self.report

        def close(self) -> None:
            pass


else:
    raise ModuleNotFoundError("The trino usage plugin requires Python 3.7 or newer.")
