Skip to content

Commit

Permalink
Updating maintenance metrics workflow to support overwrite (#1222)
Browse files Browse the repository at this point in the history
* Adding expiry for github table

* Updating dynamo model to add expiry

* Updating github_activity_models

* Updating data generation

* Removing timestamp criteria for commit query

* Fixing datetime -> date for month and refactoring

* Updating Tests

* Update data-workflows/activity/tests/test_github_activity_model.py

Co-authored-by: Ashley Anderson <aganders3@gmail.com>

* Minor refactor

---------

Co-authored-by: Ashley Anderson <aganders3@gmail.com>
  • Loading branch information
manasaV3 and aganders3 committed Sep 6, 2023
1 parent efaf71a commit e34a96b
Show file tree
Hide file tree
Showing 14 changed files with 922 additions and 320 deletions.
2 changes: 2 additions & 0 deletions .happy/terraform/modules/ecs-stack/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ module github_dynamodb_table {
type = "S"
}
]
ttl_enabled = true
ttl_attribute_name = "expiry"
autoscaling_enabled = var.env == "dev" ? false : true
create_table = true
tags = var.tags
Expand Down
Empty file removed data-workflows/__init__.py
Empty file.
95 changes: 55 additions & 40 deletions data-workflows/activity/github_activity_model.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,85 @@
import logging
import time
from datetime import datetime
from datetime import date, datetime, time as dt_time
from enum import Enum, auto
from typing import Union, Optional
from typing import Callable, Optional

from nhcommons.models.github_activity import batch_write
from utils.utils import (
date_to_utc_timestamp_in_millis, datetime_to_utc_timestamp_in_millis
)
from dateutil.relativedelta import relativedelta

from nhcommons.models.github_activity import batch_write
from utils.utils import datetime_to_utc_timestamp_in_millis, to_datetime

logger = logging.getLogger(__name__)
TIMESTAMP_FORMAT = "TO_TIMESTAMP('{0:%Y-%m-%d %H:%M:%S}')"


class GitHubActivityType(Enum):
def __new__(
    cls,
    timestamp_formatter: Callable[[date], Optional[int]],
    type_id_formatter: str,
    projection: str,
    sort: str,
    expiry_formatter: Callable[[date], Optional[int]],
):
    """Create an enum member and attach its per-type configuration.

    The positional arguments are the elements of the tuple assigned to each
    member of the enum, in order.

    :param Callable timestamp_formatter: stored as ``timestamp_formatter``;
        called by ``format_to_timestamp``
    :param str type_id_formatter: ``str.format`` template stored as
        ``type_identifier_formatter``; called with ``repo`` and ``timestamp``
        keyword arguments by ``format_to_type_identifier``
    :param str projection: query projection text, stored as ``query_projection``
    :param str sort: query sorting text, stored as ``query_sorting``
    :param Callable expiry_formatter: stored as ``expiry_formatter``
    :return: the newly created member
    """
    github_activity_type = object.__new__(cls)
    # NOTE(review): this assigns ``_value`` rather than Enum's reserved
    # ``_value_`` attribute, so it does not set the member's ``.value`` --
    # confirm whether that is intentional before changing it.
    github_activity_type._value = auto()
    github_activity_type.timestamp_formatter = timestamp_formatter
    github_activity_type.type_identifier_formatter = type_id_formatter
    github_activity_type.query_projection = projection
    github_activity_type.query_sorting = sort
    github_activity_type.expiry_formatter = expiry_formatter
    return github_activity_type

# Each member is a 5-tuple matching the __new__ parameters:
# (timestamp_formatter, type_id_formatter, projection, sort, expiry_formatter)
LATEST = (
    datetime_to_utc_timestamp_in_millis,
    "LATEST:{repo}",
    "TO_TIMESTAMP(MAX(commit_author_date)) AS latest_commit",
    "name",
    # Expiry formatter always yields None for this type.
    lambda timestamp: None,
)
MONTH = (
    # The month value is first passed through to_datetime before being
    # converted to UTC milliseconds.
    lambda ts: datetime_to_utc_timestamp_in_millis(to_datetime(ts)),
    "MONTH:{timestamp:%Y%m}:{repo}",
    "DATE_TRUNC('month', TO_DATE(commit_author_date)) AS month, "
    "COUNT(*) AS commit_count",
    "name, month",
    # Expiry is 14 months after the record's month, truncated to whole
    # seconds (datetime.timestamp() returns POSIX seconds).
    # NOTE(review): if to_datetime returns a naive datetime, .timestamp()
    # interprets it in local time -- confirm it is UTC-aware.
    lambda ts: int((to_datetime(ts) + relativedelta(months=14)).timestamp()),
)
TOTAL = (
    # No timestamp is recorded for the all-time total.
    lambda timestamp: None,
    "TOTAL:{repo}",
    "COUNT(*) AS commit_count",
    "name",
    # Expiry formatter always yields None for this type.
    lambda timestamp: None,
)

def format_to_timestamp(self, timestamp: Optional[date]) -> Optional[int]:
    """Convert ``timestamp`` with this activity type's timestamp formatter.

    :param Optional[date] timestamp: value passed unchanged to the formatter
    :return: the formatter's result (an int, or None when the formatter
        produces no timestamp)
    """
    return self.timestamp_formatter(timestamp)

def format_to_type_identifier(
    self, repo_name: str, timestamp: Optional[date]
) -> str:
    """Render this activity type's identifier template.

    The template stored in ``type_identifier_formatter`` is filled in with
    ``str.format`` using the keyword arguments ``repo`` and ``timestamp``.

    :param str repo_name: substituted for ``{repo}``
    :param Optional[date] timestamp: substituted for ``{timestamp}``; may be
        None for templates that do not reference it
    :return: the formatted type identifier
    """
    return self.type_identifier_formatter.format(
        repo=repo_name, timestamp=timestamp
    )

def to_expiry(self, timestamp: Optional[date]) -> Optional[int]:
    """Compute the expiry value for a record of this activity type.

    :param Optional[date] timestamp: value passed unchanged to the
        expiry formatter
    :return: the expiry formatter's result (an int, or None)
    """
    return self.expiry_formatter(timestamp)

def _create_subquery(self, plugins_by_earliest_ts: dict[str, datetime]) -> str:
    """Build the filter condition restricting the query to the given repos.

    Only the keys of ``plugins_by_earliest_ts`` are used; the timestamp
    values are ignored.

    :param dict[str, datetime] plugins_by_earliest_ts: mapping whose keys are
        the names matched against the ``repo`` column
    :return: ``repo IN (...)``, and for MONTH an additional lower bound on
        ``commit_author_date``
    """
    # Names are interpolated as-is, without quote escaping.
    plugins = [f"'{plugin}'" for plugin in plugins_by_earliest_ts]
    plugins_subquery = f"repo IN ({','.join(plugins)})"

    if self is not GitHubActivityType.MONTH:
        return plugins_subquery

    # MONTH only looks back to midnight on the first day of the month that
    # was 14 months before today.
    earliest_date = (date.today() - relativedelta(months=14)).replace(day=1)
    timestamp = datetime.combine(earliest_date, dt_time.min)
    return (
        f"{plugins_subquery} AND "
        f"TO_TIMESTAMP(commit_author_date) >= {TIMESTAMP_FORMAT.format(timestamp)}"
    )

def get_query(self, plugins_by_earliest_ts: dict[str, datetime]) -> str:
return f"""
Expand All @@ -83,14 +96,14 @@ def get_query(self, plugins_by_earliest_ts: dict[str, datetime]) -> str:
"""


def transform_and_write_to_dynamo(data: dict[str, list],
activity_type: GitHubActivityType,
plugin_name_by_repo: dict[str, str]) -> None:
"""Transforms data generated by get_plugins_commit_count_since_timestamp to
the expected format and then writes the formatted data to the corresponding
github-activity dynamo table in each environment
:param dict[str, list] data: plugin commit data in which the key is plugin
name and the value is GitHub activities
def transform_and_write_to_dynamo(
data: dict[str, list],
activity_type: GitHubActivityType,
plugin_name_by_repo: dict[str, str],
) -> None:
"""Transforms data to the json of _GitHubActivity model and then batch writes the
formatted data to github-activity dynamo table
:param dict[str, list] data: plugin commit activities data keyed on plugin name
:param GitHubActivityType activity_type:
:param dict[str, str] plugin_name_by_repo: dict mapping repo to plugin name
"""
Expand All @@ -105,22 +118,24 @@ def transform_and_write_to_dynamo(data: dict[str, list],
if plugin_name is None:
logger.warning(f"Unable to find plugin name for repo={repo}")
continue

for activity in github_activities:
timestamp = activity.get("timestamp")
type_identifier = activity_type.format_to_type_identifier(
repo, timestamp
)
type_identifier = activity_type.format_to_type_identifier(repo, timestamp)
item = {
"plugin_name": plugin_name.lower(),
"type_identifier": type_identifier,
"granularity": granularity,
"timestamp": activity_type.format_to_timestamp(timestamp),
"commit_count": activity.get("count"),
"repo": repo,
"expiry": activity_type.to_expiry(timestamp),
}
batch.append(item)

batch_write(batch)
duration = (time.perf_counter() - start) * 1000
logger.info(f"Completed processing for github-activity type={granularity} "
f"count={len(batch)} timeTaken={duration}ms")
logger.info(
f"Completed processing for github-activity type={granularity} "
f"count={len(batch)} timeTaken={duration}ms"
)
Loading

0 comments on commit e34a96b

Please sign in to comment.