From 5b138613a072d937dabbaf4b2dbd1d83b7062ed4 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 17 Mar 2023 18:13:22 -0500 Subject: [PATCH 01/46] RedisScalar class Signed-off-by: Isaac Milarsky --- augur/tasks/util/collection_util.py | 5 +++- augur/tasks/util/redis_scalar.py | 37 +++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 augur/tasks/util/redis_scalar.py diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 53824a9ab7..16e569b797 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -431,7 +431,10 @@ def __init__(self,session,repos: List[str]=[],collection_phases: List[str]=[],co #now returns resulting weight after either reaching zero or #scheduling all repos assigned to the object. def start_data_collection(self): - super().start_data_collection() + #Send messages starts each repo and yields its running info + #to concurrently update the correct field in the database. + for repo_git, task_id in self.send_messages(): + self.update_status_and_id(repo_git,task_id) return self.total_repo_weight.value diff --git a/augur/tasks/util/redis_scalar.py b/augur/tasks/util/redis_scalar.py new file mode 100644 index 0000000000..29491cce67 --- /dev/null +++ b/augur/tasks/util/redis_scalar.py @@ -0,0 +1,37 @@ +"""This module defines the RedisCount class. +It imports the redis_connection as redis which is a connection to the redis cache +""" +from typing import Iterable, Any, Union + +from collections.abc import MutableSequence +from augur.tasks.init.redis_connection import redis_connection as redis +from augur import instance_id +from redis import exceptions +import numbers + +class RedisScalar: + + def __init__(self, scalar_name: str, default_value: int = 0, override_existing: bool = False): + + self.redis_scalar_key = f"{instance_id}_{scalar_name}" + self._scalar_name = scalar_name + + self.__value = default_value + + #Check redis to see if key exists in cache + if 1 != redis.exists(self.redis_scalar_key) or override_existing: + #Set value + redis.set(self.redis_scalar_key,self.__value) + else: + #else get the value + self.__value = int(redis.get(self.redis_scalar_key)) + + @property + def value(self): + return self.__value + + @value.setter + def value(self, otherVal): + if isinstance(otherVal, numbers.Number): + self.__value = otherVal + redis.set(self.redis_scalar_key,self.__value) From 1fd6ffbf9a42305e0fd0bb7d394b6cb33e3cbba1 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 24 Mar 2023 12:36:19 -0500 Subject: [PATCH 02/46] libyear fix Signed-off-by: Isaac Milarsky --- .../git/dependency_libyear_tasks/libyear_util/util.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py index 1089c29281..0d47424588 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py @@ -139,7 +139,12 @@ def get_deps_libyear_data(path, logger): except KeyError: current_release_date = None - libyear = get_libyear(current_version, current_release_date, latest_version, latest_release_date) + if current_release_date: + libyear = get_libyear(current_version, current_release_date, latest_version, latest_release_date) + else: + current_release_date = dateutil.parser.parse('1970-01-01 00:00:00') + libyear = -1 + if not latest_release_date: latest_release_date = dateutil.parser.parse('1970-01-01 00:00:00') libyear = -1 From 77dc04d1a34b31fadeaf8d9055f3d9c4bc47f1ed Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 24 Mar 2023 13:04:03 -0500 Subject: [PATCH 03/46] Implement redhat weight changes Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 62 ++++++++++++++------------------------ 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 7abc7a06d7..cde51ec672 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -33,6 +33,7 @@ from augur.tasks.util.redis_list import RedisList from augur.application.db.models import CollectionStatus, Repo from augur.tasks.util.collection_util import * +from augur.tasks.util.redis_scalar import RedisScalar CELERY_GROUP_TYPE = type(group()) CELERY_CHAIN_TYPE = type(chain()) @@ -74,36 +75,6 @@ def primary_repo_collect_phase(repo_git): #A chain is needed for each repo. repo_info_task = collect_repo_info.si(repo_git)#collection_task_wrapper(self) - -## I think this section is outdated -# ### Section from traffic metric merge that may need to be changed - -# with DatabaseSession(logger) as session: -# query = session.query(Repo) -# repos = execute_session_query(query, 'all') -# #Just use list comprehension for simple group -# repo_info_tasks = [collect_repo_info.si(repo.repo_git) for repo in repos] - -# for repo in repos: -# first_tasks_repo = group(collect_issues.si(repo.repo_git),collect_pull_requests.si(repo.repo_git),collect_github_repo_clones_data.si(repo.repo_git)) -# second_tasks_repo = group(collect_events.si(repo.repo_git), -# collect_github_messages.si(repo.repo_git),process_pull_request_files.si(repo.repo_git), process_pull_request_commits.si(repo.repo_git)) - -# repo_chain = chain(first_tasks_repo,second_tasks_repo) -# issue_dependent_tasks.append(repo_chain) - -# repo_task_group = group( -# *repo_info_tasks, -# chain(group(*issue_dependent_tasks),process_contributors.si()), -# generate_facade_chain(logger), -# collect_releases.si() -# ) - -# chain(repo_task_group, refresh_materialized_views.si()).apply_async() - -# #### End of section from traffic metric merge that may need to be changed - - primary_repo_jobs = group( collect_issues.si(repo_git), collect_pull_requests.si(repo_git) @@ -189,7 +160,7 @@ def non_repo_domain_tasks(): Each collection hook schedules tasks for a number of repos that are either new or older than a set amount of days. """ -def start_primary_collection(session,max_repo,days): +def start_primary_collection(session,max_repo,days,max_collection_weight): #Get list of enabled phases enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) @@ -229,13 +200,13 @@ def core_task_success_util_gen(repo_git): session.logger.info(f"Primary collection starting for: {tuple(repo_git_identifiers)}") - primary_augur_collection = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=primary_enabled_phases) + primary_augur_collection = AugurWeightedTaskRoutine(session,repos=repo_git_identifiers,collection_phases=primary_enabled_phases,total_repo_weight=max_collection_weight) #Start data collection and update the collectionStatus with the task_ids primary_augur_collection.start_data_collection() -def start_secondary_collection(session,max_repo,days): +def start_secondary_collection(session,max_repo,days,max_collection_weight): #Get list of enabled phases enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) @@ -273,7 +244,7 @@ def secondary_task_success_util_gen(repo_git): session.logger.info(f"Secondary collection starting for: {tuple(repo_git_identifiers)}") - secondary_augur_collection = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=secondary_enabled_phases,collection_hook="secondary") + secondary_augur_collection = AugurWeightedTaskRoutine(session,repos=repo_git_identifiers,collection_phases=secondary_enabled_phases,collection_hook="secondary",total_repo_weight=max_collection_weight) secondary_augur_collection.start_data_collection() @@ -314,7 +285,7 @@ def facade_clone_update_success_util_gen(repo_git): facade_augur_collection.start_data_collection() -def start_facade_collection(session,max_repo,days): +def start_facade_collection(session,max_repo,days,max_collection_weight): #Deal with secondary collection facade_enabled_phases = [] @@ -347,7 +318,7 @@ def facade_task_success_util_gen(repo_git): session.logger.info(f"Facade collection starting for: {tuple(repo_git_identifiers)}") - facade_augur_collection = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=facade_enabled_phases,collection_hook="facade") + facade_augur_collection = AugurWeightedTaskRoutine(session,repos=repo_git_identifiers,collection_phases=facade_enabled_phases,collection_hook="facade",total_repo_weight=max_collection_weight) facade_augur_collection.start_data_collection() @@ -360,19 +331,32 @@ def augur_collection_monitor(): logger.info("Checking for repos to collect") + #These weights correspond to prs and issues. + #i.e. a weight of 10,000 means 10,000 prs and issues should be scheduled at once. + core_weight = RedisScalar("core-weight",default_value=10000) + secondary_weight = RedisScalar("secondary-weight",default_value=10000) + + #This weight corresponds to commit count + facade_weight = RedisScalar("facade-weight", default_value=20000) + + logger.info("Checking current weights for repo collection") + logger.info(f"Core weight: {core_weight.value}") + logger.info(f"Secondary weight: {secondary_weight.value}") + logger.info(f"Facade weight: {facade_weight.value}") + with DatabaseSession(logger, engine) as session: #Get list of enabled phases enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) if primary_repo_collect_phase.__name__ in enabled_phase_names: - start_primary_collection(session, max_repo=50, days=30) + start_primary_collection(session, max_repo=50, days=30,max_collection_weight=core_weight.value) if secondary_repo_collect_phase.__name__ in enabled_phase_names: - start_secondary_collection(session, max_repo=30, days=30) + start_secondary_collection(session, max_repo=30, days=30,max_collection_weight=secondary_weight.value) if facade_phase.__name__ in enabled_phase_names: #Schedule facade collection before clone/updates as that is a higher priority - start_facade_collection(session, max_repo=30, days=30) + start_facade_collection(session, max_repo=30, days=30,max_collection_weight=facade_weight.value) start_facade_clone_update(session,max_repo=15,days=30) From b103acb729257ff545acc21db4522bfc5995c15e Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 24 Mar 2023 13:17:30 -0500 Subject: [PATCH 04/46] subtract weights Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index cde51ec672..a956753a7a 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -203,7 +203,7 @@ def core_task_success_util_gen(repo_git): primary_augur_collection = AugurWeightedTaskRoutine(session,repos=repo_git_identifiers,collection_phases=primary_enabled_phases,total_repo_weight=max_collection_weight) #Start data collection and update the collectionStatus with the task_ids - primary_augur_collection.start_data_collection() + return primary_augur_collection.start_data_collection() def start_secondary_collection(session,max_repo,days,max_collection_weight): @@ -246,7 +246,7 @@ def secondary_task_success_util_gen(repo_git): secondary_augur_collection = AugurWeightedTaskRoutine(session,repos=repo_git_identifiers,collection_phases=secondary_enabled_phases,collection_hook="secondary",total_repo_weight=max_collection_weight) - secondary_augur_collection.start_data_collection() + return secondary_augur_collection.start_data_collection() def start_facade_clone_update(session,max_repo,days): facade_enabled_phases = [] @@ -320,7 +320,7 @@ def facade_task_success_util_gen(repo_git): facade_augur_collection = AugurWeightedTaskRoutine(session,repos=repo_git_identifiers,collection_phases=facade_enabled_phases,collection_hook="facade",total_repo_weight=max_collection_weight) - facade_augur_collection.start_data_collection() + return facade_augur_collection.start_data_collection() @celery.task def augur_collection_monitor(): @@ -349,15 +349,21 @@ def augur_collection_monitor(): enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) if primary_repo_collect_phase.__name__ in enabled_phase_names: - start_primary_collection(session, max_repo=50, days=30,max_collection_weight=core_weight.value) + raw_weight = start_primary_collection(session, max_repo=50, days=30,max_collection_weight=core_weight.value) + + #Subtract weight used. + core_weight.value = core_weight.value - raw_weight if secondary_repo_collect_phase.__name__ in enabled_phase_names: - start_secondary_collection(session, max_repo=30, days=30,max_collection_weight=secondary_weight.value) + raw_weight = start_secondary_collection(session, max_repo=30, days=30,max_collection_weight=secondary_weight.value) + secondary_weight.value = secondary_weight.value - raw_weight if facade_phase.__name__ in enabled_phase_names: #Schedule facade collection before clone/updates as that is a higher priority - start_facade_collection(session, max_repo=30, days=30,max_collection_weight=facade_weight.value) + weight = start_facade_collection(session, max_repo=30, days=30,max_collection_weight=facade_weight.value) start_facade_clone_update(session,max_repo=15,days=30) + facade_weight.value = facade_weight.value - weight + From 83417bd5ef0d7067993a192672a7d1f5268aaa80 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 24 Mar 2023 13:22:35 -0500 Subject: [PATCH 05/46] dumb python type cast Signed-off-by: Isaac Milarsky --- augur/tasks/util/redis_scalar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/util/redis_scalar.py b/augur/tasks/util/redis_scalar.py index 29491cce67..88ebfb3d43 100644 --- a/augur/tasks/util/redis_scalar.py +++ b/augur/tasks/util/redis_scalar.py @@ -24,7 +24,7 @@ def __init__(self, scalar_name: str, default_value: int = 0, override_existing: redis.set(self.redis_scalar_key,self.__value) else: #else get the value - self.__value = int(redis.get(self.redis_scalar_key)) + self.__value = int(float(redis.get(self.redis_scalar_key))) @property def value(self): From 4222d607ed93b657250c5041ae11d6e14746bfb5 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 24 Mar 2023 13:29:36 -0500 Subject: [PATCH 06/46] fix no return and more clear logging Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 6 +++--- augur/tasks/util/collection_util.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index a956753a7a..6037cedfa2 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -196,7 +196,7 @@ def core_task_success_util_gen(repo_git): session.logger.info(f"Starting primary collection on {len(repo_git_identifiers)} repos") if len(repo_git_identifiers) == 0: - return + return 0 session.logger.info(f"Primary collection starting for: {tuple(repo_git_identifiers)}") @@ -240,7 +240,7 @@ def secondary_task_success_util_gen(repo_git): session.logger.info(f"Starting secondary collection on {len(repo_git_identifiers)} repos") if len(repo_git_identifiers) == 0: - return + return 0 session.logger.info(f"Secondary collection starting for: {tuple(repo_git_identifiers)}") @@ -314,7 +314,7 @@ def facade_task_success_util_gen(repo_git): session.logger.info(f"Starting facade collection on {len(repo_git_identifiers)} repos") if len(repo_git_identifiers) == 0: - return + return 0 session.logger.info(f"Facade collection starting for: {tuple(repo_git_identifiers)}") diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 16e569b797..423d41d37d 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -337,7 +337,7 @@ class to keep track of various groups of collection tasks for a group of repos. session: Database session to use """ def __init__(self,session,repos: List[str]=[],collection_phases: List=[],collection_hook: str="core"): - self.logger = AugurLogger("data_collection_jobs").get_logger() + self.logger = session.logger #self.session = TaskSession(self.logger) self.collection_phases = collection_phases #self.disabled_collection_tasks = disabled_collection_tasks From 2f47a028804dc9555d63c30944f08904782d8d12 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 10 Apr 2023 17:43:08 -0500 Subject: [PATCH 07/46] Scrap hard limit idea Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 41 +++++++++----------------------------- 1 file changed, 9 insertions(+), 32 deletions(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 69d7f8df77..f0c3518088 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -160,7 +160,7 @@ def non_repo_domain_tasks(): Each collection hook schedules tasks for a number of repos that are either new or older than a set amount of days. """ -def start_primary_collection(session,max_repo,days,max_collection_weight): +def start_primary_collection(session,max_repo,days): #Get list of enabled phases enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) @@ -200,13 +200,10 @@ def core_task_success_util_gen(repo_git): session.logger.info(f"Primary collection starting for: {tuple(repo_git_identifiers)}") - primary_augur_collection = AugurWeightedTaskRoutine(session,repos=repo_git_identifiers,collection_phases=primary_enabled_phases,total_repo_weight=max_collection_weight) + primary_augur_collection = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=primary_enabled_phases) - #Start data collection and update the collectionStatus with the task_ids - return primary_augur_collection.start_data_collection() - -def start_secondary_collection(session,max_repo,days,max_collection_weight): +def start_secondary_collection(session,max_repo,days): #Get list of enabled phases enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) @@ -244,9 +241,8 @@ def secondary_task_success_util_gen(repo_git): session.logger.info(f"Secondary collection starting for: {tuple(repo_git_identifiers)}") - secondary_augur_collection = AugurWeightedTaskRoutine(session,repos=repo_git_identifiers,collection_phases=secondary_enabled_phases,collection_hook="secondary",total_repo_weight=max_collection_weight) + secondary_augur_collection = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=secondary_enabled_phases,collection_hook="secondary") - return secondary_augur_collection.start_data_collection() def start_facade_clone_update(session,max_repo,days): facade_enabled_phases = [] @@ -318,9 +314,8 @@ def facade_task_success_util_gen(repo_git): session.logger.info(f"Facade collection starting for: {tuple(repo_git_identifiers)}") - facade_augur_collection = AugurWeightedTaskRoutine(session,repos=repo_git_identifiers,collection_phases=facade_enabled_phases,collection_hook="facade",total_repo_weight=max_collection_weight) + facade_augur_collection = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=facade_enabled_phases,collection_hook="facade") - return facade_augur_collection.start_data_collection() @celery.task def augur_collection_monitor(): @@ -331,39 +326,21 @@ def augur_collection_monitor(): logger.info("Checking for repos to collect") - #These weights correspond to prs and issues. - #i.e. a weight of 10,000 means 10,000 prs and issues should be scheduled at once. - core_weight = RedisScalar("core-weight",default_value=10000) - secondary_weight = RedisScalar("secondary-weight",default_value=10000) - - #This weight corresponds to commit count - facade_weight = RedisScalar("facade-weight", default_value=20000) - - logger.info("Checking current weights for repo collection") - logger.info(f"Core weight: {core_weight.value}") - logger.info(f"Secondary weight: {secondary_weight.value}") - logger.info(f"Facade weight: {facade_weight.value}") - with DatabaseSession(logger, engine) as session: #Get list of enabled phases enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) if primary_repo_collect_phase.__name__ in enabled_phase_names: - raw_weight = start_primary_collection(session, max_repo=20, days=30,max_collection_weight=core_weight.value) + start_primary_collection(session, max_repo=20, days=30) - #Subtract weight used. - core_weight.value = core_weight.value - raw_weight - if secondary_repo_collect_phase.__name__ in enabled_phase_names: - raw_weight = start_secondary_collection(session, max_repo=30, days=30,max_collection_weight=secondary_weight.value) - secondary_weight.value = secondary_weight.value - raw_weight + start_secondary_collection(session, max_repo=30, days=30) + if facade_phase.__name__ in enabled_phase_names: #Schedule facade collection before clone/updates as that is a higher priority - weight = start_facade_collection(session, max_repo=20, days=30,max_collection_weight=facade_weight.value) + start_facade_collection(session, max_repo=20, days=30) start_facade_clone_update(session,max_repo=15,days=30) - facade_weight.value = facade_weight.value - weight - From a952d550f78653133f879f76cf15b9de6fdecaca Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 10 Apr 2023 18:10:07 -0500 Subject: [PATCH 08/46] add weights to collection_status Signed-off-by: Isaac Milarsky --- .../application/db/models/augur_operations.py | 12 ++++- .../facade_worker/facade02utilitymethods.py | 4 +- augur/tasks/util/collection_util.py | 54 +++---------------- 3 files changed, 21 insertions(+), 49 deletions(-) diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 2206f612cd..e565b27b79 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -15,6 +15,8 @@ from augur.application.db.session import DatabaseSession from augur.application.db.models.base import Base +from augur.tasks.util.collection_util import get_repo_weight_core, get_repo_weight_facade + FRONTEND_REPO_GROUP_NAME = "Frontend Repos" logger = logging.getLogger(__name__) @@ -946,14 +948,22 @@ class CollectionStatus(Base): facade_status = Column(String,nullable=False, server_default=text("'Pending'")) facade_data_last_collected = Column(TIMESTAMP) facade_task_id = Column(String) + + core_weight = Column(BigInteger,nullable=False, server_default=text("0")) + facade_weight = Column(BigInteger,nullable=False, server_default=text("0")) repo = relationship("Repo", back_populates="collection_status") @staticmethod def insert(session, repo_id): + query = s.sql.text("""SELECT repo_git FROM repo + WHERE repo_id=:value""").bindparams(value=repo_id) + + repo = session.execute_sql(query).fetchone() + repo_git = repo[0] collection_status_unique = ["repo_id"] - result = session.insert_data({"repo_id": repo_id}, CollectionStatus, collection_status_unique, on_conflict_update=False) + result = session.insert_data({"repo_id": repo_id, "core_weight": get_repo_weight_core(session.logger, repo_git), "facade_weight": get_repo_weight_facade(session.logger, repo_git)}, CollectionStatus, collection_status_unique, on_conflict_update=False) if not result: return False diff --git a/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py index 9d35faeaed..80b665fe32 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py @@ -145,7 +145,7 @@ def get_existing_commits_set(session, repo_id): def date_weight_factor(days_since_last_collection): return (days_since_last_collection ** 3) / 25 -def get_repo_weight_by_commit(logger,repo_git,days_since_last_collection): +def get_repo_weight_by_commit(logger,repo_git): with FacadeSession(logger) as session: repo = Repo.get_by_repo_git(session, repo_git) absolute_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_group_id, repo.repo_path, repo.repo_name) @@ -156,4 +156,4 @@ def get_repo_weight_by_commit(logger,repo_git,days_since_last_collection): commit_count = int(check_commit_count_cmd) - return commit_count - date_weight_factor(days_since_last_collection) \ No newline at end of file + return commit_count \ No newline at end of file diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 423d41d37d..c9dacb9b93 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -135,7 +135,7 @@ def date_weight_factor(days_since_last_collection): return (days_since_last_collection ** 3) / 25 -def get_repo_weight_by_issue(logger,repo_git,days_since_last_collection): +def get_repo_weight_by_issue(logger,repo_git): owner,name = get_owner_repo(repo_git) @@ -144,7 +144,7 @@ def get_repo_weight_by_issue(logger,repo_git,days_since_last_collection): repo_graphql = GitHubRepoGraphql(logger, manifest.key_auth, owner, name) number_of_issues_and_prs = len(repo_graphql.get_issues_collection()) + len(repo_graphql.get_pull_requests_collection()) - return number_of_issues_and_prs - date_weight_factor(days_since_last_collection) + return number_of_issues_and_prs #Get the weight for each repo for the core collection hook @@ -156,17 +156,8 @@ def get_repo_weight_core(logger,repo_git): if not repo: raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") - status = repo.collection_status[0] - - last_collected = status.core_data_last_collected - if last_collected: - time_delta = datetime.datetime.now() - last_collected - days = time_delta.days - else: - days = 0 - - return get_repo_weight_by_issue(logger, repo_git, days) + return get_repo_weight_by_issue(logger, repo_git) @celery.task @@ -245,16 +236,8 @@ def get_repo_weight_facade(logger,repo_git): if not repo: raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") - status = repo.collection_status[0] - last_collected = status.facade_data_last_collected - - if last_collected: - time_delta = datetime.datetime.now() - last_collected - days = time_delta.days - else: - days = 0 - return get_repo_weight_by_commit(logger, repo_git, days) + return get_repo_weight_by_commit(logger, repo_git) @celery.task @@ -396,9 +379,9 @@ def send_messages(self): #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated yield repo_git, task_id - +""" class AugurWeightedTaskRoutine(AugurTaskRoutine): - """ + class to keep track of various groups of collection tasks for a group of repos. Intermediate class that takes into account relative weights of repos and stops after a set limit of repos limited by their size. @@ -411,7 +394,7 @@ class to keep track of various groups of collection tasks for a group of repos. collection_hook (str): String determining the attributes to update when collection for a repo starts. e.g. core session: Database session to use total_repo_weight (AugurCollectionTotalRepoWeight): object that allows repo objects and repo_git strings to be subtracted from it - """ + def __init__(self,session,repos: List[str]=[],collection_phases: List[str]=[],collection_hook: str="core",total_repo_weight=10000): #Define superclass vars @@ -467,25 +450,4 @@ def send_messages(self): #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated yield repo_git, task_id - -if __name__ == "__main__": - #Examples of using AugurCollectionTotalRepoWeight - weight = AugurCollectionTotalRepoWeight(10000) - print(f"Weight value: {weight.value}") - - #Apply subtraction operation with string - weight = weight - "https://github.com/chaoss/augur" - print(f"Weight value: {weight.value}") - - #Apply subtraction operation with orm object - with DatabaseSession(logging.getLogger()) as session: - repo = Repo.get_by_repo_git(session, 'https://github.com/operate-first/blueprint') - weight = weight - repo - - print(f"Weight value: {weight.value}") - - #Use commit count instead of issues and pr count - commitWeight = AugurCollectionTotalRepoWeight(100000,weight_calculation=get_repo_weight_facade) - print(f"commit weight value: {commitWeight.value}") - #commitWeight = commitWeight - "https://github.com/eclipse/che-theia-activity-tracker" - #print(f"commit weight value: {commitWeight.value}") \ No newline at end of file +""" \ No newline at end of file From 85212cf3997936dc99f5de2d91c5887a98d55a57 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 10 Apr 2023 18:30:26 -0500 Subject: [PATCH 09/46] add columns for collection status in alembic Signed-off-by: Isaac Milarsky --- .../application/db/models/augur_operations.py | 11 ++++-- ...dd_weight_data_to_collection_status_to_.py | 34 +++++++++++++++++++ augur/tasks/github/util/util.py | 28 ++++++++++++++- augur/tasks/util/collection_util.py | 26 ++------------ 4 files changed, 73 insertions(+), 26 deletions(-) create mode 100644 augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index e565b27b79..9b24c1d26b 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -15,7 +15,6 @@ from augur.application.db.session import DatabaseSession from augur.application.db.models.base import Base -from augur.tasks.util.collection_util import get_repo_weight_core, get_repo_weight_facade FRONTEND_REPO_GROUP_NAME = "Frontend Repos" @@ -956,6 +955,9 @@ class CollectionStatus(Base): @staticmethod def insert(session, repo_id): + from augur.tasks.github.util.util import get_repo_weight_core + from augur.tasks.git.util.facade_worker.facade_worker.facade02utilitymethods import get_repo_weight_by_commit + query = s.sql.text("""SELECT repo_git FROM repo WHERE repo_id=:value""").bindparams(value=repo_id) @@ -963,7 +965,12 @@ def insert(session, repo_id): repo_git = repo[0] collection_status_unique = ["repo_id"] - result = session.insert_data({"repo_id": repo_id, "core_weight": get_repo_weight_core(session.logger, repo_git), "facade_weight": get_repo_weight_facade(session.logger, repo_git)}, CollectionStatus, collection_status_unique, on_conflict_update=False) + + record = {"repo_id": repo_id, "core_weight": get_repo_weight_core(session.logger, repo_git), "facade_weight": get_repo_weight_by_commit(session.logger, repo_git)} + result = session.insert_data(record, CollectionStatus, collection_status_unique, on_conflict_update=False) + + session.logger.info(f"Trying to insert repo \n core weight: {record['core_weight']} \n facade_weight: {record['facade_weight']}") + if not result: return False diff --git a/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py b/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py new file mode 100644 index 0000000000..c4d3270cd7 --- /dev/null +++ b/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py @@ -0,0 +1,34 @@ +"""Add weight data to collection status to determine collection order of repos + +Revision ID: 16 +Revises: 15 +Create Date: 2023-04-10 18:28:12.460522 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '16' +down_revision = '15' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('collection_status', sa.Column('core_weight', sa.BigInteger(), server_default=sa.text('0'), nullable=False), schema='augur_operations') + op.add_column('collection_status', sa.Column('facade_weight', sa.BigInteger(), server_default=sa.text('0'), nullable=False), schema='augur_operations') + op.drop_constraint('collection_status_repo_id_fk', 'collection_status', schema='augur_operations', type_='foreignkey') + op.create_foreign_key('collection_status_repo_id_fk', 'collection_status', 'repo', ['repo_id'], ['repo_id'], source_schema='augur_operations', referent_schema='augur_data') + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_constraint('collection_status_repo_id_fk', 'collection_status', schema='augur_operations', type_='foreignkey') + op.create_foreign_key('collection_status_repo_id_fk', 'collection_status', 'repo', ['repo_id'], ['repo_id'], source_schema='augur_operations') + op.drop_column('collection_status', 'facade_weight', schema='augur_operations') + op.drop_column('collection_status', 'core_weight', schema='augur_operations') + # ### end Alembic commands ### diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index 37ee12dd5e..0a5a6326f2 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -4,7 +4,10 @@ import logging import json import httpx - +from augur.tasks.github.util.github_task_session import GithubTaskManifest +from augur.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql +from augur.application.db.session import DatabaseSession +from augur.application.db.models import Repo # This function adds a key value pair to a list of dicts and returns the modified list of dicts back def add_key_value_pair_to_dicts(data: List[dict], key: str, value: Any) -> List[dict]: @@ -53,3 +56,26 @@ def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dic logger.warning(f"invalid return from GitHub. Response was: {response.text}. Exception: {e}") return json.loads(json.dumps(response.text)) +def get_repo_weight_by_issue(logger,repo_git): + + + owner,name = get_owner_repo(repo_git) + + with GithubTaskManifest(logger) as manifest: + repo_graphql = GitHubRepoGraphql(logger, manifest.key_auth, owner, name) + number_of_issues_and_prs = len(repo_graphql.get_issues_collection()) + len(repo_graphql.get_pull_requests_collection()) + + return number_of_issues_and_prs + +#Get the weight for each repo for the core collection hook +def get_repo_weight_core(logger,repo_git): + from augur.tasks.init.celery_app import engine + + with DatabaseSession(logger,engine) as session: + repo = Repo.get_by_repo_git(session, repo_git) + if not repo: + raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") + + + return get_repo_weight_by_issue(logger, repo_git) + diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index c9dacb9b93..dc904a78ac 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -22,7 +22,7 @@ from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.application.db.session import DatabaseSession -from augur.tasks.git.util.facade_worker.facade_worker.facade02utilitymethods import get_repo_weight_by_commit + # class syntax class CollectionState(Enum): @@ -131,20 +131,9 @@ def core_task_success_util(repo_git): session.commit() -def date_weight_factor(days_since_last_collection): - return (days_since_last_collection ** 3) / 25 - +#def date_weight_factor(days_since_last_collection): +# return (days_since_last_collection ** 3) / 25 -def get_repo_weight_by_issue(logger,repo_git): - - - owner,name = get_owner_repo(repo_git) - - with GithubTaskManifest(logger) as manifest: - repo_graphql = GitHubRepoGraphql(logger, manifest.key_auth, owner, name) - number_of_issues_and_prs = len(repo_graphql.get_issues_collection()) + len(repo_graphql.get_pull_requests_collection()) - - return number_of_issues_and_prs #Get the weight for each repo for the core collection hook @@ -228,16 +217,7 @@ def facade_task_success_util(repo_git): session.commit() -def get_repo_weight_facade(logger,repo_git): - from augur.tasks.init.celery_app import engine - - with DatabaseSession(logger,engine) as session: - repo = Repo.get_by_repo_git(session, repo_git) - if not repo: - raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") - - return get_repo_weight_by_commit(logger, repo_git) @celery.task From b4b0cea3b268130a0ddcdf8ad4e347ee46b453ee Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 10 Apr 2023 18:53:20 -0500 Subject: [PATCH 10/46] Update Signed-off-by: Isaac Milarsky --- augur/application/db/models/augur_operations.py | 1 + augur/tasks/github/util/util.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 9b24c1d26b..3baf8a996b 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -4,6 +4,7 @@ from sqlalchemy.orm.exc import NoResultFound from sqlalchemy.exc import IntegrityError from sqlalchemy.orm import relationship +import sqlalchemy as s from werkzeug.security import generate_password_hash, check_password_hash from typing import List, Any, Dict diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index 0a5a6326f2..49ecb53b29 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -5,7 +5,6 @@ import json import httpx from augur.tasks.github.util.github_task_session import GithubTaskManifest -from augur.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql from augur.application.db.session import DatabaseSession from augur.application.db.models import Repo @@ -57,7 +56,7 @@ def parse_json_response(logger: logging.Logger, response: httpx.Response) -> dic return json.loads(json.dumps(response.text)) def get_repo_weight_by_issue(logger,repo_git): - + from augur.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql owner,name = get_owner_repo(repo_git) From 45326c8b971e68b231a6016aefa2987840cea320 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 10 Apr 2023 19:26:20 -0500 Subject: [PATCH 11/46] print Signed-off-by: Isaac Milarsky --- augur/tasks/github/util/gh_graphql_entities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index 626c8ac51e..d35bffe02d 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -203,7 +203,7 @@ def request_graphql_dict(self,variables={},timeout_wait=10): if err and err != GithubApiResult.SUCCESS: attempts += 1 - self.logger.info(f"err: {err}") + self.logger.info(f"err: {err} \n response_data: {response_data}") continue success = True From c49471e82429caf09f28414465fec617f5daebce Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 10 Apr 2023 19:28:28 -0500 Subject: [PATCH 12/46] vscode Signed-off-by: Isaac Milarsky --- .../git/dependency_libyear_tasks/libyear_util/util.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py b/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py index 33c07df29b..2d60976983 100644 --- a/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py +++ b/augur/tasks/git/dependency_libyear_tasks/libyear_util/util.py @@ -164,18 +164,9 @@ def get_deps_libyear_data(path, logger): logger.error(f"Could not get latest version of dependency for path {path}.\n Dependency: {dependency}") current_release_date = dateutil.parser.parse('1970-01-01 00:00:00') -<<<<<<< HEAD - if current_release_date: - libyear = get_libyear(current_version, current_release_date, latest_version, latest_release_date) - else: - current_release_date = dateutil.parser.parse('1970-01-01 00:00:00') - libyear = -1 - -======= libyear = get_libyear(current_version, current_release_date, latest_version, latest_release_date) ->>>>>>> dev if not latest_release_date: latest_release_date = dateutil.parser.parse('1970-01-01 00:00:00') libyear = -1 From 74d79f260a7c22d48c54d2f9d24d9def8b991a0a Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 10 Apr 2023 19:45:31 -0500 Subject: [PATCH 13/46] fix schema Signed-off-by: Isaac Milarsky --- augur/application/db/models/augur_operations.py | 13 +++++++++---- .../16_add_weight_data_to_collection_status_to_.py | 4 ++-- augur/tasks/github/util/gh_graphql_entities.py | 7 ++++++- augur/tasks/github/util/github_paginator.py | 6 ++++++ 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 3ea9c68207..33cd7ee682 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -1011,8 +1011,8 @@ class CollectionStatus(Base): facade_data_last_collected = Column(TIMESTAMP) facade_task_id = Column(String) - core_weight = Column(BigInteger,nullable=False, server_default=text("0")) - facade_weight = Column(BigInteger,nullable=False, server_default=text("0")) + core_weight = Column(BigInteger) + facade_weight = Column(BigInteger) repo = relationship("Repo", back_populates="collection_status") @@ -1029,10 +1029,15 @@ def insert(session, repo_id): collection_status_unique = ["repo_id"] - record = {"repo_id": repo_id, "core_weight": get_repo_weight_core(session.logger, repo_git), "facade_weight": get_repo_weight_by_commit(session.logger, repo_git)} + try: + core_weight = get_repo_weight_core(session.logger, repo_git) + except Exception as e: + core_weight = None + + record = {"repo_id": repo_id, "core_weight": core_weight} result = session.insert_data(record, CollectionStatus, collection_status_unique, on_conflict_update=False) - session.logger.info(f"Trying to insert repo \n core weight: {record['core_weight']} \n facade_weight: {record['facade_weight']}") + session.logger.info(f"Trying to insert repo \n core weight: {record['core_weight']}") if not result: return False diff --git a/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py b/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py index c4d3270cd7..f7f94dfb6a 100644 --- a/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py +++ b/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py @@ -18,8 +18,8 @@ def upgrade(): # ### commands auto generated by Alembic - please adjust! ### - op.add_column('collection_status', sa.Column('core_weight', sa.BigInteger(), server_default=sa.text('0'), nullable=False), schema='augur_operations') - op.add_column('collection_status', sa.Column('facade_weight', sa.BigInteger(), server_default=sa.text('0'), nullable=False), schema='augur_operations') + op.add_column('collection_status', sa.Column('core_weight', sa.BigInteger()), schema='augur_operations') + op.add_column('collection_status', sa.Column('facade_weight', sa.BigInteger()), schema='augur_operations') op.drop_constraint('collection_status_repo_id_fk', 'collection_status', schema='augur_operations', type_='foreignkey') op.create_foreign_key('collection_status_repo_id_fk', 'collection_status', 'repo', ['repo_id'], ['repo_id'], source_schema='augur_operations', referent_schema='augur_data') # ### end Alembic commands ### diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index d35bffe02d..e4f718af68 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -200,7 +200,11 @@ def request_graphql_dict(self,variables={},timeout_wait=10): if type(response_data) == dict: err = process_dict_response(self.logger, result, response_data) - + + if err == GithubApiResult.REPO_NOT_FOUND: + self.logger.error(f"Repo not found! \n response_data: {response_data}") + return None + if err and err != GithubApiResult.SUCCESS: attempts += 1 self.logger.info(f"err: {err} \n response_data: {response_data}") @@ -227,6 +231,7 @@ def request_graphql_dict(self,variables={},timeout_wait=10): #If we get an error message that's not None if err and err != GithubApiResult.SUCCESS: + attempts += 1 continue success = True diff --git a/augur/tasks/github/util/github_paginator.py b/augur/tasks/github/util/github_paginator.py index 26b3c1c795..1c252d8ce6 100644 --- a/augur/tasks/github/util/github_paginator.py +++ b/augur/tasks/github/util/github_paginator.py @@ -134,6 +134,12 @@ def process_dict_response(logger: logging.Logger, response: httpx.Response, page logger.info(f"\n\n\nAPI rate limit exceeded. Sleeping until the key resets ({key_reset_time} seconds)") time.sleep(key_reset_time) return GithubApiResult.RATE_LIMIT_EXCEEDED + + err_type = error.get('type') + + if err_type and 'NOT_FOUND' in err_type: + return GithubApiResult.REPO_NOT_FOUND + return GithubApiResult.NEW_RESULT From 638925d73d5b920ea7d4b05cad63c04851422564 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 10 Apr 2023 21:01:05 -0500 Subject: [PATCH 14/46] fill in weight Signed-off-by: Isaac Milarsky --- augur/application/cli/collection.py | 2 -- .../application/db/models/augur_operations.py | 1 - augur/tasks/git/facade_tasks.py | 26 ++++++++++++++++++- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/augur/application/cli/collection.py b/augur/application/cli/collection.py index ad7533eebc..e29a04fe58 100644 --- a/augur/application/cli/collection.py +++ b/augur/application/cli/collection.py @@ -36,8 +36,6 @@ logger = AugurLogger("augur", reset_logfiles=True).get_logger() -def get_page_count() - def check_collection(owner, repo, key_manager, session): diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 33cd7ee682..41b5d57cf8 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -1019,7 +1019,6 @@ class CollectionStatus(Base): @staticmethod def insert(session, repo_id): from augur.tasks.github.util.util import get_repo_weight_core - from augur.tasks.git.util.facade_worker.facade_worker.facade02utilitymethods import get_repo_weight_by_commit query = sql_text("""SELECT repo_git FROM repo WHERE repo_id=:value""").bindparams(value=repo_id) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index a412554963..1fd543f185 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -24,10 +24,13 @@ from datetime import timedelta import sqlalchemy as s +from sqlalchemy import or_, and_, update from augur.tasks.git.util.facade_worker.facade_worker.facade02utilitymethods import update_repo_log, trim_commit, store_working_author, trim_author from augur.tasks.git.util.facade_worker.facade_worker.facade02utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set from augur.tasks.git.util.facade_worker.facade_worker.facade03analyzecommit import analyze_commit +from augur.tasks.git.util.facade_worker.facade_worker.facade02utilitymethods import get_repo_weight_by_commit + from augur.tasks.github.facade_github.tasks import * from augur.tasks.util.worker_util import create_grouped_task_load @@ -347,6 +350,25 @@ def git_repo_initialize_facade_task(repo_git): # with FacadeSession(logger) as session: # check_for_repo_updates(session, repo_git) +@celery.task +def git_update_commit_count_weight(repo_git): + + from augur.tasks.init.celery_app import engine + logger = logging.getLogger(git_update_commit_count_weight.__name__) + + weight = get_repo_weight_by_commit(logger,repo_git) + + with DatabaseSession(logger,engine=engine) as session: + repo = Repo.get_by_repo_git(session, repo_git) + + update_query = ( + update(CollectionStatus) + .where(CollectionStatus.repo_id == repo.repo_id) + .values(facade_weight=weight) + ) + + session.execute(update_query) + @celery.task def git_repo_updates_facade_task(repo_git): @@ -460,7 +482,9 @@ def facade_clone_update_phase(repo_git): if not limited_run or (limited_run and pull_repos): facade_sequence.append(git_repo_updates_facade_task.si(repo_git)) - + + facade_sequence.append(git_update_commit_count_weight.si(repo_git)) + return chain(*facade_sequence) From 61f41fd83d18f551fe0d94030907ab7be07ae83f Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 11 Apr 2023 09:47:31 -0500 Subject: [PATCH 15/46] Complete sort logic for collection weight Signed-off-by: Isaac Milarsky --- augur/tasks/git/facade_tasks.py | 3 +++ augur/tasks/start_tasks.py | 20 ++++++++++++++++---- augur/tasks/util/collection_util.py | 8 ++++++-- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 1fd543f185..d70c4bec76 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -358,6 +358,8 @@ def git_update_commit_count_weight(repo_git): weight = get_repo_weight_by_commit(logger,repo_git) + logger.info(f"Repo {repo_git} has a weight of {weight}") + with DatabaseSession(logger,engine=engine) as session: repo = Repo.get_by_repo_git(session, repo_git) @@ -368,6 +370,7 @@ def git_update_commit_count_weight(repo_git): ) session.execute(update_query) + session.commit() @celery.task diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 7fd45286b1..8f2a688e55 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -192,8 +192,10 @@ def core_task_success_util_gen(repo_git): limit = max_repo-active_repo_count + core_order = CollectionStatus.core_weight + #Get repos for primary collection hook - repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(not_erroed, not_collecting, or_(never_collected, old_collection)),limit) + repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(not_erroed, not_collecting, or_(never_collected, old_collection)),limit,order=core_order) session.logger.info(f"Starting primary collection on {len(repo_git_identifiers)} repos") if len(repo_git_identifiers) == 0: @@ -203,6 +205,8 @@ def core_task_success_util_gen(repo_git): primary_augur_collection = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=primary_enabled_phases) + primary_augur_collection.start_data_collection() + def start_secondary_collection(session,max_repo,days): @@ -234,7 +238,9 @@ def secondary_task_success_util_gen(repo_git): limit = max_repo-active_repo_count - repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(primary_collected,not_erroed, not_collecting, or_(never_collected, old_collection)),limit) + secondary_order = CollectionStatus.core_weight + + repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(primary_collected,not_erroed, not_collecting, or_(never_collected, old_collection)),limit,order=secondary_order) session.logger.info(f"Starting secondary collection on {len(repo_git_identifiers)} repos") if len(repo_git_identifiers) == 0: @@ -244,6 +250,8 @@ def secondary_task_success_util_gen(repo_git): secondary_augur_collection = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=secondary_enabled_phases,collection_hook="secondary") + secondary_augur_collection.start_data_collection() + def start_facade_clone_update(session,max_repo,days): facade_enabled_phases = [] @@ -282,7 +290,7 @@ def facade_clone_update_success_util_gen(repo_git): facade_augur_collection.start_data_collection() -def start_facade_collection(session,max_repo,days,max_collection_weight): +def start_facade_collection(session,max_repo,days): #Deal with secondary collection facade_enabled_phases = [] @@ -307,7 +315,9 @@ def facade_task_success_util_gen(repo_git): limit = max_repo-active_repo_count - repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(not_pending,not_failed_clone,not_erroed, not_collecting, not_initializing, or_(never_collected, old_collection)),limit) + facade_order = CollectionStatus.facade_weight + + repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(not_pending,not_failed_clone,not_erroed, not_collecting, not_initializing, or_(never_collected, old_collection)),limit,order=facade_order) session.logger.info(f"Starting facade collection on {len(repo_git_identifiers)} repos") if len(repo_git_identifiers) == 0: @@ -317,6 +327,8 @@ def facade_task_success_util_gen(repo_git): facade_augur_collection = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=facade_enabled_phases,collection_hook="facade") + facade_augur_collection.start_data_collection() + @celery.task def augur_collection_monitor(): diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index dc904a78ac..b86f64bba6 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -46,8 +46,12 @@ def get_enabled_phase_names_from_config(logger, session): #Query db for CollectionStatus records that fit the desired condition. #Used to get CollectionStatus for differant collection hooks -def get_collection_status_repo_git_from_filter(session,filter_condition,limit): - repo_status_list = session.query(CollectionStatus).filter(filter_condition).limit(limit).all() +def get_collection_status_repo_git_from_filter(session,filter_condition,limit,order=None): + + if order: + repo_status_list = session.query(CollectionStatus).filter(filter_condition).order_by(order).limit(limit).all() + else: + repo_status_list = session.query(CollectionStatus).filter(filter_condition).limit(limit).all() return [status.repo.repo_git for status in repo_status_list] From 25e3ac89bca268f89ea41dfebdabf2d1caf4f0dd Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 11 Apr 2023 10:07:42 -0500 Subject: [PATCH 16/46] syntax Signed-off-by: Isaac Milarsky --- augur/tasks/util/collection_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index b86f64bba6..b216387737 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -48,8 +48,8 @@ def get_enabled_phase_names_from_config(logger, session): #Used to get CollectionStatus for differant collection hooks def get_collection_status_repo_git_from_filter(session,filter_condition,limit,order=None): - if order: - repo_status_list = session.query(CollectionStatus).filter(filter_condition).order_by(order).limit(limit).all() + if order is not None: + repo_status_list = session.query(CollectionStatus).order_by(order).filter(filter_condition).limit(limit).all() else: repo_status_list = session.query(CollectionStatus).filter(filter_condition).limit(limit).all() From fc189237247347c386ec59e3e94b6d364c1d86df Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 11 Apr 2023 10:12:48 -0500 Subject: [PATCH 17/46] make sure that commit weight is updated Signed-off-by: Isaac Milarsky --- augur/tasks/git/facade_tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index d70c4bec76..9e802d7008 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -534,7 +534,8 @@ def facade_phase(repo_git): group( chain(*facade_core_collection), process_dependency_metrics.si(repo_git), - process_libyear_dependency_metrics.si(repo_git) + process_libyear_dependency_metrics.si(repo_git), + git_update_commit_count_weight.si(repo_git) ) ) From b66f2c4bd17c2ca74c3a625f5d22003f823d0674 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 11 Apr 2023 10:37:27 -0500 Subject: [PATCH 18/46] core repo update weight logic Signed-off-by: Isaac Milarsky --- augur/tasks/github/issues/tasks.py | 8 +++++-- augur/tasks/github/pull_requests/tasks.py | 5 +++- augur/tasks/start_tasks.py | 13 +++++++---- augur/tasks/util/collection_util.py | 28 ++++++++++++++++++++++- 4 files changed, 45 insertions(+), 9 deletions(-) diff --git a/augur/tasks/github/issues/tasks.py b/augur/tasks/github/issues/tasks.py index d5ce19f68d..81fa3a341a 100644 --- a/augur/tasks/github/issues/tasks.py +++ b/augur/tasks/github/issues/tasks.py @@ -21,7 +21,7 @@ development = get_development_flag() @celery.task(base=AugurCoreRepoCollectionTask) -def collect_issues(repo_git : str) -> None: +def collect_issues(repo_git : str) -> int: logger = logging.getLogger(collect_issues.__name__) @@ -39,14 +39,18 @@ def collect_issues(repo_git : str) -> None: issue_data = retrieve_all_issue_data(repo_git, logger, manifest.key_auth) + if issue_data: - + total_issues = len(issue_data) process_issues(issue_data, f"{owner}/{repo}: Issue task", repo_id, logger, augur_db) + return total_issues else: logger.info(f"{owner}/{repo} has no issues") + return 0 except Exception as e: logger.error(f"Could not collect issues for repo {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}") + return -1 diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 0fef125408..a5ba6db7c4 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -20,7 +20,7 @@ @celery.task(base=AugurCoreRepoCollectionTask) -def collect_pull_requests(repo_git: str) -> None: +def collect_pull_requests(repo_git: str) -> int: logger = logging.getLogger(collect_pull_requests.__name__) @@ -36,8 +36,11 @@ def collect_pull_requests(repo_git: str) -> None: if pr_data: process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + + return len(pr_data) else: logger.info(f"{owner}/{repo} has no pull requests") + return 0 # TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 8f2a688e55..a95a4095ad 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -75,10 +75,13 @@ def primary_repo_collect_phase(repo_git): #A chain is needed for each repo. repo_info_task = collect_repo_info.si(repo_git)#collection_task_wrapper(self) - primary_repo_jobs = group( - collect_issues.si(repo_git), - collect_pull_requests.si(repo_git) - ) + header = [collect_issues.s(repo_git),collect_pull_requests.s(repo_git)] + primary_chord = chord(header)(core_task_update_weight_util.s(repo_git=repo_git)) + + #primary_repo_jobs = group( + # collect_issues.si(repo_git), + # collect_pull_requests.si(repo_git) + #) secondary_repo_jobs = group( collect_events.si(repo_git),#*create_grouped_task_load(dataList=first_pass, task=collect_events).tasks, @@ -88,7 +91,7 @@ def primary_repo_collect_phase(repo_git): repo_task_group = group( repo_info_task, - chain(primary_repo_jobs,secondary_repo_jobs,process_contributors.si()), + chain(primary_chord,secondary_repo_jobs,process_contributors.si()), #facade_phase(logger,repo_git), collect_releases.si(repo_git), diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index b216387737..dcdbafff4a 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -17,7 +17,7 @@ from augur.application.db.models import CollectionStatus, Repo from augur.application.db.util import execute_session_query from augur.application.config import AugurConfig -from augur.tasks.github.util.util import get_owner_repo +from augur.tasks.github.util.util import get_owner_repo, get_repo_weight_core from augur.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection from augur.tasks.github.util.github_task_session import GithubTaskManifest @@ -135,6 +135,32 @@ def core_task_success_util(repo_git): session.commit() +@celery.task +def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): + from augur.tasks.init.celery_app import engine + logger = logging.getLogger(git_update_commit_count_weight.__name__) + + if repo_git is None: + return + + try: + weight = get_repo_weight_core(logger,repo_git) + except Exception as e: + weight = None + + with DatabaseSession(logger,engine=engine) as session: + repo = Repo.get_by_repo_git(session, repo_git) + + update_query = ( + update(CollectionStatus) + .where(CollectionStatus.repo_id == repo.repo_id) + .values(core_weight=weight) + ) + + session.execute(update_query) + session.commit() + + #def date_weight_factor(days_since_last_collection): # return (days_since_last_collection ** 3) / 25 From eb4c5442d4e748cf234d041224c8789f249715a4 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 11 Apr 2023 11:17:50 -0500 Subject: [PATCH 19/46] try to get celery chord working Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 6 ++++-- augur/tasks/start_tasks.py | 14 +++++++------- augur/tasks/util/collection_util.py | 2 ++ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 27ffb2e8f6..ef41ff3872 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -187,13 +187,15 @@ def augur_stop(signal, logger): """ augur_processes = get_augur_processes() - _broadcast_signal_to_processes(augur_processes, broadcast_signal=signal, given_logger=logger) - # if celery is running, run the cleanup function process_names = [process.name() for process in augur_processes] + + _broadcast_signal_to_processes(augur_processes, broadcast_signal=signal, given_logger=logger) + if "celery" in process_names: cleanup_after_collection_halt(logger) + def cleanup_after_collection_halt(logger): clear_redis_caches() connection_string = "" diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index a95a4095ad..2eb1c20c6e 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -75,13 +75,13 @@ def primary_repo_collect_phase(repo_git): #A chain is needed for each repo. repo_info_task = collect_repo_info.si(repo_git)#collection_task_wrapper(self) - header = [collect_issues.s(repo_git),collect_pull_requests.s(repo_git)] - primary_chord = chord(header)(core_task_update_weight_util.s(repo_git=repo_git)) + header = [collect_issues.si(repo_git),collect_pull_requests.si(repo_git)] + primary_chord = chord(header) - #primary_repo_jobs = group( - # collect_issues.si(repo_git), - # collect_pull_requests.si(repo_git) - #) + primary_repo_jobs = group( + collect_issues.si(repo_git), + collect_pull_requests.si(repo_git) + ) secondary_repo_jobs = group( collect_events.si(repo_git),#*create_grouped_task_load(dataList=first_pass, task=collect_events).tasks, @@ -91,7 +91,7 @@ def primary_repo_collect_phase(repo_git): repo_task_group = group( repo_info_task, - chain(primary_chord,secondary_repo_jobs,process_contributors.si()), + chain(primary_repo_jobs | core_task_update_weight_util.s(repo_git=repo_git),secondary_repo_jobs,process_contributors.si()), #facade_phase(logger,repo_git), collect_releases.si(repo_git), diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index dcdbafff4a..5f00134f65 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -148,6 +148,8 @@ def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): except Exception as e: weight = None + logger.info(f"Repo {repo_git} has a weight of {weight}") + with DatabaseSession(logger,engine=engine) as session: repo = Repo.get_by_repo_git(session, repo_git) From 4477429f73e1a510dccb4865faf9f71240e9b1e8 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 11 Apr 2023 12:06:32 -0500 Subject: [PATCH 20/46] syntax Signed-off-by: Isaac Milarsky --- augur/tasks/util/collection_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 5f00134f65..076685863d 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -138,7 +138,7 @@ def core_task_success_util(repo_git): @celery.task def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): from augur.tasks.init.celery_app import engine - logger = logging.getLogger(git_update_commit_count_weight.__name__) + logger = logging.getLogger(core_task_update_weight_util.__name__) if repo_git is None: return @@ -149,7 +149,7 @@ def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): weight = None logger.info(f"Repo {repo_git} has a weight of {weight}") - + with DatabaseSession(logger,engine=engine) as session: repo = Repo.get_by_repo_git(session, repo_git) From deb7c8eec3f22506034be93d236359ed395096e0 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 11 Apr 2023 12:19:16 -0500 Subject: [PATCH 21/46] syntax Signed-off-by: Isaac Milarsky --- augur/tasks/util/collection_util.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 076685863d..0d4d59682a 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -144,12 +144,18 @@ def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): return try: - weight = get_repo_weight_core(logger,repo_git) + weight = sum(issue_and_pr_nums)#get_repo_weight_core(logger,repo_git) except Exception as e: + logger.error(f"{e}") weight = None logger.info(f"Repo {repo_git} has a weight of {weight}") + logger.info(f"Args: {issue_and_pr_nums} , {repo_git}") + + if weight is None: + return + with DatabaseSession(logger,engine=engine) as session: repo = Repo.get_by_repo_git(session, repo_git) From ecdffa21d0e4c2d3e2c109cd6da1e8c62db8ba33 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 11 Apr 2023 17:04:52 -0500 Subject: [PATCH 22/46] apply date factor for core repo weight Signed-off-by: Isaac Milarsky --- augur/tasks/github/util/util.py | 20 ++++++++++++++++- augur/tasks/util/collection_util.py | 34 ++++++++++++++++------------- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index 49ecb53b29..e7ed998892 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -4,10 +4,24 @@ import logging import json import httpx +import datetime +from datetime import timedelta from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.application.db.session import DatabaseSession from augur.application.db.models import Repo +def date_weight_factor(days_since_last_collection): + return (days_since_last_collection ** 3) / 25 + +def calculate_date_weight_from_timestamps(added,last_collection): + #Get the time since last collection as well as when the repo was added. + if last_collection is None: + delta = datetime.now() - added + else: + delta = datetime.now() - last_collection + + return date_weight_factor(delta.days) + # This function adds a key value pair to a list of dicts and returns the modified list of dicts back def add_key_value_pair_to_dicts(data: List[dict], key: str, value: Any) -> List[dict]: """Adds a key value pair to a list of dicts @@ -74,7 +88,11 @@ def get_repo_weight_core(logger,repo_git): repo = Repo.get_by_repo_git(session, repo_git) if not repo: raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") + + status = repo.collection_status[0] + + time_factor = calculate_date_weight_from_timestamps(repo.repo_added,status.core_data_last_collected) - return get_repo_weight_by_issue(logger, repo_git) + return get_repo_weight_by_issue(logger, repo_git) - time_factor diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 0d4d59682a..945504bb1b 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -11,13 +11,13 @@ #from celery.result import AsyncResult from celery import signature from celery import group, chain, chord, signature -from sqlalchemy import or_, and_ +from sqlalchemy import or_, and_, update from augur.application.logs import AugurLogger from augur.tasks.init.celery_app import celery_app as celery from augur.application.db.models import CollectionStatus, Repo from augur.application.db.util import execute_session_query from augur.application.config import AugurConfig -from augur.tasks.github.util.util import get_owner_repo, get_repo_weight_core +from augur.tasks.github.util.util import get_owner_repo, get_repo_weight_core, calculate_date_weight_from_timestamps from augur.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection from augur.tasks.github.util.github_task_session import GithubTaskManifest @@ -143,21 +143,25 @@ def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): if repo_git is None: return - try: - weight = sum(issue_and_pr_nums)#get_repo_weight_core(logger,repo_git) - except Exception as e: - logger.error(f"{e}") - weight = None - - logger.info(f"Repo {repo_git} has a weight of {weight}") - - logger.info(f"Args: {issue_and_pr_nums} , {repo_git}") - - if weight is None: - return - with DatabaseSession(logger,engine=engine) as session: repo = Repo.get_by_repo_git(session, repo_git) + status = repo.collection_status[0] + + try: + weight = sum(issue_and_pr_nums)#get_repo_weight_core(logger,repo_git) + + weight -= calculate_date_weight_from_timestamps(repo.repo_added, status.core_data_last_collected) + except Exception as e: + logger.error(f"{e}") + weight = None + + logger.info(f"Repo {repo_git} has a weight of {weight}") + + logger.info(f"Args: {issue_and_pr_nums} , {repo_git}") + + if weight is None: + return + update_query = ( update(CollectionStatus) From f6afabb411fda71bb0251ea81900fa420d91c2a7 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 11 Apr 2023 17:12:47 -0500 Subject: [PATCH 23/46] add time_factor calculation to facade weight logic Signed-off-by: Isaac Milarsky --- .../facade_worker/facade02utilitymethods.py | 10 ++++++---- augur/tasks/github/util/util.py | 14 +------------- augur/tasks/start_tasks.py | 3 --- augur/tasks/util/collection_util.py | 7 ++----- augur/tasks/util/worker_util.py | 15 ++++++++++++++- 5 files changed, 23 insertions(+), 26 deletions(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py index 80b665fe32..28c5f3c0a6 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py @@ -41,6 +41,7 @@ from .facade01config import get_database_args_from_env from augur.application.db.models.augur_data import * from .facade01config import FacadeSession as FacadeSession +from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps #from augur.tasks.git.util.facade_worker.facade def update_repo_log(session, repos_id,status): @@ -142,12 +143,11 @@ def get_existing_commits_set(session, repo_id): return set(existing_commits) -def date_weight_factor(days_since_last_collection): - return (days_since_last_collection ** 3) / 25 - def get_repo_weight_by_commit(logger,repo_git): with FacadeSession(logger) as session: repo = Repo.get_by_repo_git(session, repo_git) + status = repo.collection_status[0] + absolute_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_group_id, repo.repo_path, repo.repo_name) repo_loc = (f"{absolute_path}/.git") @@ -155,5 +155,7 @@ def get_repo_weight_by_commit(logger,repo_git): check_commit_count_cmd = check_output(["git","--git-dir",repo_loc, "rev-list", "--count", "HEAD"]) commit_count = int(check_commit_count_cmd) + + time_factor = calculate_date_weight_from_timestamps(repo.repo_added, status.facade_data_last_collected) - return commit_count \ No newline at end of file + return commit_count - time_factor \ No newline at end of file diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index e7ed998892..b3f97e6aec 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -4,23 +4,11 @@ import logging import json import httpx -import datetime -from datetime import timedelta from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.application.db.session import DatabaseSession from augur.application.db.models import Repo +from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps -def date_weight_factor(days_since_last_collection): - return (days_since_last_collection ** 3) / 25 - -def calculate_date_weight_from_timestamps(added,last_collection): - #Get the time since last collection as well as when the repo was added. - if last_collection is None: - delta = datetime.now() - added - else: - delta = datetime.now() - last_collection - - return date_weight_factor(delta.days) # This function adds a key value pair to a list of dicts and returns the modified list of dicts back def add_key_value_pair_to_dicts(data: List[dict], key: str, value: Any) -> List[dict]: diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 2eb1c20c6e..e5c7cb6885 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -75,9 +75,6 @@ def primary_repo_collect_phase(repo_git): #A chain is needed for each repo. repo_info_task = collect_repo_info.si(repo_git)#collection_task_wrapper(self) - header = [collect_issues.si(repo_git),collect_pull_requests.si(repo_git)] - primary_chord = chord(header) - primary_repo_jobs = group( collect_issues.si(repo_git), collect_pull_requests.si(repo_git) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 945504bb1b..fd43ceb16a 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -22,6 +22,7 @@ from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.application.db.session import DatabaseSession +from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps # class syntax @@ -149,7 +150,7 @@ def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): try: weight = sum(issue_and_pr_nums)#get_repo_weight_core(logger,repo_git) - + weight -= calculate_date_weight_from_timestamps(repo.repo_added, status.core_data_last_collected) except Exception as e: logger.error(f"{e}") @@ -173,10 +174,6 @@ def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): session.commit() -#def date_weight_factor(days_since_last_collection): -# return (days_since_last_collection ** 3) / 25 - - #Get the weight for each repo for the core collection hook def get_repo_weight_core(logger,repo_git): diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py index b0cb335d94..ead46bde05 100644 --- a/augur/tasks/util/worker_util.py +++ b/augur/tasks/util/worker_util.py @@ -8,7 +8,8 @@ from celery.result import allow_join_result from typing import Optional, List, Any, Tuple - +import datetime +from datetime import timedelta def create_grouped_task_load(*args,processes=8,dataList=[],task=None): @@ -100,6 +101,18 @@ def remove_duplicate_naturals(data, natural_keys): #print(new_data) return new_data +def date_weight_factor(days_since_last_collection): + return (days_since_last_collection ** 3) / 25 + +def calculate_date_weight_from_timestamps(added,last_collection): + #Get the time since last collection as well as when the repo was added. + if last_collection is None: + delta = datetime.now() - added + else: + delta = datetime.now() - last_collection + + return date_weight_factor(delta.days) + # def create_server(app, worker=None): From b894bc25b06b63f8abe497fd84c27386582f5ac7 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 11 Apr 2023 17:36:10 -0500 Subject: [PATCH 24/46] repo added is a value that doesn't get updated when it gets introduced to a new database Signed-off-by: Isaac Milarsky --- augur/application/db/models/augur_operations.py | 3 +++ .../facade_worker/facade02utilitymethods.py | 2 +- augur/tasks/github/util/util.py | 12 ++++++++---- augur/tasks/util/collection_util.py | 2 ++ augur/tasks/util/worker_util.py | 4 ++-- 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 41b5d57cf8..021eb4aaa5 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -10,6 +10,7 @@ import logging import secrets +import traceback import importlib from augur.application.db.models import Repo, RepoGroup @@ -1032,6 +1033,8 @@ def insert(session, repo_id): core_weight = get_repo_weight_core(session.logger, repo_git) except Exception as e: core_weight = None + session.logger.error( + ''.join(traceback.format_exception(None, e, e.__traceback__))) record = {"repo_id": repo_id, "core_weight": core_weight} result = session.insert_data(record, CollectionStatus, collection_status_unique, on_conflict_update=False) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py index 28c5f3c0a6..1586a62ffc 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py @@ -158,4 +158,4 @@ def get_repo_weight_by_commit(logger,repo_git): time_factor = calculate_date_weight_from_timestamps(repo.repo_added, status.facade_data_last_collected) - return commit_count - time_factor \ No newline at end of file + return max(0, commit_count - time_factor) \ No newline at end of file diff --git a/augur/tasks/github/util/util.py b/augur/tasks/github/util/util.py index b3f97e6aec..fbb23dd6e8 100644 --- a/augur/tasks/github/util/util.py +++ b/augur/tasks/github/util/util.py @@ -77,10 +77,14 @@ def get_repo_weight_core(logger,repo_git): if not repo: raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") - status = repo.collection_status[0] + #try to get the collection status if it exists at this point + try: + status = repo.collection_status[0] + time_factor = calculate_date_weight_from_timestamps(repo.repo_added,status.core_data_last_collected) + except IndexError: + time_factor = calculate_date_weight_from_timestamps(repo.repo_added,None) - time_factor = calculate_date_weight_from_timestamps(repo.repo_added,status.core_data_last_collected) - - return get_repo_weight_by_issue(logger, repo_git) - time_factor + #Don't go below zero. + return max(0,get_repo_weight_by_issue(logger, repo_git) - time_factor) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index fd43ceb16a..08a4e651be 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -152,6 +152,8 @@ def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): weight = sum(issue_and_pr_nums)#get_repo_weight_core(logger,repo_git) weight -= calculate_date_weight_from_timestamps(repo.repo_added, status.core_data_last_collected) + + weight = max(0,weight) except Exception as e: logger.error(f"{e}") weight = None diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py index ead46bde05..2f502d8f6a 100644 --- a/augur/tasks/util/worker_util.py +++ b/augur/tasks/util/worker_util.py @@ -8,8 +8,7 @@ from celery.result import allow_join_result from typing import Optional, List, Any, Tuple -import datetime -from datetime import timedelta +from datetime import datetime, timedelta def create_grouped_task_load(*args,processes=8,dataList=[],task=None): @@ -111,6 +110,7 @@ def calculate_date_weight_from_timestamps(added,last_collection): else: delta = datetime.now() - last_collection + print(f"Days: {delta.days}") return date_weight_factor(delta.days) From 2172aaafdfb75e3ae3ab607786d58b667f78bfc2 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 11 Apr 2023 17:40:39 -0500 Subject: [PATCH 25/46] deal with if the record doesn't have collection yet Signed-off-by: Isaac Milarsky --- .../facade_worker/facade02utilitymethods.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py index 1586a62ffc..0a49dc37aa 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py @@ -146,7 +146,7 @@ def get_existing_commits_set(session, repo_id): def get_repo_weight_by_commit(logger,repo_git): with FacadeSession(logger) as session: repo = Repo.get_by_repo_git(session, repo_git) - status = repo.collection_status[0] + absolute_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_group_id, repo.repo_path, repo.repo_name) repo_loc = (f"{absolute_path}/.git") @@ -155,7 +155,11 @@ def get_repo_weight_by_commit(logger,repo_git): check_commit_count_cmd = check_output(["git","--git-dir",repo_loc, "rev-list", "--count", "HEAD"]) commit_count = int(check_commit_count_cmd) - - time_factor = calculate_date_weight_from_timestamps(repo.repo_added, status.facade_data_last_collected) + + try: + status = repo.collection_status[0] + time_factor = calculate_date_weight_from_timestamps(repo.repo_added, status.facade_data_last_collected) + except IndexError: + time_factor = calculate_date_weight_from_timestamps(repo.repo_added, None) return max(0, commit_count - time_factor) \ No newline at end of file From 8b64cff5b6a4fcc24f3132d96781556e3f79355a Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 12 Apr 2023 11:53:39 -0500 Subject: [PATCH 26/46] add periodic task to update repo weights on midnight during even numbered days Signed-off-by: Isaac Milarsky --- augur/tasks/init/celery_app.py | 6 +- augur/tasks/start_tasks.py | 28 ++++++++ augur/tasks/util/collection_util.py | 101 ++++------------------------ 3 files changed, 44 insertions(+), 91 deletions(-) diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index 59f6f30084..9cea8460c1 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -187,7 +187,7 @@ def setup_periodic_tasks(sender, **kwargs): The tasks so that they are grouped by the module they are defined in """ from celery.schedules import crontab - from augur.tasks.start_tasks import augur_collection_monitor + from augur.tasks.start_tasks import augur_collection_monitor, augur_collection_update_weights from augur.tasks.start_tasks import non_repo_domain_tasks from augur.tasks.db.refresh_materialized_views import refresh_materialized_views @@ -208,7 +208,9 @@ def setup_periodic_tasks(sender, **kwargs): logger.info(f"Scheduling refresh materialized view every night at 1am CDT") sender.add_periodic_task(crontab(hour=1, minute=0), refresh_materialized_views.s()) - + + logger.info(f"Scheduling update of collection weights on midnight on even numbered days.") + sender.add_periodic_task(crontab(0, 0,day_of_month='2-30/2'),augur_collection_update_weights.s()) @after_setup_logger.connect diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index e5c7cb6885..da09e07f38 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -355,4 +355,32 @@ def augur_collection_monitor(): start_facade_clone_update(session,max_repo=5,days=30) +@celery.task +def augur_collection_update_weights(): + + from augur.tasks.init.celery_app import engine + + logger = logging.getLogger(augur_collection_update_weights.__name__) + + logger.info("Updating stale collection weights") + + with DatabaseSession(logger,engine) as session: + + core_weight_update_repos = session.query(CollectionStatus).filter(CollectionStatus.core_weight != None).all() + + for repo in core_weight_update_repos: + task_id = core_task_fetch_issues_prs_update_weight_util.si(repo.repo_git).apply_async().task_id + + logger.info(f"Scheduling task {task_id} for repo {repo.repo_id} to update core weight") + + facade_not_pending = CollectionStatus.facade_status != CollectionState.PENDING.value + facade_not_failed = CollectionStatus.facade_status != CollectionState.FAILED_CLONE.value + facade_weight_not_null = CollectionStatus.facade_weight != None + + facade_weight_update_repos = session.query(CollectionStatus).filter(and_(facade_not_pending,facade_not_failed,facade_weight_not_null)).all() + + for repo in facade_weight_update_repos: + task_id = git_update_commit_count_weight.si(repo_git).apply_async().task_id + + logger.info(f"Scheduling task {task_id} for repo {repo.repo_id} to update facade weight") diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 08a4e651be..908e9260c1 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -17,7 +17,7 @@ from augur.application.db.models import CollectionStatus, Repo from augur.application.db.util import execute_session_query from augur.application.config import AugurConfig -from augur.tasks.github.util.util import get_owner_repo, get_repo_weight_core, calculate_date_weight_from_timestamps +from augur.tasks.github.util.util import get_owner_repo, get_repo_weight_core, calculate_date_weight_from_timestamps, get_repo_weight_by_issue from augur.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection from augur.tasks.github.util.github_task_session import GithubTaskManifest @@ -136,6 +136,7 @@ def core_task_success_util(repo_git): session.commit() +#This task updates the weight with the issues and prs already passed in @celery.task def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): from augur.tasks.init.celery_app import engine @@ -175,20 +176,14 @@ def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): session.execute(update_query) session.commit() +#Same as above just fetches the count with an api call before passing it into the above function. +@celery.task +def core_task_fetch_issues_prs_update_weight_util(repo_git): + logger = logging.getLogger(core_task_fetch_issues_prs_update_weight_util.__name__) + + raw_count = get_repo_weight_by_issue(logger,repo_git) - -#Get the weight for each repo for the core collection hook -def get_repo_weight_core(logger,repo_git): - from augur.tasks.init.celery_app import engine - - with DatabaseSession(logger,engine) as session: - repo = Repo.get_by_repo_git(session, repo_git) - if not repo: - raise Exception(f"Task with repo_git of {repo_git} but could not be found in Repo table") - - - return get_repo_weight_by_issue(logger, repo_git) - + core_task_update_weight_util([int(raw_count)],repo_git=repo_git) @celery.task def secondary_task_success_util(repo_git): @@ -381,8 +376,8 @@ def send_messages(self): for repo_git in self.repos: - repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id + #repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() + #repo_id = repo.repo_id augur_collection_sequence = [] for job in self.collection_phases: @@ -395,80 +390,8 @@ def send_messages(self): augur_collection_chain = chain(*augur_collection_sequence) task_id = augur_collection_chain.apply_async(link_error=task_failed_util.s()).task_id - self.logger.info(f"Setting repo_id {repo_id} to collecting for repo: {repo_git}") + self.logger.info(f"Setting repo {self.collection_hook} status to collecting for repo: {repo_git}") #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated yield repo_git, task_id -""" -class AugurWeightedTaskRoutine(AugurTaskRoutine): - - class to keep track of various groups of collection tasks for a group of repos. - Intermediate class that takes into account relative weights of repos and stops after - a set limit of repos limited by their size. - - - Attributes: - logger (Logger): Get logger from AugurLogger - repos (List[str]): List of repo_ids to run collection on. - collection_phases (List[str]): List of phases to run in augur collection. - collection_hook (str): String determining the attributes to update when collection for a repo starts. e.g. core - session: Database session to use - total_repo_weight (AugurCollectionTotalRepoWeight): object that allows repo objects and repo_git strings to be subtracted from it - - def __init__(self,session,repos: List[str]=[],collection_phases: List[str]=[],collection_hook: str="core",total_repo_weight=10000): - - #Define superclass vars - super().__init__(session,repos=repos,collection_phases=collection_phases,collection_hook=collection_hook) - - #Define Total repo weight - if collection_hook == "core": - #Core collection hook has a repo weight of - self.total_repo_weight = AugurCollectionTotalRepoWeight(total_repo_weight) - elif collection_hook == "secondary": - self.total_repo_weight = AugurCollectionTotalRepoWeight(total_repo_weight,weight_calculation=get_repo_weight_secondary) - elif collection_hook == "facade": - self.total_repo_weight = AugurCollectionTotalRepoWeight(total_repo_weight,weight_calculation=get_repo_weight_facade) - - - #Overwrite super method - #now returns resulting weight after either reaching zero or - #scheduling all repos assigned to the object. - def start_data_collection(self): - #Send messages starts each repo and yields its running info - #to concurrently update the correct field in the database. - for repo_git, task_id in self.send_messages(): - self.update_status_and_id(repo_git,task_id) - - return self.total_repo_weight.value - - def send_messages(self): - augur_collection_list = [] - - for repo_git in self.repos: - #Check total repo weight - if self.total_repo_weight.value == 0: - break - - #Subtract repo's weight - self.total_repo_weight = self.total_repo_weight - repo_git - - repo = self.session.query(Repo).filter(Repo.repo_git == repo_git).one() - repo_id = repo.repo_id - - augur_collection_sequence = [] - for job in self.collection_phases: - #Add the phase to the sequence in order as a celery task. - #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git)) - - #augur_collection_sequence.append(core_task_success_util.si(repo_git)) - #Link all phases in a chain and send to celery - augur_collection_chain = chain(*augur_collection_sequence) - task_id = augur_collection_chain.apply_async(link_error=task_failed_util.s()).task_id - - self.logger.info(f"Setting repo_id {repo_id} to collecting for repo: {repo_git}") - - #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated - yield repo_git, task_id -""" \ No newline at end of file From a75ab1c3e4fc729d70e88c334094010f2a70c787 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 12 Apr 2023 12:11:53 -0500 Subject: [PATCH 27/46] add command to reset repo age Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index ef41ff3872..3625a2b92a 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -17,7 +17,7 @@ import traceback from urllib.parse import urlparse from sqlalchemy import update - +from datetime import datetime from augur import instance_id from augur.tasks.start_tasks import augur_collection_monitor, CollectionState @@ -443,6 +443,22 @@ def order_repos(repos): return repo_git_urls +@cli.command('reset-repo-age') +@test_connection +@test_db_connection +def reset_repo_age(): + + logger.info("resetting collection age on all repositories") + + with DatabaseSession(logger) as session: + update_query = ( + update(Repo) + .values(repo_added=datetime.now()) + ) + + session.execute(update_query) + session.commit() + # def initialize_components(augur_app, disable_housekeeper): # master = None # manager = None From e9cc16fc2175331a88d704877be1b76937593ad0 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 12 Apr 2023 17:36:23 -0500 Subject: [PATCH 28/46] Un-needed import Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index da09e07f38..ea2ae8b6e5 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -33,7 +33,6 @@ from augur.tasks.util.redis_list import RedisList from augur.application.db.models import CollectionStatus, Repo from augur.tasks.util.collection_util import * -from augur.tasks.util.redis_scalar import RedisScalar CELERY_GROUP_TYPE = type(group()) CELERY_CHAIN_TYPE = type(chain()) From c5bd0f453b6ba05f9210fad2898a6d6c199c4b7d Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 12 Apr 2023 17:43:54 -0500 Subject: [PATCH 29/46] replace sql with orm query Signed-off-by: Isaac Milarsky --- augur/application/db/models/augur_operations.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 021eb4aaa5..1d8ffd8f35 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -1021,11 +1021,9 @@ class CollectionStatus(Base): def insert(session, repo_id): from augur.tasks.github.util.util import get_repo_weight_core - query = sql_text("""SELECT repo_git FROM repo - WHERE repo_id=:value""").bindparams(value=repo_id) - repo = session.execute_sql(query).fetchone() - repo_git = repo[0] + repo = Repo.get_by_id(session, repo_id) + repo_git = repo.repo_git collection_status_unique = ["repo_id"] From e358c43aaf8cb05bb6a65b4846f7ae3cd120bcb2 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 13 Apr 2023 13:51:08 -0500 Subject: [PATCH 30/46] adjust date weight Signed-off-by: Isaac Milarsky --- .../facade_worker/facade_worker/facade02utilitymethods.py | 3 +++ augur/tasks/util/worker_util.py | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py index 0a49dc37aa..c7c8925559 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py @@ -162,4 +162,7 @@ def get_repo_weight_by_commit(logger,repo_git): except IndexError: time_factor = calculate_date_weight_from_timestamps(repo.repo_added, None) + #Adjust for commits. + time_factor *= 1.2 + return max(0, commit_count - time_factor) \ No newline at end of file diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py index 2f502d8f6a..cdacd6d197 100644 --- a/augur/tasks/util/worker_util.py +++ b/augur/tasks/util/worker_util.py @@ -100,8 +100,10 @@ def remove_duplicate_naturals(data, natural_keys): #print(new_data) return new_data +#4th root of 10,000 is 10 +#ten days for a 10,000 weight repo to reach zero. def date_weight_factor(days_since_last_collection): - return (days_since_last_collection ** 3) / 25 + return days_since_last_collection ** 4 def calculate_date_weight_from_timestamps(added,last_collection): #Get the time since last collection as well as when the repo was added. From 35201afd2ed273b0c724fc22f8483dd9214647f2 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 13 Apr 2023 14:41:00 -0500 Subject: [PATCH 31/46] change update weight to not schedule per repo Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index ea2ae8b6e5..dd47a16630 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -368,9 +368,7 @@ def augur_collection_update_weights(): core_weight_update_repos = session.query(CollectionStatus).filter(CollectionStatus.core_weight != None).all() for repo in core_weight_update_repos: - task_id = core_task_fetch_issues_prs_update_weight_util.si(repo.repo_git).apply_async().task_id - - logger.info(f"Scheduling task {task_id} for repo {repo.repo_id} to update core weight") + core_task_fetch_issues_prs_update_weight_util(repo.repo_git) facade_not_pending = CollectionStatus.facade_status != CollectionState.PENDING.value facade_not_failed = CollectionStatus.facade_status != CollectionState.FAILED_CLONE.value @@ -379,7 +377,5 @@ def augur_collection_update_weights(): facade_weight_update_repos = session.query(CollectionStatus).filter(and_(facade_not_pending,facade_not_failed,facade_weight_not_null)).all() for repo in facade_weight_update_repos: - task_id = git_update_commit_count_weight.si(repo_git).apply_async().task_id - - logger.info(f"Scheduling task {task_id} for repo {repo.repo_id} to update facade weight") + task_id = git_update_commit_count_weight(repo_git) From 0357be39bd56ffa3ebbb24b8310c8b3f835de171 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 14 Apr 2023 12:15:30 -0500 Subject: [PATCH 32/46] Add offset for 30 day collection restriction Signed-off-by: Isaac Milarsky --- augur/tasks/util/worker_util.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py index cdacd6d197..ade842985c 100644 --- a/augur/tasks/util/worker_util.py +++ b/augur/tasks/util/worker_util.py @@ -109,11 +109,10 @@ def calculate_date_weight_from_timestamps(added,last_collection): #Get the time since last collection as well as when the repo was added. if last_collection is None: delta = datetime.now() - added + return date_weight_factor(delta.days) else: delta = datetime.now() - last_collection - - print(f"Days: {delta.days}") - return date_weight_factor(delta.days) + return date_weight_factor(delta.days - 30) From 209ad2abf3c22ff80010a2d10f366fe29a69777b Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 14 Apr 2023 16:13:34 -0500 Subject: [PATCH 33/46] prelim tweaks from meeting with Andrew Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 29 ++++++++++------------------- augur/tasks/util/worker_util.py | 16 ++++++++++++---- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index dd47a16630..a1eb644f21 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -158,9 +158,8 @@ def non_repo_domain_tasks(): """ The below functions define augur's collection hooks. Each collection hook schedules tasks for a number of repos - that are either new or older than a set amount of days. """ -def start_primary_collection(session,max_repo,days): +def start_primary_collection(session,max_repo): #Get list of enabled phases enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) @@ -183,18 +182,15 @@ def core_task_success_util_gen(repo_git): active_repo_count = len(session.query(CollectionStatus).filter(CollectionStatus.core_status == CollectionState.COLLECTING.value).all()) - cutoff_date = datetime.datetime.now() - datetime.timedelta(days=days) not_erroed = CollectionStatus.core_status != str(CollectionState.ERROR.value) not_collecting = CollectionStatus.core_status != str(CollectionState.COLLECTING.value) - never_collected = CollectionStatus.core_data_last_collected == None - old_collection = CollectionStatus.core_data_last_collected <= cutoff_date limit = max_repo-active_repo_count core_order = CollectionStatus.core_weight #Get repos for primary collection hook - repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(not_erroed, not_collecting, or_(never_collected, old_collection)),limit,order=core_order) + repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(not_erroed, not_collecting),limit,order=core_order) session.logger.info(f"Starting primary collection on {len(repo_git_identifiers)} repos") if len(repo_git_identifiers) == 0: @@ -207,7 +203,7 @@ def core_task_success_util_gen(repo_git): primary_augur_collection.start_data_collection() -def start_secondary_collection(session,max_repo,days): +def start_secondary_collection(session,max_repo): #Get list of enabled phases enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) @@ -228,18 +224,15 @@ def secondary_task_success_util_gen(repo_git): active_repo_count = len(session.query(CollectionStatus).filter(CollectionStatus.secondary_status == CollectionState.COLLECTING.value).all()) - cutoff_date = datetime.datetime.now() - datetime.timedelta(days=days) not_erroed = CollectionStatus.secondary_status != str(CollectionState.ERROR.value) not_collecting = CollectionStatus.secondary_status != str(CollectionState.COLLECTING.value) - never_collected = CollectionStatus.secondary_data_last_collected == None - old_collection = CollectionStatus.secondary_data_last_collected <= cutoff_date primary_collected = CollectionStatus.core_status == str(CollectionState.SUCCESS.value) limit = max_repo-active_repo_count secondary_order = CollectionStatus.core_weight - repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(primary_collected,not_erroed, not_collecting, or_(never_collected, old_collection)),limit,order=secondary_order) + repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(primary_collected,not_erroed, not_collecting),limit,order=secondary_order) session.logger.info(f"Starting secondary collection on {len(repo_git_identifiers)} repos") if len(repo_git_identifiers) == 0: @@ -289,7 +282,7 @@ def facade_clone_update_success_util_gen(repo_git): facade_augur_collection.start_data_collection() -def start_facade_collection(session,max_repo,days): +def start_facade_collection(session,max_repo): #Deal with secondary collection facade_enabled_phases = [] @@ -303,20 +296,18 @@ def facade_task_success_util_gen(repo_git): active_repo_count = len(session.query(CollectionStatus).filter(CollectionStatus.facade_status == CollectionState.COLLECTING.value).all()) - cutoff_date = datetime.datetime.now() - datetime.timedelta(days=days) + #cutoff_date = datetime.datetime.now() - datetime.timedelta(days=days) not_erroed = CollectionStatus.facade_status != str(CollectionState.ERROR.value) not_pending = CollectionStatus.facade_status != str(CollectionState.PENDING.value) not_failed_clone = CollectionStatus.facade_status != str(CollectionState.FAILED_CLONE.value) not_collecting = CollectionStatus.facade_status != str(CollectionState.COLLECTING.value) not_initializing = CollectionStatus.facade_status != str(CollectionState.INITIALIZING.value) - never_collected = CollectionStatus.facade_data_last_collected == None - old_collection = CollectionStatus.facade_data_last_collected <= cutoff_date limit = max_repo-active_repo_count facade_order = CollectionStatus.facade_weight - repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(not_pending,not_failed_clone,not_erroed, not_collecting, not_initializing, or_(never_collected, old_collection)),limit,order=facade_order) + repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(not_pending,not_failed_clone,not_erroed, not_collecting, not_initializing),limit,order=facade_order) session.logger.info(f"Starting facade collection on {len(repo_git_identifiers)} repos") if len(repo_git_identifiers) == 0: @@ -343,14 +334,14 @@ def augur_collection_monitor(): enabled_phase_names = get_enabled_phase_names_from_config(session.logger, session) if primary_repo_collect_phase.__name__ in enabled_phase_names: - start_primary_collection(session, max_repo=40, days=30) + start_primary_collection(session, max_repo=40) if secondary_repo_collect_phase.__name__ in enabled_phase_names: - start_secondary_collection(session, max_repo=10, days=30) + start_secondary_collection(session, max_repo=5) if facade_phase.__name__ in enabled_phase_names: #Schedule facade collection before clone/updates as that is a higher priority - start_facade_collection(session, max_repo=15, days=30) + start_facade_collection(session, max_repo=15) start_facade_clone_update(session,max_repo=5,days=30) diff --git a/augur/tasks/util/worker_util.py b/augur/tasks/util/worker_util.py index ade842985c..cbfeb0e904 100644 --- a/augur/tasks/util/worker_util.py +++ b/augur/tasks/util/worker_util.py @@ -102,17 +102,25 @@ def remove_duplicate_naturals(data, natural_keys): #4th root of 10,000 is 10 #ten days for a 10,000 weight repo to reach zero. -def date_weight_factor(days_since_last_collection): - return days_since_last_collection ** 4 +def date_weight_factor(days_since_last_collection,domain_shift=0): + return (days_since_last_collection - domain_shift) ** 4 -def calculate_date_weight_from_timestamps(added,last_collection): +def calculate_date_weight_from_timestamps(added,last_collection,domain_start_days=30): #Get the time since last collection as well as when the repo was added. if last_collection is None: delta = datetime.now() - added return date_weight_factor(delta.days) else: delta = datetime.now() - last_collection - return date_weight_factor(delta.days - 30) + + factor = date_weight_factor(delta.days,domain_shift=domain_start_days) + + #If the repo is older than thirty days, start to decrease its weight. + if delta.days >= domain_start_days: + return factor + else: + #Else increase its weight + return -1 * factor From 820e1aae0e21556a3ba703ec5bc5e362228e015b Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 17 Apr 2023 19:51:45 -0500 Subject: [PATCH 34/46] schema changes following meeting with Andrew Brain Signed-off-by: Isaac Milarsky --- augur/application/db/models/augur_operations.py | 4 ++++ .../versions/16_add_weight_data_to_collection_status_to_.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 1d8ffd8f35..bed01c8a84 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -1014,6 +1014,10 @@ class CollectionStatus(Base): core_weight = Column(BigInteger) facade_weight = Column(BigInteger) + secondary_weight = Column(BigInteger) + + issue_pr_sum = Column(BigInteger) + commit_sum = Column(BigInteger) repo = relationship("Repo", back_populates="collection_status") diff --git a/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py b/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py index f7f94dfb6a..90a3b0f534 100644 --- a/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py +++ b/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py @@ -20,6 +20,11 @@ def upgrade(): # ### commands auto generated by Alembic - please adjust! ### op.add_column('collection_status', sa.Column('core_weight', sa.BigInteger()), schema='augur_operations') op.add_column('collection_status', sa.Column('facade_weight', sa.BigInteger()), schema='augur_operations') + op.add_column('collection_status', sa.Column('secondary_weight', sa.BigInteger()), schema='augur_operations') + + op.add_column('collection_status', sa.Column('issue_pr_sum', sa.BigInteger()), schema='augur_operations') + op.add_column('collection_status', sa.Column('commit_sum', sa.BigInteger()), schema='augur_operations') + op.drop_constraint('collection_status_repo_id_fk', 'collection_status', schema='augur_operations', type_='foreignkey') op.create_foreign_key('collection_status_repo_id_fk', 'collection_status', 'repo', ['repo_id'], ['repo_id'], source_schema='augur_operations', referent_schema='augur_data') # ### end Alembic commands ### From 5d94dd3f222b74b4ab5e01fc35dd04949beadf1e Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 17 Apr 2023 19:54:00 -0500 Subject: [PATCH 35/46] downgrade logic in alembic Signed-off-by: Isaac Milarsky --- .../versions/16_add_weight_data_to_collection_status_to_.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py b/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py index 90a3b0f534..95a36f08c0 100644 --- a/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py +++ b/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py @@ -36,4 +36,8 @@ def downgrade(): op.create_foreign_key('collection_status_repo_id_fk', 'collection_status', 'repo', ['repo_id'], ['repo_id'], source_schema='augur_operations') op.drop_column('collection_status', 'facade_weight', schema='augur_operations') op.drop_column('collection_status', 'core_weight', schema='augur_operations') + op.drop_column('collection_status', 'secondary_weight', schema='augur_operations') + + op.drop_column('collection_status', 'issue_pr_sum', schema='augur_operations') + op.drop_column('collection_status', 'commit_sum', schema='augur_operations') # ### end Alembic commands ### From bfbe438f0545336c5adc1b535072a98261d8bbe5 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Mon, 17 Apr 2023 21:03:59 -0500 Subject: [PATCH 36/46] update weight logic Signed-off-by: Isaac Milarsky --- .../application/db/models/augur_operations.py | 19 ++++--- augur/tasks/git/facade_tasks.py | 15 +++--- .../facade_worker/facade02utilitymethods.py | 49 ++++++++++++------- augur/tasks/start_tasks.py | 2 +- augur/tasks/util/collection_util.py | 6 +-- 5 files changed, 55 insertions(+), 36 deletions(-) diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index bed01c8a84..f6db738ea6 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -1023,8 +1023,8 @@ class CollectionStatus(Base): @staticmethod def insert(session, repo_id): - from augur.tasks.github.util.util import get_repo_weight_core - + from augur.tasks.github.util.util import get_repo_weight_by_issue + from augur.tasks.util.worker_util import calculate_date_weight_from_timestamps repo = Repo.get_by_id(session, repo_id) repo_git = repo.repo_git @@ -1032,16 +1032,23 @@ def insert(session, repo_id): collection_status_unique = ["repo_id"] try: - core_weight = get_repo_weight_core(session.logger, repo_git) + pr_issue_count = get_repo_weight_by_issue(session.logger, repo_git) except Exception as e: - core_weight = None + pr_issue_count = None session.logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) - record = {"repo_id": repo_id, "core_weight": core_weight} + github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) + record = { + "repo_id": repo_id, + "issue_pr_sum": pr_issue_count, + "core_weight": github_weight, + "secondary_weight": github_weight + } + result = session.insert_data(record, CollectionStatus, collection_status_unique, on_conflict_update=False) - session.logger.info(f"Trying to insert repo \n core weight: {record['core_weight']}") + session.logger.info(f"Trying to insert repo \n issue and pr sum: {record['issue_pr_sum']}") if not result: return False diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index 9e802d7008..d9a556e9c2 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -29,7 +29,7 @@ from augur.tasks.git.util.facade_worker.facade_worker.facade02utilitymethods import update_repo_log, trim_commit, store_working_author, trim_author from augur.tasks.git.util.facade_worker.facade_worker.facade02utilitymethods import get_absolute_repo_path, get_parent_commits_set, get_existing_commits_set from augur.tasks.git.util.facade_worker.facade_worker.facade03analyzecommit import analyze_commit -from augur.tasks.git.util.facade_worker.facade_worker.facade02utilitymethods import get_repo_weight_by_commit +from augur.tasks.git.util.facade_worker.facade_worker.facade02utilitymethods import get_facade_weight_time_factor, get_repo_commit_count from augur.tasks.github.facade_github.tasks import * @@ -355,10 +355,13 @@ def git_update_commit_count_weight(repo_git): from augur.tasks.init.celery_app import engine logger = logging.getLogger(git_update_commit_count_weight.__name__) - - weight = get_repo_weight_by_commit(logger,repo_git) - - logger.info(f"Repo {repo_git} has a weight of {weight}") + + with FacadeSession(logger) as session: + commit_count = get_repo_commit_count(session, repo_git) + date_factor = get_facade_weight_time_factor(session, repo_git) + + weight = commit_count - date_factor + logger.info(f"Repo {repo_git} has a weight of {weight} and a commit count of {commit_count}") with DatabaseSession(logger,engine=engine) as session: repo = Repo.get_by_repo_git(session, repo_git) @@ -366,7 +369,7 @@ def git_update_commit_count_weight(repo_git): update_query = ( update(CollectionStatus) .where(CollectionStatus.repo_id == repo.repo_id) - .values(facade_weight=weight) + .values(facade_weight=weight,commit_sum=commit_count) ) session.execute(update_query) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py index c7c8925559..a9c48f42ab 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/facade02utilitymethods.py @@ -143,26 +143,37 @@ def get_existing_commits_set(session, repo_id): return set(existing_commits) -def get_repo_weight_by_commit(logger,repo_git): - with FacadeSession(logger) as session: - repo = Repo.get_by_repo_git(session, repo_git) - - - absolute_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_group_id, repo.repo_path, repo.repo_name) - repo_loc = (f"{absolute_path}/.git") - - #git --git-dir <.git directory> rev-list --count HEAD - check_commit_count_cmd = check_output(["git","--git-dir",repo_loc, "rev-list", "--count", "HEAD"]) - - commit_count = int(check_commit_count_cmd) - - try: - status = repo.collection_status[0] - time_factor = calculate_date_weight_from_timestamps(repo.repo_added, status.facade_data_last_collected) - except IndexError: - time_factor = calculate_date_weight_from_timestamps(repo.repo_added, None) + +def get_repo_commit_count(session,repo_git): + + repo = Repo.get_by_repo_git(session, repo_git) + + + absolute_path = get_absolute_repo_path(session.repo_base_directory, repo.repo_group_id, repo.repo_path, repo.repo_name) + repo_loc = (f"{absolute_path}/.git") + + #git --git-dir <.git directory> rev-list --count HEAD + check_commit_count_cmd = check_output(["git","--git-dir",repo_loc, "rev-list", "--count", "HEAD"]) + + commit_count = int(check_commit_count_cmd) + + return commit_count + +def get_facade_weight_time_factor(session,repo_git): + repo = Repo.get_by_repo_git(session, repo_git) + + try: + status = repo.collection_status[0] + time_factor = calculate_date_weight_from_timestamps(repo.repo_added, status.facade_data_last_collected) + except IndexError: + time_factor = calculate_date_weight_from_timestamps(repo.repo_added, None) #Adjust for commits. time_factor *= 1.2 - return max(0, commit_count - time_factor) \ No newline at end of file + return time_factor + + +def get_repo_weight_by_commit(logger,repo_git): + with FacadeSession(logger) as session: + return get_repo_commit_count(session, repo_git) - get_facade_weight_time_factor(session, repo_git) \ No newline at end of file diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index a1eb644f21..ae60c09761 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -230,7 +230,7 @@ def secondary_task_success_util_gen(repo_git): limit = max_repo-active_repo_count - secondary_order = CollectionStatus.core_weight + secondary_order = CollectionStatus.secondary_weight repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(primary_collected,not_erroed, not_collecting),limit,order=secondary_order) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 908e9260c1..f605acd749 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -17,7 +17,7 @@ from augur.application.db.models import CollectionStatus, Repo from augur.application.db.util import execute_session_query from augur.application.config import AugurConfig -from augur.tasks.github.util.util import get_owner_repo, get_repo_weight_core, calculate_date_weight_from_timestamps, get_repo_weight_by_issue +from augur.tasks.github.util.util import get_owner_repo, get_repo_weight_core, get_repo_weight_by_issue from augur.tasks.github.util.gh_graphql_entities import GitHubRepo as GitHubRepoGraphql from augur.tasks.github.util.gh_graphql_entities import GraphQlPageCollection from augur.tasks.github.util.github_task_session import GithubTaskManifest @@ -153,8 +153,6 @@ def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): weight = sum(issue_and_pr_nums)#get_repo_weight_core(logger,repo_git) weight -= calculate_date_weight_from_timestamps(repo.repo_added, status.core_data_last_collected) - - weight = max(0,weight) except Exception as e: logger.error(f"{e}") weight = None @@ -170,7 +168,7 @@ def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): update_query = ( update(CollectionStatus) .where(CollectionStatus.repo_id == repo.repo_id) - .values(core_weight=weight) + .values(core_weight=weight,issue_pr_sum=sum(issue_and_pr_nums)) ) session.execute(update_query) From 3164692069a9e7fe6bef6567f8adf2fb37d16871 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 18 Apr 2023 10:45:53 -0500 Subject: [PATCH 37/46] fix update logic Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 31 ++++++++++--- augur/tasks/util/collection_util.py | 67 +++++++++++++++-------------- 2 files changed, 60 insertions(+), 38 deletions(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index ae60c09761..9ca4c5af2a 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -10,7 +10,7 @@ #from celery.result import AsyncResult from celery import signature from celery import group, chain, chord, signature -from sqlalchemy import or_, and_ +from sqlalchemy import or_, and_, update from augur.tasks.github import * @@ -33,6 +33,7 @@ from augur.tasks.util.redis_list import RedisList from augur.application.db.models import CollectionStatus, Repo from augur.tasks.util.collection_util import * +from augur.tasks.git.util.facade_worker.facade_worker.facade02utilitymethods import get_facade_weight_time_factor CELERY_GROUP_TYPE = type(group()) CELERY_CHAIN_TYPE = type(chain()) @@ -358,8 +359,14 @@ def augur_collection_update_weights(): core_weight_update_repos = session.query(CollectionStatus).filter(CollectionStatus.core_weight != None).all() - for repo in core_weight_update_repos: - core_task_fetch_issues_prs_update_weight_util(repo.repo_git) + for status in core_weight_update_repos: + repo = Repo.get_by_id(session, status.repo_id) + + repo_git = repo.repo_git + status = repo.collection_status[0] + raw_count = status.issue_pr_sum + + core_task_update_weight_util([int(raw_count)],repo_git=repo_git,session=session) facade_not_pending = CollectionStatus.facade_status != CollectionState.PENDING.value facade_not_failed = CollectionStatus.facade_status != CollectionState.FAILED_CLONE.value @@ -367,6 +374,20 @@ def augur_collection_update_weights(): facade_weight_update_repos = session.query(CollectionStatus).filter(and_(facade_not_pending,facade_not_failed,facade_weight_not_null)).all() - for repo in facade_weight_update_repos: - task_id = git_update_commit_count_weight(repo_git) + for status in facade_weight_update_repos: + repo = Repo.get_by_id(session, status.repo_id) + + commit_count = status.commit_sum + date_factor = get_facade_weight_time_factor(session, repo.repo_git) + weight = commit_count - date_factor + + update_query = ( + update(CollectionStatus) + .where(CollectionStatus.repo_id == status.repo_id) + .values(facade_weight=weight) + ) + + session.execute(update_query) + session.commit() + #git_update_commit_count_weight(repo_git) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index f605acd749..4637607ee4 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -136,52 +136,53 @@ def core_task_success_util(repo_git): session.commit() -#This task updates the weight with the issues and prs already passed in -@celery.task -def core_task_update_weight_util(issue_and_pr_nums,repo_git=None): - from augur.tasks.init.celery_app import engine - logger = logging.getLogger(core_task_update_weight_util.__name__) +def update_repo_core_weight(logger,session,repo_git,raw_sum): + repo = Repo.get_by_repo_git(session, repo_git) + status = repo.collection_status[0] - if repo_git is None: - return - - with DatabaseSession(logger,engine=engine) as session: - repo = Repo.get_by_repo_git(session, repo_git) - status = repo.collection_status[0] + try: + weight = raw_sum#get_repo_weight_core(logger,repo_git) - try: - weight = sum(issue_and_pr_nums)#get_repo_weight_core(logger,repo_git) + weight -= calculate_date_weight_from_timestamps(repo.repo_added, status.core_data_last_collected) + except Exception as e: + logger.error(f"{e}") + weight = None - weight -= calculate_date_weight_from_timestamps(repo.repo_added, status.core_data_last_collected) - except Exception as e: - logger.error(f"{e}") - weight = None + logger.info(f"Repo {repo_git} has a weight of {weight}") + + logger.info(f"Args: {raw_sum} , {repo_git}") - logger.info(f"Repo {repo_git} has a weight of {weight}") + if weight is None: + return - logger.info(f"Args: {issue_and_pr_nums} , {repo_git}") - if weight is None: - return + update_query = ( + update(CollectionStatus) + .where(CollectionStatus.repo_id == repo.repo_id) + .values(core_weight=weight,issue_pr_sum=raw_sum) + ) + session.execute(update_query) + session.commit() - update_query = ( - update(CollectionStatus) - .where(CollectionStatus.repo_id == repo.repo_id) - .values(core_weight=weight,issue_pr_sum=sum(issue_and_pr_nums)) - ) - session.execute(update_query) - session.commit() -#Same as above just fetches the count with an api call before passing it into the above function. +#This task updates the weight with the issues and prs already passed in @celery.task -def core_task_fetch_issues_prs_update_weight_util(repo_git): - logger = logging.getLogger(core_task_fetch_issues_prs_update_weight_util.__name__) +def core_task_update_weight_util(issue_and_pr_nums,repo_git=None,session=None): + from augur.tasks.init.celery_app import engine + logger = logging.getLogger(core_task_update_weight_util.__name__) + + if repo_git is None: + return - raw_count = get_repo_weight_by_issue(logger,repo_git) + if session is not None: + update_repo_core_weight(logger, session, repo_git, sum(issue_and_pr_nums)) + else: + with DatabaseSession(logger,engine=engine) as session: + update_repo_core_weight(logger,session,repo_git,sum(issue_and_pr_nums)) + - core_task_update_weight_util([int(raw_count)],repo_git=repo_git) @celery.task def secondary_task_success_util(repo_git): From 18acf48d7e603ce55ddd8fd34fdc87d7d2ef93ef Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 18 Apr 2023 11:20:38 -0500 Subject: [PATCH 38/46] make secondary weight differant Signed-off-by: Isaac Milarsky --- augur/application/db/models/augur_operations.py | 5 ++++- augur/tasks/util/collection_util.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index f6db738ea6..246d07951f 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -1033,12 +1033,15 @@ def insert(session, repo_id): try: pr_issue_count = get_repo_weight_by_issue(session.logger, repo_git) + session.logger.info(f"date weight: {calculate_date_weight_from_timestamps(repo.repo_added, None)}") + github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) except Exception as e: pr_issue_count = None + github_weight = None session.logger.error( ''.join(traceback.format_exception(None, e, e.__traceback__))) - github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) + record = { "repo_id": repo_id, "issue_pr_sum": pr_issue_count, diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 4637607ee4..218ce4d8cc 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -144,9 +144,12 @@ def update_repo_core_weight(logger,session,repo_git,raw_sum): weight = raw_sum#get_repo_weight_core(logger,repo_git) weight -= calculate_date_weight_from_timestamps(repo.repo_added, status.core_data_last_collected) + + secondary_tasks_weight = raw_sum - calculate_date_weight_from_timestamps(repo.repo_added, status.secondary_data_last_collected) except Exception as e: logger.error(f"{e}") weight = None + secondary_tasks_weight = None logger.info(f"Repo {repo_git} has a weight of {weight}") @@ -159,7 +162,7 @@ def update_repo_core_weight(logger,session,repo_git,raw_sum): update_query = ( update(CollectionStatus) .where(CollectionStatus.repo_id == repo.repo_id) - .values(core_weight=weight,issue_pr_sum=raw_sum) + .values(core_weight=weight,issue_pr_sum=raw_sum,secondary_weight=secondary_tasks_weight) ) session.execute(update_query) From 4ebc2c8de7a8aae638148dd09e5d6c458725f5fd Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 18 Apr 2023 12:04:11 -0500 Subject: [PATCH 39/46] implement new/old repo hierarchy as discussed Signed-off-by: Isaac Milarsky --- .../application/db/models/augur_operations.py | 2 +- augur/tasks/start_tasks.py | 73 ++++++++++++------- augur/tasks/util/collection_util.py | 14 ++++ 3 files changed, 63 insertions(+), 26 deletions(-) diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 246d07951f..17035c856a 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -1033,7 +1033,7 @@ def insert(session, repo_id): try: pr_issue_count = get_repo_weight_by_issue(session.logger, repo_git) - session.logger.info(f"date weight: {calculate_date_weight_from_timestamps(repo.repo_added, None)}") + #session.logger.info(f"date weight: {calculate_date_weight_from_timestamps(repo.repo_added, None)}") github_weight = pr_issue_count - calculate_date_weight_from_timestamps(repo.repo_added, None) except Exception as e: pr_issue_count = None diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 9ca4c5af2a..4ade612f68 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -185,23 +185,30 @@ def core_task_success_util_gen(repo_git): not_erroed = CollectionStatus.core_status != str(CollectionState.ERROR.value) not_collecting = CollectionStatus.core_status != str(CollectionState.COLLECTING.value) + never_collected = CollectionStatus.core_data_last_collected == None limit = max_repo-active_repo_count core_order = CollectionStatus.core_weight #Get repos for primary collection hook - repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(not_erroed, not_collecting),limit,order=core_order) + collection_size = start_block_of_repos( + session.logger, session, + and_(not_erroed, not_collecting,never_collected), + limit, primary_enabled_phases,sort=core_order + ) - session.logger.info(f"Starting primary collection on {len(repo_git_identifiers)} repos") - if len(repo_git_identifiers) == 0: - return 0 - session.logger.info(f"Primary collection starting for: {tuple(repo_git_identifiers)}") + #Now start old repos if there is space to do so. + limit -= len(collection_size) - primary_augur_collection = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=primary_enabled_phases) + collected_before = CollectionStatus.core_data_last_collected != None - primary_augur_collection.start_data_collection() + start_block_of_repos( + session.logger, session, + and_(not_erroed, not_collecting,collected_before), + limit, primary_enabled_phases,sort=core_order + ) def start_secondary_collection(session,max_repo): @@ -228,22 +235,30 @@ def secondary_task_success_util_gen(repo_git): not_erroed = CollectionStatus.secondary_status != str(CollectionState.ERROR.value) not_collecting = CollectionStatus.secondary_status != str(CollectionState.COLLECTING.value) primary_collected = CollectionStatus.core_status == str(CollectionState.SUCCESS.value) + never_collected = CollectionStatus.secondary_data_last_collected == None limit = max_repo-active_repo_count secondary_order = CollectionStatus.secondary_weight - repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(primary_collected,not_erroed, not_collecting),limit,order=secondary_order) - - session.logger.info(f"Starting secondary collection on {len(repo_git_identifiers)} repos") - if len(repo_git_identifiers) == 0: - return 0 - - session.logger.info(f"Secondary collection starting for: {tuple(repo_git_identifiers)}") + collection_size = start_block_of_repos( + session.logger, session, + and_(primary_collected,not_erroed, not_collecting,never_collected), + limit, secondary_enabled_phases, + hook="secondary", + sort=secondary_order + ) - secondary_augur_collection = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=secondary_enabled_phases,collection_hook="secondary") + limit -= collection_size + collected_before = CollectionStatus.secondary_data_last_collected != None - secondary_augur_collection.start_data_collection() + start_block_of_repos( + session.logger, session, + and_(primary_collected,not_erroed, not_collecting,collected_before), + limit, secondary_enabled_phases, + hook="secondary", + sort=secondary_order + ) def start_facade_clone_update(session,max_repo,days): @@ -303,22 +318,30 @@ def facade_task_success_util_gen(repo_git): not_failed_clone = CollectionStatus.facade_status != str(CollectionState.FAILED_CLONE.value) not_collecting = CollectionStatus.facade_status != str(CollectionState.COLLECTING.value) not_initializing = CollectionStatus.facade_status != str(CollectionState.INITIALIZING.value) + never_collected = CollectionStatus.facade_data_last_collected == None limit = max_repo-active_repo_count facade_order = CollectionStatus.facade_weight - repo_git_identifiers = get_collection_status_repo_git_from_filter(session,and_(not_pending,not_failed_clone,not_erroed, not_collecting, not_initializing),limit,order=facade_order) - - session.logger.info(f"Starting facade collection on {len(repo_git_identifiers)} repos") - if len(repo_git_identifiers) == 0: - return 0 + collection_size = start_block_of_repos( + session.logger, session, + and_(not_pending,not_failed_clone,not_erroed, not_collecting, not_initializing,never_collected), + limit, facade_enabled_phases, + hook="facade", + sort=facade_order + ) - session.logger.info(f"Facade collection starting for: {tuple(repo_git_identifiers)}") + limit -= collection_size + collected_before = CollectionStatus.facade_data_last_collected != None - facade_augur_collection = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=facade_enabled_phases,collection_hook="facade") - - facade_augur_collection.start_data_collection() + start_block_of_repos( + session.logger, session, + and_(not_pending,not_failed_clone,not_erroed, not_collecting, not_initializing,collected_before), + limit, facade_enabled_phases, + hook="facade", + sort=facade_order + ) @celery.task diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 218ce4d8cc..3b0e679dc7 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -397,3 +397,17 @@ def send_messages(self): #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated yield repo_git, task_id +def start_block_of_repos(logger,session,condition,limit,phases,hook="core",sort=None): + repo_git_identifiers = get_collection_status_repo_git_from_filter(session,condition,limit,order=sort) + + logger.info(f"Starting new collection on {hook}: {len(repo_git_identifiers)} repos") + if len(repo_git_identifiers) == 0: + return 0 + + logger.info(f"Collection starting for {hook}: {tuple(repo_git_identifiers)}") + + routine = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=phases,collection_hook=hook) + + routine.start_data_collection() + + return len(repo_git_identifiers) \ No newline at end of file From e01c1d6e1273373f5ad3665069476d13f5b31fd9 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 18 Apr 2023 12:18:06 -0500 Subject: [PATCH 40/46] move reset-repo-age to augur db Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 14 -------------- augur/application/cli/db.py | 17 +++++++++++++++++ scripts/install/install.sh | 2 +- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index 3625a2b92a..d74a7e8d76 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -443,21 +443,7 @@ def order_repos(repos): return repo_git_urls -@cli.command('reset-repo-age') -@test_connection -@test_db_connection -def reset_repo_age(): - - logger.info("resetting collection age on all repositories") - - with DatabaseSession(logger) as session: - update_query = ( - update(Repo) - .values(repo_added=datetime.now()) - ) - session.execute(update_query) - session.commit() # def initialize_components(augur_app, disable_housekeeper): # master = None diff --git a/augur/application/cli/db.py b/augur/application/cli/db.py index 382d5042a6..41b5655476 100644 --- a/augur/application/cli/db.py +++ b/augur/application/cli/db.py @@ -456,3 +456,20 @@ def check_pgpass_credentials(config): pgpass_file.write(credentials_string + "\n") else: print("Credentials found in $HOME/.pgpass") + + +@cli.command('reset-repo-age') +@test_connection +@test_db_connection +def reset_repo_age(): + + logger.info("resetting collection age on all repositories") + + with DatabaseSession(logger) as session: + update_query = ( + update(Repo) + .values(repo_added=datetime.now()) + ) + + session.execute(update_query) + session.commit() \ No newline at end of file diff --git a/scripts/install/install.sh b/scripts/install/install.sh index 479640f8cb..c8ea6acf36 100755 --- a/scripts/install/install.sh +++ b/scripts/install/install.sh @@ -88,7 +88,7 @@ scripts/install/config.sh $target # scripts/install/api_key.sh augur db check-pgpass - +augur db reset-repo-age echo "**********************************" echo "***** INSTALLATION COMPLETE *****" From dccb7c1268c57754313649e6fbd95831f4be2410 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 18 Apr 2023 12:44:59 -0500 Subject: [PATCH 41/46] move reset-repo-age to augur db Signed-off-by: Isaac Milarsky --- augur/application/cli/backend.py | 1 - augur/application/cli/db.py | 15 ++++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index d74a7e8d76..ae94b1f800 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -16,7 +16,6 @@ import uuid import traceback from urllib.parse import urlparse -from sqlalchemy import update from datetime import datetime from augur import instance_id diff --git a/augur/application/cli/db.py b/augur/application/cli/db.py index 41b5655476..f09aaabbd2 100644 --- a/augur/application/cli/db.py +++ b/augur/application/cli/db.py @@ -22,6 +22,9 @@ from augur.application.db.session import DatabaseSession from augur.application.logs import AugurLogger from augur.application.db.engine import DatabaseEngine +from sqlalchemy import update +from datetime import datetime +from augur.application.db.models import Repo logger = logging.getLogger(__name__) @@ -373,7 +376,6 @@ def init_database( def test_db_connection(): pass - # TODO: Fix this function def run_psql_command_in_database(target_type, target): if target_type not in ["-f", "-c"]: @@ -458,12 +460,11 @@ def check_pgpass_credentials(config): print("Credentials found in $HOME/.pgpass") -@cli.command('reset-repo-age') -@test_connection -@test_db_connection +#NOTE: For some reason when I try to add function decorators to this function +#click thinks it's an argument and tries to parse it but it errors since a function +#isn't an iterable. +@cli.command("reset-repo-age") def reset_repo_age(): - - logger.info("resetting collection age on all repositories") with DatabaseSession(logger) as session: update_query = ( @@ -472,4 +473,4 @@ def reset_repo_age(): ) session.execute(update_query) - session.commit() \ No newline at end of file + session.commit() From 01919fd8531ce7bb7410ffffd7536c25f43aaad8 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 18 Apr 2023 12:52:59 -0500 Subject: [PATCH 42/46] stop alembic from adding repos to collection_status Signed-off-by: Isaac Milarsky --- .../alembic/versions/5_add_collection_status_table.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/augur/application/schema/alembic/versions/5_add_collection_status_table.py b/augur/application/schema/alembic/versions/5_add_collection_status_table.py index b233049368..ceb6024e57 100644 --- a/augur/application/schema/alembic/versions/5_add_collection_status_table.py +++ b/augur/application/schema/alembic/versions/5_add_collection_status_table.py @@ -34,14 +34,6 @@ def upgrade(): schema='augur_operations' ) - # add collection status for any existing repos - conn = op.get_bind() - repos = conn.execute(text("""SELECT repo_id from repo""")).fetchall() - - for repo in repos: - repo_id = repo[0] - conn.execute(text(f"""INSERT INTO "augur_operations"."collection_status" ("repo_id") VALUES ({repo_id});""")) - conn.execute(text(""" UPDATE augur_operations.config SET value = '600' From d44c894095011b5e34fee6d45c4fed185fbf385e Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 18 Apr 2023 12:57:47 -0500 Subject: [PATCH 43/46] typo Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 4ade612f68..6094f91342 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -200,7 +200,7 @@ def core_task_success_util_gen(repo_git): #Now start old repos if there is space to do so. - limit -= len(collection_size) + limit -= collection_size collected_before = CollectionStatus.core_data_last_collected != None From 9102ce9316734b88384a4c4be1520066ecc405ba Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 18 Apr 2023 13:21:36 -0500 Subject: [PATCH 44/46] use default repo_added Signed-off-by: Isaac Milarsky --- .../alembic/versions/legacy/80.3-sample-data.sql | 14 +++++++------- scripts/install/install.sh | 1 - 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/augur/application/schema/alembic/versions/legacy/80.3-sample-data.sql b/augur/application/schema/alembic/versions/legacy/80.3-sample-data.sql index a8a60c8cdc..26a6451ba2 100644 --- a/augur/application/schema/alembic/versions/legacy/80.3-sample-data.sql +++ b/augur/application/schema/alembic/versions/legacy/80.3-sample-data.sql @@ -1072,13 +1072,13 @@ INSERT INTO "augur_data"."contributor_affiliations"("ca_id", "ca_domain", "tool_ INSERT INTO "augur_data"."repo_groups"("repo_group_id", "rg_name", "rg_description", "rg_website", "rg_recache", "rg_last_modified", "rg_type", "tool_source", "tool_version", "data_source", "data_collection_date") VALUES (10, 'Default Repo Group', 'The default repo group created by the schema generation script', '', 0, '2021-06-03 15:55:20', 'GitHub Organization', 'load', 'one', 'git', '2019-06-05 13:36:25'); -INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (25452, 10, 'https://github.com/chaoss/whitepaper', 'github.com/chaoss/', 'whitepaper', '2021-04-17 21:40:42', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, 'CLI', '1.0', 'Git', '2021-04-17 21:40:42', 0, NULL); -INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (24441, 10, 'https://github.com/operate-first/operate-first-twitter', 'github.com/operate-first/', 'operate-first-twitter', '2021-08-25 16:47:47', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, 'CLI', '1.0', 'Git', '2021-08-25 16:47:47', 0, NULL); -INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (24442, 10, 'https://github.com/operate-first/blueprint', 'github.com/operate-first/', 'blueprint', '2021-08-25 16:47:47', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, 'CLI', '1.0', 'Git', '2021-08-25 16:47:47', 0, NULL); -INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (25445, 10, 'https://github.com/chaoss/grimoirelab-perceval-opnfv', 'github.com/chaoss/', 'grimoirelab-perceval-opnfv', '2020-04-17 21:40:39', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, 'CLI', '1.0', 'Git', '2021-04-17 21:40:39', 0, NULL); -INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (1, 1, 'https://github.com/chaoss/augur', 'github.com/chaoss/', 'augur', '2021-08-10 14:28:44', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, 'data load', 'one', 'git', '2021-06-05 18:41:14', 0, NULL); -INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (25430, 10, 'https://github.com/SociallyCompute/update-test', 'github.com/SociallyCompute/', 'update-test', '2021-10-07 08:50:13', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, NULL, NULL, NULL, NULL, 0, NULL); -INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_added", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (25450, 10, 'https://github.com/chaoss/grimoirelab-hatstall', 'github.com/chaoss/', 'grimoirelab-hatstall', '2021-04-17 21:40:42', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, 'CLI', '1.0', 'Git', '2021-04-17 21:40:42', 0, NULL); +INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (25452, 10, 'https://github.com/chaoss/whitepaper', 'github.com/chaoss/', 'whitepaper', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, 'CLI', '1.0', 'Git', '2021-04-17 21:40:42', 0, NULL); +INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (24441, 10, 'https://github.com/operate-first/operate-first-twitter', 'github.com/operate-first/', 'operate-first-twitter', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, 'CLI', '1.0', 'Git', '2021-08-25 16:47:47', 0, NULL); +INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (24442, 10, 'https://github.com/operate-first/blueprint', 'github.com/operate-first/', 'blueprint', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, 'CLI', '1.0', 'Git', '2021-08-25 16:47:47', 0, NULL); +INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (25445, 10, 'https://github.com/chaoss/grimoirelab-perceval-opnfv', 'github.com/chaoss/', 'grimoirelab-perceval-opnfv', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, 'CLI', '1.0', 'Git', '2021-04-17 21:40:39', 0, NULL); +INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (1, 1, 'https://github.com/chaoss/augur', 'github.com/chaoss/', 'augur', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, 'data load', 'one', 'git', '2021-06-05 18:41:14', 0, NULL); +INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (25430, 10, 'https://github.com/SociallyCompute/update-test', 'github.com/SociallyCompute/', 'update-test', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, NULL, NULL, NULL, NULL, 0, NULL); +INSERT INTO "augur_data"."repo" ("repo_id", "repo_group_id", "repo_git", "repo_path", "repo_name", "repo_status", "repo_type", "url", "owner_id", "description", "primary_language", "created_at", "forked_from", "updated_at", "tool_source", "tool_version", "data_source", "data_collection_date", "repo_archived", "repo_archived_date_collected") VALUES (25450, 10, 'https://github.com/chaoss/grimoirelab-hatstall', 'github.com/chaoss/', 'grimoirelab-hatstall', 'Complete', '', NULL, NULL, NULL, NULL, NULL, 'Parent not available', NULL, 'CLI', '1.0', 'Git', '2021-04-17 21:40:42', 0, NULL); UPDATE "augur_data"."repo" set repo_name = NULL, repo_path = NULL, repo_status = 'New'; diff --git a/scripts/install/install.sh b/scripts/install/install.sh index c8ea6acf36..f5dbe89902 100755 --- a/scripts/install/install.sh +++ b/scripts/install/install.sh @@ -88,7 +88,6 @@ scripts/install/config.sh $target # scripts/install/api_key.sh augur db check-pgpass -augur db reset-repo-age echo "**********************************" echo "***** INSTALLATION COMPLETE *****" From 241ae72eb4b12a41b7f5c1e0bfc4ce3cf4df0360 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 18 Apr 2023 13:31:10 -0500 Subject: [PATCH 45/46] limit check before old repo schedule Signed-off-by: Isaac Milarsky --- augur/tasks/start_tasks.py | 41 ++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/augur/tasks/start_tasks.py b/augur/tasks/start_tasks.py index 6094f91342..3e999c5c47 100644 --- a/augur/tasks/start_tasks.py +++ b/augur/tasks/start_tasks.py @@ -204,11 +204,12 @@ def core_task_success_util_gen(repo_git): collected_before = CollectionStatus.core_data_last_collected != None - start_block_of_repos( - session.logger, session, - and_(not_erroed, not_collecting,collected_before), - limit, primary_enabled_phases,sort=core_order - ) + if limit > 0: + start_block_of_repos( + session.logger, session, + and_(not_erroed, not_collecting,collected_before), + limit, primary_enabled_phases,sort=core_order + ) def start_secondary_collection(session,max_repo): @@ -252,13 +253,14 @@ def secondary_task_success_util_gen(repo_git): limit -= collection_size collected_before = CollectionStatus.secondary_data_last_collected != None - start_block_of_repos( - session.logger, session, - and_(primary_collected,not_erroed, not_collecting,collected_before), - limit, secondary_enabled_phases, - hook="secondary", - sort=secondary_order - ) + if limit > 0: + start_block_of_repos( + session.logger, session, + and_(primary_collected,not_erroed, not_collecting,collected_before), + limit, secondary_enabled_phases, + hook="secondary", + sort=secondary_order + ) def start_facade_clone_update(session,max_repo,days): @@ -335,13 +337,14 @@ def facade_task_success_util_gen(repo_git): limit -= collection_size collected_before = CollectionStatus.facade_data_last_collected != None - start_block_of_repos( - session.logger, session, - and_(not_pending,not_failed_clone,not_erroed, not_collecting, not_initializing,collected_before), - limit, facade_enabled_phases, - hook="facade", - sort=facade_order - ) + if limit > 0: + start_block_of_repos( + session.logger, session, + and_(not_pending,not_failed_clone,not_erroed, not_collecting, not_initializing,collected_before), + limit, facade_enabled_phases, + hook="facade", + sort=facade_order + ) @celery.task From c02068f8b89f81af3836b3188163294fc37840f3 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 18 Apr 2023 13:41:00 -0500 Subject: [PATCH 46/46] alembic changes Signed-off-by: Isaac Milarsky --- .../versions/16_add_weight_data_to_collection_status_to_.py | 5 ----- .../schema/alembic/versions/5_add_collection_status_table.py | 3 +++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py b/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py index 95a36f08c0..9126606cb2 100644 --- a/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py +++ b/augur/application/schema/alembic/versions/16_add_weight_data_to_collection_status_to_.py @@ -24,16 +24,11 @@ def upgrade(): op.add_column('collection_status', sa.Column('issue_pr_sum', sa.BigInteger()), schema='augur_operations') op.add_column('collection_status', sa.Column('commit_sum', sa.BigInteger()), schema='augur_operations') - - op.drop_constraint('collection_status_repo_id_fk', 'collection_status', schema='augur_operations', type_='foreignkey') - op.create_foreign_key('collection_status_repo_id_fk', 'collection_status', 'repo', ['repo_id'], ['repo_id'], source_schema='augur_operations', referent_schema='augur_data') # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### - op.drop_constraint('collection_status_repo_id_fk', 'collection_status', schema='augur_operations', type_='foreignkey') - op.create_foreign_key('collection_status_repo_id_fk', 'collection_status', 'repo', ['repo_id'], ['repo_id'], source_schema='augur_operations') op.drop_column('collection_status', 'facade_weight', schema='augur_operations') op.drop_column('collection_status', 'core_weight', schema='augur_operations') op.drop_column('collection_status', 'secondary_weight', schema='augur_operations') diff --git a/augur/application/schema/alembic/versions/5_add_collection_status_table.py b/augur/application/schema/alembic/versions/5_add_collection_status_table.py index ceb6024e57..bc97278a19 100644 --- a/augur/application/schema/alembic/versions/5_add_collection_status_table.py +++ b/augur/application/schema/alembic/versions/5_add_collection_status_table.py @@ -34,6 +34,9 @@ def upgrade(): schema='augur_operations' ) + # add collection status for any existing repos + conn = op.get_bind() + conn.execute(text(""" UPDATE augur_operations.config SET value = '600'