diff --git a/README.md b/README.md index 296c50a241..02ec125fb6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.63.0 +# Augur NEW Release v0.70.0 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o ## NEW RELEASE ALERT! ### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md) -Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.63.0 +Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.70.0 - The `main` branch is a stable version of our new architecture, which features: - Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks. diff --git a/augur/application/cli/__init__.py b/augur/application/cli/__init__.py index e07e880bd9..f15758c9cf 100644 --- a/augur/application/cli/__init__.py +++ b/augur/application/cli/__init__.py @@ -3,9 +3,9 @@ from functools import update_wrapper import os import sys -import socket import re import json +import httpx from augur.application.db.engine import DatabaseEngine from augur.application.db import get_engine, dispose_database_engine @@ -16,13 +16,22 @@ def test_connection(function_internet_connection): @click.pass_context def new_func(ctx, *args, **kwargs): usage = re.search(r"Usage:\s(.*)\s\[OPTIONS\]", str(ctx.get_usage())).groups()[0] - try: - #try to ping google's dns server - socket.create_connection(("8.8.8.8",53)) - return ctx.invoke(function_internet_connection, *args, **kwargs) - except OSError as e: - print(e) - print(f"\n\n{usage} command setup failed\nYou are not connect to the internet. 
Please connect to the internet to run Augur\n") +        with httpx.Client() as client: +            try: +                _ = client.request( +                    method="GET", url="http://chaoss.community", timeout=10, follow_redirects=True) + +                return ctx.invoke(function_internet_connection, *args, **kwargs) +            except (TimeoutError, httpx.TimeoutException): +                print("Request timed out.") +            except httpx.NetworkError as e: +                print(f"Network Error: {e}") +            except httpx.ProtocolError as e: +                print(f"Protocol Error: {e}") +            print(f"\n\n{usage} command setup failed\n \ +                You are not connected to the internet.\n \ +                Please connect to the internet to run Augur\n \ +                Consider setting http_proxy variables for limited access installations.") sys.exit() return update_wrapper(new_func, function_internet_connection)
diff --git a/augur/application/cli/backend.py b/augur/application/cli/backend.py index a0480adab4..d7a8ad745d 100644 --- a/augur/application/cli/backend.py +++ b/augur/application/cli/backend.py @@ -172,7 +172,7 @@ def determine_worker_processes(ratio,maximum): sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.25, 25) + secondary_num_processes = determine_worker_processes(.25, 45) logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" ")))
diff --git a/augur/application/cli/collection.py b/augur/application/cli/collection.py index 63c433a79e..7d65cad978 100644 --- a/augur/application/cli/collection.py +++ b/augur/application/cli/collection.py @@ -132,7 +132,7 @@ def determine_worker_processes(ratio,maximum): sleep_time += 6 #20% of estimate, Maximum value of 25 - secondary_num_processes = determine_worker_processes(.25, 25) + secondary_num_processes = determine_worker_processes(.25, 45) logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}") secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary" process_list.append(subprocess.Popen(secondary_worker.split(" ")))
diff --git a/augur/application/cli/db.py b/augur/application/cli/db.py index c2ffc9463e..c20fcf0b2e 100644 --- a/augur/application/cli/db.py +++ b/augur/application/cli/db.py @@ -14,7 +14,12 @@ import re import stat as stat_module -from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext +from augur.application.cli import ( +    test_connection, +    test_db_connection, +    with_database, +    DatabaseContext, +) from augur.application.db.session import DatabaseSession from sqlalchemy import update @@ -23,8 +28,9 @@ logger = logging.getLogger(__name__) + @click.group("db", short_help="Database utilities") -@click.pass_context +@click.pass_context def cli(ctx): ctx.obj = DatabaseContext() @@ -36,36 +42,43 @@ def cli(ctx): @with_database @click.pass_context def add_repos(ctx, filename): -    """Add repositories to Augur's database. +    """Add repositories to Augur's database. The .csv file format should be repo_url,group_id NOTE: The Group ID must already exist in the REPO_Groups Table. 
- If you want to add an entire GitHub organization, refer to the command: augur db add-github-org""" + If you want to add an entire GitHub organization, refer to the command: augur db add-github-org""" from augur.tasks.github.util.github_task_session import GithubTaskSession from augur.util.repo_load_controller import RepoLoadController with GithubTaskSession(logger, engine=ctx.obj.engine) as session: - controller = RepoLoadController(session) + line_total = len(open(filename).readlines()) with open(filename) as upload_repos_file: data = csv.reader(upload_repos_file, delimiter=",") - for row in data: - + for line_num, row in enumerate(data): repo_data = {} repo_data["url"] = row[0] try: repo_data["repo_group_id"] = int(row[1]) except ValueError: - print(f"Invalid repo group_id: {row[1]} for Git url: `{repo_data['url']}`") + print( + f"Invalid repo group_id: {row[1]} for Git url: `{repo_data['url']}`" + ) continue - + print( - f"Inserting repo with Git URL `{repo_data['url']}` into repo group {repo_data['repo_group_id']}") - controller.add_cli_repo(repo_data) + f"Inserting repo {line_num}/{line_total} with Git URL `{repo_data['url']}` into repo group {repo_data['repo_group_id']}" + ) + succeeded, message = controller.add_cli_repo(repo_data) + if not succeeded: + logger.error(f"insert repo failed with error: {message['status']}`") + else: + logger.info(f"Repo added: {repo_data}") + print("Success") @cli.command("get-repo-groups") @@ -101,7 +114,6 @@ def add_repo_groups(ctx, filename): Create new repo groups in Augur's database """ with ctx.obj.engine.begin() as connection: - df = pd.read_sql( s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups"), connection, @@ -117,7 +129,6 @@ def add_repo_groups(ctx, filename): with open(filename) as create_repo_groups_file: data = csv.reader(create_repo_groups_file, delimiter=",") for row in data: - # Handle case where there's a hanging empty row. if not row: logger.info("Skipping empty data...") @@ -137,6 +148,7 @@ def add_repo_groups(ctx, filename): f"Repo group with ID {row[1]} for repo group {row[1]} already exists, skipping..." 
) + @cli.command("add-github-org") @click.argument("organization_name") @test_connection @@ -151,14 +163,13 @@ def add_github_org(ctx, organization_name): from augur.util.repo_load_controller import RepoLoadController with GithubTaskSession(logger, engine=ctx.obj.engine) as session: - controller = RepoLoadController(session) controller.add_cli_org(organization_name) + # get_db_version is a helper function to print_db_version and upgrade_db_version def get_db_version(engine): - db_version_sql = s.sql.text( """ SELECT * FROM augur_operations.augur_settings WHERE setting = 'augur_data_version' @@ -166,14 +177,12 @@ def get_db_version(engine): ) with engine.connect() as connection: - result = int(connection.execute(db_version_sql).fetchone()[2]) engine.dispose() return result - @cli.command("print-db-version") @test_connection @test_db_connection @@ -252,10 +261,10 @@ def update_api_key(ctx, api_key): ) with ctx.obj.engine.begin() as connection: - connection.execute(update_api_key_sql, api_key=api_key) logger.info(f"Updated Augur API key to: {api_key}") + @cli.command("get-api-key") @test_connection @test_db_connection @@ -282,20 +291,21 @@ def get_api_key(ctx): def check_pgpass(): augur_db_env_var = getenv("AUGUR_DB") if augur_db_env_var: - # gets the user, passowrd, host, port, and database_name out of environment variable # assumes database string of structure //:@:/ # it returns a tuple like (, , , , bool: continue data = result.json() + if result.status_code == 403: #GH Rate limiting + wait_until = int(result.headers.get("x-ratelimit-reset")) + # use time package to find how many seconds to wait + wait_in_seconds = int( + mktime(gmtime(wait_until)) - + mktime(gmtime(time())) + ) + wait_until_time = localtime(wait_until) + logger.error(f"rate limited fetching {url}z") + logger.error(f"sleeping until {wait_until_time.tm_hour}:{wait_until_time.tm_min} ({wait_in_seconds} seconds)") + sleep(wait_in_seconds) # if there was an error return False if "message" in data.keys(): @@ -928,6 +946,8 @@ def is_valid_github_repo(gh_session, url: str) -> bool: return True, {"status": "Valid repo", "repo_type": data["owner"]["type"]} + return False, {"status": "Failed to validate repo after multiple attempts"} + @staticmethod def is_valid_gitlab_repo(gl_session, url: str) -> bool: """Determine whether a GitLab repo URL is valid. 
@@ -955,6 +975,11 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool: while attempts < 10: response = hit_api(gl_session.oauths, url, logger) +            if (wait_in_seconds := response.headers.get("Retry-After")) is not None: +                logger.info(f"rate limited fetching {url}, sleeping for {wait_in_seconds}") +                print(f"rate limited fetching {url}, sleeping for {wait_in_seconds}") +                sleep(int(wait_in_seconds)) + if response.status_code == 404: return False, {"status": "Invalid repo"} @@ -962,6 +987,8 @@ def is_valid_gitlab_repo(gl_session, url: str) -> bool: return True, {"status": "Valid repo"} attempts += 1 +            logger.info(f"could not validate {url}, will attempt again in {attempts*3} seconds") +            sleep(attempts*3) return False, {"status": "Failed to validate repo after multiple attempts"}
diff --git a/augur/application/db/models/augur_operations.py b/augur/application/db/models/augur_operations.py index 47f28b12f2..029444215e 100644 --- a/augur/application/db/models/augur_operations.py +++ b/augur/application/db/models/augur_operations.py @@ -329,6 +329,9 @@ def get_user(session, username: str): return user except NoResultFound: return None +        except Exception as e: +            session.rollback() +            raise e @staticmethod def get_by_id(session, user_id: int): @@ -1073,7 +1076,13 @@ def __eq__(self, other): @staticmethod def get_by_id(session, client_id): -        return session.query(ClientApplication).filter(ClientApplication.id == client_id).first() + +        try: +            return session.query(ClientApplication).filter(ClientApplication.id == client_id).first() +        except Exception as e: +            session.rollback() +            raise e + class Subscription(Base): __tablename__ = "subscriptions"
diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 296e69075e..e4c6273479 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -72,7 +72,11 @@ def generate_scorecard(session,repo_id,path): key_handler = GithubApiKeyHandler(session, session.logger) os.environ['GITHUB_AUTH_TOKEN'] = key_handler.get_random_key() -    required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) +    try: +        required_output = parse_json_from_subprocess_call(session.logger,['./scorecard', command, '--format=json'],cwd=path_to_scorecard) +    except Exception as e: +        session.logger.error(f"Could not parse required output! 
Error: {e}") + raise e session.logger.info('adding to database...') session.logger.debug(f"output: {required_output}") diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index b96c596bc7..9c699f7e79 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -439,32 +439,6 @@ def generate_analysis_sequence(logger,repo_git, session): return analysis_sequence - -def generate_contributor_sequence(logger,repo_git, session): - - contributor_sequence = [] - #all_repo_ids = [] - repo_id = None - - #contributor_sequence.append(facade_start_contrib_analysis_task.si()) - query = s.sql.text("""SELECT repo_id FROM repo - WHERE repo_git=:value""").bindparams(value=repo_git) - - repo = session.execute_sql(query).fetchone() - session.logger.info(f"repo: {repo}") - repo_id = repo[0] - #pdb.set_trace() - #breakpoint() - #for repo in all_repos: - # contributor_sequence.append(insert_facade_contributors.si(repo['repo_id'])) - #all_repo_ids = [repo['repo_id'] for repo in all_repos] - - #contrib_group = create_grouped_task_load(dataList=all_repo_ids,task=insert_facade_contributors)#group(contributor_sequence) - #contrib_group.link_error(facade_error_handler.s()) - #return contrib_group#chain(facade_start_contrib_analysis_task.si(), contrib_group) - return insert_facade_contributors.si(repo_id) - - def facade_phase(repo_git): logger = logging.getLogger(facade_phase.__name__) logger.info("Generating facade sequence") @@ -506,7 +480,7 @@ def facade_phase(repo_git): #Generate contributor analysis task group. if not limited_run or (limited_run and run_facade_contributors): - facade_core_collection.append(generate_contributor_sequence(logger,repo_git,session)) + facade_core_collection.append(insert_facade_contributors.si(repo_git)) #These tasks need repos to be cloned by facade before they can work. diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 64ce4f7409..6bf9888c07 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -194,14 +194,22 @@ def link_commits_to_contributor(session,contributorQueue): # Update the contributors table from the data facade has gathered. @celery.task(base=AugurFacadeRepoCollectionTask, bind=True) -def insert_facade_contributors(self, repo_id): +def insert_facade_contributors(self, repo_git): engine = self.app.engine logger = logging.getLogger(insert_facade_contributors.__name__) + repo_id = None with GithubTaskManifest(logger) as manifest: + #contributor_sequence.append(facade_start_contrib_analysis_task.si()) + query = s.sql.text("""SELECT repo_id FROM repo + WHERE repo_git=:value""").bindparams(value=repo_git) + + repo = manifest.augur_db.execute_sql(query).fetchone() + logger.info(f"repo: {repo}") + repo_id = repo[0] # Get all of the commit data's emails and names from the commit table that do not appear # in the contributors table or the contributors_aliases table. 
diff --git a/augur/tasks/github/messages/tasks.py b/augur/tasks/github/messages/tasks.py index 54a4c41e0c..f3a30a54f6 100644 --- a/augur/tasks/github/messages/tasks.py +++ b/augur/tasks/github/messages/tasks.py @@ -8,13 +8,12 @@ from augur.tasks.github.util.github_task_session import GithubTaskManifest from augur.tasks.util.worker_util import remove_duplicate_dicts from augur.tasks.github.util.util import get_owner_repo -from augur.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo - - +from augur.application.db.models import PullRequest, Message, Issue, PullRequestMessageRef, IssueMessageRef, Contributor, Repo, CollectionStatus +from augur.application.db import get_engine, get_session +from sqlalchemy.sql import text platform_id = 1 - @celery.task(base=AugurCoreRepoCollectionTask) def collect_github_messages(repo_git: str) -> None: @@ -29,18 +28,30 @@ def collect_github_messages(repo_git: str) -> None: owner, repo = get_owner_repo(repo_git) task_name = f"{owner}/{repo}: Message Task" - message_data = retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name) - - if message_data: + - process_messages(message_data, task_name, repo_id, logger, augur_db) + if is_repo_small(repo_id): + message_data = fast_retrieve_all_pr_and_issue_messages(repo_git, logger, manifest.key_auth, task_name) + + if message_data: + process_messages(message_data, task_name, repo_id, logger, augur_db) + + else: + logger.info(f"{owner}/{repo} has no messages") else: - logger.info(f"{owner}/{repo} has no messages") + process_large_issue_and_pr_message_collection(repo_id, repo_git, logger, manifest.key_auth, task_name, augur_db) + +def is_repo_small(repo_id): + with get_session() as session: -def retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, task_name) -> None: + result = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo_id, CollectionStatus.issue_pr_sum <= 10).first() + + return result != None + +def fast_retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, task_name) -> None: owner, repo = get_owner_repo(repo_git) @@ -77,7 +88,50 @@ def retrieve_all_pr_and_issue_messages(repo_git: str, logger, key_auth, task_nam return all_data - + + +def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger, key_auth, task_name, augur_db) -> None: + + owner, repo = get_owner_repo(repo_git) + + # define logger for task + logger.info(f"Collecting github comments for {owner}/{repo}") + + engine = get_engine() + + with engine.connect() as connection: + + query = text(f""" + (select pr_comments_url from pull_requests WHERE repo_id={repo_id} order by pr_created_at desc) + UNION + (select comments_url as comment_url from issues WHERE repo_id={repo_id} order by created_at desc); + """) + + result = connection.execute(query).fetchall() + comment_urls = [x[0] for x in result] + + all_data = [] + for index, comment_url in enumerate(comment_urls): + + logger.info(f"{task_name}: Github messages index {index+1} of {len(comment_urls)}") + + messages = GithubPaginator(comment_url, key_auth, logger) + for page_data, _ in messages.iter_pages(): + + if page_data is None or len(page_data) == 0: + break + + all_data += page_data + + logger.info(f"All data size: {len(all_data)}") + + if len(all_data) >= 20: + process_messages(all_data, task_name, repo_id, logger, augur_db) + all_data.clear() + + if len(all_data) > 0: + process_messages(all_data, task_name, repo_id, logger, augur_db) + def 
process_messages(messages, task_name, repo_id, logger, augur_db): diff --git a/augur/tasks/github/pull_requests/tasks.py b/augur/tasks/github/pull_requests/tasks.py index 69e40f6818..73ea1b025a 100644 --- a/augur/tasks/github/pull_requests/tasks.py +++ b/augur/tasks/github/pull_requests/tasks.py @@ -12,6 +12,8 @@ from augur.application.db.util import execute_session_query from ..messages.tasks import process_github_comment_contributors +from typing import Generator, List, Dict + platform_id = 1 @@ -29,20 +31,32 @@ def collect_pull_requests(repo_git: str) -> int: Repo.repo_git == repo_git).one().repo_id owner, repo = get_owner_repo(repo_git) - pr_data = retrieve_all_pr_data(repo_git, logger, manifest.key_auth) - if pr_data: - process_pull_requests(pr_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + total_count = 0 + all_data = [] + for page in retrieve_all_pr_data(repo_git, logger, manifest.key_auth): + all_data += page + + if len(all_data) >= 1000: + process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + total_count += len(all_data) + all_data.clear() + + if len(all_data): + process_pull_requests(all_data, f"{owner}/{repo}: Pr task", repo_id, logger, augur_db) + total_count += len(all_data) - return len(pr_data) + if total_count > 0: + return total_count else: logger.info(f"{owner}/{repo} has no pull requests") return 0 + # TODO: Rename pull_request_reviewers table to pull_request_requested_reviewers # TODO: Fix column names in pull request labels table -def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: +def retrieve_all_pr_data(repo_git: str, logger, key_auth): #-> Generator[List[Dict]]: owner, repo = get_owner_repo(repo_git) @@ -52,24 +66,21 @@ def retrieve_all_pr_data(repo_git: str, logger, key_auth) -> None: # returns an iterable of all prs at this url (this essentially means you can treat the prs variable as a list of the prs) prs = GithubPaginator(url, key_auth, logger) - all_data = [] num_pages = prs.get_num_pages() for page_data, page in prs.iter_pages(): if page_data is None: - return all_data + return if len(page_data) == 0: logger.debug( f"{owner}/{repo} Prs Page {page} contains no data...returning") logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") - return all_data + return logger.info(f"{owner}/{repo} Prs Page {page} of {num_pages}") - - all_data += page_data - - return all_data + + yield page_data def process_pull_requests(pull_requests, task_name, repo_id, logger, augur_db): diff --git a/augur/tasks/github/util/gh_graphql_entities.py b/augur/tasks/github/util/gh_graphql_entities.py index cb5df455b7..574adbbaf0 100644 --- a/augur/tasks/github/util/gh_graphql_entities.py +++ b/augur/tasks/github/util/gh_graphql_entities.py @@ -338,17 +338,21 @@ def __iter__(self): #self.logger.info(f"{params}") data = self.request_graphql_dict(variables=params) try: - coreData = self.extract_paginate_result(data) - #Check to make sure we have data - coreData['totalCount'] + coreData = self.extract_paginate_result(data) + if coreData is not None: + if coreData.get('totalCount') is not None: + self.logger.info("... 
core data obtained") + else: + self.logger.info(f"Helen, the ghost in our machine, did not get a numerical result for core data (value): {data} \n Zero value assigned.") + coreData['totalCount'] = 0 + else: + self.logger.error("Core data is None, cannot proceed with operations on it, but assigning a value of Zero to ensure continued collection.") + yield None + return except KeyError as e: self.logger.error("Could not extract paginate result because there was no data returned") - self.logger.error( - ''.join(traceback.format_exception(None, e, e.__traceback__))) - - self.logger.info(f"Graphql paramters: {params}") - return + self.logger.error(''.join(traceback.format_exception(None, e, e.__traceback__))) if int(coreData['totalCount']) == 0: diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 3561b19b40..9776258626 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -132,58 +132,76 @@ def __init__(self,name,phases,max_repo = 10,days_until_collect_again = 1, gitlab def get_active_repo_count(self,session): return len(session.query(CollectionStatus).filter(getattr(CollectionStatus,f"{self.name}_status" ) == CollectionState.COLLECTING.value).all()) - #Get repo urls based on passed in info. + def get_valid_repos(self,session): - #getattr(CollectionStatus,f"{hook}_status" ) represents the status of the given hook - #Get the count of repos that are currently running this collection hook - #status_column = f"{hook}_status" + active_repo_count = self.get_active_repo_count(session) + limit = self.max_repo-active_repo_count - #Will always disallow errored repos and repos that are already collecting + if limit <= 0: + return - #The maximum amount of repos to schedule is affected by the existing repos running tasks - limit = self.max_repo-active_repo_count + collection_list = get_newly_added_repos(session, limit, hook=self.name) + self.repo_list.extend(collection_list) + limit -= len(collection_list) - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_random_users_list(session,f"{self.name}_status",self.new_status) + #Now start recollecting other repos if there is space to do so. + if limit <= 0: + return - session.logger.info(f"User_list: {split_user_list}") + collection_list = get_repos_for_recollection(session, limit, hook=self.name, days_until_collect_again=self.days_until_collect_again) - #Iterate through each fourth of the users fetched - for quarter_list in split_user_list: - if limit <= 0: - return + self.repo_list.extend(collection_list) - collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) - self.repo_list.extend(collection_list) - #Update limit with amount of repos started - limit -= len(collection_list) +def get_newly_added_repos(session, limit, hook): - #Now start old repos if there is space to do so. 
- if limit <= 0: - return + condition_string = "" + if hook in ["core", "secondary", "ml"]: + condition_string += f"""{hook}_status='{str(CollectionState.PENDING.value)}'""" + + elif hook == "facade": + condition_string += f"""facade_status='{str(CollectionState.UPDATE.value)}'""" + + if hook == "secondary": + condition_string += f""" and core_status='{str(CollectionState.SUCCESS.value)}'""" + + repo_query = s.sql.text(f""" + select repo_git + from augur_operations.collection_status x, augur_data.repo y + where x.repo_id=y.repo_id + and {condition_string} + order by repo_added + limit :limit_num + """).bindparams(limit_num=limit) + valid_repos = session.execute_sql(repo_query).fetchall() + valid_repo_git_list = [repo[0] for repo in valid_repos] - user_list = get_list_of_all_users(session) - random.shuffle(user_list) + return valid_repo_git_list - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) +def get_repos_for_recollection(session, limit, hook, days_until_collect_again): - for quarter_list in split_user_list: + if hook in ["core", "secondary", "ml"]: + condition_string = f"""{hook}_status='{str(CollectionState.SUCCESS.value)}'""" - #Break out if limit has been reached - if limit <= 0: - return + elif hook == "facade": + condition_string = f"""facade_status='{str(CollectionState.SUCCESS.value)}'""" - #only start repos older than the specified amount of days - #Query a set of valid repositories sorted by weight, also making sure that the repos aren't new or errored - #Order by the relevant weight for the collection hook - collection_list = get_valid_repos_for_users(session,limit,tuple(quarter_list),allow_old_repos=True,hook=self.name, days_to_wait_until_next_collection=self.days_until_collect_again) + repo_query = s.sql.text(f""" + select repo_git + from augur_operations.collection_status x, repo y + where x.repo_id = y.repo_id + and {condition_string} + and {hook}_data_last_collected <= NOW() - INTERVAL '{days_until_collect_again} DAYS' + order by {hook}_data_last_collected + limit :limit_num + """).bindparams(limit_num=limit) - self.repo_list.extend(collection_list) - limit -= len(collection_list) + valid_repos = session.execute_sql(repo_query).fetchall() + valid_repo_git_list = [repo[0] for repo in valid_repos] + + return valid_repo_git_list def get_enabled_phase_names_from_config(): @@ -610,80 +628,3 @@ def send_messages(self): #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated yield repo_git, task_id, col_hook.name - -#def start_block_of_repos(logger,session,repo_git_identifiers,phases,repos_type,hook="core"): -# -# logger.info(f"Starting collection on {len(repo_git_identifiers)} {repos_type} {hook} repos") -# if len(repo_git_identifiers) == 0: -# return 0 -# -# logger.info(f"Collection starting for {hook}: {tuple(repo_git_identifiers)}") -# -# routine = AugurTaskRoutine(session,repos=repo_git_identifiers,collection_phases=phases,collection_hook=hook) -# -# routine.start_data_collection() -# -# return len(repo_git_identifiers) - -def get_valid_repos_for_users(session,limit,users,allow_old_repos = False,hook="core",days_to_wait_until_next_collection = 1): - - condition_string = "1" - - if hook == "core": - condition_string = get_required_conditions_for_core_repos(allow_collected_before=allow_old_repos,days_until_collect_again= days_to_wait_until_next_collection) - elif hook == "secondary": - condition_string = 
get_required_conditions_for_secondary_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - elif hook == "facade": - condition_string = get_required_conditions_for_facade_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - elif hook == "ml": - condition_string = get_required_conditions_for_ml_repos(allow_collected_before=allow_old_repos,days_until_collect_again = days_to_wait_until_next_collection) - - #Query a set of valid repositories sorted by weight, also making sure that the repos are new - #Order by the relevant weight for the collection hook - repo_query = s.sql.text(f""" - SELECT DISTINCT repo.repo_id, repo.repo_git, collection_status.{hook}_weight - FROM augur_operations.user_groups - JOIN augur_operations.user_repos ON augur_operations.user_groups.group_id = augur_operations.user_repos.group_id - JOIN augur_data.repo ON augur_operations.user_repos.repo_id = augur_data.repo.repo_id - JOIN augur_operations.collection_status ON augur_operations.user_repos.repo_id = augur_operations.collection_status.repo_id - WHERE user_id IN :list_of_user_ids AND {condition_string} - ORDER BY augur_operations.collection_status.{hook}_weight - LIMIT :limit_num - """).bindparams(list_of_user_ids=users,limit_num=limit) - - #Get a list of valid repo ids, limit set to 2 times the usual - valid_repos = session.execute_sql(repo_query).fetchall() - valid_repo_git_list = [repo[1] for repo in valid_repos] - - session.logger.info(f"valid repo git list: {tuple(valid_repo_git_list)}") - - #start repos for new primary collection hook - #collection_size = start_block_of_repos( - # session.logger, session, - # valid_repo_git_list, - # phases, repos_type=repos_type, hook=hook - #) - - return valid_repo_git_list - -def split_random_users_list(session,status_col, status_new): - #Split all users that have new repos into four lists and randomize order - query = s.sql.text(f""" - SELECT - user_id - FROM augur_operations.user_groups - JOIN augur_operations.user_repos ON augur_operations.user_groups.group_id = augur_operations.user_repos.group_id - JOIN augur_data.repo ON augur_operations.user_repos.repo_id = augur_data.repo.repo_id - JOIN augur_operations.collection_status ON augur_operations.user_repos.repo_id = augur_operations.collection_status.repo_id - WHERE {status_col}='{str(status_new)}' - GROUP BY user_id - """) - - user_list = session.execute_sql(query).fetchall() - random.shuffle(user_list) - - #Extract the user id from the randomized list and split into four chunks - split_user_list = split_list_into_chunks([row[0] for row in user_list], 4) - - return split_user_list - diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index e5c98c6dc4..6e158d199b 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.10-bookworm LABEL maintainer="outdoors@acm.org" -LABEL version="0.63.0" +LABEL version="0.70.0" ENV DEBIAN_FRONTEND=noninteractive diff --git a/docker/database/Dockerfile b/docker/database/Dockerfile index 7dad2c2887..1421e1f76c 100644 --- a/docker/database/Dockerfile +++ b/docker/database/Dockerfile @@ -2,7 +2,7 @@ FROM postgres:14 LABEL maintainer="outdoors@acm.org" -LABEL version="0.63.0" +LABEL version="0.70.0" ENV POSTGRES_DB "test" ENV POSTGRES_USER "augur" diff --git a/docker/rabbitmq/Dockerfile b/docker/rabbitmq/Dockerfile index 795e998777..9feca83cd9 100644 --- a/docker/rabbitmq/Dockerfile +++ b/docker/rabbitmq/Dockerfile @@ -1,7 
+1,7 @@ FROM rabbitmq:3.12-management-alpine LABEL maintainer="574/augur@simplelogin.com" -LABEL version="0.63.0" +LABEL version="0.70.0" ARG RABBIT_MQ_DEFAULT_USER=augur ARG RABBIT_MQ_DEFAULT_PASSWORD=password123 diff --git a/docs/new-install.md b/docs/new-install.md index ad9707b07f..c48e5e8ae7 100644 --- a/docs/new-install.md +++ b/docs/new-install.md @@ -18,7 +18,8 @@ Here we ensure your system is up to date, install required python libraries, ins ```shell sudo apt update && sudo apt upgrade && -sudo apt install software-properties-common && +sudo apt install software-properties-common && +sudo apt-get install libpq-dev && sudo apt install python3-dev && sudo apt install python3.10-venv && sudo apt install postgresql postgresql-contrib postgresql-client && diff --git a/docs/source/docker/docker.rst b/docs/source/docker/docker.rst index 170a5fce39..c1ef0e7818 100644 --- a/docs/source/docker/docker.rst +++ b/docs/source/docker/docker.rst @@ -10,7 +10,7 @@ Augur provides a separate Docker image for each layer of our application (databa Building the images -------------------- -All ``Dockerfiles`` and other Docker-related files are located in ``util/docker/``, where ```` is either ``backend``, ``frontend``, or ``database``. To build these images locally, use the following command, being sure to replace ```` and ```` as appropriate. +All ``Dockerfiles`` and other Docker-related files are located in ``docker/``, where ```` is either ``backend``, ``frontend``, or ``database``. To build these images locally, use the following command, being sure to replace ```` and ```` as appropriate. .. code-block:: bash diff --git a/docs/source/getting-started/database.rst b/docs/source/getting-started/database.rst index 056be08ba3..8eee00416a 100644 --- a/docs/source/getting-started/database.rst +++ b/docs/source/getting-started/database.rst @@ -1,7 +1,7 @@ Database setup =============== -One of the reasons that Augur is so powerful is because of its `unified data model <../schema/data-model.html>`_. +One of the reasons that Augur is so powerful is because of its `unified data model <../schema/overview.html>`_. To ensure this data model remains performant with large amounts of data, we use PostgreSQL as our database engine. We'll need to set up a PostgreSQL instance and create a database, after which Augur can take care of the rest. Make sure to save off the credentials you use when creating the database; you'll need them again to configure Augur. diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst index df0379f1f9..5c2e2f62f7 100644 --- a/docs/source/getting-started/installation.rst +++ b/docs/source/getting-started/installation.rst @@ -69,6 +69,7 @@ After installation, you must also set up your rabbitmq instance by running the b to communicate with nodes. Then, start rabbitmq server with + .. code-block:: bash sudo systemctl start rabbitmq.service @@ -76,7 +77,7 @@ Then, start rabbitmq server with If your setup of rabbitmq is successful your broker url should look like this: -broker_url = 'amqp://augur:password123@localhost:5672/augur_vhost' +``broker_url = 'amqp://augur:password123@localhost:5672/augur_vhost'`` During installation you will be prompted for this broker url. 
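Since the `broker_url` line formatted above is the value the installer prompts for, here is a minimal sketch of how a Celery application consumes such an AMQP URL; the module name, app name, and the Redis result backend are assumptions for illustration, not Augur's actual celery_app configuration.

```python
from celery import Celery

# Broker URL in the exact shape documented above (vhost included).
broker_url = "amqp://augur:password123@localhost:5672/augur_vhost"

app = Celery(
    "augur_sketch",                      # assumed app name
    broker=broker_url,
    backend="redis://localhost:6379/0",  # assumed result backend
)


@app.task
def ping() -> str:
    """Trivial task used to verify the broker connection end to end."""
    return "pong"
```

Starting a worker with `celery -A <module> worker -l info` and then calling `ping.delay().get()` from a Python shell is a quick way to confirm the RabbitMQ credentials and vhost are correct before bringing up the full Augur backend.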
diff --git a/metadata.py b/metadata.py index acca40a837..b914869d58 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.63.0" -__release__ = "v0.63.0 (Ides of March)" +__version__ = "0.70.0" +__release__ = "v0.70.0 (Windows 95 Man!)" __license__ = "MIT" __copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Brian Warner & Augurlabs 2024" diff --git a/podman-compose.yml b/podman-compose.yml new file mode 100644 index 0000000000..9970f81325 --- /dev/null +++ b/podman-compose.yml @@ -0,0 +1,68 @@ +#SPDX-License-Identifier: MIT +version: '3' +services: + augur-db: + image: postgres:14 + restart: unless-stopped + environment: + - "POSTGRES_DB=augur" + - "POSTGRES_USER=${AUGUR_DB_USER:-augur}" + - "POSTGRES_PASSWORD=${AUGUR_DB_PASSWORD:-augur}" + - "PGDATA=/var/lib/postgresql/data/pgdata" + ports: + - "${AUGUR_DB_PORT:-5432}:5432" + volumes: + - augurpostgres:/var/lib/postgresql/data + + redis: + image: "redis:alpine" + ports: + - 6379:6379 + + rabbitmq: + image: augur-rabbitmq + build: + context: . + dockerfile: ./docker/rabbitmq/Dockerfile + args: + - RABBIT_MQ_DEFAULT_USER=${AUGUR_RABBITMQ_USERNAME:-augur} + - RABBIT_MQ_DEFAULT_PASSWORD=${AUGUR_RABBITMQ_PASSWORD:-password123} + - RABBIT_MQ_DEFAULT_VHOST=${AUGUR_RABBITMQ_VHOST:-augur_vhost} + # ports for amqp connections / management api + ports: + - 5671:5671 + - 5672:5672 + - 15671:15671 + - 15672:15672 + + augur: + image: augur-new:latest + build: + context: . + dockerfile: ./docker/backend/Dockerfile + volumes: + - facade:/augur/facade + restart: unless-stopped + ports: + - 5002:5000 + environment: + - "AUGUR_DB=postgresql+psycopg2://${AUGUR_DB_USER:-augur}:${AUGUR_DB_PASSWORD:-augur}@augur-db:5432/augur" + - "AUGUR_DB_SCHEMA_BUILD=1" + - "AUGUR_GITHUB_API_KEY=${AUGUR_GITHUB_API_KEY}" + - "AUGUR_GITLAB_API_KEY=${AUGUR_GITLAB_API_KEY}" + - "AUGUR_GITHUB_USERNAME=${AUGUR_GITHUB_USERNAME}" + - "AUGUR_GITLAB_USERNAME=${AUGUR_GITLAB_USERNAME}" + - REDIS_CONN_STRING=redis://redis:6379 + - RABBITMQ_CONN_STRING=amqp://${AUGUR_RABBITMQ_USERNAME:-augur}:${AUGUR_RABBITMQ_PASSWORD:-password123}@rabbitmq:5672/${AUGUR_RABBITMQ_VHOST:-augur_vhost} + depends_on: + - augur-db + - redis + - rabbitmq + +volumes: + facade: + driver: local + augurpostgres: + driver: local + + diff --git a/setup.py b/setup.py index 1a72c87b33..279496bf82 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ "distributed >= 2021.03.0", # 2022.8.1 "nltk==3.6.6", # 3.7 "h5py==3.10.0", # 3.7 - "scipy>=1.10.0", # 1.9.0 + "scipy>=1.10.0, <1.13.0", # 1.9.0 "blinker==1.4", # 1.5 "protobuf<3.22", # 4.21.5 "slack==0.0.2", # 0.0.2