-
Notifications
You must be signed in to change notification settings - Fork 847
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize secondary task recollection #2800
Changes from 7 commits
9167238
0e47f00
4d04633
2af7ef1
2a76f9c
6730a11
39f552b
6035218
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
from celery import chain | ||
import logging | ||
|
||
def machine_learning_phase(repo_git): | ||
def machine_learning_phase(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
from augur.tasks.data_analysis.clustering_worker.tasks import clustering_task | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
from augur.tasks.data_analysis.discourse_analysis.tasks import discourse_analysis_task | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
from augur.tasks.data_analysis.insight_worker.tasks import insight_task | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -410,7 +410,7 @@ def generate_contributor_sequence(logger,repo_git, session): | |
return insert_facade_contributors.si(repo_id) | ||
|
||
|
||
def facade_phase(repo_git): | ||
def facade_phase(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
logger = logging.getLogger(facade_phase.__name__) | ||
logger.info("Generating facade sequence") | ||
facade_helper = FacadeHelper(logger) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,20 +2,40 @@ | |
from augur.tasks.github.util.github_paginator import GithubPaginator | ||
from augur.application.db.models import * | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
from augur.tasks.github.util.util import get_owner_repo | ||
from augur.application.db.lib import bulk_insert_dicts, fetchall_data_from_sql_text | ||
from augur.application.db.util import execute_session_query | ||
from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs | ||
|
||
def pull_request_commits_model(repo,logger, key_auth): | ||
|
||
def pull_request_commits_model(repo_id,logger, augur_db, key_auth, full_collection=False): | ||
|
||
if full_collection: | ||
# query existing PRs and the respective url we will append the commits url to | ||
pr_url_sql = s.sql.text(""" | ||
SELECT DISTINCT pr_url, pull_requests.pull_request_id | ||
FROM pull_requests--, pull_request_meta | ||
WHERE repo_id = :repo_id | ||
""").bindparams(repo_id=repo_id) | ||
pr_urls = [] | ||
#pd.read_sql(pr_number_sql, self.db, params={}) | ||
|
||
pr_urls = augur_db.fetchall_data_from_sql_text(pr_url_sql)#session.execute_sql(pr_number_sql).fetchall() | ||
|
||
# query existing PRs and the respective url we will append the commits url to | ||
pr_url_sql = s.sql.text(""" | ||
SELECT DISTINCT pr_url, pull_requests.pull_request_id | ||
FROM pull_requests--, pull_request_meta | ||
WHERE repo_id = :repo_id | ||
""").bindparams(repo_id=repo.repo_id) | ||
pr_urls = [] | ||
#pd.read_sql(pr_number_sql, self.db, params={}) | ||
|
||
pr_urls = fetchall_data_from_sql_text(pr_url_sql) | ||
else: | ||
last_collected = get_secondary_data_last_collected(repo_id).date() | ||
prs = get_updated_prs(repo_id, last_collected) | ||
pr_urls = [pr.pr_url for pr in prs] | ||
|
||
pr_urls = [] | ||
for pr in prs: | ||
pr_urls.append({ | ||
'pr_url': pr.pr_url, | ||
'pull_request_id': pr.pull_request_id | ||
}) | ||
|
||
|
||
query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) | ||
repo = execute_session_query(query, 'one') | ||
|
||
owner, name = get_owner_repo(repo.repo_git) | ||
|
||
task_name = f"{owner}/{name} Pr commits" | ||
|
@@ -53,6 +73,7 @@ def pull_request_commits_model(repo,logger, key_auth): | |
logger.info(f"{task_name}: Inserting {len(all_data)} rows") | ||
pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] | ||
bulk_insert_dicts(logger, all_data,PullRequestCommit,pr_commits_natural_keys) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
augur_db.insert_data(all_data,PullRequestCommit,pr_commits_natural_keys) | ||
|
||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,9 +12,7 @@ | |
from augur.application.db.lib import get_session, get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id | ||
from augur.application.db.util import execute_session_query | ||
from ..messages.tasks import process_github_comment_contributors | ||
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth | ||
|
||
import httpx | ||
from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs | ||
|
||
from typing import Generator, List, Dict | ||
|
||
|
@@ -328,7 +326,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: | |
|
||
|
||
@celery.task(base=AugurSecondaryRepoCollectionTask) | ||
def collect_pull_request_reviews(repo_git: str) -> None: | ||
def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
logger = logging.getLogger(collect_pull_request_reviews.__name__) | ||
|
||
|
@@ -339,13 +337,20 @@ def collect_pull_request_reviews(repo_git: str) -> None: | |
data_source = "Github API" | ||
|
||
repo_id = get_repo_by_repo_git(repo_git).repo_id | ||
with GithubTaskManifest(logger) as manifest: | ||
|
||
key_auth = GithubRandomKeyAuth(logger) | ||
augur_db = manifest.augur_db | ||
|
||
with get_session() as session: | ||
|
||
query = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) | ||
prs = execute_session_query(query, 'all') | ||
query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) | ||
repo_id = execute_session_query(query, 'one').repo_id | ||
|
||
if full_collection: | ||
|
||
query = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) | ||
prs = execute_session_query(query, 'all') | ||
else: | ||
last_collected = get_secondary_data_last_collected(repo_id).date() | ||
prs = get_updated_prs(repo_id, last_collected) | ||
|
||
pr_count = len(prs) | ||
|
||
|
@@ -398,8 +403,8 @@ def collect_pull_request_reviews(repo_git: str) -> None: | |
if contributor: | ||
contributors.append(contributor) | ||
|
||
logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") | ||
bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) | ||
logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") | ||
augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) | ||
|
||
|
||
pr_reviews = [] | ||
|
@@ -411,9 +416,9 @@ def collect_pull_request_reviews(repo_git: str) -> None: | |
if "cntrb_id" in review: | ||
pr_reviews.append(extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, tool_source, tool_version)) | ||
|
||
logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") | ||
pr_review_natural_keys = ["pr_review_src_id",] | ||
bulk_insert_dicts(logger, pr_reviews, PullRequestReview, pr_review_natural_keys) | ||
logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") | ||
pr_review_natural_keys = ["pr_review_src_id",] | ||
augur_db.insert_data(pr_reviews, PullRequestReview, pr_review_natural_keys) | ||
|
||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,21 +43,21 @@ | |
""" | ||
|
||
#Prelim phases are used to detect whether the location where the repo is hosted has moved. | ||
def prelim_phase(repo_git): | ||
def prelim_phase(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
logger = logging.getLogger(prelim_phase.__name__) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
return detect_github_repo_move_core.si(repo_git) | ||
|
||
def prelim_phase_secondary(repo_git): | ||
def prelim_phase_secondary(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
logger = logging.getLogger(prelim_phase.__name__) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
return detect_github_repo_move_secondary.si(repo_git) | ||
|
||
|
||
#This is the phase that defines the message for core augur collection | ||
#A chain is needed for each repo. | ||
def primary_repo_collect_phase(repo_git): | ||
def primary_repo_collect_phase(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
logger = logging.getLogger(primary_repo_collect_phase.__name__) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
|
||
|
@@ -86,7 +86,7 @@ def primary_repo_collect_phase(repo_git): | |
|
||
return repo_task_group | ||
|
||
def primary_repo_collect_phase_gitlab(repo_git): | ||
def primary_repo_collect_phase_gitlab(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
logger = logging.getLogger(primary_repo_collect_phase_gitlab.__name__) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
|
@@ -110,13 +110,13 @@ def primary_repo_collect_phase_gitlab(repo_git): | |
|
||
#This phase creates the message for secondary collection tasks. | ||
#These are less important and have their own worker. | ||
def secondary_repo_collect_phase(repo_git): | ||
def secondary_repo_collect_phase(repo_git, full_collection): | ||
logger = logging.getLogger(secondary_repo_collect_phase.__name__) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
repo_task_group = group( | ||
process_pull_request_files.si(repo_git), | ||
process_pull_request_commits.si(repo_git), | ||
chain(collect_pull_request_reviews.si(repo_git), collect_pull_request_review_comments.si(repo_git)), | ||
process_pull_request_files.si(repo_git, full_collection), | ||
process_pull_request_commits.si(repo_git, full_collection), | ||
chain(collect_pull_request_reviews.si(repo_git, full_collection), collect_pull_request_review_comments.si(repo_git)), | ||
process_ossf_dependency_metrics.si(repo_git) | ||
) | ||
|
||
|
@@ -167,7 +167,7 @@ def build_primary_repo_collect_request(session, logger, enabled_phase_names, day | |
primary_gitlab_enabled_phases.append(primary_repo_collect_phase_gitlab) | ||
|
||
#task success is scheduled no matter what the config says. | ||
def core_task_success_util_gen(repo_git): | ||
def core_task_success_util_gen(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
return core_task_success_util.si(repo_git) | ||
|
||
primary_enabled_phases.append(core_task_success_util_gen) | ||
|
@@ -187,7 +187,7 @@ def build_secondary_repo_collect_request(session, logger, enabled_phase_names, d | |
|
||
secondary_enabled_phases.append(secondary_repo_collect_phase) | ||
|
||
def secondary_task_success_util_gen(repo_git): | ||
def secondary_task_success_util_gen(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
return secondary_task_success_util.si(repo_git) | ||
|
||
secondary_enabled_phases.append(secondary_task_success_util_gen) | ||
|
@@ -203,12 +203,12 @@ def build_facade_repo_collect_request(session, logger, enabled_phase_names, days | |
|
||
facade_enabled_phases.append(facade_phase) | ||
|
||
def facade_task_success_util_gen(repo_git): | ||
def facade_task_success_util_gen(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
return facade_task_success_util.si(repo_git) | ||
|
||
facade_enabled_phases.append(facade_task_success_util_gen) | ||
|
||
def facade_task_update_weight_util_gen(repo_git): | ||
def facade_task_update_weight_util_gen(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
return git_update_commit_count_weight.si(repo_git) | ||
|
||
facade_enabled_phases.append(facade_task_update_weight_util_gen) | ||
|
@@ -223,7 +223,7 @@ def build_ml_repo_collect_request(session, logger, enabled_phase_names, days_unt | |
|
||
ml_enabled_phases.append(machine_learning_phase) | ||
|
||
def ml_task_success_util_gen(repo_git): | ||
def ml_task_success_util_gen(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
return ml_task_success_util.si(repo_git) | ||
|
||
ml_enabled_phases.append(ml_task_success_util_gen) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[pylint] reported by reviewdog 🐶
C0114: Missing module docstring (missing-module-docstring)