-
Notifications
You must be signed in to change notification settings - Fork 847
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize secondary task recollection #2800
Changes from all commits
9167238
0e47f00
4d04633
2af7ef1
2a76f9c
6730a11
39f552b
6035218
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
from celery import chain | ||
import logging | ||
|
||
def machine_learning_phase(repo_git): | ||
def machine_learning_phase(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
from augur.tasks.data_analysis.clustering_worker.tasks import clustering_task | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
from augur.tasks.data_analysis.discourse_analysis.tasks import discourse_analysis_task | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
from augur.tasks.data_analysis.insight_worker.tasks import insight_task | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -410,7 +410,7 @@ def generate_contributor_sequence(logger,repo_git, session): | |
return insert_facade_contributors.si(repo_id) | ||
|
||
|
||
def facade_phase(repo_git): | ||
def facade_phase(repo_git, full_collection): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
logger = logging.getLogger(facade_phase.__name__) | ||
logger.info("Generating facade sequence") | ||
facade_helper = FacadeHelper(logger) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,20 +2,40 @@ | |
from augur.tasks.github.util.github_paginator import GithubPaginator | ||
from augur.application.db.models import * | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
from augur.tasks.github.util.util import get_owner_repo | ||
from augur.application.db.lib import bulk_insert_dicts, fetchall_data_from_sql_text | ||
from augur.application.db.util import execute_session_query | ||
from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs | ||
|
||
def pull_request_commits_model(repo,logger, key_auth): | ||
|
||
def pull_request_commits_model(repo_id,logger, augur_db, key_auth, full_collection=False): | ||
|
||
if full_collection: | ||
# query existing PRs and the respective url we will append the commits url to | ||
pr_url_sql = s.sql.text(""" | ||
SELECT DISTINCT pr_url, pull_requests.pull_request_id | ||
FROM pull_requests--, pull_request_meta | ||
WHERE repo_id = :repo_id | ||
""").bindparams(repo_id=repo_id) | ||
pr_urls = [] | ||
#pd.read_sql(pr_number_sql, self.db, params={}) | ||
|
||
pr_urls = augur_db.fetchall_data_from_sql_text(pr_url_sql)#session.execute_sql(pr_number_sql).fetchall() | ||
|
||
# query existing PRs and the respective url we will append the commits url to | ||
pr_url_sql = s.sql.text(""" | ||
SELECT DISTINCT pr_url, pull_requests.pull_request_id | ||
FROM pull_requests--, pull_request_meta | ||
WHERE repo_id = :repo_id | ||
""").bindparams(repo_id=repo.repo_id) | ||
pr_urls = [] | ||
#pd.read_sql(pr_number_sql, self.db, params={}) | ||
|
||
pr_urls = fetchall_data_from_sql_text(pr_url_sql) | ||
else: | ||
last_collected = get_secondary_data_last_collected(repo_id).date() | ||
prs = get_updated_prs(repo_id, last_collected) | ||
pr_urls = [pr.pr_url for pr in prs] | ||
|
||
pr_urls = [] | ||
for pr in prs: | ||
pr_urls.append({ | ||
'pr_url': pr.pr_url, | ||
'pull_request_id': pr.pull_request_id | ||
}) | ||
|
||
|
||
query = augur_db.session.query(Repo).filter(Repo.repo_id == repo_id) | ||
repo = execute_session_query(query, 'one') | ||
|
||
owner, name = get_owner_repo(repo.repo_git) | ||
|
||
task_name = f"{owner}/{name} Pr commits" | ||
|
@@ -52,7 +72,7 @@ def pull_request_commits_model(repo,logger, key_auth): | |
if len(all_data) > 0: | ||
logger.info(f"{task_name}: Inserting {len(all_data)} rows") | ||
pr_commits_natural_keys = ["pull_request_id", "repo_id", "pr_cmt_sha"] | ||
bulk_insert_dicts(logger, all_data,PullRequestCommit,pr_commits_natural_keys) | ||
augur_db.insert_data(all_data,PullRequestCommit,pr_commits_natural_keys) | ||
|
||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,18 +2,18 @@ | |
from augur.tasks.github.pull_requests.commits_model.core import * | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
from augur.tasks.init.celery_app import celery_app as celery | ||
from augur.tasks.init.celery_app import AugurSecondaryRepoCollectionTask | ||
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth | ||
from augur.tasks.github.util.github_task_session import GithubTaskManifest | ||
from augur.application.db.lib import get_repo_by_repo_git | ||
|
||
|
||
|
||
@celery.task(base=AugurSecondaryRepoCollectionTask) | ||
def process_pull_request_commits(repo_git: str) -> None: | ||
def process_pull_request_commits(repo_git: str, full_collection: bool) -> None: | ||
|
||
logger = logging.getLogger(process_pull_request_commits.__name__) | ||
|
||
repo = get_repo_by_repo_git(repo_git) | ||
|
||
key_auth = GithubRandomKeyAuth(logger) | ||
with GithubTaskManifest(logger) as manifest: | ||
|
||
pull_request_commits_model(repo, logger, key_auth) | ||
pull_request_commits_model(repo.repo_id, logger, manifest.augur_db, manifest.key_auth, full_collection) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,12 +9,11 @@ | |
from augur.tasks.github.util.util import add_key_value_pair_to_dicts, get_owner_repo | ||
from augur.application.db.models import PullRequest, Message, PullRequestReview, PullRequestLabel, PullRequestReviewer, PullRequestMeta, PullRequestAssignee, PullRequestReviewMessageRef, Contributor, Repo | ||
from augur.tasks.github.util.github_task_session import GithubTaskManifest | ||
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth | ||
from augur.application.db.lib import get_session, get_repo_by_repo_git, bulk_insert_dicts, get_pull_request_reviews_by_repo_id | ||
from augur.application.db.util import execute_session_query | ||
from ..messages.tasks import process_github_comment_contributors | ||
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth | ||
|
||
import httpx | ||
from augur.application.db.lib import get_secondary_data_last_collected, get_updated_prs | ||
|
||
from typing import Generator, List, Dict | ||
|
||
|
@@ -328,7 +327,7 @@ def collect_pull_request_review_comments(repo_git: str) -> None: | |
|
||
|
||
@celery.task(base=AugurSecondaryRepoCollectionTask) | ||
def collect_pull_request_reviews(repo_git: str) -> None: | ||
def collect_pull_request_reviews(repo_git: str, full_collection: bool) -> None: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
logger = logging.getLogger(collect_pull_request_reviews.__name__) | ||
|
||
|
@@ -339,81 +338,88 @@ def collect_pull_request_reviews(repo_git: str) -> None: | |
data_source = "Github API" | ||
|
||
repo_id = get_repo_by_repo_git(repo_git).repo_id | ||
with GithubTaskManifest(logger) as manifest: | ||
|
||
key_auth = GithubRandomKeyAuth(logger) | ||
augur_db = manifest.augur_db | ||
|
||
with get_session() as session: | ||
|
||
query = session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) | ||
prs = execute_session_query(query, 'all') | ||
query = augur_db.session.query(Repo).filter(Repo.repo_git == repo_git) | ||
repo_id = execute_session_query(query, 'one').repo_id | ||
|
||
pr_count = len(prs) | ||
if full_collection: | ||
|
||
all_pr_reviews = {} | ||
for index, pr in enumerate(prs): | ||
query = augur_db.session.query(PullRequest).filter(PullRequest.repo_id == repo_id).order_by(PullRequest.pr_src_number) | ||
prs = execute_session_query(query, 'all') | ||
else: | ||
last_collected = get_secondary_data_last_collected(repo_id).date() | ||
prs = get_updated_prs(repo_id, last_collected) | ||
|
||
pr_number = pr.pr_src_number | ||
pull_request_id = pr.pull_request_id | ||
pr_count = len(prs) | ||
|
||
logger.info(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}") | ||
all_pr_reviews = {} | ||
for index, pr in enumerate(prs): | ||
|
||
pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews" | ||
pr_number = pr.pr_src_number | ||
pull_request_id = pr.pull_request_id | ||
|
||
pr_reviews = [] | ||
pr_reviews_generator = GithubPaginator(pr_review_url, key_auth, logger) | ||
for page_data, page in pr_reviews_generator.iter_pages(): | ||
|
||
if page_data is None: | ||
break | ||
|
||
if len(page_data) == 0: | ||
break | ||
|
||
if isinstance(page_data, list): | ||
page_data = [ | ||
element.decode('utf-8').replace('\x00', ' ') if isinstance(element, bytes) else element | ||
for element in page_data | ||
] | ||
logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.") | ||
elif isinstance(page_data, bytes): | ||
page_data = page_data.decode('utf-8').replace('\x00', ' ') | ||
logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.") | ||
logger.info(f"{owner}/{repo} Collecting Pr Reviews for pr {index + 1} of {pr_count}") | ||
|
||
pr_review_url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/reviews" | ||
|
||
pr_reviews = [] | ||
pr_reviews_generator = GithubPaginator(pr_review_url, manifest.key_auth, logger) | ||
for page_data, page in pr_reviews_generator.iter_pages(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
if page_data is None: | ||
break | ||
|
||
if len(page_data) == 0: | ||
break | ||
|
||
if isinstance(page_data, list): | ||
page_data = [ | ||
element.decode('utf-8').replace('\x00', ' ') if isinstance(element, bytes) else element | ||
for element in page_data | ||
] | ||
logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
elif isinstance(page_data, bytes): | ||
page_data = page_data.decode('utf-8').replace('\x00', ' ') | ||
logger.info(f"NUL characters were found in PR Reviews and replaced with spaces.") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
|
||
pr_reviews.extend(page_data) | ||
|
||
pr_reviews.extend(page_data) | ||
|
||
if pr_reviews: | ||
all_pr_reviews[pull_request_id] = pr_reviews | ||
if pr_reviews: | ||
all_pr_reviews[pull_request_id] = pr_reviews | ||
|
||
if not list(all_pr_reviews.keys()): | ||
logger.info(f"{owner}/{repo} No pr reviews for repo") | ||
return | ||
if not list(all_pr_reviews.keys()): | ||
logger.info(f"{owner}/{repo} No pr reviews for repo") | ||
return | ||
|
||
contributors = [] | ||
for pull_request_id in all_pr_reviews.keys(): | ||
contributors = [] | ||
for pull_request_id in all_pr_reviews.keys(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
reviews = all_pr_reviews[pull_request_id] | ||
for review in reviews: | ||
contributor = process_pull_request_review_contributor(review, tool_source, tool_version, data_source) | ||
if contributor: | ||
contributors.append(contributor) | ||
reviews = all_pr_reviews[pull_request_id] | ||
for review in reviews: | ||
contributor = process_pull_request_review_contributor(review, tool_source, tool_version, data_source) | ||
if contributor: | ||
contributors.append(contributor) | ||
|
||
logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") | ||
bulk_insert_dicts(logger, contributors, Contributor, ["cntrb_id"]) | ||
logger.info(f"{owner}/{repo} Pr reviews: Inserting {len(contributors)} contributors") | ||
augur_db.insert_data(contributors, Contributor, ["cntrb_id"]) | ||
|
||
|
||
pr_reviews = [] | ||
for pull_request_id in all_pr_reviews.keys(): | ||
pr_reviews = [] | ||
for pull_request_id in all_pr_reviews.keys(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [pylint] reported by reviewdog 🐶 |
||
|
||
reviews = all_pr_reviews[pull_request_id] | ||
for review in reviews: | ||
|
||
if "cntrb_id" in review: | ||
pr_reviews.append(extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, tool_source, tool_version)) | ||
reviews = all_pr_reviews[pull_request_id] | ||
for review in reviews: | ||
if "cntrb_id" in review: | ||
pr_reviews.append(extract_needed_pr_review_data(review, pull_request_id, repo_id, platform_id, tool_source, tool_version)) | ||
|
||
logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") | ||
pr_review_natural_keys = ["pr_review_src_id",] | ||
bulk_insert_dicts(logger, pr_reviews, PullRequestReview, pr_review_natural_keys) | ||
logger.info(f"{owner}/{repo}: Inserting pr reviews of length: {len(pr_reviews)}") | ||
pr_review_natural_keys = ["pr_review_src_id",] | ||
augur_db.insert_data(pr_reviews, PullRequestReview, pr_review_natural_keys) | ||
|
||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[pylint] reported by reviewdog 🐶
C0114: Missing module docstring (missing-module-docstring)