Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Facade repo path changes #2186

Merged
merged 11 commits
Feb 23, 2023
14 changes: 6 additions & 8 deletions augur/tasks/git/dependency_tasks/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import requests
import json
import re
import traceback
from augur.application.db.data_parse import *
from augur.application.db.models import *
Expand Down Expand Up @@ -44,20 +45,17 @@ def generate_deps_data(session, repo_id, path):
session.logger.error(f"Could not complete generate_deps_data!\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}")


def deps_model(session, repo_id):
def deps_model(session, repo_id,repo_git,repo_group_id):
""" Data collection and storage method
"""
session.logger.info(f"This is the deps model repo: {repo_id}.")

repo_path_sql = s.sql.text("""
SELECT repo_id, CONCAT(repo_group_id || chr(47) || repo_path || repo_name) AS path
FROM repo
WHERE repo_id = :repo_id
""").bindparams(repo_id=repo_id)


result = session.execute_sql(repo_path_sql)
#result = session.execute_sql(repo_path_sql)
result = re.search(r"https:\/\/(github\.com\/[A-Za-z0-9 \- _]+\/)([A-Za-z0-9 \- _ .]+)$", repo_git).groups()

relative_repo_path = result.fetchone()[1]
relative_repo_path = f"{repo_group_id}/{result[0]}{result[1]}"
config = AugurConfig(session.logger, session)
absolute_repo_path = config.get_section("Facade")['repo_directory'] + relative_repo_path

Expand Down
2 changes: 1 addition & 1 deletion augur/tasks/git/dependency_tasks/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,6 @@ def process_dependency_metrics(repo_git):

try:
repo = execute_session_query(query,'one')
deps_model(session, repo.repo_id)
deps_model(session, repo.repo_id,repo_git,repo.repo_group_id)
except Exception as e:
session.logger.error(f"Could not complete deps_model!\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}")
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
import sqlalchemy as s
from .facade02utilitymethods import update_repo_log, trim_commit, store_working_author, trim_author
from augur.application.db.models.augur_data import *
from augur.application.db.models.augur_operations import CollectionStatus
from augur.application.db.util import execute_session_query, convert_orm_list_to_dict_list

def git_repo_initialize(session, repo_git):
Expand Down Expand Up @@ -86,27 +87,30 @@ def git_repo_initialize(session, repo_git):
session.log_activity('Info',f"Repo Name from facade05, line 93: {repo_name}")


# Check if there will be a storage path collision
query = s.sql.text("""SELECT NULL FROM repo WHERE CONCAT(repo_group_id,'/',repo_path,repo_name) = :repo_group_id
""").bindparams(repo_group_id=f"{row.repo_group_id}/{repo_relative_path}{repo_name}")

result = session.fetchall_data_from_sql_text(query)

# If there is a collision, append a slug to repo_name to yield a unique path
if len(result):
#query = s.sql.text("""SELECT NULL FROM repo WHERE CONCAT(repo_group_id,'/',repo_path,repo_name) = :repo_group_id
# """).bindparams(repo_group_id=f"{row.repo_group_id}/{repo_relative_path}{repo_name}")
#
#result = session.fetchall_data_from_sql_text(query)

slug = 1
is_collision = True
while is_collision:

if os.path.isdir(f"{repo_path}{repo_name}-{slug}"):
slug += 1
else:
is_collision = False
query = s.sql.text("""UPDATE repo SET repo_path=:pathParam,
repo_name=:nameParam WHERE repo_id=:idParam
""").bindparams(pathParam=repo_relative_path,nameParam=repo_name,idParam=row.repo_id)

repo_name = f"{repo_name}-{slug}"
session.execute_sql(query)
# Check if there will be a storage path collision
# If there is a collision, throw an error so that it updates the existing repo instead of trying
# to reclone.
if os.path.isdir(f"{repo_path}{repo_name}"):#len(result):

session.log_activity('Verbose',f"Identical repo detected, storing {git} in {repo_name}")
session.logger.error("Identical repo found in facade directory!")
statusQuery = session.query(CollectionStatus).filter(CollectionStatus.repo_id == row.repo_id)
collectionRecord = execute_session_query(statusQuery,'one')
collectionRecord.facade_status = 'Update'
collectionRecord.facade_task_id = None
session.commit()
raise FileExistsError("Repo already found in facade directory! Cannot clone. Setting repo to Update state and exiting.")

# Create the prerequisite directories
return_code = subprocess.Popen([f"mkdir -p {repo_path}"],shell=True).wait()
Expand Down
10 changes: 8 additions & 2 deletions augur/tasks/github/detect_move/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def extract_owner_and_repo_from_endpoint(key_auth, url, logger):

return splits[0], splits[-1]

def ping_github_for_repo_move(session,repo, logger):
def ping_github_for_repo_move(session,repo, logger,collection_hook='core'):

owner, name = get_owner_repo(repo.repo_git)
url = f"https://api.github.com/repos/{owner}/{name}"
Expand Down Expand Up @@ -78,7 +78,13 @@ def ping_github_for_repo_move(session,repo, logger):
statusQuery = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id)

collectionRecord = execute_session_query(statusQuery,'one')
collectionRecord.status = CollectionState.PENDING.value
if collection_hook == 'core':
collectionRecord.core_status = CollectionState.PENDING.value
collectionRecord.core_task_id = None
elif collection_hook == 'secondary':
collectionRecord.secondary_status = CollectionState.PENDING.value
collectionRecord.secondary_task_id = None

session.commit()

raise Exception("ERROR: Repo has moved! Marked repo as pending and stopped collection")
Expand Down
32 changes: 23 additions & 9 deletions augur/tasks/github/detect_move/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,34 @@


@celery.task()
def detect_github_repo_move(repo_git : str) -> None:
def detect_github_repo_move_core(repo_git : str) -> None:

from augur.tasks.init.celery_app import engine

logger = logging.getLogger(detect_github_repo_move.__name__)
logger = logging.getLogger(detect_github_repo_move_core.__name__)

logger.info(f"Starting repo_move operation with {repo_git}")
with GithubTaskSession(logger, engine) as session:
#Ping each repo with the given repo_git to make sure
#that they are still in place.
try:
query = session.query(Repo).filter(Repo.repo_git == repo_git)
repo = execute_session_query(query, 'one')
logger.info(f"Pinging repo: {repo_git}")
ping_github_for_repo_move(session, repo, logger)
except Exception as e:
logger.error(f"Could not check repo source for {repo_git}\n Reason: {e} \n Traceback: {''.join(traceback.format_exception(None, e, e.__traceback__))}")
query = session.query(Repo).filter(Repo.repo_git == repo_git)
repo = execute_session_query(query, 'one')
logger.info(f"Pinging repo: {repo_git}")
ping_github_for_repo_move(session, repo, logger)


@celery.task()
def detect_github_repo_move_secondary(repo_git : str) -> None:
    """Check whether the repo at *repo_git* has moved on GitHub.

    Looks up the Repo row matching *repo_git* and pings the GitHub API for
    its current location via ping_github_for_repo_move, using the
    'secondary' collection hook so a detected move updates the secondary
    collection status rather than the core one.
    """

    from augur.tasks.init.celery_app import engine

    task_logger = logging.getLogger(detect_github_repo_move_secondary.__name__)
    task_logger.info(f"Starting repo_move operation with {repo_git}")

    with GithubTaskSession(task_logger, engine) as session:
        # Confirm the repo recorded under this URL is still at that location.
        repo_query = session.query(Repo).filter(Repo.repo_git == repo_git)
        target_repo = execute_session_query(repo_query, 'one')

        task_logger.info(f"Pinging repo: {repo_git}")
        ping_github_for_repo_move(session, target_repo, task_logger, collection_hook='secondary')
26 changes: 16 additions & 10 deletions augur/tasks/start_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from augur.tasks.github import *
if os.environ.get('AUGUR_DOCKER_DEPLOY') != "1":
from augur.tasks.data_analysis import *
from augur.tasks.github.detect_move.tasks import detect_github_repo_move
from augur.tasks.github.detect_move.tasks import detect_github_repo_move_core, detect_github_repo_move_secondary
from augur.tasks.github.releases.tasks import collect_releases
from augur.tasks.github.repo_info.tasks import collect_repo_info
from augur.tasks.github.pull_requests.files_model.tasks import process_pull_request_files
Expand Down Expand Up @@ -145,8 +145,6 @@ def task_failed(request,exc,traceback):

query = session.query(CollectionStatus).filter(or_(core_id_match,secondary_id_match,facade_id_match))

collectionRecord = execute_session_query(query,'one')

print(f"chain: {request.chain}")
#Make sure any further execution of tasks dependent on this one stops.
try:
Expand All @@ -157,6 +155,12 @@ def task_failed(request,exc,traceback):
except Exception as e:
logger.error(f"Could not mutate request chain! \n Error: {e}")

try:
collectionRecord = execute_session_query(query,'one')
except:
#Exit if we can't find the record.
return

if collectionRecord.core_task_id == request.id:
# set status to Error in db
collectionRecord.core_status = CollectionStatus.ERROR.value
Expand Down Expand Up @@ -184,7 +188,12 @@ def prelim_phase(repo_git):

logger = logging.getLogger(prelim_phase.__name__)

return detect_github_repo_move.si(repo_git)
return detect_github_repo_move_core.si(repo_git)

def prelim_phase_secondary(repo_git):
    """Build the preliminary-phase task signature for the secondary hook.

    Returns an immutable celery signature (.si) that pings GitHub to detect
    whether the repo at *repo_git* has moved, marking the secondary
    collection hook accordingly.
    """
    # Fixed copy-paste slip: the logger was named after prelim_phase,
    # which mislabels any log records emitted under this function's name.
    logger = logging.getLogger(prelim_phase_secondary.__name__)

    return detect_github_repo_move_secondary.si(repo_git)


def primary_repo_collect_phase(repo_git):
Expand Down Expand Up @@ -214,6 +223,7 @@ def primary_repo_collect_phase(repo_git):
repo_info_task,
chain(primary_repo_jobs,secondary_repo_jobs,process_contributors.si()),
#facade_phase(logger,repo_git),
process_dependency_metrics.si(repo_git),
collect_releases.si(repo_git)
)

Expand All @@ -225,8 +235,7 @@ def secondary_repo_collect_phase(repo_git):

repo_task_group = group(
process_pull_request_files.si(repo_git),
process_pull_request_commits.si(repo_git),
process_dependency_metrics.si(repo_git)
process_pull_request_commits.si(repo_git)
)

return repo_task_group
Expand Down Expand Up @@ -421,7 +430,7 @@ def start_secondary_collection(session,max_repo,days):
secondary_enabled_phases = []

if prelim_phase.__name__ in enabled_phase_names:
secondary_enabled_phases.append(prelim_phase)
secondary_enabled_phases.append(prelim_phase_secondary)

if secondary_repo_collect_phase.__name__ in enabled_phase_names:
secondary_enabled_phases.append(secondary_repo_collect_phase)
Expand Down Expand Up @@ -469,9 +478,6 @@ def start_facade_collection(session,max_repo,days):
#Deal with secondary collection
facade_enabled_phases = []

if prelim_phase.__name__ in enabled_phase_names:
facade_enabled_phases.append(prelim_phase)

if facade_phase.__name__ in enabled_phase_names:
facade_enabled_phases.append(facade_phase)

Expand Down