Skip to content

Commit

Permalink
Merge pull request #2790 from chaoss/pr-file-patch
Browse files Browse the repository at this point in the history
PR File Patch / Various Bug Fixes
  • Loading branch information
ABrain7710 committed May 21, 2024
2 parents dd05af9 + 2241f33 commit 064a917
Show file tree
Hide file tree
Showing 25 changed files with 351 additions and 236 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Augur NEW Release v0.63.0
# Augur NEW Release v0.70.0

Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else!
The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io
Expand All @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o
## NEW RELEASE ALERT!
### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md)

Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.63.0
Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.70.0

- The `main` branch is a stable version of our new architecture, which features:
- Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks.
Expand Down
25 changes: 17 additions & 8 deletions augur/application/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from functools import update_wrapper
import os
import sys
import socket
import re
import json
import httpx

from augur.application.db.engine import DatabaseEngine
from augur.application.db import get_engine, dispose_database_engine
Expand All @@ -16,13 +16,22 @@ def test_connection(function_internet_connection):
@click.pass_context
def new_func(ctx, *args, **kwargs):
usage = re.search(r"Usage:\s(.*)\s\[OPTIONS\]", str(ctx.get_usage())).groups()[0]
try:
#try to ping google's dns server
socket.create_connection(("8.8.8.8",53))
return ctx.invoke(function_internet_connection, *args, **kwargs)
except OSError as e:
print(e)
print(f"\n\n{usage} command setup failed\nYou are not connect to the internet. Please connect to the internet to run Augur\n")
with httpx.Client() as client:
try:
_ = client.request(
method="GET", url="http://chaoss.community", timeout=10, follow_redirects=True)

return ctx.invoke(function_internet_connection, *args, **kwargs)
except (TimeoutError, httpx.TimeoutException):
print("Request timed out.")
except httpx.NetworkError:
print(f"Network Error: {httpx.NetworkError}")
except httpx.ProtocolError:
print(f"Protocol Error: {httpx.ProtocolError}")
print(f"\n\n{usage} command setup failed\n \
You are not connected to the internet.\n \
Please connect to the internet to run Augur\n \
Consider setting http_proxy variables for limited access installations.")
sys.exit()

return update_wrapper(new_func, function_internet_connection)
Expand Down
2 changes: 1 addition & 1 deletion augur/application/cli/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def determine_worker_processes(ratio,maximum):
sleep_time += 6

#20% of estimate, Maximum value of 25
secondary_num_processes = determine_worker_processes(.25, 25)
secondary_num_processes = determine_worker_processes(.25, 45)
logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}")
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary"
process_list.append(subprocess.Popen(secondary_worker.split(" ")))
Expand Down
2 changes: 1 addition & 1 deletion augur/application/cli/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def determine_worker_processes(ratio,maximum):
sleep_time += 6

#20% of estimate, Maximum value of 25
secondary_num_processes = determine_worker_processes(.25, 25)
secondary_num_processes = determine_worker_processes(.25, 45)
logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}")
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary"
process_list.append(subprocess.Popen(secondary_worker.split(" ")))
Expand Down
82 changes: 43 additions & 39 deletions augur/application/cli/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@
import re
import stat as stat_module

from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext
from augur.application.cli import (
test_connection,
test_db_connection,
with_database,
DatabaseContext,
)

from augur.application.db.session import DatabaseSession
from sqlalchemy import update
Expand All @@ -23,8 +28,9 @@

logger = logging.getLogger(__name__)


@click.group("db", short_help="Database utilities")
@click.pass_context
@click.pass_context
def cli(ctx):
ctx.obj = DatabaseContext()

Expand All @@ -36,36 +42,43 @@ def cli(ctx):
@with_database
@click.pass_context
def add_repos(ctx, filename):
"""Add repositories to Augur's database.
"""Add repositories to Augur's database.
The .csv file format should be repo_url,group_id
NOTE: The Group ID must already exist in the REPO_Groups Table.
If you want to add an entire GitHub organization, refer to the command: augur db add-github-org"""
If you want to add an entire GitHub organization, refer to the command: augur db add-github-org"""
from augur.tasks.github.util.github_task_session import GithubTaskSession
from augur.util.repo_load_controller import RepoLoadController

with GithubTaskSession(logger, engine=ctx.obj.engine) as session:

controller = RepoLoadController(session)

line_total = len(open(filename).readlines())
with open(filename) as upload_repos_file:
data = csv.reader(upload_repos_file, delimiter=",")
for row in data:

for line_num, row in enumerate(data):
repo_data = {}
repo_data["url"] = row[0]
try:
repo_data["repo_group_id"] = int(row[1])
except ValueError:
print(f"Invalid repo group_id: {row[1]} for Git url: `{repo_data['url']}`")
print(
f"Invalid repo group_id: {row[1]} for Git url: `{repo_data['url']}`"
)
continue

print(
f"Inserting repo with Git URL `{repo_data['url']}` into repo group {repo_data['repo_group_id']}")
controller.add_cli_repo(repo_data)
f"Inserting repo {line_num}/{line_total} with Git URL `{repo_data['url']}` into repo group {repo_data['repo_group_id']}"
)

succeeded, message = controller.add_cli_repo(repo_data)
if not succeeded:
logger.error(f"insert repo failed with error: {message['status']}`")
else:
logger.info(f"Repo added: {repo_data}")
print("Success")


@cli.command("get-repo-groups")
Expand Down Expand Up @@ -101,7 +114,6 @@ def add_repo_groups(ctx, filename):
Create new repo groups in Augur's database
"""
with ctx.obj.engine.begin() as connection:

df = pd.read_sql(
s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups"),
connection,
Expand All @@ -117,7 +129,6 @@ def add_repo_groups(ctx, filename):
with open(filename) as create_repo_groups_file:
data = csv.reader(create_repo_groups_file, delimiter=",")
for row in data:

# Handle case where there's a hanging empty row.
if not row:
logger.info("Skipping empty data...")
Expand All @@ -137,6 +148,7 @@ def add_repo_groups(ctx, filename):
f"Repo group with ID {row[1]} for repo group {row[1]} already exists, skipping..."
)


@cli.command("add-github-org")
@click.argument("organization_name")
@test_connection
Expand All @@ -151,29 +163,26 @@ def add_github_org(ctx, organization_name):
from augur.util.repo_load_controller import RepoLoadController

with GithubTaskSession(logger, engine=ctx.obj.engine) as session:

controller = RepoLoadController(session)

controller.add_cli_org(organization_name)


# get_db_version is a helper function to print_db_version and upgrade_db_version
def get_db_version(engine):

db_version_sql = s.sql.text(
"""
SELECT * FROM augur_operations.augur_settings WHERE setting = 'augur_data_version'
"""
)

with engine.connect() as connection:

result = int(connection.execute(db_version_sql).fetchone()[2])

engine.dispose()
return result



@cli.command("print-db-version")
@test_connection
@test_db_connection
Expand Down Expand Up @@ -252,10 +261,10 @@ def update_api_key(ctx, api_key):
)

with ctx.obj.engine.begin() as connection:

connection.execute(update_api_key_sql, api_key=api_key)
logger.info(f"Updated Augur API key to: {api_key}")


@cli.command("get-api-key")
@test_connection
@test_db_connection
Expand All @@ -282,36 +291,35 @@ def get_api_key(ctx):
def check_pgpass():
augur_db_env_var = getenv("AUGUR_DB")
if augur_db_env_var:

# gets the user, password, host, port, and database_name out of environment variable
# assumes database string of structure <beginning_of_db_string>//<user>:<password>@<host>:<port>/<database_name>
# it returns a tuple like (<user>, <password>, <host>, <port>, <database_name)
db_string_parsed = re.search(r"^.+:\/\/([a-zA-Z0-9_]+):(.+)@([a-zA-Z0-9-_~\.]+):(\d{1,5})\/([a-zA-Z0-9_-]+)", augur_db_env_var).groups()
db_string_parsed = re.search(
r"^.+:\/\/([a-zA-Z0-9_]+):(.+)@([a-zA-Z0-9-_~\.]+):(\d{1,5})\/([a-zA-Z0-9_-]+)",
augur_db_env_var,
).groups()

if db_string_parsed:

db_config = {
"user": db_string_parsed[0],
"password": db_string_parsed[1],
"host": db_string_parsed[2],
"host": db_string_parsed[2],
"port": db_string_parsed[3],
"database_name": db_string_parsed[4]
"database_name": db_string_parsed[4],
}

check_pgpass_credentials(db_config)

else:
print("Database string is invalid and cannot be used")


else:
with open("db.config.json", "r") as f:
with open("db.config.json", "r") as f:
config = json.load(f)
print(f"Config: {config}")
check_pgpass_credentials(config)



@cli.command("init-database")
@click.option("--default-db-name", default="postgres")
@click.option("--default-user", default="postgres")
Expand Down Expand Up @@ -370,22 +378,20 @@ def init_database(
f"GRANT ALL PRIVILEGES ON DATABASE {target_db_name} TO {target_user};",
)


@cli.command("reset-repo-age")
@test_connection
@test_db_connection
@with_database
@click.pass_context
def reset_repo_age(ctx):

with DatabaseSession(logger, engine=ctx.obj.engine) as session:
update_query = (
update(Repo)
.values(repo_added=datetime.now())
)
update_query = update(Repo).values(repo_added=datetime.now())

session.execute(update_query)
session.commit()


@cli.command("test-connection")
@test_connection
@test_db_connection
Expand All @@ -406,14 +412,13 @@ def run_psql_command_in_database(target_type, target):

if augur_db_environment_var:
pass
#TODO: Add functionality for environment variable
# TODO: Add functionality for environment variable
else:
with open("db.config.json", 'r') as f:
with open("db.config.json", "r") as f:
db_config = json.load(f)

host = db_config['host']
database_name = db_config['database_name']

host = db_config["host"]
database_name = db_config["database_name"]

db_conn_string = f"postgresql+psycopg2://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database_name']}"
engine = s.create_engine(db_conn_string)
Expand Down Expand Up @@ -442,7 +447,7 @@ def check_pgpass_credentials(config):

if not path.isfile(pgpass_file_path):
print("~/.pgpass does not exist, creating.")
with open(pgpass_file_path, "w+",encoding="utf-8") as _:
with open(pgpass_file_path, "w+", encoding="utf-8") as _:
chmod(pgpass_file_path, stat_module.S_IWRITE | stat_module.S_IREAD)

pgpass_file_mask = oct(os.stat(pgpass_file_path).st_mode & 0o777)
Expand All @@ -451,7 +456,7 @@ def check_pgpass_credentials(config):
print("Updating ~/.pgpass file permissions.")
chmod(pgpass_file_path, stat_module.S_IWRITE | stat_module.S_IREAD)

with open(pgpass_file_path, "a+",encoding="utf-8") as pgpass_file:
with open(pgpass_file_path, "a+", encoding="utf-8") as pgpass_file:
end = pgpass_file.tell()
pgpass_file.seek(0)

Expand All @@ -475,4 +480,3 @@ def check_pgpass_credentials(config):
pgpass_file.write(credentials_string + "\n")
else:
print("Credentials found in $HOME/.pgpass")

2 changes: 1 addition & 1 deletion augur/application/cli/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def start():

scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling"
core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h"
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=25 -n secondary:{uuid.uuid4().hex}@%h -Q secondary"
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n secondary:{uuid.uuid4().hex}@%h -Q secondary"

scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" "))
core_worker_process = subprocess.Popen(core_worker.split(" "))
Expand Down

0 comments on commit 064a917

Please sign in to comment.