Skip to content

Commit

Permalink
Merge pull request #2790 from chaoss/pr-file-patch
Browse files Browse the repository at this point in the history
PR File Patch / Various Bug Fixes
  • Loading branch information
ABrain7710 committed May 21, 2024
2 parents dd05af9 + 2241f33 commit 064a917
Show file tree
Hide file tree
Showing 25 changed files with 351 additions and 236 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Augur NEW Release v0.63.0
# Augur NEW Release v0.70.0

Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data. Less data carpentry for everyone else!
The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot) ... A public instance of 8Knot is available at https://metrix.chaoss.io ... That is tied to a public instance of Augur at https://ai.chaoss.io
Expand All @@ -10,7 +10,7 @@ The primary way of looking at Augur data is through [8Knot](https://github.com/o
## NEW RELEASE ALERT!
### [If you want to jump right in, updated docker build/compose and bare metal installation instructions are available here](docs/new-install.md)

Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.63.0
Augur is now releasing a dramatically improved new version to the main branch. It is also available here: https://github.com/chaoss/augur/releases/tag/v0.70.0

- The `main` branch is a stable version of our new architecture, which features:
- Dramatic improvement in the speed of large scale data collection (100,000+ repos). All data is obtained for 100k+ repos within 2 weeks.
Expand Down
25 changes: 17 additions & 8 deletions augur/application/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from functools import update_wrapper
import os
import sys
import socket
import re
import json
import httpx

from augur.application.db.engine import DatabaseEngine
from augur.application.db import get_engine, dispose_database_engine
Expand All @@ -16,13 +16,22 @@ def test_connection(function_internet_connection):
@click.pass_context
def new_func(ctx, *args, **kwargs):
usage = re.search(r"Usage:\s(.*)\s\[OPTIONS\]", str(ctx.get_usage())).groups()[0]
try:
#try to ping google's dns server
socket.create_connection(("8.8.8.8",53))
return ctx.invoke(function_internet_connection, *args, **kwargs)
except OSError as e:
print(e)
print(f"\n\n{usage} command setup failed\nYou are not connect to the internet. Please connect to the internet to run Augur\n")
with httpx.Client() as client:
try:
_ = client.request(
method="GET", url="http://chaoss.community", timeout=10, follow_redirects=True)

return ctx.invoke(function_internet_connection, *args, **kwargs)
except (TimeoutError, httpx.TimeoutException):
print("Request timed out.")
except httpx.NetworkError:
print(f"Network Error: {httpx.NetworkError}")
except httpx.ProtocolError:
print(f"Protocol Error: {httpx.ProtocolError}")
print(f"\n\n{usage} command setup failed\n \
You are not connected to the internet.\n \
Please connect to the internet to run Augur\n \
Consider setting http_proxy variables for limited access installations.")
sys.exit()

return update_wrapper(new_func, function_internet_connection)
Expand Down
2 changes: 1 addition & 1 deletion augur/application/cli/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def determine_worker_processes(ratio,maximum):
sleep_time += 6

#20% of estimate, Maximum value of 25
secondary_num_processes = determine_worker_processes(.25, 25)
secondary_num_processes = determine_worker_processes(.25, 45)
logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}")
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary"
process_list.append(subprocess.Popen(secondary_worker.split(" ")))
Expand Down
2 changes: 1 addition & 1 deletion augur/application/cli/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def determine_worker_processes(ratio,maximum):
sleep_time += 6

#20% of estimate, Maximum value of 25
secondary_num_processes = determine_worker_processes(.25, 25)
secondary_num_processes = determine_worker_processes(.25, 45)
logger.info(f"Starting secondary worker processes with concurrency={secondary_num_processes}")
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency={secondary_num_processes} -n secondary:{uuid.uuid4().hex}@%h -Q secondary"
process_list.append(subprocess.Popen(secondary_worker.split(" ")))
Expand Down
82 changes: 43 additions & 39 deletions augur/application/cli/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@
import re
import stat as stat_module

from augur.application.cli import test_connection, test_db_connection, with_database, DatabaseContext
from augur.application.cli import (
test_connection,
test_db_connection,
with_database,
DatabaseContext,
)

from augur.application.db.session import DatabaseSession
from sqlalchemy import update
Expand All @@ -23,8 +28,9 @@

logger = logging.getLogger(__name__)


@click.group("db", short_help="Database utilities")
@click.pass_context
@click.pass_context
def cli(ctx):
ctx.obj = DatabaseContext()

Expand All @@ -36,36 +42,43 @@ def cli(ctx):
@with_database
@click.pass_context
def add_repos(ctx, filename):
"""Add repositories to Augur's database.
"""Add repositories to Augur's database.
The .csv file format should be repo_url,group_id
NOTE: The Group ID must already exist in the REPO_Groups Table.
If you want to add an entire GitHub organization, refer to the command: augur db add-github-org"""
If you want to add an entire GitHub organization, refer to the command: augur db add-github-org"""
from augur.tasks.github.util.github_task_session import GithubTaskSession
from augur.util.repo_load_controller import RepoLoadController

with GithubTaskSession(logger, engine=ctx.obj.engine) as session:

controller = RepoLoadController(session)

line_total = len(open(filename).readlines())
with open(filename) as upload_repos_file:
data = csv.reader(upload_repos_file, delimiter=",")
for row in data:

for line_num, row in enumerate(data):
repo_data = {}
repo_data["url"] = row[0]
try:
repo_data["repo_group_id"] = int(row[1])
except ValueError:
print(f"Invalid repo group_id: {row[1]} for Git url: `{repo_data['url']}`")
print(
f"Invalid repo group_id: {row[1]} for Git url: `{repo_data['url']}`"
)
continue

print(
f"Inserting repo with Git URL `{repo_data['url']}` into repo group {repo_data['repo_group_id']}")
controller.add_cli_repo(repo_data)
f"Inserting repo {line_num}/{line_total} with Git URL `{repo_data['url']}` into repo group {repo_data['repo_group_id']}"
)

succeeded, message = controller.add_cli_repo(repo_data)
if not succeeded:
logger.error(f"insert repo failed with error: {message['status']}`")
else:
logger.info(f"Repo added: {repo_data}")
print("Success")


@cli.command("get-repo-groups")
Expand Down Expand Up @@ -101,7 +114,6 @@ def add_repo_groups(ctx, filename):
Create new repo groups in Augur's database
"""
with ctx.obj.engine.begin() as connection:

df = pd.read_sql(
s.sql.text("SELECT repo_group_id FROM augur_data.repo_groups"),
connection,
Expand All @@ -117,7 +129,6 @@ def add_repo_groups(ctx, filename):
with open(filename) as create_repo_groups_file:
data = csv.reader(create_repo_groups_file, delimiter=",")
for row in data:

# Handle case where there's a hanging empty row.
if not row:
logger.info("Skipping empty data...")
Expand All @@ -137,6 +148,7 @@ def add_repo_groups(ctx, filename):
f"Repo group with ID {row[1]} for repo group {row[1]} already exists, skipping..."
)


@cli.command("add-github-org")
@click.argument("organization_name")
@test_connection
Expand All @@ -151,29 +163,26 @@ def add_github_org(ctx, organization_name):
from augur.util.repo_load_controller import RepoLoadController

with GithubTaskSession(logger, engine=ctx.obj.engine) as session:

controller = RepoLoadController(session)

controller.add_cli_org(organization_name)


# get_db_version is a helper function to print_db_version and upgrade_db_version
def get_db_version(engine):

db_version_sql = s.sql.text(
"""
SELECT * FROM augur_operations.augur_settings WHERE setting = 'augur_data_version'
"""
)

with engine.connect() as connection:

result = int(connection.execute(db_version_sql).fetchone()[2])

engine.dispose()
return result



@cli.command("print-db-version")
@test_connection
@test_db_connection
Expand Down Expand Up @@ -252,10 +261,10 @@ def update_api_key(ctx, api_key):
)

with ctx.obj.engine.begin() as connection:

connection.execute(update_api_key_sql, api_key=api_key)
logger.info(f"Updated Augur API key to: {api_key}")


@cli.command("get-api-key")
@test_connection
@test_db_connection
Expand All @@ -282,36 +291,35 @@ def get_api_key(ctx):
def check_pgpass():
augur_db_env_var = getenv("AUGUR_DB")
if augur_db_env_var:

# gets the user, password, host, port, and database_name out of environment variable
# assumes database string of structure <beginning_of_db_string>//<user>:<password>@<host>:<port>/<database_name>
# it returns a tuple like (<user>, <password>, <host>, <port>, <database_name)
db_string_parsed = re.search(r"^.+:\/\/([a-zA-Z0-9_]+):(.+)@([a-zA-Z0-9-_~\.]+):(\d{1,5})\/([a-zA-Z0-9_-]+)", augur_db_env_var).groups()
db_string_parsed = re.search(
r"^.+:\/\/([a-zA-Z0-9_]+):(.+)@([a-zA-Z0-9-_~\.]+):(\d{1,5})\/([a-zA-Z0-9_-]+)",
augur_db_env_var,
).groups()

if db_string_parsed:

db_config = {
"user": db_string_parsed[0],
"password": db_string_parsed[1],
"host": db_string_parsed[2],
"host": db_string_parsed[2],
"port": db_string_parsed[3],
"database_name": db_string_parsed[4]
"database_name": db_string_parsed[4],
}

check_pgpass_credentials(db_config)

else:
print("Database string is invalid and cannot be used")


else:
with open("db.config.json", "r") as f:
with open("db.config.json", "r") as f:
config = json.load(f)
print(f"Config: {config}")
check_pgpass_credentials(config)



@cli.command("init-database")
@click.option("--default-db-name", default="postgres")
@click.option("--default-user", default="postgres")
Expand Down Expand Up @@ -370,22 +378,20 @@ def init_database(
f"GRANT ALL PRIVILEGES ON DATABASE {target_db_name} TO {target_user};",
)


@cli.command("reset-repo-age")
@test_connection
@test_db_connection
@with_database
@click.pass_context
def reset_repo_age(ctx):

with DatabaseSession(logger, engine=ctx.obj.engine) as session:
update_query = (
update(Repo)
.values(repo_added=datetime.now())
)
update_query = update(Repo).values(repo_added=datetime.now())

session.execute(update_query)
session.commit()


@cli.command("test-connection")
@test_connection
@test_db_connection
Expand All @@ -406,14 +412,13 @@ def run_psql_command_in_database(target_type, target):

if augur_db_environment_var:
pass
#TODO: Add functionality for environment variable
# TODO: Add functionality for environment variable
else:
with open("db.config.json", 'r') as f:
with open("db.config.json", "r") as f:
db_config = json.load(f)

host = db_config['host']
database_name = db_config['database_name']

host = db_config["host"]
database_name = db_config["database_name"]

db_conn_string = f"postgresql+psycopg2://{db_config['user']}:{db_config['password']}@{db_config['host']}:{db_config['port']}/{db_config['database_name']}"
engine = s.create_engine(db_conn_string)
Expand Down Expand Up @@ -442,7 +447,7 @@ def check_pgpass_credentials(config):

if not path.isfile(pgpass_file_path):
print("~/.pgpass does not exist, creating.")
with open(pgpass_file_path, "w+",encoding="utf-8") as _:
with open(pgpass_file_path, "w+", encoding="utf-8") as _:
chmod(pgpass_file_path, stat_module.S_IWRITE | stat_module.S_IREAD)

pgpass_file_mask = oct(os.stat(pgpass_file_path).st_mode & 0o777)
Expand All @@ -451,7 +456,7 @@ def check_pgpass_credentials(config):
print("Updating ~/.pgpass file permissions.")
chmod(pgpass_file_path, stat_module.S_IWRITE | stat_module.S_IREAD)

with open(pgpass_file_path, "a+",encoding="utf-8") as pgpass_file:
with open(pgpass_file_path, "a+", encoding="utf-8") as pgpass_file:
end = pgpass_file.tell()
pgpass_file.seek(0)

Expand All @@ -475,4 +480,3 @@ def check_pgpass_credentials(config):
pgpass_file.write(credentials_string + "\n")
else:
print("Credentials found in $HOME/.pgpass")

2 changes: 1 addition & 1 deletion augur/application/cli/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def start():

scheduling_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=1 -n scheduling:{uuid.uuid4().hex}@%h -Q scheduling"
core_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n core:{uuid.uuid4().hex}@%h"
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=25 -n secondary:{uuid.uuid4().hex}@%h -Q secondary"
secondary_worker = f"celery -A augur.tasks.init.celery_app.celery_app worker -l info --concurrency=45 -n secondary:{uuid.uuid4().hex}@%h -Q secondary"

scheduling_worker_process = subprocess.Popen(scheduling_worker.split(" "))
core_worker_process = subprocess.Popen(core_worker.split(" "))
Expand Down

0 comments on commit 064a917

Please sign in to comment.