diff --git a/.gitignore b/.gitignore index 5cb68e33..bd8dc7dd 100755 --- a/.gitignore +++ b/.gitignore @@ -76,7 +76,7 @@ fil* summary_* # Environment variables -.env +.env* env.sh # PyCharm @@ -88,8 +88,6 @@ packaged.yaml samconfig.toml # Project files - -## Miscellaneous change-batch.json.tmp _cache/ tags diff --git a/Makefile b/Makefile index 782ec6d5..78c0929c 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,14 @@ # Application specific environment variables can replace variables declared in the Makefile with `?=` assignment. -include .env -# TODO use environment-specific .env, ie., .env.dev, .env.prod, etc. ==> make deploy STAGE=dev -# include .env.${STAGE} +# Environment variables +# include .env # Optional, include STAGE and AWS_PROFILE +include .env.${STAGE} export -# Base settings, these should almost never change -export AWS_ACCOUNT ?= $(shell aws sts get-caller-identity --query Account --output text) +export AWS_ACCOUNT ?= $(shell aws sts get-caller-identity \ + --query Account \ + --output text) + export ROOT_DIR := $(shell pwd) export DATABASE_DIR := ${ROOT_DIR}/${APP_NAME}/database export INFRA_DIR := ${ROOT_DIR}/${APP_NAME}/infrastructure @@ -30,12 +32,62 @@ export INSTANCE_ID := $(shell aws ssm get-parameters \ --output json \ | jq -r '.Parameters[0].Value') +# S3 paths +export PIPELINE_STATE_PATH := config/IMGTHLA-repository-state.json +export PIPELINE_PARAMS_PATH := config/pipeline-input.json +export FUNCTIONS_PATH := ${APP_NAME}/pipeline/functions + +# print colors +define blue + @tput setaf 4 + @echo $1 + @tput sgr0 +endef + +define green + @tput setaf 2 + @echo $1 + @tput sgr0 +endef + +define yellow + @tput setaf 3 + @echo $1 + @tput sgr0 +endef + +define red + @tput setaf 1 + @echo $1 + @tput sgr0 +endef + target: $(info ${HELP_MESSAGE}) @exit 0 -deploy: logs.purge check.env ##=> Deploy services +splash-screen: + @echo "\033[0;34m " + @echo "\033[0;34m ____ __ __ " + @echo "\033[0;34m ____ _ / __/___ ____/ // /_ " + @echo "\033[0;32m / __ \`// /_ / _ \ ______ / __ // __ \\" + @echo "\033[0;32m / /_/ // __// __/ /_____/ / /_/ // /_/ /" + @echo "\033[0;34m \__, //_/ \___/ \____//_____/ " + @echo "\033[0;34m/____/ \033[0m" + @echo "\033[0;34m \033[0m" + + +env.print: + @echo "\033[0;33mReview the contents of the .env file:\033[0m" + @echo "+---------------------------------------------------------------------------------+" + @awk '{ if (substr($$0, 1, 1) != "#") { line = substr($$0, 1, 76); if (length($$0) > 76) line = line "..."; printf "| %-79s |\n", line }}' .env.${STAGE} + @echo "+---------------------------------------------------------------------------------+" + @echo "\033[0;33mPlease confirm the above values are correct.\033[0m" + +deploy: splash-screen logs.purge env.validate.stage env.validate ##=> Deploy all services @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deploying ${APP_NAME} to ${AWS_ACCOUNT}" 2>&1 | tee -a ${CFN_LOG_PATH} + $(MAKE) env.print + @echo "Deploy stack to the \`${STAGE}\` environment? [y/N] \c " && read ans && [ $${ans:-N} = y ] $(MAKE) infrastructure.deploy $(MAKE) database.deploy $(MAKE) pipeline.deploy @@ -52,33 +104,6 @@ logs.dirs: "${LOGS_DIR}/pipeline/load" \ "${LOGS_DIR}/database/bootstrap" || true -check.env: check.dependencies -ifndef AWS_REGION -$(error AWS_REGION is not set. Please add AWS_REGION to the environment variables.) -endif -ifndef AWS_PROFILE -$(error AWS_PROFILE is not set. Please select an AWS profile to use.) -endif -ifndef GITHUB_REPOSITORY_OWNER -$(error GITHUB_REPOSITORY_OWNER is not set. 
Please add GITHUB_REPOSITORY_OWNER to the environment variables.) -endif -ifndef GITHUB_REPOSITORY_NAME -$(error GITHUB_REPOSITORY_NAME is not set. Please add GITHUB_REPOSITORY_NAME to the environment variables.) -endif -ifndef GITHUB_PERSONAL_ACCESS_TOKEN -$(error GITHUB_PERSONAL_ACCESS_TOKEN is not set. Please add GITHUB_PERSONAL_ACCESS_TOKEN to the environment variables.) -endif -ifndef CONFIG_S3_PATH -$(error CONFIG_S3_PATH is not set. Please add CONFIG_S3_PATH to the environment variables.) -endif -ifndef HOST_DOMAIN -$(error HOST_DOMAIN is not set. Please add HOST_DOMAIN to the environment variables.) -endif -ifndef ADMIN_EMAIL -$(error ADMIN_EMAIL is not set. Please add ADMIN_EMAIL to the environment variables.) -endif - @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Found environment variables" 2>&1 | tee -a ${CFN_LOG_PATH} - check.dependencies: $(MAKE) check.dependencies.docker $(MAKE) check.dependencies.awscli @@ -113,8 +138,71 @@ check.dependencies.jq: exit 1; \ fi -# Deploy specific stacks -infrastructure.deploy: +# TODO use cloudformation list-stacks as alternative to SSM parameter +env.validate.stage: + @res=$$(aws ssm get-parameters \ + --names "/${APP_NAME}/${STAGE}/${AWS_REGION}/Stage" \ + --output json \ + | jq -r '.Parameters[0].Value') && \ + echo "Found stage: $${res}" && \ + if [ "$${res}" = "null" ]; then \ + echo "\033[0;32m**** Starting new deployment. ****\033[0m"; \ + elif [ "$${res}" = "${STAGE}" ]; then \ + echo "\033[0;32m**** Found existing deployment for \`${STAGE}\` ****\033[0m"; \ + else \ + echo "\033[0;31m**** STAGE mismatch or bad credential configuration. ****\033[0m" && \ + echo "\033[0;31m**** Please refer to the documentation for a list of prerequisites. ****\033[0m" && \ + exit 1; \ + fi + +env.validate.no-vpc: +ifeq ($(VPC_ID),) + $(call red, "VPC_ID must be set as an environment variable when \`CREATE_VPC\` is false") + @exit 1 +else + $(call green, "Found VPC_ID: ${VPC_ID}") +endif +ifeq ($(PUBLIC_SUBNET_ID),) + $(call red, "PUBLIC_SUBNET_ID must be set as an environment variable when \`CREATE_VPC\` is false") + @exit 1 +else + $(call green, "Found PUBLIC_SUBNET_ID: ${PUBLIC_SUBNET_ID}") +endif + +env.validate: check.dependencies +ifndef AWS_ACCOUNT + $(error AWS_ACCOUNT is not set. Please add AWS_ACCOUNT to the environment variables.) +endif +ifndef AWS_REGION + $(error AWS_REGION is not set. Please add AWS_REGION to the environment variables.) +endif +ifndef AWS_PROFILE + $(error AWS_PROFILE is not set. Please select an AWS profile to use.) +endif +ifndef GITHUB_PERSONAL_ACCESS_TOKEN + $(error GITHUB_PERSONAL_ACCESS_TOKEN is not set. Please add GITHUB_PERSONAL_ACCESS_TOKEN to the environment variables.) +endif +ifndef HOST_DOMAIN + $(error HOST_DOMAIN is not set. Please add HOST_DOMAIN to the environment variables.) +endif +ifndef ADMIN_EMAIL + $(error ADMIN_EMAIL is not set. Please add ADMIN_EMAIL to the environment variables.) +endif +ifndef CREATE_VPC + $(info 'CREATE_VPC' is not set. 
Defaulting to 'false') + $(eval export CREATE_VPC := false) + $(call blue, "**** This deployment uses an existing VPC**** ") + $(MAKE) env.validate.no-vpc +endif +ifeq ($(CREATE_VPC),false) + $(call blue, "**** This deployment uses an existing VPC**** ") + $(MAKE) env.validate.no-vpc +else ifeq ($(CREATE_VPC),true) + $(call blue, "**** This deployment includes a VPC**** ") +endif + @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Found environment variables" 2>&1 | tee -a ${CFN_LOG_PATH} + +infrastructure.deploy: $(MAKE) -C ${APP_NAME}/infrastructure/ deploy database.deploy: @@ -239,9 +327,14 @@ database.get.credentials: database.get.instance-id: @echo "${INSTANCE_ID}" -# TODO add confirmation to proceed +# TODO add confirmation to proceed BOOKMARK delete: # data=true/false ##=> Delete services @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deleting ${APP_NAME} in ${AWS_ACCOUNT}" 2>&1 | tee -a ${CFN_LOG_PATH} + @[[ $$data != true ]] && echo "Data will not be deleted. To delete pass \`data=true\`" || true + @echo "Delete all stacks from the \`${STAGE}\` environment? [y/N] \c " && read ans && [ $${ans:-N} = y ] && \ + if [ "${data}" = "true" ]; then \ + aws s3 rm --recursive s3://${DATA_BUCKET_NAME}; \ + fi $(MAKE) pipeline.delete $(MAKE) database.delete $(MAKE) infrastructure.delete diff --git a/README.md b/README.md index 24eb7fea..f4f85c43 100755 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Graph database representing IPD-IMGT/HLA sequence data as GFE. ├── Makefile # Use the root Makefile to deploy, delete and manage resources and configuration ├── README.md ├── docs # Sphinx documentation -├── (-gfe-db--neo4j-key.pem) # EC2 key pair for SSH access to Neo4j server, ccreated on deployment +├── (-gfe-db--neo4j-key.pem) # EC2 key pair for SSH access to Neo4j server, created on deployment ├── requirements-dev.txt # Python requirements for local development ├── requirements-docs.txt # Python requirements for documentation └── gfe-db @@ -110,43 +110,63 @@ The `gfe-db` represents IPD-IMGT/HLA sequence data as GFE nodes and relationship This allows the database and pipeline layers to be decoupled from each other and deployed or destroyed independently without affecting the other. Common configuration parameters are shared between resources using environment variables, JSON files, AWS SSM Paramter Store and Secrets Manager. ### Base Infrastructure -The base infrastructure layer deploys a VPC, public subnet, S3 bucket, Elastic IP and common SSM parameters and secrets for the other services to use. +The base infrastructure layer deploys a VPC (optional), public subnet (optional), S3 bucket, Elastic IP and common SSM parameters and secrets for the other services to use. ### Database -The database layer deploys an EC2 instance running the Bitnami Neo4j AMI (Ubuntu 18.04) into a public subnet. CloudFormation also creates an A record for a Route53 domain under a pre-existing Route53 domain and hosted zone so that SSL can be used to connect to Neo4j. During database deploymeny the SSL certificate is created and Cypher queries are run to create constraints and indexes, which help speed up loading and ensure data integrity. Neo4j is ready to be accessed through a browser once the instance has booted sucessfully. +The database layer deploys an EC2 instance running the Bitnami Neo4j AMI (Ubuntu 18.04) into a public subnet. An A record is required for a pre-existing Route53 domain and hosted zone so that SSL can be used to connect to Neo4j. 
During database deployment the SSL certificate is created and Cypher queries are run to create constraints and indexes, which help speed up loading and ensure data integrity. Neo4j is ready to be accessed through a browser once the instance has booted successfully.

-During loading, the `invoke_load_script` Lambda function uses SSM Run Command to execute bash scripts on the daatabase instance. These scripts communicate with the Step Functions API to retrieve the job parameters, fetch the CSVs from S3 and load the alleles into Neo4j.
+During loading, a Lambda function calls the SSM Run Command API to execute bash scripts on the database instance. These scripts communicate with the Step Functions API to retrieve the job parameters, fetch the CSVs from S3 and populate the graph in Neo4j.

It is also possible to backup & restore to and from S3 by specific date checkpoints.

### Data Pipeline
-The data pipeline layer automates integration of newly released IMGT/HLA data into Neo4j using a scheduled Lambda which watches the source data repository and invokes the build and load processes when it detects a new IMGT/HLA version. The pipeline consists of a Step Functions state machine which orchestrates two basic processes: build and load. The build process employs a Batch job which produces an intermediate set of CSV files. The load process leverages SSM Run Command to copy the CSV files to the Neo4j server and execute Cypher statements directly on the server (server-side loading). When loading the full dataset of 35,000+ alleles, the build step will generally take around 15 minutes, however the load step can take an hour or more.
+The data pipeline layer automates integration of newly released IMGT/HLA data into Neo4j using a scheduled Lambda which watches the source data repository and invokes the build and load processes when it detects a new IMGT/HLA version in the upstream repository. The pipeline consists of a Step Functions state machine which orchestrates the build and load stages. The build process employs a Batch job to generate an intermediate set of CSV files. The load process leverages SSM Run Command to copy the CSV files to the Neo4j server and execute Cypher statements directly on the server (server-side loading). When loading the full dataset of 35,000+ alleles, the build step will generally take around 15 minutes; however, the load step can take an hour or more.

## Deployment
-Follow the steps to build and deploy the application to AWS.
+It is possible to deploy gfe-db within its own VPC, or to connect it to an external VPC, by specifying `CREATE_VPC=true/false`.

### Quick Start
+These lists outline the basic steps for deployment. For more details please see the following sections.
+
+**Using external VPC**
1. Retrieve the VPC ID and subnet ID from the AWS console or using the AWS CLI.
-This list outlines the basic steps for deployment. For more details please see the following sections.
-2. Purchase or designate a domain in Route53 and create a hosted zone with an A record for the subdomain. You can use the VPC's IP address for the A record because it will be updated later by the deployment script.
+2. Purchase or designate a domain in Route53 and create a hosted zone with an A record for the subdomain. You can use `0.0.0.0` for the A record because it will be updated later by the deployment script.
3. Acquire a subscription for the Bitnami Neo4j AMI through [AWS Marketplace](https://aws.amazon.com/marketplace/pp/prodview-v47qqrn2yy7ie?sr=0-4&ref_=beagle&applicationId=AWSMPContessa).
4. [Install prerequisites](#Prerequisites).
-5. [Set environment variables](#environment) including the ones from the previous steps:
- - VPC_ID (step 1)
- - PUBLIC_SUBNET_ID (step 1)
- - HOSTED_ZONE_ID (step 2)
- - HOST_DOMAIN (step 2)
- - SUBDOMAIN (step 2)
- - NEO4J_AMI_ID (step 3)
+5. [Set environment variables](#environment) including the ones from the previous steps. You must store these in a file named `.env.`, for example `.env.dev` or `.env.prod`:
+ - CREATE_VPC=false
+ - VPC_ID
+ - PUBLIC_SUBNET_ID
+ - HOSTED_ZONE_ID
+ - HOST_DOMAIN
+ - SUBDOMAIN
+ - NEO4J_AMI_ID
6. Check the [config JSONs](#data-pipeline-config) (parameters and state) and edit the values as desired.
-7. Run `make deploy` to deploy the stacks to AWS.
-8. Run `make database.load.run releases=` to load the Neo4j, or `make database.load.run releases= limit=` to run with a limited number of alleles.
-9. Run `make database.get-credentials` to get the username and password for Neo4j.
-10. Run `make database.get-url` to get the URL for Neo4j and navigate to the Neo4j browser at the subdomain and host domain, for example `https://gfe-db.cloudftl.com:7473/browser/`.
+7. Run `STAGE= make deploy` to deploy the stacks to AWS.
+8. Run `STAGE= make database.load.run releases=` to load Neo4j, or `STAGE= make database.load.run releases= limit=` to run with a limited number of alleles.
+9. Run `STAGE= make database.get.credentials` to get the username and password for Neo4j.
+10. Run `STAGE= make database.get.endpoint` to get the URL for Neo4j and navigate to the Neo4j browser at the subdomain and host domain, for example `https://gfe-db.cloudftl.com:7473/browser/`.
+
+**Creating a new VPC**
+1. Purchase or designate a domain in Route53 and create a hosted zone with an A record for the subdomain. You can use `0.0.0.0` for the A record because it will be updated later by the deployment script.
+2. Acquire a subscription for the Bitnami Neo4j AMI through [AWS Marketplace](https://aws.amazon.com/marketplace/pp/prodview-v47qqrn2yy7ie?sr=0-4&ref_=beagle&applicationId=AWSMPContessa).
+3. [Install prerequisites](#Prerequisites).
+4. [Set environment variables](#environment) including the ones from the previous steps. You must store these in a file named `.env.`, for example `.env.dev` or `.env.prod`:
+ - CREATE_VPC=true
+ - HOSTED_ZONE_ID
+ - HOST_DOMAIN
+ - SUBDOMAIN
+ - NEO4J_AMI_ID
+5. Check the [config JSONs](#data-pipeline-config) (parameters and state) and edit the values as desired.
+6. Run `STAGE= make deploy` to deploy the stacks to AWS.
+7. Run `STAGE= make database.load.run releases=` to load Neo4j, or `STAGE= make database.load.run releases= limit=` to run with a limited number of alleles.
+8. Run `STAGE= make database.get.credentials` to get the username and password for Neo4j.
+9. Run `STAGE= make database.get.endpoint` to get the URL for Neo4j and navigate to the Neo4j browser at the subdomain and host domain, for example `https://gfe-db.cloudftl.com:7473/browser/`.

### Prerequisites
Please refer to the respective documentation for specific installation instructions.
-* Route53 domain and hosted zone +* Route53 domain, hosted zone, and A record +* VPC & Public Subnet (if using external VPC) * Bitnami Neo4j AMI subscription and AMI ID * GNU Make 3.81 * coreutils (optional but recommended) @@ -154,7 +174,7 @@ Please refer to the respective documentation for specific installation instructi * SAM CLI * Docker * jq -* Python 3.9 (if developing locally) +* Python 3.9+ (if developing locally) ### Environment @@ -170,56 +190,68 @@ For more information visit the documentation page: [Configuration and credential file settings](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html) #### Shell Variables -These variables must be defined before running Make. The best way to set these variables is with a `.env` file following this structure. +These variables must be defined before running Make. The best way to set these variables is with a `.env.` file following this structure. ```bash -# .env +# .env. +AWS_PROFILE= # Include profile if stacks are in a different accounts STAGE= -APP_NAME=gfe-db +APP_NAME= AWS_REGION= -VPC_ID= # Available through the console or CLI -PUBLIC_SUBNET_ID= # Available through the console or CLI; Public subnets have a route to an internet gateway -GITHUB_PERSONAL_ACCESS_TOKEN= -HOSTED_ZONE_ID= # Available through the console or CLI -HOST_DOMAIN= -SUBDOMAIN= ADMIN_EMAIL= SUBSCRIBE_EMAILS=,,,... -APOC_VERSION=4.4.0.3 -GDS_VERSION=2.0.1 +GITHUB_REPOSITORY_OWNER= +GITHUB_REPOSITORY_NAME= +HOST_DOMAIN= +CREATE_VPC= +VPC_ID= # if CREATE_VPC=false +PUBLIC_SUBNET_ID= # if CREATE_VPC=false +HOSTED_ZONE_ID= +SUBDOMAIN= NEO4J_AMI_ID= # Requires AWS Marketplace subscription +APOC_VERSION= +GDS_VERSION= +GITHUB_PERSONAL_ACCESS_TOKEN= ``` | Variable Name | Example Value | Type | Description | | ---------------------------- | ---------------------------------- | ------ | ------------------------------------------------ | +| AWS_PROFILE | | string | AWS profile for deployment. | | STAGE | dev | string | The stage of the application. | | APP_NAME | gfe-db | string | The name of the application. | | AWS_REGION | us-east-1 | string | The AWS region to deploy to. | -| VPC_ID | vpc-1234567890abcdef | string | The ID of the VPC to deploy to. | -| PUBLIC_SUBNET_ID | subnet-1234567890abcdef | string | The ID of the public subnet to deploy to. | -| GITHUB_PERSONAL_ACCESS_TOKEN | | string | GitHub PAT for repository access | -| HOSTED_ZONE_ID | Z1234567890ABCDEF | string | The ID of the hosted zone to deploy to. | +| ADMIN_EMAIL | user@company.com | string | Admin's email required for SSL certificate. | +| SUBSCRIBE_EMAILS | user@company.com,user2@company.com | string | Comma-separated list of emails for notifications | +| GITHUB_REPOSITORY_OWNER | | string | GitHub repository owner. | +| GITHUB_REPOSITORY_NAME | | string | GitHub repository name. | | HOST_DOMAIN | example.com | string | The domain to deploy to. | +| CREATE_VPC | true or false | string | Whether to create a new VPC. | +| HOSTED_ZONE_ID | Z1234567890ABCDEF | string | The ID of the hosted zone to deploy to. | | SUBDOMAIN | gfe-db | string | The subdomain to deploy to. 
|
-| ADMIN_EMAIL | user@company.com | string | Admin's email required for SSL certificate |
-| SUBSCRIBE_EMAILS | user@company.com,user2@company.com | string | Comma-separated list of emails for notifications |
-| APOC_VERSION | 4.4.0.3 | string | APOC version for Neo4j |
-| GDS_VERSION | 2.0.1 | string | GDS version for Neo4j |
-| NEO4J_AMI_ID | ami-0b9a2b6b1c5b8b5b9 | string | Bitnami Neo4j AMI ID |
+| NEO4J_AMI_ID | ami-0b9a2b6b1c5b8b5b9 | string | Bitnami Neo4j AMI ID. |
+| APOC_VERSION | 4.4.0.3 | string | APOC version for Neo4j. |
+| GDS_VERSION | 2.0.1 | string | GDS version for Neo4j. |
+| GITHUB_PERSONAL_ACCESS_TOKEN | | string | GitHub PAT for repository access. |

***Important**:* *Always use a `.env` file or AWS SSM Parameter Store or Secrets Manager for sensitive variables like credentials and API keys. Never hard-code them, including when developing. AWS will quarantine an account if any credentials get accidentally exposed and this will cause problems. Make sure to update `.gitignore` to avoid pushing sensitive data to public repositories.*

### Makefile Usage
-Once an AWS profile is configured and environment variables are exported, the application can be deployed using `make`.
+Once an AWS profile is configured and environment variables are exported, the application can be deployed using `make`. You are required to specify the `STAGE` variable every time `make` is called to ensure that the correct environment is selected when there are multiple deployments.

```bash
-make deploy
+STAGE= make deploy
```

It is also possible to deploy or update the database or pipeline services.
```bash
# Deploy/update only the database service
-make database.deploy
+STAGE= make database.deploy

# Deploy/update only the pipeline service
-make pipeline.deploy
+STAGE= make pipeline.deploy
+
+# Deploy/update only the pipeline serverless stack
+STAGE= make pipeline.functions.deploy
+
+# Deploy/update only the Docker image for the build job
+STAGE= make pipeline.jobs.deploy
```

*Note:* It is recommended to only deploy from the project root. This is because common parameters are passed from the root Makefile to nested Makefiles. If a stack has not been changed, the deployment script will continue until it reaches a stack with changes and deploy that.

@@ -227,40 +259,40 @@
To see a list of possible commands using Make, run `make` on the command line. You can also refer to the `Makefile Usage` section in the [Sphinx documentation](#documentation).
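For example, a typical first deployment and load for a `dev` stage might look like the following sequence (an illustrative sketch; the release number is only an example, and the targets are the ones listed below):

```bash
# Deploy all stacks for the dev stage
STAGE=dev make deploy

# Load a single IMGT/HLA release into Neo4j
STAGE=dev make database.load.run releases=3510

# Retrieve the Neo4j credentials and browser URL
STAGE=dev make database.get.credentials
STAGE=dev make database.get.endpoint
```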
```bash # Deploy all CloudFormation based services -make deploy +STAGE= make deploy # Deploy config files and scripts to S3 -make config.deploy +STAGE= make config.deploy # Run the StepFunctions State Machine to load Neo4j -make database.load.run releases= align= kir= limit= +STAGE= make database.load.run releases= align= kir= limit= # Retrieve Neo4j credentials after deployment -make database.get-credentials +STAGE= make database.get.credentials # Retrieve Neo4j URL after deployment -make database.get-url +STAGE= make database.get.endpoint # Download logs from EC2 -make get.logs +STAGE= make get.logs # Download CSV data from S3 -make get.data +STAGE= make get.data -# Delete all CloudFormation based services and data -make delete +# Delete all CloudFormation based services and data, default is data=false +STAGE= make delete data= # Delete a specific layer -make pipeline.delete +STAGE= make pipeline.delete # Subscribe an email for notifications (unsubscribe using console) -make monitoring.subscribe-email email= +STAGE= make monitoring.subscribe-email email= ``` ## Managing Configuration Configuration is managed using JSON files, SSM Parameter Store, Secrets Manager, and shell variables. To deploy changes in these files, run the command. ```bash -make config.deploy +STAGE= make config.deploy ``` ### Database Configuration @@ -282,13 +314,10 @@ gfe-db/database/scripts └── start_task.sh # Coordinates database loading with the Step Functions API ``` -To update shell scripts on the Neo4j instance, run the following commands in sequence. +To update shell scripts on the Neo4j instance, run the following command. ```bash -# sync the scripts to S3 -make config.deploy - # sync the scripts from S3 to the instance (using Systems Manager Run Command) -make database.sync-scripts +STAGE= make database.sync-scripts ``` #### Cypher Scripts @@ -330,7 +359,7 @@ Base input parameters (excluding the `releases` value) are passed to the Step Fu The data pipeline can also be invoked from the command line: ```bash -make database.load.run releases= align= kir= limit= +STAGE= make database.load.run releases= align= kir= limit= ``` #### IMGT/HLA Release Versions State @@ -358,19 +387,19 @@ The application's state tracks which releases have been processed and added to t ## Loading Neo4j For each invocation the data pipeline will download raw data from [ANHIG/IMGTHLA](https://github.com/ANHIG/IMGTHLA) GitHub repository, build a set of intermediate CSV files and load these into Neo4j via S3. To invoke the pipeline, run the following command. ```bash -make database.load.run releases="" +STAGE= make database.load.run releases="" # Example for single version -make database.load.run releases=3510 +STAGE= make database.load.run releases="3510" # Example for multiple versions -make database.load.run releases=3490,3500,3510 +STAGE= make database.load.run releases="3490,3500,3510" # Example with limit -make database.load.run releases=3510 limit=1000 +STAGE= make database.load.run releases="3510" limit="1000" # Example with all arguments included -make database.load.run releases=3510 limit="" align=false kir=false +STAGE= make database.load.run releases="3510" limit="" align="False" kir="False" ``` These commands build an event payload to send to the `invoke-gfe-db-pipeline` Lambda. @@ -407,15 +436,15 @@ The Lambda function returns the following object which can be viewed in CloudWat ### Clean Up To tear down resources run the command. 
You will need to manually delete the data in the S3 bucket first to avoid an error in CloudFormation. ```bash -make delete +STAGE= make delete data= ``` Use the following commands to tear down individual services. Make sure to [backup](#backup--restore) your data first. ```bash # Delete only the database service -make database.delete +STAGE= make database.delete # Delete only the pipeline service -make pipeline.delete +STAGE= make pipeline.delete ``` ## Backup & Restore @@ -425,7 +454,7 @@ make pipeline.delete Backups are orchestrated by Systems Manager and automated everyday at midnight US/Central time by default. To create a backup, run the command. ```bash -make database.backup +STAGE= make database.backup ``` This will create a backup of the Neo4j database and store it in S3 under the path `s3:///backups/neo4j/YYYY/MM/DD/HH/gfedb.zip`. @@ -435,13 +464,13 @@ This will create a backup of the Neo4j database and store it in S3 under the pat To see a list of available backup dates that can be restored, run the command. ```bash -make database.backup.list +STAGE= make database.backup.list ``` To restore from a backup, pass the date of the backup you wish to restore using the format YYYY/MM/DD/HH. ```bash -make database.restore from_date= +STAGE= make database.restore from_date= ``` ## Local Development @@ -471,7 +500,7 @@ jupyter kernelspec uninstall It is not necessary to install Sphinx to view `gfe-db` documentation because it is already built and available in the `docs/` folder, but you will need it to edit them. To get the local `index.html` path run the command and navigate to the URL in a browser. ```bash -make docs.url +STAGE= make docs.url ``` ### Editing and Building the Documentation @@ -485,7 +514,7 @@ pip install -r requirements-docs.txt After making your edits, you can build the HTML assets by running the command. 
```bash -make docs.build +STAGE= make docs.build ``` ## Troubleshooting diff --git a/gfe-db/database/Makefile b/gfe-db/database/Makefile index 23589791..b59e808a 100644 --- a/gfe-db/database/Makefile +++ b/gfe-db/database/Makefile @@ -35,11 +35,11 @@ service.config.neo4j.deploy: service.config.neo4j.update-dns @aws s3 cp config/neo4j/neo4j.conf s3://$$DATA_BUCKET_NAME/$${CONFIG_S3_PATH}/${SERVICE}/neo4j/neo4j.conf 2>&1 | tee -a $$CFN_LOG_PATH @aws s3 cp --recursive config/neo4j/cypher/ s3://$$DATA_BUCKET_NAME/$${CONFIG_S3_PATH}/${SERVICE}/neo4j/cypher/ 2>&1 | tee -a $$CFN_LOG_PATH -# # This target will configure the database to use the correct DNS name using env vars -# service.config.neo4j.update-dns: -# @[ "${HOST_DOMAIN}" != "" ] && \ -# cat neo4j/neo4j.template | \ -# sed s"/# dbms.default_advertised_address=.*/dbms.default_advertised_address=${SUBDOMAIN}.${HOST_DOMAIN}/" > neo4j/neo4j.conf +# This target will configure the database to use the correct DNS name using env vars +service.config.update-dns: + @[ "${HOST_DOMAIN}" != "" ] && \ + cat neo4j/neo4j.template | \ + sed s"/# dbms.default_advertised_address=.*/dbms.default_advertised_address=${SUBDOMAIN}.${HOST_DOMAIN}/" > neo4j/neo4j.conf service.config.scripts.deploy: @script_s3_path=s3://$$DATA_BUCKET_NAME/$${CONFIG_S3_PATH}/${SERVICE}/scripts/ && \ diff --git a/gfe-db/infrastructure/Makefile b/gfe-db/infrastructure/Makefile index c04d5fb8..54b12800 100644 --- a/gfe-db/infrastructure/Makefile +++ b/gfe-db/infrastructure/Makefile @@ -18,9 +18,15 @@ service.deploy.update-dns: --query "Parameters[0].Value" \ --output text) && \ sed -e "s//${SUBDOMAIN}.${HOST_DOMAIN}./g" -e "s//$$elastic_ip/g" $$config_path > $$config_path.tmp && \ - aws route53 change-resource-record-sets --hosted-zone-id $${HOSTED_ZONE_ID} --change-batch file://$$config_path.tmp + echo "Updating DNS records with:" && \ + cat $$config_path.tmp | jq -r && \ + res=$$(aws route53 change-resource-record-sets --hosted-zone-id $${HOSTED_ZONE_ID} --change-batch file://$$config_path.tmp) && \ + echo && \ + echo "Response:" && \ + echo $$res | jq -r # TODO test AWSCLI output and validate the stack was created successfully +# Add stateful check from SSM Param of vpc=true/false to correctly set CREATE_VPC service.deploy: @echo "$$(gdate -u +'%Y-%m-%d %H:%M:%S.%3N') - Deploying CloudFormation" 2>&1 | tee -a $${CFN_LOG_PATH} @aws cloudformation deploy \ @@ -31,11 +37,12 @@ service.deploy: --parameter-overrides \ Stage="$${STAGE}" \ AppName="$${APP_NAME}" \ - AdminEmail="$${ADMIN_EMAIL}" \ - DataBucketName="$${DATA_BUCKET_NAME}" \ + createVpc="$${CREATE_VPC}" \ VpcId="$${VPC_ID}" \ PublicSubnetId="$${PUBLIC_SUBNET_ID}" \ - 2>&1 | tee -a $${CFN_LOG_PATH} || true + DataBucketName="$${DATA_BUCKET_NAME}" \ + GitHubPersonalAccessToken="$$GITHUB_PERSONAL_ACCESS_TOKEN" \ + AdminEmail="$${ADMIN_EMAIL}" 2>&1 | tee -a $${CFN_LOG_PATH} || true $(MAKE) service.deploy.update-dns service.monitoring.create-subscriptions: diff --git a/gfe-db/infrastructure/template.yaml b/gfe-db/infrastructure/template.yaml index 0f3df650..74f3b6dc 100644 --- a/gfe-db/infrastructure/template.yaml +++ b/gfe-db/infrastructure/template.yaml @@ -7,15 +7,33 @@ Parameters: Description: Stage of production AppName: Type: String - AdminEmail: + createVpc: Type: String + Default: 'false' + AllowedValues: + - 'true' + - 'false' DataBucketName: Type: String VpcId: Type: String + Description: Required if createVpc is false + Default: '' PublicSubnetId: Type: String - + Description: Required if createVpc is false + Default: 
'' + GitHubPersonalAccessToken: + Type: String + NoEcho: true + AdminEmail: + Type: String + Description: Email address for Let's Encrypt SSL certificates + +Conditions: + CreateVpc: !Equals [!Ref createVpc, 'true'] + UseExternalVpc: !Equals [!Ref createVpc, 'false'] + Mappings: # AvailabilityZoneMap defines availability zones where an m5d.xlarge instance is available (used for Neo4j server). AvailabilityZoneMap: @@ -35,29 +53,127 @@ Mappings: AvailabilityZone: eu-west-3a Resources: - - DataBucket: - Type: AWS::S3::Bucket - # Condition: CreateDataBucket + + StageParameter: + Type: AWS::SSM::Parameter Properties: - BucketName: !Ref DataBucketName + Type: String + Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/Stage' + Description: "Stage of production" + Value: !Ref Stage - VpcIDParameter: + AppParameter: Type: AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/AppName' + Description: "Name of application" + Value: !Ref AppName + + CreateVpcParameter: + Type: AWS::SSM::Parameter + Properties: + Type: String + Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/CreateVpc' + Description: !Sub "True if stack creates a VPC, false if stack uses an existing VPC" + Value: !Ref createVpc + + Vpc: + Type: AWS::EC2::VPC + Condition: CreateVpc + Properties: + CidrBlock: 10.0.0.0/16 + EnableDnsHostnames: true + EnableDnsSupport: true + Tags: + - Key: Name + Value: !Sub '${Stage}-${AppName}-${AWS::Region}-vpc' + + PublicSubnet: + Type: AWS::EC2::Subnet + Condition: CreateVpc + Properties: + CidrBlock: 10.0.0.0/24 + VpcId: !Ref Vpc + AvailabilityZone: !FindInMap [AvailabilityZoneMap, !Ref AWS::Region, AvailabilityZone] + MapPublicIpOnLaunch: true + + InternetGateway: + Type: AWS::EC2::InternetGateway + Condition: CreateVpc + + RouteTable: + Type: AWS::EC2::RouteTable + Condition: CreateVpc + Properties: + VpcId: !Ref Vpc + + VpcGatewayAttachment: + Type: AWS::EC2::VPCGatewayAttachment + Condition: CreateVpc + Properties: + VpcId: !Ref Vpc + InternetGatewayId: !Ref InternetGateway + + Route: + Type: AWS::EC2::Route + Condition: CreateVpc + DependsOn: + - InternetGateway + - VpcGatewayAttachment + Properties: + RouteTableId: !Ref RouteTable + DestinationCidrBlock: 0.0.0.0/0 + GatewayId: !Ref InternetGateway + + SubnetRouteTableAssociation: + Type: AWS::EC2::SubnetRouteTableAssociation + Condition: CreateVpc + Properties: + RouteTableId: !Ref RouteTable + SubnetId: !Ref PublicSubnet + + CreateVpcConditionVpcIDParameter: + Type: AWS::SSM::Parameter + Condition: CreateVpc + Properties: + Type: String + Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/VpcID' + Description: !Sub "Name of VPC network for ${AppName}" + Value: !Ref Vpc + + CreateVpcConditionPublicSubnetIDParameter: + Type: AWS::SSM::Parameter + Condition: CreateVpc + Properties: + Type: String + Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/PublicSubnetID' + Description: !Sub "Public Subnet for the ${AppName} Neo4j server" + Value: !Ref PublicSubnet + + UseExternalVpcConditionVpcIDParameter: + Type: AWS::SSM::Parameter + Condition: UseExternalVpc Properties: Type: String Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/VpcID' Description: !Sub "Name of VPC network for ${AppName}" Value: !Ref VpcId - PublicSubnetIDParameter: + UseExternalVpcConditionPublicSubnetIDParameter: Type: AWS::SSM::Parameter + Condition: UseExternalVpc Properties: Type: String Name: !Sub '/${AppName}/${Stage}/${AWS::Region}/PublicSubnetID' Description: !Sub "Public Subnet for the ${AppName} Neo4j server" Value: !Ref 
PublicSubnetId + DataBucket: + Type: AWS::S3::Bucket + Properties: + BucketName: !Ref DataBucketName + DataBucketNameParameter: Type: AWS::SSM::Parameter Properties: diff --git a/notebooks/0.2-github-api-eda.ipynb b/notebooks/0.2-github-api-eda.ipynb index 8f2f556b..42404ae8 100644 --- a/notebooks/0.2-github-api-eda.ipynb +++ b/notebooks/0.2-github-api-eda.ipynb @@ -5,19 +5,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# GitHub API EDA\n", - "Using NetworkX for exploration of graph data from the GitHub API." + "# GitHub API EDA" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import os\n", "from dotenv import load_dotenv, find_dotenv\n", - "load_dotenv(find_dotenv());\n", + "load_dotenv('/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/.env.dev');\n", "from itertools import chain, starmap\n", "import json\n", "import requests\n", @@ -32,7 +31,27 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/Users/ammon/Projects/nmdp-bioinformatics/02-Repositories/gfe-db/notebooks'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "os.getcwd()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -40,13 +59,12 @@ "AWS_REGION = os.environ[\"AWS_REGION\"] \n", "GITHUB_PERSONAL_ACCESS_TOKEN = os.environ[\"GITHUB_PERSONAL_ACCESS_TOKEN\"]\n", "GITHUB_REPOSITORY_OWNER = \"ANHIG\" # os.environ[\"GITHUB_REPOSITORY_OWNER\"]\n", - "GITHUB_REPOSITORY_NAME = \"IMGTHLA\" # os.environ[\"GITHUB_REPOSITORY_NAME\"]\n", - "GFE_BUCKET = os.environ[\"GFE_BUCKET\"]" + "GITHUB_REPOSITORY_NAME = \"IMGTHLA\" # os.environ[\"GITHUB_REPOSITORY_NAME\"]\n" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -88,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -117,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -145,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -174,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -203,7 +221,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -237,47 +255,91 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ - "# branches = get_branches(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME)" + "from pygethub import list_branches, GitHubPaginator" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ - "# branch = get_branch(GITHUB_REPOSITORY_OWNER, GITHUB_REPOSITORY_NAME, \"Latest\")" + "paginator = GitHubPaginator(GITHUB_PERSONAL_ACCESS_TOKEN)" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ - "# branch['commit']['sha']" + "# Get all branches\n", + "\n", + "branches_gen = paginator.get_paginator(list_branches, owner=GITHUB_REPOSITORY_OWNER, repo=GITHUB_REPOSITORY_NAME)" ] }, { "cell_type": "code", - 
"execution_count": 49, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Page 1: 55 items\n" + ] + } + ], + "source": [ + "all_branches = list(branches_gen)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Page 1: 55 items\n" + ] + } + ], "source": [ - "# branches_df = pd.DataFrame([flatten_json(branch) for branch in branches])" + "branches_df = pd.DataFrame([flatten_json(branch) for branch in branches])" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "# branches_df" + "releases = list(branches_df['name'].unique())[:-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "300,310,320,330,340,350,360,370,380,390,3100,3110,3120,3130,3140,3150,3160,3170,3180,3190,3200,3210,3220,3230,3240,3250,3260,3270,3280,3290,3300,3310,3320,3330,3340,3350,3360,3370,3380,3390,3400,3410,3420,3430,3440,3450,3460,3470,3480,3490,3500,3510,3520,3530\n" + ] + } + ], + "source": [ + "print(\",\".join(releases))" ] }, { @@ -1283,7 +1345,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.10.12" }, "orig_nbformat": 4 }, diff --git a/notebooks/requirements.txt b/notebooks/requirements.txt new file mode 100644 index 00000000..a94cf69e --- /dev/null +++ b/notebooks/requirements.txt @@ -0,0 +1,2 @@ +requests +pandas \ No newline at end of file