chore: added scripts to load or take snapshots + docs
hopeyen committed Apr 30, 2024
1 parent a008722 commit a7c8ccc
Showing 3 changed files with 244 additions and 0 deletions.
26 changes: 26 additions & 0 deletions docs/snapshot-workflow.md
@@ -0,0 +1,26 @@
# Snapshot Workflow

Sharing Subgraph snapshots was the inspiration that led to the creation of the Files Service. This document describes the workflows for producers and consumers of Subgraph snapshots in detail.


## Producer taking Subgraph snapshot

A data producer owns a database containing some number of indexed Subgraphs and wants to share a particular Subgraph Deployment from it. We assume that they are already running a file server ([guide](/docs/server_guide.md)).

They set the Postgres environment variables `POSTGRES_DB_HOST, POSTGRES_DB_PORT, POSTGRES_DB_NAME, POSTGRES_DB_USER, POSTGRES_DB_PSWD`, and the intended Subgraph Deployment `SNAPSHOT_SUBGRAPH_DEPLOYMENT_IPFS_HASH`. Then they run the script provided at `scripts/take_snapshot.sh`, which checks that the intended deployment exists, takes a `pg_dump` of the corresponding schema into a local file, and prints out a line of metadata.
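
For example, the producer's `.env` might look like the following. This is a minimal sketch; every value is a placeholder, and the deployment hash is the sample one from the script's comments.

```bash
# .env — placeholder values for illustration
POSTGRES_DB_HOST=localhost
POSTGRES_DB_PORT=5432
POSTGRES_DB_NAME=graph-node
POSTGRES_DB_USER=graph
POSTGRES_DB_PSWD=password
SNAPSHOT_SUBGRAPH_DEPLOYMENT_IPFS_HASH=QmRAbgoZ2mBpxqj4Z32KFaso8rsDhdm8KcwEQhWMzD8bTN
```

With the variables in place, the producer runs `scripts/take_snapshot.sh`, which sources the `.env` file.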

The script also takes the environment variables `SERVER_ADMIN_ENDPOINT` and `AUTH_TOKEN` to automatically publish the snapshot and add it to their file service. Alternatively, the producer can go to their file service admin endpoint and publish the snapshot manually; they should include the snapshot file and put the printed metadata line in the description.
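
For reference, the script publishes through the admin endpoint's `publishAndServeBundle` mutation. A manual publish along the same lines might look like this sketch, where the file name, metadata line, endpoint, and token are all placeholders:

```bash
# Placeholders throughout; the mutation shape mirrors scripts/take_snapshot.sh
curl -X POST \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer ${AUTH_TOKEN}" \
  -d '{"query": "mutation { publishAndServeBundle(filenames: [\"snapshot_<deployment-hash>.sql\"], description: \"<metadata line>\") { ipfsHash } }"}' \
  ${SERVER_ADMIN_ENDPOINT}
```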

## Consumer downloading and loading Subgraph Snapshot

A data consumer similarly has a graph-node database. They are syncing a Subgraph Deployment but decide it would be better to simply buy the latest snapshot from a data producer, which reduces the consumer's computation cost and the opportunity cost of not serving queries while syncing.

Here we assume that the data consumer knows the public endpoints of data producers. They query the endpoints for all available files and identify the manifest hash of the Subgraph Snapshot they would like to download.

They use the downloader to download the file. Refer to [Downloader](/docs/client_guide.md) for more details.

Once the file is downloaded, the data consumer can use the script provided at `scripts/load_snapshot.sh`. This script checks that the locally syncing Subgraph schema exists, drops the local copy, modifies the remote snapshot to use the local schema identifier and owner, loads in the localized snapshot, and updates the metadata as provided in the manifest.
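
Concretely, the consumer-side variables read by `scripts/load_snapshot.sh` can be set as in this sketch; all values are placeholders:

```bash
# .env — placeholder values for illustration
POSTGRES_DB_HOST=localhost
POSTGRES_DB_PORT=5432
POSTGRES_DB_NAME=graph-node
POSTGRES_DB_USER=graph
POSTGRES_DB_PSWD=password
SNAPSHOT_SUBGRAPH_DEPLOYMENT_IPFS_HASH=QmRAbgoZ2mBpxqj4Z32KFaso8rsDhdm8KcwEQhWMzD8bTN
SNAPSHOT_SUBGRAPH_DEPLOYMENT_FILE=snapshot_QmRAbgoZ2mBpxqj4Z32KFaso8rsDhdm8KcwEQhWMzD8bTN.sql
SNAPSHOT_SUBGRAPH_MANIFEST_IPFS_HASH=<manifest hash advertised by the producer>

bash scripts/load_snapshot.sh
```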

To confirm everything is working, visit graph-node's index node server to check the indexing statuses, and the subgraph query endpoint to test queries.
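
For a quick check, assuming graph-node's index node server listens on its default port 8030, a status query might look like:

```bash
# Assumes the index-node server is on the default port 8030
curl -X POST -H "Content-Type: application/json" \
  -d '{"query": "{ indexingStatuses { subgraph synced health } }"}' \
  http://localhost:8030/graphql
```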


124 changes: 124 additions & 0 deletions scripts/load_snapshot.sh
@@ -0,0 +1,124 @@
#!/bin/bash

source .env

DB_HOST=${POSTGRES_DB_HOST}
DB_PORT=${POSTGRES_DB_PORT}
DB_NAME=${POSTGRES_DB_NAME}
DB_USER=${POSTGRES_DB_USER}
DB_PSWD=${POSTGRES_DB_PSWD}
IPFS_HASH=${SNAPSHOT_SUBGRAPH_DEPLOYMENT_IPFS_HASH}
SNAPSHOT_FILE=${SNAPSHOT_SUBGRAPH_DEPLOYMENT_FILE}
MANIFEST_HASH=${SNAPSHOT_SUBGRAPH_MANIFEST_IPFS_HASH}

DEPLOYMENT_SCHEMA_QUERY="SELECT * FROM deployment_schemas WHERE subgraph = '${IPFS_HASH}';"
DEPLOYMENT_SCHEMA_RESULT=$(PGPASSWORD=$DB_PSWD psql -h ${DB_HOST} -p ${DB_PORT} -U ${DB_USER} -d ${DB_NAME} -t -A -c "${DEPLOYMENT_SCHEMA_QUERY}")
# Ensure that graph-node has already created the schema skeleton for this deployment
echo "Checking for the deployment schema skeleton"
echo ${DEPLOYMENT_SCHEMA_RESULT}
if [ -n "${DEPLOYMENT_SCHEMA_RESULT}" ]; then
echo "Depolyment found in the database!"
else
echo "Deployment schema not found in the database. Make sure to first deploy on graph-node"
exit
fi

# # id | subgraph | name | version | shard | network | active | created_at
# # ----+------------------------------------------------+-------+---------+---------+---------+--------+-------------------------------
# # 10 | QmRAbgoZ2mBpxqj4Z32KFaso8rsDhdm8KcwEQhWMzD8bTN | sgd10 | 1 | primary | mainnet | t | 2024-04-25 13:33:31.955611-07
# # (1 row)

IFS='|' read -r -a array <<< "${DEPLOYMENT_SCHEMA_RESULT}"
# CHECK SCHEMA id sgdNNN
schema_id=${array[0]}
sgdNNN=${array[2]}
echo "ID: ${schema_id}"
echo "sgdNNN: ${sgdNNN}"

# Customize the snapshot file to use the locally assigned schema identifier, and change the owner
sed -i "s/sgd[0-9]*/${sgdNNN}/g" ${SNAPSHOT_FILE}
sed -i "s/OWNER TO [^;]*;/OWNER TO ${DB_USER};/g" ${SNAPSHOT_FILE}

# Drop local schema and load in remote
echo "Deleting local subgraph schema"
DROP_SCHEMA_QUERY="DROP SCHEMA IF EXISTS ${sgdNNN} CASCADE;"
DELETE_LOCAL_SCHEMA_RESULT=$(PGPASSWORD=$DB_PSWD psql -h ${DB_HOST} -p ${DB_PORT} -U ${DB_USER} -d ${DB_NAME} -t -A -c "${DROP_SCHEMA_QUERY}")
echo "Delete result: ${DELETE_LOCAL_SCHEMA_RESULT}"
echo "Loading subgraph snapshot"
REMOTE_LOAD_RESULT=$(PGPASSWORD=$DB_PSWD psql -h $DB_HOST -p $DB_PORT -d $DB_NAME -U $DB_USER -f $SNAPSHOT_FILE)
echo "Load result: ${REMOTE_LOAD_RESULT}"

# Update metadata
response=$(curl -s "https://ipfs.network.thegraph.com/ipfs/api/v0/cat?arg=${MANIFEST_HASH}")
echo "Snapshot metadata: ${response}"
description=$(echo "$response" | grep 'description:' | cut -d ':' -f 2- | tr -d ' ')
IFS='|' read -r -a fields <<< "$description"
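# Expected field order in the pipe-delimited description (matching the
# metadata query in scripts/take_snapshot.sh):
# 0=deployment 1=failed 2=synced 3=latest_ethereum_block_hash
# 4=latest_ethereum_block_number 5=entity_count 6=graft_base
# 7=graft_block_hash 8=graft_block_number 9=fatal_error 10=non_fatal_errors
# 11=health 12=reorg_count 13=current_reorg_depth 14=max_reorg_depth
# 15=last_healthy_ethereum_block_hash 16=last_healthy_ethereum_block_number
# 17=id 18=firehose_cursor 19=debug_fork 20=earliest_block_number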

convert_types() {
for i in "${!fields[@]}"; do
if [ -z "${fields[$i]}" ]; then
fields[$i]="NULL"
elif [ "${fields[$i]}" == "{}" ]; then
# Handle the non_fatal_errors field
if [ $i -eq 10 ]; then # Assuming index 10 is non_fatal_errors
fields[$i]="ARRAY[]::text[]"
else
fields[$i]="NULL"
fi
elif [[ "${fields[$i]}" =~ ^\{.*\}$ ]]; then
# Handle non_empty non_fatal_errors
if [ $i -eq 10 ]; then
# Trim leading and trailing braces and convert to PostgreSQL array format
local clean_errors="${fields[$i]#\{}" # Remove leading {
clean_errors="${clean_errors%\}}" # Remove trailing }
fields[$i]="ARRAY['${clean_errors//,/','}']::text[]"
fi
else
# Convert 'f' and 't' to FALSE and TRUE
case "${fields[$i]}" in
f) fields[$i]="FALSE" ;;
t) fields[$i]="TRUE" ;;
esac
fi
done
}
convert_types

# Construct the SQL UPDATE statement
METADATA_MUTATION="UPDATE subgraphs.subgraph_deployment SET
failed = ${fields[1]},
synced = ${fields[2]},
latest_ethereum_block_hash = '${fields[3]}',
latest_ethereum_block_number = ${fields[4]},
entity_count = ${fields[5]},
graft_base = ${fields[6]},
graft_block_hash = ${fields[7]},
graft_block_number = ${fields[8]},
fatal_error = ${fields[9]},
non_fatal_errors = ${fields[10]},
health = '${fields[11]}',
reorg_count = ${fields[12]},
current_reorg_depth = ${fields[13]},
max_reorg_depth = ${fields[14]},
last_healthy_ethereum_block_hash = ${fields[15]},
last_healthy_ethereum_block_number = ${fields[16]},
id = ${schema_id},
firehose_cursor = ${fields[18]},
debug_fork = ${fields[19]},
earliest_block_number = ${fields[20]}
WHERE deployment = '${fields[0]}';"

echo "------- created mutation statement ---------"
echo ${METADATA_MUTATION}

# Apply metadata changes
METADATA_MUTATION_RESULT=$(PGPASSWORD=$DB_PSWD psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME <<< "$METADATA_MUTATION")
# Capture psql's exit status before the echo below overwrites $?
PSQL_STATUS=$?
echo "Mutation result: ${METADATA_MUTATION_RESULT}"

# Exit the script if the metadata update failed
if [ $PSQL_STATUS -ne 0 ]; then
echo "Error: Script failed to execute SQL statements."
exit 1
fi

echo "Script completed successfully."
94 changes: 94 additions & 0 deletions scripts/take_snapshot.sh
@@ -0,0 +1,94 @@
#!/bin/bash

source ../.env

# ASSUME file service is running
DB_HOST=${POSTGRES_DB_HOST}
DB_PORT=${POSTGRES_DB_PORT}
DB_NAME=${POSTGRES_DB_NAME}
DB_USER=${POSTGRES_DB_USER}
DB_PSWD=${POSTGRES_DB_PSWD}
IPFS_HASH=${SNAPSHOT_SUBGRAPH_DEPLOYMENT_IPFS_HASH}
SERVER_ADMIN_ENDPOINT=${SERVER_ADMIN_ENDPOINT}
AUTH_TOKEN=${AUTH_TOKEN}
SERVER_STORE=${SERVER_STORE}

DEPLOYMENT_SCHEMA_QUERY="SELECT * FROM deployment_schemas WHERE subgraph = '${IPFS_HASH}';"
DEPLOYMENT_SCHEMA_RESULT=$(PGPASSWORD=$DB_PSWD psql -h ${DB_HOST} -p ${DB_PORT} -U ${DB_USER} -d ${DB_NAME} -t -A -c "${DEPLOYMENT_SCHEMA_QUERY}")

echo ${DEPLOYMENT_SCHEMA_RESULT}
if [ -n "${DEPLOYMENT_SCHEMA_RESULT}" ]; then
echo "Deployment found in the database!"
else
echo "Deployment schema not found in the database. Make sure you have this deployment on graph-node"
exit 1
fi

# # id | subgraph | name | version | shard | network | active | created_at
# # ----+------------------------------------------------+-------+---------+---------+---------+--------+-------------------------------
# # 10 | QmRAbgoZ2mBpxqj4Z32KFaso8rsDhdm8KcwEQhWMzD8bTN | sgd10 | 1 | primary | mainnet | t | 2024-04-25 13:33:31.955611-07
# # (1 row)

IFS='|' read -r -a array <<< "${DEPLOYMENT_SCHEMA_RESULT}"
# CHECK SCHEMA id sgdNNN
schema_id=${array[0]}
sgdNNN=${array[2]}
echo "ID: ${schema_id}"
echo "sgdNNN: ${sgdNNN}"


# Take the snapshot using pg_dump
FILE_NAME="snapshot_${IPFS_HASH}.sql"
# pg_dump writes the schema dump straight into the file store
PGPASSWORD=$DB_PSWD pg_dump -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -n $sgdNNN > ${SERVER_STORE}/${FILE_NAME}
echo "Snapshot created: ${FILE_NAME}"

# Query metadata data
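# NOTE: the column order here must match the field indices parsed in scripts/load_snapshot.sh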
METADATA_QUERY="SELECT deployment, failed, synced, latest_ethereum_block_hash, latest_ethereum_block_number, entity_count, graft_base, graft_block_hash, graft_block_number, fatal_error, non_fatal_errors, health, reorg_count, current_reorg_depth, max_reorg_depth, last_healthy_ethereum_block_hash, last_healthy_ethereum_block_number, id, firehose_cursor, debug_fork, earliest_block_number FROM subgraphs.subgraph_deployment WHERE deployment = '${IPFS_HASH}';"
METADATA_RESULT=$(PGPASSWORD=$DB_PSWD psql -h ${DB_HOST} -p ${DB_PORT} -U ${DB_USER} -d ${DB_NAME} -t -A -c "${METADATA_QUERY}")
echo "Metadata result: ${METADATA_RESULT}"

# Publish to Indexer file service
# Files: snapshot_IPFS_HASH.sql
# With metadata in description
IFS='|' read -r -a array <<< "${METADATA_RESULT}"
deployment=${array[0]}
failed=${array[1]}
synced=${array[2]}
# Convert the Postgres bytea hex prefix (\x...) to 0x... for the description
latest_ethereum_block_hash="${array[3]//\\x/0x}"
latest_ethereum_block_number=${array[4]}
entity_count=${array[5]}
graft_base=${array[6]}
graft_block_hash=${array[7]}
graft_block_number=${array[8]}
fatal_error=${array[9]}
non_fatal_errors=${array[10]}
health=${array[11]}
reorg_count=${array[12]}
current_reorg_depth=${array[13]}
max_reorg_depth=${array[14]}
last_healthy_ethereum_block_hash=${array[15]}
last_healthy_ethereum_block_number=${array[16]}
id=${array[17]}
firehose_cursor=${array[18]}
debug_fork=${array[19]}
earliest_block_number=${array[20]}

latest_ethereum_block_hash="${array[3]//\\x/0x}"
description="$deployment|$failed|$synced|$latest_ethereum_block_hash|$latest_ethereum_block_number|$entity_count|$graft_base|$graft_block_hash|$graft_block_number|$fatal_error|$non_fatal_errors|$health|$reorg_count|$current_reorg_depth|$max_reorg_depth|$last_healthy_ethereum_block_hash|$last_healthy_ethereum_block_number|$id|$firehose_cursor|$debug_fork|$earliest_block_number"
echo ""
echo "Description: ${description}"

GRAPHQL_QUERY="mutation{publishAndServeBundle(filenames:[\\\"${FILE_NAME}\\\"], description:\\\"${description}\\\"){ipfsHash}}"

echo "Response: "
curl -X POST -H "Content-Type: application/json" -H "Authorization: Bearer ${AUTH_TOKEN}" -d "{\"query\": \"${GRAPHQL_QUERY}\"}" ${SERVER_ADMIN_ENDPOINT}

# Some error handling (exit script on failure)
if [ $? -ne 0 ]; then
echo "Error: Script failed to execute."
exit 1
fi

echo "Script completed successfully."
