Skip to content

Commit

Permalink
Do not update clusters table (#61)
Browse files Browse the repository at this point in the history
* Do not update clusters table

* Remove references to old marker table

* Delete delete_db_scxa_cell_clusters.sh

* Delete delete_db_scxa_marker_genes.sh

* Use new table structure for first marker gene test

* Fix tests to remove reference to deprecated DB structures

* Remove references to deprecated tables in reindex script

* Fix comments, remove references to defunct tables

* Remove mat view stuff

* syntax fix

* Bump schemas for dropped tables
  • Loading branch information
pinin4fjords committed Apr 29, 2022
1 parent ea2c27a commit 1236753
Show file tree
Hide file tree
Showing 9 changed files with 29 additions and 70 deletions.
2 changes: 1 addition & 1 deletion atlas-schemas
2 changes: 1 addition & 1 deletion bin/delete_db_scxa_cell_clusters.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ set -e
dbConnection=${dbConnection:-$1}
EXP_ID=${EXP_ID:-$2}

echo "DELETE FROM scxa_cell_clusters WHERE experiment_accession = '"$EXP_ID"'" | psql -v ON_ERROR_STOP=1 $dbConnection
echo "DELETE FROM scxa_cell_group WHERE experiment_accession = '"$EXP_ID"'" | psql -v ON_ERROR_STOP=1 $dbConnection
3 changes: 2 additions & 1 deletion bin/delete_db_scxa_marker_genes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ set -e
dbConnection=${dbConnection:-$1}
EXP_ID=${EXP_ID:-$2}

echo "DELETE FROM scxa_marker_genes WHERE experiment_accession = '"$EXP_ID"'" | psql -v ON_ERROR_STOP=1 $dbConnection
echo "DELETE FROM scxa_cell_group_marker_genes WHERE cell_group_id in (select id from scxa_cell_group where experiment_accession = '"$EXP_ID"')" | \
psql -v ON_ERROR_STOP=1 $dbConnection
20 changes: 3 additions & 17 deletions bin/load_db_scxa_cell_clusters.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env bash

# This script takes the marker genes data, normally available in an irap
# sc_bundle, which is split in different files one per k_value (number of clusters)
# and loads it into the scxa_marker_genes table of AtlasProd.
# This script takes the unsupervised clusterings and curated cell types,
# normally available in an scxa sc_bundle, and loads them into the
# scxa_cell_group_membership table of AtlasProd.
set -e

# TODO this type of function should be loaded from a common set of scripts.
Expand Down Expand Up @@ -47,24 +47,10 @@ wideSCCluster2longSCCluster.R -c $EXPERIMENT_CLUSTERS_FILE -e $EXP_ID -o $cluste
print_log "clusters table: Delete rows for $EXP_ID:"
echo "DELETE FROM scxa_cell_group_membership WHERE experiment_accession = '"$EXP_ID"'" | \
psql -v ON_ERROR_STOP=1 $dbConnection
echo "DELETE FROM scxa_cell_clusters WHERE experiment_accession = '"$EXP_ID"'" | \
psql -v ON_ERROR_STOP=1 $dbConnection

# Load data
print_log "Clusters: Loading data for $EXP_ID..."
set +e
printf "\copy scxa_cell_clusters (experiment_accession, cell_id, k, cluster_id) FROM '%s' DELIMITER ',' CSV HEADER;" $clustersToLoad | \
psql -v ON_ERROR_STOP=1 $dbConnection
s=$?

# Roll back if write was unsuccessful

if [ $s -ne 0 ]; then
echo "Clusters write failed" 1>&2
echo "DELETE FROM scxa_cell_clusters WHERE experiment_accession = '"$EXP_ID"'" | \
psql -v ON_ERROR_STOP=1 $dbConnection
exit 1
fi

# NEW LAYOUT: define clusterings as cell groups in the DB

Expand Down
6 changes: 3 additions & 3 deletions bin/load_db_scxa_dimred.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env bash

# This script takes the marker genes data, normally available in an irap
# sc_bundle, which is split in different files one per k_value (number of clusters)
# and loads it into the scxa_marker_genes table of AtlasProd.
# This script takes the dimension reduction coordinate data, normally available
# in an SCXA sc_bundle, which is split in different methods and
# parameterisations, and loads it into the scxa_coords table of AtlasProd.
set -e

# TODO this type of function should be loaded from a common set of scripts.
Expand Down
31 changes: 5 additions & 26 deletions bin/load_db_scxa_marker_genes.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#!/usr/bin/env bash

# This script takes the marker genes data, normally available in an irap
# sc_bundle, which is split in different files one per k_value (number of clusters)
# and loads it into the scxa_marker_genes table of AtlasProd.
# This script takes the marker genes data, normally available in an scxa
# sc_bundle, which is split across different files, one per k_value (number of
# clusters) or cell annotation type, and loads it into the
# scxa_cell_group_marker_genes table of AtlasProd.
set -e

# TODO this type of function should be loaded from a common set of scripts.
Expand Down Expand Up @@ -59,13 +60,6 @@ else
echo "WARNING No marker gene files declared on MANIFEST."
fi

print_log "## Loading Marker genes for $EXP_ID (old layout)."

# Delete marker gene table content for current EXP_ID
print_log "Marker genes: Delete rows for $EXP_ID:"
echo "DELETE FROM scxa_marker_genes WHERE experiment_accession = '"$EXP_ID"'" | \
psql -v ON_ERROR_STOP=1 $dbConnection

if [[ -z ${NUMBER_MGENES_FILES+x} || $NUMBER_MGENES_FILES -gt 0 ]]; then
# Create file with data
# Please note that this relies on:
Expand Down Expand Up @@ -106,22 +100,7 @@ if [[ -z ${NUMBER_MGENES_FILES+x} || $NUMBER_MGENES_FILES -gt 0 ]]; then
print_log "Marker genes: Loading data for $EXP_ID..."

set +e
printf "\copy scxa_marker_genes (experiment_accession, gene_id, k, cluster_id, marker_probability) FROM '%s' WITH (DELIMITER ',');" $markerGenesToLoad | \
psql -v ON_ERROR_STOP=1 $dbConnection

s=$?

# Roll back if write was unsuccessful

if [ $s -ne 0 ]; then
echo "Marker table write failed" 1>&2
echo "DELETE FROM scxa_marker_genes WHERE experiment_accession = '"$EXP_ID"'" | \
psql -v ON_ERROR_STOP=1 $dbConnection
exit 1
fi

print_log "## Marker genes (old layout): Loading done for $EXP_ID"
print_log "## Loading Marker genes for $EXP_ID (new layout)."
print_log "## Loading Marker genes for $EXP_ID."

# NEW LAYOUT: point at cell groups table, retrieving cell group integer IDs from there first

Expand Down
9 changes: 0 additions & 9 deletions bin/refresh_materialised_views.sh

This file was deleted.

8 changes: 4 additions & 4 deletions bin/reindex_tables.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@ set -e
psql -v ON_ERROR_STOP=1 $dbConnection <<EOF
SET maintenance_work_mem='2GB';
REINDEX TABLE scxa_coords;
REINDEX TABLE scxa_marker_genes;
REINDEX TABLE scxa_cell_clusters;
REINDEX TABLE scxa_cell_group;
REINDEX TABLE scxa_cell_group_membership;
REINDEX TABLE scxa_cell_group_marker_genes;
REINDEX TABLE scxa_cell_group_marker_gene_stats;
REINDEX TABLE experiment;
CLUSTER scxa_marker_genes USING scxa_marker_genes_experiment_accession_gene_id_k_cluster_id_pk;
CLUSTER scxa_cell_clusters USING scxa_cell_clusters_experiment_accession_cell_id_k_pk;
RESET maintenance_work_mem;
EOF
18 changes: 10 additions & 8 deletions tests/random-data-set.bats
Original file line number Diff line number Diff line change
Expand Up @@ -233,16 +233,17 @@

@test "Marker genes: Check number of loaded rows" {
# Get third line with count of total entries in the database after our load
count=$(echo "SELECT COUNT(*) FROM scxa_marker_genes where experiment_accession='TEST-EXP1'" | psql -v ON_ERROR_STOP=1 $dbConnection | awk 'NR==3')
count=$(echo "SELECT COUNT(*) FROM scxa_cell_group_marker_genes, scxa_cell_group where scxa_cell_group_marker_genes.cell_group_id = scxa_cell_group.id and experiment_accession='TEST-EXP1'" | psql -v ON_ERROR_STOP=1 $dbConnection | awk 'NR==3')
# TODO improve, highly dependent on test files we have, but in a hurry for now.
run [ $count -eq 274 ]
run [ $count -eq 330 ]
echo "count = $count"
echo "output = ${output}"
[ "$status" -eq 0 ]
}

@test "Marker genes: Check that k=12 was not loaded" {
# Get third line with count of total entries in the database after our load
count=$(echo "SELECT COUNT(*) FROM scxa_marker_genes WHERE k = 12" | psql -v ON_ERROR_STOP=1 $dbConnection | awk 'NR==3')
count=$(echo "SELECT COUNT(*) FROM scxa_cell_group_marker_genes, scxa_cell_group where scxa_cell_group_marker_genes.cell_group_id = scxa_cell_group.id and variable='12'" | psql -v ON_ERROR_STOP=1 $dbConnection | awk 'NR==3')
echo "Count: "$count
# TODO improve, highly dependent on test files we have, but in a hurry for now.
run [ $count -eq 0 ]
Expand Down Expand Up @@ -273,12 +274,13 @@
}

@test "Marker genes: Delete rows for experiment" {
countBefore=$(echo "SELECT COUNT(*) FROM scxa_marker_genes" | psql -v ON_ERROR_STOP=1 $dbConnection | awk 'NR==3')
countBefore=$(echo "SELECT COUNT(*) FROM scxa_cell_group_marker_genes, scxa_cell_group where scxa_cell_group_marker_genes.cell_group_id = scxa_cell_group.id" | psql -v ON_ERROR_STOP=1 $dbConnection | awk 'NR==3')
export EXP_ID=TEST-EXP2
run delete_db_scxa_marker_genes.sh
echo "output = ${output}"
[ "$status" -eq 0 ]
countAfter=$(echo "SELECT COUNT(*) FROM scxa_marker_genes" | psql -v ON_ERROR_STOP=1 $dbConnection | awk 'NR==3')
countAfter=$(echo "SELECT COUNT(*) FROM scxa_cell_group_marker_genes, scxa_cell_group where scxa_cell_group_marker_genes.cell_group_id = scxa_cell_group.id" | psql -v ON_ERROR_STOP=1 $dbConnection | awk 'NR==3')
echo "Count before: $countBefore , count after: $countAfter"
[ $(( countBefore - countAfter )) == 274 ]
}

Expand Down Expand Up @@ -353,9 +355,9 @@

@test "Clusters: Check number of loaded rows" {
# Get third line with count of total entries in the database after our load
count=$(echo "SELECT COUNT(*) FROM scxa_cell_clusters" | psql -v ON_ERROR_STOP=1 $dbConnection | awk 'NR==3')
count=$(echo "SELECT COUNT(*) FROM scxa_cell_group_membership" | psql -v ON_ERROR_STOP=1 $dbConnection | awk 'NR==3')
# TODO improve, highly dependent on test files we have, but in a hurry for now.
run [ $count -eq 12537 ]
run [ $count -eq 13930 ]
echo "output = ${output} count = $count"
[ "$status" -eq 0 ]
}
Expand All @@ -365,7 +367,7 @@
run delete_db_scxa_cell_clusters.sh
echo "output = ${output}"
[ "$status" -eq 0 ]
count=$(echo "SELECT COUNT(*) FROM scxa_cell_clusters WHERE experiment_accession = '"$EXP_ID"'" | psql -v ON_ERROR_STOP=1 $dbConnection | awk 'NR==3')
count=$(echo "SELECT COUNT(*) FROM scxa_cell_group_membership, scxa_cell_group WHERE scxa_cell_group_membership.cell_group_id=scxa_cell_group.id AND scxa_cell_group.experiment_accession = '"$EXP_ID"'" | psql -v ON_ERROR_STOP=1 $dbConnection | awk 'NR==3')
# TODO improve, highly dependent on test files we have, but in a hurry for now.
run [ $count -eq 0 ]
echo "output = ${output}"
Expand Down

0 comments on commit 1236753

Please sign in to comment.