Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Converts the report json to a csv #200

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions workflows/short-read-mngs/postprocess.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,57 @@ task CombineTaxonCounts {
}
}

# Flattens the refined taxon-count JSON into a single CSV report, joining in
# human-readable taxon/genus/family names from the gzipped lineage CSV.
task OutputResultsCSV {
    input {
        String docker_image_id
        String s3_wd_uri
        File counts_json_file  # CombineTaxonCounts output: {"pipeline_output": {"taxon_counts_attributes": [...]}}
        File lineage_csv       # gzipped CSV with at least 'taxid' and 'tax_name' columns
    }
    command <<<
    set -euxo pipefail
    gunzip -c "~{lineage_csv}" > "lineage.csv"

    python3 <<CODE
    import json

    import pandas as pd

    # Load the per-taxon count records emitted by the counting steps.
    with open("~{counts_json_file}") as f:
        taxon_counts = json.load(f)["pipeline_output"]["taxon_counts_attributes"]
    df = pd.DataFrame(taxon_counts)
    # Drop the merged NT+NR rows and the columns not wanted in the CSV report.
    df = df.loc[df["count_type"] != "merged_NT_NR", ~df.columns.isin(["base_count", "source_count_type"])]
    # Build a taxid -> tax_name lookup. NOTE: read_csv(..., squeeze=True) is
    # deprecated (removed in pandas 2.0); selecting the column explicitly is
    # equivalent and works on all pandas versions.
    lineage = pd.read_csv("lineage.csv", index_col="taxid", usecols=["taxid", "tax_name"])["tax_name"].to_dict()
    # Map taxids to names, falling back to the raw taxid when no name is known
    # (e.g. the -200/-300 placeholder taxids).
    df["family"] = df.family_taxid.map(lambda x: lineage.get(int(x), x))
    df["genus"] = df.genus_taxid.map(lambda x: lineage.get(int(x), x))
    df["taxon_name"] = df.tax_id.map(lambda x: lineage.get(int(x), x))

    column_order = [
        "tax_id",
        "tax_level",
        "taxon_name",
        "genus",
        "family",
        "count_type",
        "count",
        "nonunique_count",
        "dcr",
        "percent_identity",
        "alignment_length",
        "e_value",
    ]
    df[column_order].sort_values(by=["count_type", "tax_level", "count"], ascending=False).to_csv("result.csv", index=False)
    CODE
    >>>
    output {
        File result_csv = "result.csv"
    }
    runtime {
        docker: docker_image_id
    }
}

task CombineJson {
input {
String docker_image_id
Expand Down Expand Up @@ -486,6 +537,8 @@ workflow czid_postprocess {
File lineage_db = "s3://czid-public-references/taxonomy/2021-01-22/taxid-lineages.db"
File taxon_blacklist = "s3://czid-public-references/taxonomy/2021-01-22/taxon_blacklist.txt"
File deuterostome_db = "s3://czid-public-references/taxonomy/2021-01-22/deuterostome_taxids.txt"
File lineage_csv = "s3://czid-public-references/ncbi-indexes-prod/2021-01-22/index-generation-2/versioned-taxid-lineages.csv.gz"
Boolean output_results_csv = false
Boolean use_deuterostome_filter = true
Boolean use_taxon_whitelist = false
Int min_contig_length = 100
Expand Down Expand Up @@ -608,6 +661,16 @@ workflow czid_postprocess {
]
}

# Optional flat-CSV report: only runs when the output_results_csv workflow
# input is set (it defaults to false), feeding the dcr-refined taxon counts
# and the lineage name table to OutputResultsCSV.
if (output_results_csv) {
call OutputResultsCSV {
input:
docker_image_id = docker_image_id,
s3_wd_uri = s3_wd_uri,
counts_json_file = CombineTaxonCounts.assembly_refined_taxon_counts_with_dcr_json,
lineage_csv = lineage_csv
}
}

call CombineJson {
input:
docker_image_id = docker_image_id,
Expand Down
3 changes: 2 additions & 1 deletion workflows/short-read-mngs/test/local_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,11 @@ non_host_alignment.accession2taxid_db: s3://czid-public-references/ncbi-indexes-
minimap2_local_db_path: s3://czid-public-references/ncbi-indexes-prod/2021-01-22/index-generation-2/nt
diamond_local_db_path: s3://czid-public-references/ncbi-indexes-prod/2021-01-22/index-generation-2/nr
diamond_args: "mid-sensitive"
postprocess.output_results_csv: true
postprocess.nt_db: s3://czid-public-references/ncbi-indexes-prod/2021-01-22/index-generation-2/nt
postprocess.nt_loc_db: s3://czid-public-references/ncbi-indexes-prod/2021-01-22/index-generation-2/nt_loc.marisa
postprocess.nr_db: s3://czid-public-references/ncbi-indexes-prod/2021-01-22/index-generation-2/nr
postprocess.nr_loc_db: s3://czid-public-references/ncbi-indexes-prod/2021-01-22/index-generation-2/nr_loc.marisa
experimental.nt_db: s3://czid-public-references/ncbi-indexes-prod/2021-01-22/index-generation-2/nt
experimental.nt_loc_db: s3://czid-public-references/ncbi-indexes-prod/2021-01-22/index-generation-2/nt_loc.marisa
experimental.nt_info_db: s3://czid-public-references/ncbi-indexes-prod/2021-01-22/index-generation-2/nt_info.marisa
2 changes: 1 addition & 1 deletion workflows/short-read-mngs/test/local_test_viral.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ host_filter.human_hisat2_index_tar: s3://czid-public-references/host_filter/ercc
host_filter.max_input_fragments: 9000
host_filter.max_subsample_fragments: 9000
non_host_alignment.accession2taxid_db: s3://czid-public-references/mini-database/alignment_indexes/2020-08-20-viral/viral_accessions2taxid.marisa
non_host_alignment.alignment_scalability: true
minimap2_local_db_path: s3://czid-public-references/test/viral-alignment-indexes/viral_nt
diamond_local_db_path: s3://czid-public-references/test/viral-alignment-indexes/viral_nr
diamond_args: "mid-sensitive"
postprocess.nt_db: s3://czid-public-references/test/viral-alignment-indexes/viral_nt
postprocess.output_results_csv: true
postprocess.nt_loc_db: s3://czid-public-references/test/viral-alignment-indexes/viral_nt_loc.marisa
postprocess.nr_db: s3://czid-public-references/test/viral-alignment-indexes/viral_nr
postprocess.nr_loc_db: s3://czid-public-references/test/viral-alignment-indexes/viral_nr_loc.marisa
Expand Down
Loading