## Objective

Download WGS sequencing data from the Human Microbiome Project (HMP).

## Materials, Methods, and Functions

In [47]:
# default parameters
@info "define_parameters"
# Directory that is scanned for the HMP manifest and manifest-metadata TSV files.
# NOTE(review): relative path, so it resolves against the current working
# directory — presumably the notebook's own folder; confirm before running elsewhere.
metadata_directory = "../../metadata/hmp_wgs_fastq"
# Directory where downloaded archives are written.
data_dir = "../../data/fastqs"

┌ Info: define_parameters
└ @ Main /workspaces/Mycelia/notebooks/scripts/02.download-hmp-wgs-data.ipynb:2


"../../data/fastqs"

In [48]:
@info "initialize"
# Create the download directory, including any missing parent directories.
mkpath(data_dir)

┌ Info: initialize
└ @ Main /workspaces/Mycelia/notebooks/scripts/02.download-hmp-wgs-data.ipynb:1


"../../data/fastqs"

In [49]:
@info "import libraries"
import Mycelia
import DataFrames
import uCSV
import ProgressMeter

┌ Info: import libraries
└ @ Main /workspaces/Mycelia/notebooks/scripts/02.download-hmp-wgs-data.ipynb:1


In [62]:
@info "define functions"
"""
    download_and_untar_hmp_data(tar_url; outdir=data_dir)

Download the archive at `tar_url` into `outdir` and extract it there.

Both steps are skipped when their output already exists, so the function can be
re-run safely after an interruption.

# Arguments
- `tar_url`: URL of a `.tar.bz2` archive; its basename is used as the local file name.

# Keywords
- `outdir`: directory that receives both the archive and the extracted contents.
  Defaults to the notebook-level `data_dir`.

# Returns
The path the extracted content is expected at: the local archive path with
`.tar.bz2` removed.
NOTE(review): this assumes each archive unpacks to a single entry named after
the archive — confirm against the HMP tarballs.
"""
function download_and_untar_hmp_data(tar_url; outdir=data_dir)
    mkpath(outdir)
    local_path = joinpath(outdir, basename(tar_url))
    if !isfile(local_path)
        # Download to a temporary name first so that an interrupted transfer
        # never leaves a truncated file that a later run would treat as complete.
        partial_path = local_path * ".part"
        download(tar_url, partial_path)
        mv(partial_path, local_path; force=true)
    end
    untarred_local_path = replace(local_path, ".tar.bz2" => "")
    # `ispath` (not `isfile`) so that an extracted *directory* also counts as done.
    if !ispath(untarred_local_path)
        # `-C` extracts next to the archive instead of into the current working
        # directory, which is where the existence check above looks.
        run(`tar -xf $local_path -C $outdir`)
    end
    return untarred_local_path
end

┌ Info: define functions
└ @ Main /workspaces/Mycelia/notebooks/scripts/02.download-hmp-wgs-data.ipynb:1


download_and_untar_hmp_data (generic function with 1 method)

In [50]:
# Pick the manifest TSV: the first directory entry whose name contains
# "hmp_manifest" but not "metadata".
manifest_file = first(
    filter(readdir(metadata_directory)) do entry
        occursin("hmp_manifest", entry) && !occursin("metadata", entry)
    end
)
# Parse the tab-delimited manifest (first row is the header) into a DataFrame.
manifest_table = DataFrames.DataFrame(
    uCSV.read(
        joinpath(metadata_directory, manifest_file);
        delim='\t',
        header=1,
        typedetectrows=100
    )...
)
# Keep only rows whose "urls" value does not mention "private" (case-insensitive).
filtered_manifest_table = manifest_table[[!occursin(r"private"i, urls) for urls in manifest_table[!, "urls"]], :]
show(filtered_manifest_table; allcols=true)

[1m6228×5 DataFrame[0m
[1m  Row [0m│[1m file_id                          [0m[1m md5                              [0m[1m size        [0m[1m urls                              [0m[1m sample_id                        [0m
[1m      [0m│[90m String                           [0m[90m String                           [0m[90m String      [0m[90m String                            [0m[90m String                           [0m
──────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    1 │ 54a24ca84a57a7d5b06687939f76371f  16cffb012e01bdb97dfe36f66e7af26d  3244481532   https://downloads.hmpdacc.org/da…  faab18e1a137731cffda88256053e2dd
    2 │ 54a24ca84a57a7d5b06687939f767d56  744a078665e03eb9eb829aeefb155602  29075030     https://downloads.hmpdacc.org/da…  faab18e1a137731cffda882560e6abcc
    3 │ 54a24ca84a57a7d5b06687939f765102  a5021b05a43d479be53ac9fb13c12b92  721587657 

In [51]:
# Pick the manifest-metadata TSV: the first directory entry whose name contains
# "hmp_manifest_metadata".
manifest_metadata_file = first(
    filter(readdir(metadata_directory)) do entry
        occursin("hmp_manifest_metadata", entry)
    end
)
# Parse the tab-delimited metadata file (first row is the header) into a DataFrame.
manifest_metadata_table = DataFrames.DataFrame(
    uCSV.read(
        joinpath(metadata_directory, manifest_metadata_file);
        delim='\t',
        header=1,
        typedetectrows=100
    )...
)
show(manifest_metadata_table; allcols=true)

[1m7355×9 DataFrame[0m
[1m  Row [0m│[1m sample_id                        [0m[1m subject_id [0m[1m subject_uuid                     [0m[1m sample_body_site       [0m[1m visit_number [0m[1m subject_gender [0m[1m subject_race [0m[1m study_full_name                   [0m[1m project_name                      [0m
[1m      [0m│[90m String                           [0m[90m String     [0m[90m String                           [0m[90m String                 [0m[90m Int64        [0m[90m String         [0m[90m String       [0m[90m String                            [0m[90m String                            [0m
──────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    1 │ 634416dfd8b630dcc6bba6aec4ac19c7  ZK112BX     88af6472fb03642dd5eaf8cddc379db4  feces                             12  fe

In [52]:
# Attach the sample metadata to each retained manifest row (matched on
# "sample_id"), then drop exact duplicate rows.
joint_manifest_table =
    DataFrames.innerjoin(manifest_metadata_table, filtered_manifest_table; on="sample_id") |>
    unique
show(joint_manifest_table; allcols=true)

[1m6228×13 DataFrame[0m
[1m  Row [0m│[1m sample_id                        [0m[1m subject_id [0m[1m subject_uuid                     [0m[1m sample_body_site           [0m[1m visit_number [0m[1m subject_gender [0m[1m subject_race [0m[1m study_full_name                   [0m[1m project_name                   [0m[1m file_id                          [0m[1m md5                              [0m[1m size        [0m[1m urls                              [0m
[1m      [0m│[90m String                           [0m[90m String     [0m[90m String                           [0m[90m String                     [0m[90m Int64        [0m[90m String         [0m[90m String       [0m[90m String                            [0m[90m String                         [0m[90m String                           [0m[90m String                           [0m[90m String      [0m[90m String                            [0m
──────┼────────────────────────────────────────────

In [59]:
# Each "urls" value is a comma-separated list; keep only its first entry, as a String.
tar_urls = map(joint_manifest_table[!, "urls"]) do url_list
    string(first(split(url_list, ",")))
end

6228-element Vector{String}:
 "https://downloads.hmpdacc.org/d" ⋯ 43 bytes ⋯ "sm/v1/454/SRS019030_454.tar.bz2"
 "https://downloads.hmpdacc.org/d" ⋯ 36 bytes ⋯ "is/hmwgsqc/v2/SRS019062.tar.bz2"
 "https://downloads.hmpdacc.org/d" ⋯ 36 bytes ⋯ "is/hmwgsqc/v2/SRS019030.tar.bz2"
 "https://downloads.hmpdacc.org/d" ⋯ 36 bytes ⋯ "is/hmwgsqc/v1/SRS019030.tar.bz2"
 "https://downloads.hmpdacc.org/d" ⋯ 43 bytes ⋯ "sm/v1/454/SRS078197_454.tar.bz2"
 "https://downloads.hmpdacc.org/d" ⋯ 36 bytes ⋯ "is/hmwgsqc/v2/SRS078197.tar.bz2"
 "https://downloads.hmpdacc.org/d" ⋯ 36 bytes ⋯ "is/hmwgsqc/v2/SRS078241.tar.bz2"
 "https://downloads.hmpdacc.org/d" ⋯ 43 bytes ⋯ "sm/v1/454/SRS058723_454.tar.bz2"
 "https://downloads.hmpdacc.org/d" ⋯ 36 bytes ⋯ "is/hmwgsqc/v2/SRS047069.tar.bz2"
 "https://downloads.hmpdacc.org/d" ⋯ 36 bytes ⋯ "is/hmwgsqc/v2/SRS058723.tar.bz2"
 ⋮
 "https://downloads.hmpdacc.org/d" ⋯ 41 bytes ⋯ "/not_affected/SRS072422.tar.bz2"
 "https://downloads.hmpdacc.org/d" ⋯ 37 bytes ⋯ "rynx/affected/SRS

In [63]:
# Fetch and extract only the first 10 archives.
# `first(tar_urls, 10)` yields at most 10 elements, so this does not throw a
# BoundsError when the manifest contains fewer than 10 URLs (unlike `tar_urls[1:10]`).
for tar_url in first(tar_urls, 10)
    download_and_untar_hmp_data(tar_url)
end

"../../data/fastqs/SRS019030_454.tar.bz2"

## Experimental/Simulated Observations

N/A

## Analysis, Statistics, and Visualizations

N/A

## Summary of Results

N/A

## Conclusions and Future Directions

N/A