In [None]:
# Limit how many DataFrame rows are rendered in notebook output (default = 25).
# The number of rendered columns can be changed the same way through
# ENV["DATAFRAMES_COLUMNS"] (default = 100).
ENV["DATAFRAMES_ROWS"] = 3

import Pkg

# Packages this notebook depends on; each one is installed and then imported by name.
pkgs = [
    "ProgressMeter",
    "uCSV",
    "DataFrames",
    "JSON",
]
Pkg.add(pkgs)
for pkg in pkgs
    # Interpolate the package name as a Symbol into an `import` expression rather than
    # assembling source text and parsing it at runtime.
    @eval import $(Symbol(pkg))
end

In [None]:
# The project root is the parent of the directory the notebook is run from.
base_directory = pwd() |> dirname

In [None]:
# Metadata files for this analysis live under <base>/metadata/exposome.
metadata_directory = joinpath(joinpath(base_directory, "metadata"), "exposome")

In [None]:
# Path to the biosample_result.xml file inside the metadata directory; this is the
# XML input that gets converted to JSON and parsed into a table further down.
biosample_xml_file = joinpath(metadata_directory, "biosample_result.xml")

In [None]:
# Convert the BioSample XML file to JSON with the external `yq` tool and parse the result.
# `read(cmd, String)` waits for the process to finish and throws if it exits with an
# error, so the process is never left open and a failed conversion is not silently
# parsed as partial output.
biosample_json = JSON.parse(
    read(`yq --input-format "xml" --output-format "json" $(biosample_xml_file)`, String)
)
# top two levels only have the single entry, 3rd level is all of the samples
biosample_json = biosample_json["BioSampleSet"]["BioSample"]
biosample_table = DataFrames.DataFrame(biosample_json)
# drop the columns that are not helpful downstream
biosample_table = biosample_table[!, DataFrames.Not(["+id", "Status", "Models", "Description", "Package", "Owner"])]

# Flatten the nested "Ids" column into plain identifier columns.
id_values = map(x -> x["Id"], biosample_table[!, "Ids"])
# first dictionary is SAMN biosample id, which is already present
biosample_table[!, "Sample name"] = map(x -> x[2]["+content"], id_values)
biosample_table[!, "SRA identifier"] = map(x -> x[3]["+content"], id_values)
biosample_table = biosample_table[!, DataFrames.Not("Ids")]

# Flatten the nested "Links" column down to the label of its single link.
biosample_table[!, "Bioproject identifier"] = map(x -> x["Link"]["+label"], biosample_table[!, "Links"])
biosample_table = biosample_table[!, DataFrames.Not("Links")]

# Build one row of attribute name => value per sample. `cols=:union` lets samples
# contribute different attribute sets; columns a sample lacks are filled with `missing`.
attributes_table = DataFrames.DataFrame()
for sample_attributes in biosample_table[!, "Attributes"]
    # only has one entry at this level
    attribute_entries = sample_attributes["Attribute"]
    # extract key information from each sub-attribute
    attribute_row = Dict(
        entry["+attribute_name"] => entry["+content"] for entry in attribute_entries
    )
    push!(attributes_table, attribute_row, cols=:union)
end

# attributes_table has one row per sample, in the same order as biosample_table, so the
# attribute columns can be appended side by side before dropping the nested column.
biosample_table = hcat(biosample_table, attributes_table)
biosample_table = biosample_table[!, DataFrames.Not("Attributes")]

In [None]:
# Save the flattened BioSample table next to its source XML, using a ".parsed.csv" suffix.
biosample_parsed_csv_file = string(biosample_xml_file, ".parsed.csv")
uCSV.write(biosample_parsed_csv_file, biosample_table)

In [None]:
# Combine the three metadata sources (sra_result.csv, SraRunInfo.csv and the parsed
# BioSample table) into a single table and write it out as a TSV.
sra_result_table = DataFrames.DataFrame(
    uCSV.read(joinpath(metadata_directory, "sra_result.csv"); header=1, quotes='"')...
)
sra_run_info_table = DataFrames.DataFrame(
    uCSV.read(joinpath(metadata_directory, "SraRunInfo.csv"); header=1)...
)

# Step 1: match the SRA result rows to the SRA run info rows. Every column pair below
# must agree for a row to be kept (inner join).
joint_metadata_table = DataFrames.innerjoin(
    sra_result_table,
    sra_run_info_table;
    on=[
        "Experiment Accession" => "Experiment",
        "Organism Name" => "ScientificName",
        "Instrument" => "Model",
        "Study Accession" => "SRAStudy",
        "Sample Accession" => "Sample",
        "Library Name" => "SampleName",
        "Library Strategy" => "LibraryStrategy",
        "Library Source" => "LibrarySource",
        "Library Selection" => "LibrarySelection",
    ],
)

# Step 2: attach the BioSample columns, again keeping only rows matched on every key.
joint_metadata_table = DataFrames.innerjoin(
    joint_metadata_table,
    biosample_table;
    on=[
        "Library Name" => "Sample name",
        "BioSample" => "+accession",
        "Sample Accession" => "SRA identifier",
        "BioProject" => "Bioproject identifier",
    ],
)

# Persist the combined table as tab-separated values in the metadata directory.
joint_metadata_file = joinpath(metadata_directory, "joint_sample_metadata.tsv")
uCSV.write(joint_metadata_file, joint_metadata_table; delim='\t')