In [127]:
import os
import subprocess
import itertools

from Bio import SeqIO


In [128]:
def get_read_ids(fasta_filename):
    """Return the id of every record in a FASTA file, in file order.

    Args:
        fasta_filename: Path of the FASTA file to read.

    Returns:
        list: The ``id`` attribute of each record produced by
        ``SeqIO.FastaIO.FastaIterator`` for that file.
    """
    collected_ids = []
    with open(fasta_filename) as fasta_handle:
        # Consume the iterator while the file is still open.
        for fasta_record in SeqIO.FastaIO.FastaIterator(fasta_handle):
            collected_ids.append(fasta_record.id)
    return collected_ids

    
def get_read_id_w_params(pathogen, similarity_threshold):
    """Return the read ids in the compressed FASTA for one pathogen/threshold pair.

    The file is addressed by a relative name of the form
    ``nt_compressed_<similarity_threshold>_<pathogen>.fa``.
    """
    return get_read_ids(f"nt_compressed_{similarity_threshold}_{pathogen}.fa")


def _read_group_key(read_id):
    """Return the grouping key of a read id (ex: "0.99-1" => "99")."""
    # str.strip("0.") treats its argument as a set of characters and trims
    # them from BOTH ends (so "0.90" would collapse to "9"); only the literal
    # "0." prefix should be removed.
    if read_id.startswith("0."):
        read_id = read_id[2:]
    return read_id.split("-")[0]


def get_output_read_groupings(pathogen, similarity_threshold):
    """Print the read ids of a compressed FASTA, grouped by their key.

    The key of an id is the text before the first "-", with a leading "0."
    removed (ex: "0.99-1" => "99"). Ids without that shape are their own key.

    Args:
        pathogen: Pathogen label used in the compressed file name.
        similarity_threshold: Threshold used in the compressed file name.

    Returns:
        None. One line per group is printed as ``<key> <list of read ids>``,
        in order of each key's first appearance in the file.
    """
    read_ids = get_read_id_w_params(pathogen, similarity_threshold)

    # Collect into a dict rather than itertools.groupby: groupby only merges
    # adjacent items, so ids that are not sorted by key would be split into
    # several groups with the same key.
    groups = {}
    for read_id in read_ids:
        groups.setdefault(_read_group_key(read_id), []).append(read_id)

    for key, members in groups.items():
        print(key, members)


def compare_reads_before_and_after_compression(before, after):
    """Return the read ids present in ``before`` but missing from ``after``.

    Args:
        before: Path of the FASTA file read first (the uncompressed input).
        after: Path of the FASTA file to compare against (the compressed output).

    Returns:
        list: Read ids from ``before`` that do not occur in ``after``, in the
        order (and with the multiplicity) they have in ``before``.
    """
    reads_before_compression = get_read_ids(before)
    # A set makes each membership test constant-time instead of scanning the
    # whole "after" list for every read.
    reads_after_compression = set(get_read_ids(after))

    return [read for read in reads_before_compression if read not in reads_after_compression]



In [129]:
def compare_params(pathogen, uncompressed_fn, similarity_threshold, scaled=1000, ksize=31):
    """Run ncbi-compress once and print the read ids that dropped out.

    The compressed FASTA is written to the relative path
    ``nt_compressed_<similarity_threshold>_<pathogen>.fa`` and then compared
    against ``uncompressed_fn``.

    Args:
        pathogen: Label used in the compressed output file name.
        uncompressed_fn: Path of the input FASTA, passed as ``--input-fasta``.
        similarity_threshold: Value passed as ``--similarity-threshold``.
        scaled: Value passed as ``--scaled``.
        ksize: Value passed as ``--k``.

    Returns:
        None. On success the list of dropped read ids is printed. If the
        command cannot be started or exits non-zero, a failure message
        (including the captured stderr) is printed instead.
    """
    compressed_fn = f"nt_compressed_{similarity_threshold}_{pathogen}.fa"
    taxids_to_drop = ["9606"]
    mapping_test_data_dir = "test_data/accession2taxid"
    mapping_files = [
        os.path.join(mapping_test_data_dir, "nucl_wgs.accession2taxid.subset"),
        os.path.join(mapping_test_data_dir, "nucl_gb.accession2taxid.subset"),
        os.path.join(mapping_test_data_dir, "pdb.accession2taxid.subset"),
    ]

    # Build an argument list instead of a shell string so that file names
    # containing spaces or shell metacharacters are passed through verbatim.
    command = ["./target/release/ncbi-compress", "--input-fasta", str(uncompressed_fn)]
    for mapping_file in mapping_files:
        command += ["--accession-mapping-files", mapping_file]
    command += [
        # Only one id is configured, so the separator has no effect today.
        # NOTE(review): confirm the delimiter ncbi-compress expects before
        # adding more ids.
        "--taxids-to-drop", ",".join(taxids_to_drop),
        "--output-fasta", compressed_fn,
        "--k", str(ksize),
        "--scaled", str(scaled),
        "--similarity-threshold", str(similarity_threshold),
    ]

    try:
        completed_process = subprocess.run(command, capture_output=True)
    except OSError as error:
        # Without a shell, a missing or non-executable binary raises here
        # rather than yielding a non-zero exit status.
        print(f"could not start ncbi-compress: {error}")
        return

    if completed_process.returncode != 0:
        # Report the failure instead of discarding it silently.
        stderr_text = completed_process.stderr.decode(errors="replace").strip()
        print(f"ncbi-compress failed with exit status {completed_process.returncode}: {stderr_text}")
        return

    dropped_read_ids = compare_reads_before_and_after_compression(uncompressed_fn, compressed_fn)
    print(dropped_read_ids)
    
    
def log_params(pathogen, uncompressed_fn, similarity_thresholds=(0.6, 0.5, 0.4, 0.3), scaled=1000, ksize=31):
    """Run the compression comparison once per similarity threshold.

    For each threshold a ``threshold: <value>`` line is printed, followed by
    whatever ``compare_params`` prints for that run.

    Args:
        pathogen: Pathogen label forwarded to ``compare_params``.
        uncompressed_fn: Path of the uncompressed FASTA forwarded to
            ``compare_params``.
        similarity_thresholds: Thresholds to try, in order.
        scaled: ``scaled`` value used for every run. Other values that had
            been listed for a wider sweep: 100, 1000, 10000.
        ksize: ``ksize`` value used for every run. Other values that had been
            listed for a wider sweep: 21, 31, 51.

    Returns:
        None.
    """
    print(f"starting compression comparison results for {pathogen}")
    for threshold in similarity_thresholds:
        print(f"threshold: {threshold}")
        compare_params(pathogen, uncompressed_fn, threshold, scaled, ksize=ksize)
        
    


In [130]:
# Rhinovirus: run the compression comparison on the simulated rhinovirus
# sequences and print the read ids dropped at each similarity threshold.
log_params("rhinovirus", "test_data/simulated_seqs/all_simulated_seqs_rhinovirus_sorted.fasta")


starting compression comparison results for rhinovirus
threshold: 0.6
['0.99-3', '0.99-4', '0.99-5', '0.99-6', '0.99-7', '0.99-8', '0.99-9']
threshold: 0.5
['0.97-2', '0.99-10', '0.99-3', '0.99-4', '0.99-5', '0.99-7', '0.99-8', '0.99-9']
threshold: 0.4
['0.97-2', '0.99-10', '0.99-2', '0.99-3', '0.99-4', '0.99-5', '0.99-7', '0.99-8', '0.99-9']
threshold: 0.3
['0.97-10', '0.97-2', '0.97-5', '0.97-6', '0.99-10', '0.99-2', '0.99-3', '0.99-4', '0.99-5', '0.99-6', '0.99-7', '0.99-8', '0.99-9']


In [119]:
# Show the grouped read ids remaining in the compressed rhinovirus output
# for two similarity thresholds, with a divider line between the runs.
threshold_runs = [("0.4", 0.4), ("0.5", 0.5)]
for run_index, (label, threshold) in enumerate(threshold_runs):
    if run_index:
        print("----------------------")
    print(label)
    get_output_read_groupings("rhinovirus", threshold)

0.4
8 ['0.8-1', '0.8-10', '0.8-2', '0.8-3', '0.8-4', '0.8-5', '0.8-6', '0.8-7', '0.8-8', '0.8-9']
85 ['0.85-1', '0.85-10', '0.85-2', '0.85-3', '0.85-4', '0.85-5', '0.85-6', '0.85-7', '0.85-8', '0.85-9']
9 ['0.9-1', '0.9-10', '0.9-2', '0.9-3', '0.9-4', '0.9-5', '0.9-6', '0.9-7', '0.9-8', '0.9-9']
92 ['0.92-1', '0.92-10', '0.92-2', '0.92-3', '0.92-4', '0.92-5', '0.92-6', '0.92-7', '0.92-8', '0.92-9']
95 ['0.95-1', '0.95-10', '0.95-2', '0.95-3', '0.95-4', '0.95-5', '0.95-6', '0.95-7', '0.95-8', '0.95-9']
97 ['0.97-1', '0.97-10', '0.97-3', '0.97-4', '0.97-5', '0.97-6', '0.97-7', '0.97-8', '0.97-9']
99 ['0.99-1', '0.99-6']
NC_001617.1 ['NC_001617.1']
----------------------
0.5
8 ['0.8-1', '0.8-10', '0.8-2', '0.8-3', '0.8-4', '0.8-5', '0.8-6', '0.8-7', '0.8-8', '0.8-9']
85 ['0.85-1', '0.85-10', '0.85-2', '0.85-3', '0.85-4', '0.85-5', '0.85-6', '0.85-7', '0.85-8', '0.85-9']
9 ['0.9-1', '0.9-10', '0.9-2', '0.9-3', '0.9-4', '0.9-5', '0.9-6', '0.9-7', '0.9-8', '0.9-9']
92 ['0.92-1', '0.92-10', '

In [101]:
# Streptococcus: run the compression comparison on the simulated streptococcus
# sequences and print the read ids dropped at each similarity threshold.
log_params("streptococcus", "test_data/simulated_seqs/all_simulated_seqs_streptococcus_sorted.fasta")

starting compression comparison results for streptococcus
threshold: 0.6
['NZ_UYIP01000001.1']
threshold: 0.5
['0.99-10', '0.99-2', '0.99-3', '0.99-4', '0.99-5', '0.99-6', '0.99-7', '0.99-8', '0.99-9', 'NZ_UYIP01000001.1']
threshold: 0.4
['0.99-10', '0.99-2', '0.99-3', '0.99-4', '0.99-5', '0.99-6', '0.99-7', '0.99-8', '0.99-9', 'NZ_UYIP01000001.1']
threshold: 0.3
['0.99-1', '0.99-10', '0.99-2', '0.99-3', '0.99-4', '0.99-5', '0.99-6', '0.99-7', '0.99-8', '0.99-9', 'NZ_UYIP01000001.1']


In [120]:
# Show the grouped read ids remaining in the compressed streptococcus output
# for two similarity thresholds, with a divider line between the runs.
threshold_runs = [("0.4", 0.4), ("0.5", 0.5)]
for run_index, (label, threshold) in enumerate(threshold_runs):
    if run_index:
        print("----------------------")
    print(label)
    get_output_read_groupings("streptococcus", threshold)

0.4
8 ['0.8-1', '0.8-10', '0.8-2', '0.8-3', '0.8-4', '0.8-5', '0.8-6', '0.8-7', '0.8-8', '0.8-9']
85 ['0.85-1', '0.85-10', '0.85-2', '0.85-3', '0.85-4', '0.85-5', '0.85-6', '0.85-7', '0.85-8', '0.85-9']
9 ['0.9-1', '0.9-10', '0.9-2', '0.9-3', '0.9-4', '0.9-5', '0.9-6', '0.9-7', '0.9-8', '0.9-9']
92 ['0.92-1', '0.92-10', '0.92-2', '0.92-3', '0.92-4', '0.92-5', '0.92-6', '0.92-7', '0.92-8', '0.92-9']
95 ['0.95-1', '0.95-10', '0.95-2', '0.95-3', '0.95-4', '0.95-5', '0.95-6', '0.95-7', '0.95-8', '0.95-9']
97 ['0.97-1', '0.97-10', '0.97-2', '0.97-3', '0.97-4', '0.97-5', '0.97-6', '0.97-7', '0.97-8', '0.97-9']
99 ['0.99-1']
----------------------
0.5
8 ['0.8-1', '0.8-10', '0.8-2', '0.8-3', '0.8-4', '0.8-5', '0.8-6', '0.8-7', '0.8-8', '0.8-9']
85 ['0.85-1', '0.85-10', '0.85-2', '0.85-3', '0.85-4', '0.85-5', '0.85-6', '0.85-7', '0.85-8', '0.85-9']
9 ['0.9-1', '0.9-10', '0.9-2', '0.9-3', '0.9-4', '0.9-5', '0.9-6', '0.9-7', '0.9-8', '0.9-9']
92 ['0.92-1', '0.92-10', '0.92-2', '0.92-3', '0.92-4',

In [103]:
# chkv: run the compression comparison on the simulated chkv sequences and
# print the read ids dropped at each similarity threshold.
log_params("chkv", "test_data/simulated_seqs/all_simulated_seqs_chkv_sorted.fasta")


starting compression comparison results for chkv
threshold: 0.6
['0.99-4', '0.99-8', '0.99-9']
threshold: 0.5
['0.99-2', '0.99-4', '0.99-5', '0.99-6', '0.99-7', '0.99-8', '0.99-9']
threshold: 0.4
['0.97-9', '0.99-10', '0.99-2', '0.99-4', '0.99-5', '0.99-6', '0.99-7', '0.99-8', '0.99-9']
threshold: 0.3
['0.97-2', '0.97-4', '0.97-9', '0.99-1', '0.99-10', '0.99-2', '0.99-3', '0.99-4', '0.99-5', '0.99-6', '0.99-7', '0.99-8', '0.99-9']


In [121]:
# Show the grouped read ids remaining in the compressed chkv output for two
# similarity thresholds, with a divider line between the runs.
threshold_runs = [("0.4", 0.4), ("0.5", 0.5)]
for run_index, (label, threshold) in enumerate(threshold_runs):
    if run_index:
        print("----------------------")
    print(label)
    get_output_read_groupings("chkv", threshold)

0.4
NC_004162.2 ['NC_004162.2']
8 ['0.8-1', '0.8-10', '0.8-2', '0.8-3', '0.8-4', '0.8-5', '0.8-6', '0.8-7', '0.8-8', '0.8-9']
85 ['0.85-1', '0.85-10', '0.85-2', '0.85-3', '0.85-4', '0.85-5', '0.85-6', '0.85-7', '0.85-8', '0.85-9']
9 ['0.9-1', '0.9-10', '0.9-2', '0.9-3', '0.9-4', '0.9-5', '0.9-6', '0.9-7', '0.9-8', '0.9-9']
92 ['0.92-1', '0.92-10', '0.92-2', '0.92-3', '0.92-4', '0.92-5', '0.92-6', '0.92-7', '0.92-8', '0.92-9']
95 ['0.95-1', '0.95-10', '0.95-2', '0.95-3', '0.95-4', '0.95-5', '0.95-6', '0.95-7', '0.95-8', '0.95-9']
97 ['0.97-1', '0.97-10', '0.97-2', '0.97-3', '0.97-4', '0.97-5', '0.97-6', '0.97-7', '0.97-8']
99 ['0.99-1', '0.99-3']
----------------------
0.5
NC_004162.2 ['NC_004162.2']
8 ['0.8-1', '0.8-10', '0.8-2', '0.8-3', '0.8-4', '0.8-5', '0.8-6', '0.8-7', '0.8-8', '0.8-9']
85 ['0.85-1', '0.85-10', '0.85-2', '0.85-3', '0.85-4', '0.85-5', '0.85-6', '0.85-7', '0.85-8', '0.85-9']
9 ['0.9-1', '0.9-10', '0.9-2', '0.9-3', '0.9-4', '0.9-5', '0.9-6', '0.9-7', '0.9-8', '0.9-9'

In [115]:
# Grouped read ids remaining in the chkv output compressed at similarity threshold 0.3.
get_output_read_groupings("chkv", 0.3)


NC_004162.2 ['NC_004162.2']
8 ['0.8-1', '0.8-10', '0.8-2', '0.8-3', '0.8-4', '0.8-5', '0.8-6', '0.8-7', '0.8-8', '0.8-9']
85 ['0.85-1', '0.85-10', '0.85-2', '0.85-3', '0.85-4', '0.85-5', '0.85-6', '0.85-7', '0.85-8', '0.85-9']
9 ['0.9-1', '0.9-10', '0.9-2', '0.9-3', '0.9-4', '0.9-5', '0.9-6', '0.9-7', '0.9-8', '0.9-9']
92 ['0.92-1', '0.92-10', '0.92-2', '0.92-3', '0.92-4', '0.92-5', '0.92-6', '0.92-7', '0.92-8', '0.92-9']
95 ['0.95-1', '0.95-10', '0.95-2', '0.95-3', '0.95-4', '0.95-5', '0.95-6', '0.95-7', '0.95-8', '0.95-9']
97 ['0.97-1', '0.97-10', '0.97-3', '0.97-5', '0.97-6', '0.97-7', '0.97-8']


In [126]:
# List the read ids found in the first FASTA but absent from the second;
# an empty list means none are missing.
# NOTE(review): the two paths use different roots ("test_data/" vs
# "tests/test_data/") — confirm both are intended.
compare_reads_before_and_after_compression(
    "test_data/test_output.fasta",
    "tests/test_data/expected_compression_results/nt_compressed_0.6_chkv.fa"
)

[]