In [None]:
import os

# Notebook parameters; the cells below derive every input path from these four values.
# Directory that holds the query genome file.
genome_dir = '../../gtdb-contam-dna'
# Output directory; its stage2/ subdirectory holds the match and alignment files read below.
output_dir = '../../output.gtdb-contam-dna'
# Directory searched (by accession prefix, `*.fna.gz`) for the matched genomes.
genbank_genomes = '../../genbank_genomes'
# File name of the query genome inside genome_dir; also used as the report's genome name.
name = 'GCF_001683825.1_genomic.fna.gz'

In [None]:
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
import importlib
import pprint
import json
import glob
import itertools

import charcoal.alignplot
importlib.reload(charcoal.alignplot)

from charcoal import alignplot
from charcoal.alignplot import AlignmentContainer, StackedDotPlot, AlignmentSlopeDiagram
from charcoal import utils

In [None]:
# Derive the concrete input file locations from the notebook parameters.
genomebase = name

# The query genome lives directly inside the genome directory.
queryfile = '{}/{}'.format(genome_dir, genomebase)

# Per-genome match summary, read from the stage2 output directory.
matches_info_file = '{}/stage2/{}.matches.json'.format(output_dir, genomebase)

In [None]:
from IPython.display import Markdown as md
from IPython.display import display
# Report title rendered as Markdown; being the cell's last expression, it becomes the cell output.
md(f"# Charcoal alignment report for genome `{genomebase}`")

In [None]:
# Load the match summary for the query genome.
with open(matches_info_file, 'rt') as fp:
    matches_info = json.load(fp)

# Query-level metadata recorded alongside the matches.
query_info = matches_info['query_info']
genome_lin = utils.make_lineage(query_info['genome_lineage'])
match_rank = query_info['match_rank']
scaled = query_info['scaled']

# Split the matches by their recorded 'match_type'. Each class is kept both as a
# list of (accession, lineage, counts) tuples and as a dict keyed by accession.
# Entries whose match_type is neither 'clean' nor 'dirty' are skipped.
clean_accs = []
clean_accs_d = {}
dirty_accs = []
dirty_accs_d = {}
for match_acc, acc_info in matches_info['matches'].items():
    match_counts = acc_info['counts']
    match_type = acc_info['match_type']
    match_lineage = acc_info['lineage']

    if match_type == 'clean':
        clean_accs.append((match_acc, match_lineage, match_counts))
        clean_accs_d[match_acc] = (match_lineage, match_counts)
    elif match_type == 'dirty':
        dirty_accs.append((match_acc, match_lineage, match_counts))
        dirty_accs_d[match_acc] = (match_lineage, match_counts)

# Order both lists by counts, largest first.
clean_accs.sort(key=lambda x: -x[2])
dirty_accs.sort(key=lambda x: -x[2])

In [None]:
def load_target_pairs(match_list, genomes_dir=None):
    """Pair each matched accession with its genome file on disk.

    Parameters
    ----------
    match_list : iterable of 3-tuples
        Tuples whose first element is the accession; the other two elements
        are ignored here.
    genomes_dir : str, optional
        Directory to search. Defaults to the notebook-level ``genbank_genomes``.

    Returns
    -------
    list of (accession, filename) tuples, in the order of ``match_list``.

    Raises
    ------
    FileNotFoundError
        If no file matches ``{genomes_dir}/{accession}*.fna.gz``.
    """
    if genomes_dir is None:
        genomes_dir = genbank_genomes

    pairs = []
    for acc, _, _ in match_list:
        pattern = f'{genomes_dir}/{acc}*.fna.gz'
        # glob returns matches in arbitrary order; sorting makes the choice
        # reproducible when more than one file shares the accession prefix.
        candidates = sorted(glob.glob(pattern))
        if not candidates:
            raise FileNotFoundError(
                f'no genome file found for accession {acc!r} (pattern {pattern!r})'
            )
        # Multiple matches are tolerated; the first one in sorted order is used.
        pairs.append((acc, candidates[0]))

    return pairs

# Resolve genome files for both match classes: the dirty (contaminant) matches
# first, then the clean ones.
contaminant_pairs, clean_pairs = (
    load_target_pairs(acc_list) for acc_list in (dirty_accs, clean_accs)
)

In [None]:
# Container for the alignments between the query genome and the dirty targets.
dirty_alignment = AlignmentContainer(
    genomebase,
    queryfile,
    contaminant_pairs,
    f'{output_dir}/stage2/hitlist-accessions.info.csv',
)

# One mashmap alignment file per target accession, read from the stage2 output.
# NOTE(review): this goes through the container's private `_read_mashmap` reader.
results = {
    target_acc: dirty_alignment._read_mashmap(
        f'{output_dir}/stage2/{genomebase}.x.{target_acc}.mashmap.align'
    )
    for target_acc, _ in contaminant_pairs
}
dirty_alignment.results = results

# Announce the thresholds, then apply them to the container.
display(md('filtering dirty alignments to query size >= 500 and identity >= 95%'))
dirty_alignment.filter(query_size=0.5, pident=95)

# Total over all values returned by calc_shared(), reported in kb.
dirty_shared = dirty_alignment.calc_shared()
sum_dirty_kb = sum(dirty_shared.values())
display(md(f'**dirty bases: {sum_dirty_kb:.1f}kb of alignments to query genome, across all targets.**'))

In [None]:
# Container for the alignments between the query genome and the clean targets.
clean_alignment = AlignmentContainer(
    genomebase,
    queryfile,
    clean_pairs,
    f'{output_dir}/stage2/hitlist-accessions.info.csv',
)

# One mashmap alignment file per target accession, read from the stage2 output.
# NOTE(review): this goes through the container's private `_read_mashmap` reader.
results = {
    target_acc: clean_alignment._read_mashmap(
        f'{output_dir}/stage2/{genomebase}.x.{target_acc}.mashmap.align'
    )
    for target_acc, _ in clean_pairs
}
clean_alignment.results = results

# Announce the thresholds, then apply them to the container.
display(md('filtering clean alignments to query size >= 500 and identity >= 95%'))
clean_alignment.filter(query_size=0.5, pident=95)

# Total over all values returned by calc_shared(), reported in kb.
clean_shared = clean_alignment.calc_shared()
sum_clean_kb = sum(clean_shared.values())
display(md(f'**clean bases: {sum_clean_kb:.1f}kb of alignments to query genome, across all targets.**'))

In [None]:
# Apply filter_by_query_coverage(0.5) to both containers. Each name is rebound to the
# returned container, so every later cell sees only the coverage-filtered alignments.
# NOTE(review): 0.5 is presumably a minimum covered fraction per query contig — the
# summary cell further down describes this step as "min query coverage 50%"; confirm.
dirty_alignment = dirty_alignment.filter_by_query_coverage(0.5)
clean_alignment = clean_alignment.filter_by_query_coverage(0.5)

In [None]:
# --- Contig sizes -----------------------------------------------------------
# all_sizes:      contig name -> size, for the query genome and every matched genome
# contigs_by_acc: accession   -> that genome's {contig name: size} mapping
# contigs_to_acc: contig name -> accession of the matched genome it belongs to
contigs_by_acc = {}
contigs_to_acc = {}
all_sizes = {}
all_sizes.update(alignplot.load_contig_sizes(queryfile))

# Reuse the (accession, filename) pairs resolved earlier rather than globbing the
# genome directory a second time; this keeps the file choice consistent with the
# alignment containers. Clean genomes first, then dirty ones.
for acc, filename in itertools.chain(clean_pairs, contaminant_pairs):
    sizes = alignplot.load_contig_sizes(filename)
    all_sizes.update(sizes)
    contigs_by_acc[acc] = sizes
    for contig_name in sizes:
        # The reverse lookup is only valid if contig names are unique across genomes.
        assert contig_name not in contigs_to_acc, \
            f'contig {contig_name!r} appears in more than one matched genome'
        contigs_to_acc[contig_name] = acc

# --- Dirty alignments, rebuilt from the stage2 files --------------------------
dirty_alignment = AlignmentContainer(genomebase, queryfile, contaminant_pairs, f'{output_dir}/stage2/hitlist-accessions.info.csv')

results = {}
for t_acc, _ in contaminant_pairs:
    mashmap_file = f'{output_dir}/stage2/{genomebase}.x.{t_acc}.mashmap.align'
    results[t_acc] = dirty_alignment._read_mashmap(mashmap_file)
dirty_alignment.results = results

# --- Summary ------------------------------------------------------------------
output = []

output.append('filtering dirty alignments to query size >= 500 and identity >= 95%')
dirty_alignment.filter(query_size=0.5, pident=95)
output.append('filtering dirty alignments to min query coverage 50%')
dirty_alignment = dirty_alignment.filter_by_query_coverage(0.5)

# calc_shared() is evaluated once; its keys index all_sizes (query contig names) and
# its values are summed for the "dirty bases" total.
query_shared = dirty_alignment.calc_shared()
sum_dirty_kb = sum(query_shared.values())
output.append(f'**dirty bases: {sum_dirty_kb:.1f}kb of alignments to query genome, across all targets.**')

# Every contig that still carries a dirty alignment is counted in full.
# The per-region breakdown is produced by the contig alignment report cell.
sum_to_remove = sum(all_sizes[contig] for contig, _ in query_shared.items())
output.append(f'**removing {sum_to_remove:.0f}kb total in contigs >= 50% dirty, based on alignments at 95% identity over 0.5kb or more**')

display(md("\n\n".join(output)))


## Alignment views

In [None]:
# Draw the clean alignments as a stacked dot plot followed by a slope diagram;
# when the container is falsy, show a short notice instead.
if not clean_alignment:
    display(md("**no alignments to clean genomes to show!**"))
else:
    display(md(f"showing {len(clean_alignment)} clean alignments total."))

    # Both figures share the same size and title.
    clean_figsize = (10, 8)
    clean_title = 'Alignments to clean genomes'

    clean_dotplot = StackedDotPlot(clean_alignment)
    fig = clean_dotplot.plot()
    fig.set_size_inches(*clean_figsize)
    _ = plt.title(clean_title)

    clean_slope = AlignmentSlopeDiagram(clean_alignment)
    fig = clean_slope.plot()
    fig.set_size_inches(*clean_figsize)
    _ = plt.title(clean_title)

In [None]:
# Draw the dirty alignments as a stacked dot plot followed by a slope diagram;
# when the container is falsy, show a short notice instead.
if dirty_alignment:
    display(md(f"showing {len(dirty_alignment)} dirty alignments total."))

    dirty_dotplot = StackedDotPlot(dirty_alignment)
    fig = dirty_dotplot.plot()
    fig.set_size_inches(10, 8)
    _ = plt.title('Alignments to dirty genomes')

    dirty_slope = AlignmentSlopeDiagram(dirty_alignment)
    fig = dirty_slope.plot()
    fig.set_size_inches(10, 8)
    _ = plt.title('Alignments to dirty genomes')
else:
    # This cell reports on the dirty container, so the notice must say "dirty".
    display(md("**no alignments to dirty genomes to show!**"))

## Contig alignment report

In [None]:
# Build the per-contig Markdown report: one entry per query contig that has dirty
# alignments, with one nested entry per aligned region.
output = []

sum_dirty_kb = sum(dirty_alignment.calc_shared().values())
output.append(f'**dirty bases: {sum_dirty_kb:.1f}kb of alignments to query genome, across all targets.**')

# Flatten the per-target region lists, then regroup the regions by query contig.
all_regions = [
    region
    for target_regions in dirty_alignment.results.values()
    for region in target_regions
]
regions_by_query = alignplot.group_regions_by(all_regions, 'query')
query_shared = dirty_alignment.calc_shared()

sum_to_remove = 0
for contig, covered_kb in query_shared.items():
    contig_kb = all_sizes[contig]
    sum_to_remove += contig_kb
    pct_covered = covered_kb / contig_kb * 100
    output.append(f'\ncontig name {contig}, total size {contig_kb:.0f}kb, of which {covered_kb:.0f}kb ({pct_covered:.0f}%) aligns to other genomes with a taxonomic mismatch.')

    for region in regions_by_query[contig]:
        # Trace the aligned target contig back to its genome and that genome's lineage.
        source_acc = contigs_to_acc[region.target]
        source_lin = utils.make_lineage(dirty_accs_d[source_acc][0])
        query_aligned = alignplot.region_size(region, 'query')

        # Rank at which the query and database lineages part ways, and the name
        # each lineage has at that rank.
        disagree_rank = utils.find_disagree_rank(genome_lin, source_lin)
        query_at_rank = utils.pop_to_rank(genome_lin, disagree_rank)[-1].name
        source_at_rank = utils.pop_to_rank(source_lin, disagree_rank)[-1].name

        # Region coordinates are multiplied by 1000 for display.
        q_start, q_end = int(region.qstart*1000), int(region.qend*1000)
        t_start, t_end = int(region.tstart*1000), int(region.tend*1000)

        output.extend([
            f'\n * {query_aligned:.0f}kb aligns to genome `{source_acc}`, contig `{region.target}` at {region.pident:.1f}% identity',
            f'\n   * alignment from query `{region.query}[{q_start}:{q_end}]` to database `{region.target}[{t_start}:{t_end}]`',
            f"   * disagreement at rank '{disagree_rank}'; query lineage `{query_at_rank}`, database lineage `{source_at_rank}`",
        ])

display(md("\n".join(output)))
