## cluster the gut queries

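Here, we cluster the gut queries by max containment, for display. The shell commands below build the `gut-clust-queries/` directory that the rest of this notebook reads from.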

```
rm -fr gut-clust-queries
mkdir gut-clust-queries
../2022-sourmash-uniqify/sourmash-uniqify.py gut/gut-queries.zip --max-containment --merge --prefix gut-clust-queries/

# put the cluster sigs in as the left side
sourmash sig cat gut-clust-queries/*.sig -o gut-clust-queries/gut-clust-queries-queries.zip

# put the original _neighborhoods_ in as right side
ln gut/gut-nbhds.zip gut-clust-queries/gut-clust-queries-nbhds.zip

# link in metagenome
ln gut/p8808mo11.abundtrim.sig gut-clust-queries/
```

In [1]:
# Directory (and filename prefix) holding the clustered query inputs.
name = 'gut-clust-queries'
# Accession of the metagenome signature analysed in this notebook.
acc = 'p8808mo11'
# k-mer size passed to `sourmash gather` and used when loading the metagenome.
ksize = 31

In [2]:
# Input paths, all located inside the `name` directory.
# Metagenome signature for accession `acc`.
metag_filename = f'{name}/{acc}.abundtrim.sig'
# Query signatures ("left side" of the comparison).
queries_filename = f'{name}/{name}-queries.zip'
# Neighborhood signatures ("right side" of the comparison).
nbhds_filename = f'{name}/{name}-nbhds.zip'

In [3]:
import sourmash
import csv
import subprocess
import os
import collections

In [31]:
def get_ident(name):
    "Return the first space-separated token of `name`, minus a leading 'nbhd:' if present."
    marker = 'nbhd:'
    first_token = name.split(' ')[0]
    # Neighborhood names carry a marker in front of the accession; drop it so
    # query and neighborhood names share the same key.
    return first_token[len(marker):] if first_token.startswith(marker) else first_token

def load_gather(filename):
    """Load a gather CSV and index its rows by identifier.

    Returns a 3-tuple:
      * list of identifiers, one per row, in file order;
      * dict mapping identifier -> the CSV row (as returned by csv.DictReader);
      * dict mapping identifier -> the first three space-separated words of
        the row's 'name' column, used as a short display name.

    If two rows yield the same identifier, the later row wins in both dicts
    while the identifier appears twice in the list.
    """
    with open(filename, newline="") as csv_fp:
        records = list(csv.DictReader(csv_fp))

    ordered_idents = []
    row_by_ident = {}
    short_names = {}
    for record in records:
        full_name = record['name']
        key = get_ident(full_name)
        ordered_idents.append(key)
        row_by_ident[key] = record
        short_names[key] = " ".join(full_name.split(' ')[:3])

    return ordered_idents, row_by_ident, short_names

def _run_gather_cached(cmd, csv_out, log_out, what):
    "Run shell command `cmd` unless `csv_out` already exists; return the text of `log_out`."
    print(cmd)
    if not os.path.exists(csv_out):
        result = subprocess.run(cmd, shell=True)
        if result.returncode != 0:
            # Don't raise here (keeps the previous best-effort behaviour), but
            # make the failure visible before the CSV load below trips on it.
            print(f"** sourmash gather on {what} exited with status {result.returncode}; see {log_out}")
    else:
        print(f"** {csv_out} already exists; not rerunning sourmash gather on {what}")

    with open(log_out, 'rt') as fp:
        return fp.read()


def _index_by_ident(sigs):
    "Map get_ident(sig.name) -> signature; later duplicates replace earlier ones."
    return {get_ident(ss.name): ss for ss in sigs}


def gather_and_load(name, metag_filename, query_sigs, nbhd_sigs, ksize=31):
    """Run `sourmash gather` for queries and neighborhoods, then load everything.

    For each of `query_sigs` and `nbhd_sigs`, gather is run against
    `metag_filename` with results written under the `name` directory
    (`queries.gather.csv` / `nbhd.gather.csv`, with the command's combined
    stdout+stderr in the matching `.gather.out` file).  A gather run is skipped
    when its CSV already exists; the `.out` file from the earlier run is then
    read as-is and must still be present.

    Parameters:
        name: output directory for the CSV and log files.
        metag_filename: path to the metagenome signature.
        query_sigs: path to the query signatures.
        nbhd_sigs: path to the neighborhood signatures.
        ksize: k-mer size passed to gather and used to load the metagenome.

    Returns:
        An 'augmented' namedtuple with fields
        (metag, queries_d, nbhds_d, gather1_keys, gather1_d, gather2_keys,
        gather2_d, full_idents, stdout1, stdout2), where the `*_d` signature
        dicts are keyed by get_ident() of the signature name, the gather
        keys/dicts come from load_gather(), and `full_idents` merges the short
        display names from both CSVs (query names take precedence).

    Raises:
        AssertionError: if any of the three input paths does not exist.
    """
    assert os.path.exists(metag_filename)
    assert os.path.exists(query_sigs)
    assert os.path.exists(nbhd_sigs)

    query_out = f"{name}/queries.gather.csv"
    nbhd_out = f"{name}/nbhd.gather.csv"
    stdout1_out = f"{name}/queries.gather.out"
    stdout2_out = f"{name}/nbhd.gather.out"

    # shell=True runs the command through /bin/sh, so use the POSIX spelling
    # `> file 2>&1`; the bash/csh shorthand `>& file` is rejected by e.g. dash.
    cmd = f"sourmash gather -k {ksize} {metag_filename} {query_sigs} -o {query_out} --ignore-abundance > {stdout1_out} 2>&1"
    stdout1 = _run_gather_cached(cmd, query_out, stdout1_out, "queries")

    cmd = f"sourmash gather -k {ksize} {metag_filename} {nbhd_sigs} -o {nbhd_out} --ignore-abundance > {stdout2_out} 2>&1"
    stdout2 = _run_gather_cached(cmd, nbhd_out, stdout2_out, "nbhds")

    gather1_keys, gather1_d, full_idents = load_gather(query_out)
    gather2_keys, gather2_d, full_idents2 = load_gather(nbhd_out)
    # Add display names for idents that only show up in the neighborhood
    # results, without the 'nbhd:' marker.
    for k, v in full_idents2.items():
        if k not in full_idents:
            if v.startswith('nbhd:'):
                v = v[5:]
            full_idents[k] = v

    metag = sourmash.load_one_signature(metag_filename, ksize=ksize)

    queries_d = _index_by_ident(sourmash.load_file_as_signatures(query_sigs))
    nbhds_d = _index_by_ident(sourmash.load_file_as_signatures(nbhd_sigs))

    tup = collections.namedtuple('augmented', 'metag, queries_d, nbhds_d, gather1_keys, gather1_d, gather2_keys, gather2_d, full_idents, stdout1, stdout2')
    t = tup(metag, queries_d, nbhds_d, gather1_keys, gather1_d, gather2_keys, gather2_d, full_idents, stdout1, stdout2)

    return t
    


## Running gather and loading the results

In [32]:
# Run (or reuse cached) gather results and load all signatures.
t = gather_and_load(
    name,
    metag_filename,
    query_sigs=queries_filename,
    nbhd_sigs=nbhds_filename,
    ksize=ksize,
)
# Unpack the result into notebook-level names used by the cells below.
(metag, queries_d, nbhds_d,
 gather1_keys, gather1_d,
 gather2_keys, gather2_d,
 full_idents, stdout1, stdout2) = t

sourmash gather -k 31 gut-clust-queries/p8808mo11.abundtrim.sig gut-clust-queries/gut-clust-queries-queries.zip -o gut-clust-queries/queries.gather.csv --ignore-abundance >& gut-clust-queries/queries.gather.out
** gut-clust-queries/queries.gather.csv already exists; not rerunning sourmash gather on queries
sourmash gather -k 31 gut-clust-queries/p8808mo11.abundtrim.sig gut-clust-queries/gut-clust-queries-nbhds.zip -o gut-clust-queries/nbhd.gather.csv --ignore-abundance >& gut-clust-queries/nbhd.gather.out
** gut-clust-queries/nbhd.gather.csv already exists; not rerunning sourmash gather on nbhds


## gather output

In [33]:
# Show the saved `sourmash gather` log for the query signatures.
print(stdout1)



overlap     p_query p_match
---------   ------- -------
5.6 Mbp        2.0%   91.7%    CP048626.1 Blautia producta ATCC 2734...
5.0 Mbp        1.8%   99.6%    DABGPL010000001.1 TPA_asm: Escherichi...
3.2 Mbp        1.1%   99.3%    JULC01000001.1 Bifidobacterium scardo...
2.9 Mbp        1.0%   89.6%    SMCQ01000001.1 Longibaculum muris str...
1.6 Mbp        0.5%   51.8%    HF995324.1 Firmicutes bacterium CAG:6...
0.9 Mbp        0.3%   27.9%    WMQE01000038.1 Turicibacter sanguinis...
0.8 Mbp        0.3%   37.7%    AEKO01000011.1 Streptococcus vestibul...
0.7 Mbp        0.2%   29.6%    JAABLG010000001.1 Streptococcus vesti...
0.6 Mbp        0.2%   32.5%    UQNI01000001.1 uncultured Ruminococca...
0.6 Mbp        0.2%   17.9%    QVGF01000001.1 Lachnospiraceae bacter...
0.6 Mbp        0.2%   16.2%    BAHW02000078.1 Clostridiales bacteriu...
0.5 Mbp        0.2%   15.6%    URPP01000001.1 uncultured Clostridial...
0.6 Mbp        0.2%   22.7%    CABHOC010000001.1 Bifidobacterium lon...
496.0 

In [34]:
# Show the saved `sourmash gather` log for the neighborhood signatures.
print(stdout2)



overlap     p_query p_match
---------   ------- -------
9.5 Mbp        3.3%  100.0%    nbhd:CVRR01000001.1 Roseburia faecis ...
8.7 Mbp        3.0%   99.5%    nbhd:NIHW01000001.1 [Ruminococcus] gn...
7.8 Mbp        2.7%  100.0%    nbhd:DABGPL010000001.1 TPA_asm: Esche...
7.6 Mbp        2.7%   99.2%    nbhd:CP048626.1 Blautia producta ATCC...
7.6 Mbp        2.6%   97.8%    nbhd:CZAB01000001.1 [Clostridium] clo...
7.4 Mbp        2.6%   99.6%    nbhd:AENW01000060.1 Clostridium sp. H...
6.8 Mbp        2.4%   99.1%    nbhd:HF996869.1 Clostridium hathewayi...
5.8 Mbp        2.0%   96.4%    nbhd:GL834357.1 Clostridium symbiosum...
5.6 Mbp        1.8%   90.7%    nbhd:WTVF01000090.1 Enterocloster ald...
4.8 Mbp        1.7%   98.6%    nbhd:JH599901.1 Coprobacillus sp. 8_2...
4.1 Mbp        1.4%   99.8%    nbhd:JULC01000001.1 Bifidobacterium s...
4.0 Mbp        1.3%   95.4%    nbhd:CABJAX010000001.1 Lachnospiracea...
3.8 Mbp        1.3%   96.0%    nbhd:JH590866.1 Lachnospiraceae bacte...
3.6 Mb

## building mapping from query to neighborhood

In [35]:
# Walk the query gather results in CSV row order.  Each query is credited only
# with the metagenome hashes that no earlier query in the list already claimed,
# so the per-query hash sets in `unique_hashes_1x` are mutually disjoint.
remaining_hashes = set(metag.minhash.hashes)
unique_hashes_1x = []
for ident in gather1_keys:
    match = queries_d[ident]
    match_hashes = set(match.minhash.hashes)
    unique_overlap = remaining_hashes & match_hashes
    # Remove the claimed hashes so later queries cannot be credited with them.
    remaining_hashes -= unique_overlap
    unique_hashes_1x.append((ident, unique_overlap))


In [36]:
# Walk the neighborhood gather results in CSV row order, starting again from
# the full set of metagenome hashes.  Each neighborhood is credited only with
# the hashes that no earlier neighborhood in the list already claimed.
remaining_hashes = set(metag.minhash.hashes)
unique_hashes_2x = []
for ident in gather2_keys:
    match = nbhds_d[ident]
    match_hashes = set(match.minhash.hashes)
    unique_overlap = remaining_hashes & match_hashes
    # Remove the claimed hashes so later neighborhoods cannot be credited with them.
    remaining_hashes -= unique_overlap
    unique_hashes_2x.append((ident, unique_overlap))


In [37]:
# For every neighborhood, list each query whose uniquely-assigned hashes
# intersect the neighborhood's uniquely-assigned hashes, with the shared count.
for nbhd_ident, nbhd_match in unique_hashes_2x:
    for query_ident, query_match in unique_hashes_1x:
        overlap = query_match & nbhd_match
        if overlap:
            print(f"{nbhd_ident} <= {query_ident} - {len(overlap)}")

CVRR01000001.1 <= CP048626.1 - 5
CVRR01000001.1 <= HF995324.1 - 9
CVRR01000001.1 <= AEKO01000011.1 - 1
CVRR01000001.1 <= UQNI01000001.1 - 1
CVRR01000001.1 <= QVGF01000001.1 - 4
CVRR01000001.1 <= BAHW02000078.1 - 1
CVRR01000001.1 <= CABHOC010000001.1 - 1
CVRR01000001.1 <= PDMO01000909.1 - 112
CVRR01000001.1 <= QUIC01000001.1 - 14
CVRR01000001.1 <= WKYR01000010.1 - 1
NIHW01000001.1 <= CP048626.1 - 6
NIHW01000001.1 <= SMCQ01000001.1 - 1
NIHW01000001.1 <= HF995324.1 - 7
NIHW01000001.1 <= QVGF01000001.1 - 8
NIHW01000001.1 <= BAHW02000078.1 - 6
NIHW01000001.1 <= CABHOC010000001.1 - 1
NIHW01000001.1 <= CP047191.1 - 1
NIHW01000001.1 <= QUIC01000001.1 - 3
NIHW01000001.1 <= JNHJ01000001.1 - 1
DABGPL010000001.1 <= CP048626.1 - 2
DABGPL010000001.1 <= DABGPL010000001.1 - 5017
DABGPL010000001.1 <= PDMO01000909.1 - 1
CP048626.1 <= CP048626.1 - 5621
CP048626.1 <= HF995324.1 - 3
CP048626.1 <= QVGF01000001.1 - 9
CP048626.1 <= BAHW02000078.1 - 1
CZAB01000001.1 <= JULC01000001.1 - 1
CZAB01000001.1 <= HF99

In [38]:
import plotly.graph_objects as go

def make_fig(max_num=None):
    """Build a Sankey figure linking query genomes to neighborhoods.

    Reads three notebook-level names: `unique_hashes_1x` and
    `unique_hashes_2x` (lists of `(ident, hash_set)` pairs for queries and
    neighborhoods respectively) and `full_idents` (ident -> display name).

    Nodes are laid out as: one per query, then a single "unassigned" node,
    then one per neighborhood.  A link is drawn from a query to a neighborhood
    for every non-empty intersection of their hash sets, weighted by the
    number of shared hashes; it is coloured "lightseagreen" when both sides
    have the same ident and "palevioletred" otherwise.  Neighborhood hashes
    not accounted for by any query are drawn as a grey link from "unassigned".

    Parameters:
        max_num: if truthy, only the first `max_num` neighborhoods receive
            links (all neighborhoods still get a node label).  None or 0
            means no limit.

    Returns:
        The plotly `go.Figure` wrapping a single `go.Sankey` trace.
    """
    def display_name(ident):
        # Name the ident that is actually missing from full_idents, for
        # queries and neighborhoods alike.
        return full_idents.get(ident, 'LOST: ' + ident)

    # --- nodes: [queries..., "unassigned", neighborhoods...] ---
    labels = []
    source_idx = {}
    for n, (query_ident, _) in enumerate(unique_hashes_1x):
        source_idx[query_ident] = n
        labels.append(display_name(query_ident))

    source_idx["unassigned"] = len(unique_hashes_1x)
    labels.append("unassigned")

    base = len(unique_hashes_1x) + 1
    dest_idx = {}
    for n, (nbhd_ident, _) in enumerate(unique_hashes_2x):
        dest_idx[nbhd_ident] = base + n
        labels.append(display_name(nbhd_ident))

    # --- links: iterate over all sinks, account for all sources ---
    src_l = []
    dest_l = []
    cnt_l = []
    color_l = []
    leftovers = []
    for num, (nbhd_ident, nbhd_match) in enumerate(unique_hashes_2x, start=1):
        to_idx = dest_idx[nbhd_ident]
        total = 0
        for query_ident, query_match in unique_hashes_1x:
            overlap = query_match & nbhd_match
            if not overlap:
                continue

            total += len(overlap)
            src_l.append(source_idx[query_ident])
            dest_l.append(to_idx)
            cnt_l.append(len(overlap))
            if query_ident == nbhd_ident:
                color_l.append("lightseagreen")
            else:
                color_l.append("palevioletred")

        # Whatever no query accounts for is attributed to "unassigned" below.
        leftover = len(nbhd_match) - total
        if leftover:
            leftovers.append((nbhd_ident, leftover))

        if max_num and num >= max_num:
            break

    # The grey "unassigned" links go after all query links.
    for nbhd_ident, leftover in leftovers:
        src_l.append(source_idx["unassigned"])
        dest_l.append(dest_idx[nbhd_ident])
        cnt_l.append(leftover)
        color_l.append("grey")

    # Every link carries an empty hover label.
    label_l = [""] * len(src_l)

    fig = go.Figure(data=[go.Sankey(
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(color = "black", width = 0.5),
          label = labels,
          color = "blue"
        ),
        link = dict(
          source = src_l,
          target = dest_l,
          value = cnt_l,
          color = color_l,
          label = label_l,
      ))])

    return fig

In [39]:
# Only the first NUM neighborhoods get links in the figure.
NUM = 9
fig = make_fig(NUM)

fig.update_layout(
    title_text=f"original genome contributions to top {NUM} neighborhood covers for {name} ({acc})",
    font_size=10,
    width=800,
    height=1000,
)
fig.show()