## cluster the gut queries

Here, we cluster the gut queries by max containment, for display.

```
# start from an empty output directory
rm -fr gut-clust-queries2
mkdir gut-clust-queries2
# cluster the gut queries by max containment; outputs are written under the gut-clust-queries2/ prefix
../2022-sourmash-uniqify/sourmash-uniqify.py gut/gut-queries.zip --max-containment --merge --prefix gut-clust-queries2/

# put the clusters in as the right side
# (concatenates every per-cluster .sig into the single -nbhds.zip the notebook loads)
sourmash sig cat gut-clust-queries2/*.sig -o gut-clust-queries2/gut-clust-queries2-nbhds.zip

# put the original queries in as left side
# (hard link, so the notebook can find it under the <name>/<name>-queries.zip naming scheme)
ln gut/gut-queries.zip gut-clust-queries2/gut-clust-queries2-queries.zip

# link in metagenome
ln gut/p8808mo11.abundtrim.sig gut-clust-queries2/
```

In [3]:
# Directory (and filename prefix) holding the inputs and gather outputs for this run.
name = 'gut-clust-queries2'
# Metagenome accession; used to build the metagenome signature filename and the figure title.
acc = 'p8808mo11'
# k-mer size passed to `sourmash gather -k` and used when loading the metagenome signature.
ksize=31

In [4]:
# Input paths, all located inside the `name` directory.
# Metagenome signature that both gather runs are searched against.
metag_filename = f'{name}/{acc}.abundtrim.sig'
# Original query signatures (the "left side").
queries_filename = f'{name}/{name}-queries.zip'
# Clustered/neighborhood signatures (the "right side").
nbhds_filename = f'{name}/{name}-nbhds.zip'

In [5]:
import sourmash
import csv
import subprocess
import os
import collections

In [6]:
def get_ident(name):
    """Return the identifier portion of a signature name.

    The identifier is everything before the first space; a leading
    'nbhd:' marker on that token, if any, is removed.
    """
    prefix = 'nbhd:'
    ident = name.partition(' ')[0]
    return ident[len(prefix):] if ident.startswith(prefix) else ident

def load_gather(filename):
    """Read a gather CSV and index its rows by identifier.

    Returns a 3-tuple:
      * list of identifiers, in the order the rows appear in the file;
      * dict mapping identifier -> CSV row (as produced by csv.DictReader);
      * dict mapping identifier -> the first three space-separated tokens
        of the row's 'name' column, rejoined with single spaces.
    """
    with open(filename, newline="") as fp:
        gather_rows = list(csv.DictReader(fp))

    gather_keys = []
    gather_d = {}
    full_idents = {}
    for row in gather_rows:
        full_name = row['name']
        ident = get_ident(full_name)

        gather_keys.append(ident)
        gather_d[ident] = row
        # keep only a short, display-friendly version of the full name
        full_idents[ident] = " ".join(full_name.split(' ')[:3])

    return gather_keys, gather_d, full_idents

def _run_gather_cached(ksize, metag_filename, sigs, csv_out, log_out, what):
    """Run `sourmash gather` unless `csv_out` already exists; return the log text.

    The command's stdout and stderr are both captured into `log_out`, which
    is then read back and returned. `what` is only used in the "already
    exists" message.
    """
    # Shell-equivalent form of the command, printed for the record only.
    cmd = f"sourmash gather -k {ksize} {metag_filename} {sigs} -o {csv_out} --ignore-abundance >& {log_out}"
    print(cmd)

    if not os.path.exists(csv_out):
        # Run with an argument list and redirect from Python: `>&` is not
        # POSIX sh syntax (shell=True uses /bin/sh), and an argv list also
        # avoids breaking on filenames containing spaces or shell characters.
        argv = [
            "sourmash", "gather", "-k", str(ksize),
            metag_filename, sigs,
            "-o", csv_out,
            "--ignore-abundance",
        ]
        with open(log_out, "wb") as log_fp:
            subprocess.run(argv, stdout=log_fp, stderr=subprocess.STDOUT)
    else:
        print(f"** {csv_out} already exists; not rerunning sourmash gather on {what}")

    with open(log_out, 'rt') as fp:
        return fp.read()


def gather_and_load(name, metag_filename, query_sigs, nbhd_sigs, ksize=31):
    """Run (or reuse) gather of queries and neighborhoods against a metagenome.

    Parameters:
      name: directory in which the gather CSVs and captured logs are written.
      metag_filename: path to the metagenome signature.
      query_sigs: path to the original query signatures.
      nbhd_sigs: path to the neighborhood signatures.
      ksize: k-mer size passed to gather and used to select the metagenome
        signature.

    Each gather is skipped when its output CSV already exists; its captured
    log file is read either way.

    Returns a namedtuple 'augmented' with fields
    (metag, queries_d, nbhds_d, gather1_keys, gather1_d, gather2_keys,
    gather2_d, full_idents, stdout1, stdout2), where the `1` fields come from
    the query gather and the `2` fields from the neighborhood gather.

    Raises AssertionError if any of the three input files is missing.
    """
    assert os.path.exists(metag_filename), metag_filename
    assert os.path.exists(query_sigs), query_sigs
    assert os.path.exists(nbhd_sigs), nbhd_sigs

    query_out = f"{name}/queries.gather.csv"
    nbhd_out = f"{name}/nbhd.gather.csv"
    stdout1_out = f"{name}/queries.gather.out"
    stdout2_out = f"{name}/nbhd.gather.out"

    stdout1 = _run_gather_cached(ksize, metag_filename, query_sigs,
                                 query_out, stdout1_out, "queries")
    stdout2 = _run_gather_cached(ksize, metag_filename, nbhd_sigs,
                                 nbhd_out, stdout2_out, "nbhds")

    gather1_keys, gather1_d, full_idents = load_gather(query_out)
    gather2_keys, gather2_d, full_idents2 = load_gather(nbhd_out)

    # Fill in display names for identifiers that only show up in the
    # neighborhood gather, dropping the 'nbhd:' marker from the name.
    for k, v in full_idents2.items():
        if k not in full_idents:
            if v.startswith('nbhd:'):
                v = v[5:]
            full_idents[k] = v

    metag = sourmash.load_one_signature(metag_filename, ksize=ksize)

    # Index both signature collections by identifier.
    queries_d = {get_ident(ss.name): ss
                 for ss in sourmash.load_file_as_signatures(query_sigs)}
    nbhds_d = {get_ident(ss.name): ss
               for ss in sourmash.load_file_as_signatures(nbhd_sigs)}

    tup = collections.namedtuple('augmented', 'metag, queries_d, nbhds_d, gather1_keys, gather1_d, gather2_keys, gather2_d, full_idents, stdout1, stdout2')
    t = tup(metag, queries_d, nbhds_d, gather1_keys, gather1_d, gather2_keys, gather2_d, full_idents, stdout1, stdout2)

    return t
    


## Running the things

In [9]:
# Run (or reuse cached) gather results, then expose each field of the
# returned namedtuple as a notebook-level variable for the cells below.
t = gather_and_load(
    name,
    metag_filename,
    query_sigs=queries_filename,
    nbhd_sigs=nbhds_filename,
    ksize=ksize,
)
metag = t.metag
queries_d = t.queries_d
nbhds_d = t.nbhds_d
gather1_keys = t.gather1_keys
gather1_d = t.gather1_d
gather2_keys = t.gather2_keys
gather2_d = t.gather2_d
full_idents = t.full_idents
stdout1 = t.stdout1
stdout2 = t.stdout2

sourmash gather -k 31 gut-clust-queries2/p8808mo11.abundtrim.sig gut-clust-queries2/gut-clust-queries2-queries.zip -o gut-clust-queries2/queries.gather.csv --ignore-abundance >& gut-clust-queries2/queries.gather.out
** gut-clust-queries2/queries.gather.csv already exists; not rerunning sourmash gather on queries
sourmash gather -k 31 gut-clust-queries2/p8808mo11.abundtrim.sig gut-clust-queries2/gut-clust-queries2-nbhds.zip -o gut-clust-queries2/nbhd.gather.csv --ignore-abundance >& gut-clust-queries2/nbhd.gather.out
** gut-clust-queries2/nbhd.gather.csv already exists; not rerunning sourmash gather on nbhds


## gather output

In [10]:
# Captured stdout/stderr of the gather run against the original queries.
print(stdout1)



overlap     p_query p_match
---------   ------- -------
5.6 Mbp        2.0%   91.7%    CP048626.1 Blautia producta ATCC 2734...
5.5 Mbp        1.9%   99.6%    HF996869.1 Clostridium hathewayi CAG:...
5.0 Mbp        1.8%   99.6%    DABGPL010000001.1 TPA_asm: Escherichi...
4.6 Mbp        1.6%   83.6%    CZAB01000001.1 [Clostridium] clostrid...
4.2 Mbp        1.5%   66.2%    WTVF01000090.1 Enterocloster aldenens...
4.1 Mbp        1.4%   88.0%    GL834357.1 Clostridium symbiosum WAL-...
3.5 Mbp        1.2%   87.3%    AENW01000060.1 Clostridium sp. HGF2 c...
3.2 Mbp        1.1%   99.3%    JULC01000001.1 Bifidobacterium scardo...
3.0 Mbp        1.1%   98.2%    HF995415.1 Coprobacillus sp. CAG:183 ...
2.9 Mbp        1.0%   89.5%    SMCQ01000001.1 Longibaculum muris str...
2.6 Mbp        0.9%   73.3%    MARQ01000035.1 Clostridium perfringen...
2.5 Mbp        0.9%   74.6%    CVRR01000001.1 Roseburia faecis genom...
2.5 Mbp        0.9%   80.2%    NIHW01000001.1 [Ruminococcus] gnavus ...
2.4 Mb

In [11]:
# Captured stdout/stderr of the gather run against the neighborhoods.
print(stdout2)



overlap     p_query p_match
---------   ------- -------
5.6 Mbp        2.0%   91.7%    CP048626.1 Blautia producta ATCC 2734...
5.0 Mbp        1.8%   99.6%    DABGPL010000001.1 TPA_asm: Escherichi...
3.2 Mbp        1.1%   99.3%    JULC01000001.1 Bifidobacterium scardo...
2.9 Mbp        1.0%   89.6%    SMCQ01000001.1 Longibaculum muris str...
1.6 Mbp        0.5%   51.8%    HF995324.1 Firmicutes bacterium CAG:6...
0.9 Mbp        0.3%   27.9%    WMQE01000038.1 Turicibacter sanguinis...
0.8 Mbp        0.3%   37.7%    AEKO01000011.1 Streptococcus vestibul...
0.7 Mbp        0.2%   29.6%    JAABLG010000001.1 Streptococcus vesti...
0.6 Mbp        0.2%   32.5%    UQNI01000001.1 uncultured Ruminococca...
0.6 Mbp        0.2%   17.9%    QVGF01000001.1 Lachnospiraceae bacter...
0.6 Mbp        0.2%   16.2%    BAHW02000078.1 Clostridiales bacteriu...
0.5 Mbp        0.2%   15.6%    URPP01000001.1 uncultured Clostridial...
0.6 Mbp        0.2%   22.7%    CABHOC010000001.1 Bifidobacterium lon...
496.0 

## building mapping from query to neighborhood

In [12]:
# Walk the query matches in the order they appear in the gather CSV and
# assign each metagenome hash to the first query that contains it, so the
# per-query hash sets in unique_hashes_1x are mutually disjoint.
remaining_hashes = set(metag.minhash.hashes)
unique_hashes_1x = []
for ident in gather1_keys:
    match = queries_d[ident]
    match_hashes = set(match.minhash.hashes)
    unique_overlap = remaining_hashes & match_hashes
    # hashes claimed by this query are no longer available to later ones
    remaining_hashes -= unique_overlap
    unique_hashes_1x.append((ident, unique_overlap))


In [13]:
# Same assignment as for the queries, but over the neighborhood matches:
# each metagenome hash goes to the first neighborhood (in gather CSV order)
# that contains it, so the sets in unique_hashes_2x are mutually disjoint.
remaining_hashes = set(metag.minhash.hashes)
unique_hashes_2x = []
for ident in gather2_keys:
    match = nbhds_d[ident]
    match_hashes = set(match.minhash.hashes)
    unique_overlap = remaining_hashes & match_hashes
    # hashes claimed by this neighborhood are no longer available to later ones
    remaining_hashes -= unique_overlap
    unique_hashes_2x.append((ident, unique_overlap))


In [14]:
# For every neighborhood, list each query whose uniquely-assigned hashes
# intersect the neighborhood's uniquely-assigned hashes, with the count.
for nbhd_ident, nbhd_match in unique_hashes_2x:
    total = 0
    for query_ident, query_match in unique_hashes_1x:
        overlap = query_match & nbhd_match
        if not overlap:
            continue

        print(f"{nbhd_ident} <= {query_ident} - {len(overlap)}")
        total += len(overlap)

CP048626.1 <= CP048626.1 - 5634
DABGPL010000001.1 <= DABGPL010000001.1 - 5017
JULC01000001.1 <= WTVF01000090.1 - 1
JULC01000001.1 <= JULC01000001.1 - 3197
SMCQ01000001.1 <= HF995415.1 - 4
SMCQ01000001.1 <= SMCQ01000001.1 - 2861
HF995324.1 <= CZAB01000001.1 - 6
HF995324.1 <= WTVF01000090.1 - 1
HF995324.1 <= CABIYM010000001.1 - 1
HF995324.1 <= FMEP01000075.1 - 3
HF995324.1 <= CABJAX010000001.1 - 1
HF995324.1 <= HF995324.1 - 1546
WMQE01000038.1 <= WMQE01000038.1 - 903
AEKO01000011.1 <= USJS01000001.1 - 1
AEKO01000011.1 <= LR793273.1 - 219
AEKO01000011.1 <= JAABLG010000001.1 - 48
AEKO01000011.1 <= KV794391.1 - 2
AEKO01000011.1 <= AEKO01000011.1 - 480
JAABLG010000001.1 <= LR793273.1 - 97
JAABLG010000001.1 <= JAABLG010000001.1 - 525
UQNI01000001.1 <= CVRR01000001.1 - 1
UQNI01000001.1 <= FMEP01000075.1 - 1
UQNI01000001.1 <= UQNI01000001.1 - 604
QVGF01000001.1 <= HF996869.1 - 2
QVGF01000001.1 <= CZAB01000001.1 - 5
QVGF01000001.1 <= WTVF01000090.1 - 3
QVGF01000001.1 <= GL834357.1 - 4
QVGF010000

In [15]:
import plotly.graph_objects as go

def make_fig(max_num=None):
    """Build a Sankey figure linking original queries to neighborhoods.

    Reads the notebook-level `unique_hashes_1x` (queries), `unique_hashes_2x`
    (neighborhoods) and `full_idents` (identifier -> display label).

    Nodes are laid out as: one node per query, then a single "unassigned"
    node, then one node per neighborhood. A link is drawn from a query to a
    neighborhood when their hash sets intersect, with the link value being
    the number of shared hashes. Neighborhood hashes not covered by any
    query are drawn as a grey link from "unassigned".

    Parameters:
      max_num: if truthy, only the first `max_num` neighborhoods get links;
        otherwise all neighborhoods are processed.

    Returns the plotly `go.Figure`.
    """
    labels = []
    src_l = []
    dest_l = []
    cnt_l = []
    color_l = []
    label_l = []

    # Source (query) nodes occupy indices 0 .. len(unique_hashes_1x) - 1.
    source_idx = {}
    for n, (query_ident, _) in enumerate(unique_hashes_1x):
        source_idx[query_ident] = n
        labels.append(full_idents.get(query_ident, 'LOST: ' + query_ident))

    # One extra source node for hashes that no query accounts for.
    source_idx["unassigned"] = len(unique_hashes_1x)
    labels.append("unassigned")

    # Destination (neighborhood) nodes follow the "unassigned" node.
    base = len(unique_hashes_1x) + 1
    dest_idx = {}
    for n, (nbhd_ident, _) in enumerate(unique_hashes_2x):
        dest_idx[nbhd_ident] = base + n
        # Fall back to the neighborhood's own identifier (not the last
        # query's) when no display label is known.
        labels.append(full_idents.get(nbhd_ident, 'LOST: ' + nbhd_ident))

    # iterate over all sinks, account for all sources
    leftovers = []
    for num, (nbhd_ident, nbhd_match) in enumerate(unique_hashes_2x, start=1):
        total = 0
        for (query_ident, query_match) in unique_hashes_1x:
            overlap = query_match & nbhd_match
            if not overlap:
                continue

            total += len(overlap)
            src_l.append(source_idx[query_ident])
            dest_l.append(dest_idx[nbhd_ident])
            cnt_l.append(len(overlap))
            # green: query feeds its same-named neighborhood; red: a different one
            if query_ident == nbhd_ident:
                color_l.append("lightseagreen")
            else:
                color_l.append("palevioletred")
            label_l.append("")

        # Neighborhood hashes that no query contributed.
        leftover = len(nbhd_match) - total
        if leftover:
            leftovers.append((nbhd_ident, leftover))

        if max_num and num >= max_num:
            break

    # Leftover links are appended after all query links.
    for nbhd_ident, leftover in leftovers:
        src_l.append(source_idx["unassigned"])
        dest_l.append(dest_idx[nbhd_ident])
        cnt_l.append(leftover)
        color_l.append("grey")
        label_l.append("")

    fig = go.Figure(data=[go.Sankey(
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(color = "black", width = 0.5),
          label = labels,
          color = "blue"
        ),
        link = dict(
          source = src_l,
          target = dest_l,
          value = cnt_l,
          color = color_l,
          label = label_l,
      ))])

    return fig

In [17]:
# Number of top neighborhoods to include links for in the figure.
NUM = 20
fig = make_fig(NUM)

# Title, font and canvas size applied in a single layout update.
fig.update_layout(
    title_text=f"original genome contributions to top {NUM} neighborhood covers for {name} ({acc})",
    font_size=10,
    width=800,
    height=1000,
)
fig.show()