## cluster the gut nbhds

(TODO: this needs more work to understand what's going on - possibly a bug?)

Here, we cluster the gut neighborhoods by maximum containment.

```
# start from an empty output directory
rm -fr gut-clust-nbhds
mkdir gut-clust-nbhds
# cluster the gut neighborhood signatures (max containment, merged);
# output files are written under the gut-clust-nbhds/ prefix
../2022-sourmash-uniqify/sourmash-uniqify.py gut/gut-nbhds.zip --max-containment --merge --prefix gut-clust-nbhds/

# put the cluster nbhds in as the right side:
# combine the per-cluster .sig files into a single zip collection
sourmash sig cat gut-clust-nbhds/*.sig -o gut-clust-nbhds/gut-clust-nbhds-nbhds.zip

# put the original _neighborhoods_ in as left side
# (hard link, so the original zip is reused under the "queries" name)
ln gut/gut-nbhds.zip gut-clust-nbhds/gut-clust-nbhds-queries.zip

# link in metagenome (hard link into the output directory)
ln gut/p8808mo11.abundtrim.sig gut-clust-nbhds
```

In [22]:
# k-mer size used for gather and for loading the metagenome signature
ksize = 31
# accession of the metagenome; its signature file is named after it
acc = 'p8808mo11'
# working directory / dataset prefix that all input and output files live under
name = 'gut-clust-nbhds'

In [23]:
# Input paths, all located inside the `name` directory.
# signature collection used as the "queries" side
queries_filename = f'{name}/{name}-queries.zip'
# signature collection used as the "nbhds" side
nbhds_filename = f'{name}/{name}-nbhds.zip'
# metagenome signature, named after the accession
metag_filename = f'{name}/{acc}.abundtrim.sig'

In [24]:
import sourmash
import csv
import subprocess
import os
import collections

In [25]:
def get_ident(name):
    "Return the first space-delimited token of `name`, minus a leading 'nbhd:' if present."
    prefix = 'nbhd:'
    # everything before the first space (the whole string if there is no space)
    ident, _, _ = name.partition(' ')
    return ident[len(prefix):] if ident.startswith(prefix) else ident

def load_gather(filename):
    """Read a gather CSV and index its rows by short identifier.

    Returns a 3-tuple ``(gather_keys, gather_d, full_idents)``:

    - ``gather_keys``: identifiers (via ``get_ident`` on the ``name`` column),
      one per row, in file order;
    - ``gather_d``: identifier -> the row dict from ``csv.DictReader``
      (a later row with the same identifier replaces an earlier one);
    - ``full_idents``: identifier -> the first three space-separated words
      of the row's ``name`` column.
    """
    with open(filename, newline="") as fp:
        gather_rows = list(csv.DictReader(fp))

    gather_keys = []
    gather_d = {}
    full_idents = {}
    for row in gather_rows:
        full_name = row['name']
        ident = get_ident(full_name)
        gather_keys.append(ident)
        gather_d[ident] = row
        full_idents[ident] = " ".join(full_name.split(' ')[:3])

    return gather_keys, gather_d, full_idents

def _run_gather_cached(ksize, metag_filename, db_filename, csv_out, log_out, what):
    """Run `sourmash gather` unless `csv_out` already exists; return the log text.

    The command is executed without a shell (argument list), with stdout and
    stderr both written to `log_out`. This avoids the bash-only `>&`
    redirection, which fails when `shell=True` resolves to a POSIX `sh` such
    as dash, and it also keeps filenames containing spaces intact.

    Raises FileNotFoundError if gather ran but `csv_out` was not produced.
    Returns "" (after printing a note) if the cached CSV exists but the log
    file does not.
    """
    argv = [
        "sourmash", "gather", "-k", str(ksize),
        metag_filename, db_filename,
        "-o", csv_out, "--ignore-abundance",
    ]
    # show a shell-equivalent form of what is (or would be) run
    print(" ".join(argv) + f" > {log_out} 2>&1")

    if os.path.exists(csv_out):
        print(f"** {csv_out} already exists; not rerunning sourmash gather on {what}")
    else:
        with open(log_out, "wt") as log_fp:
            result = subprocess.run(argv, stdout=log_fp, stderr=subprocess.STDOUT)
        if not os.path.exists(csv_out):
            # fail here, pointing at the log, rather than later with a bare
            # "no such file" when the CSV is loaded
            raise FileNotFoundError(
                f"sourmash gather on {what} exited with status {result.returncode} "
                f"and did not write {csv_out}; see {log_out}"
            )

    if not os.path.exists(log_out):
        print(f"** {log_out} not found; no gather log available for {what}")
        return ""
    with open(log_out, "rt") as log_fp:
        return log_fp.read()


def gather_and_load(name, metag_filename, query_sigs, nbhd_sigs, ksize=31):
    """Run (or reuse) two gather searches of a metagenome and load everything.

    Parameters:
    - name: directory in which the gather CSV and log files are written.
    - metag_filename: path to the metagenome signature file.
    - query_sigs: path to the query signature collection.
    - nbhd_sigs: path to the neighborhood signature collection.
    - ksize: k-mer size, used for gather and for loading all signatures.

    `sourmash gather` is run once against `query_sigs` and once against
    `nbhd_sigs`; a run is skipped when its output CSV already exists.

    Returns a namedtuple ``augmented`` with fields, in order:
    metag, queries_d, nbhds_d, gather1_keys, gather1_d, gather2_keys,
    gather2_d, full_idents, stdout1, stdout2.
    ``queries_d`` / ``nbhds_d`` map ``get_ident(sig.name)`` to the signature,
    the ``gather*`` values come from ``load_gather``, ``full_idents`` comes
    from the query gather CSV only, and ``stdout1`` / ``stdout2`` hold the
    text of the two gather logs.

    Raises AssertionError if any input file is missing.
    """
    for path in (metag_filename, query_sigs, nbhd_sigs):
        assert os.path.exists(path), f"input file not found: {path}"

    query_out = f"{name}/queries.gather.csv"
    nbhd_out = f"{name}/nbhd.gather.csv"
    stdout1_out = f"{name}/queries.gather.out"
    stdout2_out = f"{name}/nbhd.gather.out"

    stdout1 = _run_gather_cached(ksize, metag_filename, query_sigs,
                                 query_out, stdout1_out, "queries")
    stdout2 = _run_gather_cached(ksize, metag_filename, nbhd_sigs,
                                 nbhd_out, stdout2_out, "nbhds")

    gather1_keys, gather1_d, full_idents = load_gather(query_out)
    gather2_keys, gather2_d, _ = load_gather(nbhd_out)

    metag = sourmash.load_one_signature(metag_filename, ksize=ksize)

    # Select the same ksize that gather and the metagenome use, so that a
    # collection holding several k-mer sizes cannot hand back a sketch whose
    # hashes are not comparable with the metagenome's.
    queries_d = {}
    for ss in sourmash.load_file_as_signatures(query_sigs, ksize=ksize):
        queries_d[get_ident(ss.name)] = ss

    nbhds_d = {}
    for ss in sourmash.load_file_as_signatures(nbhd_sigs, ksize=ksize):
        nbhds_d[get_ident(ss.name)] = ss

    tup = collections.namedtuple('augmented', 'metag, queries_d, nbhds_d, gather1_keys, gather1_d, gather2_keys, gather2_d, full_idents, stdout1, stdout2')
    t = tup(metag, queries_d, nbhds_d, gather1_keys, gather1_d, gather2_keys, gather2_d, full_idents, stdout1, stdout2)

    return t
    


## Running the things

In [26]:
# Run (or reuse the cached output of) both gather searches, then unpack the
# returned tuple into notebook-level names used by the cells below.
t = gather_and_load(
    name,
    metag_filename,
    queries_filename,
    nbhds_filename,
    ksize=ksize,
)
(metag, queries_d, nbhds_d,
 gather1_keys, gather1_d,
 gather2_keys, gather2_d,
 full_idents, stdout1, stdout2) = t

sourmash gather -k 31 gut-clust-nbhds/p8808mo11.abundtrim.sig gut-clust-nbhds/gut-clust-nbhds-queries.zip -o gut-clust-nbhds/queries.gather.csv --ignore-abundance >& gut-clust-nbhds/queries.gather.out
** gut-clust-nbhds/queries.gather.csv already exists; not rerunning sourmash gather on queries
sourmash gather -k 31 gut-clust-nbhds/p8808mo11.abundtrim.sig gut-clust-nbhds/gut-clust-nbhds-nbhds.zip -o gut-clust-nbhds/nbhd.gather.csv --ignore-abundance >& gut-clust-nbhds/nbhd.gather.out
** gut-clust-nbhds/nbhd.gather.csv already exists; not rerunning sourmash gather on nbhds


## gather output

In [27]:
# show the saved console log of the gather run against the query signatures
print(stdout1)



overlap     p_query p_match
---------   ------- -------
9.5 Mbp        3.3%  100.0%    nbhd:CVRR01000001.1 Roseburia faecis ...
8.7 Mbp        3.0%   99.5%    nbhd:NIHW01000001.1 [Ruminococcus] gn...
7.8 Mbp        2.7%  100.0%    nbhd:DABGPL010000001.1 TPA_asm: Esche...
7.6 Mbp        2.7%   99.2%    nbhd:CP048626.1 Blautia producta ATCC...
7.6 Mbp        2.6%   97.8%    nbhd:CZAB01000001.1 [Clostridium] clo...
7.4 Mbp        2.6%   99.6%    nbhd:AENW01000060.1 Clostridium sp. H...
6.8 Mbp        2.4%   99.1%    nbhd:HF996869.1 Clostridium hathewayi...
5.8 Mbp        2.0%   96.4%    nbhd:GL834357.1 Clostridium symbiosum...
5.6 Mbp        1.8%   90.7%    nbhd:WTVF01000090.1 Enterocloster ald...
4.8 Mbp        1.7%   98.6%    nbhd:JH599901.1 Coprobacillus sp. 8_2...
4.1 Mbp        1.4%   99.8%    nbhd:JULC01000001.1 Bifidobacterium s...
4.0 Mbp        1.3%   95.4%    nbhd:CABJAX010000001.1 Lachnospiracea...
3.8 Mbp        1.3%   96.0%    nbhd:JH590866.1 Lachnospiraceae bacte...
3.6 Mb

In [28]:
# show the saved console log of the gather run against the neighborhood signatures
print(stdout2)



overlap     p_query p_match
---------   ------- -------
7.8 Mbp        2.7%  100.0%    nbhd:DABGPL010000001.1 TPA_asm: Esche...
7.6 Mbp        2.7%  100.0%    nbhd:CP048626.1 Blautia producta ATCC...
3.4 Mbp        1.2%   99.3%    nbhd:SMCQ01000001.1 Longibaculum muri...
1.8 Mbp        0.6%   98.8%    nbhd:HF995324.1 Firmicutes bacterium ...
1.1 Mbp        0.4%   96.0%    nbhd:WMQE01000038.1 Turicibacter sang...
0.8 Mbp        0.3%   97.1%    nbhd:CZYD01000001.1 Terrisporobacter ...
0.8 Mbp        0.3%   98.4%    nbhd:URPP01000001.1 uncultured Clostr...
0.8 Mbp        0.3%   88.3%    nbhd:QUIC01000001.1 Ruminococcus sp. ...
0.7 Mbp        0.3%   99.6%    nbhd:UQNI01000001.1 uncultured Rumino...
0.6 Mbp        0.2%   96.0%    nbhd:CYZX01000001.1 Clostridium dispo...
0.6 Mbp        0.2%   86.4%    nbhd:OBJV01000049.1 Clostridium terti...
458.0 kbp      0.2%   99.1%    nbhd:CAAEVI010000001.1 TPA_asm: Erysi...
0.5 Mbp        0.2%   80.4%    nbhd:CYYS01000001.1 Turicibacter sang...
350.0 

## building mapping from query to neighborhood

In [29]:
# Walk the query gather results in CSV row order. Each query claims the
# metagenome hashes it contains that no earlier query has claimed, so the
# per-query hash sets in unique_hashes_1x are disjoint.
remaining_hashes = set(metag.minhash.hashes)
unique_hashes_1x = []
for ident in gather1_keys:
    unique_overlap = remaining_hashes & set(queries_d[ident].minhash.hashes)
    remaining_hashes -= unique_overlap
    unique_hashes_1x.append((ident, unique_overlap))


In [30]:
# Same assignment as for the queries, but over the neighborhood gather
# results: start again from all metagenome hashes and let each neighborhood,
# in CSV row order, claim the hashes not yet claimed by an earlier one.
remaining_hashes = set(metag.minhash.hashes)
unique_hashes_2x = []
for ident in gather2_keys:
    unique_overlap = remaining_hashes & set(nbhds_d[ident].minhash.hashes)
    remaining_hashes -= unique_overlap
    unique_hashes_2x.append((ident, unique_overlap))


In [31]:
# For each neighborhood, list every query whose uniquely-assigned hashes
# overlap the neighborhood's uniquely-assigned hashes, with the overlap size.
for nbhd_ident, nbhd_match in unique_hashes_2x:
    for query_ident, query_match in unique_hashes_1x:
        overlap = query_match & nbhd_match
        if overlap:
            print(f"{nbhd_ident} <= {query_ident} - {len(overlap)}")

DABGPL010000001.1 <= CVRR01000001.1 - 1
DABGPL010000001.1 <= NIHW01000001.1 - 1
DABGPL010000001.1 <= DABGPL010000001.1 - 7766
CP048626.1 <= CVRR01000001.1 - 47
CP048626.1 <= NIHW01000001.1 - 13
CP048626.1 <= CP048626.1 - 7561
SMCQ01000001.1 <= CVRR01000001.1 - 1
SMCQ01000001.1 <= NIHW01000001.1 - 1
SMCQ01000001.1 <= AENW01000060.1 - 5
SMCQ01000001.1 <= JH599901.1 - 58
SMCQ01000001.1 <= CABJAX010000001.1 - 1
SMCQ01000001.1 <= JH590866.1 - 2
SMCQ01000001.1 <= FMFU01000020.1 - 2
SMCQ01000001.1 <= SMCQ01000001.1 - 3277
HF995324.1 <= CVRR01000001.1 - 13
HF995324.1 <= NIHW01000001.1 - 15
HF995324.1 <= CZAB01000001.1 - 14
HF995324.1 <= AENW01000060.1 - 2
HF995324.1 <= HF996869.1 - 5
HF995324.1 <= GL834357.1 - 3
HF995324.1 <= WTVF01000090.1 - 8
HF995324.1 <= CABJAX010000001.1 - 18
HF995324.1 <= MARQ01000035.1 - 1
HF995324.1 <= SPHM01000100.1 - 5
HF995324.1 <= FMEP01000075.1 - 14
HF995324.1 <= NFLB01000001.1 - 2
HF995324.1 <= USJS01000001.1 - 8
HF995324.1 <= HF995324.1 - 1716
WMQE01000038.1 <= 

In [32]:
import plotly.graph_objects as go

def make_fig(max_num=None):
    """Build a plotly Sankey figure of query -> neighborhood hash flows.

    Reads the notebook-level names `unique_hashes_1x` (query ident, hash set),
    `unique_hashes_2x` (neighborhood ident, hash set) and `full_idents`
    (ident -> display name).

    Nodes are laid out as: one per query, then a single "unassigned" node,
    then one per neighborhood. For each neighborhood a link is added from
    every query whose hash set overlaps it, weighted by the overlap size.
    Links are coloured "lightseagreen" when the query and neighborhood idents
    are equal and "palevioletred" otherwise. Hashes of a neighborhood not
    covered by any query are drawn as a grey link from "unassigned".

    max_num: if truthy, stop adding links after this many neighborhoods
    (all nodes are still listed). None or 0 means no limit.

    Returns the `go.Figure`.
    """
    def label_for(ident):
        # full_idents is built from the query gather results only, so a
        # neighborhood ident may be missing from it; fall back to the bare
        # ident instead of raising KeyError.
        return full_idents.get(ident, ident)

    labels = []
    src_l = []
    dest_l = []
    cnt_l = []
    color_l = []
    label_l = []

    # query nodes occupy indices 0 .. len(unique_hashes_1x) - 1
    source_idx = {}
    for n, (query_ident, _) in enumerate(unique_hashes_1x):
        source_idx[query_ident] = n
        labels.append(label_for(query_ident))

    # the "unassigned" node follows the queries; its index is kept out of
    # source_idx so it cannot collide with a real identifier
    unassigned_idx = len(unique_hashes_1x)
    labels.append("unassigned")

    # neighborhood nodes follow the "unassigned" node
    base = unassigned_idx + 1
    dest_idx = {}
    for n, (nbhd_ident, _) in enumerate(unique_hashes_2x):
        dest_idx[nbhd_ident] = base + n
        labels.append(label_for(nbhd_ident))

    # iterate over all sinks, account for all sources
    leftovers = []
    for num, (nbhd_ident, nbhd_match) in enumerate(unique_hashes_2x, start=1):
        total = 0
        to_idx = dest_idx[nbhd_ident]
        for (query_ident, query_match) in unique_hashes_1x:
            overlap = query_match & nbhd_match
            if not overlap:
                continue

            total += len(overlap)
            src_l.append(source_idx[query_ident])
            dest_l.append(to_idx)
            cnt_l.append(len(overlap))
            if query_ident == nbhd_ident:
                color_l.append("lightseagreen")
            else:
                color_l.append("palevioletred")
            label_l.append("")

        # hashes in this neighborhood that no query accounted for
        leftover = len(nbhd_match) - total
        if leftover:
            leftovers.append((nbhd_ident, leftover))

        if max_num and num >= max_num:
            break

    # unassigned links are added after all query links
    for nbhd_ident, leftover in leftovers:
        src_l.append(unassigned_idx)
        dest_l.append(dest_idx[nbhd_ident])
        cnt_l.append(leftover)
        color_l.append("grey")
        label_l.append("")

    fig = go.Figure(data=[go.Sankey(
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(color = "black", width = 0.5),
          label = labels,
          color = "blue"
        ),
        link = dict(
          source = src_l,
          target = dest_l,
          value = cnt_l,
          color = color_l,
          label = label_l,
      ))])

    return fig

In [33]:
# number of neighborhoods to draw links for
NUM = 10
fig = make_fig(NUM)

# title, font and figure size set in a single layout update
fig.update_layout(
    title_text=f"original genome contributions to top {NUM} neighborhood covers for {name} ({acc})",
    font_size=10,
    width=800,
    height=1000,
)
fig.show()