## gut preclust

Cluster the query genomes (max-containment > 20%) before running sgc, using the following command:

```
~/2022-sourmash-uniqify/uniqify-genomes.py --max-containment --merge --prefix ~/genome-grist/sgc.paper/gut-matches.cluster *.fna.gz
```

In [6]:
# Dataset-level settings used throughout the notebook.
name = "gut-preclust"   # directory holding this dataset's inputs and outputs
acc = "p8808mo11"       # identifier used in the metagenome signature filename
ksize = 31              # k-mer size passed to sourmash

In [7]:
# Input files, all located under the per-dataset directory `name`.
metag_filename = f"{name}/{acc}.abundtrim.sig"      # metagenome signature
queries_filename = f"{name}/{name}-queries.zip"     # query genome signatures
nbhds_filename = f"{name}/{name}-nbhds.zip"         # neighborhood signatures

In [8]:
import sourmash
import csv
import subprocess
import os
import collections

In [9]:
def get_ident(name):
    """Return the identifier at the start of `name`.

    The identifier is everything before the first space; a leading
    'nbhd:' marker, if present, is removed from it.
    """
    first_word, _, _ = name.partition(' ')
    prefix = 'nbhd:'
    if first_word.startswith(prefix):
        return first_word[len(prefix):]
    return first_word

def load_gather(filename):
    """Read a gather CSV file and index its rows by identifier.

    Returns a 3-tuple ``(gather_keys, gather_d, full_idents)``:

    * ``gather_keys`` - identifiers in the order the rows appear in the file;
    * ``gather_d``    - identifier -> CSV row (as a dict);
    * ``full_idents`` - identifier -> the first three space-separated words
      of the row's 'name' column, for use as a display label.

    Identifiers are derived from the 'name' column via ``get_ident``.
    """
    with open(filename, newline="") as fp:
        rows = list(csv.DictReader(fp))

    gather_keys = []
    gather_d = {}
    full_idents = {}
    for row in rows:
        full_name = row['name']
        ident = get_ident(full_name)

        gather_keys.append(ident)
        gather_d[ident] = row
        full_idents[ident] = " ".join(full_name.split(' ')[:3])

    return gather_keys, gather_d, full_idents

def _run_gather(metag_filename, sigs, csv_out, log_out, ksize, label):
    """Run ``sourmash gather`` unless `csv_out` already exists; return the log text.

    The command's stdout and stderr are both written to `log_out`. The command
    is executed directly (no shell), so the redirect does not depend on which
    shell ``/bin/sh`` happens to be and paths need no quoting.

    Raises subprocess.CalledProcessError if sourmash exits with a non-zero
    status. Returns "" if no log file is available (e.g. the CSV was cached
    but its log was removed).
    """
    args = ["sourmash", "gather", "-k", str(ksize), metag_filename, sigs,
            "-o", csv_out, "--ignore-abundance"]
    # Shown for the reader only; this is the shell equivalent of what is run.
    print(" ".join(args), f"> {log_out} 2>&1")

    if not os.path.exists(csv_out):
        with open(log_out, "w") as log_fp:
            subprocess.run(args, stdout=log_fp, stderr=subprocess.STDOUT,
                           check=True)
    else:
        print(f"** {csv_out} already exists; not rerunning sourmash gather on {label}")

    if not os.path.exists(log_out):
        return ""
    with open(log_out, "rt") as log_fp:
        return log_fp.read()


def gather_and_load(name, metag_filename, query_sigs, nbhd_sigs, ksize=31):
    """Run gather for queries and neighborhoods against a metagenome; load everything.

    Two ``sourmash gather`` runs are made against `metag_filename`: one with
    `query_sigs` and one with `nbhd_sigs`. Results are written under the
    directory `name` (``queries.gather.csv`` / ``nbhd.gather.csv`` plus the
    matching ``.out`` logs). A run is skipped when its CSV already exists.

    Parameters:
        name: output directory for the gather CSVs and logs.
        metag_filename: path to the metagenome signature file.
        query_sigs: path to the query signatures.
        nbhd_sigs: path to the neighborhood signatures.
        ksize: k-mer size passed to ``sourmash gather -k`` and used to select
            the metagenome signature.

    Returns:
        An ``augmented`` namedtuple with fields, in order: metag, queries_d,
        nbhds_d, gather1_keys, gather1_d, gather2_keys, gather2_d,
        full_idents, stdout1, stdout2. The ``*_d`` signature dicts and the
        gather dicts are keyed by ``get_ident`` of the signature/row name;
        ``full_idents`` comes from the queries gather only.

    Raises:
        AssertionError: if any of the three input paths does not exist.
        subprocess.CalledProcessError: if a sourmash run fails.
    """
    assert os.path.exists(metag_filename), metag_filename
    assert os.path.exists(query_sigs), query_sigs
    assert os.path.exists(nbhd_sigs), nbhd_sigs

    query_out = f"{name}/queries.gather.csv"
    nbhd_out = f"{name}/nbhd.gather.csv"
    stdout1_out = f"{name}/queries.gather.out"
    stdout2_out = f"{name}/nbhd.gather.out"

    stdout1 = _run_gather(metag_filename, query_sigs, query_out, stdout1_out,
                          ksize, "queries")
    stdout2 = _run_gather(metag_filename, nbhd_sigs, nbhd_out, stdout2_out,
                          ksize, "nbhds")

    gather1_keys, gather1_d, full_idents = load_gather(query_out)
    gather2_keys, gather2_d, _ = load_gather(nbhd_out)

    metag = sourmash.load_one_signature(metag_filename, ksize=ksize)

    # NOTE(review): no ksize filter is applied when loading these collections,
    # unlike the metagenome above - confirm the files hold only one k-mer size.
    queries_d = {get_ident(ss.name): ss
                 for ss in sourmash.load_file_as_signatures(query_sigs)}
    nbhds_d = {get_ident(ss.name): ss
               for ss in sourmash.load_file_as_signatures(nbhd_sigs)}

    tup = collections.namedtuple('augmented', 'metag, queries_d, nbhds_d, gather1_keys, gather1_d, gather2_keys, gather2_d, full_idents, stdout1, stdout2')
    return tup(metag, queries_d, nbhds_d, gather1_keys, gather1_d,
               gather2_keys, gather2_d, full_idents, stdout1, stdout2)
    


## Running the things

In [10]:
# Run (or reuse cached) gather results and unpack them into notebook-level names.
t = gather_and_load(
    name,
    metag_filename,
    queries_filename,
    nbhds_filename,
    ksize=ksize,
)
(
    metag,
    queries_d, nbhds_d,
    gather1_keys, gather1_d,
    gather2_keys, gather2_d,
    full_idents,
    stdout1, stdout2,
) = t

sourmash gather -k 31 gut-preclust/p8808mo11.abundtrim.sig gut-preclust/gut-preclust-queries.zip -o gut-preclust/queries.gather.csv --ignore-abundance >& gut-preclust/queries.gather.out
sourmash gather -k 31 gut-preclust/p8808mo11.abundtrim.sig gut-preclust/gut-preclust-nbhds.zip -o gut-preclust/nbhd.gather.csv --ignore-abundance >& gut-preclust/nbhd.gather.out


## gather output

In [11]:
# Console log captured from the sourmash gather run on the query genomes.
print(stdout1)



overlap     p_query p_match
---------   ------- -------
5.8 Mbp        2.1%   64.1%    HF996869.1 Clostridium hathewayi CAG:...
5.6 Mbp        2.0%   90.8%    NZ_CP039126.1 Blautia producta strain...
5.6 Mbp        1.9%   44.5%    NZ_UAVW01000021.1 Enterocloster clost...
5.0 Mbp        1.8%   98.7%    NZ_JVOS01000029.1 Escherichia coli st...
4.9 Mbp        1.6%   40.0%    NZ_JAAITT010000099.1 Enterocloster al...
4.6 Mbp        1.5%   63.6%    NZ_JAAISL010000100.1 [Clostridium] sy...
3.7 Mbp        1.3%   69.8%    NZ_AENW01000060.1 Clostridium sp. HGF...
3.3 Mbp        1.1%   78.9%    HF995415.1 Coprobacillus sp. CAG:183 ...
3.2 Mbp        1.1%   99.3%    NZ_JULC01000001.1 Bifidobacterium sca...
3.2 Mbp        1.1%   59.3%    NZ_RQNR01000100.1 Clostridium perfrin...
3.1 Mbp        1.1%   45.9%    NZ_SPHF01000091.1 Roseburia hominis s...
3.1 Mbp        1.1%   56.6%    NZ_DS499569.1 Intestinibacter bartlet...
3.1 Mbp        1.1%   38.0%    NZ_NFJM01000001.1 Flavonifractor plau...
3.1 Mb

In [12]:
# Console log captured from the sourmash gather run on the neighborhoods.
print(stdout2)



overlap     p_query p_match
---------   ------- -------
11.6 Mbp       4.1%  100.0%    nbhd:NZ_SPHF01000091.1 Roseburia homi...
10.2 Mbp       3.5%   98.1%    nbhd:NZ_NIHW01000001.1 [Ruminococcus]...
9.3 Mbp        3.2%   97.2%    nbhd:NZ_UAVW01000021.1 Enterocloster ...
7.7 Mbp        2.7%   99.9%    nbhd:NZ_JVOS01000029.1 Escherichia co...
7.7 Mbp        2.7%   98.6%    nbhd:NZ_AENW01000060.1 Clostridium sp...
7.6 Mbp        2.6%   96.4%    nbhd:NZ_CP039126.1 Blautia producta s...
7.6 Mbp        2.5%   93.0%    nbhd:HF996869.1 Clostridium hathewayi...
6.5 Mbp        2.0%   87.4%    nbhd:NZ_JAAISL010000100.1 [Clostridiu...
6.3 Mbp        1.8%   78.8%    nbhd:NZ_JAAITT010000099.1 Enteroclost...
5.0 Mbp        1.7%   97.4%    nbhd:HF995415.1 Coprobacillus sp. CAG...
4.2 Mbp        1.5%   98.4%    nbhd:NZ_DS499569.1 Intestinibacter ba...
4.1 Mbp        1.4%   99.7%    nbhd:NZ_JULC01000001.1 Bifidobacteriu...
4.4 Mbp        1.4%   90.5%    nbhd:CABJAX010000001.1 Lachnospiracea...
4.4 Mb

## building mapping from query to neighborhood

In [13]:
# Walk the query gather results in CSV row order and, for each query genome,
# record the metagenome hashes it shares that no earlier row has claimed yet.
# The per-query sets in `unique_hashes_1x` are therefore pairwise disjoint.
remaining_hashes = set(metag.minhash.hashes)
unique_hashes_1x = []
for ident in gather1_keys:
    match_hashes = set(queries_d[ident].minhash.hashes)
    unique_overlap = remaining_hashes & match_hashes
    remaining_hashes -= unique_overlap
    unique_hashes_1x.append((ident, unique_overlap))


In [14]:
# Same walk as for the queries, but over the neighborhood gather results:
# start again from the full metagenome hash set and give each neighborhood
# the hashes it shares that no earlier row has claimed yet.
remaining_hashes = set(metag.minhash.hashes)
unique_hashes_2x = []
for ident in gather2_keys:
    match_hashes = set(nbhds_d[ident].minhash.hashes)
    unique_overlap = remaining_hashes & match_hashes
    remaining_hashes -= unique_overlap
    unique_hashes_2x.append((ident, unique_overlap))


In [15]:
# For every neighborhood, list each query genome whose uniquely-assigned
# hashes intersect the neighborhood's uniquely-assigned hashes, with the
# size of that intersection.
for nbhd_ident, nbhd_match in unique_hashes_2x:
    for query_ident, query_match in unique_hashes_1x:
        overlap = query_match & nbhd_match
        if overlap:
            print(f"{nbhd_ident} <= {query_ident} - {len(overlap)}")

NZ_SPHF01000091.1 <= HF996869.1 - 23
NZ_SPHF01000091.1 <= NZ_CP039126.1 - 9
NZ_SPHF01000091.1 <= NZ_UAVW01000021.1 - 21
NZ_SPHF01000091.1 <= NZ_JAAITT010000099.1 - 19
NZ_SPHF01000091.1 <= NZ_JAAISL010000100.1 - 8
NZ_SPHF01000091.1 <= NZ_AENW01000060.1 - 4
NZ_SPHF01000091.1 <= HF995415.1 - 5
NZ_SPHF01000091.1 <= NZ_RQNR01000100.1 - 6
NZ_SPHF01000091.1 <= NZ_SPHF01000091.1 - 3093
NZ_SPHF01000091.1 <= NZ_NFJM01000001.1 - 4
NZ_SPHF01000091.1 <= NZ_NFHY01000001.1 - 7
NZ_SPHF01000091.1 <= NZ_NIHW01000001.1 - 4
NZ_SPHF01000091.1 <= NZ_CABIYM010000001.1 - 1
NZ_SPHF01000091.1 <= NZ_WMZZ01000099.1 - 7
NZ_SPHF01000091.1 <= NZ_NFJI01000001.1 - 3
NZ_SPHF01000091.1 <= NZ_CP019721.1 - 2
NZ_SPHF01000091.1 <= CABJAX010000001.1 - 2
NZ_SPHF01000091.1 <= HF995324.1 - 9
NZ_SPHF01000091.1 <= USJS01000001.1 - 1
NZ_SPHF01000091.1 <= NZ_LR793273.1 - 2
NZ_SPHF01000091.1 <= NZ_JVCZ01000002.1 - 2
NZ_SPHF01000091.1 <= NZ_CABHOC010000001.1 - 1
NZ_SPHF01000091.1 <= URHS01000001.1 - 1
NZ_SPHF01000091.1 <= NZ_JACOOS01

In [20]:
import plotly.graph_objects as go

def make_fig(max_num=None, min_overlap=10):
    """Build a Sankey figure of query-genome contributions to neighborhoods.

    Reads the notebook-level ``unique_hashes_1x`` (queries),
    ``unique_hashes_2x`` (neighborhoods) and ``full_idents`` (display labels).

    Nodes are laid out as: one node per query (in ``unique_hashes_1x`` order),
    then a single "unassigned" node, then one node per neighborhood (in
    ``unique_hashes_2x`` order).

    A link query -> neighborhood is drawn when the two hash sets share more
    than `min_overlap` hashes; it is coloured "lightseagreen" when both have
    the same identifier and "palevioletred" otherwise. Whatever part of a
    neighborhood's hash set is not covered by drawn links (including overlaps
    at or below the threshold) is drawn as a grey link from "unassigned".

    Parameters:
        max_num: if truthy, only the first `max_num` neighborhoods get links;
            otherwise all of them do.
        min_overlap: overlaps must be strictly larger than this to be drawn.

    Returns:
        A ``plotly.graph_objects.Figure`` containing one Sankey trace.
    """
    n_queries = len(unique_hashes_1x)

    # --- nodes -------------------------------------------------------------
    labels = []
    source_idx = {}
    for n, (query_ident, _) in enumerate(unique_hashes_1x):
        source_idx[query_ident] = n
        labels.append(full_idents[query_ident])

    source_idx["unassigned"] = n_queries
    labels.append("unassigned")

    base = n_queries + 1
    dest_idx = {}
    for n, (nbhd_ident, _) in enumerate(unique_hashes_2x):
        dest_idx[nbhd_ident] = base + n
        # full_idents is built from the queries gather only, so a neighborhood
        # whose genome is missing there has no label; fall back to its ident.
        labels.append(full_idents.get(nbhd_ident, nbhd_ident))

    # --- links: iterate over all sinks, account for all sources -------------
    src_l = []
    dest_l = []
    cnt_l = []
    color_l = []
    leftovers = []
    for num, (nbhd_ident, nbhd_match) in enumerate(unique_hashes_2x, start=1):
        to_idx = dest_idx[nbhd_ident]
        total = 0
        for query_ident, query_match in unique_hashes_1x:
            n_shared = len(query_match & nbhd_match)
            if n_shared > min_overlap:
                total += n_shared
                src_l.append(source_idx[query_ident])
                dest_l.append(to_idx)
                cnt_l.append(n_shared)
                if query_ident == nbhd_ident:
                    color_l.append("lightseagreen")
                else:
                    color_l.append("palevioletred")

        leftover = len(nbhd_match) - total
        if leftover:
            leftovers.append((to_idx, leftover))

        if max_num and num >= max_num:
            break

    # Unassigned links go after all query links, one per neighborhood.
    for to_idx, leftover in leftovers:
        src_l.append(source_idx["unassigned"])
        dest_l.append(to_idx)
        cnt_l.append(leftover)
        color_l.append("grey")

    label_l = [""] * len(src_l)

    fig = go.Figure(data=[go.Sankey(
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(color = "black", width = 0.5),
          label = labels,
          color = "blue"
        ),
        link = dict(
          source = src_l,
          target = dest_l,
          value = cnt_l,
          color = color_l,
          label = label_l,
      ))])

    return fig

In [21]:
# Plot only the top NUM neighborhoods so the diagram stays readable.
NUM = 5
fig = make_fig(NUM)

fig.update_layout(
    title_text=f"original genome contributions to top {NUM} neighborhood covers for {name} ({acc})",
    font_size=10,
    width=800,
    height=1000,
)
fig.show()