In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
%matplotlib inline

In [2]:
from collections import Counter, defaultdict
# import seaborn as sns

In [3]:
# !/Users/josh/packages/cd-hit-v4.8.1-2019-0228/cd-hit-est -M 2000 -T 3 -d 0  \
#     -i ~/src/skeeters/data/500_contigs.fasta \
#     -o ~/src/skeeters/data/500_contigs_cluster

In [4]:
from collections import defaultdict, namedtuple
Member = namedtuple('Member', ['contig', 'length', 'percent_id', 'percent_id_sign', 'sample', 'coverage'])

In [5]:
def parse_cdhit_row(row):
    if '*' in row:
        index, length, name, percent_id = row.split()
        percent_id_sign, percent_id = '0', 100
    else:
        index, length, name, _, percent_id = row.split()
    length = int(length.strip(',nt'))
    name = name.strip('>').strip('.')
    sample, contig = name.split('~')
    coverage = float(contig.split('_')[-1])
    
    if percent_id != 100:
        percent_id_sign, percent_id = percent_id.strip('%').split('/')
        percent_id = float(percent_id)
    return Member(contig=contig, sample=sample, length=length,
                  percent_id=percent_id, percent_id_sign=percent_id_sign, coverage=coverage)

In [6]:
example_row = '3	2441nt, >CMS002_026d_Rb_S149_L004~NODE_2_length_2441_cov_25.388748... at -/99.80%'
parse_cdhit_row(example_row)

Member(contig='NODE_2_length_2441_cov_25.388748', length=2441, percent_id=99.8, percent_id_sign='-', sample='CMS002_026d_Rb_S149_L004', coverage=25.388748)

In [7]:
clusters = defaultdict(list)
with open('/Users/evogytis/Documents/manuscripts/skeeters/data/500_contigs_cluster.clstr', 'r') as file:
    for line in file:
        if line.startswith('>Cluster'):
            cluster_id = line.split()[-1]
        else:
            member = parse_cdhit_row(line)
            if 'water' in member.sample.lower():
                continue
            clusters[cluster_id].append(member)

In [8]:
len(clusters)

82003

In [9]:
# remove singletons
clusters = {cluster_id: clusters[cluster_id] for cluster_id in clusters if len(clusters[cluster_id]) > 1}
samples = list(set([member.sample for cluster_id in clusters for member in clusters[cluster_id]]))

In [10]:
def get_representative(cluster):
    representative = [member for member in cluster if member.percent_id_sign == '0'][0]
    return representative
def print_cluster(cluster_id):
    cluster = clusters[cluster_id]
    representative = get_representative(cluster)
    samples = defaultdict(list)
    for member in cluster:
        samples[member.sample].append(str(member.length))
    lengths_list = ','.join(['+'.join(lengths) for lengths in samples.values()])
    
    print('Cluster: ', cluster_id)
    print('Rep contig: ', representative.sample, representative.contig)
    print('Rep Length: ', representative.length)
    print('N samples: ', len(samples))
    print('Contig Lengths: ', lengths_list)

In [11]:
print_cluster('100')

Cluster:  100
Rep contig:  CMS002_053a_Rb_S7_L004 NODE_14_length_5720_cov_8.857700
Rep Length:  5720
N samples:  29
Contig Lengths:  1585+1244+1034,1158,695+628+507,1922+987+940,619+520,1315,975,963,1050+707+524,1037+615+580+570,917+587,575,761,953+908,757,858+502,542,876,809+574,670,858,1253+1025,908+562+523,535,746,521,863+549,1096+820,5720


In [12]:
contig_to_cluster = {member.contig: cluster_id for cluster_id in clusters for member in clusters[cluster_id]}

In [13]:
def graph_from_clusters(clusters):
    G = nx.Graph()
    for cluster_id in clusters:
        for member in clusters[cluster_id]:
            if member.length > 700:
                G.add_edge(cluster_id, member.sample, attr_dict=member._asdict())
    return G

In [14]:
G = graph_from_clusters(clusters)

In [15]:
def df_from_subset(subset, aggregation = None):
    df = pd.DataFrame(columns = [cluster_id for cluster_id in clusters if cluster_id in subset], 
                      index = [sample for sample in samples if sample in subset], dtype=int).fillna(0)
    # make a heatmap
    for cluster_id in clusters:
        if cluster_id not in subset:
            continue
        for member in clusters[cluster_id]:
            if member.sample not in subset:
                continue
            if aggregation == None:
                df.loc[member.sample, cluster_id] = 1
            if aggregation == 'coverage':
                df.loc[member.sample, cluster_id] += member.coverage
            if aggregation == 'length':
                df.loc[member.sample, cluster_id] += member.length
    return df

In [16]:
def walk(nodes, G=G):
    if not isinstance(nodes, list):
        nodes = [nodes]
    return [nbr for node in nodes for nbr in G.neighbors(node) ]

In [17]:
def nbhd(start, depth=1):
    if isinstance(start, str):
        start = [start]
    n = start
    for i in range(depth):
        n = n + walk(n)
    return set(n)

In [18]:
# Test that neighborhoods grow
[len(nbhd(contig_to_cluster['NODE_2_length_2441_cov_25.388748'], i)) for i in range(4)]

[1, 16, 50, 135]

## Examples

In [19]:
def split_hannah(input):
    return [line.split('\t')[2] for line in input.split('\n')]
wuhan6 = """CMS001_042_Ra_S23	bin10	NODE_10_length_2444_cov_49.183354	1702	36384680	46.77792961 seg1=PB2
CMS001_042_Ra_S23	bin11	NODE_11_length_2437_cov_68.228814	2361	36384680	64.88994819 seg2=PB1
CMS001_042_Ra_S23	bin12	NODE_12_length_2238_cov_65.551134	2072	36384680	56.94704475 seg3=PA
CMS001_042_Ra_S23	bin13	NODE_15_length_1855_cov_99.279528	2584	36384680	71.01890136 seg4=NP
CMS001_042_Ra_S23	bin3	NODE_28_length_1501_cov_37.367978	787	36384680	21.62998273 seg5=gp
CMS001_042_Ra_S23	bin4	NODE_131_length_861_cov_90.163265	1027	36384680	28.22616552 seg6=hypoth"""

wuhan6_ish_contigs = split_hannah(wuhan6)



In [20]:
whidbey_ish = """CMS002_018b_Rb_S129_L004	bin1	NODE_4_length_2203_cov_29.747883	906	89837146	10.08491521 PA
CMS002_018b_Rb_S129_L004	bin15	NODE_10_length_884_cov_107.267658	1237	89837146	13.76935995 NP
CMS002_018b_Rb_S129_L004	bin15	NODE_9_length_939_cov_28.932715	361	89837146	4.018382329 NP
CMS002_018b_Rb_S129_L004	bin2	NODE_2_length_2430_cov_26.274968	876	89837146	9.750977619 PB1"""

whidbey_ish_contigs = split_hannah(whidbey_ish)



In [21]:
wm3_ish = """CMS001_027_Ra_S16	bin14	NODE_6_length_2203_cov_26.203669	930	29422908	31.6080246 PA
CMS001_027_Ra_S16	bin16	NODE_16_length_1779_cov_20.285546	605	29422908	20.56220955 NP
CMS001_027_Ra_S16	bin17	NODE_3_length_2447_cov_21.006751	882	29422908	29.97664269 PB1
CMS001_027_Ra_S16	bin7	NODE_4_length_2420_cov_26.034571	1026	29422908	34.87078843 PB2"""

wm3_ish_contigs = split_hannah(wm3_ish)



In [22]:
def sort_by(x_list, idx_list):
     return [x for (idx,x) in sorted(zip(idx_list,x_list), key=lambda pair: pair[0])]

In [23]:
def fish_dark_matter(bait_contigs, overlap_cutoff=0.9, aggregation=None):
    bait_clusters = [contig_to_cluster[c] for c in bait_contigs]
    neighborhood = nbhd(bait_clusters, 3)
    df = df_from_subset(neighborhood)
    samples_with_bait = df.loc[:, bait_clusters].sum(axis = 1) == len(bait_clusters)
    n_samples_with_bait = df.loc[:, bait_clusters].sum().mean()
    clusters_containing_bait = df.loc[samples_with_bait].sum() >= (overlap_cutoff * n_samples_with_bait)
    clusters_not_overflowing_bait = (df.sum() * overlap_cutoff) <= n_samples_with_bait

    overlapping_clusters = df.columns[clusters_containing_bait & clusters_not_overflowing_bait]
    
    if aggregation:
        df = df_from_subset(neighborhood, aggregation=aggregation)
    df = df[overlapping_clusters]
    df = df.loc[df.sum(axis = 1) > 0,:]
    
    return df

base_path='/Users/evogytis/Dropbox/Jupyter_notebooks/Biohub/California_mosquitoes/s3_bucket/contigs/'

import os,glob
from Bio import SeqIO
def printContigs(marked):
    base_path='/Users/evogytis/Dropbox/Jupyter_notebooks/Biohub/California_mosquitoes/s3_bucket/contigs/'
    output=[]
    for fpath in glob.glob(os.path.join(base_path,'*','*.fasta')):
        sname=os.path.basename(fpath)
        
        contigs=SeqIO.parse(open(fpath,'r'),'fasta')
        for seq in contigs:
#             print(marked,seq.id)
            if seq.id in marked:
                output.append('>%s|%s\n%s'%(os.path.basename(os.path.dirname(fpath)),seq.id,seq.seq))
    return '\n'.join(output)+'\n'
#     print('>%s|%s\n%s'%(os.path.basename(os.path.dirname(fpath)),seq.id,seq.seq))

def fast_fish(bait_contigs, overlap_cutoff=0.9, aggregation=None):
    bait_clusters = [contig_to_cluster[c] for c in bait_contigs]
    neighborhood = nbhd(bait_clusters, 3)
    df = df_from_subset(neighborhood)
    samples_with_bait = df.loc[:, bait_clusters].sum(axis = 1) == len(bait_clusters)
    n_samples_with_bait = df.loc[:, bait_clusters].sum().mean()
    clusters_containing_bait = df.loc[samples_with_bait].sum() >= (overlap_cutoff * n_samples_with_bait)
    clusters_not_overflowing_bait = (df.sum() * overlap_cutoff) <= n_samples_with_bait

    overlapping_clusters = df.columns[clusters_containing_bait & clusters_not_overflowing_bait]
    
    if aggregation:
        df = df_from_subset(neighborhood, aggregation=aggregation)
    df = df[overlapping_clusters]
    df = df.loc[df.sum(axis = 1) > 0,:]
    
    return [cid for cid in df]
            
def display_dark_matter(df,report_seq=False,fltr=None,toFile=None):
    overlapping_clusters = df.columns
    
    if fltr==None:
        fltr=lambda x: True
    
    print('Found ', len(overlapping_clusters), ' candidate segments.')
    lengths = [get_representative(clusters[cluster_id]).length for cluster_id in overlapping_clusters]
    print('Segment lengths: ', sorted(lengths, reverse=True))
    
    if report_seq==True:
        for cluster_id in overlapping_clusters:
#             if toFile:
#                 toFile.write('%s\n'%('>%s\n%s'%(contig.contig,contig.)))
            out=printContigs([contig.contig for contig in clusters[cluster_id] if fltr(contig)])
            if toFile:
                toFile.write(out)
            else:
                print(out)
    
    for cluster in sort_by(overlapping_clusters, lengths):
        print('---')
        print_cluster(cluster)
#     if (df > 0).mean().mean() < 1:
#         sns.clustermap(df)

def segment_correlations(abundances):
    df = pd.DataFrame(np.corrcoef(abundances.T), columns=abundances.columns, index=abundances.columns)
    sns.clustermap(df)

In [24]:
# From one wuhan 6 contig
display_dark_matter(fish_dark_matter(wuhan6_ish_contigs[:1], 0.8))

Found  6  candidate segments.
Segment lengths:  [2444, 2441, 2239, 1871, 1532, 866]
---
Cluster:  24013
Rep contig:  CMS001_038_Ra_S22 NODE_80_length_866_cov_105.239544
Rep Length:  866
N samples:  13
Contig Lengths:  852,862,863,861,854,866,857,861,851,861,862,853,858
---
Cluster:  5024
Rep contig:  CMS002_029e_Rb_S164_L004 NODE_27_length_1532_cov_20.452921
Rep Length:  1532
N samples:  13
Contig Lengths:  1491,1500,1515,1496,1501,1495,1496,1501,1497,1496,1504,1532,1500
---
Cluster:  2693
Rep contig:  CMS002_056a_Rb_S9_L004 NODE_23_length_1871_cov_203.951505
Rep Length:  1871
N samples:  13
Contig Lengths:  1860,1867,1865,1850,1859,1858,1856,1855,1845,1865,1860,1868,1871
---
Cluster:  1525
Rep contig:  CMS002_029e_Rb_S164_L004 NODE_13_length_2239_cov_35.559667
Rep Length:  2239
N samples:  13
Contig Lengths:  2225,2228,2227,2221,2238,2223,2222,2238,2228,2228,2234,2239,2235
---
Cluster:  1170
Rep contig:  CMS002_029c_Rb_S161_L004 NODE_9_length_2441_cov_36.508037
Rep Length:  2441
N sam

In [25]:
# wm3 candidates
display_dark_matter(fish_dark_matter(wm3_ish_contigs))

Found  6  candidate segments.
Segment lengths:  [2447, 2420, 2203, 1779, 1449, 813]
---
Cluster:  27844
Rep contig:  CMS001_046_Ra_S3 NODE_46_length_813_cov_29.523098
Rep Length:  813
N samples:  2
Contig Lengths:  803,813
---
Cluster:  5942
Rep contig:  CMS001_046_Ra_S3 NODE_15_length_1449_cov_6.100583
Rep Length:  1449
N samples:  2
Contig Lengths:  1443,1449
---
Cluster:  3153
Rep contig:  CMS001_027_Ra_S16 NODE_16_length_1779_cov_20.285546
Rep Length:  1779
N samples:  2
Contig Lengths:  1779,1765
---
Cluster:  1610
Rep contig:  CMS001_027_Ra_S16 NODE_6_length_2203_cov_26.203669
Rep Length:  2203
N samples:  2
Contig Lengths:  2203,2196
---
Cluster:  1204
Rep contig:  CMS001_027_Ra_S16 NODE_4_length_2420_cov_26.034571
Rep Length:  2420
N samples:  2
Contig Lengths:  2420,2418
---
Cluster:  1160
Rep contig:  CMS001_027_Ra_S16 NODE_3_length_2447_cov_21.006751
Rep Length:  2447
N samples:  2
Contig Lengths:  2447,2447


In [27]:
awuhan6_seed = 'NODE_4_length_2235_cov_10.352641'

rep_seq=False
fltr=None
toFile=None
# toFile=open('/Users/evogytis/Downloads/your_fasta.fasta','w')

display_dark_matter(fish_dark_matter([awuhan6_seed]),report_seq=rep_seq,fltr=fltr,toFile=toFile)
toFile.close()

Found  9  candidate segments.
Segment lengths:  [6634, 2449, 2442, 2301, 2245, 2123, 1842, 1534, 869]
---
Cluster:  23854
Rep contig:  CMS002_045c_Rb_S185_L004 NODE_14_length_869_cov_33.226010
Rep Length:  869
N samples:  15
Contig Lengths:  857,857,857,853,860,664,857,534,857,863,869,856,856,861,857
---
Cluster:  4999
Rep contig:  CMS002_045f_Rb_S189_L004 NODE_10_length_1534_cov_13.974605
Rep Length:  1534
N samples:  15
Contig Lengths:  1494,1505,1514,1507,1496,1001,1476,899,1505,1510,1510,1515,1511,1534,1513
---
Cluster:  2828
Rep contig:  CMS002_045e_Rb_S188_L004 NODE_7_length_1842_cov_33.126346
Rep Length:  1842
N samples:  15
Contig Lengths:  1838,1823,1815,1839,1830,1079+689,1819,1804,1817,1832,1835,1838,1842,1841,1827
---
Cluster:  1813
Rep contig:  CMS002_026e_Rb_S150_L004 NODE_5_length_2123_cov_21.675953
Rep Length:  2123
N samples:  16
Contig Lengths:  2091,2085,2107,2093,2123,1404,793,2056,2040,2087,2084,2093,2093,2089,2079,2096
---
Cluster:  1513
Rep contig:  CMS002_026c_R

AttributeError: 'NoneType' object has no attribute 'close'

If we contract the search to an exact match, we find just six segments.

In [None]:
display_dark_matter(fish_dark_matter([awuhan6_seed], 1.0))

# Adding Species

In [None]:
metadata = pd.read_csv('../data/CMS001_CMS002_MergedAnnotations_190325.csv')
metadata.head()

In [None]:
metadata.columns

In [None]:
species = {sample: species for sample, species in zip(metadata['NewIDseqName'], metadata['compute_species'])}
genus = {sample: species for sample, species in zip(metadata['NewIDseqName'], metadata['compute_genus'])}

Likely, many of the Culex contigs are host which made it through filtering, because the genomes for the Culex species is much worse than that for Aedes.

In [None]:
genus_combo = pd.Series(["-".join(set([genus[member.sample] for member in clusters[cluster_id]]))
                      for cluster_id in clusters]).value_counts()
genus_combo.plot.bar(color='C0')

In [None]:
species_combo = pd.Series(["-".join(set([species[member.sample] for member in clusters[cluster_id]]))
                      for cluster_id in clusters]).value_counts()
species_combo.plot.bar(color='C0', figsize=(20,5))

## Graph

In [None]:
from cyjupyter import Cytoscape
from networkx.readwrite.json_graph import cytoscape_data

In [None]:
components = [G.subgraph(c) for c in nx.connected_components(G)]

In [None]:
subset = nbhd(wuhan6_exemplar_clusters, 1)
subg = G.subgraph(subset)

In [None]:
nx.draw(subg)

In [None]:
Cytoscape(data=cytoscape_data(subg))

In [None]:
def is_sample(node):
    return 1 if node.startswith('CMS') else 0

In [None]:
nx.draw(subg, node_size=50, node_color = [is_sample(node) for node in subg.nodes()])

In [None]:
Cytoscape(data=cytoscape_data(aw6_subgraph), layout_name='circle')