This notebook extracts all viral contigs from the CD-HIT-EST hits, and concats them together for domain finding.

In [90]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [91]:
from glob import glob
import pandas as pd
from Bio import SeqIO

import merge_cluster_lca

In [92]:
lca_df = merge_cluster_lca.load_lca()
clusters = merge_cluster_lca.load_cdhit_clusters('../data/darkmatter/500_contigs_cluster.clstr')



In [93]:
df = merge_cluster_lca.merge_clusters_lca(clusters, lca_df)

In [94]:
viral_df = df[df['kingdom'] == 'Viruses']
viral_df.to_csv('../data/darkmatter/viral_clusters.csv')
len(viral_df)

340

In [95]:
bacteria_df = df[df['kingdom'] == 'Bacteria']
bacteria_df.to_csv('../data/darkmatter/bacteria_clusters.csv')
len(bacteria_df)

1564

In [96]:
recs = [record for record in SeqIO.parse("../data/s3/contigs/500_contigs_cluster.fasta", "fasta")]

In [97]:
from copy import deepcopy
key_to_cluster = dict(zip(df.contig_key, df.cluster.map(str)))
def clean_rec(record):
    record = deepcopy(record)
    record.id = key_to_cluster.get(record.id)
    record.name = key_to_cluster.get(record.id)
    record.description = ''
    return record

In [98]:
viral_records = [clean_rec(record) for record in recs if record.id in viral_df.contig_key.values]

print("There are ", len(viral_records), " viral clusters.")

SeqIO.write(viral_records, open('../data/darkmatter/viral_clusters.fasta', 'w'), 'fasta')

There are  340  viral clusters.


340

In [99]:
bacteria_records = [clean_rec(record) for record in recs if record.id in bacteria_df.contig_key.values]

print("There are ", len(bacteria_records), " bacteria clusters.")

SeqIO.write(bacteria_records, open('../data/darkmatter/bacterial_clusters.fasta', 'w'), 'fasta')

There are  1564  bacteria clusters.


1564

In [100]:
all_records = [clean_rec(record) for record in recs]

print("There are ", len(all_records), "  clusters.")

SeqIO.write(viral_records, open('../data/darkmatter/all_clusters.fasta', 'w'), 'fasta')

There are  82161   clusters.


340

# Scratch

In [423]:
# How many contigs did we not have hits for?
t = 0
n = 0
for cluster in clusters.values():
    for member in cluster:
        n += 1
        if member.taxid is None:
            t += 1
print(t, n, t/n)

111328 117833 0.9447947518946306


In [424]:
# How many clusters did we not have hits for?
n = 0
t = 0
for cluster in clusters.values():
    n += 1
    if all([member.taxid is None for member in cluster]):
        t += 1
print(t, n, t/n)

77488 82161 0.9431238665546914


For each cluster, we want the most specific mapping.

In [466]:
from statistics import mode
def mode(lst):
    if len(lst) > 0:
        return max(set(lst), key=lst.count)
    else:
        return None
def get_cluster_leaf(cluster):
    taxid_list = [member.taxid for member in cluster]
    if all([taxid is None for taxid in taxid_list]):
        return None
    leaf = None
    lineage = []
    for taxid in taxid_list:
        if taxid in lineage or taxid is None:
            continue
        else:
            new_lineage = ncbi.get_lineage(taxid)
            if leaf is None or leaf in new_lineage:
                leaf = taxid
                lineage = new_lineage
            else:
                return -1
    return leaf

def get_cluster_mode(cluster):
    taxid_list = [member.taxid for member in cluster]
    return mode(taxid_list)

def get_cluster_rep(cluster):
    for member in cluster:
        if member.is_ref:
            return member

# Fetch Viral Contigs

In [432]:
def get_cluster_kingdom(cluster):
    taxid_list = [member.taxid for member in cluster]
    kingdoms = [get_kingdom(taxid) for taxid in taxid_list]
    kingdoms = [k for k in kingdoms if k is not None]
    if len(kingdoms) > 0:
        return mode(kingdoms)
    else:
        return None

In [502]:
cluster_kingdoms = {id: get_cluster_kingdom(cluster) for id, cluster in clusters.items()}



In [542]:
viral_clusters = [id for id in cluster_kingdoms if cluster_kingdoms[id] == 'Viruses']
viral_clusters = sorted(viral_clusters, key=int)
viral_contigs =  set([member_to_key(get_cluster_rep(clusters[id])) for id in viral_clusters])

In [546]:
len(viral_clusters), species_classified

(394, 189)

In [548]:
species_classified = 0
for id in viral_clusters:
    rep_rank = taxid2rank.get(get_cluster_rep(clusters[id]).taxid)
    if rep_rank == 'species':
        print('Species:', get_cluster_rep(clusters[id]).name)
        species_classified += 1
    elif rep_rank == 'no rank':
        print('No Rank:', get_cluster_rep(clusters[id]).name)
    elif rep_rank == 'family':
        print('Family:', get_cluster_rep(clusters[id]).name)
    else:
        print(id)
        print(get_cluster_rep(clusters[id]).name,
          taxid2name.get(get_cluster_leaf(clusters[id])),
          taxid2name.get(get_cluster_mode(clusters[id])))

1
None Shayang fly virus 4 None
Species: Shayang fly virus 4
Species: Wuhan Mosquito Virus 9
Species: Wuhan Mosquito Virus 9
No Rank: unclassified Anphevirus
7
Viruses unclassified Anphevirus None
No Rank: unclassified Rhabdoviridae
Species: Hubei virga-like virus 2
Species: Hubei virga-like virus 2
Species: Culex pipiens associated Tunisia virus
Species: Hubei virga-like virus 2
Family: Flaviviridae
Species: Hubei virga-like virus 2
19
None Hubei virga-like virus 2 None
23
None Culex Iflavi-like virus 4 Culex Iflavi-like virus 4
Species: Culex Iflavi-like virus 4
Species: Wuhan fly virus 4
26
None Deformed wing virus None
Species: Culex Iflavi-like virus 3
29
Picornavirales Picornavirales Picornavirales
Species: Culex Iflavi-like virus 3
No Rank: Riboviria
42
None Ambiguous dsRNA virus environmental sample
Species: dsRNA virus environmental sample
44
Bunyavirales Ambiguous Bunyavirales
Species: Culex mosquito virus 4
49
None Riboviria None
55
None Ambiguous Totivirus-like Culex mosqui

There are  4297  viral clusters.


In [456]:
SeqIO.write(viral_records[:20], open('../data/darkmatter/viral_clusters.fasta', 'w'), 'fasta')

20

In [458]:
!head -n 300 '../data/darkmatter/viral_clusters.fasta'

>CMS001_001_Ra_S1~NODE_5_length_5116_cov_2209.828736 NODE_5_length_5116_cov_2209.828736
GTGAAAACCCAGCACGATAGGCTGATCCCAATAAGATTTGATTCAAGGCAGTCTAGGCCC
ACTAAGTTATTTCCGAGCCAGTGCTTAATTTCTGCTCAACAGCCATCTGCCGTTGTTGGT
TTTATGGAAGAGTGCCACACAGTTGGTGCCCTGTTGTTGAGGGGGGGGTGAAAACCCAGC
ACGATAGGCTGATCCCAATAAAATTTGATTCAATGCACTCTAGGCCCCCTAAGTTATTTC
CTAGCCAGTGCTTAATTTCTCCACAACACCCATCTACCGTTGTTGGTTGTATGGATGAGT
GCCACACAGTTGGTGCCCTGTTGTTGAGGTACGGACACGAACTGGAATAGGTACTGAACG
ATGACGTTGTTGGACGACAGGACGTATAGGCGATACGTCTGTTCAATGGCAGCCGGTGTG
TCTTCGCGCAGCACTACGCGCCAAGAGCGGTAGAGACTGCCTGAGTCGTAACTGTAAGCA
CCTCCATTGGTAAGTAGAGACCCGCCGAATGGGAAAGCTACCGGAGGGTAGGATCCCTTA
TCGAGTAAACTGGAAAGCAGTTTATTGGTCTCTCTAGTGTTGGTGGTGACTTCATACAGC
CCGGCGAAGACCGCGTTGTAAACGCGATCCAGCGACTGAACCAAAGTGGTGTCACTGGAT
GCTAAGGCGGCAACAATAGCCGTCTTGGCTGCCTTGGTGGAGTCCGCGGCCGTTTGTGCG
GAAGCGGAAACACTAGCGACTATTTCGTTGGCAGAAATCGCTTGTGCTCTGTCCAGGGAC
GCAATCCCCTCCAAAAATGCGGACGGCCCCAGATGGGGGCCCGCAATCCTAATTGTGTCT
GAGACGTGCTCTACTGAAGATTGAACTTCTTGAAGATGGCTTC

## Riboviria

In [None]:
for member in clusters['57']:
    if member.is_ref:
        print("REF")
    print('Length:', member.length)
    print('Name:', member.name)
    print('Contig', member.contig)

In [None]:
for id, cluster in clusters.items():
    has_riboviria = False
    for member in cluster:
        if member.name and 'Riboviria' in member.name:
            has_riboviria = True
    if has_riboviria:
        print(id, get_cluster_rep(cluster).name, get_cluster_rep(cluster).length)
        print(taxid2name.get(get_cluster_leaf(cluster)))

# LCA Consensus

In [240]:
cluster_lcas = {id: get_cluster_leaf([contig_to_lca(member.sample + '~' + member.contig) for member in cluster])            
                for id, cluster in clusters.items()}



Which clusters do not have a consensus leaf?

In [245]:
for cluster, lca in cluster_lcas.items():
    if lca is -1:
        print(cluster)
        for member in clusters[cluster]:
            contig_lca = contig_to_lca(member.sample + '~' + member.contig)
            contig_name = taxid2name.get(contig_lca)
            if contig_name is not None:
                print('\t', contig_name, contig_lca, '\t', member.length)

21050
	 Bilateria 33213 	 893
	 Bilateria 33213 	 915
	 Bilateria 33213 	 900
	 Dictyocaulus viviparus 29172 	 501
	 Bilateria 33213 	 895
	 Ecdysozoa 1206794 	 878
	 Ecdysozoa 1206794 	 761
	 Bilateria 33213 	 882
	 Ecdysozoa 1206794 	 877
	 Ecdysozoa 1206794 	 888
	 Ecdysozoa 1206794 	 834
	 Euteleostomi 117571 	 521
	 Ecdysozoa 1206794 	 880
	 Bilateria 33213 	 903
	 Ecdysozoa 1206794 	 827
	 Ecdysozoa 1206794 	 805
	 Ecdysozoa 1206794 	 875
	 Ecdysozoa 1206794 	 874
2279
	 Chryseobacterium antarcticum 266748 	 508
	 Aspergillus sclerotiicarbonarius CBS 121057 1448318 	 736
22419
	 Macrostomum lignano 282301 	 545
	 Salmonella enterica subsp. enterica serovar Agona str. 68.U.05 1265620 	 891
43
	 dsRNA virus environmental sample 1075826 	 1480
	 Riboviria 2559587 	 878
	 dsRNA virus environmental sample 1075826 	 7687
	 dsRNA virus environmental sample 1075826 	 539
	 Riboviria 2559587 	 518
	 dsRNA virus environmental sample 1075826 	 7398
	 dsRNA virus environmental sample 1075826

	 Lactobacillus reuteri 1598 	 833
	 Bacilli 91061 	 1136
5305
	 Wuhan Mosquito Virus 9 1608134 	 645
	 Wuhan Mosquito Virus 9 1608134 	 823
	 Wuhan Mosquito Virus 9 1608134 	 626
	 Grenada mosquito rhabdovirus 1 2125980 	 507
	 Wuhan Mosquito Virus 9 1608134 	 1504
2578
	 Wolbachia endosymbiont of Drosophila ananassae 307502 	 1845
	 Wolbachia endosymbiont of Drosophila ananassae 307502 	 1067
	 Wolbachia endosymbiont of Drosophila ananassae 307502 	 1773
	 Wolbachia endosymbiont of Drosophila ananassae 307502 	 1898
	 Wolbachia endosymbiont of Drosophila ananassae 307502 	 1849
	 Wolbachia endosymbiont of Drosophila ananassae 307502 	 1886
	 Wolbachia endosymbiont of Drosophila ananassae 307502 	 1837
	 Wolbachia endosymbiont of Drosophila ananassae 307502 	 1084
	 Wolbachia endosymbiont of Drosophila ananassae 307502 	 924
	 Wolbachia endosymbiont of Drosophila ananassae 307502 	 929
	 uncultured alpha proteobacterium HF0010_30A23 710802 	 537
	 Wolbachia endosymbiont of Drosophila 

	 Neocallimastix californiae 1754190 	 785
	 Neocallimastix californiae 1754190 	 902
	 Neocallimastigaceae 29007 	 876
	 Piromyces finnis 1754191 	 900
	 Cryptosporidium muris RN66 441375 	 663
58988
	 Flavonifractor sp. An306 1965629 	 584
	 Verticillium longisporum 100787 	 577
41
	 Helobdella robusta 6412 	 692
	 Opisthokonta 33154 	 769
	 Sinocyclocheilus rhinocerous 307959 	 1135
	 Bilateria 33213 	 1070
18824
	 Nonomuraea candida 359159 	 957
	 Streptomyces viridosporus ATCC 14672 566461 	 899
132
	 cellular organisms 131567 	 5085
	 cellular organisms 131567 	 4210
	 Elysia chlorotica 188477 	 634
	 Solemya velum gill symbiont 2340 	 856
	 Solemya velum gill symbiont 2340 	 564
	 cellular organisms 131567 	 2772
	 Capitella teleta 283909 	 670
3584
	 Aster yellows phytoplasma 35779 	 598
	 Trichuris suis 68888 	 570
3702
	 Chenopodium quinoa 63459 	 740
	 Microtus ochrogaster 79684 	 1693
	 Eukaryota 2759 	 502
860
	 Elysia chlorotica 188477 	 584
	 Elysia chlorotica 188477 	 1

For which clusters do the mode differ from the leaf? From the longest?

In [218]:
for id, cluster in sorted(clusters.items(), key=lambda x: int(x[0])):
    print(id)
    cluster_taxids = [contig_to_lca(member.sample + '~' + member.contig) for member in cluster]
    leaf = get_cluster_leaf(cluster_taxids)
    mode = get_cluster_mode(cluster_taxids)
    if leaf != mode:
        print('\t', leaf, mode, [taxid2name[taxid] for taxid in cluster_taxids])        

0
	 -1 None [None, None, None, None, None, None, None, None, None, None, None, 'Bacillus thuringiensis', None, None, None, None, None, None, None, None, None, 'Opisthokonta', None, None, None, 'Arcobacter lekithochrous', None, None, None, None, None, None, None, 'Bacillus thuringiensis', None, None, None, None, None, None, None, 'Gammaproteobacteria', None, 'Arcobacter lekithochrous', None, None]
1
2
3
4
5
6
7
	 2059674 None ['unclassified Anphevirus', None, 'Viruses', None, 'Viruses', 'Anphevirus', 'Viruses', None, None, None, 'Viruses', 'unclassified Anphevirus', 'unclassified Anphevirus', 'unclassified Anphevirus', 'unclassified Anphevirus']
8
	 1803034 35303 ['unclassified Rhabdoviridae', 'unclassified Rhabdoviridae', 'unclassified Rhabdoviridae', 'unclassified Rhabdoviridae', 'unclassified Rhabdoviridae', 'unclassified Rhabdoviridae', 'unclassified Rhabdoviridae', 'unclassified Rhabdoviridae', 'Merida virus', 'unclassified Rhabdoviridae', 'unclassified Rhabdoviridae', 'unclassifie



KeyError: 2109339

In [176]:
nr_df[nr_df['sample'] == 'CMS001_009_Ra_S13']

nr_df[nr_df['contig'] == 'NODE_4532_length_539_cov_1.203463']

Unnamed: 0,contig,blast_type,sample,identity,align_length,mismatches,gaps,qstart,qend,sstart,send,bitscore,taxid,contig_key


CMS001_001_Ra_S1~NODE_5_length_5116_cov_2209.828736
CMS001_001_Ra_S1~NODE_7_length_3782_cov_120.096086
CMS001_001_Ra_S1~NODE_9_length_3713_cov_129.105611
CMS001_001_Ra_S1~NODE_11_length_3224_cov_1346.461710
CMS001_001_Ra_S1~NODE_14_length_2766_cov_1.243213
CMS001_001_Ra_S1~NODE_15_length_2714_cov_560.751991
CMS001_001_Ra_S1~NODE_17_length_2485_cov_1.920681
CMS001_001_Ra_S1~NODE_18_length_2469_cov_1.543896
CMS001_001_Ra_S1~NODE_19_length_2463_cov_0.659681
CMS001_001_Ra_S1~NODE_20_length_2429_cov_2.034439
CMS001_001_Ra_S1~NODE_21_length_2269_cov_429.505474
CMS001_001_Ra_S1~NODE_22_length_2269_cov_0.967609
CMS001_001_Ra_S1~NODE_24_length_2143_cov_3.526621
CMS001_001_Ra_S1~NODE_26_length_2085_cov_2.417829
CMS001_001_Ra_S1~NODE_28_length_2042_cov_0.630025
CMS001_001_Ra_S1~NODE_29_length_1985_cov_1.600105
CMS001_001_Ra_S1~NODE_30_length_1939_cov_1.577336
CMS001_001_Ra_S1~NODE_31_length_1909_cov_6.485262
CMS001_001_Ra_S1~NODE_32_length_1847_cov_6.180791
CMS001_001_Ra_S1~NODE_33_length_1797_co

CMS001_003_Ra_S2~NODE_2164_length_506_cov_1.032634
CMS001_003_Ra_S2~NODE_2166_length_506_cov_0.990676
CMS001_003_Ra_S2~NODE_2168_length_506_cov_0.937063
CMS001_003_Ra_S2~NODE_2170_length_506_cov_0.832168
CMS001_003_Ra_S2~NODE_2172_length_506_cov_0.759907
CMS001_003_Ra_S2~NODE_2173_length_506_cov_0.750583
CMS001_003_Ra_S2~NODE_2174_length_506_cov_0.708625
CMS001_003_Ra_S2~NODE_2175_length_506_cov_0.696970
CMS001_003_Ra_S2~NODE_2180_length_505_cov_1.628505
CMS001_003_Ra_S2~NODE_2185_length_505_cov_1.021028
CMS001_003_Ra_S2~NODE_2186_length_505_cov_1.007009
CMS001_003_Ra_S2~NODE_2187_length_505_cov_0.873832
CMS001_003_Ra_S2~NODE_2190_length_505_cov_0.775701
CMS001_003_Ra_S2~NODE_2197_length_504_cov_1.035129
CMS001_003_Ra_S2~NODE_2198_length_504_cov_0.927400
CMS001_003_Ra_S2~NODE_2199_length_504_cov_0.918033
CMS001_003_Ra_S2~NODE_2200_length_504_cov_0.845433
CMS001_003_Ra_S2~NODE_2202_length_503_cov_1.734742
CMS001_003_Ra_S2~NODE_2204_length_503_cov_1.049296
CMS001_003_Ra_S2~NODE_2206_leng

CMS001_005_Ra_S3~NODE_1846_length_679_cov_2.054817
CMS001_005_Ra_S3~NODE_1850_length_679_cov_1.325581
CMS001_005_Ra_S3~NODE_1851_length_679_cov_0.986711
CMS001_005_Ra_S3~NODE_1855_length_678_cov_1.612313
CMS001_005_Ra_S3~NODE_1856_length_678_cov_1.334443
CMS001_005_Ra_S3~NODE_1859_length_678_cov_0.670549
CMS001_005_Ra_S3~NODE_1863_length_677_cov_1.553333
CMS001_005_Ra_S3~NODE_1865_length_677_cov_1.200000
CMS001_005_Ra_S3~NODE_1866_length_677_cov_0.910000
CMS001_005_Ra_S3~NODE_1868_length_676_cov_1.430718
CMS001_005_Ra_S3~NODE_1870_length_676_cov_0.863105
CMS001_005_Ra_S3~NODE_1871_length_676_cov_0.808013
CMS001_005_Ra_S3~NODE_1872_length_676_cov_0.734558
CMS001_005_Ra_S3~NODE_1873_length_675_cov_4.334448
CMS001_005_Ra_S3~NODE_1876_length_675_cov_1.008361
CMS001_005_Ra_S3~NODE_1877_length_675_cov_0.971572
CMS001_005_Ra_S3~NODE_1878_length_675_cov_0.622074
CMS001_005_Ra_S3~NODE_1884_length_674_cov_1.412060
CMS001_005_Ra_S3~NODE_1885_length_674_cov_1.221106
CMS001_005_Ra_S3~NODE_1886_leng

CMS001_006_Ra_S5~NODE_820_length_540_cov_0.997840
CMS001_006_Ra_S5~NODE_830_length_536_cov_0.636166
CMS001_006_Ra_S5~NODE_832_length_535_cov_0.600437
CMS001_006_Ra_S5~NODE_834_length_534_cov_1.277899
CMS001_006_Ra_S5~NODE_835_length_534_cov_0.894967
CMS001_006_Ra_S5~NODE_836_length_534_cov_0.645514
CMS001_006_Ra_S5~NODE_837_length_534_cov_0.592998
CMS001_006_Ra_S5~NODE_839_length_533_cov_1.278509
CMS001_006_Ra_S5~NODE_840_length_533_cov_1.203947
CMS001_006_Ra_S5~NODE_842_length_533_cov_0.677632
CMS001_006_Ra_S5~NODE_844_length_531_cov_1.473568
CMS001_006_Ra_S5~NODE_846_length_530_cov_0.918322
CMS001_006_Ra_S5~NODE_849_length_528_cov_1.773836
CMS001_006_Ra_S5~NODE_850_length_528_cov_1.263858
CMS001_006_Ra_S5~NODE_851_length_528_cov_0.698448
CMS001_006_Ra_S5~NODE_853_length_526_cov_8.592428
CMS001_006_Ra_S5~NODE_855_length_526_cov_2.229399
CMS001_006_Ra_S5~NODE_858_length_526_cov_1.138085
CMS001_006_Ra_S5~NODE_861_length_525_cov_0.812500
CMS001_006_Ra_S5~NODE_864_length_523_cov_0.612108


CMS001_009_Ra_S13~NODE_638_length_1255_cov_1.028014
CMS001_009_Ra_S13~NODE_640_length_1254_cov_1.201359
CMS001_009_Ra_S13~NODE_641_length_1253_cov_8.506803
CMS001_009_Ra_S13~NODE_642_length_1253_cov_2.365646
CMS001_009_Ra_S13~NODE_643_length_1252_cov_3.915745
CMS001_009_Ra_S13~NODE_644_length_1252_cov_0.978723
CMS001_009_Ra_S13~NODE_645_length_1251_cov_3.335605
CMS001_009_Ra_S13~NODE_646_length_1251_cov_2.994889
CMS001_009_Ra_S13~NODE_647_length_1250_cov_2.478261
CMS001_009_Ra_S13~NODE_649_length_1248_cov_5.518360
CMS001_009_Ra_S13~NODE_650_length_1248_cov_1.530316
CMS001_009_Ra_S13~NODE_651_length_1247_cov_3.731624
CMS001_009_Ra_S13~NODE_652_length_1247_cov_3.588889
CMS001_009_Ra_S13~NODE_653_length_1246_cov_1.601369
CMS001_009_Ra_S13~NODE_654_length_1244_cov_2.549272
CMS001_009_Ra_S13~NODE_656_length_1243_cov_4.449400
CMS001_009_Ra_S13~NODE_657_length_1242_cov_1.553648
CMS001_009_Ra_S13~NODE_658_length_1240_cov_3.294067
CMS001_009_Ra_S13~NODE_659_length_1240_cov_3.027515
CMS001_009_R

CMS001_009_Ra_S13~NODE_2785_length_682_cov_3.452893
CMS001_009_Ra_S13~NODE_2786_length_682_cov_3.115702
CMS001_009_Ra_S13~NODE_2787_length_682_cov_3.100826
CMS001_009_Ra_S13~NODE_2788_length_682_cov_2.844628
CMS001_009_Ra_S13~NODE_2790_length_682_cov_2.647934
CMS001_009_Ra_S13~NODE_2792_length_682_cov_2.309091
CMS001_009_Ra_S13~NODE_2793_length_682_cov_1.900826
CMS001_009_Ra_S13~NODE_2794_length_682_cov_1.742149
CMS001_009_Ra_S13~NODE_2795_length_682_cov_1.406612
CMS001_009_Ra_S13~NODE_2797_length_682_cov_1.143802
CMS001_009_Ra_S13~NODE_2799_length_681_cov_30.943709
CMS001_009_Ra_S13~NODE_2802_length_681_cov_3.137417
CMS001_009_Ra_S13~NODE_2803_length_681_cov_3.000000
CMS001_009_Ra_S13~NODE_2804_length_681_cov_2.779801
CMS001_009_Ra_S13~NODE_2805_length_681_cov_1.673841
CMS001_009_Ra_S13~NODE_2806_length_681_cov_1.647351
CMS001_009_Ra_S13~NODE_2807_length_681_cov_1.605960
CMS001_009_Ra_S13~NODE_2808_length_681_cov_1.571192
CMS001_009_Ra_S13~NODE_2809_length_681_cov_1.192053
CMS001_009_

CMS001_011_Ra_S4~NODE_88_length_923_cov_0.689125
CMS001_011_Ra_S4~NODE_89_length_922_cov_3.480473
CMS001_011_Ra_S4~NODE_92_length_911_cov_2.881295
CMS001_011_Ra_S4~NODE_93_length_910_cov_15.246098
CMS001_011_Ra_S4~NODE_94_length_905_cov_0.743961
CMS001_011_Ra_S4~NODE_95_length_902_cov_2.562424
CMS001_011_Ra_S4~NODE_96_length_896_cov_2.293040
CMS001_011_Ra_S4~NODE_98_length_891_cov_0.882064
CMS001_011_Ra_S4~NODE_101_length_888_cov_0.797781
CMS001_011_Ra_S4~NODE_104_length_877_cov_0.816250
CMS001_011_Ra_S4~NODE_105_length_873_cov_14.952261
CMS001_011_Ra_S4~NODE_107_length_870_cov_0.650694
CMS001_011_Ra_S4~NODE_109_length_857_cov_0.724359
CMS001_011_Ra_S4~NODE_111_length_856_cov_0.824134
CMS001_011_Ra_S4~NODE_114_length_850_cov_0.900388
CMS001_011_Ra_S4~NODE_116_length_843_cov_4.214099
CMS001_011_Ra_S4~NODE_117_length_842_cov_0.661438
CMS001_011_Ra_S4~NODE_119_length_841_cov_0.761780
CMS001_011_Ra_S4~NODE_122_length_838_cov_0.854139
CMS001_011_Ra_S4~NODE_124_length_837_cov_0.948684
CMS001

CMS001_013_Ra_S5~NODE_133_length_1000_cov_0.646804
CMS001_013_Ra_S5~NODE_134_length_999_cov_1.851410
CMS001_013_Ra_S5~NODE_136_length_988_cov_1.032931
CMS001_013_Ra_S5~NODE_138_length_984_cov_1.002205
CMS001_013_Ra_S5~NODE_140_length_978_cov_2.956715
CMS001_013_Ra_S5~NODE_142_length_966_cov_0.816648
CMS001_013_Ra_S5~NODE_143_length_959_cov_0.541950
CMS001_013_Ra_S5~NODE_144_length_957_cov_0.800000
CMS001_013_Ra_S5~NODE_145_length_954_cov_1.559863
CMS001_013_Ra_S5~NODE_146_length_949_cov_2.131881
CMS001_013_Ra_S5~NODE_148_length_946_cov_0.613349
CMS001_013_Ra_S5~NODE_151_length_942_cov_1.290173
CMS001_013_Ra_S5~NODE_152_length_941_cov_1.560185
CMS001_013_Ra_S5~NODE_153_length_935_cov_1.227273
CMS001_013_Ra_S5~NODE_154_length_935_cov_1.079254
CMS001_013_Ra_S5~NODE_157_length_931_cov_0.850117
CMS001_013_Ra_S5~NODE_160_length_929_cov_1.400235
CMS001_013_Ra_S5~NODE_161_length_929_cov_1.029343
CMS001_013_Ra_S5~NODE_162_length_927_cov_1.143529
CMS001_013_Ra_S5~NODE_165_length_925_cov_1.103774

CMS001_016_Ra_S6~NODE_771_length_798_cov_1.857143
CMS001_016_Ra_S6~NODE_773_length_798_cov_1.447989
CMS001_016_Ra_S6~NODE_774_length_798_cov_1.385576
CMS001_016_Ra_S6~NODE_775_length_798_cov_1.249653
CMS001_016_Ra_S6~NODE_776_length_798_cov_1.000000
CMS001_016_Ra_S6~NODE_777_length_798_cov_0.574202
CMS001_016_Ra_S6~NODE_778_length_796_cov_2.162726
CMS001_016_Ra_S6~NODE_779_length_796_cov_0.770515
CMS001_016_Ra_S6~NODE_780_length_795_cov_8.552925
CMS001_016_Ra_S6~NODE_781_length_795_cov_2.898329
CMS001_016_Ra_S6~NODE_782_length_795_cov_1.452646
CMS001_016_Ra_S6~NODE_783_length_794_cov_1.545328
CMS001_016_Ra_S6~NODE_784_length_794_cov_0.977685
CMS001_016_Ra_S6~NODE_785_length_793_cov_5.724860
CMS001_016_Ra_S6~NODE_786_length_793_cov_5.224860
CMS001_016_Ra_S6~NODE_788_length_793_cov_0.882682
CMS001_016_Ra_S6~NODE_789_length_792_cov_3.408392
CMS001_016_Ra_S6~NODE_790_length_792_cov_1.676923
CMS001_016_Ra_S6~NODE_791_length_792_cov_0.862937
CMS001_016_Ra_S6~NODE_792_length_792_cov_0.763636


CMS001_018_Ra_S14~NODE_59_length_1024_cov_0.513200
CMS001_018_Ra_S14~NODE_61_length_1012_cov_0.633155
CMS001_018_Ra_S14~NODE_65_length_999_cov_0.425163
CMS001_018_Ra_S14~NODE_66_length_998_cov_1.263844
CMS001_018_Ra_S14~NODE_68_length_977_cov_24.494444
CMS001_018_Ra_S14~NODE_69_length_957_cov_0.600000
CMS001_018_Ra_S14~NODE_70_length_953_cov_0.757991
CMS001_018_Ra_S14~NODE_72_length_934_cov_0.619603
CMS001_018_Ra_S14~NODE_73_length_928_cov_0.568743
CMS001_018_Ra_S14~NODE_76_length_923_cov_0.861702
CMS001_018_Ra_S14~NODE_78_length_915_cov_0.705251
CMS001_018_Ra_S14~NODE_83_length_891_cov_0.916462
CMS001_018_Ra_S14~NODE_84_length_886_cov_0.495674
CMS001_018_Ra_S14~NODE_85_length_876_cov_0.618273
CMS001_018_Ra_S14~NODE_86_length_870_cov_1.174023
CMS001_018_Ra_S14~NODE_87_length_868_cov_6.305942
CMS001_018_Ra_S14~NODE_88_length_868_cov_0.594185
CMS001_018_Ra_S14~NODE_90_length_867_cov_0.900000
CMS001_018_Ra_S14~NODE_91_length_860_cov_0.650064
CMS001_018_Ra_S14~NODE_92_length_846_cov_0.7841

CMS001_021_Ra_S16~NODE_1163_length_546_cov_0.614072
CMS001_021_Ra_S16~NODE_1166_length_545_cov_1.472222
CMS001_021_Ra_S16~NODE_1168_length_545_cov_1.305556
CMS001_021_Ra_S16~NODE_1169_length_545_cov_0.771368
CMS001_021_Ra_S16~NODE_1170_length_545_cov_0.487179
CMS001_021_Ra_S16~NODE_1171_length_544_cov_1.676660
CMS001_021_Ra_S16~NODE_1173_length_544_cov_0.828694
CMS001_021_Ra_S16~NODE_1174_length_543_cov_1.669528
CMS001_021_Ra_S16~NODE_1175_length_543_cov_1.542918
CMS001_021_Ra_S16~NODE_1176_length_543_cov_1.154506
CMS001_021_Ra_S16~NODE_1177_length_543_cov_0.401288
CMS001_021_Ra_S16~NODE_1180_length_542_cov_1.240860
CMS001_021_Ra_S16~NODE_1182_length_542_cov_0.716129
CMS001_021_Ra_S16~NODE_1183_length_542_cov_0.582796
CMS001_021_Ra_S16~NODE_1185_length_541_cov_1.234914
CMS001_021_Ra_S16~NODE_1186_length_541_cov_0.943966
CMS001_021_Ra_S16~NODE_1187_length_541_cov_0.894397
CMS001_021_Ra_S16~NODE_1188_length_541_cov_0.885776
CMS001_021_Ra_S16~NODE_1189_length_541_cov_0.883621
CMS001_021_R

CMS001_026_Ra_S18~NODE_153_length_1664_cov_0.837429
CMS001_026_Ra_S18~NODE_154_length_1661_cov_1.410354
CMS001_026_Ra_S18~NODE_156_length_1655_cov_1.895437
CMS001_026_Ra_S18~NODE_157_length_1651_cov_1.777001
CMS001_026_Ra_S18~NODE_158_length_1650_cov_2.080102
CMS001_026_Ra_S18~NODE_161_length_1647_cov_0.632484
CMS001_026_Ra_S18~NODE_166_length_1628_cov_0.800774
CMS001_026_Ra_S18~NODE_167_length_1622_cov_4.867961
CMS001_026_Ra_S18~NODE_170_length_1609_cov_2.786554
CMS001_026_Ra_S18~NODE_171_length_1600_cov_0.929744
CMS001_026_Ra_S18~NODE_172_length_1599_cov_1.772011
CMS001_026_Ra_S18~NODE_173_length_1589_cov_1.478175
CMS001_026_Ra_S18~NODE_174_length_1587_cov_0.701987
CMS001_026_Ra_S18~NODE_175_length_1583_cov_1.162019
CMS001_026_Ra_S18~NODE_176_length_1574_cov_0.981296
CMS001_026_Ra_S18~NODE_178_length_1564_cov_4.156019
CMS001_026_Ra_S18~NODE_180_length_1556_cov_0.813387
CMS001_026_Ra_S18~NODE_182_length_1552_cov_2.600000
CMS001_026_Ra_S18~NODE_184_length_1544_cov_4.286980
CMS001_026_R

CMS001_026_Ra_S18~NODE_2812_length_585_cov_0.980315
CMS001_026_Ra_S18~NODE_2813_length_585_cov_0.769685
CMS001_026_Ra_S18~NODE_2814_length_585_cov_0.610236
CMS001_026_Ra_S18~NODE_2817_length_584_cov_1.865878
CMS001_026_Ra_S18~NODE_2818_length_584_cov_1.619329
CMS001_026_Ra_S18~NODE_2819_length_584_cov_1.581854
CMS001_026_Ra_S18~NODE_2821_length_584_cov_1.386588
CMS001_026_Ra_S18~NODE_2822_length_584_cov_0.692308
CMS001_026_Ra_S18~NODE_2823_length_584_cov_0.489152
CMS001_026_Ra_S18~NODE_2827_length_583_cov_1.486166
CMS001_026_Ra_S18~NODE_2828_length_583_cov_1.415020
CMS001_026_Ra_S18~NODE_2829_length_583_cov_1.409091
CMS001_026_Ra_S18~NODE_2830_length_583_cov_1.399209
CMS001_026_Ra_S18~NODE_2833_length_582_cov_10.823762
CMS001_026_Ra_S18~NODE_2834_length_582_cov_2.223762
CMS001_026_Ra_S18~NODE_2837_length_582_cov_1.293069
CMS001_026_Ra_S18~NODE_2839_length_582_cov_1.223762
CMS001_026_Ra_S18~NODE_2840_length_582_cov_1.083168
CMS001_026_Ra_S18~NODE_2841_length_582_cov_1.011881
CMS001_026_

CMS001_034_Ra_S19~NODE_24_length_1332_cov_1.035060
CMS001_034_Ra_S19~NODE_25_length_1313_cov_0.467638
CMS001_034_Ra_S19~NODE_26_length_1241_cov_1.676976
CMS001_034_Ra_S19~NODE_27_length_1238_cov_0.641688
CMS001_034_Ra_S19~NODE_28_length_1236_cov_1.264021
CMS001_034_Ra_S19~NODE_29_length_1200_cov_3.979519
CMS001_034_Ra_S19~NODE_32_length_1145_cov_0.749064
CMS001_034_Ra_S19~NODE_33_length_1138_cov_1.345900
CMS001_034_Ra_S19~NODE_34_length_1119_cov_0.570058
CMS001_034_Ra_S19~NODE_37_length_1029_cov_1.078782
CMS001_034_Ra_S19~NODE_39_length_1007_cov_1.145161
CMS001_034_Ra_S19~NODE_40_length_1004_cov_0.544768
CMS001_034_Ra_S19~NODE_43_length_973_cov_0.585938
CMS001_034_Ra_S19~NODE_45_length_947_cov_0.572414
CMS001_034_Ra_S19~NODE_46_length_937_cov_2.813953
CMS001_034_Ra_S19~NODE_48_length_919_cov_5.767221
CMS001_034_Ra_S19~NODE_49_length_915_cov_1.112172
CMS001_034_Ra_S19~NODE_51_length_900_cov_0.696233
CMS001_034_Ra_S19~NODE_52_length_895_cov_0.800733
CMS001_034_Ra_S19~NODE_55_length_876_c

CMS001_036_Ra_S20~NODE_1592_length_501_cov_0.884434
CMS001_037_Ra_S21~NODE_4_length_2769_cov_1.602897
CMS001_037_Ra_S21~NODE_5_length_2512_cov_1.668994
CMS001_037_Ra_S21~NODE_7_length_2325_cov_0.631228
CMS001_037_Ra_S21~NODE_8_length_2300_cov_1.009447
CMS001_037_Ra_S21~NODE_9_length_2259_cov_1.807516
CMS001_037_Ra_S21~NODE_13_length_2018_cov_0.912931
CMS001_037_Ra_S21~NODE_15_length_1888_cov_3.689122
CMS001_037_Ra_S21~NODE_16_length_1882_cov_0.759557
CMS001_037_Ra_S21~NODE_18_length_1790_cov_1.008173
CMS001_037_Ra_S21~NODE_20_length_1745_cov_1.118705
CMS001_037_Ra_S21~NODE_22_length_1696_cov_2.251390
CMS001_037_Ra_S21~NODE_23_length_1675_cov_0.616395
CMS001_037_Ra_S21~NODE_24_length_1667_cov_5.737107
CMS001_037_Ra_S21~NODE_25_length_1628_cov_1.125725
CMS001_037_Ra_S21~NODE_27_length_1570_cov_0.618218
CMS001_037_Ra_S21~NODE_28_length_1564_cov_36.942165
CMS001_037_Ra_S21~NODE_30_length_1551_cov_0.668250
CMS001_037_Ra_S21~NODE_31_length_1550_cov_1.327223
CMS001_037_Ra_S21~NODE_32_length_1

CMS001_045_Ra_S2~NODE_319_length_535_cov_0.436681
CMS001_045_Ra_S2~NODE_322_length_534_cov_0.752735
CMS001_045_Ra_S2~NODE_324_length_533_cov_0.611842
CMS001_045_Ra_S2~NODE_325_length_530_cov_0.576159
CMS001_045_Ra_S2~NODE_326_length_529_cov_0.595133
CMS001_045_Ra_S2~NODE_327_length_529_cov_0.380531
CMS001_045_Ra_S2~NODE_328_length_528_cov_2.106430
CMS001_045_Ra_S2~NODE_329_length_528_cov_1.268293
CMS001_045_Ra_S2~NODE_332_length_527_cov_0.720000
CMS001_045_Ra_S2~NODE_334_length_526_cov_0.975501
CMS001_045_Ra_S2~NODE_335_length_526_cov_0.804009
CMS001_045_Ra_S2~NODE_337_length_526_cov_0.706013
CMS001_045_Ra_S2~NODE_338_length_525_cov_0.486607
CMS001_045_Ra_S2~NODE_340_length_524_cov_0.742729
CMS001_045_Ra_S2~NODE_341_length_523_cov_1.156951
CMS001_045_Ra_S2~NODE_342_length_522_cov_0.822472
CMS001_045_Ra_S2~NODE_343_length_521_cov_0.479730
CMS001_045_Ra_S2~NODE_346_length_519_cov_0.649321
CMS001_045_Ra_S2~NODE_349_length_518_cov_0.945578
CMS001_045_Ra_S2~NODE_350_length_517_cov_1.990909


CMS001_055_Ra_S9~NODE_50_length_829_cov_0.527926
CMS001_055_Ra_S9~NODE_52_length_826_cov_0.666222
CMS001_055_Ra_S9~NODE_55_length_804_cov_0.729023
CMS001_055_Ra_S9~NODE_56_length_803_cov_1.126722
CMS001_055_Ra_S9~NODE_59_length_790_cov_0.709677
CMS001_055_Ra_S9~NODE_60_length_788_cov_0.575246
CMS001_055_Ra_S9~NODE_61_length_786_cov_0.823695
CMS001_055_Ra_S9~NODE_63_length_781_cov_0.576705
CMS001_055_Ra_S9~NODE_65_length_768_cov_0.635311
CMS001_055_Ra_S9~NODE_66_length_763_cov_0.667638
CMS001_055_Ra_S9~NODE_71_length_751_cov_1.485163
CMS001_055_Ra_S9~NODE_77_length_732_cov_0.691603
CMS001_055_Ra_S9~NODE_82_length_725_cov_0.671296
CMS001_055_Ra_S9~NODE_84_length_723_cov_0.693498
CMS001_055_Ra_S9~NODE_86_length_714_cov_0.640502
CMS001_055_Ra_S9~NODE_87_length_713_cov_0.550314
CMS001_055_Ra_S9~NODE_88_length_712_cov_0.662992
CMS001_055_Ra_S9~NODE_89_length_707_cov_1.034921
CMS001_055_Ra_S9~NODE_90_length_704_cov_1.239234
CMS001_055_Ra_S9~NODE_91_length_700_cov_0.616372
CMS001_055_Ra_S9~NOD

CMS001_060_Ra_S12~NODE_510_length_1350_cov_1.858602
CMS001_060_Ra_S12~NODE_511_length_1350_cov_1.808327
CMS001_060_Ra_S12~NODE_512_length_1349_cov_1.533805
CMS001_060_Ra_S12~NODE_513_length_1349_cov_1.344340
CMS001_060_Ra_S12~NODE_514_length_1348_cov_3.907946
CMS001_060_Ra_S12~NODE_515_length_1347_cov_7.388976
CMS001_060_Ra_S12~NODE_516_length_1347_cov_1.784252
CMS001_060_Ra_S12~NODE_518_length_1346_cov_1.334909
CMS001_060_Ra_S12~NODE_519_length_1345_cov_8.211356
CMS001_060_Ra_S12~NODE_520_length_1343_cov_1.394945
CMS001_060_Ra_S12~NODE_521_length_1343_cov_0.612954
CMS001_060_Ra_S12~NODE_522_length_1341_cov_6.836234
CMS001_060_Ra_S12~NODE_523_length_1341_cov_0.859177
CMS001_060_Ra_S12~NODE_525_length_1340_cov_1.573238
CMS001_060_Ra_S12~NODE_526_length_1340_cov_1.388757
CMS001_060_Ra_S12~NODE_527_length_1339_cov_2.996830
CMS001_060_Ra_S12~NODE_528_length_1339_cov_1.748019
CMS001_060_Ra_S12~NODE_529_length_1335_cov_5.267886
CMS001_060_Ra_S12~NODE_530_length_1334_cov_3.945903
CMS001_060_R

CMS001_060_Ra_S12~NODE_2320_length_767_cov_1.379710
CMS001_060_Ra_S12~NODE_2321_length_767_cov_0.824638
CMS001_060_Ra_S12~NODE_2322_length_767_cov_0.530435
CMS001_060_Ra_S12~NODE_2323_length_766_cov_12.193033
CMS001_060_Ra_S12~NODE_2324_length_766_cov_3.021771
CMS001_060_Ra_S12~NODE_2325_length_766_cov_2.692308
CMS001_060_Ra_S12~NODE_2326_length_766_cov_2.605225
CMS001_060_Ra_S12~NODE_2327_length_766_cov_2.358491
CMS001_060_Ra_S12~NODE_2328_length_766_cov_1.449927
CMS001_060_Ra_S12~NODE_2329_length_766_cov_1.444122
CMS001_060_Ra_S12~NODE_2330_length_766_cov_1.349782
CMS001_060_Ra_S12~NODE_2331_length_766_cov_1.281567
CMS001_060_Ra_S12~NODE_2332_length_766_cov_1.207547
CMS001_060_Ra_S12~NODE_2333_length_766_cov_0.998549
CMS001_060_Ra_S12~NODE_2334_length_766_cov_0.825835
CMS001_060_Ra_S12~NODE_2335_length_766_cov_0.735849
CMS001_060_Ra_S12~NODE_2336_length_765_cov_3.194767
CMS001_060_Ra_S12~NODE_2337_length_765_cov_2.531977
CMS001_060_Ra_S12~NODE_2338_length_765_cov_1.375000
CMS001_060_

CMS001_060_Ra_S12~NODE_4461_length_580_cov_3.526839
CMS001_060_Ra_S12~NODE_4462_length_580_cov_2.330020
CMS001_060_Ra_S12~NODE_4463_length_580_cov_1.920477
CMS001_060_Ra_S12~NODE_4464_length_580_cov_1.850895
CMS001_060_Ra_S12~NODE_4465_length_580_cov_1.654076
CMS001_060_Ra_S12~NODE_4466_length_580_cov_1.648111
CMS001_060_Ra_S12~NODE_4467_length_580_cov_1.495030
CMS001_060_Ra_S12~NODE_4468_length_580_cov_1.457256
CMS001_060_Ra_S12~NODE_4469_length_580_cov_1.413519
CMS001_060_Ra_S12~NODE_4470_length_580_cov_1.250497
CMS001_060_Ra_S12~NODE_4471_length_580_cov_1.170974
CMS001_060_Ra_S12~NODE_4472_length_580_cov_1.168986
CMS001_060_Ra_S12~NODE_4473_length_580_cov_1.153082
CMS001_060_Ra_S12~NODE_4474_length_580_cov_1.017893
CMS001_060_Ra_S12~NODE_4475_length_580_cov_0.908549
CMS001_060_Ra_S12~NODE_4476_length_580_cov_0.846918
CMS001_060_Ra_S12~NODE_4477_length_579_cov_1.950199
CMS001_060_Ra_S12~NODE_4478_length_579_cov_1.643426
CMS001_060_Ra_S12~NODE_4479_length_579_cov_1.448207
CMS001_060_R

KeyboardInterrupt: 

## Conserved Domain Finding

https://www.ncbi.nlm.nih.gov/Structure/bwrpsb/bwrpsb.cgi

Assemblies identified as RNA viruses were screened against the Conserved Doman Database (www.ncbi.nlm.nih.gov/Structure/cdd/wrpsb.cgi) with an expected value threshold of 1×10−3 374 to identify viral sequence motifs. The mitochondrial COX1 gene, mined from the sequence data, and all contigs with RdRp-motifs was mapped back, using Bowtie2 (31), against all quality trimmed libraries to estimate abundance. A virus was considered to be in high abundance if: (i) it represented >0.1% of total non-ribosomal RNA in the library, and (ii) if the abundance was higher to that of abundant host COX1 gene (12, 32), and hence likely to be mosquito associated. Hits that were below the level of cross380 library contamination due to index-hopping, measured as 0.1% of the most abundant library for the respective virus species or less than 1 read per million mapped to a specific virus contig, was considered negative (coloured grey in Table 1 and Table 2, respectively). 